0
我使用 HtmlAgilityPack 从我的 Pinterest 画板(board)上抓取所有图像。我的代码只返回了 25 个结果,而实际上应该有更多的项目。我怎样才能从画板上抓取所有的图像标签?(标题:抓取 Pinterest 画板上的所有图像)
下面的函数使用浏览器控件加载 DOM,这样我们可以在抓取之前等待页面加载完成;随后的另一个函数会遍历 DOM 中的所有图像标签:
/// <summary>
/// Navigates the embedded browser control to <paramref name="url"/>, waits for the
/// load to finish, then parses the control's current DOM with HtmlAgilityPack and
/// passes it to <c>Scraper.ScrapeBoard</c>.
/// </summary>
/// <param name="url">Address of the page to load in the browser control.</param>
/// <param name="dir">Directory forwarded unchanged to <c>Scraper.ScrapeBoard</c>.</param>
private void LoadHtmlWithBrowser(String url, string dir)
{
    webBrowser1.ScriptErrorsSuppressed = true;
    webBrowser1.Navigate(url);
    waitTillLoad(this.webBrowser1);

    // Serialize the DOM as the browser control holds it right now.
    // NOTE(review): only markup present at this moment is captured; if the page adds
    // more <img> tags later (e.g. on scroll), they presumably will not be in this
    // snapshot -- confirm against the target site.
    var liveDocument = (mshtml.IHTMLDocument3)webBrowser1.Document.DomDocument;

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    // Dispose the reader deterministically once the document has been parsed.
    using (StringReader sr = new StringReader(liveDocument.documentElement.outerHTML))
    {
        doc.Load(sr);
    }

    Scraper.ScrapeBoard(doc, dir);
}
遍历 DOM 中所有图像标签的函数:
/// <summary>
/// Downloads the target of every non-empty <c>src</c> attribute found on an
/// <c>img</c> element of <paramref name="document"/> into <paramref name="dir"/>.
/// </summary>
/// <param name="document">Parsed HTML document whose <c>img</c> elements are scanned.</param>
/// <param name="dir">Destination directory; it is created before downloading starts.</param>
/// <returns>
/// <c>true</c> when every download completed without an exception;
/// <c>false</c> when at least one download failed.
/// </returns>
public static bool ScrapeBoard(HtmlDocument document, string dir)
{
    var urls = document.DocumentNode.Descendants("img")
        .Select(e => e.GetAttributeValue("src", null))
        .Where(s => !String.IsNullOrEmpty(s));

    Directory.CreateDirectory(dir);

    bool allSucceeded = true;

    // One client is reused for all downloads instead of creating one per image.
    using (WebClient client = new WebClient())
    {
        foreach (string s in urls)
        {
            try
            {
                string localFilename = Path.Combine(dir, Path.GetFileName(s));
                client.DownloadFile(s, localFilename);
            }
            catch (Exception)
            {
                // A single bad image must not abort the remaining downloads;
                // remember the failure and keep going.
                allSucceeded = false;
            }
        }
    }

    return allSucceeded;
}
确保在抓取之前整个页面已经加载完成的函数:
/// <summary>
/// Blocks the caller, while still processing pending window messages, until
/// <paramref name="webBrControl"/> reports a completed load.
/// </summary>
/// <remarks>
/// Phase one polls until the ready state is Uninitialized, Loading or Interactive,
/// or until a fixed number of polls has elapsed. Phase two polls, with no upper
/// bound, until the ready state is Complete and the control is not busy.
/// </remarks>
/// <param name="webBrControl">Browser control whose ready state is polled.</param>
private void waitTillLoad(WebBrowser webBrControl)
{
    const int maxPolls = 100000;

    // Phase one: presumably guards against a stale Complete state from a previous
    // page being mistaken for the new navigation -- TODO confirm intent.
    int polls = 0;
    bool leftCompleteState;
    do
    {
        WebBrowserReadyState state = webBrControl.ReadyState;
        Application.DoEvents();
        leftCompleteState = state == WebBrowserReadyState.Uninitialized
            || state == WebBrowserReadyState.Loading
            || state == WebBrowserReadyState.Interactive;
    }
    while (!leftCompleteState && polls++ <= maxPolls);

    // Phase two: wait for the load to finish. IsBusy is only consulted once the
    // state is Complete. This loop has no timeout.
    WebBrowserReadyState current;
    do
    {
        current = webBrControl.ReadyState;
        Application.DoEvents();
    }
    while (current != WebBrowserReadyState.Complete || webBrControl.IsBusy);
}
当我检查返回的 DOM(StringReader sr)时,它只显示 25 个图像标签。为什么其余的图像没有被上述方法加载或提取出来?