2017-03-03 45 views
1

我尝试用ABOT抓取网站地图。我从here激发我的代码。用ABOT抓取网站地图

抓取页面完成后,内容文本为空(e.CrawledPage,Crawler_PageCrawlCompleted)。另外,SiteMapFinder.GetLinks从未碰过。

请指教我,我的问题在哪里。

using Abot.Core; 
using Abot.Crawler; 
using Abot.Poco; 
using CsQuery.ExtensionMethods; 
using System; 
using System.Collections.Generic; 

namespace WebCrawler 
{ 


public class SiteMapFinder : IHyperLinkParser 
{ 
    private readonly HyperLinkParser _linkParser; 
    public SiteMapFinder() 
    { 
     _linkParser = new AngleSharpHyperlinkParser(); 
    } 

    IEnumerable<Uri> IHyperLinkParser.GetLinks(CrawledPage crawledPage) 
    { 
     if (crawledPage.HttpWebResponse.ContentType == "text/xml") 
     { 
      Console.WriteLine(crawledPage.Uri.AbsoluteUri); 

     } 



     return _linkParser.GetLinks(crawledPage); 

    } 
} 
class Program 
{ 
    static void Main(string[] args) 
    { 
     SiteMapFinder finder = new SiteMapFinder(); 
     PoliteWebCrawler crawler = new PoliteWebCrawler(null, null, null, null, null, finder, null, null, null); 


     crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted; 
     CrawlResult result = crawler.Crawl(new Uri("http://www.example.com/sitemap/")); 


    } 

    private static void Crawler_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e) 
    { 
     Console.WriteLine(e.CrawledPage.Uri.AbsoluteUri); 
     e.CrawledPage.HttpWebResponse.Headers.AllKeys.ForEach(k => Console.WriteLine($"{k}: {e.CrawledPage.HttpWebResponse.Headers[k]}")); 
    } 
} 

}

回答

1

好吧,我的问题是关于app.config。应该添加text/XMLdownloadableContentTypes

<abot> 
    <crawlBehavior 
     .... 
     .... 
     downloadableContentTypes="text/html, text/plain, text/xml" 

这里是我完成的加载XML和获取站点地图链接的代码。

using Abot.Core; 
using Abot.Crawler; 
using Abot.Poco; 
using CsQuery.ExtensionMethods; 
using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Xml; 

namespace WebCrawler 
{ 

    public class SiteMapFinder : IHyperLinkParser 
    { 
     private readonly HyperLinkParser _linkParser; 
     public SiteMapFinder() 
     { 
      _linkParser = new AngleSharpHyperlinkParser(); 
     } 

     IEnumerable<Uri> IHyperLinkParser.GetLinks(CrawledPage crawledPage) 
     { 
      if (crawledPage.HttpWebResponse.ContentType == "text/xml") 
      { 
       XmlDocument xml = new XmlDocument(); 
       xml.LoadXml(crawledPage.Content.Text); 

       if (xml.DocumentElement == null) return new Uri[] {}; 


       XmlNamespaceManager manager = new XmlNamespaceManager(xml.NameTable); 
       manager.AddNamespace("s", xml.DocumentElement.NamespaceURI); 


       var links = xml.SelectNodes("/s:sitemapindex/s:sitemap", manager); 
       if(links == null) return new Uri[] { }; 
       return links 
         .Cast<XmlNode>() 
         .Select(x => new Uri(x.InnerText)); 




      } 



      return _linkParser.GetLinks(crawledPage); 

     } 
    } 
    class Program 
    { 
     static void Main(string[] args) 
     { 
      SiteMapFinder finder = new SiteMapFinder(); 
      PoliteWebCrawler crawler = new PoliteWebCrawler(null, null, null, null, null, finder, null, null, null); 


      crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted; 
      CrawlResult result = crawler.Crawl(new Uri("http://tenders.rfpalertservices.com/sitemap/")); 


     } 

     private static void Crawler_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e) 
     { 
      Console.WriteLine(e.CrawledPage.Uri.AbsoluteUri); 
      e.CrawledPage.HttpWebResponse.Headers.AllKeys.ForEach(k => Console.WriteLine($"{k}: {e.CrawledPage.HttpWebResponse.Headers[k]}")); 
     } 
    } 
}