
CrawlSpider does not follow the rules on some websites

I am trying to get my first scrapy project going and I have run into a strange problem. For some websites my crawler works fine; for others it does not follow the rules for extracting links. I searched on SO and saw other people with similar problems, but in their case a malformed allow parameter caused Filtered offsite request errors, which is not happening to me. My log is here: http://pastebin.com/r1pXmeJW (first the url that fails, then one that works correctly, since I cannot post more than 2 links...).

My spider is controlled by a Python script that uses the API:

# -*- coding: utf-8 -*- 

from twisted.internet import reactor 
from scrapy.crawler import Crawler 
from scrapy import log, signals 
from scrapy.utils.project import get_project_settings 
from govcrawl.spiders.main_spider import DomainSpider 
import sys, urlparse, re 
from scrapy.contrib.spiders import Rule 
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor 

args = sys.argv[1].split('§') 
url_id = args[0] 
start_url = args[1] 
url_parts = urlparse.urlparse(start_url) 
allowed_domain = url_parts.netloc 
allowed_path = '/'.join(url_parts.path.split('/')[:-1]) 
cur_state = sys.argv[2] 

spider = DomainSpider(
    start_urls = [start_url],
    allowed_domains = [allowed_domain],
    url_id = url_id,
    cur_state = cur_state,
    rules = (
        Rule(
            LxmlLinkExtractor(
                allow = re.compile(r".*%s.*" % re.escape(allowed_path), re.IGNORECASE),
                allow_domains = [allowed_domain],
                tags = ('a', 'area', 'frame'),
                attrs = ('href', 'src')
            ),
            callback = "parse_items",
            follow = True
        ),
    )
)
settings = get_project_settings() 
crawler = Crawler(settings) 
crawler.signals.connect(reactor.stop, signal = signals.spider_closed) 
crawler.configure() 
crawler.crawl(spider) 
crawler.start() 
log.start() 
reactor.run() 
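
For reference, the script takes a single §-separated argument holding the url id and the start url, plus a second argument for cur_state. A minimal sketch with a made-up invocation and URL, just to show what the derived values end up being:

# -*- coding: utf-8 -*-
# Hypothetical invocation (script name, id, URL and state are made up):
#
#   python run_domain_spider.py "42§http://www.example.gov/agency/index.html" some_state
#
# which the script above splits and parses as follows:
import urlparse

url_id, start_url = "42§http://www.example.gov/agency/index.html".split('§')
url_parts = urlparse.urlparse(start_url)
print url_id                                     # 42
print url_parts.netloc                           # www.example.gov -> allowed_domains
print '/'.join(url_parts.path.split('/')[:-1])   # /agency -> allowed_path used in the allow regex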

And this is my DomainSpider:

import re 
from govcrawl.items import DomainItem 
from scrapy.utils.markup import remove_tags 
from scrapy.contrib.spiders import CrawlSpider 
from scrapy import log 

class DomainSpider(CrawlSpider):
    name = "govcrawl_main"

    def parse_start_url(self, response):
        return self.parse_items(response)

    def parse_items(self, response):
        pages_done = self.crawler.stats.get_value('downloader/response_count')
        pages_todo = self.crawler.stats.get_value('scheduler/enqueued') - self.crawler.stats.get_value('downloader/response_count')
        log.msg("URL: %s (%s) Crawled %d pages. To Crawl: %d" % (self.start_urls[0], self.url_id, pages_done, pages_todo), spider = self)
        # collect the absolute links found on the page
        links = []
        for sel in response.xpath('//a'):
            href = sel.xpath('@href').extract()
            if len(href) > 0:
                href = href[0]
                if href.startswith("http"):
                    links.append(href)
        item = DomainItem()
        item["url"] = response.url
        item["text"] = re.sub(r'\s{2,}', ' ', remove_tags(' '.join(response.xpath('//body//text()').extract()))).strip()
        item["links"] = links
        self.crawler.stats.inc_value('pages_crawled')
        yield item

Any idea how to make the crawler follow the rules for the websites where it fails?


With ipdb I was able to set a breakpoint here https://github.com/scrapy/scrapy/blob/master/scrapy/contrib/linkextractors/lxmlhtml.py#L97 and see that the content of the html body was not correct and contained no links. I **know** the server is sending me the right html, so the problem lies somewhere between fetching the data and the call to the link extractor... – Mikk 2014-11-04 22:56:06
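
One way to check what each parser actually sees is to feed the same raw body to both and compare the links they extract. A minimal sketch, assuming the failing page has been saved locally as failing_page.html (the filename is made up) and that both lxml and html5lib are installed:

# Compare what lxml and html5lib extract from the same (possibly malformed) html.
# failing_page.html is a hypothetical local copy of the page that fails.
import lxml.html
from bs4 import BeautifulSoup

raw = open("failing_page.html").read()

lxml_links = lxml.html.fromstring(raw).xpath('//a/@href')
soup_links = [a.get('href') for a in BeautifulSoup(raw, "html5lib").find_all('a', href=True)]

print "lxml found %d links, html5lib found %d links" % (len(lxml_links), len(soup_links))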

Answer


It turns out that the pages returning the error had malformed html code with multiple </html> tags, which the lxml parser does not like. Since scrapy does not let you use a different parser with CrawlSpider, I ended up reimplementing a regular Spider object that behaves more or less like a CrawlSpider:

import urlparse, re 
from scrapy import Spider, log 
from bs4 import BeautifulSoup 
from scrapy.http import Request 
from govcrawl.items import DomainItem 

class DomainSimpleSpider(Spider):
    name = "govcrawl_simple"

    def parse(self, response):
        pages_done = self.crawler.stats.get_value('downloader/response_count')
        pages_todo = self.crawler.stats.get_value('scheduler/enqueued') - self.crawler.stats.get_value('downloader/response_count')
        log.msg("URL: %s (%s) Crawled %d pages. To Crawl: %d" % (self.start_urls[0], self.url_id, pages_done, pages_todo), spider = self)
        #import ipdb
        #ipdb.set_trace()
        # parse the raw body with html5lib, which tolerates the malformed markup
        soup = BeautifulSoup(response._body, "html5lib")
        links = []
        for tag in self.tags:
            for a in soup.find_all(tag):
                for attr in self.attrs:
                    if attr in a.attrs:
                        href = a.attrs[attr]
                        if href.startswith("http"):
                            links.append(href)
                        href = urlparse.urljoin(response.url, href)
                        href_parts = urlparse.urlparse(href.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '+'))
                        if re.match(self.allow, href_parts.path) and not self.forbidden_extension(href_parts.path):
                            yield Request(href)
        # drop <script> and <style> contents before extracting the page text
        for script in soup(["script", "style"]):
            script.extract()
        item = DomainItem()
        item["url"] = response.url
        #item["text"] = re.sub(r'\s{2,}', ' ', remove_tags(' '.join(response.xpath('//body//text()').extract()))).strip()
        item["text"] = soup.get_text()
        item["links"] = links
        self.crawler.stats.inc_value('pages_crawled')
        yield item

    def forbidden_extension(self, url):
        url = url.lower()
        return url.endswith((
            "pdf", "jpg", "jpeg", "png", "gif", "odf", "ods",
            "doc", "docx", "xls", "xlsx", "ppt", "pptx",
            "mp3", "mp4", "wav", "wma", "wmv", "avi", "mov", "flv", "vob",
            "zip", "gz", "tar", "7z", "rar"
        ))

This spider can be controlled with the following Python script:

from twisted.internet import reactor 
from scrapy.crawler import Crawler 
from scrapy import log, signals 
from scrapy.utils.project import get_project_settings 
from govcrawl.spiders.simple_spider import DomainSimpleSpider 
import urlparse, re 
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor 

start_url = ... 
url_parts = urlparse.urlparse(start_url) 
allowed_domain = url_parts.netloc 
allowed_path = '/'.join(url_parts.path.split('/')[:-1]) 

spider = DomainSimpleSpider(
    start_urls = [start_url], 
    allowed_domains = [allowed_domain], 
    allow = re.compile(r".*%s.*" % re.escape(allowed_path), re.IGNORECASE), 
    tags = ('a', 'area', 'frame'), 
    attrs = ('href', 'src'), 
    response_type_whitelist = [r"text/html", r"application/xhtml+xml", r"application/xml"] 
) 
settings = get_project_settings() 
crawler = Crawler(settings) 
crawler.signals.connect(reactor.stop, signal = signals.spider_closed) 
crawler.configure() 
crawler.crawl(spider) 
crawler.start() 
log.start() 
reactor.run() 

A few things to note:

1. I use the html5lib parser from BeautifulSoup instead of lxml. html5lib copes well with the multiple </html> tags, but it is an external dependency, so you have to install it.

2. For some reason the mimetype check did not seem to work, so I added a forbidden_extension function to prevent Requests from being created for non-html files, and I had to add another DownloaderMiddleware that makes use of the spider's response_type_whitelist (see Python Scrapy - mimetype based filter to avoid non-text file downloads for the middleware implementation); a sketch of such a middleware follows this list.

3. It seems that this spider processes the start page twice, but frankly I did not bother fixing that.
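
For reference, here is roughly what such a mimetype-filtering middleware can look like: a minimal sketch following the approach in the linked question, not exact working code; the class name, module path and priority value are made up:

# Hypothetical middleware (e.g. in govcrawl/middlewares.py) that drops responses
# whose Content-Type does not match the spider's response_type_whitelist.
import re
from scrapy.exceptions import IgnoreRequest

class ResponseTypeFilterMiddleware(object):
    def process_response(self, request, response, spider):
        whitelist = getattr(spider, 'response_type_whitelist', None)
        if whitelist is None:
            return response
        content_type = response.headers.get('Content-Type', '')
        for pattern in whitelist:
            if re.search(pattern, content_type):
                return response
        raise IgnoreRequest("Filtered Content-Type: %s" % content_type)

# Enabled in settings.py with something like:
# DOWNLOADER_MIDDLEWARES = {'govcrawl.middlewares.ResponseTypeFilterMiddleware': 543}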
