
I am using Scrapy 0.16.2 on Linux and I get an unhandled exception. I run:

scrapy crawl mycrawlspider -s JOBDIR=/mnt/mycrawlspider 

I get the error below, which blocks Scrapy (it hangs and never finishes on its own; only ^C stops it):

2012-11-20 15:04:51+0000 [-] Unhandled Error
Traceback (most recent call last):
  File "/usr/lib/python2.7/site-packages/scrapy/commands/crawl.py", line 45, in run
    self.crawler.start()
  File "/usr/lib/python2.7/site-packages/scrapy/crawler.py", line 80, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1169, in run
    self.mainLoop()
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1178, in mainLoop
    self.runUntilCurrent()
--- <exception caught here> ---
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 800, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/usr/lib/python2.7/site-packages/scrapy/utils/reactor.py", line 41, in __call__
    return self._func(*self._a, **self._kw)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 116, in _next_request
    self.crawl(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 172, in crawl
    self.schedule(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 176, in schedule
    return self.slots[spider].scheduler.enqueue_request(request)
  File "/usr/lib/python2.7/site-packages/scrapy/core/scheduler.py", line 48, in enqueue_request
    if not request.dont_filter and self.df.request_seen(request):
exceptions.AttributeError: 'NoneType' object has no attribute 'dont_filter'

BTW, this used to work in version 0.14.

Here is the code:
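(The imports are not shown below; for Scrapy 0.16 they would presumably be roughly the following. USER_AGENT_LIST is a constant defined elsewhere in the project, not part of Scrapy.)

from datetime import datetime, date, timedelta
from urlparse import urlparse
import os
import random
import shutil

import httplib2

from scrapy import signals
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.item import Item
from scrapy.linkextractor import IGNORED_EXTENSIONS
from scrapy.xlib.pydispatch import dispatcher

# USER_AGENT_LIST: a plain list of user-agent strings, defined elsewhere in the project.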

class MySpider(CrawlSpider): 
    name = 'alrroya' 

    NEW_IGNORED_EXTENSIONS = list(IGNORED_EXTENSIONS) 
    NEW_IGNORED_EXTENSIONS.remove('pdf') 

    download_delay = 0.05 
    # Stay within these domains when crawling 
    allowed_domains = [] 

    all_domains = {} 

    start_urls = [] 

    # Add our callback which will be called for every found link 
    rules = [ 
     Rule(SgmlLinkExtractor(deny_extensions=NEW_IGNORED_EXTENSIONS, tags=('a', 'area', 'frame', 'iframe'), attrs=('href', 'src')), follow=True, callback='parse_crawled_page') 
    ] 

    # How many pages crawled 
    crawl_count = 0 

    # How many PDFs we have found 
    pdf_count = 0 

    def __init__(self, *args, **kwargs): 
     CrawlSpider.__init__(self, *args, **kwargs) 
     dispatcher.connect(self._spider_closed, signals.spider_closed) 
     dispatcher.connect(self._spider_opened, signals.spider_opened) 
     self.load_allowed_domains_and_start_urls() 

    def allowed_to_start(self): 
     curr_date = datetime.today() 
     curr_date = datetime(curr_date.year, curr_date.month, curr_date.day) 
     jobdir = self.settings['JOBDIR'] 
     if jobdir: 
      mnt = os.path.dirname(os.path.normpath(jobdir)) 
     else: 
      mnt = '' 

     checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name) 
     day = timedelta(days=1) 
     if os.path.exists(checkfile): 
      f = open(checkfile, 'r') 
      data = f.read() 
      f.close() 
      data = data.split('\n') 
      reason = data[0] 
      try: 
       reason_date = datetime.strptime(data[1], '%Y-%m-%d') 

      except Exception as ex: 
       reason_date = None 

      if reason_date and 'shutdown' in reason: 
       reason = True 

      else: 
       if reason_date and reason_date + day <= curr_date and 'finished' in reason: 
        reason = True 

       else: 
        reason = False 
     else: 
      reason = True 

     return reason 

    def _spider_opened(self, spider): 
     if spider is not self: 
      return 

     curr_date = datetime.today() 
     curr_date = datetime(curr_date.year, curr_date.month, curr_date.day) 
     jobdir = spider.settings['JOBDIR'] 
     if jobdir: 
      mnt = os.path.dirname(os.path.normpath(jobdir)) 
     else: 
      mnt = '' 

     checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name) 
     day = timedelta(days=1) 
     if os.path.exists(checkfile): 
      f = open(checkfile, 'r') 
      data = f.read() 
      f.close() 
      data = data.split('\n') 
      reason = data[0] 
      try: 
       reason_date = datetime.strptime(data[1], '%Y-%m-%d') 

      except Exception as ex: 
       reason_date = None 

      if reason_date and 'shutdown' in reason: 
       f = open(checkfile, 'w') 
       f.write('started\n') 
       f.write(str(date.today())) 
       f.close() 

      else: 
       if reason_date and reason_date + day <= curr_date and 'finished' in reason: 
        f = open(checkfile, 'w') 
        f.write('started\n') 
        f.write(str(date.today())) 
        f.close() 

       else: 
        crawler.engine.close_spider(self, 'finished') 
        if jobdir and os.path.exists(jobdir): 
         shutil.rmtree(jobdir) 
         f = open(checkfile, 'w') 
         f.write('finished\n') 
         f.write(str(date.today())) 
         f.close() 
        os._exit(1) 
     else: 
      f = open(checkfile, 'w') 
      f.write('started\n') 
      f.write(str(date.today())) 
      f.close() 

    def _spider_closed(self, spider, reason): 
     if spider is not self: 
      return 

     jobdir = spider.settings['JOBDIR'] 
     if jobdir: 
      mnt = os.path.dirname(os.path.normpath(jobdir)) 
     else: 
      mnt = '' 

     checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name) 
     if 'shutdown' in reason: 
      f = open(checkfile, 'w') 
      f.write('shutdown\n') 
      f.write(str(date.today())) 
      f.close() 
     else: 
      if jobdir and os.path.exists(jobdir): 
       shutil.rmtree(jobdir) 
       f = open(checkfile, 'w') 
       f.write('finished\n') 
       f.write(str(date.today())) 
       f.close() 

    def _requests_to_follow(self, response): 
     if getattr(response, 'encoding', None) is not None: 
      return CrawlSpider._requests_to_follow(self, response) 
     else: 
      return [] 

    def make_requests_from_url(self, url): 
     http_client = httplib2.Http() 
     try: 
      headers = { 
       'content-type': 'text/html', 
       'user-agent': random.choice(USER_AGENT_LIST) 
      } 
      response, content = http_client.request(url, method='HEAD', headers=headers) 
      #~ if 'pdf' in response['content-type'].lower() or (url.endswith('.pdf') and 'octet-stream' in response['content-type'].lower()): 
      if 'pdf' in response['content-type'].lower() or 'octet-stream' in response['content-type'].lower(): 
       if self.allowed_to_start(): 
        self.get_pdf_link(url) 

      else: 
       return CrawlSpider.make_requests_from_url(self, url) 

     except Exception as ex: 
      return CrawlSpider.make_requests_from_url(self, url) 

    def get_pdf_link(self, url): 
     source = self.__class__.name 
     parsed_url = urlparse(url) 
     url_domain = parsed_url.netloc 
     url_path = parsed_url.path 
     if url_domain: 
      for domain, paths in self.__class__.all_domains[source]['allow_domains'].iteritems(): 
       if url_domain.endswith(domain): 
        pre_and = False 
        pre_or = False 
        and_cond = True 
        or_cond = False 
        for path in paths: 
         if path[0:1] == '!': 
          pre_and = True 
          if path[1:] not in url_path: 
           and_cond = and_cond and True 
          else: 
           and_cond = and_cond and False 

         else: 
          pre_or = True 
          if path in url_path: 
           or_cond = or_cond or True 
          else: 
           or_cond = or_cond or False 

        if pre_and and pre_or: 
         if and_cond and or_cond: 
          self.pdf_process(source, url) 
          return 
        elif pre_and: 
         if and_cond: 
          self.pdf_process(source, url) 
          return 
        elif pre_or: 
         if or_cond: 
          self.pdf_process(source, url) 
          return 
        else: 
         self.pdf_process(source, url) 
         return 

    def parse_crawled_page(self, response): 
     self.__class__.crawl_count += 1 
     crawl_count = self.__class__.crawl_count 
     if crawl_count % 100 == 0: 
      print 'Crawled %d pages' % crawl_count 

     if 'pdf' in response.headers.get('content-type', '').lower(): 
      self.get_pdf_link(response.url) 

     return Item() 

    def load_allowed_domains_and_start_urls(self): 
     day = timedelta(days=1) 
     currdate = date.today() 

     alrroya = ('http://epaper.alrroya.com/currentissues.php?editiondt=' + currdate.strftime('%Y/%m/%d'),) 

     self.__class__.all_domains = { 
      'alrroya': { 
       'start_urls': alrroya, 
       'allow_domains': { 
        'epaper.alrroya.com': frozenset(()), 
       } 
      } 
     } 

     for domain in self.__class__.all_domains[self.__class__.name]['allow_domains']: 
      self.__class__.allowed_domains.append(domain) 

     self.__class__.start_urls.extend(self.__class__.all_domains[self.__class__.name]['start_urls']) 

    def pdf_process(self, source, url): 
     print '!!! ' + source + ' ' + url 
Post the code from 'mycrawlspider' –

Answer


This appears to be a bug in Scrapy. The current version does not seem to accept a list returned from make_requests_from_url(). I was able to work around the problem by modifying the Scrapy code as follows.

In the file Scrapy-0.16.5-py2.7.egg/scrapy/spider.py, change:

def start_requests(self):
    for url in self.start_urls:
        yield self.make_requests_from_url(url)

to:

def start_requests(self):
    for url in self.start_urls:
        requests = self.make_requests_from_url(url)
        if type(requests) is list:
            for request in requests:
                yield request
        else:
            yield requests

My guess is that the official Scrapy developers will eventually fix this.
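If you would rather not patch the installed egg, the same normalization can be done by overriding start_requests() in the spider itself. This is only a sketch, assuming make_requests_from_url() may return a single Request, a list of Requests, or None (as MySpider's version above does when it hands a PDF URL to get_pdf_link()):

    def start_requests(self):
        # Defined on MySpider itself instead of editing spider.py inside the egg.
        for url in self.start_urls:
            requests = self.make_requests_from_url(url)
            if requests is None:
                # Nothing to schedule for this URL (e.g. it was handled as a PDF).
                continue
            if isinstance(requests, list):
                for request in requests:
                    yield request
            else:
                yield requests

Either way, None and list return values no longer reach the scheduler, whose access to request.dont_filter is what raises the 'NoneType' object has no attribute 'dont_filter' error.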