
Debugging a traceback in Python/Scrapy

I need some help with a traceback error. I have a Scrapy spider that is meant to scrape a website and store the URL, date and title in MongoDB, but under certain conditions it clearly hits a processing error. I'm not sure where to start debugging. Here is the error:

2016-02-19 18:34:47 [scrapy] DEBUG: Crawled (200) <GET http://www.stuff.co.nz/business/farming/77040713/farmer-thomas-king-loses-supreme-court-case-over-farm-eviction> (referer: http://www.stuff.co.nz/business/) 
2016-02-19 18:34:47 [scrapy] ERROR: Error processing {'date': 'Thu Feb 18 19:49:51 UTC 2016', 
'title': 'Farmer loses eviction case', 
'url': 'http://www.stuff.co.nz/business/farming/77040713/farmer-thomas-king-loses-supreme-court-case-over-farm-eviction'} 
Traceback (most recent call last): 
    File "/usr/lib64/python2.7/site-packages/twisted/internet/defer.py", line 588, in _runCallbacks 
    current.result = callback(current.result, *args, **kw) 
    File "/usr/lib/python2.7/site-packages/scrapy_mongodb.py", line 222, in process_item 
    return self.insert_item(item, spider) 
    File "/usr/lib/python2.7/site-packages/scrapy_mongodb.py", line 251, in insert_item 
    self.collection.insert(item, continue_on_error=True) 
    File "/usr/lib64/python2.7/site-packages/pymongo/collection.py", line 1926, in insert 
    check_keys, manipulate, write_concern) 
    File "/usr/lib64/python2.7/site-packages/pymongo/collection.py", line 430, in _insert 
    gen(), check_keys, self.codec_options, sock_info) 
    File "/usr/lib64/python2.7/site-packages/pymongo/pool.py", line 254, in write_command 
    helpers._check_command_response(result) 
    File "/usr/lib64/python2.7/site-packages/pymongo/helpers.py", line 188, in _check_command_response 
    raise OperationFailure(msg % errmsg, code, response) 
OperationFailure: not authorized on article to execute command { insert: "stuffconz", ordered: false, writeConcern: { fsync: false }, documents: [ { _id: ObjectId('56c6a97702f22371605f4668'), url: "http://www.stuff.co.nz/business/farming/77040713/farmer-thomas-king-loses-supreme-court-case-over-farm-eviction", date: "Thu Feb 18 19:49:51 UTC 2016", title: "Farmer loses eviction case" } ] } 

Here is the Scrapy spider:

from __future__ import absolute_import 

from scrapy import Spider 
from scrapy.selector import Selector 
from scrapy.http import Request 
from scrapy.linkextractors import LinkExtractor 
from harland.items import * 
from scrapy.spiders import CrawlSpider, Rule 

class StuffSpider(CrawlSpider):
    name = "stuff"
    allowed_domains = ["stuff.co.nz"]
    start_urls = [
        "http://stuff.co.nz/business/",
    ]

    rules = (
        Rule(LinkExtractor(allow=".*/business.*"), callback='parse_article_page', follow=True),
    )

    def parse_article_page(self, response):
        article = Selector(response)
        page = Selector(response).xpath('/html/head/meta[9]')
        page_type = page.xpath('//meta[@property="og:type"]/@content').extract()[0]

        if "article" in page_type:
            item = StuffItem()

            item_url = page.xpath('//meta[@property="og:url"]/@content').extract()[0]
            item['url'] = str(item_url)
            item_title = page.xpath('//meta[@property="og:title"]/@content').extract()[0]
            item['title'] = str(item_title)
            item_date = page.xpath('//*[@itemprop="datePublished"]/@content').extract()[0]
            item['date'] = str(item_date)
            yield item
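
(StuffItem comes from harland.items, which isn't shown in the question; judging from the fields the spider fills in, it is presumably something along the lines of this sketch.)

import scrapy

# Hypothetical harland/items.py, inferred from the fields the spider sets
# (url, title, date); the real definition is not included in the question.
class StuffItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    date = scrapy.Field()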

And here is the pipeline, taken directly from https://github.com/sebdah/scrapy-mongodb/blob/master/scrapy_mongodb.py:

""" 
scrapy-mongodb - MongoDB pipeline for Scrapy 
Homepage: https://github.com/sebdah/scrapy-mongodb 
Author: Sebastian Dahlgren <[email protected]> 
License: Apache License 2.0 <http://www.apache.org/licenses/LICENSE-2.0.html> 
Copyright 2013 Sebastian Dahlgren 
Licensed under the Apache License, Version 2.0 (the "License"); 
you may not use this file except in compliance with the License. 
You may obtain a copy of the License at 
    http://www.apache.org/licenses/LICENSE-2.0 
Unless required by applicable law or agreed to in writing, software 
distributed under the License is distributed on an "AS IS" BASIS, 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and 
limitations under the License. 
""" 
import datetime 

from pymongo import errors 
from pymongo.mongo_client import MongoClient 
from pymongo.mongo_replica_set_client import MongoReplicaSetClient 
from pymongo.read_preferences import ReadPreference 

from scrapy import log 
from scrapy.contrib.exporter import BaseItemExporter 

VERSION = '0.9.1' 


def not_set(string): 
    """ Check if a string is None or '' 
    :returns: bool - True if the string is empty 
    """ 
    if string is None: 
     return True 
    elif string == '': 
     return True 
    return False 


class MongoDBPipeline(BaseItemExporter): 
    """ MongoDB pipeline class """ 
    # Default options 
    config = { 
     'uri': 'mongodb://harland:Ase0peedi@localhost:27017',
     'fsync': False, 
     'write_concern': 0, 
     'database': 'article', 
     'collection': 'stuffconz', 
     'replica_set': None, 
     'unique_key': None, 
     'buffer': None, 
     'append_timestamp': True, 
     'stop_on_duplicate': 0, 
    } 

    # Item buffer 
    current_item = 0 
    item_buffer = [] 

    # Duplicate key occurence count 
    duplicate_key_count = 0 

    def load_spider(self, spider): 
     self.crawler = spider.crawler 
     self.settings = spider.settings 

     # Versions prior to 0.25 
     if not hasattr(spider, 'update_settings') and hasattr(spider, 'custom_settings'): 
      self.settings.setdict(spider.custom_settings or {}, priority='project') 

    def open_spider(self, spider): 
     self.load_spider(spider) 

     # Configure the connection 
     self.configure() 

     if self.config['replica_set'] is not None: 
      connection = MongoReplicaSetClient(
       self.config['uri'], 
       replicaSet=self.config['replica_set'], 
       w=self.config['write_concern'], 
       fsync=self.config['fsync'], 
       read_preference=ReadPreference.PRIMARY_PREFERRED) 
     else: 
      # Connecting to a stand alone MongoDB 
      connection = MongoClient(
       self.config['uri'], 
       fsync=self.config['fsync'], 
       read_preference=ReadPreference.PRIMARY) 

     # Set up the collection 
     database = connection[self.config['database']] 
     self.collection = database[self.config['collection']] 
     log.msg(u'Connected to MongoDB {0}, using "{1}/{2}"'.format(
      self.config['uri'], 
      self.config['database'], 
      self.config['collection'])) 

     # Ensure unique index 
     if self.config['unique_key']: 
      self.collection.ensure_index(self.config['unique_key'], unique=True) 
      log.msg(u'Ensuring index for key {0}'.format(
       self.config['unique_key'])) 

     # Get the duplicate on key option 
     if self.config['stop_on_duplicate']: 
      tmpValue = self.config['stop_on_duplicate'] 
      if tmpValue < 0: 
       log.msg(
        (
         u'Negative values are not allowed for' 
         u' MONGODB_STOP_ON_DUPLICATE option.' 
        ), 
        level=log.ERROR 
       ) 
       raise SyntaxError(
        (
         'Negative values are not allowed for' 
         ' MONGODB_STOP_ON_DUPLICATE option.' 
        ) 
       ) 
      self.stop_on_duplicate = self.config['stop_on_duplicate'] 
     else: 
      self.stop_on_duplicate = 0 

    def configure(self): 
     """ Configure the MongoDB connection """ 
     # Handle deprecated configuration 
     if not not_set(self.settings['MONGODB_HOST']): 
      log.msg(
       u'DeprecationWarning: MONGODB_HOST is deprecated', 
       level=log.WARNING) 
      mongodb_host = self.settings['MONGODB_HOST'] 

      if not not_set(self.settings['MONGODB_PORT']): 
       log.msg(
        u'DeprecationWarning: MONGODB_PORT is deprecated', 
        level=log.WARNING) 
       self.config['uri'] = 'mongodb://{0}:{1:i}'.format(
        mongodb_host, 
        self.settings['MONGODB_PORT']) 
      else: 
       self.config['uri'] = 'mongodb://{0}:27017'.format(mongodb_host) 

     if not not_set(self.settings['MONGODB_REPLICA_SET']): 
      if not not_set(self.settings['MONGODB_REPLICA_SET_HOSTS']): 
       log.msg(
        (
         u'DeprecationWarning: ' 
         u'MONGODB_REPLICA_SET_HOSTS is deprecated' 
        ), 
        level=log.WARNING) 
       self.config['uri'] = 'mongodb://{0}'.format(
        self.settings['MONGODB_REPLICA_SET_HOSTS']) 

     # Set all regular options 
     options = [ 
      ('uri', 'MONGODB_URI'), 
      ('fsync', 'MONGODB_FSYNC'), 
      ('write_concern', 'MONGODB_REPLICA_SET_W'), 
      ('database', 'MONGODB_DATABASE'), 
      ('collection', 'MONGODB_COLLECTION'), 
      ('replica_set', 'MONGODB_REPLICA_SET'), 
      ('unique_key', 'MONGODB_UNIQUE_KEY'), 
      ('buffer', 'MONGODB_BUFFER_DATA'), 
      ('append_timestamp', 'MONGODB_ADD_TIMESTAMP'), 
      ('stop_on_duplicate', 'MONGODB_STOP_ON_DUPLICATE') 
     ] 

     for key, setting in options: 
      if not not_set(self.settings[setting]): 
       self.config[key] = self.settings[setting] 

     # Check for illegal configuration 
     if self.config['buffer'] and self.config['unique_key']: 
      log.msg(
       (
        u'IllegalConfig: Settings both MONGODB_BUFFER_DATA ' 
        u'and MONGODB_UNIQUE_KEY is not supported' 
       ), 
       level=log.ERROR) 
      raise SyntaxError(
       (
        u'IllegalConfig: Settings both MONGODB_BUFFER_DATA ' 
        u'and MONGODB_UNIQUE_KEY is not supported' 
       )) 

    def process_item(self, item, spider): 
     """ Process the item and add it to MongoDB 
     :type item: Item object 
     :param item: The item to put into MongoDB 
     :type spider: BaseSpider object 
     :param spider: The spider running the queries 
     :returns: Item object 
     """ 
     item = dict(self._get_serialized_fields(item)) 

     if self.config['buffer']: 
      self.current_item += 1 

      if self.config['append_timestamp']: 
       item['scrapy-mongodb'] = {'ts': datetime.datetime.utcnow()} 

      self.item_buffer.append(item) 

      if self.current_item == self.config['buffer']: 
       self.current_item = 0 
       return self.insert_item(self.item_buffer, spider) 

      else: 
       return item 

     return self.insert_item(item, spider) 

    def close_spider(self, spider): 
     """ Method called when the spider is closed 
     :type spider: BaseSpider object 
     :param spider: The spider running the queries 
     :returns: None 
     """ 
     if self.item_buffer: 
      self.insert_item(self.item_buffer, spider) 

    def insert_item(self, item, spider): 
     """ Process the item and add it to MongoDB 
     :type item: (Item object) or [(Item object)] 
     :param item: The item(s) to put into MongoDB 
     :type spider: BaseSpider object 
     :param spider: The spider running the queries 
     :returns: Item object 
     """ 
     if not isinstance(item, list): 
      item = dict(item) 

      if self.config['append_timestamp']: 
       item['scrapy-mongodb'] = {'ts': datetime.datetime.utcnow()} 

     if self.config['unique_key'] is None: 
      try: 
       self.collection.insert(item, continue_on_error=True) 
       log.msg(
        u'Stored item(s) in MongoDB {0}/{1}'.format(
         self.config['database'], self.config['collection']), 
        level=log.DEBUG, 
        spider=spider) 
      except errors.DuplicateKeyError: 
       log.msg(u'Duplicate key found', level=log.DEBUG) 
       if (self.stop_on_duplicate > 0): 
        self.duplicate_key_count += 1 
        if (self.duplicate_key_count >= self.stop_on_duplicate): 
         self.crawler.engine.close_spider(
          spider, 
          'Number of duplicate key insertion exceeded' 
         ) 
       pass 

     else: 
      key = {} 
      if isinstance(self.config['unique_key'], list): 
       for k in dict(self.config['unique_key']).keys(): 
        key[k] = item[k] 
      else: 
       key[self.config['unique_key']] = item[self.config['unique_key']] 

      self.collection.update(key, item, upsert=True) 

      log.msg(
       u'Stored item(s) in MongoDB {0}/{1}'.format(
        self.config['database'], self.config['collection']), 
       level=log.DEBUG, 
       spider=spider) 

     return item 
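
For completeness, scrapy-mongodb is normally configured through the project settings rather than by editing the config defaults in the pipeline file itself. Below is a minimal sketch of what that could look like in the project's settings.py, mirroring the values hard-coded above (the pipeline path matches the installed scrapy_mongodb module):

# settings.py (sketch) -- enable the pipeline and point it at MongoDB.
ITEM_PIPELINES = {
    'scrapy_mongodb.MongoDBPipeline': 300,
}

MONGODB_URI = 'mongodb://harland:Ase0peedi@localhost:27017'
MONGODB_DATABASE = 'article'
MONGODB_COLLECTION = 'stuffconz'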

If anyone could just point me in the right direction, it would be such a help! I've been playing with this for days, and I'm starting to feel I don't know Python and Scrapy well enough.

Cheers,


I'm not familiar with MongoDB, but this looks like a permissions/authorization error while trying to insert a document into the "article" collection. As far as I can tell, this isn't a Scrapy problem. Do the credentials from 'mongodb://harland:Ase0peedi@localhost:27017' actually work when you try to insert a document with a MongoDB client? –
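
One way to check that directly, outside Scrapy, is a minimal pymongo session reusing the same URI, database and collection as the pipeline config (a sketch; the test document is just a placeholder):

from pymongo import MongoClient

# Try the same insert the pipeline performs, but from a plain Python shell,
# using the URI, database and collection from the pipeline config.
client = MongoClient('mongodb://harland:Ase0peedi@localhost:27017')
collection = client['article']['stuffconz']

# If the user has no write access on the "article" database, this raises
# the same "not authorized" OperationFailure seen in the Scrapy log.
collection.insert_one({'url': 'test', 'title': 'test', 'date': 'test'})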


Yes, those are the credentials, which is the strange part, because I can authenticate against the database with them... but as soon as Scrapy tries to insert something I get the authorization error. I'm really stuck! – Hamish
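
If the credentials authenticate but inserts fail, one possible cause is that the harland user was created on (and authenticates against) a different database, e.g. admin, and simply has no role on the article database. A sketch of granting it one via pymongo, assuming the user lives in admin and using placeholder administrator credentials:

from pymongo import MongoClient

# Placeholder admin credentials (assumption) -- run with a user that is
# allowed to manage roles, on the database where "harland" was created.
admin = MongoClient('mongodb://adminUser:adminPass@localhost:27017/admin')
admin['admin'].command(
    'grantRolesToUser', 'harland',
    roles=[{'role': 'readWrite', 'db': 'article'}])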

Answer


Before the parse_article_page method runs, you need to log in to the website you are scraping data from, using Scrapy itself.

The error states it clearly: "OperationFailure: not authorized on article to execute command", meaning you are not authorized to execute the command.

import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def start_requests(self):
        return [scrapy.FormRequest("http://www.example.com/login",
                                   formdata={'user': 'john', 'pass': 'secret'},
                                   callback=self.logged_in)]

    def logged_in(self, response):
        # here you would extract links to follow and return Requests for
        # each of them, with another callback
        pass

I don't think this refers to the website, but to the database. The website doesn't require authentication to view the article. The database is called "article". – Hamish