2016-12-02

I wrote a program to crawl a Chinese website, but something is wrong: when I run the project nothing happens, and I don't know why. Here is the code. How do I configure the Scrapy user agent in Scrapy version 1.21?

~/L/crawlAll$ tree 
. 
├── crawlAll 
│   ├── __init__.py 
│   ├── items.py 
│   ├── pipelines.py 
│   ├── settings.py 
│   ├── spiders 
│   │   ├── __init__.py 
│   │   └── TouTiao.py 
│   └── useragent.py 
├── LICENSE 
├── README.md 
└── scrapy.cfg 

File: useragent.py

# -*- coding: utf-8 -*-

# import logging
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class MyUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent='Scrapy'):
        super(MyUserAgentMiddleware, self).__init__()
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # Pick a random user agent for every outgoing request.
        ua = random.choice(self.user_agent_list)
        if ua:
            # logger = logging.getLogger('')
            print("******Current User Agent: %s***********" % ua)
            # logging.warning("Current User Agent: " + ua, logging.INFO)
            request.headers.setdefault('User-Agent', ua)

    user_agent_list = [
     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 " 
     "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 
     "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 " 
     "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 
     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 " 
     "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 
     "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 " 
     "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 
     "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 " 
     "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 
     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 " 
     "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 
     "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 " 
     "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 
     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " 
     "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 
     "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 " 
     "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 " 
     "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 
     "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " 
     "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 
     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " 
     "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 
     "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " 
     "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 
     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " 
     "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 
     "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 " 
     "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 
     "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " 
     "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 
     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 " 
     "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 
     "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 " 
     "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 
    ] 
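
A side note on the commented-out logging lines: instead of print, the chosen user agent could be reported through the spider's logger, which Scrapy exposes on every spider. A minimal sketch, assuming the rest of the middleware stays as above:

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # spider.logger writes into the normal Scrapy crawl log.
            spider.logger.info("Current User-Agent: %s", ua)
            request.headers.setdefault('User-Agent', ua)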

File: TouTiao.py

# -*- coding: utf-8 -*-
import scrapy
import json
import time
from crawlAll.items import NewsSpiderItem


class TouTiaoSpider(scrapy.Spider):
    name = "toutiao"
    allowed_domains = ["toutiao.com"]
    start_urls = ['http://www.toutiao.com/articles_news_society/p1/']
    base_cat_url = 'http://www.toutiao.com/articles_news_society'
    base_url = 'http://www.toutiao.com'

    maxpage = 1
    category = [
        'articles_news_society',
    ]

    def parse(self, response):
        for ctg in self.category:
            for page in range(1, self.maxpage):
                url = self.base_url + '/' + ctg + '/p' + page
            yield scrapy.Request(url, callback=self.parseNewsHref)

    def parseNewsHref(self, response):
        urls = response.xpath("//div[@class='info']//a/@href").extract()
        for url in urls:
            new_url = self.base_url + url
            yield scrapy.Request(new_url, callback=self.parseNews)

    def parseNews(self, response):
        articles = response.xpath("//div[@id='article-main']")
        item = NewsSpiderItem()
        title = articles.xpath("//h1/text()").extract()[0]
        tm = articles.xpath("//span[@class='time']/text()").extract()[0]
        content = articles.xpath("//div[@class='article-content']//p/text()").extract()

        if len(title) != 0 and len(tm) != 0 and len(content) != 0:
            item['title'] = title
            item['time'] = int(time.mktime(time.strptime(tm, '%Y-%m-%d %H:%M')))
            item['url'] = response.url
            cc = ''
            if len(content) != 0:
                for c in content:
                    cc = cc + c + '\n'
                    item['content'] = cc
                    yield item

File: settings.py

BOT_NAME = 'crawlAll' 
SPIDER_MODULES = ['crawlAll.spiders'] 
NEWSPIDER_MODULE = 'crawlAll.spiders' 
ROBOTSTXT_OBEY = False 

DOWNLOAD_DELAY = 3 
COOKIES_ENABLED = False 

DOWNLOADER_MIDDLEWARES = { 
    'crawlAll.useragent.MyUserAgentMiddleware':400, 
    'crawlAll.middlewares.MyCustomDownloaderMiddleware': None 
} 
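
For reference, Scrapy's own UserAgentMiddleware is also registered at priority 400 by default, so a common variant of this setting (an assumption about the intended setup, not part of the original post) disables the built-in one so only the custom middleware handles the User-Agent header:

DOWNLOADER_MIDDLEWARES = {
    'crawlAll.useragent.MyUserAgentMiddleware': 400,
    # Disable the stock user-agent middleware; the custom one sets the header.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'crawlAll.middlewares.MyCustomDownloaderMiddleware': None,
}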

However, this does not work. Can anyone help me solve this problem? Thanks a lot!

Answer


In your parse function you never get any URLs because, with maxpage = 1, range(1, self.maxpage) is empty and the inner loop is never entered. Either remove the page loop or increase maxpage.

def parse(self, response): 
    for ctg in self.category: 
        url = self.base_url + '/' + ctg + '/p1/' 
        yield scrapy.Request(url, callback=self.parseNewsHref) 
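
If you want to crawl more than one listing page instead, the other option is to raise maxpage and keep the loop; a rough sketch (note that page must go through str() before concatenation, and the yield has to sit inside the inner loop so every page is requested):

def parse(self, response):
    # Assumes maxpage has been raised on the spider, e.g. maxpage = 5.
    for ctg in self.category:
        for page in range(1, self.maxpage + 1):
            # str(page) is required; concatenating an int to a str raises TypeError.
            url = self.base_url + '/' + ctg + '/p' + str(page) + '/'
            # Yield inside the loop so every page gets its own request.
            yield scrapy.Request(url, callback=self.parseNewsHref)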

Thank you very much, this was my mistake; the problem has been solved with your help. Good luck!