Scrapy spider does not scrape page 1

I want my spider to scrape the listings on every page of the website. I am using CrawlSpider with a LinkExtractor, but when I look at the csv file, nothing from the first page (i.e. the start url) gets scraped; items are only scraped from page 2 onwards. I tested my crawler in the Scrapy shell and it looked fine. I can't figure out where the problem is. Below is my spider code. Please help. Thanks a lot!
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from shputuo.items_shputuo import ShputuoItem


class Shputuo(CrawlSpider):
    name = "shputuo"
    allowed_domains = ["shpt.gov.cn"]  # DO NOT use www in allowed domains
    start_urls = ["http://www.shpt.gov.cn/gb/n6132/n6134/n6156/n7110/n7120/index.html"]

    rules = (
        Rule(LinkExtractor(allow=(), restrict_xpaths=("//div[@class = 'page']/ul/li[5]/a",)), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        for sel in response.xpath("//div[@class = 'neirong']/ul/li"):
            item = ShputuoItem()
            word = sel.xpath("a/text()").extract()[0]
            item['id'] = word[3:11]
            item['title'] = word[11:len(word)]
            item['link'] = "http://www.shpt.gov.cn" + sel.xpath("a/@href").extract()[0]
            item['time2'] = sel.xpath("span/text()").extract()[0][1:11]
            request = scrapy.Request(item['link'], callback=self.parse_content)
            request.meta['item'] = item
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        item['question'] = response.xpath("//div[@id = 'ivs_content']/p[2]/text()").extract()[0]
        item['question'] = "".join(map(unicode.strip, item['question']))  # get rid of unwanted spaces and other characters
        item['reply'] = response.xpath("//div[@id = 'ivs_content']/p[3]/text()").extract()[0]
        item['reply'] = "".join(map(unicode.strip, item['reply']))
        item['agency'] = item['reply'][6:10]
        item['time1'] = "2015-" + item['question'][0] + "-" + item['question'][2]
        yield item
Is the URL inside 'start_urls' page 1? – eLRuLL
@eLRuLL, yes, that is page 1. –
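For reference, CrawlSpider passes the responses from start_urls through its parse_start_url() hook rather than through the rule callbacks, so page 1 never reaches parse_items. A minimal sketch of a possible workaround (my assumption, not something the asker has tried) is to delegate that hook to the same item logic:

    def parse_start_url(self, response):
        # CrawlSpider calls this for each start_urls response (page 1 here);
        # delegating to parse_items lets page 1 yield items as well.
        return self.parse_items(response)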