我已经设置规则从start_url获取下一页,但它不工作,它只抓取start_urls页面以及该页面中的链接(使用parseLinks)。它不会转到规则中设置的下一页。我如何跳转到下一个页面在Scrapy规则
有帮助吗?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy import log
from urlparse import urlparse
from urlparse import urljoin
from scrapy.http import Request
class MySpider(CrawlSpider):
name = 'testes2'
allowed_domains = ['example.com']
start_urls = [
'http://www.example.com/pesquisa/filtro/?tipo=0&local=0'
]
rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@id="seguinte"]/@href')), follow=True),)
def parse(self, response):
sel = Selector(response)
urls = sel.xpath('//div[@id="btReserve"]/../@href').extract()
for url in urls:
url = urljoin(response.url, url)
self.log('URLS: %s' % url)
yield Request(url, callback = self.parseLinks)
def parseLinks(self, response):
sel = Selector(response)
titulo = sel.xpath('h1/text()').extract()
morada = sel.xpath('//div[@class="MORADA"]/text()').extract()
email = sel.xpath('//a[@class="sendMail"][1]/text()')[0].extract()
url = sel.xpath('//div[@class="contentContacto sendUrl"]/a/text()').extract()
telefone = sel.xpath('//div[@class="telefone"]/div[@class="contentContacto"]/text()').extract()
fax = sel.xpath('//div[@class="fax"]/div[@class="contentContacto"]/text()').extract()
descricao = sel.xpath('//div[@id="tbDescricao"]/p/text()').extract()
gps = sel.xpath('//td[@class="sendGps"]/@style').extract()
print titulo, email, morada
检查这个答案,这将解决这个问题:http://stackoverflow.com/questions/13227546/scrapy-crawls-first-page-but -does-not-follow-links?answertab = votes#tab-top – Perefexexos