4
标题:Scrapy抓取第一页,但没有跟踪链接。我不明白为什么Scrapy抓取了第一页,却没有跟踪链接去抓取后续页面。这一定与规则(rules)有关。非常感谢任何帮助,谢谢!
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistItem
class MySpider(CrawlSpider):
    """Crawl a Craigslist listing section and follow its pagination links.

    Each page reached through the pagination rule is handed to
    ``parse_items``, which builds one ``CraigslistItem`` per ``<p>`` element
    found on the page.
    """

    name = "craig"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/acc/"]

    # The original pattern was the literal "index100\.html", which matches
    # only the second listing page, so the crawl stopped after one hop.
    # Matching any numeric page index lets the rule keep following "next
    # page" links (index100.html, index200.html, ...). A raw string keeps
    # the regex escape intact.
    # NOTE(review): restrict_xpaths is kept as in the original; confirm the
    # pagination container on the site really is <p id="nextpage">.
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(r"index\d+\.html",),
                restrict_xpaths=('//p[@id="nextpage"]',),
            ),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Build one item per ``<p>`` element of ``response``.

        Args:
            response: the page response passed in by the crawl rule.

        Returns:
            A list of ``CraigslistItem`` objects whose ``title`` and ``link``
            fields hold the extracted text and ``href`` of the ``<a>``
            children of each paragraph.
        """
        hxs = HtmlXPathSelector(response)
        paragraphs = hxs.select("//p")
        items = []
        # A distinct loop variable avoids rebinding the collection being
        # iterated (the original used ``for titles in titles``).
        for paragraph in paragraphs:
            item = CraigslistItem()
            item["title"] = paragraph.select("a/text()").extract()
            item["link"] = paragraph.select("a/@href").extract()
            items.append(item)
        return items
# Module-level instance of the spider defined above.
spider = MySpider()
craigslist_sample.items中有什么?你能否也分享那段代码,以便 `from craigslist_sample.items import CraigslistItem` 这条导入语句能够正常运行?