使用 Scrapy 抓取 showthread.php 时如何翻到下一页

我是 Scrapy 新手。大约 4 天以来，我一直卡在一个问题上：抓取 showthread.php（论坛基于 vBulletin）时如何翻到下一页。

我的目标：http://forum.femaledaily.com/showthread.php?359-Hair-Smoothing

import scrapy 
from scrapy.spiders import CrawlSpider, Rule 
from scrapy.linkextractors import LinkExtractor 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 

from femaledaily.items import FemaledailyItem 

class Femaledaily(scrapy.Spider): 
    name = "femaledaily" 
    allowed_domains = ["femaledaily.com"] 
    start_urls = [ 
     "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care", 
     "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page2", 
     "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page3", 
     "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page4", 
    ] 

    def parse(self, response): 
     for thd in response.css("tbody > tr "): 
      print "==========NEW THREAD======" 
      url = thd.xpath('.//div[@class="threadlist-title"]/a/@href').extract() 
      url[0] = "http://forum.femaledaily.com/"+url[0] 
      print url[0] 
      yield scrapy.Request(url[0], callback=self.parse_thread) 

    def parse_thread(self, response): 
     for page in response.xpath('//ol[@id="posts"]/li'): 
      item = FemaledailyItem() 
      item['thread_title'] = response.selector.xpath('//span[@class="threadtitle"]/a/text()').extract() 
      # item['thread_starter'] = response.selector.xpath('//div[@class="username_container"]/a/text()').extract_first() 
      post_creator = page.xpath('.//div[@class="username_container"]/a/text()').extract() 

      if not post_creator: 
       item['post_creator'] = page.xpath('.//div[@class="username_container"]/a/span/text()').extract() 
      else: 
       item['post_creator'] = post_creator 

      item['post_content'] = "" 

      cot = page.xpath(".//blockquote[@class='postcontent restore ']/text()").extract() 
      for ct in cot: 
       item['post_content'] += ct.replace('\t','').replace('\n','') 

      yield item

我能够抓取到每个主题的前 10 个帖子，但不知道该如何翻到下一页。有什么想法吗？

来源

2015-07-01 Fathur Rachman Widhiantoko

对你的代码稍作修改，使它能够正确翻页：

import scrapy 
from scrapy.spiders import CrawlSpider, Rule 
from scrapy.linkextractors import LinkExtractor 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 

from femaledaily.items import FemaledailyItem 

class Femaledaily(scrapy.Spider):
    """Spider for the Female Daily Hair-Care forum that follows pagination.

    parse() handles forum listing pages: it requests every linked thread and
    then the next listing page. parse_thread() handles thread pages: it emits
    one FemaledailyItem per post and then requests the next page of the same
    thread. Crawling of a listing or thread stops when no "next" link exists.
    """

    name = "femaledaily"
    allowed_domains = ["femaledaily.com"]
    BASE_URL = "http://forum.femaledaily.com/"
    # NOTE(review): pages 2-4 look redundant now that parse() follows the
    # "next" link itself -- confirm before trimming this list.
    start_urls = [
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page2",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page3",
        "http://forum.femaledaily.com/forumdisplay.php?136-Hair-Care/page4",
    ]

    def parse(self, response):
        """Yield a request per thread on a listing page, then the next page.

        Rows of the listing table that carry no thread link are skipped.
        """
        for thd in response.css("tbody > tr "):
            url = thd.xpath('.//div[@class="threadlist-title"]/a/@href').extract()
            if not url:
                # Indexing an empty result would raise IndexError and abort
                # this generator before the pagination request is yielded.
                continue
            # Parenthesised single-argument print works on Python 2 and 3.
            print("==========NEW THREAD======")
            yield scrapy.Request(self.BASE_URL + url[0], callback=self.parse_thread)

        next_request = self._next_page_request(response, self.parse)
        if next_request is not None:
            yield next_request

    def parse_thread(self, response):
        """Yield one item per post on a thread page, then the next page."""
        thread_title = response.xpath('//span[@class="threadtitle"]/a/text()').extract()

        for page in response.xpath('//ol[@id="posts"]/li'):
            item = FemaledailyItem()
            # Copy so that items do not share one mutable list.
            item['thread_title'] = list(thread_title)

            post_creator = page.xpath('.//div[@class="username_container"]/a/text()').extract()
            if not post_creator:
                # No direct text in the link: read the nested <span> instead.
                post_creator = page.xpath('.//div[@class="username_container"]/a/span/text()').extract()
            item['post_creator'] = post_creator

            cot = page.xpath(".//blockquote[@class='postcontent restore ']/text()").extract()
            # Concatenate the text nodes with tabs and newlines stripped out.
            item['post_content'] = "".join(
                ct.replace('\t', '').replace('\n', '') for ct in cot
            )

            yield item

        next_request = self._next_page_request(response, self.parse_thread)
        if next_request is not None:
            yield next_request

    def _next_page_request(self, response, callback):
        """Return a request for the rel="next" link, or None if there is none.

        The request is built with scrapy.Request; a bare ``Request`` is not
        imported at module level and would raise NameError.
        """
        next_page = response.xpath('//li[@class="prev_next"]/a[@rel="next"]/@href').extract()
        if not next_page:
            return None
        return scrapy.Request(self.BASE_URL + next_page[0], callback=callback)

这里先提取下一页的链接（即单个向前箭头对应的链接），然后对该 next_page 链接发起请求，并把回调函数设为当前所在的同一个函数。到达最后一页时，下一页链接不再存在，爬取随之停止。

来源

2015-07-01 07:14:19 Jithin

谢谢，我已经按此修改了我的代码 :D –

使用 Scrapy 抓取 showthread.php 时如何翻到下一页

回答

相关问题