2017-10-16 49 views
0

我是 Python 和 Scrapy 的新手,无法正常使用 Scrapy

这是我的代码,用于从所有后续页面中抓取全部产品的名称、价格、图片和标题,并生成 CSV

import scrapy  
# Spider as posted in the question: requests every link found on the start
# page and tries to build Name/Price/Image/Link items from each fetched page.
# NOTE(review): as pasted, the class body below is not indented under the
# `class` line; the snippet needs re-indenting before it can run.
class TestSpider(scrapy.Spider):  
name = "testdoc1"  
start_urls = ["https://www.amazon.in/s/ref=amb_link_46?ie=UTF8&bbn=1389432031&rh=i%3Aelectronics%2Cn%3A976419031%2Cn%3A%21976420031%2Cn%3A1389401031%2Cn%3A1389432031%2Cp_89%3AApple&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_s=merchandised-search-leftnav&pf_rd_r=CYS25V3W021MSYPQ32FB&pf_rd_r=CYS25V3W021MSYPQ32FB&pf_rd_t=101&pf_rd_p=1ce3e975-c6e8-479a-8485-2e490b9f58a9&pf_rd_p=1ce3e975-c6e8-479a-8485-2e490b9f58a9&pf_rd_i=1389401031"] 

# Yields a request (handled by parse_post) for every href on the page, then a
# request for the first "pagnNext" link, handled by parse again.
def parse(self, response): 
    for post_link in response.xpath('//a/@href').extract(): 
     # The href is passed through response.urljoin before being requested.
     link = response.urljoin(post_link) 
     yield scrapy.Request(link, callback=self.parse_post) 

    # Checks if the main page has a link to next page if True keep parsing. 
    next_page = response.xpath('(//a[@class="pagnNext"])[1]/@href').extract_first() 
    if next_page: 
     # NOTE(review): unlike the hrefs above, next_page is requested without
     # response.urljoin; presumably the href is relative - confirm.
     yield scrapy.Request(next_page, callback=self.parse) 

# Yields one dict per matching <li> element with the keys Name, Price, Image
# and Link, then requests the first "pagnNext" link with this same callback.
def parse_post(self, response): 
    # Scrape name,price,image, link from product. 
    for post in response.xpath('//li[contains(@class,"s-result-item celwidget")]'): 
     item = dict() 
     item['Name'] = post.xpath('.//h2[contains(@class,"a-size-base s-inline s-access-title a-text-normal")]/text()').extract() 
     item['Price'] = post.xpath('.//span[contains(@class,"a-size-base a-color-price s-price a-text-bold")]/text()').extract() 
     item['Image'] = post.xpath('.//img[contains(@class,"s-access-image cfMarker")]/@src').extract() 
     # NOTE(review): the three fields above call post.xpath directly; the
     # extra `.sel` here differs from them - confirm it is intended.
     item['Link'] = post.sel.xpath('.//a[contains(@class,"a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal")]/@href').extract() 
     yield item 

    # If the products page has a link to next page keep parsing. 
    next_page = response.xpath('(//a[@class="pagnNext"])[1]/@href').extract_first() 
    if next_page: 
     # NOTE(review): same pattern as in parse - next_page is requested
     # without response.urljoin; confirm the href is absolute.
     yield scrapy.Request(next_page, callback=self.parse_post) 

我的爬虫没有报任何错误,但生成的 CSV 是空的

+0

你的问题是什么? –

+0

请具体说明你的问题,你做了什么来解决它,以及你需要帮助的地方。 –

+0

你的 XPath 写错了。 – Verz1Lka

回答

0

你的问题出在下面这一行

yield scrapy.Request(next_page, callback=self.parse) 

这里得到的 URL 是相对 URL。所以,你应该使用

yield response.follow(next_page, callback=self.parse) 

它会自动解析相对 URL

编辑-1

刚刚意识到你正在逐个访问各个页面,而你只需要从结果页中提取数据。所以你的 parse_post 函数根本就不需要。下面是你应该采用的做法

class TestSpider(scrapy.Spider):
    """Spider that collects product rows from a chain of result pages.

    Every response handled by ``parse`` yields one dict per
    ``li.s-result-item`` element and then schedules the page referenced by
    the first ``pagnNext`` anchor, when such an anchor is present.
    """

    name = "testdoc1"
    allowed_domains = ['amazon.in']
    start_urls = [
        "https://www.amazon.in/s/ref=amb_link_46?ie=UTF8&bbn=1389432031&rh=i%3Aelectronics%2Cn%3A976419031%2Cn%3A%21976420031%2Cn%3A1389401031%2Cn%3A1389432031%2Cp_89%3AApple&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_s=merchandised-search-leftnav&pf_rd_r=CYS25V3W021MSYPQ32FB&pf_rd_r=CYS25V3W021MSYPQ32FB&pf_rd_t=101&pf_rd_p=1ce3e975-c6e8-479a-8485-2e490b9f58a9&pf_rd_p=1ce3e975-c6e8-479a-8485-2e490b9f58a9&pf_rd_i=1389401031",
    ]

    def parse(self, response):
        """Yield one item per result row, then follow the next-page link.

        Each item is a dict with the keys ``Name``, ``Price``, ``Image`` and
        ``Link``; every value is whatever ``.extract()`` returns for the
        corresponding XPath, evaluated relative to the result row.
        """
        for row in response.css('li.s-result-item'):
            yield {
                'Name': row.xpath(
                    './/h2[contains(@class,"a-size-base s-inline s-access-title a-text-normal")]/text()'
                ).extract(),
                'Price': row.xpath(
                    './/span[contains(@class,"a-size-base a-color-price s-price a-text-bold")]/text()'
                ).extract(),
                'Image': row.xpath(
                    './/img[contains(@class,"s-access-image cfMarker")]/@src'
                ).extract(),
                'Link': row.xpath(
                    './/a[contains(@class,"a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal")]/@href'
                ).extract(),
            }

        # Pagination: stop when the page has no "pagnNext" anchor; otherwise
        # hand the href to response.follow and parse the next page the same way.
        next_href = response.xpath(
            '(//a[@class="pagnNext"])[1]/@href'
        ).extract_first()
        if not next_href:
            return
        yield response.follow(next_href, callback=self.parse)
+0

先生,我修改了代码,但 CSV 中并没有生成所有后续页面的完整结果,为什么会这样? –

+0

@Zarinaveeru,请查看更新后的回答 –

+0

是的,先生,现在可以正常工作了。 –