2013-05-25 71 views
2

他创建了一个Scrapy爬虫，但Scrapy不将抓取到的数据写入文件。items.py:

from scrapy.item import Item, Field 

class dns_shopItem(Item):
    """Container for the fields scraped by dns_shop_spider."""
    # Declare each scraped field here, e.g.:
    #   name = Field()
    id = Field()
    idd = Field()

dns_shop_spider.py:

from scrapy.contrib.spiders import CrawlSpider, Rule 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 
from scrapy.contrib.loader.processor import TakeFirst 
from scrapy.contrib.loader import XPathItemLoader 
from scrapy.selector import HtmlXPathSelector 
from dns_shop.items import dns_shopItem 
  
class dns_shopLoader(XPathItemLoader):
    """Item loader that keeps only the first value extracted for each field."""
    default_output_processor = TakeFirst()
  
class dns_shopSpider(CrawlSpider):
    """Crawl playground.ru file pages and extract the page title.

    Bug fix: the original ``allow`` patterns and XPath expressions contained
    stray spaces ('/ files/...', '/ html/body/table [2]...'), so the link
    extractor never matched any URL and the XPaths never selected a node —
    which is why no data was ever written to the output file.  The absolute
    ``tbody`` paths were also copied from a browser inspector; ``<tbody>``
    elements are inserted by browsers and are absent from the raw HTML, so
    class-based XPaths are used instead.
    """
    name = "dns_shop_spider"
    allowed_domains = ["www.playground.ru"]
    start_urls = ["http://www.playground.ru/files/stalker_clear_sky/"]
    rules = (
        # A single rule both follows matching links and parses each page.
        Rule(SgmlLinkExtractor(allow=('/files/s_t_a_l_k_e_r_chistoe_nebo')),
             follow=True, callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract the item fields from a single file page.

        :param response: the downloaded page response
        :returns: a populated ``dns_shopItem``
        """
        hxs = HtmlXPathSelector(response)
        l = dns_shopLoader(dns_shopItem(), hxs)
        # Both fields read the page heading; robust class-based XPath
        # instead of a brittle browser-generated absolute path.
        l.add_xpath('id', "//div[@class='downloads-container clearfix']/h1/text()")
        l.add_xpath('idd', "//div[@class='downloads-container clearfix']/h1/text()")
        return l.load_item()

运行以下命令:

scrapy crawl dns_shop_spider -o scarped_data_utf8.csv -t csv 

日志显示Scrapy访问了所有必要的URL，但是启动spider后为什么不把数据写入指定的文件呢？可能是什么问题？

回答

2

假设你想跟着http://www.playground.ru/files/stalker_clear_sky/网页上的所有链接,并获得标题,网址和链接下载:

from scrapy.contrib.spiders import CrawlSpider, Rule 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 
from scrapy.contrib.loader.processor import TakeFirst 
from scrapy.contrib.loader import XPathItemLoader 
from scrapy.selector import HtmlXPathSelector 

from scrapy.item import Item, Field 


class PlayGroundItem(Item):
    """One scraped record: page title, page URL and file download URL."""
    title = Field()
    url = Field()
    download_url = Field()


class PlayGroundLoader(XPathItemLoader):
    """Loader that collapses each field to the first extracted value."""
    default_output_processor = TakeFirst()


class PlayGroundSpider(CrawlSpider):
    """Follow all matching file links on playground.ru and collect, for each
    page, its title, its URL and the download link."""
    name = "playground_spider"
    allowed_domains = ["www.playground.ru"]
    start_urls = ["http://www.playground.ru/files/stalker_clear_sky/"]
    rules = (
        # Follow every matching link and hand each fetched page to parse_item.
        Rule(SgmlLinkExtractor(allow=('/files/s_t_a_l_k_e_r_chistoe_nebo')), follow=True, callback='parse_item'),
    )

    def parse_item(self, response):
        """Build and return a PlayGroundItem from one file page."""
        selector = HtmlXPathSelector(response)
        loader = PlayGroundLoader(PlayGroundItem(), selector)
        loader.add_value('url', response.url)
        loader.add_xpath('title', "//div[@class='downloads-container clearfix']/h1/text()")
        loader.add_xpath('download_url', "//div[@class='files-download-holder']/div/a/@href")
        return loader.load_item()

保存到spider.py,并通过运行:

scrapy runspider spider.py -o output.json 

然后检查output.json

希望有所帮助。

+0

非常感谢! – user2420607

+0

我不知道在哪里点击什么,什么会让你加薪? – user2420607

+0

已将回答标记为采纳。仍然想问：为什么我的xpath查询不起作用，而你的可以？我的查询是：l.add_xpath('title', "//*[@id='mainTable']/tbody/tr[5]/td[2]/table/tbody/tr/td/div[6]/h1/text()")，只有第一个能运行。我是用Firefox的Firebug生成xpath查询的，你是怎么编写xpath查询的？ – user2420607