2013-08-29 62 views
1

我的 Scrapy 爬虫目前只会输出所有网页的标题。请问如何同时输出每个标题及其对应的链接?我想解析这个页面。我的代码如下(问题标题:Scrapy 输出标题和相关链接):

from scrapy.contrib.spiders import CrawlSpider, Rule 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 
from scrapy.selector import HtmlXPathSelector 
from probe1.items import SpiderItem 

class SpiderSpider(CrawlSpider):
    """Crawl WEB_PAGE and collect the ``<title>`` markup of every visited page."""

    name = "spider"
    allowed_domains = ["WEB_PAGE"]
    start_urls = [
        "http://www.WEB_PAGE",
    ]

    # Every link extracted for the WEB_PAGE domain is followed, and each
    # fetched response is passed to parse_page.
    rules = (
        Rule(
            SgmlLinkExtractor(allow_domains=("WEB_PAGE",)),
            callback='parse_page', follow=True
        ),
    )

    def parse_page(self, response):
        """Build one SpiderItem per ``<title>`` element found in *response*.

        Args:
            response: the page response handed over by the crawl rule.

        Returns:
            A list of SpiderItem objects whose ``title`` field holds the
            list of strings extracted by the ``//title`` XPath query.
        """
        hxs = HtmlXPathSelector(response)
        # Parenthesised form prints the same text under Python 2 and is also
        # valid syntax under Python 3.
        print(hxs)
        sites = hxs.select('//title')
        items = []
        for s in sites:
            # The item must be created inside the loop so that each title
            # element gets its own SpiderItem.
            item = SpiderItem()
            # extract() has to be *called*; storing the bare attribute would
            # put a bound method into the item instead of the title strings.
            item['title'] = s.select('//title').extract()
            # NOTE(review): response.url holds the URL of this page; it can be
            # stored alongside the title once SpiderItem declares a field for
            # it -- confirm against probe1.items before adding.
            items.append(item)
        return items

回答

1

response.url 正好包含了你所需要的内容:

url

一个字符串,其内容是该响应对应的 URL。