
I want to follow all the internal links of a site while recording both its internal and external links. I have just started using Scrapy and I cannot work out how to crawl all of a site's internal links: Scrapy does not follow the internal links while crawling.

It just fetches the links at the first depth level but does not follow them.

import os

from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor

from brs.items import BrsItem


class BRS(CrawlSpider):
    name = "brs"
    rules = (Rule(SgmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def __init__(self):
        global start_urls
        #settings.overrides['DEPTH_LIMIT'] = 10
        # Read the seed URLs from urls.txt next to this file, skipping blank lines.
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt"), "rt") as f:
            self.start_urls = filter(None, [url.strip() for url in f.readlines()])
        start_urls = self.start_urls

    def parse(self, response):
        brsitem = BrsItem()
        brsitem['url'] = response.url

        # Classify links on this page as internal or external and log them.
        internal = LinkExtractor(allow_domains=[response.url])
        external = LinkExtractor(deny_domains=[response.url])

        links = internal.extract_links(response)
        internal = []
        fd = open('output.txt', 'a+')
        for link in links:
            internal.append(link.url)

        links = external.extract_links(response)
        external = []
        for link in links:
            external.append(link.url)

        for link in internal:
            fd.write(link + "\tinternal\n")
        for link in external:
            fd.write(link + "\texternal\n")

        return brsitem

My urls.txt contains, as of now: http://www.stackoverflow.com

Any help is appreciated.

Answer


Got it working using the reference in this link, and got my IP blocked by stackoverflow when I forgot to set the DEPTH_LIMIT parameter. Some lessons are learned the hard way.

import os

import scrapy
from scrapy.conf import settings
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from urlparse import urlparse

from brs.items import BrsItem


class BRS(CrawlSpider):
    name = "brs"

    def __init__(self):
        global start_urls, rules
        settings.overrides['DEPTH_LIMIT'] = 10

        # Read the seed URLs from urls.txt next to this file, skipping blank lines.
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt"), "r+") as f:
            self.start_urls = filter(None, [url.strip() for url in f.readlines()])
        start_urls = self.start_urls

        # Follow every link and send each response to parse_items
        # (not parse, which CrawlSpider uses internally).
        self.rules = (Rule(SgmlLinkExtractor(allow=()), callback=self.parse_items, follow=True),)
        rules = self.rules
        self._rules = rules

    def extract_domain(self, url):
        return urlparse(url).netloc

    def parse_items(self, response):
        # Classify the links on this page as internal or external to the
        # current response's domain and append them to output.txt.
        internal = LinkExtractor(allow_domains=[self.extract_domain(response.url)])
        external = LinkExtractor(deny_domains=[self.extract_domain(response.url)])

        internal_urls = [link.url for link in internal.extract_links(response)]
        external_urls = [link.url for link in external.extract_links(response)]

        with open('output.txt', 'a+') as fd:
            for url in internal_urls:
                fd.write(url + "\tinternal\n")
            for url in external_urls:
                fd.write(url + "\texternal\n")

        # Queue every internal link so the crawl keeps going.
        for url in internal_urls:
            yield scrapy.Request(url.strip(), callback=self.parse_attr)

    def parse_attr(self, response):
        brsitem = BrsItem()
        brsitem['url'] = response.url.strip()
        return brsitem
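
Two notes for anyone reading this later. First, the reason the original spider never followed anything is almost certainly the parse override: the Scrapy docs warn that CrawlSpider implements its own logic in parse, so rule callbacks must use another name (here parse_items). Second, several imports above (scrapy.contrib, SgmlLinkExtractor, scrapy.conf, settings.overrides) have since been removed from Scrapy. Below is a minimal sketch of the same idea against current Scrapy on Python 3; the class name BRSModern, the classify_links callback and the custom_settings values are my own choices, not part of the original answer.

import os
from urllib.parse import urlparse

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class BRSModern(CrawlSpider):
    name = "brs_modern"

    # Per-spider settings replace the deprecated settings.overrides call;
    # DEPTH_LIMIT bounds the crawl and DOWNLOAD_DELAY softens the request rate.
    custom_settings = {
        "DEPTH_LIMIT": 10,
        "DOWNLOAD_DELAY": 0.5,
    }

    # Follow every extracted link and hand each response to classify_links.
    rules = (Rule(LinkExtractor(), callback="classify_links", follow=True),)

    def start_requests(self):
        # Read seed URLs from urls.txt next to this file, skipping blank lines.
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "urls.txt")
        with open(path) as f:
            for url in filter(None, (line.strip() for line in f)):
                yield scrapy.Request(url)

    def classify_links(self, response):
        # Split the links on this page by whether they stay on the current domain.
        domain = urlparse(response.url).netloc
        internal = LinkExtractor(allow_domains=[domain]).extract_links(response)
        external = LinkExtractor(deny_domains=[domain]).extract_links(response)
        with open("output.txt", "a") as fd:
            for link in internal:
                fd.write(link.url + "\tinternal\n")
            for link in external:
                fd.write(link.url + "\texternal\n")
        yield {"url": response.url}

Run it with scrapy crawl brs_modern from the project directory; the rule itself handles following the links, and the DEPTH_LIMIT / DOWNLOAD_DELAY settings help avoid the kind of IP block mentioned above.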