I want Scrapy to follow all of a site's internal links while recording both the internal and external links it finds. I have just started using Scrapy and I cannot figure out how to crawl all the internal links of a site: Scrapy does not follow the internal links while crawling.
It only collects the links on a page but does not follow them.
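From the CrawlSpider docs my understanding is that a bare spider with a single follow=True rule should recurse through a site on its own; something like this stripped-down sketch (the domain and callback name here are placeholders):

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class MinimalSpider(CrawlSpider):
    name = "minimal"
    allowed_domains = ["example.com"]           # placeholder domain
    start_urls = ["http://www.example.com/"]

    rules = (
        # follow every link, hand each fetched page to parse_item
        Rule(SgmlLinkExtractor(allow=()), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        self.log("visited %s" % response.url)

My actual spider, however, only ever processes the start pages: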
import os

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from brs.items import BrsItem  # items module of my project


class BRS(CrawlSpider):
    name = "brs"
    rules = (Rule(SgmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def __init__(self):
        global start_urls
        #settings.overrides['DEPTH_LIMIT'] = 10
        # read the start URLs from urls.txt located next to this file
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt"), "rt") as f:
            self.start_urls = filter(None, [url.strip() for url in f.readlines()])
        start_urls = self.start_urls

    def parse(self, response):
        brsitem = BrsItem()
        brsitem['url'] = response.url

        # split the links found on this page into internal and external
        internal = LinkExtractor(allow_domains=[response.url])
        external = LinkExtractor(deny_domains=[response.url])

        links = internal.extract_links(response)
        internal = []
        for link in links:
            internal.append(link.url)

        links = external.extract_links(response)
        external = []
        for link in links:
            external.append(link.url)

        # append every link to output.txt, tagged internal or external
        with open('output.txt', 'a+') as fd:
            for link in internal:
                fd.write(link + "\tinternal\n")
            for link in external:
                fd.write(link + "\texternal\n")

        return brsitem
My urls.txt currently contains: http://www.stackoverflow.com
Any help is appreciated.
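Update: the CrawlSpider docs warn against using parse as a callback or overriding it, because CrawlSpider uses the parse method itself to implement its rule-following logic, so overriding it as above may be exactly what disables my rule. Below is a sketch of the variant I am about to try (untested): the per-page logic moves into parse_obj to match the rule's callback, the parent constructor is called so the rules get compiled, and the link extractors get a real domain from urlparse instead of the full response URL:

import os
from urlparse import urlparse

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from brs.items import BrsItem  # items module of my project


class BRS(CrawlSpider):
    name = "brs"
    # callback name matches the method below; parse itself stays untouched
    rules = (Rule(SgmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def __init__(self, *args, **kwargs):
        # let CrawlSpider compile its rules before anything else
        super(BRS, self).__init__(*args, **kwargs)
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt"), "rt") as f:
            self.start_urls = filter(None, [url.strip() for url in f.readlines()])

    def parse_obj(self, response):
        brsitem = BrsItem()
        brsitem['url'] = response.url

        # allow/deny by domain, not by the full URL
        domain = urlparse(response.url).netloc
        internal = LinkExtractor(allow_domains=[domain])
        external = LinkExtractor(deny_domains=[domain])

        with open('output.txt', 'a+') as fd:
            for link in internal.extract_links(response):
                fd.write(link.url + "\tinternal\n")
            for link in external.extract_links(response):
                fd.write(link.url + "\texternal\n")

        return brsitem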