
Recursive web crawling in Python: here is my code.

import requests
from bs4 import BeautifulSoup
import re

class WebCrawler():
    def check(self, links):
        global imgCount
        for item in links:
            targetURL = item['href']
            if targetURL.startswith('/'):
                targetURL = target + targetURL  # add http:// and hostname to url

            target_html = requests.get(targetURL)
            parsed_html = BeautifulSoup(target_html.text, 'html.parser')
            if parsed_html.title.text not in pages:
                pages.append(parsed_html.title.text)
                print "[+] Collecting images page : " + parsed_html.title.text
                images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
                for img_url in images:
                    imgCount = imgCount + 1
                    # print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"

pages = []
imgCount = 0
target = raw_input("Please enter base url: ")

data = BeautifulSoup(requests.get(target).text, 'html.parser')

links = data.find_all('a')
crawler = WebCrawler()
crawler.check(links)

print "===================== Total Collected Images =====================\n"
print imgCount

What I want is for it to continue on to the other pages, that is, to keep counting until there are no links left. But when I call the check function recursively, it does not work!

import requests
from bs4 import BeautifulSoup
import re

class WebCrawler():
    def check(self, links):
        global imgCount
        for item in links:
            targetURL = item['href']
            if targetURL.startswith('/'):
                targetURL = target + targetURL  # add http:// and hostname to url

            target_html = requests.get(targetURL)
            parsed_html = BeautifulSoup(target_html.text, 'html.parser')
            if parsed_html.title.text not in pages:
                pages.append(parsed_html.title.text)
                print "[+] Collecting images page : " + parsed_html.title.text
                images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
                for img_url in images:
                    imgCount = imgCount + 1
                    # print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"
            lnks = parsed_html.find_all('a')
            self.check(lnks)


pages = []
imgCount = 0
target = raw_input("Please enter base url: ")

data = BeautifulSoup(requests.get(target).text, 'html.parser')

links = data.find_all('a')
crawler = WebCrawler()
crawler.check(links)

print "===================== Total Collected Images =====================\n"
print imgCount

I added these lines to it:

lnks = parsed_html.find_all('a') 
self.check(lnks) 

At this point, the loop only executes once!
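One likely reason, reading the code above: the recursive call sits inside the for loop, so the crawler descends into the first link's page before the second item is ever reached, and the first unguarded error, such as an `a` tag with no href attribute or a page with no title, raises an exception that unwinds the entire recursion. As a minimal sketch, reusing the names from the code above, the guards that would make the failure visible look like this:

# Sketch only: guards that expose why the loop seems to stop after one pass.
for item in links:
    targetURL = item.get('href')  # some <a> tags carry no href; item['href'] would raise KeyError
    if not targetURL:
        continue
    if targetURL.startswith('/'):
        targetURL = target + targetURL
    try:
        target_html = requests.get(targetURL)
    except requests.RequestException as err:
        print "[-] Request failed for " + targetURL + ": " + str(err)
        continue
    parsed_html = BeautifulSoup(target_html.text, 'html.parser')
    if parsed_html.title is None:  # pages without a <title> would crash on .text
        continue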

Answer


Try this:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem

class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]

    # follow every "next page" button and hand each page to parse_items
    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//span[@class="pl"]')
        items = []
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.xpath("a/text()").extract()
            item["link"] = title.xpath("a/@href").extract()
            items.append(item)
        return items
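Assuming this sits inside a Scrapy project named craigslist_sample with CraigslistSampleItem declared in its items.py (both names come from the snippet above), the spider would be run from the project directory with something like:

scrapy crawl craigs -o items.json

The CrawlSpider then follows each "button next" link on its own; there is no manual recursion to manage, which is exactly what the Rule/LinkExtractor pair is for.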

I don't want to use scrapy – unbl0ck3r
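If Scrapy is out, the recursion can be kept in plain requests + BeautifulSoup. The sketch below is a rough illustration under a few assumptions rather than a drop-in fix: it dedupes on visited URLs instead of page titles (titles are not unique), skips links without an href, resolves relative links with urljoin, stays on the starting site, and caps the depth so the recursion terminates. The max_depth and timeout values are illustrative choices, not requirements.

import requests
from bs4 import BeautifulSoup
from urlparse import urljoin  # urllib.parse.urljoin on Python 3
import re

visited = set()   # dedupe on URLs, not page titles
imgCount = 0

def crawl(url, base, depth=0, max_depth=2):
    global imgCount
    if url in visited or depth > max_depth:
        return
    visited.add(url)
    try:
        html = requests.get(url, timeout=5).text
    except requests.RequestException:
        return  # skip unreachable pages instead of crashing the recursion
    parsed = BeautifulSoup(html, 'html.parser')
    title = parsed.title.text if parsed.title else url  # guard missing <title>
    print "[+] Collecting images page : " + title
    imgCount += len(parsed.find_all('img', {'src': re.compile(r'(jpe?g|png|svg)$')}))
    for a in parsed.find_all('a'):
        href = a.get('href')
        if not href:
            continue
        nxt = urljoin(url, href)  # resolve relative links against the current page
        if nxt.startswith(base):  # stay on the starting site
            crawl(nxt, base, depth + 1, max_depth)

target = raw_input("Please enter base url: ")
crawl(target, target)
print "===================== Total Collected Images ====================="
print imgCount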