0
这是我用 Python 写的递归网络爬虫,下面是我的代码:
import requests
from bs4 import BeautifulSoup
import re
class WebCrawler():
def check(self, links):
global imgCount
for item in links:
targetURL = item['href']
if(targetURL.startswith('/')):
targetURL = target + targetURL # add http:// and hostname to url
target_html = requests.get(targetURL)
parsed_html = BeautifulSoup(target_html.text, 'html.parser')
if parsed_html.title.text not in pages:
pages.append(parsed_html.title.text)
print "[+] Collecting images page : " + parsed_html.title.text
images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
for img_url in images:
imgCount=imgCount + 1
# print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"
pages = []
imgCount = 0
target = raw_input("Please enter base url: ")
data = BeautifulSoup(requests.get(target).text, 'html.parser')
link = data.find_all('a')
crawler = WebCrawler()
crawler.check(link)
print "===================== Total Collected Images =====================\n"
print imgCount
我想让它继续爬取其他页面,也就是不断累加计数,直到再也没有新链接为止。可是当我递归调用 check 函数时,却不起作用!
import requests
from bs4 import BeautifulSoup
import re
class WebCrawler():
def check(self, links):
global imgCount
for item in links:
targetURL = item['href']
if(targetURL.startswith('/')):
targetURL = target + targetURL # add http:// and hostname to url
target_html = requests.get(targetURL)
parsed_html = BeautifulSoup(target_html.text, 'html.parser')
if parsed_html.title.text not in pages:
pages.append(parsed_html.title.text)
print "[+] Collecting images page : " + parsed_html.title.text
images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
for img_url in images:
imgCount=imgCount + 1
# print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"
lnks = parsed_html.find_all('a')
self.check(lnks)
pages = []
imgCount = 0
target = raw_input("Please enter base url: ")
data = BeautifulSoup(requests.get(target).text, 'html.parser')
link = data.find_all('a')
crawler = WebCrawler()
crawler.check(link)
print "===================== Total Collected Images =====================\n"
print imgCount
我在原代码里添加了这几行:
lnks = parsed_html.find_all('a')
self.check(lnks)
此时,循环只执行一次!
我不想使用scrapy – unbl0ck3r