Gevent link crawler

Here I have written code that uses Python and Beautiful Soup to parse all of the links on a given page into a repository of links. Next, it fetches the contents of any of the URLs in the repository just created, parses the links from this new content into the repository, and continues this process for every link in the repository until stopped or until a given number of links has been fetched.

But this code is very slow. How can I improve it by using asynchronous programming with gevent in Python?

Code
import itertools
import random
import urllib2

import BeautifulSoup


class Crawler(object):
    def __init__(self):
        self.soup = None                              # Beautiful Soup object
        self.current_page = "http://www.python.org/"  # Current page's address
        self.links = set()                            # Queue with every link fetched
        self.visited_links = set()
        self.counter = 0                              # Simple counter for debug purposes

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # Fetch every link
        self.soup = BeautifulSoup.BeautifulSoup(html_code)
        page_links = []
        try:
            page_links = itertools.ifilter(  # Only deal with absolute links
                lambda href: href and 'http://' in href,
                (a.get('href') for a in self.soup.findAll('a')))
        except Exception as e:  # Magnificent exception handling
            print 'Error: ', e

        # Update links
        self.links = self.links.union(set(page_links))

        # Choose a random url from the non-visited set (guard against an empty set)
        unvisited = self.links.difference(self.visited_links)
        if unvisited:
            self.current_page = random.sample(unvisited, 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop earlier if every url has been fetched)
        while len(self.visited_links) < 3:
            self.open()
            if self.visited_links == self.links:  # nothing left to visit
                break
        for link in self.links:
            print link


if __name__ == '__main__':
    C = Crawler()
    C.run()
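The usual gevent pattern for I/O-bound code like this is to monkey-patch the standard library so that blocking calls such as urllib2.urlopen yield to other greenlets, and then run several downloads concurrently with gevent.spawn and gevent.joinall. A minimal sketch of that pattern under Python 2, using a couple of placeholder URLs rather than the crawler's discovered links:

import gevent.monkey; gevent.monkey.patch_all(thread=False)

import gevent
import urllib2

def fetch(url):
    # With the monkey patch applied, urlopen yields to other greenlets
    # while it waits on the network instead of blocking the whole process.
    body = urllib2.urlopen(url).read()
    print url, len(body)

# Placeholder URLs for illustration only.
urls = ["http://www.python.org/", "http://www.python.org/about/"]
gevent.joinall([gevent.spawn(fetch, u) for u in urls])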
Update 1
import gevent.monkey; gevent.monkey.patch_thread()

import gevent
from bs4 import BeautifulSoup
import urllib2
import itertools
import random
import urlparse
import sys

import gevent.monkey; gevent.monkey.patch_all(thread=False)


class Crawler(object):
    def __init__(self):
        self.soup = None                              # Beautiful Soup object
        self.current_page = "http://www.python.org/"  # Current page's address
        self.links = set()                            # Queue with every link fetched
        self.visited_links = set()
        self.counter = 0                              # Simple counter for debug purposes

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # Fetch every link
        self.soup = BeautifulSoup(html_code)
        page_links = []
        try:
            for link in [h.get('href') for h in self.soup.find_all('a')]:
                if link is None:  # skip <a> tags without an href attribute
                    continue
                print "Found link: '" + link + "'"
                if link.startswith('http'):
                    print 'entered in if link: ', link
                    page_links.append(link)
                    print "Adding link " + link + "\n"
                elif link.startswith('/'):
                    print 'entered in elif link: ', link
                    parts = urlparse.urlparse(self.current_page)
                    page_links.append(parts.scheme + '://' + parts.netloc + link)
                    print "Adding link " + parts.scheme + '://' + parts.netloc + link + "\n"
                else:
                    print 'entered in else link: ', link
                    page_links.append(self.current_page + link)
                    print "Adding link " + self.current_page + link + "\n"
        except Exception as ex:  # Magnificent exception handling
            print ex

        # Update links
        self.links = self.links.union(set(page_links))

        # Choose a random url from the non-visited set (guard against an empty set)
        unvisited = self.links.difference(self.visited_links)
        if unvisited:
            self.current_page = random.sample(unvisited, 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop earlier if every url has been fetched)
        crawling_greenlets = []
        for i in range(3):
            crawling_greenlets.append(gevent.spawn(self.open))
        gevent.joinall(crawling_greenlets)
        #while len(self.visited_links) < 4 or (self.visited_links == self.links):
        #    self.open()
        for link in self.links:
            print link


if __name__ == '__main__':
    C = Crawler()
    C.run()
Getting this error: Exception KeyError: KeyError(15886544,) in <module 'threading' from '/usr/lib/python2.7/threading.pyc'> ignored –
Try not to patch the thread module: monkey.patch_all(thread=False); apart from this error (which you can ignore, by the way), does it work as you expected? – mguijarr
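In other words, do the patching once, at the very top of the script, before the network modules are used, and leave the thread module alone; something like this:

# Patch everything except threads, before any sockets are created.
# Under Python 2 this avoids the "Exception KeyError ... in
# <module 'threading' from '/usr/lib/python2.7/threading.pyc'> ignored"
# message printed at interpreter shutdown.
import gevent.monkey
gevent.monkey.patch_all(thread=False)

import urllib2  # now uses gevent's cooperative sockets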
Why does it print '0 : http://www.python.org/' 3 times? –
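All three greenlets are spawned with the same bound method and read the shared self.current_page and self.counter before any of them has finished downloading and updated those attributes, so each one starts from '0 : http://www.python.org/'. One way around that shared state, sketched here as a hypothetical rework rather than anything from this thread, is to hand each greenlet its own work item through a gevent queue:

import gevent.monkey; gevent.monkey.patch_all(thread=False)

import gevent
import gevent.queue
import urllib2

tasks = gevent.queue.Queue()
tasks.put("http://www.python.org/")

def worker(n):
    while True:
        try:
            url = tasks.get(timeout=2)  # each greenlet takes a distinct url
        except gevent.queue.Empty:
            return                      # no work left, let the greenlet finish
        print n, ":", url
        html_code = urllib2.urlopen(url).read()
        # ... parse html_code and tasks.put() any newly found links here ...

gevent.joinall([gevent.spawn(worker, i) for i in range(3)])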