
Python web crawler (NameError: name 'spider' is not defined)

I am trying to run a Python web crawler that I found online at http://www.netinstructions.com/how-to-make-a-web-crawler-in-under-50-lines-of-python-code/.

However, I am running into problems with the example when I run it through the Python 3.5.2 Shell.

spider("http://www.dreamhost.com", "secure", 200) 给我的留言:
Traceback (most recent call last):
  File "", line 1, in 
    spider("http://www.dreamhost.com", "secure", 200)
NameError: name 'spider' is not defined

from html.parser import HTMLParser 
from urllib.request import urlopen 
from urllib import parse 

class LinkParser(HTMLParser): 

def handle_starttag(self, tag, attrs): 
    if tag == 'a': 
     for (key, value) in attrs: 
      if key == 'href': 
       newUrl = parse.urljoin(self.baseUrl, value) 
       self.links = self.links + [newUrl] 

def getLinks(self, url): 
    self.links = [] 
    self.baseUrl = url 
    response = urlopen(url) 
    if response.getheader('Content-Type')=='text/html': 
     htmlBytes = response.read() 
     htmlString = htmlBytes.decode("utf-8") 
     self.feed(htmlString) 
     return htmlString, self.links 
    else: 
     return "",[] 

def spider(url, word, maxPages): 
    pagesToVisit = [url] 
    numberVisited = 0 
    foundWord = False 
    while numberVisited < maxPages and pagesToVisit != [] and not  foundWord: 
    numberVisited = numberVisited +1 
    url = pagesToVisit[0] 
    pagesToVisit = pagesToVisit[1:] 
    try: 
     print(numberVisited, "Visiting:", url) 
     parser = LinkParser() 
     data, links = parser.getLinks(url) 
     if data.find(word)>-1: 
      foundWord = True 
     pagesToVisit = pagesToVisit + links 
     print(" **Success!**") 
    except: 
     print(" **Failed!**") 
if foundWord: 
    print("The word", word, "was found at", url) 
else: 
    print("Word never found") 

How are you running it? What exactly are you typing into the REPL? –
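For reference, the usual ways to get the definitions loaded before calling spider are to open the file in IDLE and run it with Run Module (F5), or, from a plain REPL, to execute the file by hand (the filename crawler.py here is an assumption, not from the original post):

>>> exec(open("crawler.py").read())
>>> spider("http://www.dreamhost.com", "secure", 200)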

Answer


Yourno,

Buddy, you have indentation problems in your code. After the class definition there is no indentation before the methods handle_starttag and getLinks, and in the function spider the if-else part is missing its indentation as well. Because those errors stop Python from executing the pasted code, spider is never defined, which is why you get the NameError. Please check your code against the code posted at the link you provided. Please find the updated working code below:

from html.parser import HTMLParser 
from urllib.request import urlopen 
from urllib import parse 

# We are going to create a class called LinkParser that inherits some 
# methods from HTMLParser which is why it is passed into the definition 
class LinkParser(HTMLParser): 

    # This is a function that HTMLParser normally has 
    # but we are adding some functionality to it 
    def handle_starttag(self, tag, attrs): 
     # We are looking for the beginning of a link. Links normally look 
     # like <a href="www.someurl.com"></a> 
     if tag == 'a': 
      for (key, value) in attrs: 
       if key == 'href': 
        # We are grabbing the new URL. We are also adding the 
        # base URL to it. For example: 
        # www.netinstructions.com is the base and 
        # somepage.html is the new URL (a relative URL) 
        # 
        # We combine a relative URL with the base URL to create 
        # an absolute URL like: 
        # www.netinstructions.com/somepage.html 
        newUrl = parse.urljoin(self.baseUrl, value) 
        # And add it to our collection of links: 
        self.links = self.links + [newUrl] 

    # This is a new function that we are creating to get links 
    # that our spider() function will call 
    def getLinks(self, url): 
     self.links = [] 
     # Remember the base URL which will be important when creating 
     # absolute URLs 
     self.baseUrl = url 
     # Use the urlopen function from the standard Python 3 library 
     response = urlopen(url) 
     # Make sure that we are looking at HTML and not other things that 
     # are floating around on the internet (such as 
     # JavaScript files, CSS, or .PDFs for example) 
     if response.getheader('Content-Type')=='text/html': 
      htmlBytes = response.read() 
      # Note that feed() handles Strings well, but not bytes 
      # (A change from Python 2.x to Python 3.x) 
      htmlString = htmlBytes.decode("utf-8") 
      self.feed(htmlString) 
      return htmlString, self.links 
     else: 
      return "",[] 

# And finally here is our spider. It takes in a URL, a word to find, 
# and the number of pages to search through before giving up 
def spider(url, word, maxPages): 
    pagesToVisit = [url] 
    numberVisited = 0 
    foundWord = False 
    # The main loop. Create a LinkParser and get all the links on the page. 
    # Also search the page for the word or string 
    # In our getLinks function we return the web page 
    # (this is useful for searching for the word) 
    # and we return a set of links from that web page 
    # (this is useful for where to go next) 
    while numberVisited < maxPages and pagesToVisit != [] and not foundWord: 
     numberVisited = numberVisited +1 
     # Start from the beginning of our collection of pages to visit: 
     url = pagesToVisit[0] 
     pagesToVisit = pagesToVisit[1:] 
     try: 
      print(numberVisited, "Visiting:", url) 
      parser = LinkParser() 
      data, links = parser.getLinks(url) 
      if data.find(word)>-1: 
       foundWord = True 
       foundAtUrl = url 
      # Added else, so if the desired word is not found, foundWord stays False 
      else: 
       foundWord = False 
      # Add the pages that we visited to the end of our collection 
      # of pages to visit (for every page, found or not, so the crawl 
      # can continue): 
      pagesToVisit = pagesToVisit + links 
      print(" **Success!**") 
     except: 
      print(" **Failed!**") 
     # Moved this if-else block inside the while loop, so for every URL 
     # it reports whether the desired word was found 
     if foundWord: 
      print("The word", word, "was found at", url) 
     else: 
      print("Word never found") 

spider("http://www.dreamhost.com", "secure", 200) 

Please let me know if you still have any questions/queries.


Thank you for the response, Khan. Unfortunately, I am still running into the original error even with the "working code" you provided. –


Hello @EmilioPagan-Yourno, yes, I see that you are using the Python shell to run the code. I have updated my code above. It will definitely work. Let me know. –


Thank you so much for your help. I figured it out! –