2011-07-25 74 views
3

我正在使用下面的代码片段,通过 HTMLParser 提取页面上的所有链接。提取到的链接中有不少是相对网址。如何将它们转换为以该站点域名(例如 www.example.com)为基准的绝对网址?

(标题:使用 HTMLParser 从页面中提取绝对链接)

import formatter
import htmllib
import urllib
import urlparse

class LinksExtractor(htmllib.HTMLParser):
    """Collect the ``href`` value of every ``<a>`` tag fed to the parser.

    Links are stored in ``self.links`` in the order they are encountered.
    When ``base_url`` is given, each href is resolved against it with
    ``urlparse.urljoin`` so relative links come out absolute; hrefs that
    are already absolute are returned unchanged by ``urljoin``.
    """

    def __init__(self, formatter, base_url=None):
        """Initialise the parser.

        Args:
            formatter: Formatter instance handed to ``htmllib.HTMLParser``.
            base_url: Optional URL of the page being parsed. If omitted
                (the default), hrefs are stored exactly as written.
        """
        # Explicit base-class call: the htmllib parser hierarchy is made of
        # old-style classes in Python 2, so ``super()`` cannot be used.
        htmllib.HTMLParser.__init__(self, formatter)
        self.base_url = base_url
        self.links = []

    def start_a(self, attrs):
        """Handle an opening ``<a>`` tag.

        Args:
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        # An empty ``attrs`` list simply skips the loop, so no length check
        # is needed.
        for name, value in attrs:
            if name == "href":
                if self.base_url:
                    value = urlparse.urljoin(self.base_url, value)
                self.links.append(value)

    def get_links(self):
        """Return the list of collected links (the live list, not a copy)."""
        return self.links


# Page whose anchors are extracted.
PAGE_URL = "http://cis.poly.edu/index.htm"

# A NullFormatter discards rendered text; only the anchor callbacks matter.
htmlparser = LinksExtractor(formatter.NullFormatter())

data = urllib.urlopen(PAGE_URL)
try:
    # Use the URL actually fetched (after any redirect) as the base for
    # resolving relative links.
    base_url = data.geturl()
    htmlparser.feed(data.read())
finally:
    # Python 2's urlopen result is not a context manager, so close it
    # explicitly even if reading or parsing fails.
    data.close()
htmlparser.close()

# urljoin leaves absolute hrefs untouched and resolves relative ones
# against the page URL.
links = [urlparse.urljoin(base_url, link) for link in htmlparser.get_links()]
print(links)

感谢

回答