2015-05-28 191 views
0

我试着编写一个定义类的代码:网页和crwaler。 对象:构建一个通用的搜索引擎。 我在我的电脑上找到了一个带有几个“web_pages”格式的文件夹。 .im抱歉所有的代码,但我不知道如何使这个问题具体没有它。如何打破循环而不打破包含它的循环?

import re 
import os 

def remove_html_tags(s): 
    tag = False 
    quote = False 
    out = "" 

    for c in s: 
      if c == '<' and not quote: 
       tag = True 
      elif c == '>' and not quote: 
       tag = False 
      elif (c == '"' or c == "'") and tag: 
       quote = not quote 
      elif not tag: 
       out = out + c 

    return out 


def lev(s1, s2): 
    return lev_iter(s1, s2, dict()) 

def lev_iter(s1, s2, mem): 

    (i,j) = (len(s1), len(s2)) 
    if (i,j) in mem: 
     return mem[(i,j)] 

    s1_low = s1.lower() 
    s2_low = s2.lower() 
    if len(s1_low) == 0 or len(s2_low) == 0: 
     return max(len(s1_low), len(s2_low)) 
    d1 = lev_iter(s1_low[:-1], s2_low, mem) + 1 
    d2 = lev_iter(s1_low, s2_low[:-1], mem) + 1 
    last = 0 if s1_low[-1] == s2_low[-1] else 1 
    d3 = lev_iter(s1_low[:-1], s2_low[:-1], mem) + last 
    result = min(d1, d2, d3) 

    mem[(i,j)] = result 

    return result 




""" A Class that holds data on a Web page """ 
class WebPage: 

    def __init__(self, filename): 

     self.filename = filename 

    def process(self): 

     f = open(self.filename,'r') 
     LINE_lst = f.readlines() 

     self.info = {} 

     for i in range(len(LINE_lst)): 
      LINE_lst[i] = LINE_lst[i].strip(' \n\t') 
      LINE_lst[i] = remove_html_tags(LINE_lst[i]) 
     lines = LINE_lst[:] 
     for line in lines: 
      if len(line) == 0: 
       LINE_lst.remove(line) 
     self.body = ' '.join(LINE_lst[1:]) 
     self.title = LINE_lst[0] 
     f.close() 

    def __str__(self): 
     return self.title + '\n' + self.body 

    def __repr__(self): 
     return self.title 

    def __eq__(self,other): 
     n = lev(self.body,other.body) 
     k = len(self.body) 
     m = len(other.body) 
     return float(n)/max(k,m) <= 0.15 

    def __lt__(self,other): 
     return self.title < other.title 

""" A Class that crawls the web """  
class Crawler: 
    def __init__(self, directory): 

     self.folder = directory 

    def crawl(self): 

     pages = [f for f in os.listdir(self.folder) if f.endswith('.html')] 

     final_list = [] 

     for page in pages: 

      page = WebPage(self.folder + '\\' + page) 
      page.process() 

      for k in range(len(final_list)+1): 

       if k == len(final_list): 

        final_list.append(page) 

       elif page == final_list[k]: 
        if page < final_list[k]: 
         final_list.remove(final_list[k]) 
         final_list.append(page) 

     self.crawl = final_list 

好吧..除了crawl方法之外的一切工作。 也许我会做这一切都错了,我不知道。 我想打破循环,只要k等于final_list的长度,但不打破包含它的循环。 任何sugestions?

回答

6

这就是break关键字确实。它只打破最内层的循环。