2013-08-30 71 views
-1

我做了一个脚本,该脚本将映射我的目录,给我统计一下...... 这里的脚本:- Python脚本抛出内存错误

import os 
import hashlib 
import platform 
import sys 
import argparse 
import HTML 

class Map(object): 

    def __init__(self,param): 
     self.param_list = param 
     self.slash = self.slash_by_os() 
     self.result_list = [] 
     self.os = "" 


    def calc_md5(self,file_path): 
     with open(file_path) as file_to_check: 
      data = file_to_check.read()  
      md5_returned = hashlib.md5(data).hexdigest() 

     return md5_returned 

    def slash_by_os(self): 
     general_id = platform.system() 
     actual_os = "" 

     if general_id == "Darwin" or general_id == "darwin": 
      actual_os = "UNIX" 
     elif general_id == "Linux" or general_id == "linux": 
      actual_os = "UNIX" 
     elif general_id == "SunOS": 
      actual_os = "UNIX" 
     elif general_id == "Windows" or general_id == "windows": 
      actual_os = "WIN" 
     else: 
      actual_os = general_id 

     if actual_os == "UNIX": 
      return '/' 
     elif actual_os == "WIN": 
      return '\\' 
     else: 
      return '/' 

     self.os = actual_os 

    def what_to_do(self,new_dir): 
     act = [] 
     act.append(new_dir[:-1]) 
     for param in self.param_list: 
      if param == "md5": 
       x = self.calc_md5(new_dir[:-1]) 
       act.append(x) 
      elif param == "size": 
       x = os.stat(new_dir[:-1]).st_size 
       act.append(x) 
      elif param == "access": 
       x = os.stat(new_dir[:-1]).st_atime 
       act.append(x) 
      elif param == "modify": 
       x = os.stat(new_dir[:-1]).st_mtime 
       act.append(x) 
      elif param == "creation": 
        x = os.stat(new_dir[:-1]).st_ctime 
        act.append(x) 

     return act 

    def list_of_files(self ,dir_name ,traversed = [], results = []): 

     dirs = os.listdir(dir_name) 
     if dirs: 
      for f in dirs: 
       new_dir = dir_name + f + self.slash 
       if os.path.isdir(new_dir) and new_dir not in traversed: 
        traversed.append(new_dir) 
        self.list_of_files(new_dir, traversed, results) 
       else: 
        act = self.what_to_do(new_dir) 
        results.append(act) 
     self.result_list = results 
     return results 


def parse_args(): 
    desc = "Welcom To dirmap.py 1.0" 
    parser = argparse.ArgumentParser(description=desc) 
    parser.add_argument('-p','--path', help='Path To Original Directory', required=True) 
    parser.add_argument('-md','--md5', action = 'store_true',help='Show md5 hash of file', required=False) 
    parser.add_argument('-s','--size', action = 'store_true', help='Show size of file', required=False) 
    parser.add_argument('-a','--access', action = 'store_true', help='Show access time of file', required=False) 
    parser.add_argument('-m','--modify', action = 'store_true', help='Show modification time of file', required=False) 
    parser.add_argument('-c','--creation', action = 'store_true', help='Show creation of file', required=False) 

    args = vars(parser.parse_args()) 

    params = [] 
    for key,value in args.iteritems(): 
     if value == True: 
      params.append(key) 

    return args,params 



def main(): 
    args , params = parse_args() 
    dir_path = args['path'] 
    map = Map(params) 
    dir_list = map.list_of_files(dir_path) 

    params.insert(0,"path") 


    htmlcode_dir = HTML.table(dir_list,header_row=params) 
    print htmlcode_dir 

main() 

当我尝试在中型到大型目录运行它抛出我MemoryError异常...... ,你可以在这里看到:

python(2374) malloc: *** mmap(size=140514183884800) failed (error code=12) 
*** error: can't allocate region 
*** set a breakpoint in malloc_error_break to debug 
Traceback (most recent call last): 
    File "dirmap.py", line 132, in <module> 
    main() 
    File "dirmap.py", line 124, in main 
    dir_list = map.list_of_files(dir_path) 
    File "dirmap.py", line 86, in list_of_files 
    self.list_of_files(new_dir, traversed, results) 
    File "dirmap.py", line 86, in list_of_files 
    self.list_of_files(new_dir, traversed, results) 
    File "dirmap.py", line 86, in list_of_files 
    self.list_of_files(new_dir, traversed, results) 
    File "dirmap.py", line 88, in list_of_files 
    act = self.what_to_do(new_dir) 
    File "dirmap.py", line 60, in what_to_do 
    x = self.calc_md5(new_dir[:-1]) 
    File "dirmap.py", line 25, in calc_md5 
    data = file_to_check.read() 
MemoryError 

什么想法?

+0

你可以粘贴回溯?我也建议不要[使用列表作为关键字参数的默认值](http://pythonconquerstheuniverse.wordpress.com/category/python-gotchas/)(正如你在'list_of_files'中所做的那样)。 – beetea

+0

我真的不知道什么是回溯..但是如果没有列出我应该使用什么? –

+0

@beetea:回溯在那里,只是没有格式化得很好。我修复了它。 –

回答

4

您正在一次性将大文件读入内存。不这样做,在成批读它,当您去更新哈希:

def calc_md5(self,file_path): 
    hash = hashlib.md5() 
    with open(file_path, 'rb') as file_to_check: 
     for chunk in iter(lambda: file_to_check.read(4096), ''):  
      hash.update(chunk) 

    return hash.hexdigest() 

这将打开二进制模式下的文件,避免了解释不同的行结束约定(这将改变哈希) 。

上述代码使用iter() function的双参数形式,其中第二个参数是sentinel值;当可调用的第一个参数返回第二个参数时,迭代停止。到达EOF时,Python文件对象返回一个空字符串。

+0

那么我应该使用哪种方法?块或线? –

+0

@FernandoRetimo:在仔细考虑了一下之后,再加上大块和二进制阅读。例如,在文本模式下打开文件可以改变行结束的解释方式。 –

+0

我会研究它尝试它,并告诉它的结果。 –

1

您可能正在运行一个大文件,无法将其全部读入到calc_md5()的内存中。使用缓冲方法