
I'd like some help refactoring this code to reduce redundant lines/concepts. The code in this def is basically repeated three times. What is a simple way to refactor this Python code to cut down on the repetition?

Constraints:

- I'm a beginner, so a really fancy list comprehension, or turning things into objects with dunders and method overriding, is too advanced for me.
- Built-in modules only. This is Python 2.7 code and may only import os and re.

What the whole script does: it finds files with a fixed prefix. They are pipe-delimited text files. The first line is a header, and there is a footer that can be one or more lines. Depending on the prefix, the script discards the "columns" of the text file that are not needed in a later step. It saves the comma-separated data in a new file with a .csv extension.
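To make the format concrete, here is a minimal sketch with made-up data: a pipe-delimited block whose first line is the header, whose footer starts with LOGO (the marker the code below also uses), and from which only two columns are kept as comma-separated output.

sample = 'RATE_CODE|DESCRIPTION|OTHER\nBAR|Best available rate|x\nLOGO|footer junk|y'
lines = sample.split('\n')
headers = lines[0].split('|')
code_idx = headers.index('RATE_CODE')      # find the wanted columns by header name
desc_idx = headers.index('DESCRIPTION')
for line in lines[1:]:
    if line[:4] == 'LOGO':                 # footer reached; nothing useful after this
        break
    fields = line.split('|')
    print fields[code_idx] + ',' + fields[desc_idx]   # prints: BAR,Best available rate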

Most of the work is done in processRawFiles(). That is the part I want to refactor, because it is very repetitive.
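(allFilestoProcess is built earlier in the script and isn't shown here. Roughly, it is just the files in the working directory whose names start with the fixed prefix; a sketch along these lines, with a placeholder prefix value since the real one isn't shown in this post:)

import os

FIXED_PREFIX = 'cf_'   # placeholder; the real prefix isn't shown in this post

allFilestoProcess = [f for f in os.listdir('.') if f.startswith(FIXED_PREFIX)]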

def separateTranslationTypes(translationFileList):
    '''Takes in a list of all files to process and finds which are roomtypes,
    ratecodes or sourcecodes. The type of file determines how it will be processed.'''
    rates = []
    rooms = []
    sources = []
    for afile in translationFileList:
        rates.append([m.group() for m in re.finditer('cf_ratecodeheader+(.*)', afile)])
        rooms.append([m.group() for m in re.finditer('cf_roomtypes+(.*)', afile)])
        sources.append([m.group() for m in re.finditer('cf_sourcecodes+(.*)', afile)])
    # An empty list equates to False, so x is kept only when its list is not empty.
    rates = [x[0] for x in rates if x]
    rooms = [x[0] for x in rooms if x]
    sources = [x[0] for x in sources if x]
    print '... rateCode files :: ', rates, '\n'
    print '... roomType files :: ', rooms, '\n'
    print '... sourceCode files :: ', sources, '\n'

    return {'rateCodeFiles': rates,
            'roomTypeFiles': rooms,
            'sourceCodeFiles': sources}

groupedFilestoProcess = separateTranslationTypes(allFilestoProcess) 


def processRawFiles(groupedFileDict):
    for key in groupedFileDict:
        # Process the rateCodes file
        if key == 'rateCodeFiles':
            for fname_Value in groupedFileDict[key]: # fname_Value is the filename
                if os.path.exists(fname_Value):
                    workingfile = open(fname_Value, 'rb')
                    filedatastring = workingfile.read() # turns the entire file contents into a single string
                    workingfile.close()
                    outname = 'forUpload_' + fname_Value[:-4:] + '.csv' # removes .txt or any other 3-char extension
                    outputfile = open(outname, 'wb')
                    filedatalines = filedatastring.split('\n') # a list containing each line of the file
                    rawheaders = filedatalines[0] # 1st element of the list is the first row of the file, with the headers
                    parsedheaders = rawheaders.split('|') # turn the header string into a list where | was the delimiter
                    print '\n'
                    print 'outname: ', outname, '\n'
                    # print 'rawheaders: ', rawheaders, '\n'
                    # print 'parsedheaders: ', parsedheaders, '\n'
                    # print filedatalines[0:2]
                    print '\n'
                    ratecodeindex = parsedheaders.index('RATE_CODE')
                    ratecodemeaning = parsedheaders.index('DESCRIPTION')
                    for dataline in filedatalines:
                        if dataline[:4] == 'LOGO':
                            firstuselessline = filedatalines.index(dataline)
                            # print firstuselessline
                    # ignore the first line, which was the headers
                    # stop before the line that starts with LOGO - the first useless line
                    for dataline in filedatalines[1:firstuselessline-1:]:
                        # print dataline.split('|')
                        theratecode = dataline.split('|')[ratecodeindex]
                        theratemeaning = dataline.split('|')[ratecodemeaning]
                        # print theratecode, '\t', theratemeaning, '\n'
                        linetowrite = theratecode + ',' + theratemeaning + '\n'
                        outputfile.write(linetowrite)
                    outputfile.close()

        # Process the roomTypes file
        if key == 'roomTypeFiles':
            for fname_Value in groupedFileDict[key]: # fname_Value is the filename
                if os.path.exists(fname_Value):
                    workingfile = open(fname_Value, 'rb')
                    filedatastring = workingfile.read() # turns the entire file contents into a single string
                    workingfile.close()
                    outname = 'forUpload_' + fname_Value[:-4:] + '.csv' # removes .txt or any other 3-char extension
                    outputfile = open(outname, 'wb')
                    filedatalines = filedatastring.split('\n') # a list containing each line of the file
                    rawheaders = filedatalines[0] # 1st element of the list is the first row of the file, with the headers
                    parsedheaders = rawheaders.split('|') # turn the header string into a list where | was the delimiter
                    print '\n'
                    print 'outname: ', outname, '\n'
                    # print 'rawheaders: ', rawheaders, '\n'
                    # print 'parsedheaders: ', parsedheaders, '\n'
                    # print filedatalines[0:2]
                    print '\n'
                    ratecodeindex = parsedheaders.index('LABEL')
                    ratecodemeaning = parsedheaders.index('SHORT_DESCRIPTION')
                    for dataline in filedatalines:
                        if dataline[:4] == 'LOGO':
                            firstuselessline = filedatalines.index(dataline)
                            # print firstuselessline
                    # ignore the first line, which was the headers
                    # stop before the line that starts with LOGO - the first useless line
                    for dataline in filedatalines[1:firstuselessline-1:]:
                        # print dataline.split('|')
                        theratecode = dataline.split('|')[ratecodeindex]
                        theratemeaning = dataline.split('|')[ratecodemeaning]
                        # print theratecode, '\t', theratemeaning, '\n'
                        linetowrite = theratecode + ',' + theratemeaning + '\n'
                        outputfile.write(linetowrite)
                    outputfile.close()

        # Process the sourceCodes file
        if key == 'sourceCodeFiles':
            for fname_Value in groupedFileDict[key]: # fname_Value is the filename
                if os.path.exists(fname_Value):
                    workingfile = open(fname_Value, 'rb')
                    filedatastring = workingfile.read() # turns the entire file contents into a single string
                    workingfile.close()
                    outname = 'forUpload_' + fname_Value[:-4:] + '.csv' # removes .txt or any other 3-char extension
                    outputfile = open(outname, 'wb')
                    filedatalines = filedatastring.split('\n') # a list containing each line of the file
                    rawheaders = filedatalines[0] # 1st element of the list is the first row of the file, with the headers
                    parsedheaders = rawheaders.split('|') # turn the header string into a list where | was the delimiter
                    print '\n'
                    print 'outname: ', outname, '\n'
                    # print 'rawheaders: ', rawheaders, '\n'
                    # print 'parsedheaders: ', parsedheaders, '\n'
                    # print filedatalines[0:2]
                    print '\n'
                    ratecodeindex = parsedheaders.index('SOURCE_CODE')
                    ratecodemeaning = parsedheaders.index('DESCRIPTION')
                    for dataline in filedatalines:
                        if dataline[:4] == 'LOGO':
                            firstuselessline = filedatalines.index(dataline)
                            # print firstuselessline
                    # ignore the first line, which was the headers
                    # stop before the line that starts with LOGO - the first useless line
                    for dataline in filedatalines[1:firstuselessline-1:]:
                        # print dataline.split('|')
                        theratecode = dataline.split('|')[ratecodeindex]
                        theratemeaning = dataline.split('|')[ratecodemeaning]
                        # print theratecode, '\t', theratemeaning, '\n'
                        linetowrite = theratecode + ',' + theratemeaning + '\n'
                        outputfile.write(linetowrite)
                    outputfile.close()

processRawFiles(groupedFilestoProcess) 

Answer


I had to redo my code because a new case came up where the file has neither a header row nor footer rows. But since the columns I want still appear in the same order, I can just keep them by position. Also, a line is skipped if it has fewer columns than the larger of the two indexes being used.
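The per-line check in someFixedProcess below compares against codeDescriptionIndex only, which happens to be the larger of the two indexes for all three file types. Written out with the larger index made explicit, the guard looks like this (a self-contained sketch using the roomTypeFiles positions from the code below, with made-up line data):

codeIndex, codeDescriptionIndex = 1, 13        # roomTypeFiles positions from typeDict below
for dataline in ['too|short|line', 'a|b|c|d|e|f|g|h|i|j|k|l|m|n']:
    fields = dataline.split('|')
    # index 13 needs at least 14 fields, hence len(fields) > the larger index
    if len(fields) > max(codeIndex, codeDescriptionIndex):
        print fields[codeIndex] + ',' + fields[codeDescriptionIndex]   # prints only: b,n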

As for reducing the repetition, processRawFiles now contains two nested defs, which removes the need to repeat most of the parsing code from before.

def separateTranslationTypes(translationFileList):
    '''Takes in a list of all files to process and finds which are roomtypes,
    ratecodes or sourcecodes. The type of file determines how it will be processed.'''
    rates = []
    rooms = []
    sources = []
    for afile in translationFileList:
        rates.append([m.group() for m in re.finditer('cf_ratecode+(.*)', afile)])
        rooms.append([m.group() for m in re.finditer('cf_roomtypes+(.*)', afile)])
        sources.append([m.group() for m in re.finditer('cf_sourcecodes+(.*)', afile)])
    # An empty list equates to False, so x is kept only when its list is not empty.
    rates = [x[0] for x in rates if x]
    rooms = [x[0] for x in rooms if x]
    sources = [x[0] for x in sources if x]
    print '... rateCode files :: ', rates, '\n'
    print '... roomType files :: ', rooms, '\n'
    print '... sourceCode files :: ', sources, '\n'

    return {'rateCodeFiles': rates,
            'roomTypeFiles': rooms,
            'sourceCodeFiles': sources}

groupedFilestoProcess = separateTranslationTypes(allFilestoProcess) 

def processRawFiles(groupedFileDict):
    def someFixedProcess(bFileList, codeIndex, codeDescriptionIndex):
        for fname_Value in bFileList: # fname_Value is the filename
            if os.path.exists(fname_Value):
                workingfile = open(fname_Value, 'rb')
                filedatastring = workingfile.read() # turns the entire file contents into a single string
                workingfile.close()
                outname = 'forUpload_' + fname_Value[:-4:] + '.csv' # removes .txt or any other 3-char extension
                outputfile = open(outname, 'wb')
                filedatalines = filedatastring.split('\n') # a list containing each line of the file
                # print '\n','outname: ',outname,'\n\n'
                # HEADERS ARE NOT IGNORED! Since the file might not have headers.
                print outname
                for dataline in filedatalines:
                    # print filedatalines.index(dataline), dataline.split('|')
                    # e.g. index 13 requires len 14, so len > index is needed
                    if len(dataline.split('|')) > codeDescriptionIndex:
                        thecode_text = dataline.split('|')[codeIndex]
                        thedescription_text = dataline.split('|')[codeDescriptionIndex]
                        linetowrite = thecode_text + ',' + thedescription_text + '\n'
                        outputfile.write(linetowrite)
                outputfile.close()

    def processByType(aFileList, itsType):
        typeDict = {'rateCodeFiles'  : {'CODE_INDEX': 4, 'DESC_INDEX': 7},
                    'roomTypeFiles'  : {'CODE_INDEX': 1, 'DESC_INDEX': 13},
                    'sourceCodeFiles': {'CODE_INDEX': 2, 'DESC_INDEX': 3}}
        # print 'someFixedProcess(', aFileList, typeDict[itsType]['CODE_INDEX'], typeDict[itsType]['DESC_INDEX'], ')'
        someFixedProcess(aFileList,
                         typeDict[itsType]['CODE_INDEX'],
                         typeDict[itsType]['DESC_INDEX'])

    for key in groupedFileDict:
        processByType(groupedFileDict[key], key)

processRawFiles(groupedFilestoProcess)
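
The CODE_INDEX and DESC_INDEX values in typeDict are hard-coded column positions. If a sample file that still carries its header row is available, those positions can be recovered from the header names the first version relied on (RATE_CODE/DESCRIPTION, LABEL/SHORT_DESCRIPTION, SOURCE_CODE/DESCRIPTION). A small helper along these lines would print them; the sample filename in the usage comment is made up:

def findColumnIndexes(sampleFileName, codeHeader, descHeader):
    '''Print the positional indexes of two named columns in a pipe-delimited header row.'''
    sample = open(sampleFileName, 'rb')
    headers = sample.readline().rstrip('\r\n').split('|')
    sample.close()
    print codeHeader, '->', headers.index(codeHeader)
    print descHeader, '->', headers.index(descHeader)

# e.g. findColumnIndexes('cf_ratecodeheader_sample.txt', 'RATE_CODE', 'DESCRIPTION')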