2016-11-17 30 views
0
"SURNAME","GIVENNAME","MIDDLENAME","UPIN","NAME","CODE" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770" 
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770 

假设这是一个大文件的格式。我想把它按指定大小分割成多个文件,并且每个分割后的文件都需要包含标题行("SURNAME","GIVENNAME","MIDDLENAME","UPIN","NAME","CODE")。提前感谢。我想用 Python 把一个巨大的文件分割成多个文件,并在所有分割后的文件中保留标题行。


import os 
import sys 

def getfilesize(filename):
    """Return the size of *filename* in bytes.

    The file is opened in binary mode and the cursor is moved to the end;
    the resulting offset is the byte length. The size is also printed to
    stdout as a progress/debug message.
    """
    with open(filename, "rb") as handle:
        # whence=2 means "relative to the end of the file", so offset 0
        # places the cursor exactly at EOF.
        handle.seek(0, 2)
        n_bytes = handle.tell()
        print("getfilesize: size: %s" % n_bytes)
        return n_bytes

def splitfile(filename, splitsize):
    """Split *filename* into consecutive parts of at most *splitsize* bytes.

    Parts are written next to the source file and are named
    ``<root>_<n><ext>`` with ``n`` starting at 1 (``data.csv`` becomes
    ``data_1.csv``, ``data_2.csv``, ...). Existing part files are
    overwritten. The split is purely byte based: a part may end in the
    middle of a line, and no header line is repeated.

    Args:
        filename: Path of the file to split.
        splitsize: Maximum size of each part, in bytes.

    Returns:
        None. If *filename* is not an existing file a message is printed
        and nothing is written. An empty source file produces no parts.

    Raises:
        ValueError: If *splitsize* is not a positive number of bytes.
    """
    if not os.path.isfile(filename):
        print("No such file as: \"%s\"" % filename)
        return

    splitsize = int(splitsize)
    if splitsize <= 0:
        raise ValueError("splitsize must be a positive number of bytes")

    filesize = os.path.getsize(filename)
    print("splitfile: No of splits required: %s" % str(filesize // splitsize))

    # splitext copes with names that have no extension or several dots,
    # which a plain split(".") does not.
    root, ext = os.path.splitext(filename)
    readlimit = 1000000  # copy at most ~1 MB per read to bound memory use
    counter = 1

    with open(filename, "rb") as fr:
        while True:
            # Read before opening the output so that no empty trailing
            # part is created once the source is exhausted.
            data = fr.read(min(readlimit, splitsize))
            if not data:
                break
            remaining = splitsize - len(data)
            part_name = "{}_{}{}".format(root, counter, ext)
            with open(part_name, "wb") as fw:
                fw.write(data)
                # Keep copying until this part holds exactly splitsize
                # bytes or the source runs out.
                while remaining > 0:
                    data = fr.read(min(readlimit, remaining))
                    if not data:
                        break
                    fw.write(data)
                    remaining -= len(data)
            counter += 1

if __name__ == "__main__":
    # Command line: filesplit.py <filename> <split size in kB>
    if len(sys.argv) >= 3:
        # The size argument is given in kB; splitfile expects bytes.
        filesize = int(sys.argv[2]) * 1000
        filename = sys.argv[1]
        splitfile(filename, filesize)
    else:
        print("Filename or splitsize not provided: Usage:  filesplit.py filename splitsizeinkb ")

这段代码可以正常工作,但分割后的文件里没有标题行。抱歉,我是 Stack Overflow 的新手。

+0

你尝试过什么吗?请发布您的尝试。还有什么样的文件分裂成这些?什么是标准,你能提供样品吗? – EoinS

+0

这里有什么问题? – marcadian

+0

它必须在Python吗?其他方法可能会更快。无论如何,告诉我们你试过了什么。 –

回答

0

这应该这样做

import os 

# Split a text file into parts of at most `maxlines` data lines each,
# repeating the first line of the input (the header) at the top of every part.
# Parts are written next to the input as <name>0.<ext>, <name>1.<ext>, ...
maxlines = 1000  # maximum number of data lines per output file (header excluded)
infilepath = 'path/to/file'

# splitext also handles paths without an extension, where rsplit('.', 1)
# followed by two-name unpacking would raise ValueError.
base, ext = os.path.splitext(infilepath)

with open(infilepath) as infile:
    header = infile.readline()
    outfile = None  # opened lazily, so an input without data rows creates no part
    try:
        for i, line in enumerate(infile):
            if i % maxlines == 0:
                # Start of a new part: finish the previous one, then open
                # the next and write the header first.
                if outfile is not None:
                    outfile.close()
                outfile = open("{}{}{}".format(base, i // maxlines, ext), 'w')
                outfile.write(header)
            outfile.write(line)
    finally:
        # Close the last part even if reading or writing failed part-way.
        if outfile is not None:
            outfile.close()
+0

谢谢inspectorG4dget,但我得到错误Traceback(最近呼叫最后): 文件“C:/Users/Henry/Desktop/G_Scripts/Py/split_new.py”,第7行,在 outfile.close() NameError:名称'outfile'没有定义,我是新来的Stackoverflow和python的基本知识,提前致谢 – GOU7HAM

+0

@ GOU7HAM:什么是错误 – inspectorG4dget

+0

回溯(最近一次调用最后一次): 文件“C:/Users/Henry/Desktop/G_Scripts/Py/split_new.py”,第7行,在 outfile.close() NameError: name 'outfile' is not defined – GOU7HAM

2

我用 pandas 把大文件分割成更小的文件

import pandas as pd 

# Split a large CSV into numbered chunk files using pandas' chunked reader.
infile = 'path/to/file.csv'  # path to your file

# chunksize is the number of data rows read per chunk, and therefore the
# number of rows written to each output file.
for n, chunk in enumerate(pd.read_csv(infile, sep=',', chunksize=1000000)):
    oPath = 'chunk_' + str(n) + '.csv'
    # header=True repeats the column names at the top of every chunk file;
    # the default comma separator keeps the parts in the input's format.
    chunk.to_csv(oPath, index=False, header=True)

chunksize 表示每个输出文件中包含多少行。