0
我只想从两个tsv文件中自动提取小样本。采样行不一定必须精确,每个采样只需要均匀分布。当切割发生时,bash shell会输出'tail:stdout:Broken pipe',尽管程序起初似乎仍然运行正常。我不是特别喜欢我的程序输出“Broken”这个词,但我并不在乎。问题是每个后续的'砍'需要更长的时间,我不明白为什么。我有内存泄漏吗?有什么我应该关闭吗?我也不喜欢有除了声明之外的尝试,但我不确定解决这个问题的好方法。在python中使用bash命令切割文件运行缓慢'tail:stdout:Broken pipe'
import os
import sys
import subprocess
import commands
import csv as tsv
def main(scorebreaks, positives, negatives):
#just to isolate the attributeId
newpositives = os.path.basename(positives)
attributeid = newpositives.rstrip('-positive.tsv')
#create output folder if it doesn't exist
path_to_script_dir = os.path.dirname(os.path.abspath(positives))
newpath = path_to_script_dir + '/ezcut_output'
if not os.path.exists(newpath): os.makedirs(newpath)
with open(scorebreaks, 'rb') as tsvfile:
tsvreader = tsv.reader(tsvfile, delimiter='\t')
scorebreakslist = zip(*(line.strip().split('\t') for line in tsvfile))
#print scorebreakslist[0][1] #would give line number at .99
#print scorebreakslist[1][1] #would give .99
whatiteration = input('What iteration? ')
chunksize = input('Chunk size? ')
numberofchunks = int(input('Number of chunks? '))-1
scorejumpamt = 1.0/numberofchunks #number of chunks is 20? score jump amt == .05
#print scorejumpamt
scorei = 1.0
choparray = [100]
while True: #cause i needed a do-while loop
scorei = float(scorei) - float(scorejumpamt)
scorei = '%.2f'%(scorei)
#print scorei
if float(scorei) < 0.00: break
try:
arraynum = scorebreakslist[1].index(str(scorei))
except ValueError:
break
#print scorebreakslist[1]
#add the linenumber to an array for use in cutting
choparray.append(scorebreakslist[0][arraynum])
#print len(choparray)
#the actual file manipulation section of code
index=0
for number in choparray:
indexkinda = 1-float(scorejumpamt)*float(index)
indexkinda = '%.2f'%(indexkinda)
#print indexkinda
if indexkinda < 0: break
if float(indexkinda) > 0.50:
#print indexkinda
cmd = 'tail -n+%s %s | head -n%s > %s/%s-%s-%s.tsv' % (number, positives, chunksize, newpath, indexkinda, attributeid, whatiteration)
subprocess.call(cmd, shell=True)
#subprocess.call(cmd, shell=True)
index+=1
else: #maybe make this not get anything below 0.1 for speed
#print indexkinda
cmd = 'tail -n+%s %s | head -n%s > %s/%s-%s-%s.tsv' % (number, negatives, chunksize, newpath, indexkinda, attributeid, whatiteration)
subprocess.call(cmd, shell=True)
index+=1
main(sys.argv[1], sys.argv[2], sys.argv[3])
虽然我不重读任何文件的相同部分? 'tail -n +(somenumber)占据文件的底部。所以文件的相同部分永远不会被读取。 – 2014-09-28 16:21:14
@ user2005645,否,'tail'必须重新读取所有这些行才能跳过它们。没有办法直接说“跳到文件中的第n行”。 – o11c 2014-10-01 22:59:51