2017-08-16 72 views
0

解析CSV文件,将列转换为行、行转换为列

我有一个CSV文件,其中第一行是日期,第一列是板块(sector),如下所示:

Date,7/2/2007,7/3/2007,7/5/2007,7/6/2007,7/9/2007 
A,0,1,3,2,0 
AA,23,423,2,0,0 
AAL,34,23,5,0,234 
AGCG,234,0,9,234,23 
XL,0,65,34,34,34 

现在我想生成另一个文件,内容如下:

Date,Sector 
7/2/2007,AA 
7/2/2007,AAL 
7/2/2007,AGCG 
7/3/2007,A 
7/3/2007,AA 
7/3/2007,AAL 
7/3/2007,XL 
... 

背后的逻辑是:对于每个日期,我想列出该日期下数值不为0的板块。

到目前为止我尝试过的代码:

import csv,sys
from collections import defaultdict

# Maps each date label (a header cell) to the list of tickers recorded for it.
dd = defaultdict(list)
# Date labels taken from the header row, in column order.
dateList = []
# False until the reader loop below has consumed the header row.
header = False

def createFile(di):
    """Write one ``ticker,date`` row per collected entry to ``cum_file.csv``.

    Each pair is also echoed to stdout as ``ticker date``.

    Args:
        di: mapping of date label -> iterable of tickers recorded for it.
    """
    # The context manager guarantees the file is flushed and closed even if
    # writing fails; the original left the handle open.
    # NOTE: on Python 3 the csv docs recommend open(..., newline=""); it is
    # omitted because the built-in open() on Python 2 does not accept it.
    with open("cum_file.csv", "w") as ff:
        csvwriter = csv.writer(ff)
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
        for d, t in di.items():
            for tt in t:
                # Single-argument print behaves the same on Python 2 and 3.
                print("%s %s" % (tt, d))
                csvwriter.writerow([tt, d])

# Read the wide CSV (first row = dates, first column = ticker) and record in
# dd, for every date, the tickers whose cell is not the string '0'.
# Python 2 syntax (print statements).
#with open("./data/StrategyAcctValue-Daily.csv") as f: 
with open("./try/test.csv") as f: 
    reader = csv.reader(f,delimiter=",") 
    for line in reader: 
     # col1 is True only while the first cell of the current row is pending.
     col1 = True 
     if header: 
      # Data row: the first cell is the ticker, the remaining cells are values.
      #sys.exit() 
      for eachCol in line: 
       if col1: 
        col1 = False 
        tkr = eachCol 
       elif eachCol != '0': 
        # NOTE(review): list.index() returns the position of the FIRST cell
        # equal to eachCol, so a value that repeats within the row is mapped
        # to the earlier column's date each time. This is consistent with the
        # duplicated rows in the output quoted after this snippet.
        tkrIndex = line.index(eachCol) 
        # -1 because dateList does not contain the first (label) column.
        tickerDate = dateList[tkrIndex - 1] 
        dd[tickerDate].append(tkr) 
       else: 
        continue 
      #print dd 
      #createFile(dd) 
      #sys.exit() 
     else: 
      # Header row: remember every date label and pre-register it in dd.
      header = True 
      for eachCol in line: 
      # print line.index(eachCol) 
      # continue 
       if col1: 
        col1 = False 
        tkr = eachCol 
       else: 
        dd[eachCol] = [] 
        dateList.append(eachCol) 
      print dateList 
    # Dump the collected mapping, then write it out via createFile.
    print dd 
    createFile(dd) 

它给出的输出如下:

A 7/3/2007 
AA 7/3/2007 
AAL 7/3/2007 
XL 7/3/2007 
A 7/6/2007 
AAL 7/9/2007 
AGCG 7/9/2007 
AA 7/2/2007 
AAL 7/2/2007 
AGCG 7/2/2007 
AGCG 7/2/2007 
A 7/5/2007 
AA 7/5/2007 
AAL 7/5/2007 
AGCG 7/5/2007 
XL 7/5/2007 
XL 7/5/2007 
XL 7/5/2007 

我找不到自己哪里做错了。

回答

1
import pandas as pd 

# Reshape the wide table (one column per date) into (Date, Sector) pairs,
# keeping only the cells whose value is not 0.
df = pd.read_csv("input.csv")

# The first column is headed "Date" but holds the sector names; every other
# column header is a date.
date_columns = [column for column in df.columns if column != "Date"]

# Collect the pairs in a plain list and build the frame once: growing a
# DataFrame row by row with .loc re-allocates it on every insert.
records = []
for _, row in df.iterrows():
    for day in date_columns:
        if row[day] != 0:
            records.append((day, row["Date"]))

df1 = pd.DataFrame(records, columns=["Date", "Sector"])

# To order the output chronologically, assign the result back, e.g.
#   df1 = df1.sort_values(by="Date", key=pd.to_datetime)   # pandas >= 1.1
# sort_values() is not in-place, and a plain string sort would order the
# dates lexically rather than by calendar date.
df1.to_csv("output.csv", index=False)

Output.csv

Date,Sector 
7/6/2007,A 
7/3/2007,A 
7/5/2007,A 
7/3/2007,AA 
7/2/2007,AA 
7/5/2007,AA 
7/9/2007,AAL 
7/3/2007,AAL 
7/2/2007,AAL 
7/5/2007,AAL 
7/9/2007,AGCG 
7/6/2007,AGCG 
7/2/2007,AGCG 
7/5/2007,AGCG 
7/9/2007,XL 
7/6/2007,XL 
7/3/2007,XL 
7/5/2007,XL 
+0

谢谢你的工作! – ggupta

+0

@ggupta乐于提供帮助。 –

0
with open("test.csv", 'r') as f:
    # zip(*rows) transposes the table: data[0] is the first column (label +
    # sector names) and every later tuple is (date, value, value, ...).
    # Blank lines are skipped (a single one would make zip truncate every
    # column to length 1) and cells are stripped so that '0 ' still counts as 0.
    data = list(zip(*([cell.strip() for cell in line.split(',')] for line in f if line.strip())))
    # Position -> sector name; empty when the file has no rows at all.
    temp = dict(enumerate(data[0][1:])) if data else {}
    for key, *values in data[1:]:
        for index, value in enumerate(values):
            if value != '0':
                print(key, temp[index])

7/2/2007 AA 
7/2/2007 AAL 
7/2/2007 AGCG 
7/3/2007 A 
7/3/2007 AA 
7/3/2007 AAL 
7/3/2007 XL 
7/5/2007 A 
7/5/2007 AA 
7/5/2007 AAL 
7/5/2007 AGCG 
7/5/2007 XL 
7/6/2007 A 
7/6/2007 AGCG 
7/6/2007 XL 
7/9/2007 AAL 
7/9/2007 AGCG 
7/9/2007 XL 
+0

文件中的板块太多,无法手动准备 temp。 – ggupta

+0

@ggupta我已经更新了答案自动构建'temp'。 – stamaimer

+0

你为什么在第二行使用 `*`? – ggupta

0

使用numpy,您可以将csv数据转换为数组(矩阵)。 然后你可以迭代转置矩阵。

import numpy as np 

# dtype=object keeps the numbers as ints. Without it NumPy coerces the whole
# mixed table to strings, and a comparison such as '0' != 0 is always true,
# so every pair would be printed, zeros included.
data = np.array([['Date','7/2/2007','7/3/2007','7/5/2007','7/6/2007','7/9/2007'],
['A',0,1,3,2,0],
['AA',23,423,2,0,0],
['AAL',34,23,5,0,234],
['AGCG',234,0,9,234,23],
['XL',0,65,34,34,34]], dtype=object)

# data.T is the transposed table; its first row is the original first column:
# the 'Date' label followed by the place names.
index = data.T[0]
# Iterate over the dates (rows of the transposed table), skipping that label row.
for date in data.T[1:]:
    # date[0] is the date label; the remaining cells line up with index[1:].
    for place, value in zip(index[1:], date[1:]):
        if value != 0:
            print(date[0], place)
0

我找到问题所在了,在此回答自己的问题,并用注释标出了改动。

import csv,sys 
from collections import defaultdict 

# Maps each date label (a header cell) to the list of tickers recorded for it.
dd = defaultdict(list)
# Date labels taken from the header row, in column order.
dateList = []
# False until the reader loop below has consumed the header row.
header = False

def createFile(di):
    """Write one ``ticker,date`` row per collected entry to ``cum_file.csv``.

    Each pair is also echoed to stdout as ``ticker date``.

    Args:
        di: mapping of date label -> iterable of tickers recorded for it.
    """
    # The context manager guarantees the file is flushed and closed even if
    # writing fails; the original left the handle open.
    # NOTE: on Python 3 the csv docs recommend open(..., newline=""); it is
    # omitted because the built-in open() on Python 2 does not accept it.
    with open("cum_file.csv", "w") as ff:
        csvwriter = csv.writer(ff)
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
        for d, t in di.items():
            for tt in t:
                # Single-argument print behaves the same on Python 2 and 3.
                print("%s %s" % (tt, d))
                csvwriter.writerow([tt, d])

# Read the wide CSV (first row = dates, first column = ticker) and record in
# dd, for every date, the tickers whose cell is not the string '0'.
with open("./try/test.csv") as f:
    reader = csv.reader(f, delimiter=",")
    for line in reader:
        if not line:
            # csv.reader yields [] for a blank line; nothing to record.
            continue
        if not header:
            # Header row: remember every date label and pre-register it in dd
            # so that dates without any non-zero cell still appear as keys.
            header = True
            for eachCol in line[1:]:
                dd[eachCol] = []
                dateList.append(eachCol)
            print(dateList)
            continue
        tkr = line[0]
        # The fix: pair each value with its date BY POSITION. The earlier
        # line.index(eachCol) lookup returns the first cell equal to the
        # value, so repeated values in a row were credited to the wrong date.
        # Blanking the matched cell hid that for duplicates, but it still went
        # wrong when a value equalled the ticker or when a row had empty cells.
        # Cells beyond the last header column are ignored by zip().
        for tickerDate, eachCol in zip(dateList, line[1:]):
            if eachCol != '0':
                dd[tickerDate].append(tkr)
    # Dump the collected mapping, then write it out via createFile.
    print(dd)
    createFile(dd)

谢谢你的时间。