2016-05-11 37 views
1

pandas DataFrame ValueError:传入值的形状是 (1, 3),而索引要求的形状是 (3, 3)。下面是我的代码。

数据的形状:

data_dict.items() 
Out[57]: 
[('Sympathetic', defaultdict(<type 'int'>, {'2011-10-06': 1})), 
('protest', defaultdict(<type 'int'>, {'2011-10-06': 16})), 
('occupycanada', defaultdict(<type 'int'>, {'2011-10-06': 1})), 
('hating', defaultdict(<type 'int'>, {'2011-10-06': 1})), 
('AND', defaultdict(<type 'int'>, {'2011-10-06': 4})), 
('c', defaultdict(<type 'int'>, {'2011-10-06': 2})), 
...] 

data_dict 的定义如下:

data_dict = defaultdict(lambda: defaultdict(int)) 

我想构建一个如下所示的 DataFrame:

columns = ['word','date',"number"] 

word date number 
"Sympathetic" '2011-10-06' 1 
"protest" '2011-10-06' 16 
'occupycanada' '2011-10-06' 1 
'hating' '2011-10-06' 1 
'AND' '2011-10-06' 4 
'comunity' '2011-10-06' 2 
... 

我尝试用 pandas 通过下面的方式来实现:

import pandas as pd 
# Attempt to turn the nested {word: {date: count}} mapping into a DataFrame.
for d in data_dict: 
    for date in data_dict[d]: 
     # `data` is a flat list of three scalars: [word, date, count].
     data=[d,date,data_dict[d][date]] 
     # Three column names are requested for that flat list; this is the call
     # that raises the ValueError shown in the traceback below.
     # NOTE(review): presumably pandas reads the flat list as one column of
     # three values rather than one row; passing [data] (a list containing
     # one row) should give the intended 1x3 frame -- confirm.
     # A new frame is also built, and the previous one dropped, on every
     # iteration, so nothing is accumulated across words.
     dat = pd.DataFrame(data, columns = ['word','date',"number"]) 
     print dat 

但是运行这段代码时,出现了以下错误:

ValueError        Traceback (most recent call last) 
<ipython-input-56-80b3affa34fe> in <module>() 
     3  for date in data_dict[d]: 
     4   data=[d,date,data_dict[d][date]] 
----> 5   dat = pd.DataFrame(data, columns = ['word','date',"number"]) 
     6   print dat 
.... 
ValueError: Shape of passed values is (1, 3), indices imply (3, 3) 

我该如何解决呢?

关于 data_dict 的附加代码:

from collections import defaultdict 
import csv 
import re 
import sys 
def flushPrint(s):
    """Rewrite the current console line with ``s``.

    A carriage return is written first so the cursor goes back to the start
    of the line, then ``s`` formatted with ``'%s'``; the stream is flushed
    so the text appears immediately. No newline is emitted.
    """
    out = sys.stdout
    out.write('\r')
    out.write('%s' % s)
    out.flush()

# Count, for every word, how many times it occurs on each day.
# data_dict[word][date] -> number of occurrences of `word` on `date`.
# NOTE: the byte handling below (file opened in 'rb', str.replace on each
# line) assumes Python 2; the syntax itself is valid on Python 2.6+ and 3.
data_dict = defaultdict(lambda: defaultdict(int))
error_num = 0   # rows whose processing raised an exception
line_num = 0    # CSV rows seen so far
total_num = 0   # raw lines read from the file

# URLs and @mentions are stripped from the tweet text before tokenizing.
# Compiled once here instead of being rebuilt for every row.
_URL_OR_MENTION = re.compile(
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+|(?:@[\w_]+)')

chunkSize = 10000000  # size hint (in bytes) passed to readlines()

# `with` guarantees the file is closed even if reading fails part-way.
with open('D:/Data/ows/ows_sample.txt', 'rb') as bigfile:
    chunk = bigfile.readlines(chunkSize)
    while chunk:
        total_num += len(chunk)
        # NUL bytes are removed before the lines are handed to csv.reader.
        lines = csv.reader((line.replace('\x00', '') for line in chunk),
                           delimiter=',', quotechar='"')
        for i in lines:
            line_num += 1
            if line_num % 1000000 == 0:
                flushPrint(line_num)  # progress indicator
            try:
                # Column 1 is the tweet text, column 3 the day.
                i[1] = _URL_OR_MENTION.sub("", i[1])
                tweets = re.split(r"\W+", i[1])
                date = i[3]
                # Only 10-character dates are counted (presumably
                # YYYY-MM-DD, as in the sample data). The check does not
                # depend on the word, so it is done once per row.
                if len(date) == 10:
                    for word in tweets:
                        data_dict[word][date] += 1
            except Exception as e:
                # Best effort: report the bad row, count it and carry on.
                print(e)
                error_num += 1
        chunk = bigfile.readlines(chunkSize)
print('%s %s %s' % (line_num, total_num, error_num))

样本数据

['"Twitter ID",Text,"Profile Image URL",Day,Hour,Minute,"Created At",Geo,"From User","From User ID",Language,"To User","To User ID",Source\n', 

'121813144174727168,"RT @AnonKitsu: ALERT!!!!!!!!!!COPS ARE KETTLING PROTESTERS IN PARK W HELICOPTERS AND PADDYWAGONS!!!! #OCCUPYWALLSTREET #OWS #OCCUPYNY PLEASE RT !!HELP!!!!",,2011-10-06,5,4,"2011-10-06 05:04:51",N;,Anonops_Cop,401240477,en,,0,"&lt;a href=&quot;&gt;web&lt;/a&gt;"\n', 

'121813146137657344,"@jamiekilstein @allisonkilkenny Interesting interview (never aired, wonder why??) by Fox with #ows protester 2011-10-06,5,4,"2011-10-06 05:04:51",N;,KittyHybrid,34532053,en,jamiekilstein,2149053,"&lt;a href=&quot;&gt;web&lt;/a&gt;"\n', 

'121813150000619521,"@Seductivpancake Right! Those guys have a victory condition: regime change. #ows doesn\'t seem to have a goal I can figure out.",2011-10-06,5,4,"2011-10-06 05:04:52",N;,nerdsherpa,95067344,en,Seductivpancake,19695580,"&lt;a href=&quot;nofollow&quot;&gt;Echofon&lt;/a&gt;"\n', 

'121813150701072385,"RT @bembel &quot;Occupy Wall Street&quot; als linke Antwort auf die Tea Party? #OccupyWallStreet #OWS",2011-10-06,5,4,"2011-10-06 05:04:52",N;,hamudistan,35862923,en,,0,"&lt;a href=&quot;rel=&quot;nofollow&quot;&gt;Plume\xc2\xa0\xc2\xa0&lt;/a&gt;"\n', 

'121813163778899968,"#ows White shirt= Brown shirt.",2011-10-06,5,4,"2011-10-06 05:04:56",N;,kl_knox,419580636,en,,0,"&lt;a href=&quot;&gt;web&lt;/a&gt;"\n', 

'121813169999065088,"RT @TheNewDeal: The #NYPD are Out of Control. Is This a Free Country or a Middle-East Dictatorship? #OccupyWallStreet #OWS #p2",2011-10-06,5,4,"2011-10-06 05:04:57",N;,vickycrampton,32151083,en,,0,"&lt;a href=&quot;&gt;web&lt;/a&gt;"\n', 
+0

你可以发布一些代码来生成'data_dict'吗? – MaxU

+0

@MaxU 代码已经上传,请帮帮我 –

+0

我会用完全不同的方式来做——你可以发布一个 5-7 行的样本数据集吗(与 'ows_sample.txt' 的格式相同)? – MaxU

回答

1

我会这样做:

# -*- coding: utf-8 -*- 
from collections import defaultdict, Counter 
import string 
import pandas as pd 

# Translation table that maps every punctuation character, every digit and
# the line-feed / carriage-return characters to a space, so that a plain
# str.split() can separate the words afterwards.
chars2remove = list(string.punctuation + string.digits)
transl_tab = str.maketrans(dict.fromkeys(chars2remove + ['\n', '\r'], ' '))


def tokenize(s):
    """Split text into lower-cased words.

    Punctuation and digits are treated as separators (see ``transl_tab``).

    Args:
        s: the text to split. Anything that is not a ``str`` (for example
            the NaN that pandas produces for an empty CSV cell) is treated
            as empty text.

    Returns:
        A list of lower-cased words; empty when there is nothing to split.
    """
    if not isinstance(s, str):
        return []
    return s.translate(transl_tab).lower().split()

chunksize = 100
fn = r'D:\temp\.data\ows-sample.txt'

#
# read `Day` and `Text` columns from the source CSV file
#

# not-chunked alternative: pd.read_csv(fn, usecols=['Text','Day'])

# "chunked" version - the file is read `chunksize` rows at a time and the
# word counts are accumulated per day in plain Counters, so a day whose
# tweets are spread over several chunks still ends up with ONE total
# (building one reduced DF per chunk and concatenating them would leave
# duplicate (Day, word) rows holding partial counts)
day_counts = defaultdict(Counter)
for df in pd.read_csv(fn, usecols=['Text', 'Day'], chunksize=chunksize):
    for day, text in zip(df['Day'], df['Text']):
        # rows without a day cannot be attributed to any date, and an empty
        # `Text` cell is read as NaN (a float), which cannot be tokenized
        if pd.notna(day) and isinstance(text, str):
            day_counts[day].update(tokenize(text))

# one row per day, one column per word; combinations that never occurred
# are NaN at this point
tmp = (pd.DataFrame.from_dict(day_counts, orient='index')
         .sort_index()
         .rename_axis('Day')
         .reset_index()
)
tmp['Day'] = pd.to_datetime(tmp['Day'])

# free up memory
del day_counts

# transform (melt) columns into desired columns: [Day, word, number]
rslt = (pd.melt(tmp, id_vars='Day', var_name='word', value_name='number')
          .fillna(0)
)

# delete temporary DF from memory
del tmp

# save results as HDF5 file
rslt.to_hdf('d:/temp/.data/twit_words.h5', key='twit_words', mode='a',
            format='t', complib='zlib', complevel=4)

# save results as CSV file
rslt.to_csv('d:/temp/.data/twit_words.csv.gz', index=False,
            encoding='utf_8', compression='gzip')

使用这份样本数据进行测试:

In [254]: pd.melt(new, id_vars='Day', var_name='word', value_name='number').fillna(0) 
Out[254]: 
       Day   word number 
0  2011-11-13    a  4.0 
1  2011-11-14    a  9.0 
2  2011-11-15    a 92.0 
3  2011-11-16    a 111.0 
4  2011-11-17    a 93.0 
5  2011-11-18    a 141.0 
6  2011-11-19    a 77.0 
7  2011-11-20    a 58.0 
8  2011-11-21    a 29.0 
9  2011-11-22    a 70.0 
10  2011-11-23    a 55.0 
11  2011-11-24    a 49.0 
12  2011-11-25    a 41.0 
13  2011-11-26    a 67.0 
14  2011-11-27    a 27.0 
15  2011-11-28    a 34.0 
16  2011-11-29    a 23.0 
17  2011-11-30    a 33.0 
18  2011-12-01    a 26.0 
19  2011-12-02    a 32.0 
20  2011-12-03    a 46.0 
21  2011-12-04    a 29.0 
22  2011-12-05    a 22.0 
23  2011-12-06    a 60.0 
24  2011-12-07    a 32.0 
25  2011-12-08    a 33.0 
26  2011-12-09    a 16.0 
27  2011-11-13    aa  0.0 
28  2011-11-14    aa  0.0 
29  2011-11-15    aa  0.0 
...   ...    ...  ... 
+0

@qiangqin,有帮助吗? – MaxU

+0

是的,它的工作,非常感谢你 –

+0

如果你还能解释一下为什么这种方法可行,而 @qiangqin 最初的尝试却不行,那将非常有帮助。

1

您可以在原代码中添加几行,这样就可以直接使用您的字典,看起来非常简单:

# Flatten the nested {word: {date: count}} mapping into one row per
# (word, date) pair. Every date of a word gets its own row, so words that
# occur on several days are not reduced to their first date, and no dict
# view is indexed (which fails on Python 3).
rows = [(word, date, number)
        for word, per_date in data_dict.items()
        for date, number in per_date.items()]
df = pd.DataFrame(rows, columns=['word', 'date', 'number'])

完整的程序如下:

from collections import defaultdict 
import csv 
import re 
import sys 
import pandas as pd 

def flushPrint(s):
    """Rewrite the current console line with ``s``.

    A carriage return is written first so the cursor goes back to the start
    of the line, then ``s`` formatted with ``'%s'``; the stream is flushed
    so the text appears immediately. No newline is emitted.
    """
    out = sys.stdout
    out.write('\r')
    out.write('%s' % s)
    out.flush()


# Count, for every word, how many times it occurs on each day.
# data_dict[word][date] -> number of occurrences of `word` on `date`.
# NOTE: the byte handling below (file opened in 'rb', str.replace on each
# line) assumes Python 2; the syntax itself is valid on Python 2.6+ and 3.
data_dict = defaultdict(lambda: defaultdict(int))
error_num = 0   # rows whose processing raised an exception
line_num = 0    # CSV rows seen so far
total_num = 0   # raw lines read from the file

# URLs and @mentions are stripped from the tweet text before tokenizing.
# Compiled once here instead of being rebuilt for every row.
_URL_OR_MENTION = re.compile(
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+|(?:@[\w_]+)')

chunkSize = 10000000  # size hint (in bytes) passed to readlines()

# `with` guarantees the file is closed even if reading fails part-way.
with open('data.txt', 'rb') as bigfile:
    chunk = bigfile.readlines(chunkSize)
    while chunk:
        total_num += len(chunk)
        # NUL bytes are removed before the lines are handed to csv.reader.
        lines = csv.reader((line.replace('\x00', '') for line in chunk),
                           delimiter=',', quotechar='"')
        for i in lines:
            line_num += 1
            if line_num % 1000000 == 0:
                flushPrint(line_num)  # progress indicator
            try:
                # Column 1 is the tweet text, column 3 the day.
                i[1] = _URL_OR_MENTION.sub("", i[1])
                tweets = re.split(r"\W+", i[1])
                date = i[3]
                # Only 10-character dates are counted (presumably
                # YYYY-MM-DD, as in the sample data). The check does not
                # depend on the word, so it is done once per row.
                if len(date) == 10:
                    for word in tweets:
                        data_dict[word][date] += 1
            except Exception as e:
                # Best effort: report the bad row, count it and carry on.
                print(e)
                error_num += 1
        chunk = bigfile.readlines(chunkSize)
print('%s %s %s' % (line_num, total_num, error_num))


# Flatten the nested {word: {date: count}} mapping into one row per
# (word, date) pair. Every date of a word gets its own row, so words that
# occur on several days are not reduced to their first date, and no dict
# view is indexed (which fails on Python 3).
rows = [(word, date, number)
        for word, per_date in data_dict.items()
        for date, number in per_date.items()]
df = pd.DataFrame(rows, columns=['word', 'date', 'number'])

# Parenthesized single-argument form prints the same on Python 2 and 3.
print(df)

结果:

   word  date number 
0     RT 2011-10-06  2 
1  HELICOPTERS 2011-10-06  1 
2    HELP 2011-10-06  1 
3      2011-10-06  1 
4   KETTLING 2011-10-06  1 
5    OWS 2011-10-06  1 
6 OCCUPYWALLSTREET 2011-10-06  1 
7    PARK 2011-10-06  1 
8   PROTESTERS 2011-10-06  1 
9    ALERT 2011-10-06  1 
10   OCCUPYNY 2011-10-06  1 
11    COPS 2011-10-06  1 
12    ARE 2011-10-06  1 
13     W 2011-10-06  1 
14    IN 2011-10-06  1 
15   PLEASE 2011-10-06  1 
16  PADDYWAGONS 2011-10-06  1 
17    AND 2011-10-06  1 
相关问题