-1
TfidfVectorizer 的输出是一个稀疏矩阵,可以很容易地将其转换为 SparseDataFrame(而不是常规的 DataFrame)。但我不知道如何再向其中添加一列(或多列),并把结果保存为 csv 文件。怎样才能给稀疏矩阵添加一列,并将其保存到文本文件中?
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
def get_features(data, train=False, vectorizer=None):
    """Return the TF-IDF features of ``data["text"]`` as a sparse frame.

    Args:
        data: Object indexable by ``"text"`` (e.g. a DataFrame) whose
            ``"text"`` entry holds the documents to vectorize.
        train: When true, fit the vectorizer on the documents before
            transforming them; otherwise only transform them.
        vectorizer: Optional vectorizer to use instead of a fresh
            ``TfidfVectorizer``. Pass the same instance to the training
            call and to later non-training calls so the vocabulary
            learned during training is reused.

    Returns:
        A pandas frame with one row per document and one column per
        feature name, backed by sparse storage.

    Note:
        With ``train=False`` and no ``vectorizer`` a brand-new, unfitted
        ``TfidfVectorizer`` is transformed, which scikit-learn is expected
        to reject with its not-fitted error (same as before this
        parameter existed).
    """
    tfv = TfidfVectorizer() if vectorizer is None else vectorizer
    if train:
        features = tfv.fit_transform(data["text"])
    else:
        features = tfv.transform(data["text"])

    # Newer scikit-learn only offers get_feature_names_out(); older
    # releases only offer get_feature_names().
    names_getter = getattr(tfv, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = tfv.get_feature_names
    names = names_getter()

    # Build the frame straight from the scipy matrix instead of turning
    # every row into a dense array in a Python loop.
    if hasattr(pd.DataFrame, "sparse"):
        features_pd = pd.DataFrame.sparse.from_spmatrix(features, columns=names)
    else:
        # Older pandas without the ``.sparse`` accessor.
        features_pd = pd.SparseDataFrame(
            features, columns=names, default_fill_value=0)

    # NOTE: building a plain pd.DataFrame from a list of per-row pd.Series
    # here was reported to give an empty (commas only) CSV, and assigning
    # pd.SparseSeries(data["class"]) as a '_class_' column raised
    # "TypeError: ufunc 'isnan' not supported for the input types ...".

    # head(1) shows the first row like iloc[[0]] but tolerates an empty frame.
    print("F: %s" % (features_pd.head(1),))
    return features_pd
if __name__ == '__main__':
    # The training file has no header row: name its two columns explicitly
    # and replace missing cells with empty strings before vectorizing.
    column_names = ["class", "text"]
    raw = pd.read_csv('train.csv', header=None, names=column_names)
    train = raw.fillna("")

    # Fit on the training text and write the feature table without the index.
    features = get_features(train, train=True)
    features.to_csv('out.csv', index=False)