您可以先使用 MultiLabelBinarizer,然后使用 join:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
print (pd.DataFrame(mlb.fit_transform(df['services']),columns=mlb.classes_, index=df.index))
A B C
0 1 0 0
1 1 1 1
2 0 1 1
df1 = pd.DataFrame(mlb.fit_transform(df['services']),columns=mlb.classes_, index=df.index)
df = df.join(df1)
print (df)
uid services A B C
0 000c80b7d2b3643689b1e516918ec193 [A] 1 0 0
1 001b292c588ec6cc11f57324d40e422d [B, A, C] 1 1 1
2 006696f65899fdd87ba4894c784716f9 [C, B] 0 1 1
纯 pandas 的替代方案:先使用 get_dummies,再按列 groupby 并用 max 聚合:
df1 = pd.get_dummies(pd.DataFrame(df['services'].values.tolist()), prefix='', prefix_sep='')
.groupby(axis=1, level=0).max()
print (df1)
A B C
0 1 0 0
1 1 1 1
2 0 1 1
df = df.join(df1)
print (df)
uid services A B C
0 000c80b7d2b3643689b1e516918ec193 [A] 1 0 0
1 001b292c588ec6cc11f57324d40e422d [B, A, C] 1 1 1
2 006696f65899fdd87ba4894c784716f9 [C, B] 0 1 1
耗时对比:
#3k rows
df = pd.concat([df]*1000).reset_index(drop=True)
#John Galt solution
In [255]: %timeit (df.join(df.services.apply(lambda x: pd.Series({y:1 for y in x})).fillna(0).astype(int)))
1 loop, best of 3: 658 ms per loop
#user1717828 solution
In [256]: %timeit (df.join(df['services'].apply(lambda x: "|".join(x)).str.get_dummies()))
100 loops, best of 3: 16.8 ms per loop
#Jez solution1
In [257]: %timeit (df.join(pd.DataFrame(mlb.fit_transform(df['services']),columns=mlb.classes_, index=df.index)))
100 loops, best of 3: 4.66 ms per loop
#Jez solution2
In [258]: %timeit (df.join(pd.get_dummies(pd.DataFrame(df['services'].values.tolist()), prefix='', prefix_sep='').groupby(axis=1, level=0).max()))
100 loops, best of 3: 7.04 ms per loop
#30k rows
df = pd.concat([df]*10000).reset_index(drop=True)
#John Galt solution
In [260]: %timeit (df.join(df.services.apply(lambda x: pd.Series({y:1 for y in x})).fillna(0).astype(int)))
1 loop, best of 3: 6.68 s per loop
#user1717828 solution
In [261]: %timeit (df.join(df['services'].apply(lambda x: "|".join(x)).str.get_dummies()))
10 loops, best of 3: 138 ms per loop
#Jez solution1
In [262]: %timeit (df.join(pd.DataFrame(mlb.fit_transform(df['services']),columns=mlb.classes_, index=df.index)))
10 loops, best of 3: 39.8 ms per loop
#Jez solution2
In [263]: %timeit (df.join(pd.get_dummies(pd.DataFrame(df['services'].values.tolist()), prefix='', prefix_sep='').groupby(axis=1, level=0).max()))
10 loops, best of 3: 20.6 ms per loop
您是否看过这个帖子:https://stackoverflow.com/questions/37474001/converting-list-in-panda-dataframe-into-columns –
[将 pandas 数据框中的列表转换为列](https://stackoverflow.com/questions/37474001/converting-list-in-panda-dataframe-into-columns) –