2014-10-30 104 views
0

我有以下数据帧,想按分组对其取子集:

  V1 subst_string 
1  NM_000171  L374R 
2  NM_000171  W421P 
3  NM_000171  T358A 
4  NM_000171  T358A 
5  NM_000171  T358A 
6  NM_000171  T358A 
7  NM_000171  S268R 
8  NM_000171   N35P 
9  NM_000171  T435F 
10 NM_000171  T435F 
11 NM_000171  T435F 
12 NM_000171  T435F 
13 NM_000171  L368E 
14 NM_000171  L368G 
15 NM_000171  L374R 
16 NM_000171  W421L 
17 NM_000171  W421P 
18 NM_000171  W421L 
19 NM_000171  R371R 
20 NM_000171  R371R 
21 NM_000171  S268R 
22 NM_000171   N35R 
23 NM_000171   N35P 
24 NM_000171   N35R 
25 NM_000171  F271L 
26 NM_000171  F271L 
27 NM_000171  F271L 
28 NM_000171  L368E 
29 NM_000171  L374R 
30 NM_000171  L374R 
31 NM_000171  M157N 
32 NM_000171  M157N 
33 NM_000171  M157N 
34 NM_000171  R371R 
35 NM_000171  S268R 
36 NM_000171  S268R 
37 NM_000171  Y201P 
38 NM_000171  Y201P 
39 NM_000171  Y201P 
40 NM_000171  Y201P 
41 NM_000171  F271L 
42 NM_000171  L368G 
43 NM_000171  Y397S 
44 NM_000171  Y397G 
45 NM_000171  Y397S 
46 NM_000171  Y397G 
47 NM_000171  M157N 
48 NM_000171  R371R 
49 NM_001146040  F271L 
50 NM_001146040  L368E 
51 NM_001146040  L374R 
52 NM_001146040  E429P 
53 NM_001146040  T358A 
54 NM_001146040  T358A 
55 NM_001146040  M157N 
56 NM_001146040  R371R 
57 NM_001146040  S268R 
58 NM_001146040   N35P 
59 NM_001146040  I443F 
60 NM_001146040  I443F 
61 NM_001146040  Y201P 
62 NM_001146040  Y201P 
63 NM_001146040  F271L 
64 NM_001146040  L368G 
65 NM_001146040  L374R 
66 NM_001146040  E429L 
67 NM_001146040  L405S 
68 NM_001146040  L405G 
69 NM_001146040  M157N 
70 NM_001146040  R371R 
71 NM_001146040  S268R 
72 NM_001146040   N35R 
73 NM_001292000  NANANA 
74 XM_005268412  NANANA 

这个数据帧相当简单——第一列包含4个不同的值,我想创建4个数据子集,即按这些不同的值对数据帧进行拆分。例如:

  V1 subst_string 
1  NM_000171  L374R 
2  NM_000171  W421P 
3  NM_000171  T358A 
4  NM_000171  T358A 
5  NM_000171  T358A 
6  NM_000171  T358A 
7  NM_000171  S268R 
8  NM_000171   N35P 
9  NM_000171  T435F 
10 NM_000171  T435F 
11 NM_000171  T435F 
12 NM_000171  T435F 
13 NM_000171  L368E 
14 NM_000171  L368G 
15 NM_000171  L374R 
16 NM_000171  W421L 
17 NM_000171  W421P 
18 NM_000171  W421L 
19 NM_000171  R371R 
20 NM_000171  R371R 
21 NM_000171  S268R 
22 NM_000171   N35R 
23 NM_000171   N35P 
24 NM_000171   N35R 
25 NM_000171  F271L 
26 NM_000171  F271L 
27 NM_000171  F271L 
28 NM_000171  L368E 
29 NM_000171  L374R 
30 NM_000171  L374R 
31 NM_000171  M157N 
32 NM_000171  M157N 
33 NM_000171  M157N 
34 NM_000171  R371R 
35 NM_000171  S268R 
36 NM_000171  S268R 
37 NM_000171  Y201P 
38 NM_000171  Y201P 
39 NM_000171  Y201P 
40 NM_000171  Y201P 
41 NM_000171  F271L 
42 NM_000171  L368G 
43 NM_000171  Y397S 
44 NM_000171  Y397G 
45 NM_000171  Y397S 
46 NM_000171  Y397G 
47 NM_000171  M157N 
48 NM_000171  R371R 

就是其中一个子集。我想我可以手动(manually)完成,但我预计之后的数据集中 V1 会有更多不同的取值。有谁知道如何根据 V1 的不同值自动生成该数据框的子集?

谢谢

回答

4

只需使用 `split` 创建一个列表(list):

split(df, df$V1) 

如果您需要在全局环境(global environment)中得到这4个数据集:

list2env(split(df, df$V1), envir=.GlobalEnv) 

head(NM_000171,2) 
#  V1 subst_string 
#1 NM_000171  L374R 
#2 NM_000171  W421P 
+0

太简单了,哈哈。谢谢。我之前一直在尝试用循环来做。 – brucezepplin 2014-10-30 14:05:13

+0

是的,第二部分真的很有帮助 - 我已经使用过了。 – brucezepplin 2014-10-31 14:30:43

3

如果你已经知道拆分之后要做什么,那么有很多方法可以直接实现,不一定需要使用 `split`。例如,求各子组的长度:

with(ddf, tapply(subst_string, V1, length)) 
    NM_000171 NM_001146040 NM_001292000 XM_005268412 
      48   24   1   1 

aggregate(subst_string~V1, data=ddf, length) 
      V1 subst_string 
1 NM_000171   48 
2 NM_001146040   24 
3 NM_001292000   1 
4 XM_005268412   1 

library(data.table) 
ddt = data.table(ddf) 
ddt[,list(len =.N),by=V1] 
      V1 len 
1: NM_000171 48 
2: NM_001146040 24 
3: NM_001292000 1 
4: XM_005268412 1 
+0

+1,这是正确的建议。在大多数情况下,没有必要把数据拆分后单独保存。 – Arun 2014-11-03 23:21:32