2015-10-15 23 views
3

在这种情况下,我们的大数据集如下所示(问题:使用另一个数据集中两列的字符串对大数据集取子集):

# Example data set used throughout: 32 rows with two character key columns
# (Car, Name) followed by nine numeric measurement columns.
# It is assigned to `df` because every later snippet refers to it by that
# name. stringsAsFactors = FALSE keeps Car and Name as character vectors on
# R versions older than 4.0 as well.
df <- data.frame(
  Car = c(
    "Mazda RX4", "Maserati Bora", "Leticia",
    "Hornet 4 Drive", "Hornet Sportabout", "Alex", "Duster 360",
    "Merc 240D", "Merc 230", "Merc 280", "Merc 280C", "Merc 450SE",
    "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", "Lincoln Continental",
    "Chrysler Imperial", "Fiat 128", "Honda Civic", "Toyota Corolla",
    "Toyota Corona", "Datsun 710", "AMC Javelin", "Camaro Z28",
    "Datsun 710", "Fiat X1-9", "Mazda RX4", "Lotus Europa",
    "Ford Pantera L", "Ferrari Dino", "Mazda RX4 Wag", "Volvo 142E"
  ),
  Name = c(
    "Mark", "Random", "Datsun 710", "Trevor", "Joanna",
    "Valiant", "Random", "Random", "Random", "Random", "Random",
    "Random", "Random", "Random", "Random", "Random", "Random", "Random",
    "Random", "Trevor", "Random", "Random", "Random", "Random", "Random",
    "Random", "Mazda RX4", "Random", "Alex", "Random", "John", "Random"
  ),
  disp = c(
    160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
    167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
    71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145, 301,
    121
  ),
  hp = c(
    110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123,
    180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150, 150, 245,
    175, 66, 91, 113, 264, 175, 335, 109
  ),
  drat = c(
    3.9, 3.9, 3.85,
    3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 3.07, 3.07, 3.07,
    2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 3.15, 3.73, 3.08,
    4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
  ),
  wt = c(
    2.62, 2.875,
    2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 3.15, 3.44, 3.44, 4.07,
    3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 1.615, 1.835, 2.465, 3.52,
    3.435, 3.84, 3.845, 1.935, 2.14, 1.513, 3.17, 2.77, 3.57, 2.78
  ),
  qsec = c(
    16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84,
    20, 22.9, 18.3, 18.9, 17.4, 17.6, 18, 17.98, 17.82, 17.42, 19.47,
    18.52, 19.9, 20.01, 16.87, 17.3, 15.41, 17.05, 18.9, 16.7, 16.9,
    14.5, 15.5, 14.6, 18.6
  ),
  vs = c(
    0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
    1
  ),
  am = c(
    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1
  ),
  gear = c(
    4, 4,
    4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
    3, 3, 4, 5, 5, 5, 5, 5, 4
  ),
  carb = c(
    4, 4, 1, 1, 2, 1, 4, 2,
    2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1, 2, 2, 4,
    6, 8, 2
  ),
  stringsAsFactors = FALSE
)

我想通过提取其中的某些行来对这个数据集取子集。要提取的行存储在另一个数据框中:

# Lookup table of the (Car, Name) pairs whose rows should be extracted from
# the big data set. Written as an assignment (rather than a console
# transcript of dput(list_save)) so the example can be run as-is.
# stringsAsFactors = FALSE keeps both columns as character vectors on
# R versions older than 4.0 as well.
list_save <- data.frame(
  Car = c(
    "Mazda RX4", "Mazda RX4 Wag", "Datsun 710",
    "Hornet 4 Drive", "Hornet Sportabout", "Valiant"
  ),
  Name = c(
    "Mark", "John", "Leticia", "Trevor", "Joanna", "Alex"
  ),
  stringsAsFactors = FALSE
)

请注意 list_save:其中某些字符串在 df 中可能出现在另一列(即 Car 与 Name 互换),但这些行同样必须被提取出来。

所需的输出应该是这样的:

   Car  Name disp hp drat wt qsec vs am gear carb 
1   Mazda RX4  Mark 160 110 3.90 2.620 16.46 0 1 4 4 
2  Mazda RX4 Wag  John 301 335 3.54 3.570 14.60 0 1 5 8 
3   Leticia Datsun 710 108 93 3.85 2.320 18.61 1 1 4 1 
4 Hornet 4 Drive  Trevor 258 110 3.08 3.215 19.44 1 0 3 1 
5 Hornet Sportabout  Joanna 360 175 3.15 3.440 17.02 0 0 3 2 
6    Alex Valiant 225 105 2.76 3.460 20.22 1 0 3 1 

我想找一个类似下面这样的函数:

# Attempt from the question: index df by testing its first two columns
# against list_save.
# NOTE(review): the question presents this as the kind of call that is wanted,
# not as a working solution; `%in%` applied to data frames may not compare
# (Car, Name) pairs row by row -- verify before reusing.
test <- df[df[,1:2] %in% list_save, ] 

回答

2

我会使用 data.table 运行两次二分连接(binary join):一次让 Car 和 Name 各自与同名列匹配,一次让它们彼此交叉匹配,然后把两次的结果合并。这里需要使用 CRAN 上的最新版本(v 1.9.6+)。

# Extract the rows of df listed in list_save, in list_save order, allowing
# the Car and Name values to appear in swapped columns.
library(data.table) # v 1.9.6+ 
# First join: match Car to Car and Name to Name. setDT() turns df into a
# data.table in place. No `nomatch` is given, so list_save rows without a
# match are presumably kept with NA in the non-key columns (the last step
# relies on that via is.na(disp)).
res <- setDT(df)[list_save, on = c("Car", "Name")] 
# Second join: match df$Name against list_save$Car and df$Car against
# list_save$Name; nomatch = 0L drops list_save rows that have no match.
res2 <- df[list_save, on = c(Name = "Car", Car = "Name"), nomatch = 0L] 
# Overwrite the unmatched (NA) rows of res with the swapped-column matches.
# NOTE(review): this assignment is positional -- it assumes res2 has exactly
# one row for each NA row of res, in the same order; confirm on real data.
res[is.na(disp), (names(res)) := res2] 
#     Car  Name disp hp drat wt qsec vs am gear carb 
# 1:   Mazda RX4  Mark 160 110 3.90 2.620 16.46 0 1 4 4 
# 2:  Mazda RX4 Wag  John 301 335 3.54 3.570 14.60 0 1 5 8 
# 3:   Leticia Datsun 710 108 93 3.85 2.320 18.61 1 1 4 1 
# 4: Hornet 4 Drive  Trevor 258 110 3.08 3.215 19.44 1 0 3 1 
# 5: Hornet Sportabout  Joanna 360 175 3.15 3.440 17.02 0 0 3 2 
# 6:    Alex Valiant 225 105 2.76 3.460 20.22 1 0 3 1 

另外,一个更安全的方法是只对匹配到的结果做 rbind,但这样一来就会失去原有的行顺序。

# Alternative: keep only the matched rows of each join and stack them.
# First join: Car to Car and Name to Name; nomatch = 0L drops list_save rows
# that have no match. setDT() turns df into a data.table in place.
res <- setDT(df)[list_save, on = c("Car", "Name"), nomatch = 0L] 
# Second join: the same lookup with the two key columns swapped.
res2 <- df[list_save, on = c(Name = "Car", Car = "Name"), nomatch = 0L] 
# Stack both sets of matches; the straight matches come first, so the result
# is no longer in list_save order.
rbind(res, res2) 
#     Car  Name disp hp drat wt qsec vs am gear carb 
# 1:   Mazda RX4  Mark 160 110 3.90 2.620 16.46 0 1 4 4 
# 2:  Mazda RX4 Wag  John 301 335 3.54 3.570 14.60 0 1 5 8 
# 3: Hornet 4 Drive  Trevor 258 110 3.08 3.215 19.44 1 0 3 1 
# 4: Hornet Sportabout  Joanna 360 175 3.15 3.440 17.02 0 0 3 2 
# 5:   Leticia Datsun 710 108 93 3.85 2.320 18.61 1 1 4 1 
# 6:    Alex Valiant 225 105 2.76 3.460 20.22 1 0 3 1 
1
# Keep the rows of df whose (Car, Name) pair appears as a row of list_save.
# Testing each column with its own %in% would accept any listed Car combined
# with any listed Name, so both columns are pasted into a single key per row
# and the keys are compared instead.
# "\r" is used as the separator on the assumption that it never occurs in
# the data. Rows are returned in df order.
sub_df <- df[
  paste(df[[1]], df[[2]], sep = "\r") %in%
    paste(list_save[[1]], list_save[[2]], sep = "\r"),
]

不过,你是有意让 Alex 出现在 Car 列、Valiant 出现在 Name 列吗?我这样问是因为上面的代码假设这些是录入错误。如果不是这种情况,请使用:

EDITED

# Keep the rows of df whose (Car, Name) pair matches a row of list_save in
# either orientation (Car/Name as given, or with the two values swapped).
# Testing each column with its own %in% would accept any mix of listed
# strings, so each row is reduced to a single pasted key and the keys are
# compared instead.
# "\r" is used as the separator on the assumption that it never occurs in
# the data. Rows are returned in df order.
sub_df <- df[
  paste(df[[1]], df[[2]], sep = "\r") %in% c(
    paste(list_save[[1]], list_save[[2]], sep = "\r"),
    paste(list_save[[2]], list_save[[1]], sep = "\r")
  ),
]
+0

我认为代码有问题。如果我将它用于原始数据,它将提取包含两列中任何字符串的每一行。 –

+0

看不到您的原始数据,很难判断;这段代码对示例数据是有效的。 – amwill04

+0

现在请再试一次。我已编辑了主数据集,它与原始数据相似。 –