2015-01-15 80 views
1

我无法从数据集中删除选定的数据。我有一个示例数据集(Prod),还有另一个由选定行组成的表(toremove),我想把这些行从原始数据集中删除。(标题:从数据集中删除表格)

我尝试使用 setdiff,虽然行数确实减少了(从环境面板中的变量可以看出),但被删除的并不是选定的数据。

Prod1 <- Prod[setdiff(rownames(Prod), rownames(toremove)), ]

用 dput 导出的整个数据集示例:

# Example data set: 12 rows with country/item/element codes, their labels
# (stored as factors) and the yearly values Y1961-Y1964.
# Built inside local() so the helper vectors do not leak into the workspace.
Prod <- local({
  country_levels <- c(
    "Americas + (Total)", "Asia + (Total)",
    "Europe + (Total)", "World + (Total)"
  )
  item_levels <- c(
    "Cereals (Rice Milled Eqv) + (Total)",
    "Cereals,Total + (Total)",
    "Coarse Grain, Total + (Total)",
    "Potatoes",
    "Sugar cane",
    "Vegetables Primary + (Total)",
    "Vegetables&Melons, Total + (Total)"
  )
  element_levels <- c("Area harvested", "Production")
  unit_levels <- c("Ha", "tonnes")

  # Integer positions into the level vectors above, one per row.
  country_idx <- c(4L, 2L, 2L, 4L, 3L, 2L, 3L, 1L, 1L, 1L, 4L, 4L)
  item_idx <- c(3L, 2L, 1L, 4L, 2L, 1L, 1L, 5L, 2L, 1L, 6L, 7L)
  # Rows 1-3 carry the first element/unit level, rows 4-12 the second.
  run_lengths <- c(3L, 9L)

  data.frame(
    CountryCode = c(
      5000L, 5300L, 5300L, 5000L, 5400L, 5300L,
      5400L, 5200L, 5200L, 5200L, 5000L, 5000L
    ),
    Country = factor(country_levels[country_idx], levels = country_levels),
    ItemCode = c(
      1814L, 1717L, 1817L, 116L, 1717L, 1817L,
      1817L, 156L, 1717L, 1817L, 1735L, 1800L
    ),
    Item = factor(item_levels[item_idx], levels = item_levels),
    ElementGroup = rep(c(31L, 51L), times = run_lengths),
    ElementCode = rep(c(5312L, 5510L), times = run_lengths),
    Element = factor(
      rep(element_levels, times = run_lengths),
      levels = element_levels
    ),
    Unit = factor(rep(unit_levels, times = run_lengths), levels = unit_levels),
    Y1961 = c(
      3.29e8, 2.72e8, 2.72e8, 2.71e8, 2.64e8, 2.63e8,
      2.63e8, 2.36e8, 2.28e8, 2.24e8, 2.23e8, 2.23e8
    ),
    Y1962 = c(
      3.27e8, 2.76e8, 2.76e8, 2.53e8, 2.81e8, 2.78e8,
      2.81e8, 2.22e8, 2.4e8, 2.36e8, 2.23e8, 2.23e8
    ),
    Y1963 = c(
      3.33e8, 2.76e8, 2.76e8, 2.7e8, 2.5e8, 2.95e8,
      2.49e8, 2.26e8, 2.62e8, 2.58e8, 2.23e8, 2.23e8
    ),
    Y1964 = c(
      3.29e8, 2.82e8, 2.82e8, 2.85e8, 2.96e8, 3.1e8,
      2.96e8, 2.43e8, 2.49e8, 2.45e8, 2.26e8, 2.26e8
    ),
    stringsAsFactors = FALSE
  )
})

要删除的选定数据:

# Rows selected for removal: 5 rows with the same 12 columns as the example
# data set. Factor columns carry only the levels that occur in these rows.
# Built inside local() so the helper vectors do not leak into the workspace.
toremove <- local({
  country_levels <- c(
    "Americas + (Total)", "Asia + (Total)",
    "Europe + (Total)", "World + (Total)"
  )
  item_levels <- c(
    "Cereals (Rice Milled Eqv) + (Total)",
    "Cereals,Total + (Total)",
    "Potatoes"
  )
  n_rows <- 5L

  data.frame(
    CountryCode = c(5000L, 5400L, 5300L, 5400L, 5200L),
    Country = factor(
      country_levels[c(4L, 3L, 2L, 3L, 1L)],
      levels = country_levels
    ),
    ItemCode = c(116L, 1717L, 1817L, 1817L, 1717L),
    Item = factor(item_levels[c(3L, 2L, 1L, 1L, 2L)], levels = item_levels),
    # Every selected row shares the same element group, code, label and unit.
    ElementGroup = rep(51L, n_rows),
    ElementCode = rep(5510L, n_rows),
    Element = factor(rep("Production", n_rows), levels = "Production"),
    Unit = factor(rep("tonnes", n_rows), levels = "tonnes"),
    Y1961 = c(2.71e8, 2.64e8, 2.63e8, 2.63e8, 2.28e8),
    Y1962 = c(2.53e8, 2.81e8, 2.78e8, 2.81e8, 2.4e8),
    Y1963 = c(2.7e8, 2.5e8, 2.95e8, 2.49e8, 2.62e8),
    Y1964 = c(2.85e8, 2.96e8, 3.1e8, 2.96e8, 2.49e8),
    stringsAsFactors = FALSE
  )
})
+1

`Prod[!(rownames(Prod) %in% rownames(toremove)), ]` 也许可以?不过我不明白,既然你需要的只是行名,为什么还要创建一个全新的数据集 –

+0

在这个例子中,我创建新的 Prod1 只是出于良好习惯,以便确认它确实有效。在最终的脚本中我不需要新的数据集。 – sammyramz

+0

无论哪种方式,我都不建议使用行名,因为它们在第一次取子集之后往往就会变乱。你最好使用其他更可靠的索引。 –

回答

1
# Answer #1 ---------------------------------------------------------------
# Keep the rows of Prod whose *contents* do not appear in toremove.
#
# Row names are deliberately not used: a table such as toremove that has been
# re-created or re-indexed carries automatic row names 1..n, so matching on
# row names would drop the first n rows of Prod instead of the selected ones.
AnswerinComments <- local({
  # Collapse each row into a single string key. as.character() makes factor
  # columns compare by label, so differing level sets do not matter; unname()
  # prevents a column called "sep"/"collapse" from being taken as an argument
  # of paste(). "\x1f" (unit separator) is unlikely to occur in the data.
  row_key <- function(df) {
    do.call(paste, c(unname(lapply(df, as.character)), sep = "\x1f"))
  }
  # Align toremove's columns with Prod's column order before building keys;
  # this errors if toremove lacks one of Prod's columns.
  listed <- row_key(Prod) %in% row_key(toremove[names(Prod)])
  Prod[!listed, , drop = FALSE]
})

也找到了:Delete rows that exist in another data frame?

# Answer #2 ---------------------------------------------------------------
# SQL anti-join: select the rows of Prod that have no counterpart in toremove.
#
# Written as SELECT ... WHERE NOT EXISTS rather than "DELETE a FROM ... JOIN":
# that multi-table DELETE form is not part of SQLite's DELETE syntax (SQLite
# being sqldf's default backend), and a DELETE would not hand back the
# remaining rows in any case.
#
# ItemCode is part of the match because CountryCode + ElementCode alone also
# hits rows that are not listed in toremove (e.g. 5200 / 5510 with item 156).
# NOTE(review): assumes CountryCode + ItemCode + ElementCode identify a row;
# add further columns to the WHERE clause if that does not hold.
library(sqldf)  # library() fails loudly if the package is missing
AnotherWay <- sqldf("
  SELECT a.*
  FROM Prod a
  WHERE NOT EXISTS (
    SELECT 1
    FROM toremove b
    WHERE a.CountryCode = b.CountryCode
      AND a.ItemCode    = b.ItemCode
      AND a.ElementCode = b.ElementCode
  )
")

# Answer #3 ---------------------------------------------------------------
# Stack both tables and keep the rows of Prod that have no identical twin
# anywhere in the stack.
#
# Only the first nrow(Prod) rows of the stack are eligible for the result, so
# a row that exists solely in toremove can never leak into it.
# Caveat: a row repeated inside Prod itself also counts as having a twin and
# is dropped; use a key-based match if Prod may contain repeated rows.
stacked <- rbind(Prod, toremove)  # not named `all`: that masks base::all
has_twin <- duplicated(stacked, fromLast = FALSE) |
  duplicated(stacked, fromLast = TRUE)
from_prod <- seq_len(nrow(stacked)) <= nrow(Prod)
YetAnother <- stacked[from_prod & !has_twin, , drop = FALSE]
0

流行的 dplyr 包也提供了 setdiff 函数。不过它要求两个数据框的结构相同——就你的情况而言,即因子水平必须相同:

## Set difference of the two data frames via dplyr.
## If the factor columns of the two tables do not share the same levels,
## convert them to character vectors first, for example:
# is_fac <- vapply(Prod, is.factor, logical(1))
# Prod[is_fac] <- lapply(Prod[is_fac], as.character)
# toremove[is_fac] <- lapply(toremove[is_fac], as.character)
library("dplyr")
dplyr::setdiff(Prod, toremove)