2013-01-23 96 views
-3
# Output of dput(x): an 18-row table with the columns Date, Continent, Score,
# Country, mean, sd and outlier1, carrying the classes "data.table" and
# "data.frame".
# NOTE: the trailing `.internal.selfref = <pointer: ...>` argument is not
# parseable R; delete that argument before evaluating this structure() call
# to recreate `x`.
dput(x) 
structure(list(Date = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 
3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("1/1/2012", 
"2/1/2012", "3/1/2012", "4/1/2012", "5/1/2012", "6/1/2012"), class = "factor"), 
    Continent = structure(c(3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
    3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L), .Label = c("Asia", "Europe", 
    "South America"), class = "factor"), Score = c(10L, 4L, 9L, 
    1L, 9L, 3L, 10L, 0L, 0L, 10L, 4L, 9L, 10L, 4L, 9L, 0L, 0L, 
    5L), Country = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
    3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("Brasil", 
    "China", "Germany"), class = "factor"), mean = c(6.83333333333333, 
    3.5, 5.83333333333333, 6.83333333333333, 3.5, 5.83333333333333, 
    6.83333333333333, 3.5, 5.83333333333333, 6.83333333333333, 
    3.5, 5.83333333333333, 6.83333333333333, 3.5, 5.83333333333333, 
    6.83333333333333, 3.5, 5.83333333333333), sd = c(4.91596040125088, 
    3.33166624979154, 3.81663027639129, 4.91596040125088, 3.33166624979154, 
    3.81663027639129, 4.91596040125088, 3.33166624979154, 3.81663027639129, 
    4.91596040125088, 3.33166624979154, 3.81663027639129, 4.91596040125088, 
    3.33166624979154, 3.81663027639129, 4.91596040125088, 3.33166624979154, 
    3.81663027639129), outlier1 = c(FALSE, FALSE, FALSE, TRUE, 
    TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, 
    FALSE, FALSE, TRUE, TRUE, FALSE)), .Names = c("Date", "Continent", 
"Score", "Country", "mean", "sd", "outlier1"), row.names = c(NA, 
-18L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000000005e70788>) 

我已经计算出每个国家的平均值(mean)、标准差(sd)和 outlier1。我想为每个国家计算一个 outlier_score 排名。有人可以指出如何在此数据集上计算离群值分数吗?(标题:在 R 中计算 outlier_score)

+0

你如何定义'outlier_score'?我会给他们一个'B-'......另外,你显示的'dput'不是'data.table'(这是一个包),而是'data.frame'。 – Justin

+0

@Justin,我已经可以找出异常值。我想知道的是每个异常值有多严重:该数据点距离所在国家的平均值有多远? – user1471980

+5

请更新您的Q并提供您想要的输出。 [这可能有所帮助](http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) –

回答

2
# Outlier score: for every record flagged as an outlier, take the absolute
# difference between its score and the mean stored on that record;
# all other records are left missing (NA)
x$distance.to.mean <- ifelse(x$outlier1, abs(x$Score - x$mean), NA)

# records that actually received a distance
has.distance <- !is.na(x$distance.to.mean)

# rank those records by their distance (rank 1 = smallest distance, tied
# distances share their average rank); every other record keeps rank = NA.
# the columns are read and written with `$` instead of x[ rows , 'column' ]:
# `x` carries the "data.table" class, and data.table's `[` does not return a
# single quoted column as a plain vector the way data.frame's `[` does, so
# rank() would not be handed the numeric vector it needs
x$rank <- NA_real_
x$rank[has.distance] <- rank(x$distance.to.mean[has.distance])

# see the result
x

# sum up the number of outliers in each country grouping
outliers.by.country <- tapply(x$outlier1, x$Country, sum)

# take a look at those counts
outliers.by.country

# for every record, the position of its country in the outliers.by.country table
y <- match(x$Country, names(outliers.by.country))

# merge the per-country count onto x;
# as.vector() strips the names and the 1-d array shape inherited from
# tapply() so the new column is an ordinary vector
x$sum.outliers <- as.vector(outliers.by.country[y])

# sort by the sum if you like (countries with the most outliers first)
x <- x[order(x$sum.outliers, decreasing = TRUE), ]
+0

@Anthody Damico,这太好了。我还有一个问题:我如何根据异常值数量和排名对每个国家进行排序? – user1471980