2017-08-02 62 views
0

我正在尝试使用ggplot2创建一个带状图。以下是tbl的子集,其中包含我正在使用的相关列,以及它的dput输出。

(问题标题:ggplot2:如何自定义点的颜色和形状?)

> tbl[,c('Study_ID', 'Probe_ID', 'Group1','Group2','LogFC', 'adj_P_Value', 'P_Value', 'CI_L','CI_R','Disease')] 
    Study_ID Probe_ID    Group1     Group2  LogFC adj_P_Value  P_Value  CI_L  CI_R 
1 GSE2461 220307_at     Male     Female -0.09017596 1.000000e+00 5.662047e-01 -0.43955752 0.25920561 
2 GSE2461 220307_at ulcerative colitis irritable bowel syndrome 0.08704844 1.000000e+00 5.784053e-01 -0.26134341 0.43544028 
3 GSE27887 220307_at  nonlesional skin   lesional skin -0.03501474 1.000000e+00 4.409881e-01 -0.12677636 0.05674688 
4 GSE27887 220307_at   pretreatment   posttreatment 0.01096914 1.000000e+00 8.080366e-01 -0.08064105 0.10257932 
5 GSE42296 7921677   Infliximab   Before treatment -0.03707265 1.000000e+00 3.979403e-01 -0.12407201 0.04992672 
6 GSE42296 7921677   Responder    Nonresponder 0.07644834 1.000000e+00 1.505444e-01 -0.02849309 0.18138977 
7 GSE42296 7921677 Rheumatoid Arthritis   Crohn's Disease 0.42318863 3.960125e-06 1.989713e-10 0.31076269 0.53561457 
8 GSE58558 220307_at     M      F -0.11881801 1.000000e+00 1.130180e-01 -0.26629675 0.02866072 
9 GSE58558 220307_at non lesional skin   lesional skin -0.18914128 1.000000e+00 3.696739e-03 -0.31525660 -0.06302596 
10 GSE58558 220307_at   responder    nonresponder -0.14470319 1.000000e+00 2.328062e-01 -0.38396386 0.09455748 
11 GSE58558 220307_at    week 12     day 1 -0.39619004 4.311942e-01 2.215798e-05 -0.57226227 -0.22011781 
12 GSE58558 220307_at    week 2     day 1 -0.28765455 1.000000e+00 8.753977e-04 -0.45375957 -0.12154953 
13 GSE59294 220307_at C Dupilumab 300 mg  B Dupilumab 150 mg 0.16853309 1.000000e+00 1.140155e-01 -0.04273877 0.37980494 
14 GSE59294 220307_at   D Placebo  B Dupilumab 150 mg -0.18995566 1.000000e+00 2.264691e-01 -0.50367856 0.12376724 
15 GSE59294 220307_at    NL skin     LS skin 0.01376129 1.000000e+00 9.041383e-01 -0.21711706 0.24463964 
16 GSE59294 220307_at     Pre      Post 0.02234607 1.000000e+00 8.069367e-01 -0.16235054 0.20704268 
             Disease 
1 irritable bowel syndrome; ulcerative colitis 
2 irritable bowel syndrome; ulcerative colitis 
3        atopic Dermatitis 
4        atopic Dermatitis 
5   Crohn's Disease; Rheumatoid Arthritis 
6   Crohn's Disease; Rheumatoid Arthritis 
7   Crohn's Disease; Rheumatoid Arthritis 
8        Atopic Dermatitis 
9        Atopic Dermatitis 
10       Atopic Dermatitis 
11       Atopic Dermatitis 
12       Atopic Dermatitis 
13       atopic Dermatitis 
14       atopic Dermatitis 
15       atopic Dermatitis 
16       atopic Dermatitis 

以下是dput的输出:

> dput(droplevels(tbl[,c('Study_ID', 'Probe_ID', 'Group1','Group2','LogFC', 'adj_P_Value', 'P_Value', 'CI_L','CI_R','Disease')])) 
structure(list(Study_ID = c("GSE2461", "GSE2461", "GSE27887", 
"GSE27887", "GSE42296", "GSE42296", "GSE42296", "GSE58558", "GSE58558", 
"GSE58558", "GSE58558", "GSE58558", "GSE59294", "GSE59294", "GSE59294", 
"GSE59294"), Probe_ID = c("220307_at", "220307_at", "220307_at", 
"220307_at", "7921677", "7921677", "7921677", "220307_at", "220307_at", 
"220307_at", "220307_at", "220307_at", "220307_at", "220307_at", 
"220307_at", "220307_at"), Group1 = c("Male", "ulcerative colitis", 
"nonlesional skin", "pretreatment", "Infliximab", "Responder", 
"Rheumatoid Arthritis", "M", "non lesional skin", "responder", 
"week 12", "week 2", "C Dupilumab 300 mg", "D Placebo", "NL skin", 
"Pre"), Group2 = c("Female", "irritable bowel syndrome", "lesional skin", 
"posttreatment", "Before treatment", "Nonresponder", "Crohn's Disease", 
"F", "lesional skin", "nonresponder", "day 1", "day 1", "B Dupilumab 150 mg", 
"B Dupilumab 150 mg", "LS skin", "Post"), LogFC = c(-0.0901759558643281, 
0.0870484364429408, -0.0350147376937934, 0.0109691380052655, 
-0.0370726462749328, 0.0764483363743359, 0.423188628619509, -0.118818013184408, 
-0.189141277685995, -0.144703191279992, -0.396190039768736, -0.28765454670704, 
0.168533085440721, -0.189955660434197, 0.0137612879743023, 0.0223460675171673 
), adj_P_Value = c(1, 1, 1, 1, 1, 1, 3.96012504622782e-06, 1, 
1, 1, 0.431194244819507, 1, 1, 1, 1, 1), P_Value = c(0.566204678925109, 
0.578405275354266, 0.440988072013756, 0.808036622723435, 0.397940346528484, 
0.150544373610059, 1.98971262936634e-10, 0.11301796668591, 0.00369673863311212, 
0.232806229179741, 2.21579776371792e-05, 0.000875397680320129, 
0.114015475901252, 0.226469133014055, 0.904138332714553, 0.806936684043586 
), CI_L = c(-0.439557521861354, -0.261343410788222, -0.12677635951562, 
-0.0806410486876688, -0.124072011981945, -0.0284930943795223, 
0.310762687356251, -0.26629674914578, -0.315256597358499, -0.383963864121397, 
-0.57226227039893, -0.453759565458485, -0.0427387734415052, -0.503678563834605, 
-0.217117064412363, -0.162350541147386), CI_R = c(0.259205610132698, 
0.435440283674103, 0.0567468841280329, 0.1025793246982, 0.0499267194320791, 
0.181389767128194, 0.535614569882768, 0.0286607227769647, -0.0630259580134921, 
0.0945574815614131, -0.220117809138542, -0.121549527955595, 0.379804944322947, 
0.12376724296621, 0.244639640360967, 0.207042676181721), Disease = c("irritable bowel syndrome; ulcerative colitis", 
"irritable bowel syndrome; ulcerative colitis", "atopic Dermatitis", 
"atopic Dermatitis", "Crohn's Disease; Rheumatoid Arthritis", 
"Crohn's Disease; Rheumatoid Arthritis", "Crohn's Disease; Rheumatoid Arthritis", 
"Atopic Dermatitis", "Atopic Dermatitis", "Atopic Dermatitis", 
"Atopic Dermatitis", "Atopic Dermatitis", "atopic Dermatitis", 
"atopic Dermatitis", "atopic Dermatitis", "atopic Dermatitis" 
)), .Names = c("Study_ID", "Probe_ID", "Group1", "Group2", "LogFC", 
"adj_P_Value", "P_Value", "CI_L", "CI_R", "Disease"), row.names = c(NA, 
-16L), class = "data.frame") 

最后,这里是我到目前为止的代码。

# Test using ggplot2: plot LogFC per contrast with CI_L..CI_R error bars,
# one facet row per Study_ID/contrast, points coloured by adj_P_Value bin.

# Largest and smallest LogFC; used below to widen the x limits past [-2, 2].
# The as.character() step presumably guards against LogFC being stored as a
# factor -- TODO confirm against how tbl is read in.
maxFC = max(as.numeric(as.character(tbl$LogFC))) 
minFC = min(as.numeric(as.character(tbl$LogFC))) 


# Index (minus 0.5) of the first row of each study.
# NOTE(review): neither datasetList nor hLines is used again in this block.
datasetList = tbl$Study_ID 
hLines =(which(duplicated(datasetList) == FALSE) - 0.5) 


# Facet strip label of the form "<Group2> -> <newline><Group1>"
# (\U2192 is the rightwards-arrow character).
tbl$ylab <- paste(tbl$Group2," \U2192 ","\n", tbl$Group1, sep = "") 


p <- ggplot(data = tbl, aes(x = LogFC, y = Probe_ID, group = Study_ID)) + 
    # Vertical reference lines at log2 of fold changes 0.5, 2/3, 1.5 and 2.
    geom_vline(xintercept = log(0.5,2), size = 0.2) + 
    geom_vline(xintercept = log(2/3,2), size = 0.2) + 
    geom_vline(xintercept = log(1.5,2), size = 0.2) + 
    geom_vline(xintercept = log(2,2), size = 0.2) + 
    # NOTE(review): tbl$gene is not among the columns shown for tbl; confirm it
    # exists and holds a single value suitable for a title.
    labs(title = tbl$gene, y = "Contrasts", x = bquote(~Log[2]~'(Fold Change)')) + 
    # Horizontal error bar spanning the confidence interval of each contrast.
    geom_errorbarh(aes(x = LogFC, xmin = CI_L, xmax = CI_R), height = .1) + 
    # Only colour is mapped here, to the adj_P_Value bin produced by cut().
    geom_point(aes(colour = cut(adj_P_Value, c(-Inf, 0.01, 0.05, Inf)))) + 
    scale_color_manual(name = "P Value", 
        values = c("(-Inf,0.01]" = "red", 
           # NOTE(review): cut() builds right-closed intervals by default, so
           # its label for this bin ends in "]"; this key ends in ")" and would
           # not match that level.
           "(0.01,0.05)" = "orange", 
           "(0.05, Inf]" = "black"), 
        labels = c("<= 0.01", "0.01 < P Value <= 0.05", "> 0.05")) + 
    # NOTE(review): no aes() in this block maps `shape`, so this scale has
    # nothing to act on.
    scale_shape_manual(values = c(4,15,19)) + 
    # Show at least [-2, 2] on x, wider if the data exceed it.
    coord_cartesian(xlim = c(min(-2,minFC),max(2,maxFC))) + 

    # Hide y-axis text, ticks and axis lines; set the strip text angle to 180.
    theme(axis.text.y = element_blank(), strip.text.y = element_text(angle = 180), 
     #panel.grid.major = element_blank(), 
     #panel.grid.minor = element_blank(), 
     axis.line.y = element_blank(), 
     axis.line.x = element_blank(), 
     #panel.background = element_rect(fill = 'white', colour = 'white'), 
     #panel.grid = element_blank(), 
     panel.spacing.y = unit(0.5,'lines'), 
     axis.ticks.y = element_blank()) + 
    # One facet row per Study_ID/ylab combination, with free scales and space.
    facet_grid(Study_ID+ylab~ ., scales = 'free', space = 'free', switch = 'both') 


# Display the plot.
p 

基本上,点的实际位置由其LogFC值决定,但adj_P_Value <= 0.01的点应显示为红色圆圈,介于0.01和0.05之间的点应显示为橙色正方形,>= 0.05的点应显示为黑色十字(也就是说,我提供的数据不应该显示任何正方形)。我的尝试是在geom_point中使用cut,但这似乎不起作用:颜色显示正确,但形状不正确。这个问题已经困扰我一段时间了。如果我违反了任何约定或标准(很可能有),请告诉我,并建议一种更好的方式来实现我已经完成的事情。谢谢!

更新

# Test using ggplot2: plot LogFC per contrast with CI_L..CI_R error bars,
# one facet row per Study_ID/contrast, colour and shape driven by the
# adjusted-p-value bin.

# Largest and smallest LogFC; used below to widen the x limits past [-2, 2].
# as.character() first so a factor column converts to its printed values
# rather than to its integer codes.
maxFC <- max(as.numeric(as.character(tbl$LogFC)))
minFC <- min(as.numeric(as.character(tbl$LogFC)))

# Index (minus 0.5) of the first row of each study.
# NOTE(review): not used by the plot below; kept in case later code needs it.
datasetList <- tbl$Study_ID
hLines <- which(!duplicated(datasetList)) - 0.5

# Facet strip label of the form "<Group2> -> <newline><Group1>"
# (\U2192 is the rightwards-arrow character).
tbl$ylab <- paste0(tbl$Group2, " \U2192 ", "\n", tbl$Group1)

# Bin the adjusted p-values: 1 = (<= 0.01), 2 = (0.01, 0.05), 3 = (>= 0.05).
# Rows with a missing adj_P_Value match no condition and get NA.
tbl <- tbl %>%
  mutate(colourgroup = case_when(
    adj_P_Value <= 0.01 ~ 1,
    adj_P_Value > 0.01 & adj_P_Value < 0.05 ~ 2,
    adj_P_Value >= 0.05 ~ 3
  ))

# colourgroup is numeric, so it is wrapped in factor(): manual colour/shape
# scales need a discrete variable, and shape cannot take a continuous one.
p <- ggplot(
  data = tbl,
  aes(
    x = LogFC, y = Probe_ID,
    colour = factor(colourgroup), shape = factor(colourgroup)
  )
) +
  # Vertical reference lines at log2 of fold changes 0.5, 2/3, 1.5 and 2.
  geom_vline(xintercept = log(0.5, 2), size = 0.2) +
  geom_vline(xintercept = log(2 / 3, 2), size = 0.2) +
  geom_vline(xintercept = log(1.5, 2), size = 0.2) +
  geom_vline(xintercept = log(2, 2), size = 0.2) +
  # NOTE(review): tbl$gene is not among the columns shown for tbl; confirm it
  # exists and holds a single value suitable for a title.
  labs(title = tbl$gene, y = "Contrasts", x = bquote(~Log[2]~'(Fold Change)')) +
  # Horizontal error bar spanning the confidence interval of each contrast.
  geom_errorbarh(aes(xmin = CI_L, xmax = CI_R), height = .1) +
  geom_point() +
  # `values` must be passed by name. The vectors are also named by factor
  # level so that each bin keeps its own colour/shape even when some bin is
  # absent from the data (unnamed values are handed out to the levels that
  # are present, in order). Shapes: 19 = solid circle, 15 = solid square,
  # 4 = cross. Both scales share a name so their legends merge.
  scale_color_manual(
    name = "P Value",
    values = c("1" = "red", "2" = "orange", "3" = "black")
  ) +
  scale_shape_manual(
    name = "P Value",
    values = c("1" = 19, "2" = 15, "3" = 4)
  ) +
  # Show at least [-2, 2] on x, wider if the data exceed it.
  coord_cartesian(xlim = c(min(-2, minFC), max(2, maxFC))) +
  # Hide y-axis text, ticks and axis lines; set the strip text angle to 180.
  theme(
    axis.text.y = element_blank(),
    strip.text.y = element_text(angle = 180),
    #panel.grid.major = element_blank(),
    #panel.grid.minor = element_blank(),
    axis.line.y = element_blank(),
    axis.line.x = element_blank(),
    #panel.background = element_rect(fill = 'white', colour = 'white'),
    #panel.grid = element_blank(),
    panel.spacing.y = unit(0.5, 'lines'),
    axis.ticks.y = element_blank()
  ) +
  # One facet row per Study_ID/ylab combination, with free scales and space.
  facet_grid(Study_ID + ylab ~ ., scales = 'free', space = 'free', switch = 'both')

# Display the plot.
p
+0

如果下面的回答对您有所帮助,请考虑点击回答左侧的对勾标记来接受它。这样社区就知道问题已经解决。如果没有帮助,也不用担心 – CPak

回答

1

添加一列,用来表示你想要的分组(之后作为因子使用):

library(dplyr) 
# Bin the adjusted p-values into numeric codes:
#   1 = at most 0.01, 2 = strictly between 0.01 and 0.05, 3 = 0.05 or more.
# case_when() uses the first condition that is TRUE, so the second test is
# only reached once the value is known to be above 0.01.
tbl <- mutate(
  tbl,
  colourgroup = case_when(
    adj_P_Value <= 0.01 ~ 1,
    adj_P_Value < 0.05 ~ 2,
    adj_P_Value >= 0.05 ~ 3
  )
)

然后把下面的第一个aes(...)改为第二个:

# Before: only a grouping by study; nothing is mapped to colour or shape.
aes(x = LogFC, y = Probe_ID, group = Study_ID) 

# After: colour and shape are both mapped to colourgroup, wrapped in factor()
# so that it is treated as a discrete variable.
aes(x = LogFC, y = Probe_ID, colour = factor(colourgroup), shape = factor(colourgroup)) 

并且加上:

# Name every value after the factor level it belongs to. Unnamed values are
# handed out in order to the levels actually present, so with data that only
# contains groups 1 and 3, group 3 would be drawn with the second entry.
# Shapes: 19 = solid circle, 15 = solid square, 4 = cross.
scale_color_manual(values = c("1" = "red", "2" = "orange", "3" = "black")) + 
scale_shape_manual(values = c("1" = 19, "2" = 15, "3" = 4)) 

小例子

下面这个最小的ggplot命令对我有效。注意我交换了x和y的取值,另外red和orange可能很难区分。

# Minimal example: points only, with x and y swapped relative to the question.
# NOTE(review): df2 is not defined in this snippet; presumably it is tbl with
# the colourgroup column added -- confirm.
# The manual values are named by factor level so that each group keeps its own
# colour and shape even when some group is absent from the data (unnamed
# values are handed out in order to the levels that are present).
# Shapes: 19 = solid circle, 15 = solid square, 4 = cross.
ggplot(df2, aes(x = Probe_ID, y = LogFC, colour = factor(colourgroup), shape = factor(colourgroup))) + 
    geom_point() + 
    scale_color_manual(values = c("1" = "red", "2" = "orange", "3" = "black")) + 
    scale_shape_manual(values = c("1" = 19, "2" = 15, "3" = 4)) 
+0

然后我可以自由删除'geom_point'和两条'scale_ *'行,对吗? –

+0

另外,`TRUE ~ as.character(x)` 是做什么的? –

+0

在 `TRUE ~ as.factor(x)` 处报错:找不到对象 'x' –