学习笔记总结于『生信技能树』马拉松课程
本文学习复现《Oxidative Stress Response Biomarkers of Ovarian Cancer Based on Single-Cell and Bulk RNA Sequencing》(基于单细胞和Bulk转录组的卵巢癌氧化应激反应生物标志物)一文中的图,其中Oxidative Stress Response缩写为ROS。文章包含了芯片、转录组、单细胞等技术,适合用来复现及学习
七、森林图&诺模图
本篇学习四个作图:高/低风险KMplot
、timeROC曲线
、森林图
、诺模图
1.高/低风险KMplot & timeROC曲线
1.1模型
rm(list = ls())
load("../5.GEOdatapre/GSE17260.Rdata")
load("../5.GEOdatapre/GSE26712.Rdata")
load("../6.model/lassogene.Rdata")
load("../6.model/TCGA-OV_sur_model.Rdata")
library(survival)
library(survminer)
library(glmnet)
load("../6.model/lasso_model.Rdata")
load("../6.model/rsurv.Rdata")
genes = lassoGene
# 计算风险评分
dats = list(dat1 = cbind(meta,t(exprSet[genes,])),
dat2 = cbind(pd1,t(exp1[genes,])),
dat3 = cbind(pd2,t(exp2[genes,])))
library(dplyr)
# 如果选用最佳截点就if(F);如果按中位数截断就if(T),对于得到的结果,哪个值让log-rank test的p值最小就用哪个
# 文章用的是中位数,如果我们也用中位数,p值为0.058 > 0.05,所以用最佳截点
if(F){
#中位数
survs = lapply(dats, function(x){
RS = apply(x[,genes], 1,function(k)sum(coef$coefficient* k)) #这个函数的内容即加权求和
x$RS = RS
x$group = ifelse(x$RS<median(x$RS),"low","high") #中位数和最佳截点区别在于分组,一个按照median,一个按照cut
x$group = factor(x$group,levels = c("low","high"))
return(x)})
}else{
#最佳截点
survs = lapply(dats, function(x){
RS = apply(x[,genes], 1,function(k)sum(coef$coefficient* k))
x$RS = RS
res.cut = survminer::surv_cutpoint(x, time = "time", event = "event", variables = "RS")
cut = res.cut[["cutpoint"]][1, 1]
x$group = ifelse(x$RS< cut,"low","high")
x$group = factor(x$group,levels = c("low","high"))
return(x)})
}
head(survs[[1]])# fp:预测值;group:风险分组
save(cvfit,genes,survs,file = "model_genes_survs.Rdata")
# survs里的每个数据都是由临床信息、建模基因的表达量、RS(预测值)、group(风险分组)构成,整理成了相同格式
1.2timeROC曲线
library(timeROC)
result = list()
p = list()
for(i in 1:3){
result[[i]] <-with(survs[[i]], timeROC(T=time,
delta=event,
marker=RS,
cause=1,
times=c(12,36,60),
iid = TRUE))
dat = data.frame(fpr = as.numeric(result[[i]]$FP),
tpr = as.numeric(result[[i]]$TP),
time = rep(as.factor(c(12,36,60)),each = nrow(result[[i]]$TP)))
library(ggplot2)
p[[i]] = ggplot() +
geom_line(data = dat,aes(x = fpr, y = tpr,color = time),size = 1) +
scale_color_manual(name = NULL,values = c("#92C5DE", "#F4A582", "#66C2A5"),
labels = paste0("AUC of ",c(1,3,5),"-y survival: ",
format(round(result[[i]]$AUC,2),nsmall = 2)))+
geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
theme_bw()+
theme(panel.grid = element_blank(),
legend.background = element_rect(linetype = 1, size = 0.2, colour = "black"),
legend.position = c(0.765,0.125))+
scale_x_continuous(expand = c(0.005,0.005))+
scale_y_continuous(expand = c(0.005,0.005))+
labs(x = "1 - Specificity",
y = "Sensitivity")+
coord_fixed()
}
library(patchwork)
wrap_plots(p)
ggsave("time_ROC.png",height = 7,width = 15)
得到如图76,如果模型能达到0.8+就比较稳
C-index
、ROC曲线
可以来衡量模型的好坏
C-index
:C指数或一致性指数(concordance index),反映模型预测结果与实际情况的一致程度,0.5为完全不一致,说明该模型没有预测作用;1为完全一致,说明该模型预测结果与实际完全一致
AUC值
:ROC曲线下的面积,它的横纵坐标范围在0-1之间,总面积为1,ROC曲线下的面积范围0.5-1之间。AUC值还分年份,如图76(左)所示。假设一个病人在三年半时死亡,即3.5年,那么在1年、3年、5年时的预测结果就会不一样
1.3高/低风险KM_plot
library(survival)
library(survminer)
corhorts = c("TCGA","geo1","geo2")
splots = list()
for(i in 1:3){
x = survs[[i]]
sfit1 = survfit(Surv(time, event) ~ group, data = x)
splots[[i]] = ggsurvplot(sfit1, pval = TRUE, palette = "jco",
data = x, legend = c(0.8, 0.8), title =corhorts[[i]],risk.table = T)
}
png("survs.png",height = 600,width = 1500)
arrange_ggsurvplots(splots,ncol = 3)
dev.off()
三图联动
source("risk_plot2.R")
risk_plot2(survs[[1]],genes,cut.point = T)#T即选择最佳截点,= F即选择中位数
risk_plot2(survs[[2]],genes,cut.point = T)
risk_plot2(survs[[3]],genes,cut.point = T)
得到如图78,三张图都是按照从低风险到高风险,从左到右排序,①最底下的热图即选中的那26个基因的表达量;②中间图的横坐标是病人的ID,纵坐标是时间;③最主要的是第一张图风险分数
2.森林图 & 诺模图
回顾一下文章的单因素/多因素cox分析的风险森林图,如图79所示,美化版本可参考https://zhuanlan.zhihu.com/p/510548311,此处分析简单版
图79的一个特点:都是分类变量。风险分数分为高低组、年龄等也做了分类。单因素无所谓,多因素的话要么全部使用连续型变量,要么全部离散型,如果都有就不太好
2.1森林图
rm(list = ls())
load("../6.model/TCGA-OV_sur_model.Rdata")
load("../6.model/lasso_model.Rdata")
load("../6.model/lassogene.Rdata")
load("../6.model/rsurv.Rdata")
library(stringr)
library(survival)
library(survminer)
dat = rsurv[,-c(1,7)]
model1 = coxph(Surv(time,event)~.,data = dat)
ggforest(model1,data = dat)
save(dat,file = "nomodat.Rdata")
2.2诺模图
library(rms)
dat$time = dat$time*12 #本来是月份,我们使用了天数
dd<-datadist(dat)
options(datadist="dd")
mod <- cph(as.formula(paste("Surv(time, event) ~ ",paste(colnames(dat)[3:ncol(dat)],collapse = "+"))),
data=dat,x=T,y=T,surv = T)
surv<-Survival(mod)
m1<-function(x) surv(365,x)
m3<-function(x) surv(1095,x)
m5<-function(x) surv(1825,x)
x<-nomogram(mod,
fun = list(m1,m3,m5),
funlabel = c('1-y survival',
'3-y survival',
'5-y survival'),
lp = F)
plot(x)
得到图81
诺模图的含义以图82为例
谢谢观看