关于概念,可以阅读笔者的或者其他老师的推文:
基础概念:临床预测模型概述5-临床预测模型评价指标(区分度,校准度和临床决策曲线)临床预测模型概述4-统计模型(Logistic,Cox,Lasso) logistic和cox回归:临床预测模型概述6-统计模型实操-单/多因素Logistic回归 临床预测模型概述6-统计模型实操-单/多因素Cox回归
分析流程
1.导入
rm(list = ls())
library(stringr)
library(survival)
library(survminer)
proj <- "ttt"
load('data.Rdata') # TCGA-HNSC数据
2.数据预处理
colnames(meta)
variables <- c("cluster", "gender", "neoadjuvant")
meta <- cbind(meta[,c(1:3)],
meta[,c("cluster", "gender", "neoadjuvant")])
data <- meta
data <- na.omit(data)
dim(data)
# 如果是连续的代码,需要设置data
data$OS.time <- data$OS.time
# 对变量进行因子化
data$cluster <- factor(data$cluster,levels = c("1","2"))
data$gender <- factor(data$gender,levels = c("FEMALE","MALE"))
data$neoadjuvant <- factor(data$neoadjuvant,levels = c("No","Yes"))
# 数据分割 7:3,8:2 均可
# 划分是随机的,设置种子数可以让结果复现
set.seed(123)
ind <- sample(1:nrow(data), size = 0.7*nrow(data))
train <- data[ind,]
test <- data[-ind, ]
3.coxph计算C-index
# 训练集c-index和95%CI
Ctrain <- coxph(Surv(OS.time,OS)~cluster+gender+neoadjuvant,data=train)
summary(Ctrain)
# Call:
# coxph(formula = Surv(OS.time, OS) ~ cluster + gender + neoadjuvant,
# data = train)
#
# n= 345, number of events= 142
#
# coef exp(coef) se(coef) z Pr(>|z|)
# cluster2 0.3916 1.4793 0.1713 2.285 0.0223 *
# genderMALE -0.3254 0.7223 0.1793 -1.815 0.0695 .
# neoadjuvantYes 0.2128 1.2371 0.4674 0.455 0.6490
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# exp(coef) exp(-coef) lower .95 upper .95
# cluster2 1.4793 0.6760 1.0573 2.070
# genderMALE 0.7223 1.3845 0.5083 1.026
# neoadjuvantYes 1.2371 0.8084 0.4949 3.092
#
# Concordance= 0.565 (se = 0.025 )
# Likelihood ratio test= 8.81 on 3 df, p=0.03
# Wald test = 9.07 on 3 df, p=0.03
# Score (logrank) test = 9.18 on 3 df, p=0.03
# 这里的Concordance就是C指数,为0.565
# 验证集c-index和95%CI
# 这里中间的环节是使用predict函数去预测test数据集中每个样本的风险值
# 然后将得到的每个样本的风险值再进行coxph拟合成一个新的函数
# 最后使用predict(Ctrain, newdata = test)计算的风险评分作为一个新的自变量来拟合Cox模型
Ctest <- coxph(Surv(OS.time,OS)~predict(Ctrain,newdata = test,type="lp"),data =test)
summary(Ctest)
# Call:
# coxph(formula = Surv(OS.time, OS) ~ predict(Ctrain, newdata = test,
# type = "lp"), data = test)
#
# n= 148, number of events= 70
#
# coef exp(coef) se(coef) z Pr(>|z|)
# predict(Ctrain, newdata = test, type = "lp") 1.2090 3.3501 0.4553 2.656 0.00792 **
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# exp(coef) exp(-coef) lower .95 upper .95
# predict(Ctrain, newdata = test, type = "lp") 3.35 0.2985 1.373 8.176
#
# Concordance= 0.607 (se = 0.036 )
# Likelihood ratio test= 6.74 on 1 df, p=0.009
# Wald test = 7.05 on 1 df, p=0.008
# Score (logrank) test = 7.18 on 1 df, p=0.007
# 这里的Concordance就是C指数,为0.607
4.时间依赖的ROC曲线
方法一(survivalROC)
################## 训练集
# 拟合 Cox 回归模型
fit <- coxph(Surv(OS.time,OS)~cluster+gender+neoadjuvant,data=train)
# 训练集12,24和36个月的死亡概率
# 为什么需要计算死亡概率,这是因为我们这里的发生事件是死亡!!
# 其他的就用其他的含义哈~ 比如疾病发生率啥的
train$deadPro12 <- c(1 - (summary(survfit(fit, newdata = train), times = 12)$surv))
train$deadPro24 <- c(1 - (summary(survfit(fit, newdata = train), times = 24)$surv))
train$deadPro36 <- c(1 - (summary(survfit(fit, newdata = train), times = 24)$surv))
# 验证集 3 个月和 6 个月死亡概率
test$deadPro12 <- c(1 - (summary(survfit(fit, newdata = test), times = 12)$surv))
test$deadPro24 <- c(1 - (summary(survfit(fit, newdata = test), times = 24)$surv))
test$deadPro36 <- c(1 - (summary(survfit(fit, newdata = test), times = 24)$surv))
# 开始计算
library(survivalROC)
# 训练集用"KM"法拟合12个月生存的ROC曲线
SROC <- survivalROC(Stime = train$OS.time,
status = train$OS,
marker = train$deadPro12,
predict.time = 12,
method = "KM")
# 计算最佳截断值
cut.op <- SROC$cut.values[which.max(SROC$TP - SROC$FP)]
cut.op # 输出最佳断值,注意这里是12月死亡率
# 把SROC转化为dataFrame
SROC_df <- as.data.frame(SROC)
# 训练集用"KM"法拟合24个月生存的ROC曲线
SROC1 <- survivalROC(Stime = train$OS.time,
status = train$OS,
marker = train$deadPro24,
predict.time = 24,
method = "KM")
# 计算最佳截断值
cut.op <- SROC1$cut.values[which.max(SROC1$TP - SROC1$FP)]
cut.op
# 把SROC1转化为dataFrame
SROC1_df <- as.data.frame(SROC1)
# 训练集用"KM"法拟合24个月生存的ROC曲线
SROC2 <- survivalROC(Stime = train$OS.time,
status = train$OS,
marker = train$deadPro36,
predict.time = 36,
method = "KM")
# 计算最佳截断值
cut.op <- SROC2$cut.values[which.max(SROC2$TP - SROC2$FP)]
cut.op
# 把SROC1转化为dataFrame
SROC2_df <- as.data.frame(SROC2)
################## 验证集
# 训练集用"KM"法拟合12个月生存的ROC曲线
AROC <- survivalROC(Stime = test$OS.time,
status = test$OS,
marker = test$deadPro12,
predict.time = 12,
method = "KM")
# 计算最佳截断值
cut.op <- AROC$cut.values[which.max(AROC$TP - AROC$FP)]
cut.op # 输出最佳断值,注意这里是12月死亡率
# 把SROC转化为dataFrame
AROC_df <- as.data.frame(AROC)
# 训练集用"KM"法拟合24个月生存的ROC曲线
AROC1 <- survivalROC(Stime = test$OS.time,
status = test$OS,
marker = test$deadPro24,
predict.time = 24,
method = "KM")
# 计算最佳截断值
cut.op <- AROC1$cut.values[which.max(AROC1$TP - AROC1$FP)]
cut.op
# 把SROC1转化为dataFrame
AROC1_df <- as.data.frame(AROC1)
# 训练集用"KM"法拟合24个月生存的ROC曲线
AROC2 <- survivalROC(Stime = test$OS.time,
status = test$OS,
marker = test$deadPro36,
predict.time = 36,
method = "KM")
# 计算最佳截断值
cut.op <- AROC2$cut.values[which.max(AROC2$TP - AROC2$FP)]
cut.op
# 把SROC1转化为dataFrame
AROC2_df <- as.data.frame(AROC2)
可视化(survivalROC)
################## 训练集
# SROC_df, SROC1_df, SROC2_df分别是 12, 24, 36个月的数据
SROC_df$time_point <- "12 months"
SROC1_df$time_point <- "24 months"
SROC2_df$time_point <- "36 months"
SROC_total <- rbind(SROC_df,SROC1_df,SROC2_df)
# 提取 AUC 值
auc_12 <- unique(SROC_df$AUC)[1]
auc_24 <- unique(SROC1_df$AUC)[1]
auc_36 <- unique(SROC2_df$AUC)[1]
# 设置颜色和线条标签
colors <- c("12 months" = "red", "24 months" = "blue", "36 months" = "orange")
labels <- c(
paste("AUC at 1 year:", round(auc_12, 3)),
paste("AUC at 2 years:", round(auc_24, 3)),
paste("AUC at 3 years:", round(auc_36, 3))
)
# TP就是敏感度,FP就是1-特异度
# 使用 ggplot2 绘制带 AUC 信息的 ROC 曲线
ggplot(SROC_total, aes(x = FP, y = TP, color = time_point)) +
geom_line(size = 1) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
labs(
title = "ROC Curve (Train)",
x = "False Positive Rate (FP)",
y = "True Positive Rate (TP)"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
legend.position = c(0.8, 0.15), # 控制图例的位置,(x, y) 取值范围 0 到 1
legend.background = element_rect(fill = "white", color = "black"), # 添加白色背景和边框
legend.title = element_blank() # 移除图例标题
) +
scale_color_manual(
values = colors,
labels = labels
)
ggsave("ROCurve(train).pdf",width = 9,height = 7)
################## 验证集
# AROC_df, AROC1_df, AROC2_df分别是 12, 24, 36个月的数据
AROC_df$time_point <- "12 months"
AROC1_df$time_point <- "24 months"
AROC2_df$time_point <- "36 months"
AROC_total <- rbind(AROC_df,AROC1_df,AROC2_df)
# 提取 AUC 值
auc_12 <- unique(AROC_df$AUC)[1]
auc_24 <- unique(AROC1_df$AUC)[1]
auc_36 <- unique(AROC2_df$AUC)[1]
# 设置颜色和线条标签
colors <- c("12 months" = "red", "24 months" = "blue", "36 months" = "orange")
labels <- c(
paste("AUC at 1 year:", round(auc_12, 3)),
paste("AUC at 2 years:", round(auc_24, 3)),
paste("AUC at 3 years:", round(auc_36, 3))
)
# TP就是敏感度,FP就是1-特异度
# 使用 ggplot2 绘制带 AUC 信息的 ROC 曲线
ggplot(AROC_total, aes(x = FP, y = TP, color = time_point)) +
geom_line(size = 1) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
labs(
title = "ROC Curve (Test)",
x = "False Positive Rate (FP)",
y = "True Positive Rate (TP)"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
legend.position = c(0.8, 0.15), # 控制图例的位置,(x, y) 取值范围 0 到 1
legend.background = element_rect(fill = "white", color = "black"), # 添加白色背景和边框
legend.title = element_blank() # 移除图例标题
) +
scale_color_manual(
values = colors,
labels = labels
)
ggsave("ROCurve(test).pdf",width = 9,height = 7)
方法二(timeROC)
# 训练集建模
fit <- coxph(Surv(OS.time,OS)~cluster+gender+neoadjuvant,data=train)
variables <- c("cluster","gender","neoadjuvant")
# 数据整合,分别提取想要分析的数据
val_dd_list2 <- list(
Train = train[, c("OS.time", "OS", variables)],
Test = test[, c("OS.time", "OS", variables)]
)
# 预测并计算 C-index
rs <- lapply(val_dd_list2, function(x) {
cbind(x[, 1:2],RS = predict(fit, newdata = x,type = "lp"))
})
可视化(timeROC)
# C-index--绘制
library(timeROC)
# 计算时间依赖性ROC曲线和AUC(使用第一个验证数据集作为示例)
# 命名
g <- "Test"
# 定义一个函数来根据名称查找数据集在列表中的位置
find_dataset_position <- function(list, dataset_name) {
# 在列表的名称中查找匹配的数据集名称
position <- which(names(list) == dataset_name)
# 返回位置
return(position)
}
# 使用例子
position <- find_dataset_position(rs, g)
print(position)
i <- position
# 开始计算
ROC <- timeROC(T = rs[[i]]$OS.time,
delta = rs[[i]]$OS,
marker = rs[[i]]$RS,
cause = 1,
weighting = "marginal", #计算方法默认marginal
times = c(12,24,36),
iid = TRUE)
ROC
#开始绘图
pdf(paste0(g,".pdf"),width = 6, height = 6)
plot(ROC,
time = 12,
title = F, #timeROC不能直接添加标题,要额外添加
col= "#f5695e",
#print.thres.col="black",
#identity.col="grey", timeROC函数中这个没用
lwd = 3,
cex.main = 1.5, # 主标题字体大小
cex.lab = 1.2, # 轴标签字体大小
cex.axis = 1, # 刻度字体大小
font.main = 1, # 主标题字体样式(1为普通,2为粗体)
font.lab = 1, # 轴标签字体样式
font.axis = 1) # 刻度字体样式
# 添加一条灰色的对角线
abline(a = 0, b = 1, col = "grey", lty = 2, lwd = 2)
# 使用 title() 函数添加额外的图形标题
title(main = paste0("ROC Curve(",g,")"))
#添加线
plot(ROC,
time=24, col="#2667bb", add=TRUE, lwd=2) #add指是否添加在上一张图中
plot(ROC,
time=36, col="#e6b707", add=TRUE, lwd=2)
#添加标签信息
legend("bottomright",
c(paste0("AUC at 1 year: ",round(ROC[["AUC"]][1],3)),
paste0("AUC at 2 year: ",round(ROC[["AUC"]][2],3)),
paste0("AUC at 3 year: ",round(ROC[["AUC"]][3],3))),
col=c("#f5695e", "#2667bb", "#e6b707"),
lty=1, lwd=2,bty = "o")
dev.off() # 关闭设备,完成保存
两个R包最终的结果总体还是差不多的~
参考资料:
survivalROC:https://github.com/cran/survivalROC timeROC:https://github.com/cran/timeROC 生信星球:https://mp.weixin.qq.com/s/xkvGFobVt78---_J8-MYBA https://mp.weixin.qq.com/s/E_gG4YZEVz232opsjc8xMA 医学和生信笔记:https://mp.weixin.qq.com/s/Ya-XHd8khMXCSHnY2l7Xtg https://mp.weixin.qq.com/s/hjXFgLxtegyfx_SD437m2Q https://mp.weixin.qq.com/s/np7r7njHUZJplu0bxCelRw
注:若对内容有疑惑或者有发现明确错误的朋友,请联系后台(欢迎交流)。更多内容可关注公众号:生信方舟
- END -