全网最全最准确的肿瘤样本药物敏感性预测教程

学术   2024-10-31 19:00   上海  
今天果叔给大家带来的分享也非常实用,利用pRRophetic包基于两个药物敏感性数据库PRISM和CTRP进行肿瘤样本药物敏感性预测,通过AUC值作为药物敏感性的衡量标准,较低的 AUC 值表明对治疗的敏感性增加,这样预测的结果会更加准确和全面,有需要的小伙伴赶紧行动起来吧,跟着果叔一起开始今天的实操学习吧!
1.加载需要的R包
library(tidyverse)library(impute) # 用于KNN填补药敏数据library(pRRophetic) # 用于药敏预测library(SimDesign) # 用于禁止药敏预测过程输出的信息library(ggplot2) # 绘图library(cowplot) #拼图
2.数据读取与处理
# 读取肝癌TPM表达谱,行名为基因名,列名为样本名。expr <- read.table("LIHC.TPM.txt",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T)
normsam <- colnames(expr[,which(substr(colnames(expr),11,12) == "11")])tumosam <- colnames(expr[,which(substr(colnames(expr),11,12) == "01")])# 读取maf突变文件(于cBioPortal下载)maf <- read_tsv("data_mutations_mskcc.txt", comment = "#")maf$Tumor_Sample_Barcode <- paste0("LIHC",substr(maf$Tumor_Sample_Barcode,8,15))# 提取既有表达数据又有突变数据的肿瘤样本 tumosam <- intersect(tumosam,unique(maf$Tumor_Sample_Barcode))maf <- maf[which(maf$Tumor_Sample_Barcode %in% tumosam),]expr <- expr[,c(tumosam,normsam)]# 提取TP53突变信息,并创建样本注释tp53 <- c()for (i in tumosam) { tmp <- maf[which(maf$Tumor_Sample_Barcode == i),] if(is.element("TP53", tmp$Hugo_Symbol)) { # 如果存在TP53 tp53 <- c(tp53,1) # 记录1 } else { tp53 <- c(tp53,0) # 否则记录0 }}names(tp53) <- tumosam# 取出有TP53突变的患者tp53.mutsam <- names(tp53[which(tp53 == 1)])# 1.读取17-gene signature,计算PPS得分(原文Table S3)signature <- read.table("17gene.txt",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T)dim(signature)# 计算pps,用到TP53突变患者的17个基因的表达矩阵pps <- as.numeric(apply(t(log2(expr[rownames(signature),tp53.mutsam] + 1)), 1, function(x) {x %*% signature$Coefficient}))# 标准化,把pps处理到0-1之间range01 <- function(x){(x-min(x))/(max(x)-min(x))}npps <- range01(pps)# 创建样本信息Sinfo <- data.frame(PPS = npps, TP53 = tp53[tp53.mutsam], row.names = tp53.mutsam, stringsAsFactors = F)# 把pps保存到文件write.csv(Sinfo, "output_PPS.csv", quote = F)normexpr <- as.matrix(expr[,normsam]) tumoexpr <- as.matrix(expr[,tp53.mutsam]) runpure <- F # 如果想运行就把这个改为Tif(runpure) { set.seed(123) # Run ISOpureR Step 1 - Cancer Profile Estimation ISOpureS1model <- ISOpure.step1.CPE(tumoexpr, normexpr) # For reproducible results, set the random seed set.seed(456); # Run ISOpureR Step 2 - Patient Profile Estimation ISOpureS2model <- ISOpure.step2.PPE(tumoexpr,normexpr,ISOpureS1model) pure.tumoexpr <- ISOpureS2model$cc_cancerprofiles} if(!runpure) { pure.tumoexpr <- tumoexpr}# 制作CTRP AUC矩阵,保存到CTRP_AUC.txt文件auc <- read.table("CTRP_AUC_raw.txt",sep = "\t",row.names = NULL,check.names = F,stringsAsFactors = F,header = T) # Supplementary Data Set 3auc$comb <- paste(auc$master_cpd_id,auc$master_ccl_id,sep = "-")auc <- apply(auc[,"area_under_curve",drop = F], 2, function(x) tapply(x, INDEX=factor(auc$comb), FUN=max, na.rm=TRUE)) # 重复项取最大AUCauc <- as.data.frame(auc)auc$master_cpd_id <- sapply(strsplit(rownames(auc),"-",fixed = T),"[",1)auc$master_ccl_id <- sapply(strsplit(rownames(auc),"-",fixed = T),"[",2)auc <- reshape(auc, direction = "wide", timevar = "master_cpd_id", idvar = "master_ccl_id")colnames(auc) <- gsub("area_under_curve.","",colnames(auc),fixed = T)ctrp.ccl.anno <- read.table("CTRP_ccl_anno.txt",sep = "\t",row.names = NULL,check.names = F,stringsAsFactors = F,header = T) # Supplementary Data Set 1ctrp.cpd.anno <- read.delim("CTRP_cpd_anno.txt",sep = "\t",row.names = NULL,check.names = F,stringsAsFactors = F,header = T) # Supplementary Data Set 2# 保存到文件 write.table(auc,"CTRP_AUC.txt",sep = "\t",row.names = F,col.names = T,quote = F)# 加载药敏AUC矩阵并进行数据处理ctrp.auc <- read.table("CTRP_AUC.txt",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T)prism.auc <- read.delim("PRISM_AUC.txt",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T) # 数据来自https://depmap.org/portal/download/ Drug sensitivity AUC (PRISM Repurposing Secondary Screen) 19Q4prism.ccl.anno <- prism.auc[,1:5] # 前5列为细胞系注释信息prism.auc <- prism.auc[,-c(1:5)]## a. 移除缺失值大于20%的药物ctrp.auc <- ctrp.auc[,apply(ctrp.auc,2,function(x) sum(is.na(x))) < 0.2*nrow(ctrp.auc)]prism.auc <- prism.auc[,apply(prism.auc,2,function(x) sum(is.na(x))) < 0.2*nrow(prism.auc)]## b. 移除CTRP数据里源自haematopoietic_and_lymphoid_tissue的细胞系rmccl <- paste0("CCL",na.omit(ctrp.ccl.anno[which(ctrp.ccl.anno$ccle_primary_site == "haematopoietic_and_lymphoid_tissue"),"master_ccl_id"]))rownames(ctrp.auc) <- paste0("CCL",rownames(ctrp.auc))ctrp.auc <- ctrp.auc[setdiff(rownames(ctrp.auc),rmccl),]## c. KNN填补缺失值ctrp.auc.knn <- impute.knn(as.matrix(ctrp.auc))$dataprism.auc.knn <- impute.knn(as.matrix(prism.auc))$data## d. 数据量级修正ctrp.auc.knn <- ctrp.auc.knn/ceiling(max(ctrp.auc.knn)) # 参考Expression Levels of Therapeutic Targets as Indicators of Sensitivity to Targeted Therapeutics (2019, Molecular Cancer Therapeutics)prism.auc.knn <- prism.auc.knn/ceiling(max(prism.auc.knn))# 加载CCLE细胞系的表达谱,作为训练集ccl.expr <- read.table("CCLE_RNAseq_rsem_genes_tpm_20180929.txt",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T)
  
# 加载基因注释文件,用于基因ID转换Ginfo <- read.table("overlapTable27.txt",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T)
# 把基因的ensembl ID转换为gene symbolccl.expr <- ccl.expr[,-1]; rownames(ccl.expr) <- sapply(strsplit(rownames(ccl.expr),".",fixed = T),"[",1)comgene <- intersect(rownames(ccl.expr),rownames(Ginfo))ccl.expr <- ccl.expr[comgene,]ccl.expr$gene <- Ginfo[comgene,"genename"]; ccl.expr <- ccl.expr[!duplicated(ccl.expr$gene),]; rownames(ccl.expr) <- ccl.expr$gene; ccl.expr <- ccl.expr[,-ncol(ccl.expr)]keepgene <- apply(ccl.expr, 1, mad) > 0.5 # 保留表达值有效的基因trainExpr <- log2(ccl.expr[keepgene,] + 1)colnames(trainExpr) <- sapply(strsplit(colnames(trainExpr),"_",fixed = T),"[",1) # 重置细胞系名trainPtype <- as.data.frame(ctrp.auc.knn)ccl.name <- ccl.miss <- c() # 替换细胞系名for (i in rownames(trainPtype)) { if(!is.element(gsub("CCL","",i),ctrp.ccl.anno$master_ccl_id)) { cat(i,"\n") ccl.miss <- c(ccl.miss, i) # 没有匹配到的细胞系 ccl.name <- c(ccl.name, i) # 插入未匹配的细胞系 } else { ccl.name <- c(ccl.name, ctrp.ccl.anno[which(ctrp.ccl.anno$master_ccl_id == gsub("CCL","",i)),"ccl_name"]) # 插入匹配的细胞系 }}cpd.name <- cpd.miss <- c() # 替换药物名for (i in colnames(trainPtype)) { if(!is.element(i,ctrp.cpd.anno$master_cpd_id)) { cat(i,"\n") cpd.miss <- c(cpd.miss, i) # 没有匹配到的药物 cpd.name <- c(cpd.name, i) # 插入未匹配的药物 } else { cpd.name <- c(cpd.name, ctrp.cpd.anno[which(ctrp.cpd.anno$master_cpd_id == i),"cpd_name"]) # 插入匹配的药物 }}rownames(trainPtype) <- ccl.nametrainPtype <- trainPtype[setdiff(rownames(trainPtype),ccl.miss),] # 去除未匹配的细胞系colnames(trainPtype) <- cpd.nametrainPtype <- trainPtype[,setdiff(colnames(trainPtype),cpd.miss)] # 去除未匹配的药物comccl <- intersect(rownames(trainPtype),colnames(trainExpr)) # 提取有表达且有药敏的细胞系trainExpr <- trainExpr[,comccl]trainPtype <- trainPtype[comccl,]# 测试集keepgene <- apply(pure.tumoexpr, 1, mad) > 0.5 # 纯化的测试集取表达稳定的基因 testExpr <- log2(pure.tumoexpr[keepgene,] + 1) # 表达谱对数化# 取训练集和测试集共有的基因comgene <- intersect(rownames(trainExpr),rownames(testExpr))trainExpr <- as.matrix(trainExpr[comgene,])testExpr <- testExpr[comgene,]


线上课程教学

课题设计、定制生信分析

云服务器租赁

加微信备注99领取使用

3.预测药物敏感性
outTab <- NULL# 循环很慢,请耐心for (i in 1:ncol(trainPtype)) {  d <- colnames(trainPtype)[i]  tmp <- log2(as.vector(trainPtype[,d]) + 0.00001) # 由于CTRP的AUC可能有0值,因此加一个较小的数值防止报错  # 岭回归预测药物敏感性  ptypeOut <- quiet(calcPhenotype(trainingExprData = trainExpr,                                  trainingPtype = tmp,                                  testExprData = testExpr,                                  powerTransformPhenotype = F,                                  selection = 1))  ptypeOut <- 2^ptypeOut - 0.00001 # 反对数  outTab <- rbind.data.frame(outTab,ptypeOut)}dimnames(outTab) <- list(colnames(trainPtype),colnames(testExpr))prism.pred.auc <- outTabtop.pps <- Sinfo[Sinfo$PPS >= quantile(Sinfo$PPS,probs = seq(0,1,0.1))[10],] # 定义上十分位的样本bot.pps <- Sinfo[Sinfo$PPS <= quantile(Sinfo$PPS,probs = seq(0,1,0.1))[2],] # 定义下十分位的样本ctrp.log2fc <- c()for (i in 1:nrow(ctrp.pred.auc)) {  d <- rownames(ctrp.pred.auc)[i]  a <- mean(as.numeric(ctrp.pred.auc[d,rownames(top.pps)])) # 上十分位数的AUC均值      b <- mean(as.numeric(ctrp.pred.auc[d,rownames(bot.pps)])) # 下十分位数的AUC均值  fc <- b/a  log2fc <- log2(fc); names(log2fc) <- d  ctrp.log2fc <- c(ctrp.log2fc,log2fc)}candidate.ctrp <- ctrp.log2fc[ctrp.log2fc > 0.2] # 这里我调整了阈值,控制结果数目prism.log2fc <- c()for (i in 1:nrow(prism.pred.auc)) {  display.progress(index = i,totalN = nrow(prism.pred.auc))  d <- rownames(prism.pred.auc)[i]  a <- mean(as.numeric(prism.pred.auc[d,rownames(top.pps)])) # 上十分位数的AUC均值  b <- mean(as.numeric(prism.pred.auc[d,rownames(bot.pps)])) # 下十分位数的AUC均值  fc <- b/a  log2fc <- log2(fc); names(log2fc) <- d  prism.log2fc <- c(prism.log2fc,log2fc)}candidate.prism <- prism.log2fc[prism.log2fc > 0.2] # 这里我调整了阈值,控制结果数目ctrp.cor <- ctrp.cor.p <- c()for (i in 1:nrow(ctrp.pred.auc)) {  d <- rownames(ctrp.pred.auc)[i]  a <- as.numeric(ctrp.pred.auc[d,rownames(Sinfo)])  b <- as.numeric(Sinfo$PPS)  r <- cor.test(a,b,method = "spearman")$estimate; names(r) <- d  p <- cor.test(a,b,method = "spearman")$p.value; names(p) <- d  ctrp.cor <- c(ctrp.cor,r)  ctrp.cor.p <- c(ctrp.cor.p,p)}candidate.ctrp2 <- ctrp.cor[ctrp.cor < -0.4]  # 这里我调整了阈值,控制结果数目    ctrp.candidate <- intersect(names(candidate.ctrp),names(candidate.ctrp2))prism.cor <- prism.cor.p <- c()for (i in 1:nrow(prism.pred.auc)) {  d <- rownames(prism.pred.auc)[i]  a <- as.numeric(prism.pred.auc[d,rownames(Sinfo)])  b <- as.numeric(Sinfo$PPS)  r <- cor.test(a,b,method = "spearman")$estimate; names(r) <- d  p <- cor.test(a,b,method = "spearman")$p.value; names(p) <- d  prism.cor <- c(prism.cor,r)  prism.cor.p <- c(prism.cor.p,p)}candidate.prism2 <- prism.cor[prism.cor < -0.35] prism.candidate <- intersect(names(candidate.prism),names(candidate.prism2))
4.绘制棒棒糖图和显著差异分析箱线图
#绘制相关性棒棒糖图# 设置颜色darkblue <- "#0772B9"lightblue <- "#48C8EF"cor.data <- data.frame(drug = ctrp.candidate, r = ctrp.cor[ctrp.candidate], p = -log10(ctrp.cor.p[ctrp.candidate]))p1 <- ggplot(data = cor.data,aes(r,forcats::fct_reorder(drug,r,.desc = T))) + geom_segment(aes(xend=0,yend=drug),linetype = 2) + geom_point(aes(size=p),col = darkblue) + scale_size_continuous(range =c(2,8)) + scale_x_reverse(breaks = c(0, -0.3, -0.5), expand = expansion(mult = c(0.01,.1))) + #左右留空 theme_classic() + labs(x = "Correlation coefficient", y = "", size = bquote("-log"[10]~"("~italic(P)~"-value)")) + theme(legend.position = "bottom", axis.line.y = element_blank())cor.data <- data.frame(drug = prism.candidate, r = prism.cor[prism.candidate], p = -log10(prism.cor.p[prism.candidate]))cor.data$drug <- sapply(strsplit(cor.data$drug," (",fixed = T), "[",1)p2 <- ggplot(data = cor.data,aes(r,forcats::fct_reorder(drug,r,.desc = T))) + geom_segment(aes(xend=0,yend=drug),linetype = 2) + geom_point(aes(size=p),col = darkblue) + scale_size_continuous(range =c(2,8)) + scale_x_reverse(breaks = c(0, -0.3, -0.5), expand = expansion(mult = c(0.01,.1))) + #左右留空 theme_classic() + labs(x = "Correlation coefficient", y = "", size = bquote("-log"[10]~"("~italic(P)~"-value)")) + theme(legend.position = "bottom", axis.line.y = element_blank())ctrp.boxdata <- NULLfor (d in ctrp.candidate) { a <- as.numeric(ctrp.pred.auc[d,rownames(top.pps)]) b <- as.numeric(ctrp.pred.auc[d,rownames(bot.pps)]) p <- wilcox.test(a,b)$p.value s <- as.character(cut(p,c(0,0.001,0.01,0.05,1),labels = c("***","**","*",""))) ctrp.boxdata <- rbind.data.frame(ctrp.boxdata, data.frame(drug = d, auc = c(a,b), p = p, s = s, group = rep(c("High PPS","Low PPS"),c(nrow(top.pps),nrow(bot.pps))), stringsAsFactors = F), stringsAsFactors = F)}#绘制显著差异箱线图p3 <- ggplot(ctrp.boxdata, aes(drug, auc, fill=group)) + geom_boxplot(aes(col = group),outlier.shape = NA) + geom_text(aes(drug, y=max(auc)), label=ctrp.boxdata$s, data=ctrp.boxdata, inherit.aes=F) + scale_fill_manual(values = c(darkblue, lightblue)) + scale_color_manual(values = c(darkblue, lightblue)) + xlab(NULL) + ylab("Estimated AUC value") + theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 0.5,vjust = 0.5,size = 10), legend.position = "bottom", legend.title = element_blank())dat <- ggplot_build(p3)$data[[1]]p3 <- p3 + geom_segment(data=dat, aes(x=xmin, xend=xmax, y=middle, yend=middle), color="white", inherit.aes = F) prism.boxdata <- NULLfor (d in prism.candidate) { a <- as.numeric(prism.pred.auc[d,rownames(top.pps)]) b <- as.numeric(prism.pred.auc[d,rownames(bot.pps)]) p <- wilcox.test(a,b)$p.value s <- as.character(cut(p,c(0,0.001,0.01,0.05,1),labels = c("***","**","*",""))) prism.boxdata <- rbind.data.frame(prism.boxdata, data.frame(drug = d, auc = c(a,b), p = p, s = s, group = rep(c("High PPS","Low PPS"),c(nrow(top.pps),nrow(bot.pps))), stringsAsFactors = F), stringsAsFactors = F)}prism.boxdata$drug <- sapply(strsplit(prism.boxdata$drug," (",fixed = T), "[",1)p4 <- ggplot(prism.boxdata, aes(drug, auc, fill=group)) + geom_boxplot(aes(col = group),outlier.shape = NA) + geom_text(aes(drug, y=max(auc)), label=prism.boxdata$s, data=prism.boxdata, inherit.aes=F) + scale_fill_manual(values = c(darkblue, lightblue)) + scale_color_manual(values = c(darkblue, lightblue)) + xlab(NULL) + ylab("Estimated AUC value") + theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 0.5,vjust = 0.5,size = 10), legend.position = "bottom", legend.title = element_blank())dat <- ggplot_build(p4)$data[[1]]p4 <- p4 + geom_segment(data=dat, aes(x=xmin, xend=xmax, y=middle, yend=middle), color="white", inherit.aes = F)#进行拼图plot_grid(p1, p3, p2, p4, labels=c("A", "", "B", ""), ncol=2, rel_widths = c(2, 2)) #左右两列的宽度比例#保存图片 ggsave(filename = "drug target.pdf",width = 8,height = 8)
5.结果展示
5.1drug target.pdf
最终果叔利用pRRophetic包成功的预测了肿瘤样本药物敏感性,这里利用了两个药物敏感性数据库PRISM和CTRP进行预测,结果更加全面,肿瘤药物敏感性和肿瘤药物预测相关分析内容,比如GDSC细胞系药物敏感性分析(http://www.biocloudservice.com/757/757.php),cmap方法进行药物预测(http://www.biocloudservice.com/787/787.php)等都可以用本公司新开发的零代码云平台生信分析小工具,一键完成该分析奥,感兴趣的小伙伴欢迎来尝试奥,文末阅读原文查看,网址:http://www.biocloudservice.com/home.html。欢迎大家和果叔一起讨论学习呀!
果叔还提供思路设计、定制生信分析、文献思路复现;有需要的小伙伴欢迎直接扫码咨询果叔,竭诚为您的科研助力!

定制生信分析

服务器租赁

扫码咨询果叔


往期回顾

01

UKB数据库真的牛!3天接受,10天发表!免费新数据绝佳发文时期,拼的就是手速!仅2张图就能拿下IF:13.4分?!

02

NC优质平替!飞升1区Top,超10分的综合性毕业神刊!性价比超高,国人友好,Case Report也收!这波安全上车!

03

不做实验照样发Nature Communications!借诺奖东风“机器学习”+多组学分析,打造创新思路,每一步都踩在点子上!

04

IF=58.7,这泼天的多组学富贵可得接住!系统生物学研究团队开挂思路,机器学习助力个性化医疗,你就学吧,一看一个不吱声!



生信果
生信入门、R语言、生信图解读与绘制、软件操作、代码复现、生信硬核知识技能、服务器等
 最新文章