学习笔记总结于『生信技能树』马拉松课程
本文学习复现《Oxidative Stress Response Biomarkers of Ovarian Cancer Based on Single-Cell and Bulk RNA Sequencing》(基于单细胞和Bulk转录组的卵巢癌氧化应激反应生物标志物)一文中的图,其中Oxidative Stress Response缩写为ROS。文章包含了芯片、转录组、单细胞等技术,适合用来复现及学习
九、药敏预测、免疫治疗
0.前情提要
关于药敏预测
预测哪些药物会对疾病表现好的治疗效果,通过IC50来看。IC50,半数抑制浓度,指在体外试验中在特定暴露时间后,药物抑制50%(细胞/靶点/特定蛋白等)所需的药物浓度
关于免疫治疗
TCGA数据没有提供免疫治疗结果,所以需要外部的数据来验证高/低风险组之间的免疫治疗结果是否有差别。根据免疫治疗的有效性,分为完全缓解(CR)、部分缓解(PR)、疾病稳定(SD)和疾病进展(PD)
CR和PR是好的结果,SD和PD是坏的结果,分别放在一起。不过图101(右)应该按照CR、PR、SD、PD的顺序来放
1.药物敏感性预测
文章使用了一个2014年的R包来计算IC50值,我们使用同一个作者但比较新的R包
1.1载入数据并完成预测
补充一点背景知识:oncoPredict
是根据基因表达量来预测药物敏感性的R包。也就是说它可以根据你的样本基因表达量来告诉你每个药物的IC50值,这个值越低就说明药物越管用
提到药物预测,还有一个pRRophetic
包,建议不用看了,因为oncoPredict
是它的plus版本;还有一个cellMiner包,之前写过,可以翻翻看
#### 1.载入数据
# 代码参考自:https://mp.weixin.qq.com/s/QRaTd-fIsqq6sPsLmOPvIw,一些背景知识也可以补充下.
# 数据来源于:https://osf.io/c6tfx/ 在Training Data文件夹下存放着R包作者准备好的数据,用作药物预测的训练集
# install.packages("oncoPredict")
# BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
rm(list = ls())
library(oncoPredict)
library(data.table)
library(gtools)
library(reshape2)
library(ggpubr)
dir='./DataFiles/DataFiles/Training Data/'
dir(dir)
# 可以看到其中包括了Cancer Therapeutics Response Portal (CTRP)和Genomics of Drug Sensitivity in Cancer (GDSC),相当于两个训练集
# 两个数据库的数据,都是提供了基因表达矩阵和药物IC50表格
# rds格式的数据,需要用readRDS()函数读取
exp = readRDS(file=file.path(dir,'GDSC2_Expr (RMA Normalized and Log Transformed).rds'))
exp[1:4,1:4]
dim(exp) # 17419个基因 805个样本
range(exp) # 2.094251 13.929729 经过了log,不过不用逆log
drug = readRDS(file = file.path(dir,"GDSC2_Res.rds"))
drug <- exp(drug) #下载到的数据是被log转换过的,用这句代码逆转回去
drug[1:4,1:4]# 行是样本,列是药物,数字是药物在这个样本中的IC50
dim(drug)
identical(rownames(drug),colnames(exp))
# drug是药物IC50值,exp是对应细胞系基因的表达矩阵。可以看到二者的样本名称是对应的。
#### 2.拿自己的数据来完成预测
load("../6.model/TCGA-OV_sur_model.Rdata")
test = exprSet#我们自己的表达矩阵
# 运行时间很长,大概2h,所以if(F)注释掉
if(F){
calcPhenotype(trainingExprData = exp,
trainingPtype = drug,
testExprData = test,
batchCorrect = 'standardize', # "eb" for array,standardize for rnaseq
powerTransformPhenotype = TRUE,
removeLowVaryingGenes = 0.2,
minNumSamples = 10,
printOutput = TRUE,
removeLowVaringGenesFrom = 'rawData' )
}
# R包Vignette里关于batchCorrect参数的说明
# batchCorrect options: "eb" for ComBat, "qn" for quantiles normalization, "standardize", or "none"
# "eb" is good to use when you use microarray training data to build models on microarray testing data.
# "standardize is good to use when you use microarray training data to build models on RNA-seq testing data (this is what Paul used in the 2017 IDWAS paper that used GDSC microarray to impute in TCGA RNA-Seq data, see methods section of that paper for rationale)
# R包Vignette里关于removeLowVaringGenesFrom参数的说明
#Determine method to remove low varying genes.
#Options are 'homogenizeData' and 'rawData'
#homogenizeData is likely better if there is ComBat batch correction, raw data was used in the 2017 IDWAS paper that used GDSC microarray to impute in TCGA RNA-Seq data.
# 也就是说,芯片数据就用上面代码里的参数,转录组数据的话,就将batchCorrect改为standardize
# removeLowVaringGenesFrom,作者说的也模糊,就随便了
#### 3.看看结果
# 这是运行之后的结果,被存在固定文件夹calcPhenotype_Output下。文件名也是固定的DrugPredictions.csv。因此一个工作目录只能计算一个数据,不要混着用了
testPtype <- read.csv('./calcPhenotype_Output/DrugPredictions.csv', row.names = 1,check.names = F)
testPtype[1:4, 1:4]
dim(testPtype)
identical(colnames(testPtype),colnames(drug))
# IC50越小,药效越好
# 198种药物IC50的预测结果就在这个表格里了
1.2结合高低风险画图
load("../6.model/rsurv.Rdata")
identical(rownames(testPtype),rownames(rsurv))
a = apply(testPtype, 2, function(x){
#x = testPtype[,1],每个药物的IC50值
data.frame(p = wilcox.test(x~rsurv$group)$p.value, # wilcox.test组间非参数检验,和高低风险的计算,计算p值和方差
i = var(x)
)
})
a = do.call(rbind,a)
k1 = a$p<0.01;table(k1)
k2 = a$i>25;table(k2)
table(k1&k2)
dg = rownames(a)[k1&k2]
library(tinyarray)
draw_boxplot(t(testPtype[,dg]),rsurv$group)
# 比较希望能看到高低风险组的差别
1.3相关性热图
load("../6.model/lassogene.Rdata")
nn = names(head(sort(apply(testPtype, 2, sum)),30))
testPtype = testPtype[,nn]
nc = cbind(testPtype,t(exprSet[lassoGene,])) %>% as.matrix()
library(Hmisc)
m = rcorr(nc)$r[1:ncol(testPtype),(ncol(nc)-length(lassoGene)+1):ncol(nc)]
p = rcorr(nc)$P[1:ncol(testPtype),(ncol(nc)-length(lassoGene)+1):ncol(nc)]
p[1:4,1:4]
library(dplyr)
tmp = matrix(case_when(as.logical(p<0.01)~"**",
as.logical(p<0.05)~"*",
T~""),nrow = nrow(p))
library(pheatmap)
pheatmap(t(m),
display_numbers =t(tmp),
angle_col =45,
color = colorRampPalette(c("#2fa1dd", "white", "#f87669"))(100),
border_color = "white",
width = 7,
height=9.1,
treeheight_col = 0,
treeheight_row = 0)
2.免疫治疗
### 1.输入数据
# 细胞丰度矩阵;risk结果;表达矩阵
rm(list = ls())
load("../6.model/rsurv.Rdata")#临床信息、分组
### 2.免疫数据验证
# IMvigor210CoreBiologies这个包比较过时了,更新时间停留在2018年,很难安装。喜欢挑战可以点进去 http://research-pub.gene.com/IMvigor210CoreBiologies/ 这个网页下载本地安装包,折腾一下,不喜欢就直接用存好的数据
# 这个时间点也是R语言版本从3.4切换到3.5的时候,那次换了bioconductor安装方式,导致这个网页中的安装方式早已不能用了
library(BiocGenerics)
library(Biobase)
f_cds = "cds.Rdata"
if(!file.exists(f_cds)){
library(IMvigor210CoreBiologies)
data(cds)
counts = counts(cds) #表达矩阵
an = fData(cds) #feature,基因的信息
pd = pData(cds) #列的信息
save(counts,an,pd,file = "cds.Rdata")
}
load(f_cds)
# 3.表达矩阵和临床信息表格的一系列整理
counts = counts[,match(rownames(pd),colnames(counts))]
an = an[!duplicated(rownames(an)),]
counts = counts[!duplicated(rownames(counts)),]
g = intersect(an$entrez_id,rownames(counts))
an = an[g,]
counts = counts[g,]
k = (!duplicated(an$symbol))&(!is.na(an$symbol));table(k)
counts = counts[k,]
an = an[k,]
rownames(counts) = an$symbol
meta = pd[,c("os","censOS","Best Confirmed Overall Response")]
colnames(meta)[3] = "Response"
meta$Response[meta$Response=="NE"]=NA
str(meta)
colnames(meta)[1:2] = c("time","event")
exp = log2(edgeR::cpm(counts)+1)
load("../6.model/lassogene.Rdata")
exp = exp[lassoGene,]
identical(rownames(meta),colnames(exp))
head(meta)#行是基因,列是样本
library(survminer)
lassoGene
load("../6.model/lasso_model.Rdata")
library(glmnet)
library(survival)
coef = coef(fit,s = cvfit$lambda.min)
dat = data.frame(gene = rownames(coef),
coefficient = as.numeric(coef[,1]))
head(dat)
dat = dat[dat$coefficient!=0,]
identical(dat$gene,colnames(exp))
dat = dat[match(rownames(exp),dat$gene),]
#meta$riskscore = as.numeric(predict(cvfit,newx = t(exp),s = cvfit$lambda.min)) predict有时候会莫名奇妙报错,所以不用
meta$riskscore = apply(exp,2,function(x){sum(dat$coefficient*x)})
res.cut = surv_cutpoint(meta, time = "time", event = "event", variables = "riskscore")
cut = res.cut[["cutpoint"]][1, 1] #最佳截断值
cut
ri = ifelse(meta$riskscore<cut,"lowrisk","highrisk")
names(ri) = rownames(meta)
ri = factor(ri,levels = c("lowrisk","highrisk"))
table(ri)
meta$ri = ri # 至此,数据框有time和event用于做生存分析,response是免疫治疗结果(有NA无所谓),riskscore和ri是高低风险分组
箱线图 & 条形图
library(ggpubr)
dat1 = na.omit(meta)
ggboxplot(data= dat1,x = "Response",y = "riskscore",color = "Response",add = "jitter")+
stat_compare_means(comparisons = list(c("CR","PR"),
c("CR","SD"),
c("CR","PD"),
c("PR","SD"),
c("PR","PD"),
c("SD","PD")))
library(dplyr)
meta$Response2 = ifelse(meta$Response %in% c("SD","PD"),"SD/PD","CR/PR")
dat = count(meta,ri,Response2)
dat = dat %>% group_by(ri) %>%
summarise(Response = Response2,n = n/sum(n))
dat$Response = factor(dat$Response,levels = c("SD/PD","CR/PR"))
library(ggplot2)
ggplot(data = dat)+
geom_bar(aes(x = ri,y = n,
fill = Response),
stat = "identity")+
scale_fill_manual(values = c("#f87669","#2fa1dd"))+
geom_text(aes(x = ri,y = n,
label = scales::percent(n)),
color = "white",
size = 6,
position = position_fill(vjust = 0.5))+
theme_minimal()+
theme(legend.position = "top")
至此,该篇文献的复现学习结束
谢谢观看