miRNA芯片数据的差异分析与mRNA数据的差异分析是相类似的,同时在既往的推文里我们也已经做了高通量测序数据的差异分析,后续我们会比较一下两者代码的区别,并且尝试解释异常火山图的可能原因。
随意挑选了一个GEO数据集 :GSE246130
分析流程:
1、导入
rm(list = ls())
options(timeout = 10000000)
options(scipen = 20)#不要以科学计数法表示
library(tinyarray)
proj <- "GSE246130"
2、芯片数据下载
geo = geo_download(proj,colon_remove = T)
#boxplot(geo$exp[,1:10])
exp = geo$exp
pd = geo$pd
gpl_number = geo$gpl
head(exp)
# GSM7856872 GSM7856873 GSM7856874 GSM7856875 GSM7856876 GSM7856877
# hsa-let-7a-3p 4.4191700 4.3974220 4.5855060 4.1689777 4.917222 5.098484
# hsa-let-7a-5p 13.8694340 14.4513580 14.5720320 12.4815650 14.572032 14.572032
# hsa-let-7b-3p 4.8332195 5.2882895 5.5354414 1.7899636 4.266596 4.318655
# hsa-let-7b-5p 14.5720320 14.2305620 13.8694340 14.5720320 14.230562 13.869434
# hsa-let-7c-3p 0.3637888 0.3669181 0.4047206 0.4204817 1.205251 1.094209
# hsa-let-7c-5p 12.9547150 12.4336980 12.4336980 13.0124380 12.296271 12.433698
这个数据集不需要再取log了
3、临床分组设置
library(stringr)
head(pd)[1:4,1:4]
# title description.1 source_name_ch1
# GSM7856872 ExVivoCell_72Hr_0μM_miRNA_rep1 [N1](normalized) L-02 cells,72hr, 0μM
# GSM7856873 ExVivoCell_72Hr_0μM_miRNA_rep2 [N2](normalized) L-02 cells,72hr, 0μM
# GSM7856874 ExVivoCell_72Hr_0μM_miRNA_rep3 [N3](normalized) L-02 cells,72hr, 0μM
# GSM7856875 ExVivoCell_72Hr_20μM_miRNA_rep1 [cd1](normalized) L-02 cells,72hr, 20μM
# description
# GSM7856872 Gene expression in L-02 cells were cultured normally after 72h
# GSM7856873 Gene expression in L-02 cells were cultured normally after 72h
# GSM7856874 Gene expression in L-02 cells were cultured normally after 72h
# GSM7856875 Gene expression after 72 h of exposure to 20 μM of CdCl2 in L-02 cell
# 使用字符串处理的函数获取分组
k = str_detect(pd$treatment,"normally");table(k)
Group = ifelse(k,"Normal","Treatment")
Group = factor(Group,levels = c("Normal","Treatment"))
Group
4、保存数据
save(pd,exp,gpl_number,Group,proj,file = "step1output.Rdata")
5、加载数据
rm(list = ls())
load(file = "step1output.Rdata")
6、芯片数据使用limma差异分析-差异基因火山图
design = model.matrix(~Group):构建设计矩阵,表示实验组和对照组的分组情况。 fit = lmFit(exp,design):使用 limma 包中的 lmFit 函数,对阵列数据 exp 进行线性模型拟合。 fit = eBayes(fit):使用经验贝叶斯方法(eBayes)对模型拟合结果进行统计增强,提高统计检验的稳定性。 deg = topTable(fit, coef = 2, number = Inf):提取差异表达结果,coef=2 指定对第二个系数(通常是实验组和对照组的对比)进行分析,返回所有结果。
library(dplyr)
library(limma)
design = model.matrix(~Group)
fit = lmFit(exp,design)
fit = eBayes(fit)
deg = topTable(fit,coef = 2,number = Inf)
logFC_t = 1
p_t = 0.05
k1 = (deg$P.Value < p_t)&(deg$logFC < -logFC_t)
k2 = (deg$P.Value < p_t)&(deg$logFC > logFC_t)
deg = mutate(deg,change = ifelse(k1,"down",ifelse(k2,"up","stable")))
table(deg$change)
#火山图
library(ggplot2)
ggplot(data = deg, aes(x = logFC, y = -log10(P.Value))) +
geom_point(alpha=0.4, size=3.5, aes(color=change)) +
scale_color_manual(values=c("blue", "grey","red"))+
geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",linewidth=0.8) +
geom_hline(yintercept = -log10(p_t),lty=4,col="black",linewidth=0.8) +
theme_bw()
可以看到这个火山图的logFC值也是非常正常的,没有很诡异。
完成miRNA的高通量和芯片下游分析再学习之后,结合曾老师近期的一个推文:每个生信小白都应该避坑的小细节!我们来思考一下为什么会出现这么诡异的火山图。
出现这种情况的可能原因有很多,笔者认为其中一个最有可能的就是芯片和高通量测序代码出现了混用。那么笔者就尝试做一个类似的试一试。
使用miRNA分析流程学习(二)推文中的高通量测序数据集,并采用limma包的芯片流程进行差异分析,核心代码如下
library(limma)
library(dplyr)
# limma-array
design = model.matrix(~Group)
fit = lmFit(exp,design)
fit = eBayes(fit)
deg = topTable(fit,coef = 2,number = Inf)
# limma-RNA-seq
#dge <- edgeR::DGEList(counts=exp)
#dge <- edgeR::calcNormFactors(dge)
#design <- model.matrix(~Group)
#v <- voom(dge,design, normalize="quantile")
#fit <- lmFit(v, design)
#fit= eBayes(fit)
#deg = topTable(fit, coef=2, n=Inf)
#deg = na.omit(deg)
# 加change列,标记上下调基因
logFC_t = 1
p_t = 0.05
k1 = (deg$P.Value < p_t)&(deg$logFC < -logFC_t)
k2 = (deg$P.Value < p_t)&(deg$logFC > logFC_t)
deg = mutate(deg,change = ifelse(k1,"down",ifelse(k2,"up","stable")))
table(deg$change)
write.csv(deg,"degs.csv")
library(ggplot2)
ggplot(data = deg, aes(x = logFC, y = -log10(P.Value))) +
geom_point(alpha=0.4, size=3.5, aes(color=change)) +
scale_color_manual(values=c("blue", "grey","red"))+
geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",linewidth=0.8) +
geom_hline(yintercept = -log10(p_t),lty=4,col="black",linewidth=0.8) +
theme_bw()
这不就出现了诡异的火山图。这是因为Microarray的数据已标准化,可以直接用lmFit拟合,而RNA-seq需先采用calcNormFactors矫正测序深度的差异,并用voom去归一化转换数据,以适应线性模型。
参考资料:
生信技能树时间线:https://mp.weixin.qq.com/mp/appmsgalbum?action=getalbum&__biz=MzAxMDkxODM1Ng==&scene=24&album_id=2201138830328528899&count=3&uin=&key=&devicetype=iMac+Mac14%2C7+OSX+OSX+14.6.1+build(23G93)&version=13080810&lang=zh_CN&nettype=WIFI&ascene=0&fontScale=100 生信技能树B站视频:https://www.bilibili.com/video/BV1zK411n7qr/?vd_source=3a13860df939bc922ad1fd6099e42c1d 生信技能树/小洁老师:https://mp.weixin.qq.com/s/zTN32UA6Nu6PMmqXYTOnnQ 生信星球:https://mp.weixin.qq.com/s/LMacx8lAOOGvR60YO7sYzQ
既往推文:
miRNA分析流程学习(二)/TCGAmiRNA数据三大R包整合差异分析再学习
miRNA分析流程学习(三)/miRNA靶基因预测-ENCORI数据库数据下载
致谢:感谢曾老师/小洁老师以及生信技能树团队全体成员。
注:若对内容有疑惑或者有发现明确错误的朋友,请联系后台(欢迎交流)。更多内容可关注公众号:生信方舟
- END -