非靶向代谢组当前最佳性能限制排序OPLS-DA分析及其代谢特征挖掘(数据工程)

学术   2024-09-30 17:39   江苏  

本节内容属于思农数据工程产物,欢迎大家批评指正。

library(dplyr)library(phyloseq)library(ggClusterNet)library(ggplot2)library(pacman)p_load(ropls)#有则library,无则install并librarylibrary(ggsci)library(reshape2)
rm(list = ls())#实际使用优先加载工作目录 setwd(“ ”)#确定文件和图片输出路径outpath <- "C:/result"oplspath = paste(outpath,"/OPLS-DA分析/",sep ="")dir.create(oplspath)#导入包自带数据data(sacurine)#提取代谢物组成和样品信息dat<- sacurine$dataMatrixgroup <- sacurine$sampleMetadata#定义oplsda分析组别group1 <- "Gender"#oplsda分析,orthoI = NA表示执行oplsda,特征向量数量由模型效果确认,默认为0表示plsda分析;#建立模型过程中,当报错显示第一预测主成分不显著时,需手动指定正交特征向量数量,最大为9;oplsda = opls(dat,group$gender, predI = 1,orthoI = NA,permI = 200,crossvalI = 12)#permI常用200,crossvalI代表k着验证法中的k,根据样本数量确定

模型运行结果如上图所示。R2X(cum) 代表模型对X变量的累计解释度,其值为0.275;R2Y(cum)模型对Y变量的累计解释度,其值为0.73;Q2(cum) 代表模型的可预测性,其值为0.617;RMSEE 代表 根均方误差估计(Root Mean Square Error of Estimate),其值为0.262,它是衡量模型预测值与实际值之间差异的指标,RMSEE 的值越小,表示模型的预测效果越好;pre表示模型预测主成分个数,1;ort代表模型正交成分个数,2;pR2Y,pQ2分别代表其显著性检验结果,0.005。

#提取模型解释度oplsda.eig <- data.frame(oplsda@modelDF)#导出数据file <- paste(oplspath,group1,"_model_statistic.csv",sep="")write.csv(oplsda.eig,file,quote =T)

#提取检验数据permutation_data <- data.frame(oplsda@suppLs[["permMN"]])#检验数据会有201行,第一行为模型结果,剩下的200次为随机检验结果#添加行名,使其看起来更加直观一些permutation_name <- paste("permutation",1:200,sep = "")rownames(permutation_data) <- c("model",permutation_name)#导出数据file <- paste(oplspath,group1,"_model_permutation.csv",sep="")write.csv(permutation_data,file,quote =T)#列名含义同模型含义解释,sim代表随机置换的Y变量与原模型Y变量的相似度,即置换保留度,1代表与原模型保持一致

#提取模型中样本得分sample.score = oplsda@scoreMN %>%  #得分矩阵  as.data.frame() %>%  mutate(Group = group$gender,         o1=oplsda@orthoScoreMN[,"o1"],         o2=oplsda@orthoScoreMN[,"o2"]) #正交矩阵#oplsda@scoreMN代表样本预测主成分得分;oplsda@orthoScoreMN代表样本正交成分得分

head(sample.score)#查看file <- paste(oplspath,group1,"_sample_score.csv",sep="")write.csv(sample.score,file,quote =T)

#变量重要性,VIP#VIP 值帮助寻找重要的代谢物,添加orthol为模型vip值,不添加为第一主成分即组间差异vip值ovip <- data.frame( VIP_o1 = getVipVn(oplsda,orthoL = TRUE), #正交模型的变量重要性值。                    VIP_p1 = getVipVn(oplsda,orthoL = FALSE)) #预测成分的变量的重要性。#导出数据file <- paste(oplspath,group1,"_VIP_score.csv",sep="")write.csv(ovip,file,quote =T)

#根据vip筛选差异微生物ovip_select <- ovip %>%  filter(VIP_p1 >= 1) %>% # 以VIP>=1为阈值进行筛选。  arrange(VIP_p1) %>% #根据VIP值升序排序 ,使可视化时y轴从上到下递减 ,ggplot2中非连续变量默认y轴呈现顺序为从下到上,即第一行的样品在y轴最下方,最后一行的样品在y轴最上方  mutate(feature = factor(rownames(.),levels = rownames(.))) ovip_select %>% head()dim(ovip_select)#vip大于1的物质个数#top30,选择可视化展示数量,20即为nrow(ovip_select)-19,数量较少时可选择全部展示vip_plot <- ovip_select[(nrow(ovip_select)-29):nrow(ovip_select),]#导出数据file <- paste(oplspath,group1,"_VIP_difference_top30.csv",sep="")write.csv(vip_plot,file,quote =T)
#将检验数据转换为长数据框,便于可视化,借助于reshape2包中melt函数permutation_plot <- melt(permutation_data[,c("R2Y.cum.","Q2.cum.","sim")],id="sim")

#可视化#得分图可视化,逐行解释p1 <- ggplot(sample.score, aes(p1, o1, color = Group)) +#指定绘图对象,x轴,y轴和颜色分类  geom_hline(yintercept = 0, linetype = 'dashed', size = 0.5) + #添加横向等值线,y=0,线型为虚线,线的尺寸为0.5即粗细,值越大线越粗  geom_vline(xintercept = 0, linetype = 'dashed', size = 0.5) + #添加纵向等值线,x=0,线型为虚线,线的尺寸为0.5即粗细,值越大线越粗  geom_point(alpha = 0.6, size = 2)+#添加散点,透明度为0.6,大小为2,默认为实心圆,修改形状可通过shape参数  xlab(paste("T score[1] (",round(oplsda.eig["p1",1]*100,2), "%)",sep=""))+  #指定x轴标题内容——预测主成分加其组间解释度百分比  ylab(paste("Orthogonal T score[1] (",round(oplsda.eig["o1",1]*100,2), "%)",sep=""))+##指定y轴标题内容——正交成分轴1加其组内解释度百分比  ggtitle(paste(group1,"                                     OPLS-DA Scores Plot",sep = ""))+##指定标题,空格是为其刚好与图形尺寸匹配  stat_ellipse(level = 0.95, linetype = 'solid',                size = 1,alpha=0.7) + #添加置信椭圆,置信水平0.95,实线,尺寸为1,透明度为0.7,默认不对椭圆进行填充,可通过fill参数设置  scale_color_manual(values=c("F"="#E41A1C","M"="#377EB8"))+  #这里指定类别颜色,需根据实际情况将标签调整为自己的组别标签,例如,组别为A,B,则调整为values=c("A"="#E41A1C","B"="#377EB8")  theme_bw() +  theme(panel.border=element_rect(colour= "black",fill=NA,size=0.75),#图框外边线颜色为黑色,粗细为0.75,不填充        panel.grid.minor=element_blank(),#隐藏图中的次要网格线        panel.background=element_blank(),#绘图背景为空白        plot.background=element_blank(),#全部背景为空白        plot.title =element_text(face="bold",color="black",size=12,vjust=0.7,hjust = 0.5),#标题文字设置,face调整字体格式,”bold”代表加粗,颜色为黑,大小为12,垂直对齐为0.7,值越大距离图框越远,水平方向上居中对齐 ,值越大越靠右        axis.title.x = element_text(size = 11,colour = "black",face = "bold",vjust = 1),#x轴标题设置        axis.title.y = element_text(size = 11,colour = "black",face = "bold",hjust = 0.5,vjust = 1.6),#y轴标题设置        axis.text=element_text(color="black",size=10,vjust = 0.5,hjust = 0.5),#坐标轴标签设置        legend.background=element_blank(),#图例区域背景设置        legend.title=element_blank(),#图例标题设置        legend.text=element_text(face="bold",color="black",size=10),#图例标签设置        legend.spacing.x=unit(0.2,"cm"),#图例水平间距设置        legend.key = element_blank(),#图例中形状背景设置        legend.key.size =unit(0.45,"cm"),#图例中形状大小        legend.position=c(0.65,0.99),#图例在图中位置,数值分别代表占据x轴和轴比例        legend.direction = "horizontal",#图例方向        legend.justification = c(0.95,0.84))#图例对齐方式,即图例位置与设定位置之间的关系#主题设置,theme_bw为黑白主题,即画面背景为空白,theme()是对绘图细节进行调整的重要函数,其中element_blank()代表空白,element_text()为字体设置函数p1FileName <- paste(oplspath,group1,"_score.plot", ".pdf", sep = "")ggsave(FileName, p1,width = 5.5, height =4)
FileName <- paste(oplspath,group1,"_score.plot", ".png", sep = "")ggsave(FileName, p1,width = 5.5, height =4)

#前30vip棒棒糖p2 <- ggplot(vip_plot,aes(x= VIP_p1,y=feature))+    geom_segment(aes(x=1,xend=VIP_p1,y=feature,yend=feature),linewidth=0.6)+    geom_point(color="#377EB8",shape = 19,size=2.8)+  theme_bw()+    xlab("VIP scores")+labs(title="Top30 different features scores")+  scale_x_continuous(limits=c(1,max(vip_plot$VIP_p1)+0.1),expand = c(0,0))+  theme(panel.border=element_rect(colour= "black",fill=NA,size=0.75),        panel.grid.minor=element_blank(),        panel.background=element_blank(),        plot.background=element_blank(),        plot.title =element_text(face="bold",color="black",size=12,vjust=0.7,hjust = 0.5),         axis.title.x = element_text(size = 11,colour = "black",face = "bold",vjust = 1),        axis.title.y = element_text(size = 11,colour = "black",face = "bold",hjust = 0.5,vjust = 1.6),        axis.text.y = element_text(color="black",size=7.5,vjust = 0.5,hjust = 1),  )
p2FileName <- paste(oplspath,group1,"_TOP30_VIP.plot", ".pdf", sep = "")ggsave(FileName, p2,width = 9, height =6)
FileName <- paste(oplspath,group1,"_TOP30_VIP.plot", ".png", sep = "")ggsave(FileName, p2,width = 9, height =6)

#检验结果可视化
#拟合直线颜色mycol <- rgb(95/255,95/255,95/255)#R2,Q2截距,斜率,图例标签IR2 <- summary(lm(value~sim,data = permutation_plot[which(permutation_plot$variable=="R2Y.cum."),]))$coefficients["(Intercept)","Estimate"]IQ2 <- summary(lm(value~sim,data = permutation_plot[which(permutation_plot$variable=="Q2.cum."),]))$coefficients["(Intercept)","Estimate"]
labelR2 <- expression(paste("R"^2, "Y(cum)",sep = ""))#expression,可表达数学公式labelQ2 <- expression(paste("Q"^2,"(cum)",sep = ""))##expression,可表达数学公式#出图p3 <- ggplot(permutation_plot,aes(x= sim,y=value,color=variable,shape = variable))+ theme_bw()+geom_point(alpha=0.8,size=1.5)+ scale_color_manual(values=c("Q2.cum."="#377EB8","R2Y.cum."="#E41A1C"), labels=c(labelR2,labelQ2))+ scale_shape_manual(values=c(19,17),labels=c(labelR2,labelQ2))+ geom_segment(x=0,y=IR2,xend=1,yend=permutation_data["model","R2Y.cum."],linetype="dashed",color=mycol,linewidth=0.4)+ geom_segment(x=0,y=IQ2,xend=1,yend=permutation_data["model","Q2.cum."],linetype="dashed",color=mycol,linewidth=0.4)+ scale_y_continuous(limits = c(min(permutation_plot$value)-0.1,max(permutation_plot$value)+0.1),expand = c(0,0), breaks = seq(round(min(permutation_plot$value),1),round(max(permutation_plot$value),1),0.4))+ ggtitle(paste(group1," OPLS-DA permutation",sep = ""))+ labs(x="Correlation Coefficient",y = expression(paste("R"^2, "Y(cum) and ","Q"^2,"(cum)",sep = "")), subtitle = bquote(Intercepts ~ ":" ~ R^2 ~ Y(cum) ~ "=" ~ .(round(IR2, 2))~"," ~ Q^2 ~ "(cum)"~"=" ~ .(round(IQ2, 2))))+ theme(panel.border=element_rect(colour= "black",fill=NA,size=0.75), panel.grid.minor=element_blank(), panel.background=element_blank(), plot.background=element_blank(), plot.title =element_text(face="bold",color="black",size=12,vjust=0.7,hjust = 0.5), plot.subtitle = element_text(face="bold",color="black",size=11,vjust=0.7,hjust = 0.5), axis.title.x = element_text(size = 11,colour = "black",face = "bold",vjust = 1), axis.title.y = element_text(size = 11,colour = "black",face = "bold",hjust = 0.5,vjust = 1), axis.text = element_text(color="black",size=10,vjust = 0.5,hjust = 0.5), legend.background=element_blank(), legend.title=element_blank(), legend.text=element_text(color="black",size=10,face="bold",vjust = 0.5,hjust = 0.5), legend.spacing.x=unit(0,"cm"), legend.key = element_blank(), legend.key.size =unit(0.6,"cm"), legend.position=c(0.99,0.01), legend.direction = "vertical", legend.justification = c(0.95,0))+ geom_hline(yintercept=0,linetype=1,color = 'black',size = 0.5) + geom_vline(xintercept=0,linetype=1,color = 'black',size = 0.5)p3
FileName <- paste(oplspath,group1,"_permutation.plot", ".pdf", sep = "")ggsave(FileName, p3,width = 5.5, height =4.6)
FileName <- paste(oplspath,group1,"_permutation.plot", ".png", sep = "")ggsave(FileName, p3,width = 5.5, height =4.6)

作者:思农生信团队-汪志祥



微生信生物
根际互作生物学研究室是沈其荣院士土壤微生物与有机肥团队下的一个关注于根际互作的研究小组。本小组由袁军教授带领,主要关注:1.植物和微生物互作在抗病过程中的作用 2 环境微生物大数据整合研究3 环境代谢组及其与微生物过程研究体系开发
 最新文章