共表达网络| WGCNA与hdWGCNA实操

文摘科学 2023-04-07 23:02 江苏

写在前面的话

加权基因共表达网络分析 (WGCNA, Weighted correlation network analysis)是用来描述不同样品之间基因关联模式的系统生物学方法，利用数千或近万个变化最大的基因或全部基因的信息识别感兴趣的基因集并结合与表型之间的关联鉴定候补生物标记基因或治疗靶点。感受Bulk水平到单细胞水平WGCNA的应用~~~

普通转录组

引：Langfelder, P., Horvath, S. WGCNA: an R package for weighted correlation network analysis. BMC Bioinformatics 9, 559 (2008).

数据清洗与整理

# 读取基因表达矩阵数据
fpkm <- read.table("data/fpkm.txt", header = T, row.names = 1, check.names = F)
## 推荐用DESeq2处理后的矩阵
counts <- read.table("data/counts.txt", header = T, row.names = 1, check.names = F)
condition <- factor(c(rep("Con",6)),rep("Treat",6))
colData <- data.frame(row.names=colnames(counts),condition)
dds <- DESeqDataSetFromMatrix(round(counts),colData, design = ~condition))
vst <- vst(dds, blind=FALSE)
vst <- assay(vst) # 得到标准化后的矩阵
# 选取基因
## 1.标准差选择
fpkm_sd <-  apply(fpkm,1,sd)
fpkm_sd_sorted <-  order(fpkm_sd, decreasing = T)
## 选择前5000个标准差较大的基因
fpkm_num <- fpkm_sd_sorted[1:5000]
fpkm_filter <-  fpkm[fpkm_num,]
WGCNA_matrix <- t(fpkm_filter)
## 2.绝对中位差选择
WGCNA_matrix = t(fpkm[order(apply(fpkm,1,mad), decreasing = T)[1:5000],])
## 任性点，那么好的电脑当然全部基因
WGCNA_matrix = t(fpkm)

library(WGCNA)
### 通过样本聚类识别离群样本，去除离群样本 ###
sampleTree = hclust(dist(WGCNA_matrix), method = "average");
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,cex.axis = 1.5, cex.main = 2)

# 去除离群的聚类样本，高于cutHeight的样本剔除
clust <- cutreeStatic(sampleTree, cutHeight = 120, minSize = 10)
table(clust)
# clust
# 0   1     ##0是要去除的，1是要保存的
# 15 162
# 提取clust 1聚类中的样本
keepSamples <- (clust==1)
WGCNA_matrix <- WGCNA_matrix[keepSamples, ]
# 记录基因和样本数
nGenes = ncol(WGCNA_matrix)#基因数
nSamples = nrow(WGCNA_matrix)#样本数

软阈值的确认

### 选择软阈值 ###
powers <- c(c(1:10), seq(from = 12, to=20, by=2))
# 进行网络拓扑分析
sft <- pickSoftThreshold(WGCNA_matrix, powerVector = powers, verbose = 5)
# 无尺度网络阈值的选择
plot(sft$fitIndices[,1], 
     -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",#x轴
     ylab="Scale Free Topology Model Fit,signed R^2",type="n",#y轴
     main = paste("Scale independence"));#标题
text(sft$fitIndices[,1], 
     -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,
     cex=cex1,
     col="red");

# 用红线标出R^2的参考值
abline(h=0.90,col="red")
# 平均连接度
plot(sft$fitIndices[,1], 
     sft$fitIndices[,5],
     xlab="Soft Threshold (power)",
     ylab="Mean Connectivity", 
     type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], 
     sft$fitIndices[,5], 
     labels=powers, 
     cex=cex1,
     col="red")

softpower <- sft$powerEstimate
ADJ <- abs(cor(datExpr,use="p")) #相关性取绝对值再幂次
k <- as.vector(apply(ADJ,2,sum,na.rm=T))#对ADJ的每一列取和
hist(k)
scaleFreePlot(k,main="Check scale free topology")

共表达网络构建

### 一步构建网络 ###
net <- blockwiseModules(WGCNA_matrix, #处理好的表达矩阵
                       power = sft$powerEstimate,#选择的软阈值
                       TOMType = "unsigned", #拓扑矩阵类型，none表示邻接矩阵聚类，unsigned最常用，构建无方向
                       maxBlockSize =nGenes，#计算机能处理的最大模块的基因数量
                       minModuleSize = 30,#网络模块包含的最少基因数
                       reassignThreshold = 0, #模块间基因重分类的阈值
                       mergeCutHeight = 0.25,#合并相异度低于0.25的模块
                       numericLabels = TRUE, #true，返回模块的数字标签 false返回模块的颜色标签
                       pamRespectsDendro = FALSE,#调用动态剪切树算法识别网络模块后，进行第二次的模块比较，合并相关性高的模块
                       saveTOMs = TRUE,#保存拓扑矩阵
                       saveTOMFileBase = "data/TOM", 
                       verbose = 3,#0，不返回任何信息，＞0返回计算过程
                       randomSeed = 0407 #确保数据可重复性
                       )

可视化

# 将标签转化为颜色
mergedColors = labels2colors(net$colors)
# 绘制聚类和网络模块对应图
plotDendroAndColors(dendro = net$dendrograms[[1]], #hclust函数生成的聚类结果
                    colors = mergedColors[net$blockGenes[[1]]],#基因对应的模块颜色
                    groupLabels = "Module colors",#分组标签
                    dendroLabels = FALSE, #false,不显示聚类图的每个分支名称
                    hang = 0.03,#调整聚类图分支所占的高度
                    addGuide = TRUE, #为聚类图添加辅助线
                    guideHang = 0.05,#辅助线所在高度
                    main = "Gene dendrogram and module colors")
# 加载TOM矩阵
load("data/TOM.RData")
# 网络特征向量
MEs = moduleEigengenes(WGCNA_matrix, mergedColors)$eigengenes
# 对特征向量排序
MEs = orderMEs(MEs)#这个MEs是可用于寻找hub基因
# 可视化模块间的相关性
plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2), 
                      marHeatmap = c(3,4,1,2), cex.lab = 0.8, 
                      xLabelsAngle = 90)
# TOMplot
load(net$TOMFiles, verbose=T)
dissTOM = 1-TOMsimilarityFromExpr(WGCNA_matrix, power = sft$powerEstimate); #1-相关性=相异性
TOMplot(dissTOM, #拓扑矩阵，该矩阵记录了每个节点之间的相关性
        net$dendrograms[[1]], #基因的聚类结果
        mergedColors[net$blockGenes[[1]]], #基因对应的模块颜色
        main = "Network heatmap plot, selected genes")

模块与特征矩阵的联系

# 读入特征矩阵
## 用于关联分析的性状必须是数值型特征，区域或分类变量，需要转换为0-1矩阵的形式
trait <- "TraitsClean.txt"

# 将特征信息转换为颜色，白色表示低，红色表示高，灰色表示缺失
traitColors = numbers2colors(datTraits, signed = FALSE)

# 样本聚类图与样本性状热图
plotDendroAndColors(sampleTree2, 
                    traitColors,
                    groupLabels = names(datTraits), 
                    main = "Sample dendrogram and trait heatmap")

# 网络的分析
## 基因和所在模块信息
gene_module <- data.frame(ID=colnames(WGCNA_matrix), module=mergedColors)
gene_module = gene_module[order(gene_module$module),]
write.table(gene_module,"gene_module.csv"),row.names=F)
## 保存模块代表性信息
MEs_col <- MEs
MEs_colt <- as.data.frame(t(MEs_col))
colnames(MEs_colt) <- rownames(WGCNA_matrix)
write.csv(MEs_colt,"module_eipgengene.csv")
## 基因模块与特征信息的关系
# 计算模块特征矩阵和样本特征矩阵的相关度。
moduleTraitCor=cor(MEs, Trait, use="p")
write.csv("modPhysiological.cor.csv",moduleTraitCor,sep="\t",quote=F)
moduleTraitPvalue=corPvalueStudent(moduleTraitCor, nSamples)
write.csv("modPhysiological.p.csv",moduleTraitPvalue,sep="\t",quote=F)
textMatrix=paste(signif(moduleTraitCor,2),"\n(",signif(moduleTraitPvalue,1),")",sep="")
# 基因模块与样本特征信息相关性图
labeledHeatmap(Matrix=moduleTraitCor,#模块和表型的相关性矩阵
               xLabels=colnames(datTraits),
               yLabels=names(MEs),
               ySymbols=names(MEs),
               colorLabels=FALSE,
               colors=blueWhiteRed(50),
               textMatrix=textMatrix,
               setStdMargins=FALSE,
               cex.text=0.5,
               cex.lab=0.5,
               zlim=c(-1,1),
               main=paste("Module-trait relationships"))

## 最关键的就是得到基因模块与样本特征这张图。找到自己感兴趣的模块，可以找hub基因，功能富集分析等等
## 模块中的hub基因
## 为每一个模块寻找hub基因
HubGenes <- chooseTopHubInEachModule(WGCNA_matrix,#WGCNA分析输入的表达矩阵
                                     mergedColors )#模块颜色信息
# 保存hub基因结果
write.csv (HubGenes,"HubGenes_of_each_module.csv",col.names = F)
#### 与某种特征相关的hub基因
NS <- networkScreening(trait$age,#年龄
                      MEs,
                      WGCNA_matrix)
## 单一特征与所有模块关联分析
M_stage <- as.numeric(cor(trait$M_stage,datExpr, use="p"))
GeneSignificance = abs(GS)
plotModuleSignificance(GeneSignificance, moduleColors, ylim=c(0,0.2), main="Gene significance for M stage across module")
## 单一模块与某一表型相关性
M_stage = as.data.frame(trait$M_stage)
# 分析自己感兴趣的特征信息，此处以M_stage为示例
names(M_stage) = "M_stage"
# 模块对应的颜色
modNames = substring(names(MEs), 3)
# 模块对应的颜色
modNames = substring(names(MEs), 3)
# 计算基因模块特征
geneModuleMembership = as.data.frame(cor(WGCNA_matrix, MEs, use = "p"));
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
# 对结果进行命名
names(geneModuleMembership) = paste("MM", modNames, sep="");
names(MMPvalue) = paste("p.MM", modNames, sep="");
# 计算M分期基因特征显著性
geneTraitSignificance = as.data.frame(cor(WGCNA_matrix, M_stage, use = "p"));
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));
# 对结果进行命名
names(geneTraitSignificance) = paste("GS.", names(M_stage), sep="")
names(GSPvalue) = paste("p.GS.", names(M_stage), sep="")
# 设置需要分析的模块名称，此处为brown模块
module = "brown"
# 提取brown模块数据
column = match(module, modNames);
moduleGenes = moduleColors==module;
# 可视化brown模块与M分期的相关性分析结果
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                   abs(geneTraitSignificance[moduleGenes, 1]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "Gene significance for M Stage",
                   main = paste("Module membership vs. gene significance\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)

转化cytoscape格式文件

### 输出cytoscape可视化
# 输出全部网络模块
cyt = exportNetworkToCytoscape(TOM,
                               edgeFile = "CytoscapeInput-edges-all.txt",#基因间的共表达关系
                               nodeFile = "CytoscapeInput-nodes-all.txt",#
                               weighted = TRUE,
                               threshold = 0.1,
                               nodeNames = geneID$SYMBOL,
                               altNodeNames = geneID$ENTREZID,
                               nodeAttr = moduleColors)
# 输出感兴趣网络模块
modules = c("brown", "red")
# 选择上面模块中包含的基因
inModule = is.finite(match(moduleColors, modules))
modGenes = geneID[inModule,]
# 选择指定模块的TOM矩阵
modTOM = TOM[inModule, inModule]

单细胞转录组

引：Morabito, S., Miyoshi, E., Michael, N. et al. Single-nucleus chromatin accessibility and transcriptomic characterization of Alzheimer’s disease. Nat Genet 53, 1143–1155 (2021).

细胞亚群选取与数据清洗

library(hdWGCNA)
library(Seurat)
library(tidyverse)
library(WGCNA)
# 读取单细胞数据集
scRNA=readRDS('scRNA.RDS')
# 提T细胞亚群,重新降维聚类
scRNA=subset(scRNA,celltype %in% 'T_cells')
scRNA <- SCTransform(scRNA)
scRNA <- FindNeighbors(scRNA, dims = 1:10)
scRNA <- FindClusters(scRNA)
scRNA <- RunUMAP(scRNA, dims = 1:10)
DimPlot(scRNA,label=T)
#过滤出至少在5%的细胞中表达的基因
scRNA <- SetupForWGCNA(
  scRNA,
  gene_select = "fraction",
  fraction = 0.05,
  wgcna_name = "T_wgcna"
)

构建metacells及软阈值的确定

# 构建metacells
scRNA<- MetacellsByGroups(
  seurat_obj = scRNA,k=20,
  max_shared = 10,
  # group.by可以是组织类型、细胞类型或分组
  group.by = c("celltype",'orig.ident'), 
  ident.group = 'celltype' 
)
# 标准化
scRNA <- NormalizeMetacells(scRNA)
metacell_obj <- GetMetacellObject(scRNA)
# 转置表达矩阵
seurat_obj  <- SetDatExpr(
  scRNA,
  group_name = "T_cells", 
  group.by='celltype'
)
# 选择softpower
seurat_obj <- TestSoftPowers(
  seurat_obj,
  setDatExpr = FALSE
)
# 可视化
plot_list <- PlotSoftPowers(seurat_obj)
wrap_plots(plot_list, ncol=2)
# 查看powerTable
power_table <- GetPowerTable(seurat_obj)
head(power_table)

共表达网络构建

# 构建共表达网络
softpower=4
seurat_obj <- ConstructNetwork(
  seurat_obj, soft_power=softpower,
  group.by='celltype', group_name='T_cells',setDatExpr = F)
# 可视化WGCNA网络
PlotDendrogram(seurat_obj, main='hdWGCNA Dendrogram')
# 获取TOM矩阵
TOM <- GetTOM(seurat_obj)
# 计算模块协调特征
seurat_obj <- Seurat::ScaleData(
  seurat_obj,
  features = GetWGCNAGenes(seurat_obj)
)
# 计算ME
library(harmony)
seurat_obj <- ModuleEigengenes(
  seurat_obj,
  group.by.vars="orig.ident" #harmony对象
)
seurat_obj <- ModuleConnectivity(seurat_obj)
# 可视化每个模块根据kME打分排序的基因
PlotKMEs(seurat_obj, ncol=3)
# 获取hub genes
hub_df <- GetHubGenes(seurat_obj, n_hubs = 25)

PlotKMEs(seurat_obj)

模块与细胞亚群的联系

## 模块间的相关性
library(igraph)
library(qgraph)
# 画模块间相关性图
ModuleCorrelogram(seurat_obj, sig.level = 0.001, pch.cex=2)
# hub基因打分
# 1.Seurat
seurat_obj <- ModuleExprScore(
  seurat_obj,
  n_genes = 25,
  method='Seurat'
)
# 2.Ucell
library(UCell)
seurat_obj <- ModuleExprScore(
  seurat_obj,
  n_genes = 25,
  method='UCell'
)
# 可视化
plot_list <- ModuleFeaturePlot(
  seurat_obj,
  features='hMEs', 
  order=TRUE 
)
wrap_plots(plot_list)

# 模块与类群的联系
MEs <- GetMEs(seurat_obj, harmonized=TRUE)
mods <- colnames(MEs); mods <- mods[mods != 'grey']
seurat_obj@meta.data <- cbind(seurat_obj@meta.data, MEs)
p <- DotPlot(seurat_obj, features=mods)

总而言之，整理bulk水平和单细胞的WGCNA分析会发现，因单细胞转录组的稀疏性问题，引入构建metacells，其次两者共性都在于共表达网络的构建和模块的选取，MEs的定义是给定模型的第一主成分，代表整个模型的基因表达谱。相较于差异基因而言，WGCNA分析考虑的因素更多，与表型的联系更符合生物学意义！！！

彩蛋分享

外接Chatgpt和Bing的小插件，前提是要有账号哦！

https://platform.openai.com/account/api-keys

朴素的科研打工仔

专注于文献的分享，浙大研究生学习生活的记录。