写在前面的话
单细胞转录组的细胞代谢通量预测分析,运行日志分享,因稀疏矩阵问题,先构建元细胞(可使用metacell和supercell算法),遇到的报错非常多,所以文末留下联系方式,欢迎交流。
元细胞构建
hdWGCNA
library(hdWGCNA)
scRNA <- SetupForWGCNA(scRNA,
wgcna_name = "scRNA",
#Select genes that will be used for co-expression network analysis
gene_select = "fraction",
fraction = 0.05#基因表达百分比,0.05表示这个基因必在5%的细胞中表达
### variable: 使用存储在Seurat对象的VariableFeatures中的基因,就是高变基因
### fraction: 使用在整个数据集或每组细胞中特定部分细胞中表达的基因
### custom:使用自定义列表中指定的基因,就是自己选定基因集
)
metaFields = c("celltype","orig.ident","sex","year","group","tech","age","RecSID")
scRNA_Metacell <- MetacellsByGroups(
seurat_obj=scRNA,
group.by = metaFields,#在 scRNA@meta.data 中指定要分组的列
k = 50, # 最近邻参数
reduction = "harmony", # 降维方法
slot='counts',
max_shared = 40, # 两个元细胞区之间共享细胞区的最大数量
ident.group = 'celltype',# 设置元细胞 seurat 对象的 Idents
min_cells = 100
)
# 出现以下报错,查询github作者的回答包括ConstructMetacells函数,就是k值和max_shared参数差值为10就不会报错了,不过确实构建时间好长。
# 对大型数据构建,内存不够友好,所以提取部分子集进行构建
# 从30w细胞缩到10w细胞
dim(metacell_obj)
[1] 45481105838
dim(scRNA)
[1] 45481388985
Supercell
library(SuperCell)
# 定义一个函数
makeSeuratMC <- function(scRNA,
genes = NULL,
metaFields = c("celltype","orig.ident","sex","year","group","tech","age","RecSID"),
returnMC = F) {
if(is.null(genes)) {
genes <-rownames(scRNA@assays$RNA@scale.data)
}
# 构建元细胞
MC <- SCimplify(GetAssayData(scRNA,slot = "data"), # 标准化后的矩阵
n.pc = 15,
k.knn = 5, # 建立 kNN 网络的最近邻数
gamma = gamma, # 粒化水平
genes.use = rownames(scRNA@assays$RNA@scale.data) )
# 计算元细胞的纯度
MC$purity <- supercell_purity(clusters =scRNA$celltype,
supercell_membership = MC$membership)
# 为元细胞注释
for (m in metaFields) {
MC[[m]]<- supercell_assign(clusters = scRNA@meta.data[,m], # 单细胞注释信息
supercell_membership = MC$membership, # 单细胞对应元细胞组件
method = "absolute") # 方法可选 "jaccard", "relative", "absolute"
}
# 计算元细胞的基因表达量
GE <- supercell_GE(as.matrix(GetAssayData(scRNA,slot = "data")),groups = MC$membership)
# 转换为Seurat对象
seuratMC <- supercell_2_Seurat(SC.GE = GE,MC,fields = c(metaFields,"purity"))
res <- seuratMC
if (returnMC) {
res <- list(seuratMC = seuratMC,SC = MC)
}
return(res)
}
# 元细胞的下游分析
supercells <- makeSeuratMC(scRNA,returnMC = T)
seuratMC <- supercells$seuratMC
seuratMC <- RunHarmony(seuratMC, group.by.vars="orig.ident", max.iter.harmony = 20,plot_convergence = T)
seuratMC <- RunUMAP(seuratMC, dims = 1:15, verbose = FALSE)
DimPlot(seuratMC,group.by = "celltype")
# 从100w个细胞缩到了2w细胞
dim(seuratMC)
[1] 4548120954
dim(scRNA)
[1] 454811047720
单细胞代谢通量预测
compass
R
# 提取单细胞矩阵 tsv文件
counts <- GetAssayData(scRNA, slot = 'data')
write.table(counts, file = 'counts.tsv', quote = F, sep = "\t", col.names = T, row.names = T)
# 细胞注释矩阵
cellmeta <- scRNA@meta.data
write.csv(cellmeta, file = 'cellmeta.csv')
# 运行compass
nohup compass --data counts.tsv --num-processes 10 --species homo_sapiens &
# 时间太久挂后台吧,留给下一期吧
scFEA
R
# 提取单细胞矩阵
counts <- scRNA@assays$RNA@counts#表达矩阵
write.csv(counts, file='./counts.csv', row.names = T)
cd scFEA
python src/scFEA.py --data_dir data \
--input_dir ./ \
--res_dir ./ \
--moduleGene_file module_gene_m168.csv\
--test_file counts.csv\
--cName_file cName_c70_m168.csv\
--sc_imputation False \
--stoichiometry_matrix cmMat_c70_m168.csv\
--output_flux_file scRNA_flux_Super.csv\
--output_balance_file scRNA_balance_super.csv
# --data_dir scFEA model文件路径
# --input_dir 单细胞数据输入路径
# --res_dir output结果输出路径,scFEA输出结果有两个矩阵,一个是预测的单细胞分辨率的代谢通量矩阵和代谢物应激矩阵
#--test_file 需要进行分析的数据,对于单细胞来说,就是表达矩阵,行是基因,列是cell。count矩阵和normalised矩阵都可以。放在input路径下
#--moduleGene_file scFEA提供了人和小鼠的module基因,这些文件都在scFEA包的data文件夹
# 人的默认使用module_gene_m168.csv, 小鼠的是module_gene_complete_mouse_m168.csv
#--stoichiometry_matrix 该表描述了化合物和模块之间的关系。每一行为中间代谢物,每一列为代谢模块。
# 人的默认使用cmMat_c70_m168.csv,小鼠使用cmMat_complete_mouse_c70_m168.csv
#--cName_file 这个表格包含化合物的名称和对应的标识符。其中,第一行是化合物的名称,第二行是对应的标识符。
#人使用cName_c70_m168.csv,小鼠使用cName_complete_mouse_c70_m168.csv
#--sc_imputation 是否对单细胞数据集进行数据插补,对于 10x 数据推荐设置为 True,进行元细胞构建后可以设置为 False。
#--output_flux_file 用户自定义预测通量文件名。
#--output_balance_file 用户自定义预测balance文件名
# 遇到报错AttributeError: 'DataFrame' object has no attribute 'append'
# 更换一下低版本的pandas==1.5.3
# 查看各组代谢通量
## 读入flux文件,添加到seurat对象
super_flux <- read.csv("./scRNA_flux_Super.csv",row.names = 1)
rownames(super_flux) <- gsub("cell", "", rownames(super_flu## 构建flux矩阵
predFlux <- t(data.matrix(super_flux))
## 修改下module标签注释,显示代谢物
human_moduleInfo <- read.csv("Human_M168_information.symbols.csv", header = T, row.names = 1)
human_moduleInfo <- human_moduleInfo[rownames(predFlux),]
rownames(predFlux) <- paste0("M","",human_moduleInfo$Module_id, ": ",human_moduleInfo$Compound_IN_name,"_",human_moduleInfo$Compound_OUT_name)
seuratMC[["FLUX"]] <- CreateAssayObject(counts = predFlux)
DefaultAssay(seuratMC) <- 'FLUX'
## UMAP图展示
FeaturePlot(adj_scRNA, features = "M2: G6P-G3P", split.by = "group",pt.size = 0.1, min.cutoff = 0) &
theme_bw()&
theme(legend.margin = margin(-0.2,-0.2,0,-0.3,'cm'),#legend边缘
legend.key.height = unit(1,'cm'),#lehend高度
legend.key.width = unit(0.2,'cm'))&
scale_color_gradientn(colours = c('#5749a0', '#0f7ab0', '#00bbb1',
'#bef0b0', '#fdf4af', '#f9b64b',
'#ec840e', '#ca443d', '#a51a49'), #连续型legend颜色设置
guide = guide_colorbar(frame.colour = "black", #legend边框
ticks.colour = NA))
## 气泡图展示
Idents(seuratMC) <- "group"
DotPlot(seuratMC, features = head(rownames(seuratMC)),cols = c("RdYlBu")) +
geom_point(aes(size=pct.exp), shape = 21, colour="black", stroke=0.5) +
guides(size=guide_legend(override.aes=list(shape=21, colour="black", fill="white"))) +
RotatedAxis() +
theme(
panel.background = element_blank(),
panel.border = element_rect(fill = NA),
panel.grid.major.x = element_line(color = "grey80"),
panel.grid.major.y = element_line(color = "grey80"),
axis.title = element_blank())+
scale_y_discrete(labels=c("A","B"))+
coord_flip()
# 不同组差异分析(火山图)
Idents(seuratMC) = 'celltype'
seuratMC_Hep_CV= subset(seuratMC, idents = "Hep_CV")
seuratMC_Hep_CV$celltype.group <- paste(Idents(seuratMC_Hep_CV), seuratMC_Hep_CV$group, sep = "_")
Idents(seuratMC_Hep_CV) <- "celltype.group"
# 差异分析
module_deg = FindMarkers(seuratMC_Hep_CV, ident.1="Hep_CV_Female",
ident.2="Hep_CV_Male",
test.use = "LR",
min.cells.group = 1,
min.cells.feature = 1,
min.pct = 0,
logfc.threshold = 0)
#修改下in-out产物之间的连接线
rownames(module_deg) <- gsub("-", "\u2192", rownames(module_deg))
# 计算Cohen’s d函数
cohens_d <- function(x, y) {
pooled_std <- sqrt(((length(x)-1) * var(x) + (length(y)-1) * var(y)) / (length(x) + length(y) - 2))
return ((mean(x) - mean(y)) / pooled_std)
}
# Cohen’s d 的值可以用来衡量两组数据之间的效应量大小。通常情况下,Cohen’s d 的值约为 0.2 表示小效应,约为 0.5 表示中等效应,约为 0.8 表示大效应
super_flux <- as.data.frame(t(predFlux))
super_flux$celltype <- seuratMC$celltype
super_flux$group <- seuratMC$sex
cells_flux <- super_flux[super_flux$celltype == "Hep_CV", ]
cells_Female <- rownames(cells_flux)[cells_flux$group == "Female"]
cells_Male <- rownames(cells_flux)[cells_flux$group == "Male"]
df <- as.data.frame(t(cells_flux))
df <- df[-c(169,170),]
df_Female <- df[,cells_Female]
df_Male <- df[,cells_Male]
rownames(df_Female) <- gsub("_", "\u2192", rownames(df_Female))
rownames(df_Male) <- gsub("_", "\u2192", rownames(df_Male))
for (flux_id in rownames(module_deg)) {
A <- as.numeric(df_Female[flux_id, ])
B <- as.numeric(df_Male[flux_id, ])
c_d <- cohens_d(A, B)
module_deg[flux_id, 'cohens_d'] <- c_d
}
#火山图
module_deg$name <- rownames(module_deg)
# pvalue值设置不超过y边界
min_pvalue <- 1e-307
module_deg$p_val_adj[module_deg$p_val_adj == 0] <- min_pvalue
p = ggplot(module_deg, aes(x=cohens_d, y=-log10(p_val_adj))) +
geom_hline(aes(yintercept=1.3), color = "#999999", linetype="dashed", size=1) +#添加横线
geom_vline(aes(xintercept=0), color = "#999999", linetype="dashed", size=1) + #添加纵线
geom_point(size=4,color="grey80")+
geom_point(data = module_deg[module_deg$p_val_adj<=0.01 & abs(module_deg$cohens_d)>0.5,], stroke = 0.5, size=4, shape=16, color="#B51F29") +
labs(x = "Cohen's D",y = "-Log10(P-value)", title = "") +
theme_bw() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(size=1, colour = "black"),
axis.title =element_text(size = 14),axis.text =element_text(size = 8, color = "black"),
plot.title =element_text(hjust = 0.5, size = 12)) +
theme(plot.margin=unit(c(0, 1, 2, 1),'cm'))+
geom_text_repel(data=module_deg[module_deg$p_val_adj<=0.01 & abs(module_deg$cohens_d)>0.5, ],
aes(label=name), color="black", size=4,
arrow = arrow(ends="first", length = unit(0.01, "npc")), box.padding = 0.2,
point.padding = 0.3, segment.color = 'black', segment.size = 0.3, force = 1, max.iter = 3e3,
max.overlaps = Inf)+
ggtitle("A vs B")
ggdraw(xlim = c(0, 1), ylim = c(0,1.1))+ # 设置界限
draw_plot(p,x = 0, y =0) +
draw_line(x = c(0.55,0.75),
y = c(0.1,0.1),
lineend = "round",
size =1, col = "#B51F29", # 箭头
arrow=arrow(angle = 15,
length = unit(0.1,"inches"),
type = "closed"))+
draw_line(x = c(0.25,0.45),
y = c(0.1,0.1),
lineend = "round",
size =1, col = "#006699", # 箭头
arrow=arrow(angle = 15,
length = unit(0.1,"inches"),
type = "closed",
ends="first"))+
draw_text(text = "Activate in A", size = 12,
x = 0.88, y = 0.1,
color="black",fontface = "italic")+
draw_text(text = "Activate in B", size = 12,
x = 0.15, y = 0.1,
color="black",fontface = "italic")
联系作者[图片]