新手入学ScRNA-seq分析流程

文摘   科学   2023-03-18 10:00   江苏  

ScRNA-seq pipeline

ScRNA-seq pipeline                                                                        yusenwei@zju.edu.cn

  • 数据导入

  • 多样本整合

  • 细胞鉴定

  • 两组细胞数目和比例对比

  • 两组间差异基因

  • 富集分析GO和KEGG

  • GSEA和GSVA

  • 细胞通讯 CellphoneBD 和 CellChat

  • 拟时序分析 monocle2和monocle3

  • 基因调控网络 SCENIC 和DoRothEA


写在前头,总结自己假期的学习成果,在这段学习时间中收获了很多。

1. 关于生信,它更像是一把武器,在分析的过程给试验带来新的思路和总结。

2. 关于R,几乎零基础的开始,实操才是最快的学习方法,?函数带来新的世界。

3. 关于脚本,感受每个R包,每个函数的逻辑性,学会用R包和精通某个R包是不同的。

  作为新手一直有个问题困扰着我,这个领域已经有很多大佬,我这样的小白能否进入?其实不然,想想在你之后也会有很多萌新入坑,而这些“老一辈人”会很欢迎我们的加入,就看他们在教程里是真的很想把我们教会的样子。最后感谢生信技能树,生信会客厅等等一系列公众号和小伙伴们在这一个月中对我的帮助。

使用的数据集是GSE129516,小鼠肝硬化的10x单细胞转录组数据,练习和熟练代码为主,对于细胞分群没有仔细细分。挑鼠的数据集的原因,大部分数据库以人的基因为主,就涉及基因id的转换和鼠数据库信息的提取等操作的训练。

  最后,牢记一句话,代码有时很简单,如何解读和具有生物学意义才是关键!!!!

数据导入

加载包

#options(stringsAsFactors = F) 
#library(Seurat)
#library(dplyr)
#library(stringr)
#library(ggplot2)
#library(clustree)
#library(cowplot)
#library(Matrix)
#library(gtable)
#library(gridExtra)
#library(patchwork)
#library(EnhancedVolcano)
#library(msigdbr)
#library(GSVA)
#library(tidyverse)
#library(clusterProfiler)
#library(fgsea)
#library(enrichplot)
#library(pheatmap)
#library(org.Mm.eg.db)
#library(DOSE)
#library(ggalluvial)
#library(CellChat)
#library(monocle)
#library(Hmisc)
##library(dorothea)
#library(SCENIC)

分别读入每个样本的原始数据

#setwd("E:/ScRNA/fuxian/mouse")
#path<-"data"
#files <- list.files(path = path,pattern = 'csv$')
#fullpath<-paste(path,files,sep="/")
#names(fullpath) <- lapply(files,function(x) {str_extract(x, '[\\w\\d]+')})
#rawdata <- lapply(fullpath, function(x) {
#cells.table <- read.table(x, sep = ",",check.names = F, header = TRUE, row.names = 1)
#})
#lapply(rawdata, dim)

创建seurat对象,添加分组信息

#obj.list <- lapply(names(rawdata), function(x) {
#CreateSeuratObject(counts = rawdata[[x]],
# project = x,
# min.cells = 3, ##保留至少在三个细胞里面表达的基因
# min.features = 200)}) ##保留至少表达200个基因的细胞
#names(obj.list) <- names(rawdata)
#obj.list

添加分组信息

#obj.list$chow1@meta.data$group <- 'Health'
#obj.list$chow2@meta.data$group <- 'Health'
#obj.list$chow3@meta.data$group <- 'Health'
#obj.list$nash1@meta.data$group <- 'Disease'
#obj.list$nash2@meta.data$group <- 'Disease'
#obj.list$nash3@meta.data$group <- 'Disease'

质控QC,计算每种特征在每个细胞里面的百分比

#for (x in names(obj.list)){
#obj.list[[x]][["percent.MT"]] <- PercentageFeatureSet(obj.list[[x]], pattern = "^mt-")
#obj.list[[x]][["percent.RP"]] <- PercentageFeatureSet(obj.list[[x]], pattern = "^Rp[sl]")
#obj.list[[x]][["percent.HB"]] <- PercentageFeatureSet(obj.list[[x]], pattern = "^Hb[^(p)]")
#}

循环绘制每个样本的QC图

#qc_feature <- c("nFeature_RNA", "nCount_RNA", "percent.HB", "percent.MT",  "percent.RP")
#for(x in names(obj.list)){
#pdf(file=paste0("1_",x,"_quality_control.pdf"),width = 15,height=7)
#print(VlnPlot(obj.list[[x]], features = qc_feature, ncol = 5, pt.size = 0.5))
#dev.off()
#}

quality_control

循环绘制特征之间的相互关系图

#for(x in names(obj.list)){
#pdf(file=paste0("1_",x,"_feature_relationship.pdf"),width = 12,height=10)
#p1 <- FeatureScatter(obj.list[[x]], feature1 = "nCount_RNA", #feature2 = "nFeature_RNA")
#p2 <- FeatureScatter(obj.list[[x]], feature1 = "nCount_RNA", feature2 = "percent.HB")
#p3 <- FeatureScatter(obj.list[[x]], feature1 = "nCount_RNA", feature2 = "percent.MT")
#p4 <- FeatureScatter(obj.list[[x]], feature1 = "nCount_RNA", feature2 = "percent.RP")
#print(p1 + p2 + p3 + p4)
#dev.off()
#}

feature_relationship

过滤细胞

 #obj.list <- lapply(obj.list, function(x) {
#subset(x, subset = nFeature_RNA > 200 &
# nFeature_RNA < 7500 &
# percent.MT < 20)})
#obj.list

多样本整合

归一化

#obj.list <- lapply(obj.list,function(x) {
# NormalizeData(x)
#})

寻找高变异度基因

#obj.list <- lapply(obj.list, function(x) {
# FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000)
# })
#obj.list

整合成一个对象

寻找锚点

#obj.anchors <- FindIntegrationAnchors(object.list = obj.list, dims = 1:20)

根据锚点来整合

#obj.combined <- IntegrateData(anchorset = obj.anchors, dims = 1:20)
#DefaultAssay(obj.combined) <- "integrated" #更改默认数组

对整合后的数据进行尺度变换

#all.genes <- rownames(obj.combined[["RNA"]]@data) #针对所有基因
#length(all.genes)
#obj.combined <- ScaleData(obj.combined, features = all.genes)

对整合之后的数据进行降维

主成分分析

#obj.combined <- RunPCA(obj.combined, npcs = 50, verbose = FALSE)

可视化PCA降维之后的结果

#pdf('2_PCA.pdf', width = 10, height = 6)
#DimPlot(obj.combined, reduction = "pca")
#dev.off()

PCA

热图展示前5个主成分

#pdf(file="2_PCtop5_Heatmap.pdf",width=14,height=20)
#DimHeatmap(obj.combined, dims = 1:5, balanced = TRUE)
#dev.off()

PCtop5_Heatmap

热图展示第6-10个主成分

#pdf(file="2_PC6-10_Heatmap.pdf",width=14,height=20)
#DimHeatmap(obj.combined, dims = 1:5, balanced = TRUE)
#dev.off()

PC6-10_Heatmap

确定用几个主成分做后续分析

Elbowplot

#pdf('2_ElbowPlot.pdf', width = 10, height = 6)
#ElbowPlot(obj.combined, ndims = 50)
#dev.off()

ElbowPlot

对整合之后的数据进行聚类

#obj.combined <- RunUMAP(obj.combined, reduction = "pca", dims = 1:20)
## SWNE Visualizing single-cell RNA-seq datasets with Similarity Weighted Nonnegative Embedding (tSNE可准确的捕获数据集的局部结构,但是会扭曲数据集的全局结构,比如簇之间的距离)
#obj.combined <- FindNeighbors(obj.combined, reduction = "pca", dims = 1:20)
#for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
#obj.combined <- FindClusters(obj.combined,resolution = res)
#} #设置不同的分辨率,观察分群效果(选择哪一个?)

可视化不同分辨率,分群效果

#apply(obj.combined@meta.data[,grep("integrated_snn",colnames(obj.combined@meta.data))],2,table)
#pdf('2_umap_resolution_high.pdf', width = 18)
#plot_grid(ncol = 3, DimPlot(obj.combined, reduction = "umap", group.by = "integrated_snn_res.0.8") +
# ggtitle("louvain_0.8"), DimPlot(obj.combined, #reduction = "umap", group.by = "integrated_snn_res.1") +
# ggtitle("louvain_1"), DimPlot(obj.combined, #reduction = "umap", group.by = "integrated_snn_res.0.3") +
# ggtitle("louvain_0.3"))
#dev.off()
#pdf('2_umap_resolution_low.pdf', width = 18)
#plot_grid(ncol = 3, DimPlot(obj.combined, reduction = "umap", group.by = "integrated_snn_res.0.01") +
# ggtitle("louvain_0.01"), DimPlot(obj.combined, #reduction = "umap", group.by = "integrated_snn_res.0.1") +
# ggtitle("louvain_0.1"), DimPlot(obj.combined, reduction = "umap", group.by = "integrated_snn_res.0.2") +
# ggtitle("louvain_0.2"))
#dev.off()

umap_resolution_low

umap_resolution_high

#pdf('2_Tree_diff_resolution.pdf', width = 10,height = 10)
#clustree(obj.combined@meta.data, prefix = "integrated_snn_res.",layout = "sugiyama")
#dev.off()

Tree_diff_resolution

接下来分析,按照分辨率为0.2进行

#sel.clust = "integrated_snn_res.0.2"
#obj.combined <- SetIdent(obj.combined, value = sel.clust)
#table(obj.combined@active.ident)

绘制UMAP聚类图

#pdf('3_UMAP_multi_samples_combined.pdf',width = 11,height = 6)
#p1 <- DimPlot(obj.combined, reduction = "umap", group.by = "group")
#p2 <- DimPlot(obj.combined, reduction = "umap", label = TRUE)
#p1 + p2
#dev.off()

UMAP_multi_samples

分别展示两组的UMAP聚类图

#pdf('3_UMAP_multi_samples_split.pdf',width = 10,height = 6)
#DimPlot(obj.combined, reduction = "umap", split.by = "group",label = TRUE)
#dev.off()

UMAP_samples_split

查看细胞周期影响

#pdf(file="3_Pcna_Mki67_expression.pdf",width=12)
#FeaturePlot(obj.combined,features = c("Pcna","Mki67"),reduction = "umap")##s期基因PCNA,G2/M期基因MKI67均高表达,说明细胞均处于细胞周期
#dev.off()

Pcna_Mki67_expression

计算细胞周期分值,判断这些细胞分别处于什么期

#cc_gene=unlist(cc.genes)#cc.genes为细胞周期相关的基因,为list,有两个元素s.genes,g2m.genes
#s.genes=cc.genes$s.genes
#s.genes<-tolower(s.genes)
#s.genes<-capitalize(s.genes)
#g2m.genes=cc.genes$g2m.genes
#g2m.genes<-tolower(g2m.genes)
#g2m.genes<-capitalize(g2m.genes)
#obj.combined <- CellCycleScoring(obj.combined, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
#pdf(file="3_cellcycle_score.pdf",width=8)
#obj.combined@meta.data %>% ggplot(aes(S.Score,G2M.Score))+geom_point(aes(color=Phase))+theme_minimal()
#dev.off()
##去除细胞周期影响
#obj.combined1 <- ScaleData(obj.combined, vars.to.regress = c("S.Score", "G2M.Score"), features = rownames(obj.combined))

cellcycle_score

细胞鉴定

鉴定marker基因

#sel.clust = "integrated_snn_res.0.2"
#obj.combined <- SetIdent(obj.combined, value = sel.clust)
#sample.markers <- FindAllMarkers(obj.combined, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
#top10 <- sample.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)

保存每个cluster top10的marker基因

#top10_table=unstack(top10, gene ~ cluster)
#names(top10_table)=gsub("X","cluster",names(top10_table))
#write.csv(file="4_top10_marker_genes.csv",top10_table,row.names=F)

细胞亚群注释

#new.cluster.ids <- c("Endo","T cell", "Mac","B cell",
# "Mac","T cell","Endo","DC", "Chol",
# "Hep", "DC", "Plasma B","B cell","DC","Fibro","HSC","Dividing")
#names(new.cluster.ids) <- levels(obj.combined)
#obj.combined <- RenameIdents(obj.combined, new.cluster.ids)
#table(obj.combined@active.ident)

细胞亚群注释后的UMAP

#pdf('4_cluster_annotations.pdf',width = 9,height = 7)
#DimPlot(obj.combined, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
#dev.off()

cluster_annotations

鉴定Marker基因

#T_cell = c("Cd3e","Cd3d","Cd3g")
#B_cell= c("Cd79a","Cd79b","Ms4a1")
#Plasma_B_cell= c("Igkc","Mzb1","Jchain")
#Macrophage_cell= c("C1qa","C1qb","Cd5l")
#Endothelial_cell= c("Pecam1","Vwf","Ehd3","Stab2")
#Hepatic_stellate_cell= c("Dcn","Cxcl12")
#Hepatocyte= c("Apoa2","Mup20","Apoc3")
#Cholangiocyte= c("Spp1","Tm4sf4","Sox9")
#Dendritic_cell= c("Irf8","Siglech","Cd74")
#Dividing=c("C3","Igfbp5","Igfbp6")
#Fibro=c("Top2a","Ccna2","Stmn1")
#gene_list = list(
# T_cell= T_cell,
#B_cell= B_cell,
# Plasma_B = Plasma_B_cell,
# Mac = Macrophage_cell,
#Endo = Endothelial_cell,
#HSC = Hepatic_stellate_cell,
# Hep = Hepatocyte,
# Chol = Cholangiocyte,
#DC = Dendritic_cell,
#Dividing = Dividing,
#Fibro = Fibro
#)
#genes_to_check=unique(unlist(gene_list))
#pdf('4_umap_markers.pdf',width = 9,height = 7)
#DotPlot(obj.combined, assay = "RNA", features = genes_to_check, cols = c("blue", "red"), split.by = "group") + coord_flip()+ RotatedAxis()
#dev.off()

umap_markers

两组细胞数目和比例对比

整体细胞数

#cell.num <- table(Idents(obj.combined))
#cell.freq <- round(prop.table(table(Idents(obj.combined)))*100,2)
#cell.combined <- rbind(cell.num, cell.freq)
#write.csv(file="5_combined_cell_counts_freq.csv",cell.combined)

分组计算细胞数目和比例

#cell.num.group <- table(Idents(obj.combined), obj.combined$group) 
#colnames(cell.num.group)<-paste0(colnames(cell.num.group),'_cell_counts')
#cell.freq.group <- round(prop.table(table(Idents(obj.combined), obj.combined$group), margin = 2) *100,2)
#colnames(cell.freq.group)<-paste0(colnames(cell.freq.group),'_cell_Freq')
#cell.group <- cbind(cell.num.group, cell.freq.group)
#write.csv(file="5_group_cell_counts_freq.csv",cell.group)

表格和UMAP展示细胞数目变化

#pdf('5_UMAP_multi_samples_split_anno.pdf',width = 20)
#p<-DimPlot(obj.combined, reduction = "umap", split.by = "group",label=T,repel=T) + NoLegend()
#tb <- tableGrob(cell.group)
#plot_grid(p, tb,ncol=2,rel_widths=c(0.6,0.4))
#dev.off() ##火山图展示 文献:Integrated single cell analysis of blood and cerebrospinal fluid leukocytes in multiple sclerosis

split_anno

两组间差异基因

将每个细胞的identity转换成celltype.group

#DefaultAssay(obj.combined) <- "RNA"
#obj.combined$celltype <- Idents(obj.combined)#保存每个细胞的细胞类型
#obj.combined$celltype.group <- paste(Idents(obj.combined), #obj.combined$group, sep = "_") ##细胞类型前面加上分组信息
#Idents(obj.combined) <- "celltype.group"

鉴定Mac和Endo细胞在Disease和Health组中差异表达的基因

#Mac.diff <- FindMarkers(obj.combined, ident.1 = "Mac_Disease", ident.2 = "Mac_Health", verbose = FALSE)
#write.csv(file="6_Diff_exp_genes_Mac.csv",Mac.diff)
#Endo.diff <- FindMarkers(obj.combined, ident.1 = "Endo_Disease", ident.2 = "Endo_Health", verbose = FALSE)
#write.csv(file="6_Diff_exp_genes_Mac.csv",Endo.diff)

提取出Mac细胞

#Idents(obj.combined) <- 'celltype'
#Mac <- subset(obj.combined, idents = "Mac")
#Idents(Mac) <- "group"
#avg.Mac <- log1p(AverageExpression(Mac, verbose = FALSE)$RNA)
#avg.Mac <- data.frame(avg.Mac ,gene=rownames(avg.Mac))

提取出Endo细胞

#Endo <- subset(obj.combined, idents = "Endo")
#Idents(Endo) <- "group"
#avg.Endo <- log1p(AverageExpression(Endo, verbose = FALSE)$RNA)
#avg.Endo <- data.frame(avg.Endo ,gene=rownames(avg.Endo))

火山图可视化差异基因

#low<-floor(range(Mac.diff$avg_log2FC)[1])
#high<-ceiling(range(Mac.diff$avg_log2FC)[2])
#pdf('6_DEG_Mac.pdf',width = 15)
#print(EnhancedVolcano(Mac.diff,
# title = 'Mac disease versus health',
# lab = rownames(Mac.diff),
# x = 'avg_log2FC',
# y = 'p_val_adj',
# FCcutoff = 1,
# xlim = c(low, high)))
#dev.off()

6_DEG_Mac

富集分析GO和KEGG

Mac细胞两组的差异基因GO富集

#ego_ALL <- enrichGO(gene        = row.names(Mac.diff),
# OrgDb = 'org.Mm.eg.db',
# keyType = 'SYMBOL',
# ont = "ALL",
# pAdjustMethod = "BH",
# pvalueCutoff = 0.01,
# qvalueCutoff = 0.05)
#ego_all <- data.frame(ego_ALL)
#write.csv(file="7_Mac_GO_result.csv",data.frame(ego_all))

GO中CC,MF,BP

#ego_CC <- enrichGO(gene          = row.names(Mac.diff),
# OrgDb = 'org.Mm.eg.db',
# keyType = 'SYMBOL',
# ont = "CC",
# pAdjustMethod = "BH",
# pvalueCutoff = 0.01,
# qvalueCutoff = 0.05)
#ego_MF <- enrichGO(gene = row.names(Mac.diff),
# OrgDb = 'org.Mm.eg.db',
# keyType = 'SYMBOL',
# ont = "MF",
# pAdjustMethod = "BH",
# pvalueCutoff = 0.01,
# qvalueCutoff = 0.05)
#ego_BP <- enrichGO(gene = row.names(Mac.diff),
# OrgDb = 'org.Mm.eg.db',
# keyType = 'SYMBOL',
# ont = "BP",
# pAdjustMethod = "BH",
# pvalueCutoff = 0.01,
# qvalueCutoff = 0.05)

可视化GO富集结果

#ego_CC@result$Description <- #substring(ego_CC@result$Description,1,70)
#ego_MF@result$Description <- #substring(ego_MF@result$Description,1,70)
#ego_BP@result$Description <- #substring(ego_BP@result$Description,1,70)##截取结果中Description的1-70个字符,方便图形显示
#pdf(file="7_Mac_GO_barplot.pdf",width=20,height = 20)
#p_BP <- barplot(ego_BP, showCategory = 10, label_format = 80) + ggtitle("barplot for Biological process")
#p_CC <- barplot(ego_CC, showCategory = 10,label_format = 80) + ggtitle("barplot for Cellular component")
#p_MF <- barplot(ego_MF, showCategory = 10,label_format = 80) + ggtitle("barplot for Molecular function")
#plotc <- p_BP/p_CC/p_MF
#plotc
#dev.off()
#pdf(file="7_Mac_GO_dot.pdf",width=20,height = 20)
#enrichplot::dotplot(ego_ALL,split="ONTOLOGY") + #facet_grid(ONTOLOGY~., scale="free")
#dev.off()

Mac_GO_barplot

Mac_GO_dot

KEGG富集

#genelist <- bitr(row.names(Mac.diff), fromType="SYMBOL",
# toType="ENTREZID", OrgDb='org.Mm.eg.db')
#genelist <- pull(genelist, ENTREZID)
#ekegg <- enrichKEGG(gene = genelist, organism = 'mmu')
#write.csv(data.frame(ekegg),'7_Mac_enrichKEGG.csv',row.names = F)

KEGG可视化

#pdf(file="7_Mac_KEGG.pdf",width=20,height = 15)
#p1 <- barplot(ekegg, showCategory=20,label_format=80)
#p2 <- enrichplot::dotplot(ekegg, showCategory=20,label_format=80)
#plotc = p1/p2
#plotc
#dev.off()

Mac_KEGG

GSEA和GSVA

GSEA 挑选两组Mac细胞的差异基因

#geneList= Mac.diff$avg_log2FC #制作基因list
#names(geneList)= toupper(rownames(Mac.diff))#name为基因名字
#geneList=sort(geneList,decreasing = T)#按FC从大到小排序
#head(geneList)

从GSEA官网下载GSEA分析需要的基因集 http://www.gsea-msigdb.org/gsea/index.jsp

下载免疫相关的基因集,c7: immunologic signature gene sets

#gmtfile ='c7.immunesigdb.v7.5.1.symbols.gmt'
#pathway<-read.gmt(gmtfile) #读取gmt文件中的pathway信息
#y <- GSEA(geneList,TERM2GENE =pathway) #进行GSEA分析
#write.csv(file="8_Mac_GSEA_result.csv",data.frame(y))

气泡图展示显著富集的前20条通路

#pdf(file="8_GSEA_dotplot.pdf",width=18)
#clusterProfiler::dotplot(y,showCategory=20,label_format=80)
#dev.off()

GSEA_dotplot

绘制具体通路的GSEA图

#pdf(file="8_TH1_VS_TH17_UP.pdf",width=18)
#gseaplot2(y,geneSetID = 'GSE26030_TH1_VS_TH17_RESTIMULATED_DAY5_POST_POLARIZATION_UP',pvalue_table=T)
#dev.off()

TH1_VS_TH17_UP

GSVA

#Idents(obj.combined) <- 'celltype.group'
#Idents(obj.combined) #获取细胞类型
#expr <- AverageExpression(obj.combined, assays = "RNA", slot = "data")[[1]]
#View(expr) #计算每个基因在每个细胞亚群中的平均表达值
#expr <- expr[rowSums(expr)>0,] #选取非零基因
#rownames(expr) <- toupper(rownames(expr)) #大写基因名
#expr <- as.matrix(expr) #转换成矩阵

从GSEA官网下载GSEA分析需要的基因集

#gmtfile ='c2.cp.kegg.v7.5.1.symbols.gmt'
#pathway<-read.gmt(gmtfile)[,c(2,1)]#读取gmt文件中的pathway信息
#genesets=unstack(pathway)#去堆叠,转换成list
#gsva.res <- gsva(expr, genesets, method="ssgsea") #进行GSVA分析
#gsva.df <- data.frame(Genesets=rownames(gsva.res), gsva.res, check.names = F)
#write.csv(gsva.df, "8_gsva_res.csv", row.names = F)

绘制热图可视化

#pdf(file="8_GSVA_heatmap.pdf",width=12,height=40)
#pheatmap(gsva.res, show_colnames = T, scale = "row",cellheight=14,fontsize_row=14)
#dev.off()

GSVA_heatmap

细胞通讯 CellphoneBD 和 CellChat

CellChat对象创建

挑选疾病组的所有细胞亚群

#Idents(obj.combined) <- "group"
#obj.combine <- subset(obj.combined,idents = "Disease")
#Idents(obj.combine) <- "celltype"
#cellchat <- createCellChat(obj.combine@assays$RNA@data)#获取表达矩阵
#obj.combine$cellType <- Idents(obj.combine)
#meta <- data.frame(cellType = obj.combine$cellType, row.names = Cells(obj.combine))##获取标签
#cellchat <- addMeta(cellchat, meta = meta, meta.name = "cellType")
##把metadata信息加到CellChat对象中,添加细胞标签
#cellchat <- setIdent(cellchat, ident.use = "cellType") #把细胞标签设置成默认的ID
#groupSize <- as.numeric(table(cellchat@idents))##统计每个细胞亚群中的细胞数目

选择数据库

#CellChatDB <- CellChatDB.mouse
#View(CellChatDB) ##CellChat提供了人和小鼠的配受体数据库
#unique(CellChatDB$interaction$annotation)##查看有哪些可以选择的子数据库
#CellChatDB.use <- subsetDB(CellChatDB, search = "Secreted Signaling") ##用Secreted Signaling来分析细胞通信
#cellchat@DB <- CellChatDB.use##将使用的数据库信息写入到cellchat对象中
#cellchat <- subsetData(cellchat) ##抽取信号通路相关基因的表达矩阵

对表达数据进行预处理,用于细胞间的通信分析

#cellchat <- identifyOverExpressedGenes(cellchat) 
#cellchat <- identifyOverExpressedInteractions(cellchat)
#cellchat <- projectData(cellchat, PPI.mouse)
##首先在一个细胞组中识别过表达的配体或受体,然后将基因表达数据投射到蛋白-蛋白相互作用(PPI)网络上。
##如果配体或受体过表达,则识别过表达配体和受体之间的相互作用。

相互作用推断

#cellchat <- computeCommunProb(cellchat)
##为每个相互作用分配一个概率值并进行置换检验来推断生物意义上的细胞-细胞通信
#cellchat <- computeCommunProbPathway(cellchat)
#cellchat <- aggregateNet(cellchat)
##通过计算与每个信号通路相关的所有配体-受体相互作用的通信概率来推断信号通路水平上的通信概率
##通过计算链路的数量或汇总通信概率来计算细胞间的聚合通信网络
#cellchat@netP$pathways
#levels(cellchat@idents)##显示重要通信的信号路径
##针对某一类群与其余细胞类群的通讯,或对比在疾病和正常组中细胞通讯变化情况

可视化CellChat结果

层次图

#vertex.receiver = seq(1,4) ##在层次绘图的时候,第一列显示的细胞类型的数目
#pathways.show <- "MIF"##显示的信号通路
#pdf(file="9_MIF_signaling_pathway_hierarchy.pdf",width=12)
#netVisual_aggregate(cellchat, signaling = pathways.show, layout = "hierarchy", vertex.receiver = vertex.receiver, vertex.weight = groupSize)
#dev.off()

MIF_signaling_pathway_hierarchy

绘制环状图

#pdf(file="9_MIF_signaling_pathway_circle.pdf",width=7)
#netVisual_aggregate(cellchat, signaling = pathways.show, layout = "circle", vertex.receiver = vertex.receiver, vertex.weight = groupSize)
#dev.off()

MIF_signaling_pathway_circle

绘制弦图

#pdf(file="9_MIF_signaling_pathway_chord.pdf",width=7)
#netVisual_aggregate(cellchat, signaling = pathways.show, layout = "chord", vertex.receiver = vertex.receiver, vertex.weight = groupSize)
#dev.off()

MIF_signaling_pathway_chord

识别细胞群的信号转导作用

#cellchat <- netAnalysis_computeCentrality(cellchat, slot.name = "netP") # the slot 'netP' means the inferred intercellular communication network of signaling pathways
#pdf(file="9_signalingRole_network.pdf",width=7,height=5)
#netAnalysis_signalingRole_network(cellchat, signaling = pathways.show, width = 8, height = 2.5, font.size = 10)
#dev.off() ##CellChat允许随时识别细胞间通信网络中的主要发送者、接收者、调解者和影响者。通过计算每个细胞群的网络中心性指标。

signalingRole_network

显示所有的显著的(L-R对)

#pdf(file="9_netVisual_bubble.pdf",width=12,height=12)
#netVisual_bubble(cellchat, sources.use = 1:9, targets.use = c(1:9), remove.isolate = FALSE) ##从'sources.use'定义的细胞亚群到'targets.use'定义的细胞亚群
#dev.off()

netVisual_bubble

拟时序分析 monocle2和monocle3

将seurat对象转换成monocle对象

挑选疾病组的所有细胞亚群进行拟时序分析

#scRNA.markers <- FindAllMarkers(obj.combine, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
#obj.combine$celltype1 <- Idents(obj.combine)
#data <- obj.combine[['RNA']]@data
#pd <- new('AnnotatedDataFrame', data = obj.combine@meta.data)
#fData <- data.frame(gene_short_name = row.names(data), row.names = row.names(data))
#fd <- new('AnnotatedDataFrame', data = fData)
#HSMM <- newCellDataSet(data,
# phenoData = pd,
# featureData = fd
# )

构建细胞发育轨迹

#ordering_genes <- obj.combine[["RNA"]]@var.features
#HSMM <- setOrderingFilter(HSMM, ordering_genes)
#HSMM <- estimateSizeFactors(HSMM)
#HSMM <- reduceDimension(HSMM,
# norm_method="none",
# reduction_method="DDRTree",
# max_components=3,
# scaling=TRUE,
# verbose=TRUE,
# pseudo_expr=0)#降维
#HSMM <- orderCells(HSMM) #将细胞摆放到轨迹上
##针对某一细胞类群进行发育分化的模拟,或分别比较疾病和健康组某类细胞类型发育偏好和基因表达的时间变化。

可视化拟时序分析

以state来展示

#pdf('10_trajectory_state.pdf')
#plot_cell_trajectory(HSMM)
#dev.off()

trajectory_state

以细胞类型来展示

#pdf('10_trajectory_celltype.pdf')
#plot_cell_trajectory(HSMM, color_by = 'celltype')
#dev.off()

trajectory_celltype

分别展示每一个细胞类型的发育轨迹

#pdf('10_trajectory_celltype_separate.pdf')
#plot_cell_trajectory(HSMM, color_by = 'celltype')+ facet_wrap(~celltype, nrow = 3) + NoLegend()
#dev.off()

trajectory_celltype_separate

查看特定的gene在发育轨迹上的表达情况

#pdf("10_5gene_trajectory.pdf",width=12)
#plot_cell_trajectory(HSMM,markers=c("Rps4x","Abca1","Fabp4","Hspa1b","Ctla2a"),use_color_gradient=T)
#dev.off()

比较拟时State1和State3的差异表达基因

#p_data <- subset(pData(HSMM),select='State')
#obj.combine.sub <- subset(obj.combine, cells=row.names(p_data))
#obj.combine.sub <- AddMetaData(obj.combine.sub,p_data,col.name = 'State')
#dge.State <- FindMarkers(obj.combine.sub, ident.1 = 1, ident.2 = 3, group.by = 'State')
#sig_dge.State <- subset(dge.State, p_val_adj<0.01&abs(avg_log2FC)>1)

热图

##heatmap
#top5=scRNA.markers %>% group_by(cluster) %>% top_n(n = 5, wt = avg_log2FC)
#sig_gene_names <- unique(top5$gene)
#pdf(file="10_monocle_heatmap.pdf")
#pseudotemporalplot<- plot_pseudotime_heatmap(HSMM[sig_gene_names,],
# num_clusters = 11, #亚群数需要对应修改
# cores = 4,
# hmcols = NULL,
# show_rownames = T,
# return_heatmap = T)
#pseudotemporalplot
#dev.off()

monocle_heatmap

基因调控网络 SCENIC 和DoRothEA

DoRothEA

获取包自带数据库

#dorothea_regulon_mouse <- get(data("dorothea_mm", package = "dorothea"))

推断regulons基于levelA,B,C

#regulon <- dorothea_regulon_mouse %>%
# dplyr::filter(confidence %in% c("A","B","C"))

viper得分矩阵获取

#obj.combine <- run_viper(obj.combine, regulon,
# options = list(method = "scale", minsize = 4,
# eset.filter = FALSE, cores = 1,
# verbose = FALSE))
#obj.combine@assays$dorothea@data[1:4,1:4]
#dim(obj.combine@assays$dorothea@data)

根据细胞亚群汇总

#viper_scores_df <-GetAssayData(obj.combine,slot="data",assay="dorothea")%>%data.frame(check.names = F) %>%t()
#CellsClusters <- data.frame(cell = names(Idents(obj.combine)),
# cell_type=as.character(Idents(obj.combine)),
# check.names = F)
#viper_scores_clusters <- viper_scores_df %>%
# data.frame() %>%
#rownames_to_column("cell") %>%
#gather(tf, activity, -cell) %>%
#inner_join(CellsClusters)
#summarized_viper_scores <- viper_scores_clusters %>%
# group_by(tf, cell_type) %>%
#summarise(avg = mean(activity),
# std = sd(activity))
#head(summarized_viper_scores)
### 各个单细胞亚群特异性的转录因子
#DefaultAssay(object = obj.combine) <- "dorothea"
#markers <- FindAllMarkers(object = obj.combine, only.pos = TRUE,min.pct = 0.25, thresh.use = 0.25)
#pro='dorothea-markers-for-mouse'
#write.csv(markers,file=paste0(pro,'_.markers.csv'))

挑选前20TFs可视化

#highly_variable_tfs <- summarized_viper_scores %>%
# group_by(tf) %>%
#mutate(var = var(avg)) %>%
#ungroup() %>%
#top_n(220, var) %>% #20*11
#distinct(tf)
#highly_variable_tfs

准备数据可视化

#summarized_viper_scores_df <- summarized_viper_scores %>%
# semi_join(highly_variable_tfs, by = "tf") %>%
#dplyr::select(-std) %>%
#spread(tf, avg) %>%
#data.frame(row.names = 1, check.names = FALSE)
#palette_length = 100
#my_color = colorRampPalette(c("Darkblue","white","red"))(palette_length)
#colnames(summarized_viper_scores_df)

pheatmap热图可视化

#my_breaks <- c(seq(min(summarized_viper_scores_df), 0, 
# length.out=ceiling(palette_length/2) + 1),
# seq(max(summarized_viper_scores_df)/palette_length,
# max(summarized_viper_scores_df),
# length.out=floor(palette_length/2)))
#pdf("11_DoRothEA_heatmap.pdf",width=12)
#pheatmap(t(summarized_viper_scores_df),fontsize=14,
# fontsize_row = 10,
# color=my_color, breaks = my_breaks,
# main = "DoRothEA (ABC)", angle_col = 45,
# treeheight_col = 0, border_color = NA)
#dev.off()

DoRothEA_heatmap

SCENIC

参考数据库和结果输出文件夹

##数据库下载https://resources.aertslab.org/cistarget/
#dir.create("SCENIC")
#dir.create("SCENIC/int")
#setwd("E:/ScRNA/fuxian/mouse/SCENIC")

准备细胞meta信息和表达矩阵(挑选疾病组的细胞亚群)

#cellInfo <- data.frame(obj.combine@meta.data)
#colnames(cellInfo)[which(colnames(cellInfo)=="orig.ident")] <- "sample"
#colnames(cellInfo)[which(colnames(cellInfo)=="integrated_snn_res.0.2")] <- "cluster"
#colnames(cellInfo)[which(colnames(cellInfo)=="celltype")] <- "celltype"
#cellInfo <- cellInfo[,c("sample","cluster","celltype")]
##随机抽取1000个细胞的数据
#subcell <- sample(colnames(obj.combine),1000)
#combine.sub <- obj.combine[,subcell]
#saveRDS(combine.sub, "combine.sub.rds")
##准备meta信息
#exprMat <- as.matrix(combine.sub@assays$RNA@counts) #准备表达矩阵

设置分析环境

#mydbs <- c("mm9-500bp-upstream-7species.mc9nr.feather",
# "mm9-tss-centered-10kb-7species.mc9nr.feather")
#names(mydbs) <- c("500bp", "10kb")
#scenicOptions <- initializeScenic(org="mgi",
# nCores=14,
# dbs = mydbs,
# dbDir="cisTarget_databases")
#saveRDS(scenicOptions, "int/scenicOptions.rds")

转录调控网络推断

#基因过滤
#genesKept <- geneFiltering(exprMat, scenicOptions,
# minCountsPerGene = 3 * 0.01 * ncol(exprMat),
# minSamples = ncol(exprMat) * 0.01)
#exprMat_filtered <- exprMat[genesKept, ]
##过滤标准是基因表达量之和>细胞数*3%,且在1%的细胞中表达

计算相关性矩阵

#runCorrelation(exprMat_filtered, scenicOptions)
#exprMat_filtered_log <- log2(exprMat_filtered+1) ##TF-Targets相关性分析

共表达模块和regulon推断

#runGenie3(exprMat_filtered_log, scenicOptions, nParts = 2)   ##这一步需要根据自己的处理器核心进行,时间比较长
#runSCENIC_1_coexNetwork2modules(scenicOptions)#推断共表达模块
#runSCENIC_2_createRegulons(scenicOptions,coexMethods = "top5perTarget")#推断转录调控网络(regulon)
##以上代码可增加参数coexMethod=c("w001", "w005", "top50", "top5perTarget", "top10perTarget", "top50perTarget"))
#默认6种方法的共表达网络都计算,可以少选几种方法以减少计算量

regulon活性评分与可视化

#exprMat_all <- as.matrix(obj.combine@assays$RNA@counts)
#exprMat_all <- log2(exprMat_all+1)
#runSCENIC_3_scoreCells(scenicOptions, exprMat=exprMat_all)

Cebpb (325g)_AUC

Cebpb (325g)_binaryAUC

Cebpb (325g)_expression

Cebpb (325g)_histogram

活性评分转换为二进制

## #使用shiny互动调整阈值
## aucellApp <- plotTsne_AUCellApp(scenicOptions, exprMat_all)
## savedSelections <- shiny::runApp(aucellApp)
## #保存调整后的阈值
## newThresholds <- savedSelections$thresholds
## scenicOptions@fileNames$int["aucell_thresholds",1] <- "int/newThresholds.Rds"
## saveRDS(newThresholds, file=getIntName(scenicOptions, "aucell_thresholds"))
## saveRDS(scenicOptions, file="int/scenicOptions.Rds")
#runSCENIC_4_aucell_binarize(scenicOptions, exprMat=exprMat_all)

Cebpb (325g)_AUC

Cebpb (325g)_binaryAUC

Cebpb (325g)_expression

Cebpb (325g)_expression

分析结果可视化

利用Seurat可视化AUC

##转录因子富集结果
##scenicOptions=readRDS(file="int/scenicOptions.Rds")
##motifEnrichment_selfMotifs_wGenes <- loadInt(scenicOptions, "motifEnrichment_selfMotifs_wGenes")
##as.data.frame(sort(table(motifEnrichment_selfMotifs_wGenes$highlightedTFs),decreasing = T)) 每个基因的motif数量
##可视化某个基因的motif序列特征
#tableSubset <- motifEnrichment_selfMotifs_wGenes[highlightedTFs=="Cebpb"]
#viewMotifs(tableSubset)
#AUCmatrix <- readRDS("int/3.4_regulonAUC.Rds")
#AUCmatrix <- AUCmatrix@assays@data@listData$AUC
#AUCmatrix <- data.frame(t(AUCmatrix), check.names=F)
#RegulonName_AUC <- colnames(AUCmatrix)
#RegulonName_AUC <- gsub(' \\(','_',RegulonName_AUC)
#RegulonName_AUC <- gsub('\\)','',RegulonName_AUC)
#colnames(AUCmatrix) <- RegulonName_AUC #整合原始regulonAUC矩阵
#obj.combine.auc <- AddMetaData(obj.combine, AUCmatrix)
#obj.combine.auc@assays$integrated <- NULL
#saveRDS(obj.combine.auc,'obj.combine.auc.rds')
##整合二进制regulonAUC矩阵
#BINmatrix <- readRDS("int/4.1_binaryRegulonActivity.Rds")
#BINmatrix <- data.frame(t(BINmatrix), check.names=F)
#RegulonName_BIN <- colnames(BINmatrix)
#RegulonName_BIN <- gsub(' \\(','_',RegulonName_BIN)
#RegulonName_BIN <- gsub('\\)','',RegulonName_BIN)
#colnames(BINmatrix) <- RegulonName_BIN
#obj.combine.bin <- AddMetaData(obj.combine, BINmatrix)
#obj.combine.bin@assays$integrated <- NULL
#saveRDS(obj.combine.bin, 'obj.combine.bin.rds')

FeaturePlot&idgePlot&VlnPlot

#pdf("Irf7_extended_323g.pdf",  width=14 ,height=4)
#p1 = FeaturePlot(obj.combine.auc, features='Irf7_extended_323g', label=T, reduction = 'umap')
#p2 = FeaturePlot(obj.combine.bin, features='Irf7_extended_323g', label=T, reduction = 'umap')
#p3 = DimPlot(obj.combine, reduction = 'umap', group.by = "celltype", label=T)
#plotc = p1|p2|p3
#plotc
#dev.off()
## FeaturePlot
#pdf(file="Ridge-Vln_Irf7_extended_323g.pdf",width=10,height=8)
#p1 = RidgePlot(obj.combine.auc, features = "Irf7_extended_323g", group.by="celltype") +
# theme(legend.position='none')
#p2 = VlnPlot(obj.combine.auc, features = "Irf7_extended_323g", pt.size = 0, group.by="celltype") +
# theme(legend.position='none')
#plotc = p1 + p2
#plotc
#dev.off()
##RidgePlot&VlnPlot

Irf7_extended_323g

Ridge-Vln_Irf7_extended_323g

利用pheatmap可视化

#celltype = subset(cellInfo,select = 'celltype')
#AUCmatrix <- t(AUCmatrix)
#BINmatrix <- t(BINmatrix)
#my.regulons <- c('Irf8_166','Cebpb_325g','Irf7_277g','Irf5_17g','Elk3_38g') #挑选部分感兴趣的regulons
#myAUCmatrix <- AUCmatrix[rownames(AUCmatrix)%in%my.regulons,]
#myBINmatrix <- BINmatrix[rownames(BINmatrix)%in%my.regulons,]
##使用regulon原始AUC值绘制热图
#pheatmap(myAUCmatrix, show_colnames=F, annotation_col=celltype,
# filename = 'myAUCmatrix_heatmap.pdf',
# width = 6, height = 5)
##使用regulon二进制AUC值绘制热图
#pheatmap(myBINmatrix, show_colnames=F, annotation_col=celltype,
# filename = 'myBINmatrix_heatmap.pdf',
# color = colorRampPalette(colors = c("white","black"))(100),
# width = 6, height = 5)

 


朴素的科研打工仔
专注于文献的分享,浙大研究生学习生活的记录。
 最新文章