正当其时的单细胞转录组_02.学习Seurat的基本使用
❝早在2019年的时候,正是scRNAseq单细胞转录组学冉冉升起的时代。
那时候单细胞转录组学的热点,CNS的Paper如雨后春笋般的发发发。
并且生信技能树的创始人--曾健明大佬开创了全网第一个单细胞教程,一切的片段和学习经历仍在眼前。
❝时至今日,scRNAseq虽说是热度已过,但是仍不过时,因此这一系列专题我将其命名为正当其时的单细胞转录组,旨在利用当前单细胞数据海量迸发的时刻,捡起来拾到拾到,记录一下自己的学习过程,实现自己转向医学的想法。同时也提醒自己,学习任何技能(包括单细胞转录组),永远都不会晚!
❝至今为止,
Seurat
是R
语言进行单细胞分析无可替代的最佳工具!本期推文,简单的介绍了Seurat4.3
版本的使用方法。基于此推文,可以大体了解单细胞转录组的分析流程。
加载R包
rm(list = ls())
####----load R Package----####
library(Seurat)
library(scCustomize)
library(tidyverse)
library(scRNAtoolVis)
library(patchwork)
library(Nebulosa)
library(ggrepel)
step1.读入数据
#####-----step1.读入数据-----#####
scdata <- Read10X(data.dir = "Input")
class(scdata)
# [1] "dgCMatrix"
# attr(,"package")
# [1] "Matrix"
# 一个稀疏矩阵
dim(scdata)
# 查看基因和细胞个数
step2:创建Seurat对象
#####-----step2:创建Seurat对象-----#####
scobj <- CreateSeuratObject(counts = scdata,
project = "PBMC",
min.cells = 3,
min.features = 200
)
class(scobj)
# [1] "Assay"
# attr(,"package")
# [1] "SeuratObject"
# 这是一个SeuratObject的数据类型
# 我们可以使用@和$等符号,查看scobj中存储的对象
scobj@assays$RNA@counts
step3:数据的质量控
#####-----step3:数据的质量控制-----#####
# 计算含有特定特征(这里面是"MT")开头的基因,占据所有计数的百分比
scobj[["percent.mt"]] <- PercentageFeatureSet(scobj, pattern = "^MT-")
VlnPlot(scobj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
ggsave(filename = "./Output/VlnPlot.pdf",
height = 5,
width = 10)
# 筛选细胞
scobj <- subset(scobj, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
dim(scobj)
metadata <- scobj@meta.data
step4:数据的标准化
#####-----step4:数据的标准化-----#####
scobj <- NormalizeData(scobj, normalization.method = "LogNormalize", scale.factor = 10000)
step5:筛选高变基因
#####-----step5:筛选高变基因-----#####
scobj <- FindVariableFeatures(scobj, selection.method = "vst", nfeatures = 2000)
# 可视化高变基因
top10 <- head(VariableFeatures(scobj), 10)
# 使用VariableFeaturePlot进行画图
plot1 <- VariableFeaturePlot(scobj)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2
ggsave(filename = "./Output/VariableFeaturePlot.pdf",
height = 5,
width = 12)
# 使用ggplot2进行可视化
data_scobj <- scobj@assays$RNA@meta.features
# 其实这就是一个tidydata的格式
p1 <- ggplot(data = data_scobj, aes(vst.mean, vst.variance.standardized)) +
geom_point(aes(color = vst.variable)) +
scale_x_log10() +
scale_color_manual(values = c("black", "red")) +
labs(x = "Average Expression", y = "Standardized Variance") +
theme_classic() +
theme(
axis.text = element_text(color = "#000000", size = 12)
)
p2 <- ggplot(data = data_scobj, aes(vst.mean, vst.variance.standardized)) +
geom_point(aes(color = vst.variable)) +
geom_text_repel(data = data_scobj %>% dplyr::filter(rownames(.) %in% top10),
aes(label = top10),
color = "black") +
scale_x_log10() +
scale_color_manual(values = c("black", "red")) +
labs(x = "Average Expression", y = "Standardized Variance") +
theme_classic() +
theme(
axis.text = element_text(color = "#000000", size = 12)
)
p_combine <- p1 + p2
p_combine
ggsave(filename = "./Output/VariableFeaturePlot_ggplot2_version.pdf",
height = 5,
width = 12)
step6:缩放数据
#####-----step6:缩放数据-----#####
# 为数据降维度做准备
scobj <- ScaleData(scobj, features = rownames(scobj))
# 缩放后的数据
scale.data <- scobj@assays$RNA@scale.data
# 缩放后的数据非常的大, 缩放的最终效果是,基因的平均值为0,方差为1
class(scale.data)
# "matrix" "array"
step7:PCA线性降维
#####-----step7:PCA线性降维-----#####
scobj <- RunPCA(scobj, features = VariableFeatures(object = scobj))
# 这里指出了,只对高变基因的基因进行PCA线性降维
# 查看一下PCA降维的结果
DimPlot(scobj, reduction = "pca")
# 我们也可以基于ggplot2进行绘制
pcadata <- scobj@reductions$pca@cell.embeddings
ggplot(data = pcadata, aes(x = PC_1, y = PC_2)) +
geom_point(color = "red", alpha = 0.4)
ggsave(filename = "Output/pca_ggplot2.pdf",
height = 5,
width = 6)
# 我们也可以选择合适的PCA降维的水平
ElbowPlot(scobj)
ggsave(filename = "Output/pca_select.pdf",
height = 5,
width = 6)
step8:UMAP非线性降维
#####-----step8:UMAP非线性降维-----#####
scobj <- RunUMAP(scobj, dims = 1:10)
# 这里默认调用了PCA的结果,所以要指出使用了那些PCA的维度
DimPlot(scobj, reduction = "umap")
# 同样,我们也可以使用ggplot2手动进行绘制
umapdata <- scobj@reductions$umap@cell.embeddings
ggplot(data = umapdata, aes(x = UMAP_1, y = UMAP_2)) +
geom_point(color = "red", alpha = 0.4)
ggsave(filename = "Output/umap_ggplot2.pdf",
height = 5,
width = 6)
step9:聚类分群
#####-----step9:聚类分群-----#####
# 默认是基于UMAP的结果进行聚类分群的
# 先找邻居
scobj <- FindNeighbors(scobj, dims = 1:10)
# 当然,我们也可以基于PCA以及其维度的数据进行聚类分群
# scobj <- FindNeighbors(scobj, reduction = "pca", dims = 1:10)
# 然后再分群
scobj <- FindClusters(scobj, resolution = 0.5)
# 然后看看结果
DimPlot(scobj, reduction = "umap", label = T)
ggsave(filename = "Output/Neighbors_Cluster.pdf",
height = 5,
width = 6)
step10:分群注释
#####-----step10:分群注释-----#####
# 主要是基于marker基因,然后进行分群
# Seurat提供了一个函数可以自己分析marker基因
all_markers <- FindAllMarkers(object = scobj)
# 对marker基因进行可视化
FeaturePlot(scobj, features = c("CD3E", "CCR7"), order = T, ncol = 2)
ggsave(filename = "Output/Marker_gene_FeaturePlot.pdf",
height = 6,
width = 12)
plot_density(scobj,features = c("CD3E", "CCR7"))+plot_layout(ncol = 2)
ggsave(filename = "Output/Marker_gene_NebulosaPlot.pdf",
height = 6,
width = 12)
step11:最终的分群注释结
#####-----step11:最终的分群注释结果-----#####
# 1.确认分群的个数
table(Idents(scobj))
Idents(scobj) <- scobj@meta.data$seurat_clusters
# 2. 给每个分群添加注释
scobj <- RenameIdents(scobj,
"0"="Naive CD4+ T",
"1"="CD14+ Mono",
"2"="Memory CD4+",
"3"= "B cell",
"4"= "CD8+ T",
"5"= "FCGR3A+ Mono",
"6"= "NK",
"7"= "DC",
"8"= "Platelet"
)
# 可视化分群结果
DimPlot(scobj, label = F)
DimPlot(scobj, label = T)
ggsave(filename = "Output/scobj_cluster_annotation.pdf",
height = 7,
width = 8)
step12:细胞分群的可视化
#####-----step12:细胞分群的可视化-----#####
DimPlot_scCustom(scobj, figure_plot = TRUE)
ggsave(filename = "Output/scobj_cluster_annotation_2.pdf",
height = 7,
width = 8)
top5_markers <- all_markers %>%
group_by(cluster) %>%
arrange(desc(avg_log2FC))%>%
slice(1:5) %>%
ungroup() %>%
pull(gene) %>%
unique()
# 热图展示
pdf(file = "Output/marker_heatmap.pdf",
height = 8,
width = 7)
AverageHeatmap(object = scobj,
markerGene = top5_markers)
dev.off()
版本信息
####----sessionInfo----####
sessionInfo()
R version 4.3.0 (2023-04-21)
Platform: x86_64-apple-darwin20 (64-bit)
Running under: macOS 15.0.1
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Asia/Shanghai
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggrepel_0.9.6 Nebulosa_1.0.1 patchwork_1.2.0.9000 scRNAtoolVis_0.0.7 lubridate_1.9.3 forcats_1.0.0
[7] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
[13] ggplot2_3.5.1 tidyverse_2.0.0 scCustomize_1.1.3 SeuratObject_4.1.3 Seurat_4.3.0
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.21 splines_4.3.0 later_1.3.1 prismatic_1.1.1
[5] bitops_1.0-7 R.oo_1.25.0 polyclip_1.10-7 janitor_2.2.0
[9] lifecycle_1.0.4 CVST_0.2-3 doParallel_1.0.17 globals_0.16.2
[13] lattice_0.22-5 MASS_7.3-60 magrittr_2.0.3 limma_3.58.1
[17] plotly_4.10.3 httpuv_1.6.12 sctransform_0.4.1 sp_2.1-2
[21] spatstat.sparse_3.0-3 reticulate_1.34.0 cowplot_1.1.3 pbapply_1.7-2
[25] RColorBrewer_1.1-3 abind_1.4-5 zlibbioc_1.48.0 Rtsne_0.16
[29] GenomicRanges_1.54.1 R.utils_2.12.3 BiocGenerics_0.48.1 RCurl_1.98-1.13
[33] pracma_2.4.4 circlize_0.4.15 GenomeInfoDbData_1.2.11 IRanges_2.36.0
[37] S4Vectors_0.40.2 irlba_2.3.5.1 listenv_0.9.0 spatstat.utils_3.0-4
[41] goftest_1.2-3 spatstat.random_3.2-1 fitdistrplus_1.1-11 parallelly_1.36.0
[45] leiden_0.4.3.1 codetools_0.2-19 DelayedArray_0.28.0 tidyselect_1.2.1
[49] shape_1.4.6 farver_2.1.2 matrixStats_1.1.0 stats4_4.3.0
[53] spatstat.explore_3.2-5 jsonlite_1.8.9 GetoptLong_1.0.5 ks_1.14.1
[57] ellipsis_0.3.2 progressr_0.14.0 iterators_1.0.14 ggridges_0.5.4
[61] survival_3.5-7 systemfonts_1.1.0 foreach_1.5.2 tools_4.3.0
[65] ragg_1.2.6 ica_1.0-3 Rcpp_1.0.13 glue_1.7.0
[69] gridExtra_2.3 SparseArray_1.2.2 dimRed_0.2.6 MatrixGenerics_1.14.0
[73] GenomeInfoDb_1.38.1 withr_3.0.1 fastmap_1.2.0 fansi_1.0.6
[77] digest_0.6.36 timechange_0.2.0 R6_2.5.1 mime_0.12
[81] textshaping_0.3.7 ggprism_1.0.4 colorspace_2.1-1 Cairo_1.6-1
[85] scattermore_1.2 tensor_1.5 spatstat.data_3.0-3 R.methodsS3_1.8.2
[89] utf8_1.2.4 generics_0.1.3 ggsci_3.0.0 data.table_1.16.0
[93] httr_1.4.7 htmlwidgets_1.6.3 S4Arrays_1.2.0 uwot_0.1.16
[97] pkgconfig_2.0.3 gtable_0.3.5 ComplexHeatmap_2.18.0 lmtest_0.9-40
[101] SingleCellExperiment_1.24.0 XVector_0.42.0 htmltools_0.5.7 clue_0.3-65
[105] scales_1.3.0 Biobase_2.62.0 png_0.1-8 snakecase_0.11.1
[109] ggdendro_0.1.23 rstudioapi_0.15.0 rjson_0.2.21 tzdb_0.4.0
[113] reshape2_1.4.4 nlme_3.1-163 zoo_1.8-12 GlobalOptions_0.1.2
[117] KernSmooth_2.23-22 parallel_4.3.0 miniUI_0.1.1.1 vipor_0.4.5
[121] ggrastr_1.0.2 pillar_1.9.0 grid_4.3.0 vctrs_0.6.5
[125] RANN_2.6.1 promises_1.2.1 xtable_1.8-4 cluster_2.1.6
[129] DRR_0.0.4 beeswarm_0.4.0 paletteer_1.5.0 magick_2.8.1
[133] mvtnorm_1.2-3 cli_3.6.3 compiler_4.3.0 rlang_1.1.4
[137] crayon_1.5.2 future.apply_1.11.0 labeling_0.4.3 mclust_6.1.1
[141] rematch2_2.1.2 plyr_1.8.9 ggbeeswarm_0.7.2 stringi_1.8.3
[145] viridisLite_0.4.2 deldir_2.0-2 munsell_0.5.1 lazyeval_0.2.2
[149] spatstat.geom_3.2-7 Matrix_1.6-5 hms_1.1.3 future_1.33.0
[153] statmod_1.5.0 shiny_1.8.0 SummarizedExperiment_1.32.0 kernlab_0.9-32
[157] ROCR_1.0-11 igraph_2.0.3
历史绘图合集
进化树合集
环状图
散点图
基因家族合集
换一个排布方式:
首先查看基础版热图:
然后再看进阶版热图:
基因组共线性
WGCNA ggplot2版本
其他科研绘图
合作、联系和交流
有很多小伙伴在后台私信作者,非常抱歉,我经常看不到导致错过,请添加下面的微信联系作者,一起交流数据分析和可视化。