作者用了SCT来整合24个数据集,建议大家换成harmony,否则电脑根本承受不来~
因为斑马鱼的基因格式和我们常用的人或鼠不太一样,所以过滤这一步显得麻烦一些:
list_mitochondrial = c("ENSDARG00000063895", "ENSDARG00000063899", "ENSDARG00000063905", "ENSDARG00000063908", "ENSDARG00000063910", "ENSDARG00000063911", "ENSDARG00000063912", "ENSDARG00000063914", "ENSDARG00000063916", "ENSDARG00000063917", "ENSDARG00000063921", "ENSDARG00000063921", "ENSDARG00000063922", "ENSDARG00000063924")
for (i in 1:length(sc_all_datasets)) {
try(sc_all_datasets[[i]][["percent.mt"]] <- PercentageFeatureSet(sc_all_datasets[[i]], features = list_mitochondrial))
}
for (i in 1:length(sc_all_datasets)) {
print(sum(sc_all_datasets[[i]]@meta.data[["percent.mt"]]))
}
ids_list <- list()
for (i in 1:length(sc_all_datasets)) {
ids <- rownames(sc_all_datasets[[i]])
ids_list <- append(ids_list, list(ids))
}
length(Reduce(intersect,ids_list))
#common genes - 6904
sum_cells = 0
for (i in 1:length(sc_all_datasets)) {
cell_num <- length(colnames(sc_all_datasets[[i]]))
sum_cells = sum_cells + cell_num
}
#251442 cells before filtering
for (i in 1:length(sc_all_datasets)) {
sc_all_datasets[[i]] <- subset(sc_all_datasets[[i]], subset = nFeature_RNA > 200 )
}
#250206 cells after filtering
接下来就是harmony整合:
rm(list = ls())
source("step0_lib.R")
library(future)
# options(future.globals.maxSize = 1024 * 1024 * 1024) # 设置为1GB
# 或者设置为2GB
options(future.globals.maxSize = 16 * 1024^3) # 将最大内存限制增加到5GB
#harmony
load("./tidydata/backup-before-seurat-integr-11-datasets.RData")
sc_gsm_merge <- merge(sc_all_datasets[[1]], c(sc_all_datasets[2:length(sc_all_datasets)]))
如果为了方便,接下来当然直接可以和Jimmy老师的常规单细胞代码衔接起来:
rm(sc_all_datasets)
# SCTransform 的内存占用过高,尝试其他内存较低的整合方法
# sc_gsm_merge <- SCTransform(sc_gsm_merge, method = "glmGamPoi", verbose = T)
# DefaultAssay(sc_gsm_merge) <- "SCT"
# from jimmy
sc_gsm_merge <- NormalizeData(sc_gsm_merge,
normalization.method = "LogNormalize",
scale.factor = 1e4)
sc_gsm_merge <- FindVariableFeatures(sc_gsm_merge)
sc_gsm_merge <- ScaleData(sc_gsm_merge)
sc_gsm_merge <- RunPCA(sc_gsm_merge, features = VariableFeatures(object = sc_gsm_merge))
p1 <- DimPlot(sc_gsm_merge, reduction = "pca", group.by = "orig.ident", pt.size = 0.1)
p2 <- VlnPlot(sc_gsm_merge, features = "PC_1", group.by = "orig.ident", pt.size = 0.1)
dir.create("./Figures_out")
plot_grid(p1, p2, ncol = 2, rel_widths = c(1, 1.5)) %>% ggsave(filename = paste("./Figures_out/", "2_sc_gsm_merge_check_PCA.png", sep=""), width = 500, height = 150, units = "mm")
sc_gsm_merge <- RunHarmony(sc_gsm_merge, group.by.vars = "orig.ident")
p1 <- DimPlot(sc_gsm_merge, reduction = "harmony", group.by = "orig.ident", pt.size = 0.1)
p2 <- VlnPlot(sc_gsm_merge, features = "harmony_1", group.by = "orig.ident", pt.size = 0.1)
plot_grid(p1, p2, ncol = 2, rel_widths = c(1, 1.5)) %>% ggsave(filename = paste("./Figures_out/", "2_sc_gsm_integrated_check_harmony.png", sep=""), width = 500, height = 150, units = "mm")
figure_dir <- "./Figures_out/"
ElbowPlot(sc_gsm_merge, ndims = 30, reduction = "harmony") %>% ggsave(filename = paste(figure_dir, "2_sc_gsm_integrated_check_harmony_dims_ElbowPlot.png", sep=""), width = 150, height = 150, units = "mm")
sc_gsm_merge <- RunUMAP(sc_gsm_merge, reduction = "harmony", dims = 1:20)
p1 <- DimPlot(sc_gsm_merge, reduction = "umap", group.by = "orig.ident", label = F, pt.size = 0.1)
p1[[1]]$layers[[1]]$aes_params$alpha = 0.2
p1 %>% ggsave(filename = paste(figure_dir, "2_sc_gsm_integrated_check_UMAP_harmony.png", sep=""), width = 150, height = 130, units = "mm") # display the profile of the integrated dataset
# Save pheno.ident, stage.ident and sex.ident
p1 <- DimPlot(sc_gsm_merge, reduction = "umap", group.by = "orig.ident", label = F, pt.size = 0.1)
p1[[1]]$layers[[1]]$aes_params$alpha = 0.2
p1 %>% ggsave(filename = paste(figure_dir, "2_sc_gsm_integrated_check_harmony_OrigIdent.png", sep=""), width = 150, height = 130, units = "mm")
p1 <- DimPlot(sc_gsm_merge, reduction = "umap", group.by = "batch.ident", label = F, pt.size = 0.1)
p1[[1]]$layers[[1]]$aes_params$alpha = 0.2
p1 %>% ggsave(filename = paste(figure_dir, "2_sc_gsm_integrated_check_harmony_BatchIdent.png", sep=""), width = 150, height = 130, units = "mm")
saveRDS(sc_gsm_merge,file = "./tidydata/harmonied_sc_gsm_merge.rds")
细胞聚类
rm(list = ls())
source("step0_lib.R")
sc_gsm_merge <- readRDS("./tidydata/harmonied_sc_gsm_merge.rds")
sc_gsm_merge <- FindNeighbors(sc_gsm_merge, reduction = "pca", dims = 1:30)
sc_gsm_merge <- FindClusters(sc_gsm_merge, resolution = 1, verbose = T)
p1 <- DimPlot(sc_gsm_merge, reduction = "umap", group.by = "seurat_clusters", label = T, repel = T, pt.size = 0.1, raster=FALSE)
p2 <- DimPlot(sc_gsm_merge, reduction = "umap", group.by = "orig.ident", pt.size = 0.1, raster=FALSE)
plot_grid(p1, p2, ncol = 2, rel_widths = c(1, 1.15)) %>% ggsave(filename = paste("./Figures_out/", "3_gsm_integrated_harmonyClusters.png", sep=""), width = 15, height = 13, dpi = 300)
saveRDS(sc_gsm_merge, paste("./tidydata/", "sc_gse_after_harmony.rds", sep = ""))
# CHECK CLUSTER COMPOSITION
sc_gse_merge <- sc_gsm_merge
# Check how much samples are represented in each of the clusters
cluster_count <- as.data.frame(table(subset(sc_gse_merge, seurat_clusters == as.character(0))@meta.data$orig.ident)) # set the structure
names(cluster_count)[1] <- "GSM_ID"
names(cluster_count)[2] <- "ToRemove"
cluster_count$ToRemove <- 0
for (i in levels(sc_gse_merge$seurat_clusters)) {
print(paste("cluster ", i, sep = ""))
tmp_df <- as.data.frame(table(subset(sc_gse_merge, seurat_clusters == as.character(i))@meta.data$orig.ident))
names(tmp_df)[1] <- "GSM_ID"
names(tmp_df)[2] <- as.character(paste("Cluster_", i, sep =""))
cluster_count <- merge(cluster_count, tmp_df, all.x = T, all.y = T)
}
rownames(cluster_count) <- as.character(cluster_count$GSM_ID)
cluster_count <- cluster_count[, 3:length(cluster_count)]
for (i in 1:length(cluster_count)) {
for (j in 1:length(cluster_count[[i]])) {
if (is.na(cluster_count[j, i]) == T) {
cluster_count[j, i] <- 0
}
}
}
library(pheatmap)
library(RColorBrewer)
# Create the annotations needed for the heatmap (which needs dataframe structures for annotations)
tmp_df <- data.frame(table(sc_gse_merge$orig.ident, sc_gse_merge$batch.ident))
lines_to_manage <- c()
for (i in 1:length(rownames(tmp_df))) {
if (tmp_df$Freq[i] == 0) {
lines_to_manage <- c(lines_to_manage, i)
}
}
tmp_df <- tmp_df[-lines_to_manage,]
rownames(tmp_df) <- tmp_df[,2]
tmp_df <- tmp_df[, -1]
colnames(tmp_df) <- c("Batch", "Cell.Count")
# Then draw the pheatmap
# You should save the heatmap from RStudio | 1000 x 535
p1 <- pheatmap(cluster_count,
cluster_rows = F,
cluster_cols = F,
show_rownames = TRUE,
color = c("white", "yellow", "green"),
breaks = c(0, 100, 1000, 10000), # distances 0 to 3 are red, 3 to 9 black
main = 'cells distribution')
p1
write.table(cluster_count,file = "./tidydata/cluster_count.csv") # export "cluster_count" as a table
到了这一步,后续的细胞注释也就简单多了,但是鉴于斑马鱼物种的特殊性,还是会在接下来的几周里陆续更完代码,感兴趣的小伙伴可以一起交流~