【斑马鱼肾脏的单细胞数据集合】:脊椎动物的中性粒与巨噬细胞的分子特征,根据作者善心提供的代码,我们尝试复现一下文章~
加载包
library(Seurat)
# BiocManager::install(version = "3.19")
# BiocManager::install("org.Dr.eg.db")
library('org.Dr.eg.db')
# BiocManager::install("TxDb.Drerio.UCSC.danRer11.refGene")
library('TxDb.Drerio.UCSC.danRer11.refGene')
# BiocManager::install("BSgenome.Drerio.UCSC.danRer11")
library('BSgenome.Drerio.UCSC.danRer11')
# devtools::install_github("kudusch/ktools@main")
library(ktools)
library('readr')
library('AnnotationHub')
library('genekitr') # for gene name parsing
library("hdf5r")
library(dplyr)
library('glmGamPoi')
library('harmony')
library('cowplot')
library('ggplot2')
library(pheatmap)
Ensemble转换
gse100910 <- read.table("./rawdata/GSE100910_indrop_counts.txt.gz", header = TRUE, row.names = 1)
# 基因ID转换
ens_GSE100910 <- transId(rownames(gse100910), transTo = "ensembl", org = 'drerio', unique = TRUE)
# 筛选转换成功的基因行
GSE100910 <- gse100910[rownames(gse100910) %in% ens_GSE100910$input_id,]
names_GSE100910 <- ens_GSE100910$ensembl[match(rownames(GSE100910),ens_GSE100910$input_id)]
GSE100910_new <- data.frame(EnsId = names_GSE100910, GSE100910)
# dir.create("./tidydata")
write.table(GSE100910_new, "./tidydata/GSE100910-ENSG.txt",row.names = TRUE, sep=",")
数据merge
# GSE112438 ---------------------------------------------------------------
# 列出所有文件
files1 <- list.files("./rawdata/GSE112438/",pattern = "TD")
files2 <- list.files("./rawdata/GSE112438/",pattern = "WKM.*unenriched\\.txt\\.gz$")
files <- c(files1,files2)
# 使用循环读取数据并设置列名
data_list <- lapply(files, function(file) {
file_path <- paste0("./rawdata/GSE112438/", file)
sample_id <- strsplit(file, "_")[[1]][1]
data <- data.frame(read.table(file_path, sep = "\t", row.names = 1, header = TRUE))
colnames(data) <- paste0(sample_id, "_", trimws(colnames(data)))
return(data)
})
# 使用multimerge函数合并数据
gse112438_unenriched <- data.frame(multimerge(data_list))
# 解析行名
ids_to_parse_gse112438_unenriched <- unlist(strsplit(rownames(gse112438_unenriched), split='_', fixed = TRUE))
ids_gse112438_unenriched <- ids_to_parse_gse112438_unenriched[seq(1, length(ids_to_parse_gse112438_unenriched), 2)]
# 更新行名并处理缺失值
rownames(gse112438_unenriched) <- ids_gse112438_unenriched
gse112438_unenriched[is.na(gse112438_unenriched)] <- 0
write.table(gse112438_unenriched, "./tidydata/GSE112438_unenriched-ENSG.txt", row.names = TRUE, sep=",")
### hspcs
gsm3070124 <- read.table('./rawdata/GSE112438/GSM3070124_WKM10_hspcs.txt.gz', sep = "\t", row.names = 1, header = T)
colnames(gsm3070124) <- paste0("gsm3070124_", trimws(colnames(gsm3070124)))
gsm3070136 <- read.table('./rawdata/GSE112438/GSM3070136_WKM5_hspcs.txt.gz', sep = "\t", row.names = 1, header = T)
colnames(gsm3070136) <- paste0("gsm3070136_", trimws(colnames(gsm3070136)))
gse112438_hspc <- data.frame(multimerge( list (gsm3070124, gsm3070136 )))
ids_to_parse_gse112438_hspc <- unlist(strsplit(rownames(gse112438_hspc), split='_',fixed = T))
ids_gse112438_hspc <- ids_to_parse_gse112438_hspc[seq(1,length(ids_to_parse_gse112438_hspc),2)]
rownames(gse112438_hspc) <- ids_gse112438_hspc
gse112438_hspc[is.na(gse112438_hspc)] = 0
write.table(gse112438_hspc, "./tidydata/GSE112438_hspc-ENSG.txt", row.names = TRUE, sep=",")
### lymphocytes
gsm3070125 <- read.table('./rawdata/GSE112438/GSM3070125_WKM10_lymphocytes.txt.gz', sep = "\t", row.names = 1, header = T)
colnames(gsm3070125) <- paste0("gsm3070125_", trimws(colnames(gsm3070125)))
gsm3070137 <- read.table('./rawdata/GSE112438/GSM3070137_WKM5_lymphocytes.txt.gz', sep = "\t", row.names = 1, header = T)
colnames(gsm3070137) <- paste0("gsm3070137_", trimws(colnames(gsm3070137)))
gsm3070139 <- read.table('./rawdata/GSE112438/GSM3070139_WKM6_lymphocytes.txt.gz', sep = "\t", row.names = 1, header = T)
colnames(gsm3070139) <- paste0("gsm3070139_", trimws(colnames(gsm3070139)))
gse112438_lymphocytes <- data.frame(multimerge( list (gsm3070125, gsm3070137, gsm3070139 )))
ids_to_parse_gse112438_lymphocytes <- unlist(strsplit(rownames(gse112438_lymphocytes), split='_',fixed = T))
ids_gse112438_lymphocytes <- ids_to_parse_gse112438_lymphocytes[seq(1,length(ids_to_parse_gse112438_lymphocytes),2)]
rownames(gse112438_lymphocytes) <- ids_gse112438_lymphocytes
gse112438_lymphocytes[is.na(gse112438_lymphocytes)] = 0
write.table(gse112438_lymphocytes, "./tidydata/GSE112438_lymphocytes-ENSG.txt", row.names = TRUE, sep=",")
### eosinophils
gsm3070143 <- read.table('./rawdata/GSE112438/GSM3070143_WKM8_eosinophils_and_lymphocytes.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070128 <- read.table('./rawdata/GSE112438/GSM3070128_WKM2_classicalgate-eosinophils.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070131 <- read.table('./rawdata/GSE112438/GSM3070131_WKM3_classicalgate-eosinophils.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070129 <- read.table('./rawdata/GSE112438/GSM3070129_WKM2_eosinophils.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070132 <- read.table('./rawdata/GSE112438/GSM3070132_WKM3_eosinophils.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070134 <- read.table('./rawdata/GSE112438/GSM3070134_WKM4_eosinophils.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070146 <- read.table('./rawdata/GSE112438/GSM3070146_WKM9_eosinophils.txt.gz', sep = "\t", row.names = 1, header = T)
colnames(gsm3070143) <- paste0("gsm3070143_", trimws(colnames(gsm3070143)))
colnames(gsm3070128) <- paste0("gsm3070128_", trimws(colnames(gsm3070128)))
colnames(gsm3070131) <- paste0("gsm3070131_", trimws(colnames(gsm3070131)))
colnames(gsm3070129) <- paste0("gsm3070129_", trimws(colnames(gsm3070129)))
colnames(gsm3070132) <- paste0("gsm3070132_", trimws(colnames(gsm3070132)))
colnames(gsm3070134) <- paste0("gsm3070134_", trimws(colnames(gsm3070134)))
colnames(gsm3070146) <- paste0("gsm3070146_", trimws(colnames(gsm3070146)))
gse112438_eosinophils <- data.frame(multimerge( list ( gsm3070143, gsm3070128, gsm3070131, gsm3070129, gsm3070132, gsm3070134, gsm3070146 )))
ids_to_parse_gse112438_eosinophils <- unlist(strsplit(rownames(gse112438_eosinophils), split='_',fixed = T))
ids_gse112438_eosinophils <- ids_to_parse_gse112438_eosinophils[seq(1,length(ids_to_parse_gse112438_eosinophils),2)]
rownames(gse112438_eosinophils) <- ids_gse112438_eosinophils
gse112438_eosinophils[is.na(gse112438_eosinophils)] = 0
write.table(gse112438_eosinophils, "./tidydata/GSE112438_eosinophils-ENSG.txt", row.names = TRUE, sep=",")
### monocytes
gsm3070144 <- read.table('./rawdata/GSE112438/GSM3070144_WKM8_monocytes.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070147 <- read.table('./rawdata/GSE112438/GSM3070147_WKM9_monocytes.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070141 <- read.table('./rawdata/GSE112438/GSM3070141_WKM7_monocytes.txt.gz', sep = "\t", row.names = 1, header = T)
colnames(gsm3070144) <- paste0("gsm3070144_", trimws(colnames(gsm3070144)))
colnames(gsm3070147) <- paste0("gsm3070147_", trimws(colnames(gsm3070147)))
colnames(gsm3070141) <- paste0("gsm3070141_", trimws(colnames(gsm3070141)))
gse112438_monocytes <- data.frame(multimerge( list ( gsm3070144, gsm3070147, gsm3070141)))
ids_to_parse_gse112438_monocytes <- unlist(strsplit(rownames(gse112438_monocytes), split='_',fixed = T))
ids_gse112438_monocytes <- ids_to_parse_gse112438_monocytes[seq(1,length(ids_to_parse_gse112438_monocytes),2)]
rownames(gse112438_monocytes) <- ids_gse112438_monocytes
gse112438_monocytes[is.na(gse112438_monocytes)] = 0
write.table(gse112438_monocytes, "./tidydata/gse112438_monocytes-ENSG.txt", row.names = TRUE, sep=",")
使用multimerge函数合并数据:
multimerge <- function (mylist) {
## mimics a recursive merge or full outer join
unames <- unique(unlist(lapply(mylist, rownames)))
n <- length(unames)
out <- lapply(mylist, function(df) {
tmp <- matrix(nr = n, nc = ncol(df), dimnames = list(unames,colnames(df)))
tmp[rownames(df), ] <- as.matrix(df)
rm(df); gc()
return(tmp)
})
stopifnot( all( sapply(out, function(x) identical(rownames(x), unames)) ) )
bigout <- do.call(cbind, out)
colnames(bigout) <- paste(rep(names(mylist), sapply(mylist, ncol)), unlist(sapply(mylist, colnames)), sep = "_")
return(bigout)
}
创建Seurat对象
GSE112438_eosinophils[is.na(GSE112438_eosinophils)] = 0
GSE112438_hspc[is.na(GSE112438_hspc)] = 0
GSE112438_lymphocytes[is.na(GSE112438_lymphocytes)] = 0
GSE112438_unenriched[is.na(GSE112438_unenriched)] = 0
GSE112438_eosinophils <- CreateSeuratObject(GSE112438_eosinophils, project = "GSE112438_eosinophils", min.cells = 10, min.features = 200)
GSE112438_hspc <- CreateSeuratObject(GSE112438_hspc, project = "GSE112438_hspc", min.cells = 10, min.features = 200)
GSE112438_lymphocytes <- CreateSeuratObject(GSE112438_lymphocytes, project = "GSE112438_lymphocytes", min.cells = 10, min.features = 200)
GSE112438_unenriched <- CreateSeuratObject(GSE112438_unenriched, project = "GSE112438_unenriched", min.cells = 10, min.features = 200)
整合
sc_all_datasets <- c(GSE112438_eosinophils, GSE112438_hspc, GSE112438_lymphocytes, GSE112438_unenriched)
integration_features <- SelectIntegrationFeatures(object.list = sc_all_datasets, nfeatures = 2500)
sc_all_datasets <- PrepSCTIntegration(object.list = sc_all_datasets, anchor.features = integration_features)
sc_gsm_anchors <- FindIntegrationAnchors(object.list = sc_all_datasets, normalization.method = "SCT", anchor.features = integration_features, reduction = "rpca")
sc_gsm_integrated <- IntegrateData(anchorset = sc_gsm_anchors, normalization.method = "SCT", new.assay.name = "seurat.integration")
DefaultAssay(sc_gsm_integrated) <- "seurat.integration"