【斑马鱼肾脏】多个单细胞数据整合分析(一)

文摘   2024-11-06 12:31   北京  

【斑马鱼肾脏的单细胞数据集合】:脊椎动物的中性粒与巨噬细胞的分子特征,根据作者善心提供的代码,我们尝试复现一下文章~


加载包

library(Seurat)
# BiocManager::install(version = "3.19")
# BiocManager::install("org.Dr.eg.db")
library('org.Dr.eg.db')
# BiocManager::install("TxDb.Drerio.UCSC.danRer11.refGene")
library('TxDb.Drerio.UCSC.danRer11.refGene')
# BiocManager::install("BSgenome.Drerio.UCSC.danRer11")
library('BSgenome.Drerio.UCSC.danRer11')
# devtools::install_github("kudusch/ktools@main")
library(ktools)
library('readr')
library('AnnotationHub')
library('genekitr'# for gene name parsing
library("hdf5r")
library(dplyr)
library('glmGamPoi')
library('harmony')
library('cowplot')
library('ggplot2')
library(pheatmap)

Ensemble转换

gse100910 <- read.table("./rawdata/GSE100910_indrop_counts.txt.gz", header = TRUE, row.names = 1)

# 基因ID转换
ens_GSE100910 <- transId(rownames(gse100910), transTo = "ensembl", org = 'drerio', unique = TRUE)

# 筛选转换成功的基因行
GSE100910 <- gse100910[rownames(gse100910) %in% ens_GSE100910$input_id,]

names_GSE100910 <- ens_GSE100910$ensembl[match(rownames(GSE100910),ens_GSE100910$input_id)]

GSE100910_new <- data.frame(EnsId = names_GSE100910, GSE100910)


# dir.create("./tidydata")
write.table(GSE100910_new, "./tidydata/GSE100910-ENSG.txt",row.names = TRUE, sep=",")

数据merge

# GSE112438 ---------------------------------------------------------------
# 列出所有文件
files1 <- list.files("./rawdata/GSE112438/",pattern = "TD")
files2 <- list.files("./rawdata/GSE112438/",pattern = "WKM.*unenriched\\.txt\\.gz$")
files <- c(files1,files2)

# 使用循环读取数据并设置列名
data_list <- lapply(files, function(file) {
  file_path <- paste0("./rawdata/GSE112438/", file)
  sample_id <- strsplit(file, "_")[[1]][1]
  data <- data.frame(read.table(file_path, sep = "\t", row.names = 1, header = TRUE))
  colnames(data) <- paste0(sample_id, "_", trimws(colnames(data)))
  return(data)
})

# 使用multimerge函数合并数据
gse112438_unenriched <- data.frame(multimerge(data_list))

# 解析行名
ids_to_parse_gse112438_unenriched <- unlist(strsplit(rownames(gse112438_unenriched), split='_', fixed = TRUE))
ids_gse112438_unenriched <- ids_to_parse_gse112438_unenriched[seq(1length(ids_to_parse_gse112438_unenriched), 2)]

# 更新行名并处理缺失值
rownames(gse112438_unenriched) <- ids_gse112438_unenriched
gse112438_unenriched[is.na(gse112438_unenriched)] <- 0

write.table(gse112438_unenriched, "./tidydata/GSE112438_unenriched-ENSG.txt", row.names = TRUE, sep=",")


### hspcs
gsm3070124 <- read.table('./rawdata/GSE112438/GSM3070124_WKM10_hspcs.txt.gz', sep = "\t", row.names = 1, header = T)
colnames(gsm3070124) <- paste0("gsm3070124_", trimws(colnames(gsm3070124)))

gsm3070136 <- read.table('./rawdata/GSE112438/GSM3070136_WKM5_hspcs.txt.gz', sep = "\t", row.names = 1, header = T)
colnames(gsm3070136) <- paste0("gsm3070136_", trimws(colnames(gsm3070136)))


gse112438_hspc <- data.frame(multimerge( list (gsm3070124, gsm3070136 )))
ids_to_parse_gse112438_hspc <- unlist(strsplit(rownames(gse112438_hspc), split='_',fixed = T))
ids_gse112438_hspc <- ids_to_parse_gse112438_hspc[seq(1,length(ids_to_parse_gse112438_hspc),2)]
rownames(gse112438_hspc) <- ids_gse112438_hspc
gse112438_hspc[is.na(gse112438_hspc)] = 0

write.table(gse112438_hspc, "./tidydata/GSE112438_hspc-ENSG.txt", row.names = TRUE, sep=",")


### lymphocytes
gsm3070125 <- read.table('./rawdata/GSE112438/GSM3070125_WKM10_lymphocytes.txt.gz', sep = "\t", row.names = 1, header = T)
colnames(gsm3070125) <- paste0("gsm3070125_", trimws(colnames(gsm3070125)))

gsm3070137 <- read.table('./rawdata/GSE112438/GSM3070137_WKM5_lymphocytes.txt.gz', sep = "\t", row.names = 1, header = T)
colnames(gsm3070137) <- paste0("gsm3070137_", trimws(colnames(gsm3070137)))

gsm3070139 <- read.table('./rawdata/GSE112438/GSM3070139_WKM6_lymphocytes.txt.gz', sep = "\t", row.names = 1, header = T)
colnames(gsm3070139) <- paste0("gsm3070139_", trimws(colnames(gsm3070139)))

gse112438_lymphocytes <- data.frame(multimerge( list (gsm3070125, gsm3070137, gsm3070139 )))
ids_to_parse_gse112438_lymphocytes <- unlist(strsplit(rownames(gse112438_lymphocytes), split='_',fixed = T))
ids_gse112438_lymphocytes <- ids_to_parse_gse112438_lymphocytes[seq(1,length(ids_to_parse_gse112438_lymphocytes),2)]
rownames(gse112438_lymphocytes) <- ids_gse112438_lymphocytes
gse112438_lymphocytes[is.na(gse112438_lymphocytes)] = 0

write.table(gse112438_lymphocytes, "./tidydata/GSE112438_lymphocytes-ENSG.txt", row.names = TRUE, sep=",")



### eosinophils
gsm3070143 <- read.table('./rawdata/GSE112438/GSM3070143_WKM8_eosinophils_and_lymphocytes.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070128 <- read.table('./rawdata/GSE112438/GSM3070128_WKM2_classicalgate-eosinophils.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070131 <- read.table('./rawdata/GSE112438/GSM3070131_WKM3_classicalgate-eosinophils.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070129 <- read.table('./rawdata/GSE112438/GSM3070129_WKM2_eosinophils.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070132 <- read.table('./rawdata/GSE112438/GSM3070132_WKM3_eosinophils.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070134 <- read.table('./rawdata/GSE112438/GSM3070134_WKM4_eosinophils.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070146 <- read.table('./rawdata/GSE112438/GSM3070146_WKM9_eosinophils.txt.gz', sep = "\t", row.names = 1, header = T)

colnames(gsm3070143) <- paste0("gsm3070143_", trimws(colnames(gsm3070143)))
colnames(gsm3070128) <- paste0("gsm3070128_", trimws(colnames(gsm3070128)))
colnames(gsm3070131) <- paste0("gsm3070131_", trimws(colnames(gsm3070131)))
colnames(gsm3070129) <- paste0("gsm3070129_", trimws(colnames(gsm3070129)))
colnames(gsm3070132) <- paste0("gsm3070132_", trimws(colnames(gsm3070132)))
colnames(gsm3070134) <- paste0("gsm3070134_", trimws(colnames(gsm3070134)))
colnames(gsm3070146) <- paste0("gsm3070146_", trimws(colnames(gsm3070146)))


gse112438_eosinophils <- data.frame(multimerge( list ( gsm3070143, gsm3070128, gsm3070131, gsm3070129, gsm3070132, gsm3070134, gsm3070146 )))

ids_to_parse_gse112438_eosinophils <- unlist(strsplit(rownames(gse112438_eosinophils), split='_',fixed = T))
ids_gse112438_eosinophils <- ids_to_parse_gse112438_eosinophils[seq(1,length(ids_to_parse_gse112438_eosinophils),2)]
rownames(gse112438_eosinophils) <- ids_gse112438_eosinophils
gse112438_eosinophils[is.na(gse112438_eosinophils)] = 0

write.table(gse112438_eosinophils, "./tidydata/GSE112438_eosinophils-ENSG.txt", row.names = TRUE, sep=",")

### monocytes
gsm3070144 <- read.table('./rawdata/GSE112438/GSM3070144_WKM8_monocytes.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070147 <- read.table('./rawdata/GSE112438/GSM3070147_WKM9_monocytes.txt.gz', sep = "\t", row.names = 1, header = T)
gsm3070141 <- read.table('./rawdata/GSE112438/GSM3070141_WKM7_monocytes.txt.gz', sep = "\t", row.names = 1, header = T)

colnames(gsm3070144) <- paste0("gsm3070144_", trimws(colnames(gsm3070144)))
colnames(gsm3070147) <- paste0("gsm3070147_", trimws(colnames(gsm3070147)))
colnames(gsm3070141) <- paste0("gsm3070141_", trimws(colnames(gsm3070141)))

gse112438_monocytes <- data.frame(multimerge( list ( gsm3070144, gsm3070147, gsm3070141)))

ids_to_parse_gse112438_monocytes <- unlist(strsplit(rownames(gse112438_monocytes), split='_',fixed = T))
ids_gse112438_monocytes <- ids_to_parse_gse112438_monocytes[seq(1,length(ids_to_parse_gse112438_monocytes),2)]
rownames(gse112438_monocytes) <- ids_gse112438_monocytes
gse112438_monocytes[is.na(gse112438_monocytes)] = 0

write.table(gse112438_monocytes, "./tidydata/gse112438_monocytes-ENSG.txt", row.names = TRUE, sep=",")

使用multimerge函数合并数据:

multimerge <- function (mylist) {
  ## mimics a recursive merge or full outer join
 
  unames <- unique(unlist(lapply(mylist, rownames)))
 
  n <- length(unames)
 
  out <- lapply(mylist, function(df) {
 
    tmp <- matrix(nr = n, nc = ncol(df), dimnames = list(unames,colnames(df)))
    tmp[rownames(df), ] <- as.matrix(df)
    rm(df); gc()
 
    return(tmp)
  })
 
  stopifnot( all( sapply(out, function(x) identical(rownames(x), unames)) ) )
 
  bigout <- do.call(cbind, out)
  colnames(bigout) <- paste(rep(names(mylist), sapply(mylist, ncol)), unlist(sapply(mylist, colnames)), sep = "_")
  return(bigout)
}

创建Seurat对象

GSE112438_eosinophils[is.na(GSE112438_eosinophils)] = 0
GSE112438_hspc[is.na(GSE112438_hspc)] = 0
GSE112438_lymphocytes[is.na(GSE112438_lymphocytes)] = 0
GSE112438_unenriched[is.na(GSE112438_unenriched)] = 0
GSE112438_eosinophils <- CreateSeuratObject(GSE112438_eosinophils, project = "GSE112438_eosinophils", min.cells = 10, min.features = 200)
GSE112438_hspc <- CreateSeuratObject(GSE112438_hspc, project = "GSE112438_hspc", min.cells = 10, min.features = 200)
GSE112438_lymphocytes <- CreateSeuratObject(GSE112438_lymphocytes, project = "GSE112438_lymphocytes", min.cells = 10, min.features = 200)
GSE112438_unenriched <- CreateSeuratObject(GSE112438_unenriched, project = "GSE112438_unenriched", min.cells = 10, min.features = 200)

整合

sc_all_datasets <- c(GSE112438_eosinophils, GSE112438_hspc, GSE112438_lymphocytes, GSE112438_unenriched)
integration_features <- SelectIntegrationFeatures(object.list = sc_all_datasets, nfeatures = 2500)
sc_all_datasets <- PrepSCTIntegration(object.list = sc_all_datasets, anchor.features = integration_features)
sc_gsm_anchors <- FindIntegrationAnchors(object.list = sc_all_datasets, normalization.method = "SCT", anchor.features = integration_features, reduction = "rpca")
sc_gsm_integrated <- IntegrateData(anchorset = sc_gsm_anchors, normalization.method = "SCT", new.assay.name = "seurat.integration")
DefaultAssay(sc_gsm_integrated) <- "seurat.integration"


生信菜鸟团
生信菜鸟团荣誉归来,让所有想分析生物信息学数据的小伙伴找到归属,你值得拥有!
 最新文章