Window上用PyCharm(python3.9.7) Python版本的Posit(Rstudio)
提取Seurat单细胞数据
# 矩阵信息
write.table(t(as.matrix(scRNA@assays$RNA@counts)),"scRNA_counts.csv",
sep="\t",quote=FALSE,row.names=T,col.names=T)
# 降维信息
embedding <- Embeddings(scRNA, "umap")
write.table(embedding,"scRNA_embedding.csv",
sep="\t",quote=FALSE,row.names=T,col.names=T)
# metadata
write.table(scRNA@meta.data,"metadata.csv",
sep="\t",quote=FALSE,row.names=T,col.names=T)
Cell2location安装(报错解决)
pip install cell2location
pip install pip install scvi-tools==0.17.4
vim /ifs/home/yusenwei/miniconda3/envs/velocyto/lib/python3.9/site-packages/pytorch_lightning/callbacks/progress/rich_progress.py
# 修改
from lightning_utilities.core.imports import compare_version
细胞类型-基因估计表达模型构建
import os
os.getcwd()
# 单细胞部分
import scanpy as sc
import scipy.sparse as sp
import pandas as pd
sc_adata = sc.read_text("../scRNA_counts.csv",delimiter='\t',first_column_names=True)
sc_adata.X = sp.csr_matrix(sc_adata.X)
metadata = pd.read_csv("../metadata.csv",sep='\t', header=0, index_col=0)
sc_adata.obs = metadata
r_embedding = pd.read_csv("../scRNA_embedding.csv",sep='\t', header=0, index_col=0)
sc_adata.obsm["X_umap"] = r_embedding.values
sc.pl.umap(sc_adata, color='celltype')
# 过滤线粒体基因
sc_adata.var['MT_gene'] = [gene.startswith('MT-') for gene in sc_adata.var.index]
sc_adata.obsm['MT'] = sc_adata[:, sc_adata.var['MT_gene'].values].X.toarray()
sc_adata = sc_adata[:, ~sc_adata.var['MT_gene'].values]
# 挑选基因
from cell2location.utils.filtering import filter_genes
selected = filter_genes(sc_adata, cell_count_cutoff=5, cell_percentage_cutoff2=0.15,nonz_mean_cutoff=3)
# 为回归模型准备 anndata 对象,估计参考细胞类型特征,计算每个基因在每个细胞类型的表达特征
import cell2location
cell2location.models.RegressionModel.setup_anndata(adata=sc_adata,
# cell type, covariate used for constructing signatures
labels_key='celltype',
batch_key='orig.ident'
)
# 创建回归模型
from cell2location.models import RegressionModel
mod = RegressionModel(sc_adata)
# 使用所有数据进行训练(尚未实施验证,train_size=1)
mod.train(max_epochs=500,train_size=1, lr=0.002)
mod.plot_history(20)
# 保存模型数据至单细胞数据
sc_adata = mod.export_posterior(
sc_adata, sample_kwargs={'num_samples': 1000,'batch_size': 2500}
)
# 质控结果,强对角线为优异
mod.plot_QC()
#
sc_adata = mod.export_posterior(
sc_adata, use_quantiles=True,
# choose quantiles
add_to_varm=["q05","q50", "q95", "q0001"],
sample_kwargs={'batch_size': 2500}
)
# 输出每个基因在每种细胞类型中的估计表达
if 'means_per_cluster_mu_fg' in sc_adata.varm.keys():
inf_aver = sc_adata.varm['means_per_cluster_mu_fg'][[f'means_per_cluster_mu_fg_{i}'
for i in sc_adata.uns['mod']['factor_names']]].copy()
else:
inf_aver = sc_adata.var[[f'means_per_cluster_mu_fg_{i}'
for i in sc_adata.uns['mod']['factor_names']]].copy()
inf_aver.columns = sc_adata.uns['mod']['factor_names']
inf_aver.iloc[0:5, 0:13]
解析空转spot细胞组成
# 空间转录组部分
from pathlib import Path
adata_dct = {}
for i in Path("../").glob("rawdata_st./*"):
_s = str(i).split('_')[2]
_a = sc.read_visium(i,library_id=_s)
_a.obs.index = [_s + "_" + bc[0:len(bc)] for bc in _a.obs.index.tolist()]
_a.var_names_make_unique()
adata_dct[_s] = _a
st_adata = sc.concat(adata_dct,label="sample",uns_merge="unique")
# 提取共享基因并准备anndata
intersect = np.intersect1d(st_adata.var_names, inf_aver.index)
st_adata = st_adata[:, intersect].copy()
inf_aver = inf_aver.loc[intersect, :].copy()
# 构建训练模型,注意 N_cells_per_location(每个spot的细胞数)和detection_alpha(试验受技术影响从成都)参数
mod = cell2location.models.Cell2location(
st_adata, cell_state_df=inf_aver,
# the expected average cell abundance: tissue-dependent
# hyper-prior which can be estimated from paired histology:
N_cells_per_location=30,
# hyperparameter controlling normalisation of
# within-experiment variation in RNA detection:
detection_alpha=20
)
# 训练模型
mod.train(max_epochs=30000,
# train using full data (batch_size=None)
batch_size=None,
# use all data points in training because
# we need to estimate cell abundance at all locations
train_size=1
)
mod.plot_history(200)
# 保存训练数据于空转对象中
st_adata = mod.export_posterior(
st_adata, sample_kwargs={'num_samples': 1000, 'batch_size': mod.adata.n_obs}
)
# 质控,强对角线为优
mod.plot_QC()
# 输出结果,每个spot的细胞组成情况
pd.DataFrame(st_adata.obsm['q05_cell_abundance_w_sf']).to_csv("../rawdata_st/st_cell2location_res.csv")
相关可视化
# 绘图-细胞丰度
st_adata.obs[st_adata.uns['mod']['factor_names']] = st_adata.obsm['q05_cell_abundance_w_sf']
from cell2location.utils import select_slide
import matplotlib as mpl
# 选取切片样本
slide = select_slide(st_adata, 'D3')
# slide = select_slide(st_adata, 'H1')
#
with mpl.rc_context({'axes.facecolor': 'black',
'figure.figsize': [4.5, 5]}):
sc.pl.spatial(slide, cmap='magma',
# show first 8 cell types
color=['Hep1','Hep2','Hep3','Hep4','LSEC','Cho'],
ncols=2, size=1.3,
img_key='hires',
# limit color scale at 99.2% quantile of cell abundance
vmin=0, vmax='p99.2'
)
clust_labels = ['Hep1','Hep2','Hep3','Hep4','LSEC','Cho']
clust_col = ['' + str(i) for i in clust_labels] # in case column names differ from labels
# 绘图-细胞叠加
fig = plot_spatial(
adata=slide,
# labels to show on a plot
color=clust_col, labels=clust_labels,
show_img=True,
# 'fast' (white background) or 'dark_background'
style='fast',
# limit color scale at 99.2% quantile of cell abundance
max_color_quantile=0.992,
# size of locations (adjust depending on figure size)
circle_diameter=6,
colorbar_position='right'
)
# 单细胞数据集亚群特征基因集
scRNA <- readRDS("../liver-spatial/rawdata_sc/scRNA.rds")
DefaultAssay(scRNA) <- "RNA"
sc.markers <- FindAllMarkers(scRNA, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
sc.markers = sc.markers %>% arrange(cluster,desc(avg_log2FC))
celltype_specific = sc.markers[,c("cluster","gene")]
colnames(celltype_specific)[1]="celltype"
# 空转数据集亚群特征基因集
stRNA <- readRDS("../liver-spatial/stRNA.rds")
st.markers <- FindAllMarkers(stRNA, only.pos = TRUE, min.pct = 0.2, logfc.threshold = 0.2)
st.markers = st.markers %>% arrange(cluster,desc(avg_log2FC))
region_specific = st.markers[,c("cluster","gene")]
colnames(region_specific)[1]="region"
区域细胞分布-MIA
MIA_sc_st<-function(sp.diff,sc.diff,sample,outdir){
PVALUE.CUTOFF = 1
QVALUE.CUTOFF = 1
MIN.GSSIZE = 1
MAX.GSSIZE = 10000
PADJUST.METHOD = "BH"
library(clusterProfiler)
library(ggplot2)
library(foreach)
library(ComplexHeatmap)
library(circlize)
enrich.list = list()
TERM2NAME = data.frame(term = sp.diff$region, name = sp.diff$region, row.names=NULL, stringsAsFactors=FALSE)
TERM2GENE = data.frame(sp.diff, row.names=NULL, stringsAsFactors=FALSE)
gmt.annot = list(term2gene = TERM2GENE, term2name = TERM2NAME)
clusters.list = unique(sc.diff$celltype)
for(each.cluster in clusters.list){
diff.gene = as.character(sc.diff[which(sc.diff$celltype == each.cluster),]$gene)
enrich = enricher(
gene = diff.gene,
TERM2GENE = gmt.annot$term2gene,
TERM2NAME = gmt.annot$term2name,
pAdjustMethod = PADJUST.METHOD,
pvalueCutoff = PVALUE.CUTOFF,
qvalueCutoff = QVALUE.CUTOFF,
minGSSize = MIN.GSSIZE,
maxGSSize = MAX.GSSIZE
)
enrich.res = enrich[,c(2,5,6)]
enrich.res['sc.celltpye'] = each.cluster
enrich.res['ES'] = -log(enrich.res$p.adjust)
enrich.list[[each.cluster]] = enrich.res
}
for (i in 1:length(enrich.list)){
data = rbind(data,enrich.list[[i]])
}
write.csv(data,file = paste(outdir , 'MIA.Result.csv',sep = '/'),quote = F,row.names = F)
all.terms = NULL
for(x in enrich.list){
if(class(x)!="logical"){
indices = which(!rownames(x) %in% names(all.terms))
new.terms = x$Description[indices]
names(new.terms) = rownames(x)[indices]
all.terms = c(all.terms, new.terms)
}
}
padj.df = foreach(x = enrich.list, .combine=rbind) %do% {
if(class(x)!="logical"){
padj = x[names(all.terms),"p.adjust"]
padj[is.na(padj)] = 1
}else{
padj = rep(1,length(all.terms))
}
names(padj) = names(all.terms)
return(padj)
}
rownames(padj.df) = names(enrich.list)
padj.df = t(padj.df)
min.qadj.terms = apply(padj.df, 1, min)
min.qadj.terms = sort(min.qadj.terms)
indices = which(min.qadj.terms<0.05 & 1:length(min.qadj.terms)<=20)
if(length(indices) == 0){
indices = 1:min(20,length(min.qadj.terms))
}
plot.terms = names(min.qadj.terms)[indices]
if(length(plot.terms) > 1){
plot.data = padj.df[plot.terms,]
plot.data = -log10(plot.data)
plot.data[plot.data>8] = 8
plot.data = plot.data[1:min(nrow(plot.data),20),]
ora.pal = colorRampPalette(c("#FFFFFF","#D96354", "#500019"))
dbname = 'MIA'
outdir = outdir
prefix = sample
ht = Heatmap(plot.data,
col = colorRamp2(breaks=0:8, colors=ora.pal(9)),
column_title="cluster",
column_title_side = "bottom",
rect_gp = gpar(col = "black"),
show_row_names=FALSE,
heatmap_legend_param = list(
title = "Pvalue adjust\n (-log10)",
title_position="leftcenter",
border=NULL,
at = seq(0,8,2),
labels = c(0,2,4,6,">8"),
legend_width = unit(0.5,"npc"),
legend_direction = "horizontal"))
width.genename = min(max(nchar(all.terms[plot.terms]))/5,10)
genename = rowAnnotation(pct = row_anno_text(all.terms[plot.terms], just = "left",
offset = unit(0, "npc"), gp = gpar(col="black",fontsize=9)), width = unit(width.genename,"cm"))
png(file.path(outdir, paste(prefix,".cluster.",dbname,".enrich.heatmap.png",sep="")),type="cairo-png",width=10*200,height=6*200,res=200)
draw(ht+genename, heatmap_legend_side = "bottom")
dev.off()
pdf(file.path(outdir, paste(prefix,".cluster.",dbname,".enrich.heatmap.pdf",sep="")),width=10,height=6)
draw(ht+genename, heatmap_legend_side = "bottom")
dev.off()
}
}
MIA_result=MIA_sc_st(region_specific,celltype_specific,"liver",'./')
# 结合cell2location和Seurat降维信息查看
cell2location <- read.csv("../liver-spatial/rawdata_st/st_cell2location_res.csv",row.names=1)
cell2location <- cell2location[colnames(stRNA),]
cell2location <- cell2location[colnames(stRNA),]
stRNA <- AddMetaData(stRNA,metadata = cell2location)
SpatialFeaturePlot(stRNA, features = "q05cell_abundance_w_sf_Hep3", pt.size.factor = 1,
crop = FALSE, alpha = c(0.1,1))
细胞组成降维(Seurat/scanpy)
# 利用cell2location的结果进行降维 (Seurat)
DefaultAssay(stRNA) <- "SCT"
stRNA@reductions$harmony@cell.embeddings <- as.matrix(cell2location)
stRNA <- RunUMAP(stRNA, reduction = "harmony", dims = 1:13)
stRNA <- FindNeighbors(stRNA, reduction = "harmony", dims = 1:13)
stRNA <- FindClusters(stRNA, verbose = T,resolution = 0.5)
DimPlot(stRNA, reduction = "umap", label = TRUE,group.by = 'SCT_snn_res.0.5',pt.size = 1)
SpatialDimPlot(stRNA,label = T)
# 利用cell2location的结果进行降维 (Scanpy)
st_adata_ = st_adata_[st_adata.obs.index, :]
st_adata_.obsm['q05_cell_abundance_w_sf'] = st_adata.obsm['q05_cell_abundance_w_sf']
sc.pp.neighbors(st_adata_, use_rep='q05_cell_abundance_w_sf')
sc.tl.leiden(st_adata_, resolution=0.5)
st_adata_.obs["region_cluster"] = st_adata_.obs["leiden"].astype("category")
sc.pl.umap(st_adata_, color=['region_cluster'], size=30,
color_map='magma', legend_loc='on data',
legend_fontsize=15)
fig, axs = plt.subplots(1, 5, figsize=(15, 10))
for i, library in enumerate(
['D1','D2',"H1","D3","H2"]
):
ad = st_adata_[st_adata_.obs.library_id == library, :].copy()
sc.pl.spatial(
ad,
img_key="hires",
library_id=library,
color=['region_cluster'],
size=1.5,
color_map='magma',
ax=axs[i],
)
plt.tight_layout()