单细胞和空间转录组联合分析-Cell2location+MIA

文摘   科学   2024-03-17 02:02   浙江  
数据来源
Spatial proteogenomics reveals distinct and evolutionarily conserved hepatic macrophage niches,Cell,2022
Single-Cell, Single-Nucleus, and Spatial RNA Sequencing of the Human Liver Identifies Cholangiocyte and Mesenchymal Heterogeneity,Hepatology communications,2021

Window上用PyCharm(python3.9.7) Python版本的Posit(Rstudio)

提取Seurat单细胞数据

# 矩阵信息
write.table(t(as.matrix(scRNA@assays$RNA@counts)),"scRNA_counts.csv",
            sep="\t",quote=FALSE,row.names=T,col.names=T)
# 降维信息
embedding <- Embeddings(scRNA, "umap")
write.table(embedding,"scRNA_embedding.csv",
            sep="\t",quote=FALSE,row.names=T,col.names=T)
# metadata
write.table(scRNA@meta.data,"metadata.csv",
            sep="\t",quote=FALSE,row.names=T,col.names=T)

Cell2location安装(报错解决)

pip install cell2location
pip install pip install scvi-tools==0.17.4 
vim /ifs/home/yusenwei/miniconda3/envs/velocyto/lib/python3.9/site-packages/pytorch_lightning/callbacks/progress/rich_progress.py
# 修改
from lightning_utilities.core.imports import compare_version

细胞类型-基因估计表达模型构建

import os
os.getcwd()
# 单细胞部分
import scanpy as sc
import scipy.sparse as sp
import pandas as pd
sc_adata = sc.read_text("../scRNA_counts.csv",delimiter='\t',first_column_names=True)
sc_adata.X = sp.csr_matrix(sc_adata.X)
metadata = pd.read_csv("../metadata.csv",sep='\t', header=0, index_col=0)
sc_adata.obs = metadata
r_embedding = pd.read_csv("../scRNA_embedding.csv",sep='\t', header=0, index_col=0)
sc_adata.obsm["X_umap"] = r_embedding.values
sc.pl.umap(sc_adata, color='celltype')
# 过滤线粒体基因
sc_adata.var['MT_gene'] = [gene.startswith('MT-'for gene in sc_adata.var.index]
sc_adata.obsm['MT'] = sc_adata[:, sc_adata.var['MT_gene'].values].X.toarray()
sc_adata = sc_adata[:, ~sc_adata.var['MT_gene'].values]
# 挑选基因
from cell2location.utils.filtering import filter_genes
selected = filter_genes(sc_adata, cell_count_cutoff=5, cell_percentage_cutoff2=0.15,nonz_mean_cutoff=3)
# 为回归模型准备 anndata 对象,估计参考细胞类型特征,计算每个基因在每个细胞类型的表达特征
import cell2location
cell2location.models.RegressionModel.setup_anndata(adata=sc_adata,
                        # cell type, covariate used for constructing signatures
                        labels_key='celltype',
                        batch_key='orig.ident'
                       )
# 创建回归模型
from cell2location.models import RegressionModel
mod = RegressionModel(sc_adata)
# 使用所有数据进行训练(尚未实施验证,train_size=1
mod.train(max_epochs=500,train_size=1, lr=0.002)
mod.plot_history(20)
# 保存模型数据至单细胞数据
sc_adata = mod.export_posterior(
    sc_adata, sample_kwargs={'num_samples'1000,'batch_size'2500}
)
# 质控结果,强对角线为优异
mod.plot_QC()

sc_adata = mod.export_posterior(
    sc_adata, use_quantiles=True,
    # choose quantiles
    add_to_varm=["q05","q50""q95""q0001"],
    sample_kwargs={'batch_size'2500}
)
# 输出每个基因在每种细胞类型中的估计表达
if 'means_per_cluster_mu_fg' in sc_adata.varm.keys():
    inf_aver = sc_adata.varm['means_per_cluster_mu_fg'][[f'means_per_cluster_mu_fg_{i}'
                                    for i in sc_adata.uns['mod']['factor_names']]].copy()
else:
    inf_aver = sc_adata.var[[f'means_per_cluster_mu_fg_{i}'
                                    for i in sc_adata.uns['mod']['factor_names']]].copy()
inf_aver.columns = sc_adata.uns['mod']['factor_names']
inf_aver.iloc[0:50:13]

解析空转spot细胞组成

# 空间转录组部分
from pathlib import Path
adata_dct = {}
for i in Path("../").glob("rawdata_st./*"):
  _s 
= str(i).split('_')[2]
  _a = sc.read_visium(i,library_id=_s)
  _a.obs.index = [_s + "_" + bc[0:len(bc)] for bc in _a.obs.index.tolist()]
  _a.var_names_make_unique()
  adata_dct[_s] = _a
st_adata = sc.concat(adata_dct,label="sample",uns_merge="unique")
# 提取共享基因并准备anndata
intersect = np.intersect1d(st_adata.var_names, inf_aver.index)
st_adata = st_adata[:, intersect].copy()
inf_aver = inf_aver.loc[intersect, :].copy()
# 构建训练模型,注意 N_cells_per_location(每个spot的细胞数)和detection_alpha(试验受技术影响从成都)参数
mod = cell2location.models.Cell2location(
    st_adata, cell_state_df=inf_aver,
    # the expected average cell abundance: tissue-dependent
    # hyper-prior which can be estimated from paired histology:
    N_cells_per_location=30,
    # hyperparameter controlling normalisation of
    # within-experiment variation in RNA detection:
    detection_alpha=20
)
# 训练模型
mod.train(max_epochs=30000
          # train using full data (batch_size=None)
          batch_size
=None,
          # use all data points in training because
          # we need to estimate cell abundance at all locations
          train_size=1
         )
mod.plot_history(200)
# 保存训练数据于空转对象中
st_adata = mod.export_posterior(
    st_adata, sample_kwargs={'num_samples'1000'batch_size': mod.adata.n_obs}
)
# 质控,强对角线为优
mod.plot_QC()
# 输出结果,每个spot的细胞组成情况
pd.DataFrame(st_adata.obsm['q05_cell_abundance_w_sf']).to_csv("../rawdata_st/st_cell2location_res.csv")

相关可视化

# 绘图-细胞丰度
st_adata.obs[st_adata.uns['mod']['factor_names']] = st_adata.obsm['q05_cell_abundance_w_sf']
from cell2location.utils import select_slide
import matplotlib as mpl
# 选取切片样本
slide = select_slide(st_adata, 'D3')
# slide = select_slide(st_adata, 'H1')

with mpl.rc_context({'axes.facecolor':  'black',
                     'figure.figsize': [4.55]}):

    sc.pl.spatial(slide, cmap='magma',
                  # show first 8 cell types
                  color=['Hep1','Hep2','Hep3','Hep4','LSEC','Cho'],
                  ncols=2, size=1.3,
                  img_key='hires',
                  # limit color scale at 99.2% quantile of cell abundance
                  vmin=0, vmax='p99.2'
                 )
clust_labels = ['Hep1','Hep2','Hep3','Hep4','LSEC','Cho']
clust_col = ['' + str(i) for i in clust_labels] # in case column names differ from labels
# 绘图-细胞叠加
 fig = plot_spatial(
        adata=slide,
        # labels to show on a plot
        color=clust_col, labels=clust_labels,
        show_img=True,
        # 'fast' (white background) or 'dark_background'
        style='fast',
        # limit color scale at 99.2% quantile of cell abundance
        max_color_quantile=0.992,
        # size of locations (adjust depending on figure size)
        circle_diameter
=6,
        colorbar_position='right'
    )
# 单细胞数据集亚群特征基因集
scRNA <- readRDS("../liver-spatial/rawdata_sc/scRNA.rds")
DefaultAssay(scRNA) <- "RNA"
sc.markers <- FindAllMarkers(scRNA, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
sc.markers = sc.markers %>% arrange(cluster,desc(avg_log2FC))
celltype_specific = sc.markers[,c("cluster","gene")]
colnames(celltype_specific)[1]="celltype"
# 空转数据集亚群特征基因集
stRNA <- readRDS("../liver-spatial/stRNA.rds")
st.markers <- FindAllMarkers(stRNA, only.pos = TRUE, min.pct = 0.2, logfc.threshold = 0.2)
st.markers = st.markers %>% arrange(cluster,desc(avg_log2FC))
region_specific = st.markers[,c("cluster","gene")]
colnames(region_specific)[1]="region"

区域细胞分布-MIA

MIA_sc_st<-function(sp.diff,sc.diff,sample,outdir){
  PVALUE.CUTOFF = 1
  QVALUE.CUTOFF = 1
  MIN.GSSIZE = 1
  MAX.GSSIZE = 10000
  PADJUST.METHOD = "BH"
  
  library(clusterProfiler)
  library(ggplot2)
  library(foreach)
  library(ComplexHeatmap)
  library(circlize)
  
  enrich.list = list()
  
  TERM2NAME = data.frame(term = sp.diff$region, name = sp.diff$region, row.names=NULL, stringsAsFactors=FALSE)
  TERM2GENE = data.frame(sp.diff, row.names=NULL, stringsAsFactors=FALSE)
  gmt.annot = list(term2gene = TERM2GENE, term2name = TERM2NAME)
  clusters.list = unique(sc.diff$celltype)
  for(each.cluster in clusters.list){
    diff.gene = as.character(sc.diff[which(sc.diff$celltype == each.cluster),]$gene)
    enrich = enricher(
      gene = diff.gene,
      TERM2GENE = gmt.annot$term2gene,
      TERM2NAME = gmt.annot$term2name,
      pAdjustMethod = PADJUST.METHOD,
      pvalueCutoff = PVALUE.CUTOFF,
      qvalueCutoff = QVALUE.CUTOFF,
      minGSSize = MIN.GSSIZE,
      maxGSSize = MAX.GSSIZE
    )
    enrich.res = enrich[,c(2,5,6)]
    enrich.res['sc.celltpye'] = each.cluster
    enrich.res['ES'] = -log(enrich.res$p.adjust)
    enrich.list[[each.cluster]] = enrich.res
  }
  
  for (i in 1:length(enrich.list)){
    data = rbind(data,enrich.list[[i]])
  }
  
  write.csv(data,file = paste(outdir , 'MIA.Result.csv',sep = '/'),quote = F,row.names = F)

  all.terms = NULL
  for(x in enrich.list)
{
    if(class(x)!="logical"){
      indices = which(!rownames(x) %in% names(all.terms))
      new.terms = x$Description[indices]
      names(new.terms) = rownames(x)[indices]
      all.terms = c(all.terms, new.terms)
    }
  }
  
  padj.df = foreach(x = enrich.list, .combine=rbind) %do% {
    if(class(x)!="logical"){
      padj = x[names(all.terms),"p.adjust"]
      padj[is.na(padj)] = 1
    }else{
      padj = rep(1,length(all.terms))
    }
    names(padj) = names(all.terms)
    return(padj)
  }
  rownames(padj.df) = names(enrich.list)
  padj.df = t(padj.df)
  
  min.qadj.terms = apply(padj.df, 1, min)
  min.qadj.terms = sort(min.qadj.terms)
  indices = which(min.qadj.terms<0.05 & 1:length(min.qadj.terms)<=20)
  if(length(indices) == 0){
    indices = 1:min(20,length(min.qadj.terms))
  }
  plot.terms = names(min.qadj.terms)[indices]
  
  if(length(plot.terms) > 1){
    
    plot.data = padj.df[plot.terms,]
    plot.data = -log10(plot.data)
    
    plot.data[plot.data>8] = 8
    plot.data = plot.data[1:min(nrow(plot.data),20),]
    
    ora.pal = colorRampPalette(c("#FFFFFF","#D96354""#500019"))
    
    dbname = 'MIA'
    outdir = outdir
    prefix = sample
    
    ht = Heatmap(plot.data, 
                 col = colorRamp2(breaks=0:8, colors=ora.pal(9)),
                 column_title="cluster",
                 column_title_side = "bottom",
                 rect_gp = gpar(col = "black"),
                 show_row_names=FALSE,
                 heatmap_legend_param = list(
                   title = "Pvalue adjust\n   (-log10)",
                   title_position="leftcenter",
                   border=NULL,
                   at = seq(0,8,2),
                   labels = c(0,2,4,6,">8"),
                   legend_width = unit(0.5,"npc"),
                   legend_direction = "horizontal"))
    
    width.genename = min(max(nchar(all.terms[plot.terms]))/5,10)
    genename = rowAnnotation(pct = row_anno_text(all.terms[plot.terms], just = "left"
                                                 offset = unit(0"npc"), gp = gpar(col="black",fontsize=9)), width = unit(width.genename,"cm"))
    png(file.path(outdir, paste(prefix,".cluster.",dbname,".enrich.heatmap.png",sep="")),type="cairo-png",width=10*200,height=6*200,res=200)
    draw(ht+genename, heatmap_legend_side = "bottom")
    dev.off()
    pdf(file.path(outdir, paste(prefix,".cluster.",dbname,".enrich.heatmap.pdf",sep="")),width=10,height=6)
    draw(ht+genename, heatmap_legend_side = "bottom")
    dev.off()
  }
}
MIA_result=MIA_sc_st(region_specific,celltype_specific,"liver",'./')
# 结合cell2location和Seurat降维信息查看
cell2location <- read.csv("../liver-spatial/rawdata_st/st_cell2location_res.csv",row.names=1)
cell2location <- cell2location[colnames(stRNA),]
cell2location <- cell2location[colnames(stRNA),]
stRNA <- AddMetaData(stRNA,metadata = cell2location)
SpatialFeaturePlot(stRNA, features = "q05cell_abundance_w_sf_Hep3", pt.size.factor = 1,
                   crop = FALSE, alpha = c(0.1,1))

细胞组成降维(Seurat/scanpy)

# 利用cell2location的结果进行降维 (Seurat)
DefaultAssay(stRNA) <- "SCT"
stRNA@reductions$harmony@cell.embeddings <- as.matrix(cell2location)
stRNA <- RunUMAP(stRNA, reduction = "harmony", dims = 1:13)
stRNA <- FindNeighbors(stRNA, reduction = "harmony", dims = 1:13)
stRNA <- FindClusters(stRNA, verbose = T,resolution = 0.5)
DimPlot(stRNA, reduction = "umap", label = TRUE,group.by = 'SCT_snn_res.0.5',pt.size = 1)
SpatialDimPlot(stRNA,label = T)
# 利用cell2location的结果进行降维 (Scanpy)
st_adata_ = st_adata_[st_adata.obs.index, :]
st_adata_.obsm['q05_cell_abundance_w_sf'] = st_adata.obsm['q05_cell_abundance_w_sf']
sc.pp.neighbors(st_adata_, use_rep='q05_cell_abundance_w_sf')
sc.tl.leiden(st_adata_, resolution=0.5)
st_adata_.obs["region_cluster"] = st_adata_.obs["leiden"].astype("category")
sc.pl.umap(st_adata_, color=['region_cluster'], size=30,
           color_map='magma',  legend_loc='on data',
           legend_fontsize=15)
           fig, axs = plt.subplots(15, figsize=(1510))
for i, library in enumerate(
    ['D1','D2',"H1","D3","H2"]
)
:
    ad 
= st_adata_[st_adata_.obs.library_id == library, :].copy()
    sc.pl.spatial(
        ad,
        img_key="hires",
        library_id=library,
        color=['region_cluster'],
        size=1.5,
        color_map='magma',
        ax=axs[i],
    )

plt.tight_layout()


朴素的科研打工仔
专注于文献的分享,浙大研究生学习生活的记录。
 最新文章