相关性|空转+空代区域GSVA相关性分析

文摘   2024-12-27 11:33   浙江  
写在前面的话
  年度技术再次给到了空间组学,单细胞到空间的各种联合分析是大势所趋。分享一下空转和空代的区域相关分析,空转GSVA矩阵获取步骤和空转空代联合分析略过了(这部分后续会更新)。
  得出结论:
(1)GSVA评分确实是不考虑表达矩阵,只考虑提供的背景基因集或者代谢集,根据累计密度行数进行排序,计算分值。
(2)从相关性分析可以得出,肿瘤区域内的细胞毒性T淋巴细胞的富集与糖酵解代谢活性呈现负相关。

下载KEGG数据库并提取整理所需信息

BiocManager::install("KEGGREST",force = T)
BiocManager::install("RbioRXN")
BiocManager::install("fmcsR")
library(KEGGREST)
library(fmcsR)
library(plyr)
library(devtools)
library(stringr)
library(dplyr)
# 修复HTTPS
devtools::install_github("https://github.com/kozo2/KEGGREST/tree/patch-1")
hsa_pathway <- keggList("pathway","hsa") # 获取KEGG数据库中所有人类通路
hsa_path <- data.frame(hsa_pathway) # 转成数据框,方便后续分析
hsa_path$pathID <- substr(rownames(hsa_path),6,nchar(rownames(hsa_path)[1])) # 提取pathway ID
# 提取通路中的所有代谢物ID及名称
match.df <- vector()
for (i in 1:nrow(hsa_path)) {
  hsa_info <- keggGet(hsa_path[i,"pathID"])
  hsa_compound <- hsa_info[[1]]$COMPOUND
  path_name <- hsa_info[[1]]$NAME
  if(length(hsa_compound)>0)
  
{
    cpd <- names(hsa_compound[1])
    cpd_name <- as.character(hsa_compound[1])
    for (j in 1:(length(hsa_compound)-1)) {
      cpd <- paste(cpd,names(hsa_compound)[j+1],sep = ";")
      cpd_name <- paste(cpd_name,as.character(hsa_compound)[j+1],sep = ";")
    }
    match.ver <- c(hsa_path[i,"pathID"],path_name,cpd,cpd_name)
    match.df <- rbind(match.df,match.ver)
  }
if(length(hsa_compound)==0){
    match.df <- rbind(match.df,c(hsa_path[i,"pathID"],path_name,"",""))
  }
}
修复后还是报错
Error in .getUrl(url, .flatFileParser) : Bad Request (HTTP 400).

偶然发现重新定义函数,不直接使用keggGet就不会出现网络问题。期间还试着更换R版本,更新KEGGREST的最新版~~,都无济于事

library(dplyr)
library(KEGGREST)
library(purrr)##为了数据整合、过滤list
library(progress)##显示下载进度条

定义函数
1. #获取物种的通路信息,通路名称和通路ID
request_kegg_pathway_info<-
  function(organism="hsa")
{
    pathway_id<-
      KEGGREST::keggList(database="pathway",organism=organism)
    result<-
      data.frame(KEGG.ID=names(pathway_id),
                 Pathway.name=unname(pathway_id))%>%
      dplyr::mutate(KEGG.ID=stringr::str_replace(KEGG.ID,"path\\:",""))
    return(result)
  }
2. #读取整理下载的KEGG各个数据库
read_kegg_pathway<-function(path=".",organism="hsa"){
  load(file.path(path,organism))
  pb<-progress::progress_bar$new(total=length(kegg_pathway_database))
  kegg_data<-
    seq_len(length(kegg_pathway_database))%>%
    purrr::map(function(i){
      pb$tick()
      pathway_id<-
        kegg_pathway_database[[i]]$ENTRY
      pathway_name<-
        kegg_pathway_database[[i]]$PATHWAY_MAP%>%
        unname()
      describtion<-
        kegg_pathway_database[[i]]$DESCRIPTION
      pathway_class<-
        kegg_pathway_database[[i]]$CLASS
      gene_list<-
        kegg_pathway_database[[i]]$GENE
      if(is.null(gene_list))
{
        gene_list<-data.frame()
      }else{
        gene_list<-
          data.frame(
            KEGG.ID=gene_list[seq(1,length(gene_list)-1,by=2)],
            Gene.name=gene_list[seq(2,length(gene_list),by=2)],
            stringsAsFactors=FALSE
          )
      }
      compound_list<-
        kegg_pathway_database[[i]]$COMPOUND
      compound_list<-
        data.frame(
          KEGG.ID=names(kegg_pathway_database[[i]]$COMPOUND),
          Compound.name=kegg_pathway_database[[i]]$COMPOUND,
          stringsAsFactors=FALSE
        )
      related_disease<-
        data.frame(
          Disease.ID=names(kegg_pathway_database[[i]]$DISEASE),
          Disease.name=kegg_pathway_database[[i]]$DISEASE,
          stringsAsFactors=FALSE
        )
      related_module<-
        data.frame(
          Module.ID=names(kegg_pathway_database[[i]]$MODULE),
          Module.name=kegg_pathway_database[[i]]$MODULE,
          stringsAsFactors=FALSE
        )
      list(
        pathway_id=pathway_id,
        pathway_name=pathway_name,
        describtion=describtion,
        pathway_class=pathway_class,
        gene_list=gene_list,
        compound_list=compound_list,
        related_disease=related_disease,
        related_module=related_module
      )
    })
return(kegg_data)
}

3. #解析KEGG的通路信息,获取通路的基因名称,ko,基因描述,酶的编号信息等
parseKEGGInfo=function(keggdes){
  kg_lst=unlist(strsplit(keggdes,split=";"))
  gene_name=kg_lst[1]
  gene_des=sub("(.*)\\[KO.*","\\1",kg_lst[2])
  ko=sub(".*\\[KO:(.*)\\]\\[.*","\\1",kg_lst[2])
  ec=sub(".*\\]\\[EC:(.*)\\]","\\1",kg_lst[2])
return(c(gene_name,ko,gene_des,ec))
}
4. #按照自己的需求和格式提取数据
extractKEGG<-function(data){
  pathway<-
    list(
      database_info=list(source="KEGG",
                         version=as.character(Sys.Date())),
      pathway_id=lapply(data,function(x)
        x$pathway_id),
      pathway_name=lapply(data,function(x)
        x$pathway_name),
      describtion=lapply(data,function(x)
        x$describtion),
      pathway_class=lapply(data,function(x)
        x$pathway_class),
      gene_list=lapply(data,function(x)
        x$gene_list),
      compound_list=lapply(data,function(x)
        x$compound_list),
      protein_list=list(),
      reference_list=list(),
      related_disease=lapply(data,function(x)
        x$related_disease),
      related_module=lapply(data,function(x)
        x$related_module)
    )
if(length(pathway$gene_list)==0){
    pathway@gene_list<-
      vector(mode="list",
             length=length(pathway$pathway_id))%>%
      purrr::map(function(x){
        x=data.frame()
        x
      })
  }
if(length(pathway$compound_list)==0){
    pathway$compound_list<-
      vector(mode="list",
             length=length(pathway$pathway_id))%>%
      purrr::map(function(x){
        x=data.frame()
        x
      })
  }
if(length(pathway$protein_list)==0){
    pathway$protein_list<-
      vector(mode="list",
             length=length(pathway$pathway_id))%>%
      purrr::map(function(x){
        x=data.frame()
        x
      })
  }

4. #获取通路-基因信息-可自己根据需要修改
  gene_pathway_dt=lapply(seq_len(length(pathway$pathway_id)),function(x){
    gene_dt=pathway$gene_list[[x]]
    if(nrow(gene_dt)>0){
      pathway_id=pathway$pathway_id[[x]]
      pathway_name=pathway$pathway_name[[x]]
      pathway_class=pathway$pathway_class[[x]]
      full_gene_dt=lapply(gene_dt$Gene.name,function(gn){
        parseKEGGInfo(gn)
      })%>%do.call(rbind,.)%>%as.data.frame(.)
      full_gene_dt$ENTREZID=gene_dt$KEGG.ID
      colnames(full_gene_dt)
=c("gene_name","KO_ID","gene_des","EC","ENTREZID")
      full_gene_dt$pathway_id=pathway_id
      full_gene_dt$pathway_name=pathway_name
      full_gene_dt$pathway_id=pathway_id
      full_gene_dt$level1_pathway_name=unlist(strsplit(pathway_class,split=";"))[1]
      full_gene_dt$level2_pathway_name=unlist(strsplit(pathway_class,split=";"))[2]
      return(full_gene_dt)
    }
    return(NULL)
  })%>%purrr::keep(~!is.null(.))%>%do.call(rbind,.)%>%as.data.frame(.)
  #获取通路-代谢物信息
  compound_pathway_dt=lapply(seq_len(length(pathway$pathway_id)),function(x){
    compound_dt=pathway$compound_list[[x]]
    if(nrow(compound_dt)>0){
      pathway_id=pathway$pathway_id[[x]]
      pathway_name=pathway$pathway_name[[x]]
      pathway_class=pathway$pathway_class[[x]]
      compound_dt$pathway_id=pathway_id
      compound_dt$pathway_name=pathway_name
      compound_dt$pathway_id=pathway_id
      compound_dt$level1_pathway_name=unlist(strsplit(pathway_class,split=";"))[1]
      compound_dt$level2_pathway_name=unlist(strsplit(pathway_class,split=";"))[2]
      compound_dt$ENTREZID=compound_dt$KEGG.ID
      compound_dt$gene_name=compound_dt$KEGG.ID
      return(compound_dt)
    }
    return(NULL)
  })%>%purrr::keep(~!is.null(.))%>%do.call(rbind,.)%>%as.data.frame(.)
  return(list(gene=gene_pathway_dt,compound=compound_pathway_dt))
}
5.#下载主函数接口,设置保存的路径,时间间隔和物种信息
download_kegg_pathway<-function(path=".",
                                sleep=1,
                                organism="hsa")
{
  dir.create(path,recursive=TRUE,showWarnings=FALSE)
  kegg_id<-
    request_kegg_pathway_info(organism=organism)
  pb<-progress::progress_bar$new(total=nrow(kegg_id))
  kegg_pathway_database<-
    seq_along(kegg_id$KEGG.ID)%>%
    purrr::map(function(i){
      pb$tick()
      Sys.sleep(time=sleep)
      KEGGREST::keggGet(dbentries=kegg_id$KEGG.ID[i])[[1]]
    })

  save(kegg_pathway_database,
       file=file.path(path,organism))
}
# 步骤1:下载物种对应的KEGG信息,并保存为rds
download_kegg_pathway(path="kegg_pathway",
                      sleep=1,
                      organism="hsa")

# 步骤2:获取下载好的KEGG数据库信息
data<-read_kegg_pathway(path="kegg_pathway",organism="hsa")
# 步骤3:按自己的需求提取数据,这里展示仅提取通路-基因,通路-代谢的数据
kegg_human_pathway<-extractKEGG(data=data)
print(head(kegg_human_pathway$gene))
print(head(kegg_human_pathway$compound))

制作GSVA背景代谢集

# 制作用于GSVA的背景代谢集
gene_list <- split(hsa$gene_name,hsa$pathway_id)
# 便于计算就取前三个
gene_list <- gene_list[1:3]

读取空转注释和评分数据

# 载入空间转录组metadata信息(包括区域注释和T细胞各亚型评分信息)
meta_1 <- read.delim("HCC1_metadata.txt", header = TRUE)
# 选取肿瘤相关区域
Tumor_region <- meta_1 %>% filter(region == "Tumor")
rownames(combined_Tumor_Tregs)[1:5]
[1"AACACGTGCATCGCAC-1" "AACAGGAAGAGCATAG-1" "AACAGGTTCACCGAAG-1"
[4"AACAGTCAGGCTCCGC-1" "AACAGTCCACGCGGTG-1"

处理空代数据

# 载入空间代谢组数据
metabolism_1 <- readRDS("./metadata-both_A3.rds")
colnames(metabolism_1)[1:25]
metabolism_1[,"KEGG"][1:10]
# 可以看到有很多检出代谢物是没有KEGG数据库信息的

两种情况:

1.将没有注释到数据库的代谢物以代谢物名替代 

2.直接过滤没有注释到数据库的代谢物信息 

分别计算检验GSVA算法是否只依靠背景集进行计算

替代

metabolism_KEGG <- metabolism_1[,"KEGG","Metabolites"]
# 将没有注释的都先换成A
metabolism_KEGG <- metabolism_KEGG %>%
  mutate(col1 = case_when(
    KEGG %in% c(NA, ";",";;") ~ A,  # 
    TRUE ~ KEGG  # 其他值保持不变
  ))
metabolism_KEGG <- metabolism_KEGG %>%
  mutate(KEGG =if_else(KEGG == "A", metabolism, KEGG))
head(metabolism_KEGG[,"KEGG"])
[1"Hydroxymethyl hydrogen carbonate"                        
[2"2-Aminoethane-1,1,1-triol; Tri-hydroxymethylaminomethane"
[3"Ethynol; Ketene; Oxirene"                                
[4"C02356; C00334; C05145"                                  
[5"C00099; C00041; C00213"                                  
[6"C00186; C01013; C02154"
# 多个KEGG号的只取第一个
metabolism_KEGG$KEGG <- sapply(strsplit(as.character(metabolism_KEGG$KEGG), ";"), `[`, 1)
head(metabolism_KEGG[,"x"])
[1"Hydroxymethyl hydrogen carbonate""2-Aminoethane-1,1,1-triol"       
[3"Ethynol"                          "C02356"                          
[5"C00099"                           "C00186"
metabolism_1$KEGG <- metabolism_KEGG$KEGG
metabolism_1 <- metabolism_1[,c(13,24:4456)]
colnames(metabolism_1)[1:5]
[1"KEGG"                    "A3_922.CTATCGATCACGATTA"
[3"A3_922.ACTACCTTGGTTGACC""A3_922.GTACGAAGGCGTCTAT"
[5"A3_922.AAGGTTACTATGACGC"
# 修改细胞标签和空转一致
colnames(metabolism_1) <- gsub("A13_922\\.""", colnames(metabolism_1))
colnames(metabolism_1)[2:ncol(metabolism_1)] <- paste0(colnames(metabolism_1)[2:ncol(metabolism_1)], "-1")
# 提取共有标签
valid_columns <- rownames(Tumor_region)[rownames(Tumor_region) %in% colnames(metabolism_1)]
metabolism_1_filter <- metabolism_1[,c("KEGG",valid_columns)]
exprSet <- column_to_rownames(metabolism_1_filter, var='KEGG')
# 报错,重复名
# 进行重复的对象取均值
exprSet<-aggregate(x=metabolism_1_filter[,2:(ncol(metabolism_1_filter))],by=list(metabolism_1_filter$KEGG),FUN=mean)
library(tidyverse)
exprSet <- column_to_rownames(exprSet, var='Group.1')
library(GSVA) 
exprSet <- as.matrix(exprSet)# GSVA需要矩阵而不是数据框 
exprSet_1 <- apply(exprSet, 2, function(x) as.numeric(as.character(x)))#数值类型转换 
rownames(exprSet_1) <- rownames(exprSet)#重命名 #gsva计算,kcdf可传参Gaussian"和"Poisson",默认是poisson,Gaussian"forlogCPM,logRPKM,logTPM,"Poisson"forcounts
GSVA <- gsva(expr = exprSet_1,            
             gset.idx.list=gene_list, 
             kcdf="Poisson" ,     
             verbose=T,   
             parallel.sz = parallel::detectCores())
GSVA_1 <- t(GSVA)
# 融合空转GSVA和空代GSVA计算矩阵
Tumor_region <- Tumor_region[valid_columns,]
combined_Tumor <- cbind(Tumor_region,GSVA_1)
a1 <- combined_Tumor[,c("GSVA_CD8CTL","hsa00010","hsa00020","hsa00030")]

过滤

# 直接过滤
# 保留KEGG列中以C开头的行
metabolism_1_filter <- metabolism_1_filter %>%
  filter(grepl("^C", KEGG))
exprSet<-aggregate(x=metabolism_1_filter[,2:(ncol(metabolism_1_filter))],by=list(metabolism_1_filter$KEGG),FUN=mean)
exprSet <- column_to_rownames(exprSet, var='Group.1')
exprSet <- as.matrix(exprSet)
exprSet_1 <- apply(exprSet, 2, function(x) as.numeric(as.character(x))) 
rownames(exprSet_1) <- rownames(exprSet)
GSVA <- gsva(expr = exprSet_1,            
             gset.idx.list=gene_list, 
             kcdf="Poisson" ,     
             verbose=T,   
             parallel.sz = parallel::detectCores())
GSVA_1 <- t(GSVA)
Tumor_region <- Tumor_region[valid_columns,]
combined_Tumor <- cbind(Tumor_region,GSVA_1)
a2 <- combined_Tumor[,c("GSVA_CD8CTL","hsa00010","hsa00020","hsa00030")]

相关性可视化

#可视化
将三个通路一起展示
# 颜色
colors = c( "#1f77b4",# 蓝色
            "#ff7f0e", # 橙色
            "#2ca02c"
#melt()函数将宽格式转化为长格式
library(reshape2)
a1_long<-melt(a1, id.vars = c("GSVA_CD8CTL"), #需保留的不参与聚合的变量列名
                  measure.vars = 2:4,#选择需要转化的列
                  variable.name = c('group'),#聚合变量的新列名
                  value.name = 'value')#聚合值的新列名

b<-ggplot(a1_long,aes(x=GSVA_CD8CTL,y=value))+
  geom_point(aes(color=group))+
  geom_rug(aes(color=group))+
  geom_smooth(aes(color=group,fill=group),method=lm
              ,fullrange=TRUE)+
  scale_color_manual(values=colors)+
  scale_fill_manual(values=colors)+
  ggpubr::stat_cor(aes(color=group),label.x=0)
  b
# 同一通路,多个样本展示
a1$group <- "sample1"
a2$group <- "sample2"
a <- rbind(a1,a2)
b<-ggplot(a,aes(x=GSVA_CD8CTL,y=hsa00010))+
  geom_point(aes(color=group))+
  geom_rug(aes(color=group))+
  geom_smooth(aes(color=group,fill=group),method=lm
              ,fullrange=TRUE)+
  scale_color_manual(values=colors)+
  scale_fill_manual(values=colors)+
  ggpubr::stat_cor(aes(color=group),label.x=0)
b


朴素的科研打工仔
专注于文献的分享,浙大研究生学习生活的记录。
 最新文章