下载KEGG数据库并提取整理所需信息
BiocManager::install("KEGGREST",force = T)
BiocManager::install("RbioRXN")
BiocManager::install("fmcsR")
library(KEGGREST)
library(fmcsR)
library(plyr)
library(devtools)
library(stringr)
library(dplyr)
# 修复HTTPS
devtools::install_github("https://github.com/kozo2/KEGGREST/tree/patch-1")
hsa_pathway <- keggList("pathway","hsa") # 获取KEGG数据库中所有人类通路
hsa_path <- data.frame(hsa_pathway) # 转成数据框,方便后续分析
hsa_path$pathID <- substr(rownames(hsa_path),6,nchar(rownames(hsa_path)[1])) # 提取pathway ID
# 提取通路中的所有代谢物ID及名称
match.df <- vector()
for (i in 1:nrow(hsa_path)) {
hsa_info <- keggGet(hsa_path[i,"pathID"])
hsa_compound <- hsa_info[[1]]$COMPOUND
path_name <- hsa_info[[1]]$NAME
if(length(hsa_compound)>0)
{
cpd <- names(hsa_compound[1])
cpd_name <- as.character(hsa_compound[1])
for (j in 1:(length(hsa_compound)-1)) {
cpd <- paste(cpd,names(hsa_compound)[j+1],sep = ";")
cpd_name <- paste(cpd_name,as.character(hsa_compound)[j+1],sep = ";")
}
match.ver <- c(hsa_path[i,"pathID"],path_name,cpd,cpd_name)
match.df <- rbind(match.df,match.ver)
}
if(length(hsa_compound)==0){
match.df <- rbind(match.df,c(hsa_path[i,"pathID"],path_name,"",""))
}
}
修复后还是报错
Error in .getUrl(url, .flatFileParser) : Bad Request (HTTP 400).
偶然发现重新定义函数,不直接使用keggGet就不会出现网络问题。期间还试着更换R版本,更新KEGGREST的最新版~~,都无济于事
library(dplyr)
library(KEGGREST)
library(purrr)##为了数据整合、过滤list
library(progress)##显示下载进度条
定义函数
1. #获取物种的通路信息,通路名称和通路ID
request_kegg_pathway_info<-
function(organism="hsa"){
pathway_id<-
KEGGREST::keggList(database="pathway",organism=organism)
result<-
data.frame(KEGG.ID=names(pathway_id),
Pathway.name=unname(pathway_id))%>%
dplyr::mutate(KEGG.ID=stringr::str_replace(KEGG.ID,"path\\:",""))
return(result)
}
2. #读取整理下载的KEGG各个数据库
read_kegg_pathway<-function(path=".",organism="hsa"){
load(file.path(path,organism))
pb<-progress::progress_bar$new(total=length(kegg_pathway_database))
kegg_data<-
seq_len(length(kegg_pathway_database))%>%
purrr::map(function(i){
pb$tick()
pathway_id<-
kegg_pathway_database[[i]]$ENTRY
pathway_name<-
kegg_pathway_database[[i]]$PATHWAY_MAP%>%
unname()
describtion<-
kegg_pathway_database[[i]]$DESCRIPTION
pathway_class<-
kegg_pathway_database[[i]]$CLASS
gene_list<-
kegg_pathway_database[[i]]$GENE
if(is.null(gene_list)){
gene_list<-data.frame()
}else{
gene_list<-
data.frame(
KEGG.ID=gene_list[seq(1,length(gene_list)-1,by=2)],
Gene.name=gene_list[seq(2,length(gene_list),by=2)],
stringsAsFactors=FALSE
)
}
compound_list<-
kegg_pathway_database[[i]]$COMPOUND
compound_list<-
data.frame(
KEGG.ID=names(kegg_pathway_database[[i]]$COMPOUND),
Compound.name=kegg_pathway_database[[i]]$COMPOUND,
stringsAsFactors=FALSE
)
related_disease<-
data.frame(
Disease.ID=names(kegg_pathway_database[[i]]$DISEASE),
Disease.name=kegg_pathway_database[[i]]$DISEASE,
stringsAsFactors=FALSE
)
related_module<-
data.frame(
Module.ID=names(kegg_pathway_database[[i]]$MODULE),
Module.name=kegg_pathway_database[[i]]$MODULE,
stringsAsFactors=FALSE
)
list(
pathway_id=pathway_id,
pathway_name=pathway_name,
describtion=describtion,
pathway_class=pathway_class,
gene_list=gene_list,
compound_list=compound_list,
related_disease=related_disease,
related_module=related_module
)
})
return(kegg_data)
}
3. #解析KEGG的通路信息,获取通路的基因名称,ko,基因描述,酶的编号信息等
parseKEGGInfo=function(keggdes){
kg_lst=unlist(strsplit(keggdes,split=";"))
gene_name=kg_lst[1]
gene_des=sub("(.*)\\[KO.*","\\1",kg_lst[2])
ko=sub(".*\\[KO:(.*)\\]\\[.*","\\1",kg_lst[2])
ec=sub(".*\\]\\[EC:(.*)\\]","\\1",kg_lst[2])
return(c(gene_name,ko,gene_des,ec))
}
4. #按照自己的需求和格式提取数据
extractKEGG<-function(data){
pathway<-
list(
database_info=list(source="KEGG",
version=as.character(Sys.Date())),
pathway_id=lapply(data,function(x)
x$pathway_id),
pathway_name=lapply(data,function(x)
x$pathway_name),
describtion=lapply(data,function(x)
x$describtion),
pathway_class=lapply(data,function(x)
x$pathway_class),
gene_list=lapply(data,function(x)
x$gene_list),
compound_list=lapply(data,function(x)
x$compound_list),
protein_list=list(),
reference_list=list(),
related_disease=lapply(data,function(x)
x$related_disease),
related_module=lapply(data,function(x)
x$related_module)
)
if(length(pathway$gene_list)==0){
pathway@gene_list<-
vector(mode="list",
length=length(pathway$pathway_id))%>%
purrr::map(function(x){
x=data.frame()
x
})
}
if(length(pathway$compound_list)==0){
pathway$compound_list<-
vector(mode="list",
length=length(pathway$pathway_id))%>%
purrr::map(function(x){
x=data.frame()
x
})
}
if(length(pathway$protein_list)==0){
pathway$protein_list<-
vector(mode="list",
length=length(pathway$pathway_id))%>%
purrr::map(function(x){
x=data.frame()
x
})
}
4. #获取通路-基因信息-可自己根据需要修改
gene_pathway_dt=lapply(seq_len(length(pathway$pathway_id)),function(x){
gene_dt=pathway$gene_list[[x]]
if(nrow(gene_dt)>0){
pathway_id=pathway$pathway_id[[x]]
pathway_name=pathway$pathway_name[[x]]
pathway_class=pathway$pathway_class[[x]]
full_gene_dt=lapply(gene_dt$Gene.name,function(gn){
parseKEGGInfo(gn)
})%>%do.call(rbind,.)%>%as.data.frame(.)
full_gene_dt$ENTREZID=gene_dt$KEGG.ID
colnames(full_gene_dt)=c("gene_name","KO_ID","gene_des","EC","ENTREZID")
full_gene_dt$pathway_id=pathway_id
full_gene_dt$pathway_name=pathway_name
full_gene_dt$pathway_id=pathway_id
full_gene_dt$level1_pathway_name=unlist(strsplit(pathway_class,split=";"))[1]
full_gene_dt$level2_pathway_name=unlist(strsplit(pathway_class,split=";"))[2]
return(full_gene_dt)
}
return(NULL)
})%>%purrr::keep(~!is.null(.))%>%do.call(rbind,.)%>%as.data.frame(.)
#获取通路-代谢物信息
compound_pathway_dt=lapply(seq_len(length(pathway$pathway_id)),function(x){
compound_dt=pathway$compound_list[[x]]
if(nrow(compound_dt)>0){
pathway_id=pathway$pathway_id[[x]]
pathway_name=pathway$pathway_name[[x]]
pathway_class=pathway$pathway_class[[x]]
compound_dt$pathway_id=pathway_id
compound_dt$pathway_name=pathway_name
compound_dt$pathway_id=pathway_id
compound_dt$level1_pathway_name=unlist(strsplit(pathway_class,split=";"))[1]
compound_dt$level2_pathway_name=unlist(strsplit(pathway_class,split=";"))[2]
compound_dt$ENTREZID=compound_dt$KEGG.ID
compound_dt$gene_name=compound_dt$KEGG.ID
return(compound_dt)
}
return(NULL)
})%>%purrr::keep(~!is.null(.))%>%do.call(rbind,.)%>%as.data.frame(.)
return(list(gene=gene_pathway_dt,compound=compound_pathway_dt))
}
5.#下载主函数接口,设置保存的路径,时间间隔和物种信息
download_kegg_pathway<-function(path=".",
sleep=1,
organism="hsa"){
dir.create(path,recursive=TRUE,showWarnings=FALSE)
kegg_id<-
request_kegg_pathway_info(organism=organism)
pb<-progress::progress_bar$new(total=nrow(kegg_id))
kegg_pathway_database<-
seq_along(kegg_id$KEGG.ID)%>%
purrr::map(function(i){
pb$tick()
Sys.sleep(time=sleep)
KEGGREST::keggGet(dbentries=kegg_id$KEGG.ID[i])[[1]]
})
save(kegg_pathway_database,
file=file.path(path,organism))
}
# 步骤1:下载物种对应的KEGG信息,并保存为rds
download_kegg_pathway(path="kegg_pathway",
sleep=1,
organism="hsa")
# 步骤2:获取下载好的KEGG数据库信息
data<-read_kegg_pathway(path="kegg_pathway",organism="hsa")
# 步骤3:按自己的需求提取数据,这里展示仅提取通路-基因,通路-代谢的数据
kegg_human_pathway<-extractKEGG(data=data)
print(head(kegg_human_pathway$gene))
print(head(kegg_human_pathway$compound))
制作GSVA背景代谢集
# 制作用于GSVA的背景代谢集
gene_list <- split(hsa$gene_name,hsa$pathway_id)
# 便于计算就取前三个
gene_list <- gene_list[1:3]
读取空转注释和评分数据
# 载入空间转录组metadata信息(包括区域注释和T细胞各亚型评分信息)
meta_1 <- read.delim("HCC1_metadata.txt", header = TRUE)
# 选取肿瘤相关区域
Tumor_region <- meta_1 %>% filter(region == "Tumor")
rownames(combined_Tumor_Tregs)[1:5]
[1] "AACACGTGCATCGCAC-1" "AACAGGAAGAGCATAG-1" "AACAGGTTCACCGAAG-1"
[4] "AACAGTCAGGCTCCGC-1" "AACAGTCCACGCGGTG-1"
处理空代数据
# 载入空间代谢组数据
metabolism_1 <- readRDS("./metadata-both_A3.rds")
colnames(metabolism_1)[1:25]
metabolism_1[,"KEGG"][1:10]
# 可以看到有很多检出代谢物是没有KEGG数据库信息的
两种情况:
1.将没有注释到数据库的代谢物以代谢物名替代
2.直接过滤没有注释到数据库的代谢物信息
分别计算检验GSVA算法是否只依靠背景集进行计算
替代
metabolism_KEGG <- metabolism_1[,"KEGG","Metabolites"]
# 将没有注释的都先换成A
metabolism_KEGG <- metabolism_KEGG %>%
mutate(col1 = case_when(
KEGG %in% c(NA, ";",";;") ~ A, #
TRUE ~ KEGG # 其他值保持不变
))
metabolism_KEGG <- metabolism_KEGG %>%
mutate(KEGG =if_else(KEGG == "A", metabolism, KEGG))
head(metabolism_KEGG[,"KEGG"])
[1] "Hydroxymethyl hydrogen carbonate"
[2] "2-Aminoethane-1,1,1-triol; Tri-hydroxymethylaminomethane"
[3] "Ethynol; Ketene; Oxirene"
[4] "C02356; C00334; C05145"
[5] "C00099; C00041; C00213"
[6] "C00186; C01013; C02154"
# 多个KEGG号的只取第一个
metabolism_KEGG$KEGG <- sapply(strsplit(as.character(metabolism_KEGG$KEGG), ";"), `[`, 1)
head(metabolism_KEGG[,"x"])
[1] "Hydroxymethyl hydrogen carbonate""2-Aminoethane-1,1,1-triol"
[3] "Ethynol" "C02356"
[5] "C00099" "C00186"
metabolism_1$KEGG <- metabolism_KEGG$KEGG
metabolism_1 <- metabolism_1[,c(13,24:4456)]
colnames(metabolism_1)[1:5]
[1] "KEGG" "A3_922.CTATCGATCACGATTA"
[3] "A3_922.ACTACCTTGGTTGACC""A3_922.GTACGAAGGCGTCTAT"
[5] "A3_922.AAGGTTACTATGACGC"
# 修改细胞标签和空转一致
colnames(metabolism_1) <- gsub("A13_922\\.", "", colnames(metabolism_1))
colnames(metabolism_1)[2:ncol(metabolism_1)] <- paste0(colnames(metabolism_1)[2:ncol(metabolism_1)], "-1")
# 提取共有标签
valid_columns <- rownames(Tumor_region)[rownames(Tumor_region) %in% colnames(metabolism_1)]
metabolism_1_filter <- metabolism_1[,c("KEGG",valid_columns)]
exprSet <- column_to_rownames(metabolism_1_filter, var='KEGG')
# 报错,重复名
# 进行重复的对象取均值
exprSet<-aggregate(x=metabolism_1_filter[,2:(ncol(metabolism_1_filter))],by=list(metabolism_1_filter$KEGG),FUN=mean)
library(tidyverse)
exprSet <- column_to_rownames(exprSet, var='Group.1')
library(GSVA)
exprSet <- as.matrix(exprSet)# GSVA需要矩阵而不是数据框
exprSet_1 <- apply(exprSet, 2, function(x) as.numeric(as.character(x)))#数值类型转换
rownames(exprSet_1) <- rownames(exprSet)#重命名 #gsva计算,kcdf可传参Gaussian"和"Poisson",默认是poisson,Gaussian"forlogCPM,logRPKM,logTPM,"Poisson"forcounts
GSVA <- gsva(expr = exprSet_1,
gset.idx.list=gene_list,
kcdf="Poisson" ,
verbose=T,
parallel.sz = parallel::detectCores())
GSVA_1 <- t(GSVA)
# 融合空转GSVA和空代GSVA计算矩阵
Tumor_region <- Tumor_region[valid_columns,]
combined_Tumor <- cbind(Tumor_region,GSVA_1)
a1 <- combined_Tumor[,c("GSVA_CD8CTL","hsa00010","hsa00020","hsa00030")]
过滤
# 直接过滤
# 保留KEGG列中以C开头的行
metabolism_1_filter <- metabolism_1_filter %>%
filter(grepl("^C", KEGG))
exprSet<-aggregate(x=metabolism_1_filter[,2:(ncol(metabolism_1_filter))],by=list(metabolism_1_filter$KEGG),FUN=mean)
exprSet <- column_to_rownames(exprSet, var='Group.1')
exprSet <- as.matrix(exprSet)
exprSet_1 <- apply(exprSet, 2, function(x) as.numeric(as.character(x)))
rownames(exprSet_1) <- rownames(exprSet)
GSVA <- gsva(expr = exprSet_1,
gset.idx.list=gene_list,
kcdf="Poisson" ,
verbose=T,
parallel.sz = parallel::detectCores())
GSVA_1 <- t(GSVA)
Tumor_region <- Tumor_region[valid_columns,]
combined_Tumor <- cbind(Tumor_region,GSVA_1)
a2 <- combined_Tumor[,c("GSVA_CD8CTL","hsa00010","hsa00020","hsa00030")]
相关性可视化
#可视化
将三个通路一起展示
# 颜色
colors = c( "#1f77b4",# 蓝色
"#ff7f0e", # 橙色
"#2ca02c")
#melt()函数将宽格式转化为长格式
library(reshape2)
a1_long<-melt(a1, id.vars = c("GSVA_CD8CTL"), #需保留的不参与聚合的变量列名
measure.vars = 2:4,#选择需要转化的列
variable.name = c('group'),#聚合变量的新列名
value.name = 'value')#聚合值的新列名
b<-ggplot(a1_long,aes(x=GSVA_CD8CTL,y=value))+
geom_point(aes(color=group))+
geom_rug(aes(color=group))+
geom_smooth(aes(color=group,fill=group),method=lm
,fullrange=TRUE)+
scale_color_manual(values=colors)+
scale_fill_manual(values=colors)+
ggpubr::stat_cor(aes(color=group),label.x=0)
b
# 同一通路,多个样本展示
a1$group <- "sample1"
a2$group <- "sample2"
a <- rbind(a1,a2)
b<-ggplot(a,aes(x=GSVA_CD8CTL,y=hsa00010))+
geom_point(aes(color=group))+
geom_rug(aes(color=group))+
geom_smooth(aes(color=group,fill=group),method=lm
,fullrange=TRUE)+
scale_color_manual(values=colors)+
scale_fill_manual(values=colors)+
ggpubr::stat_cor(aes(color=group),label.x=0)
b