✦
医学科研新动向
✦
TCGA
TCGA(The Cancer Genome Atlas)是一个拥有海量肿瘤基因组数据的公共数据库,为癌症研究提供了丰富的资源。掌握如何高效获取并分析这些数据对研究至关重要!
📊 本篇推文将快速了解:如何从TCGA获取肿瘤miRNA基因表达数据。
登录TCGA
网站来下载数据:https://portal.gdc.cancer.gov/
/ TO MOM /
/ TO MOM /
01
数据下载
01. 点击 Cohort Builder
02. 以TCGA-DLBC为例 → Program → TCGA → Project → TCGA-DLBC
03. 点击Repository
04. 下载必须文件
Metadata + Clinical.csv
02
miRNA表达矩阵获取
将下载的文件保存在分析路径中:
# 设置工作目录
setwd("C:\\Users\\admin\\Desktopdata\\miRNA")
# 安装包
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("edgeR")
BiocManager::install("miRBaseVersions.db")
# 加载必要的库
library(rjson)
library(edgeR)
library(data.table)
library(dplyr)
library(stringr)
library(tidyr)
library(miRBaseVersions.db)
# 读取 metadata.json 文件
json <- jsonlite::fromJSON("metadata.cart.2024-10-22.json")
file_sample0 <- json[c('file_name', 'associated_entities')]
# 提取样本ID
file_sample0$sample_id <- sapply(file_sample0$associated_entities, function(x) { x[, 1] })
file_sample <- subset(file_sample0, select = -associated_entities)
# 列出所有miRNA的表达文件
count_file <- list.files('gdc_download_20241022_072814.603834', pattern = '*.isoforms.quantification.tsv', recursive = TRUE)
count_file_name <- sapply(strsplit(count_file, split = '/'), function(x) { x[2] })
# 创建一个空的表达矩阵
isoform_matrix <- data.frame()
path1 <- "gdc_download_20241022_072814.603834\\"
# 循环读取每个miRNA的表达文件,并合并成表达矩阵
for (i in 1:length(count_file_name)) {
path <- paste0(path1, count_file[i])
data0 <- read.table(path, fill = TRUE, header = TRUE)
# 提取感兴趣的列:miRNA区域和 read_count
data1 <- data0[, c(6, 3)] # 列6是miRNA区域,列3是 read_count
data <- data1 %>%
group_by(miRNA_region) %>%
summarise_all(sum) # 合并同一miRNA区域的表达量(求和)
# 设置列名为样本ID
colnames(data)[2] <- file_sample0[which(file_sample0$file_name == count_file_name[i]), 'sample_id']
# 合并到总的表达矩阵
isoform_matrix <- if (nrow(isoform_matrix) == 0) data else merge(isoform_matrix, data, by = "miRNA_region", all = TRUE)
}
# 查看生成的表达矩阵
View(isoform_matrix)
dim(isoform_matrix)
# 导出原始表达矩阵
write.table(isoform_matrix, "raw_mir_counts.txt", sep = "\t", row.names = TRUE, col.names = NA, quote = FALSE)
### 数据处理,提取成熟miRNA区域
isoform_matrix <- isoform_matrix[grep("mature", isoform_matrix$miRNA_region), ]
exp <- isoform_matrix
# 分割 miRNA_region 列
exp <- separate(exp, into = c("State", "ACCESSION"), col = "miRNA_region", sep = ",")
# 读取 miRBase 数据库文件
ano <- read.table("miRbase.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
merge_mir <- merge(ano, exp, by = "ACCESSION")
merge_mir <- merge_mir[, -c(1, 3)] # 去除无关列
# 保存过滤后的数据
write.table(merge_mir, "raw_mir_counts_filtered.txt", sep = "\t", row.names = TRUE, col.names = NA, quote = FALSE)
### 过滤表达量
# 将表达量为 NA 的地方替换成 0
merge_mir[, -1][is.na(merge_mir[, -1])] <- 0
# 去除在 50% 样本中没有表达的基因
zero_percentage <- rowMeans(merge_mir[, 2:ncol(merge_mir)] == 0)
exp1 <- merge_mir[zero_percentage < 0.5, ]
# 对重复的基因名进行平均处理
selected_cols <- exp1[, -1]
exp2 <- avereps(selected_cols, ID = exp1$NAME)
# 去除低表达基因
exp2 <- exp2[rowMeans(exp2) > 0.5, ]
### 样本分组
# 按样本编号提取肿瘤和正常样本
tumor <- colnames(exp2)[as.integer(substr(colnames(exp2), 14, 15)) == 01]
normal <- colnames(exp2)[as.integer(substr(colnames(exp2), 14, 15)) == 11]
exprSet_by_group <- cbind(exp2[, tumor], exp2[, normal])
# 修改列名,提取前16位字符
colnames(exprSet_by_group) <- substr(colnames(exprSet_by_group), 1, 16)
# 保存最终的表达矩阵
write.table(as.data.frame(exprSet_by_group), "mir_counts_final.txt", sep = "\t", row.names = TRUE, col.names = NA, quote = FALSE)
-END-
文字丨本人编写,如有补充,请随时告诉我
后台私信TCGA-miRNA即可获取完整代码