Robust Rank Aggregation (RRA) 是一种排名整合算法,用于从多个排序列表中识别在所有或大多数列表中排名靠前的元素/基因/变量,目标是找到跨所有数据来源具有一致性的显著元素/基因/变量。
RRA分析方法的优势在于:
鲁棒性:RRA 对排名列表的不完整性和噪声有较强的鲁棒性,允许某些列表中缺失部分元素。 无分布假设:与其他排名整合方法不同,RRA 不要求输入数据符合特定的统计分布。 高效性:RRA 算法计算效率较高,适合处理多个来源的大规模数据。
分析流程
1.导入
rm(list = ls())
library(RobustRankAggreg)
2. 数据预处理
这里的GSE118719是高通量测序数据,GSE12452/GSE34573是芯片测序数据,都是对肿瘤和正常组织进行了差异分析,差异分析可以看一看既往的推文:https://mp.weixin.qq.com/s/dCgLjzOGKeSpZlpFMzIR9g。
其中展示了GSE118719这个数据集的差异分析结果,常做分析的同学应该都很熟悉的吧hhh~
# GSE118719
load("../1-GSE118719/GSE118719_DEG.Rdata")
library(tinyarray)
a <- intersect_all(rownames(DEG1[DEG1$change=="UP",]))
up_GSE118719 <- a
b <- intersect_all(rownames(DEG1[DEG1$change=="DOWN",]))
down_GSE118719 <- b
GSE118719 <- DEG1[rownames(DEG1) %in% c(a,b),]
head(GSE118719)
# baseMean log2FoldChange lfcSE stat pvalue padj change
# SOX4 2193.3624 3.465037 0.2702584 12.821201 1.247533e-37 4.105629e-33 UP
# MPP3 277.3020 3.212097 0.2673546 12.014368 2.986444e-33 4.914193e-29 UP
# CHMP7 1100.4561 -1.301527 0.1265650 -10.283467 8.366291e-25 9.177821e-21 DOWN
# C12orf42 105.6587 -3.218636 0.3222610 -9.987669 1.725931e-23 1.420010e-19 DOWN
# CBX2 410.3895 4.414904 0.4489029 9.834876 7.966597e-23 5.243614e-19 UP
# C14orf182 200.4966 -2.471731 0.2648547 -9.332407 1.034939e-20 5.676640e-17 DOWN
# GSE12452
load("../2-GSE12452/step2output.Rdata")
up_GSE12452 <- rownames(deg[deg$change=="up",])
down_GSE12452 <- rownames(deg[deg$change=="down",])
GSE12452 <- deg[rownames(deg) %in% c(up_GSE12452,down_GSE12452),]
# GSE34573
load("../3-GSE34573/step2output.Rdata")
up_GSE34573 <- deg[deg$change=="up",]$symbol
down_GSE34573 <- deg[deg$change=="down",]$symbol
GSE34573 <- deg[rownames(deg) %in% c(up_GSE34573,down_GSE34573),]
# GSE53819
load("../4-GSE53819/step2output.Rdata")
up_GSE53819 <- deg[deg$change=="up",]$symbol
down_GSE53819 <- deg[deg$change=="down",]$symbol
GSE53819 <- deg[rownames(deg) %in% c(up_GSE53819,down_GSE53819),]
uplist <- list(up_GSE118719,
up_GSE12452,
up_GSE34573
)
downlist <- list(down_GSE118719,
down_GSE12452,
down_GSE34573
)
3.RRA分析
核心的代码其实就是aggregateRanks(glist = uplist, N = length(unique(unlist(uplist))))~
ups = aggregateRanks(glist = uplist, N = length(unique(unlist(uplist))))
head(ups)
# Name Score
# COL4A1 COL4A1 6.813894e-06
# LAMB1 LAMB1 7.337819e-06
# GAD1 GAD1 3.651821e-05
# C6orf141 C6orf141 3.810574e-05
# KREMEN2 KREMEN2 4.100591e-05
# DSG2 DSG2 4.766501e-05
downs = aggregateRanks(glist = downlist, N = length(unique(unlist(downlist))))
head(downs)
# Name Score
# C19orf33 C19orf33 0.0001299836
# KRT4 KRT4 0.0001870812
# UPK1B UPK1B 0.0002195047
# ALDH3A1 ALDH3A1 0.0003322503
# TMC5 TMC5 0.0004681669
# MGLL MGLL 0.0005716242
# 这里可以按照分数进行筛选,分数越小越有意义
toptotal <- rbind(ups[ups$Score<0.005,],downs[downs$Score<0.005,])
# 跟数据集一起取个交集看一看
w <- intersect_all(toptotal$Name,rownames(GSE118719),
rownames(GSE12452),rownames(GSE34573))
w
# [1] "COL4A1" "LAMB1" "GAD1" "C6orf141" "DSG2" "SSX2IP" "CEP135" "ZNRF3"
# [9] "RCN1" "HELLS" "GRB10" "LAMC1" "ANLN" "ITGAV" "COL4A2" "GINS1"
# [17] "CNTNAP2" "LGALS1" "ROBO1" "FMNL2" "IDH1" "SCD5" "CXCL3" "ABCA3"
# [25] "ZIC2" "FN1" "INSM1" "KCTD3" "FRAS1" "BRIP1" "NREP" "CACNA2D1"
# [33] "OLFM1" "SLC39A14" "KIF23" "TYMS" "DTL" "KIF18A" "VRK2" "IGF2BP3"
# [41] "ZNF112" "HOXC6" "KIF14" "SYNPO2" "CDK1" "RAD51AP1" "PRC1" "MCM7"
# [49] "FJX1" "GJA1" "STAR" "FBN1" "KRT4" "ZBTB7C" "TTC9" "S100P"
# [57] "CHST6" "C9orf24" "PSCA" "PIFO" "MAL" "CAPS"
分析后提取数据(可视化前准备)
datnames <- c("GSE118719","GSE12452","GSE34573")
# 创建一个空列表用于存储每个数据集提取的结果
result_list <- list()
# 遍历每个数据集,提取基因表达量
for (i in datnames) {
# 加载每个数据集
dataset <- get(i) # 使用 get() 函数获取数据集对象
# 提取符合toptotal中基因名的行
selected_genes <- dataset[rownames(dataset) %in% toptotal$Name, ]
# 按照toptotal的顺序进行排序
selected_genes <- selected_genes[match(toptotal$Name,
rownames(selected_genes)), ]
# 动态选择列名
logFC_col <- grep("logFC|log2FoldChange", colnames(selected_genes), value = TRUE)
if (length(logFC_col) > 0) {
# 提取 logFC 或 log2FoldChange 列
logFC_values <- selected_genes[, logFC_col, drop = FALSE]
} else {
stop(paste("No 'logFC' or 'log2FoldChange' column found in dataset:", i))
}
# 为提取的数据添加列名,列名为数据集的名称
colnames(logFC_values) <- i
# 将提取的数据添加到列表中
result_list[[i]] <- logFC_values
}
# 将所有提取的结果合并成一个数据框
final_result <- do.call(cbind, result_list)
# 查看合并后的数据框
head(final_result)
final_res <- na.omit(final_result)
4.可视化
data_total <- final_res
library(ComplexHeatmap)
# 检查数据类型是否正确转换为数值型
str(final_res)
# 也可以用circlize创建颜色映射
# 定义颜色渐变
library(circlize)
color_fun <- colorRamp2(c(-4, 0, 4), c("#336699", "white", "tomato"))
GSE_colors <- c("GSE118719" = "blue", "GSE12452" = "orange", "GSE34573" = "purple",
"GSE53819" = "brown", "GSE61218" = "yellow","GSE64634" = "pink")
# 顶部注释HeatmapAnnotation
colAnno <- HeatmapAnnotation(GSE_num =datnames,
col = list(GSE_num = GSE_colors))
pdf("heatmap.pdf",width = 5,height = 8)
ComplexHeatmap::Heatmap(data_total,
na_col = "white",
col = color_fun, # 添加颜色映射函数
show_column_names = T, #是否显示列名
show_row_names = T, #是否显示行名
row_names_side = "left",
name = "fraction",
row_order = rownames(data_total),
#row_split = datnames,
row_title = NULL,
cluster_columns = F,
#left_annotation = rowAnno,
top_annotation = colAnno,
#heatmap_width = unit(20, "cm"), # 调整热图宽度
#row_dend_width = unit(1, "cm"), # 调整聚类树宽度
cluster_rows = F, # 行聚类
row_names_gp = gpar(fontsize = 10), # 调整行名字体大小
column_names_gp = gpar(fontsize = 12),
cell_fun = function(j, i, x, y, width, height, fill) {
# 在单元格中心显示数值
grid.text(sprintf("%.2f", data_total[i, j]), x, y, gp = gpar(fontsize = 8))
}
#cluster_columns = FALSE # 列聚类。
)
dev.off()
参考资料:
Robust rank aggregation for gene list integration and meta-analysis. Bioinformatics. 2012 Feb 15;28(4):573-80. RRA:https://www.rdocumentation.org/packages/RobustRankAggreg/versions/1.2.1 生信技能树:https://mp.weixin.qq.com/s/JOadz9fXIDEoUXrh6BkbWA 被炸熟的虾:https://mp.weixin.qq.com/s/ua8QWm9RNPe3M-IidiluIw 生信交流平台:https://mp.weixin.qq.com/s/YDVeJecvxzoH5WY2bmOP0A 生信协作组:https://mp.weixin.qq.com/s/6333Qvf0m7D-QxPYlOIu3Q
致谢:感谢曾老师以及生信技能树团队全体成员。
注:若对内容有疑惑或者有发现明确错误的朋友,请联系后台(欢迎交流)。更多内容可关注公众号:生信方舟
- END -