大家好,由此篇文章起SDbioinfo将新开一个数据可视化专栏,为大家带来生物信息学各方向数据可视化教程,包括工具介绍,代码实现,应用案例等内容。第一篇为Circos图,读者如有期望的工具教程请告知我们,我们会按情况安排到之后的计划中。
一、Circos介绍
1.背景
Circos 图是一种用于展示复杂数据关系的可视化工具,最初由 Krzywinski 等人开发,主要用于基因组数据的可视化。 它通过将数据以环形的方式展示,使得复杂的基因组结构和基因间的关系更加直观易懂。
2.应用领域
主要应用于生物信息学,用于展示基因组结构、基因表达数据、染色体相互作用等。 也可以用于其他领域,如网络分析、数据关系展示等,帮助研究人员更好地理解和分析复杂的数据集。
3.特点
直观的环形展示
将基因组的不同部分以环形的方式展示,每个环代表一个染色体或基因组区域。 通过颜色、大小等视觉元素,可以直观地展示基因组的结构特征和基因间的相互作用。
灵活的数据整合
可以整合多种类型的数据,如基因组序列数据、基因表达数据、表观遗传学数据等。 支持在同一个图中展示多个数据层,方便研究人员进行综合分析。
丰富的可视化选项
提供多种可视化选项,如基因组轨道、基因表达时间点、基因间的连接线等。 可以根据需要自定义颜色、大小、形状等视觉元素,以突出展示重点信息。
二、R实现Circos代码示例(以AP2基因为例)
library(tidyverse)
library(IRanges)
library(Biostrings)
library(openxlsx)
library(dplyr)
library(data.table)
library(rtracklayer)
library(ComplexHeatmap)
library(circlize)
library(ggplot2)
####Step1: to get chromosome info(chr names + length)
####第一步:获得染色体长度和名称信息作为circos plot圆盘极坐标
Toxo.fasta_all <- readDNAStringSet("/Users/sidaye/Documents/R/AP2_all_analysis/Input/Toxo_ME49v62/ToxoDB-62_TgondiiME49_Genome.fasta")
#names(Toxo.fasta_all)
#rule out contigs/scaffolds and keep the chromosome
Toxo.fasta <- Toxo.fasta_all[c(1:14),]
#head(Toxo.fasta)
Chr.len <- unlist(lapply(strsplit(names(Toxo.fasta), ' | '), '[[',7))
Chr.len <- unlist(lapply(strsplit(Chr.len, '='), '[[',2))
Chr.name <- unlist(lapply(strsplit(names(Toxo.fasta), ' | '), '[[',1))
Chr.name <- unlist(lapply(strsplit(Chr.name, '_'), '[[',2))
Genome.len <- data.frame(Chr.name = Chr.name,
Length = Chr.len)
head(Genome.len) ###Datafrome contains info
## Chr.name Length
## 1 chrIa 1859933
## 2 chrIb 1955354
## 3 chrII 2347032
## 4 chrIII 2532871
## 5 chrIV 2686605
## 6 chrV 3331915
####Step2. Make reference datafrome for the corresponding relationship between name of chromosome and length of chromosome
####第二步:把Genome.len作为ref
ref <- Genome.len
####3. Prepare the dataframe(bed file) for transcript track for genomicrectangular plot to show the transcript of whole genome
####第三步:得到需要画的最外层圆轮的转录本/CDS区域的坐标储存在bed file
##use the gtf file instead of gff file
gtf_genome <- read.table("/Users/sidaye/Documents/R/AP2_all_analysis/Input/Toxo_ME49v62/ToxoDB-62_TgondiiME49.gtf", sep = '\t')
transcript_gtf <- gtf_genome %>% dplyr::filter(V3 == 'transcript')
CDS_gtf<- gtf_genome %>% dplyr::filter(V3 == 'CDS')
##transform gtf file into bed file
function_gtf_to_bed <- function(x){
x <- x%>% dplyr::select(V1, V4, V5, V9, V3, V7)
x <- x %>% dplyr::rename(V1=V1, V2=V4, V3=V5, V4=V9, V5=V3, V6=V7)
x$V5 <- '0'
return(x)
}
transcript_bed <- function_gtf_to_bed(transcript_gtf)
transcript_names <- unlist(lapply(strsplit(transcript_bed$V4, ' '), '[[',2))
transcript_names <- unlist(lapply(strsplit(transcript_names, ';'),'[[',1))
transcript_bed$V4 <- transcript_names
#write.table(transcript_bed, '/Users/sidaye/Documents/R/AP2_all_analysis/Output/transcript_loci_Toxo_v62.bed', col.names = F, quote = F, row.names = F, sep = '\t')
CDS_bed <- function_gtf_to_bed(CDS_gtf)
CDS_names <- unlist(lapply(strsplit(CDS_bed$V4, ' '), '[[',4))
CDS_names <- unlist(lapply(strsplit(CDS_names, ';'), '[[',1))
CDS_bed$V4 <- CDS_names
original_chr_names <- unlist(lapply(strsplit(names(Toxo.fasta), ' | '), '[[',1))
original_chr_names <- original_chr_names[1:14]
print(original_chr_names) ####这里是获取染色体的名称,需要注意染色体名称,因为后面到画图时染色体名称对应极坐标的x-axis
## [1] "TGME49_chrIa" "TGME49_chrIb" "TGME49_chrII" "TGME49_chrIII"
## [5] "TGME49_chrIV" "TGME49_chrV" "TGME49_chrVI" "TGME49_chrVIIa"
## [9] "TGME49_chrVIIb" "TGME49_chrVIII" "TGME49_chrIX" "TGME49_chrX"
## [13] "TGME49_chrXI" "TGME49_chrXII"
####Step4. To change the chromosomes names
####第四步:确认和修改染色体名称,以免后面画图造成错误
function_change_chr_names <- function(x){
for (i in 1:length(original_chr_names)){
x$V1 <- gsub(original_chr_names[i], Chr.name[i], x$V1)
}
return(x)
}
head(transcript_bed)
## V1 V2 V3 V4 V5 V6
## 1 TGME49_chrII 17422 22256 TGME49_220840-t26_1 0 -
## 2 TGME49_chrII 23771 24986 TGME49_220850-t26_1 0 -
## 3 TGME49_chrII 23771 30138 TGME49_220860-t26_1 0 +
## 4 TGME49_chrII 30196 34402 TGME49_220870-t26_1 0 -
## 5 TGME49_chrII 34473 38391 TGME49_220880-t26_1 0 +
## 6 TGME49_chrII 37902 41699 TGME49_220890-t26_1 0 -
transcript_bed_new <- function_change_chr_names(transcript_bed)
transcript_bed_new <- transcript_bed_new%>%dplyr::filter(grepl('chr', V1)) #filter rows that contain a certain string 'chr'
#####可以明显看到这里chromosome name 改过来了,例如TGME49_chrII 改成了chrII, chrII会作为后面画图的每一个染色体的x-axis名称
head(transcript_bed_new )
## V1 V2 V3 V4 V5 V6
## 1 TGME49_chrII 17422 22256 TGME49_220840-t26_1 0 -
## 2 TGME49_chrII 23771 24986 TGME49_220850-t26_1 0 -
## 3 TGME49_chrII 23771 30138 TGME49_220860-t26_1 0 +
## 4 TGME49_chrII 30196 34402 TGME49_220870-t26_1 0 -
## 5 TGME49_chrII 34473 38391 TGME49_220880-t26_1 0 +
## 6 TGME49_chrII 37902 41699 TGME49_220890-t26_1 0 -
CDS_bed_new <- function_change_chr_names(CDS_bed)
CDS_bed_new <- CDS_bed_new%>%dplyr::filter(grepl('chr', V1))
####5. Prepare the dataframe for the peaking time of all the AP2s and genomic loci of each AP2
####5.第五步:这里需要准备dot plots(显示每一个AP2基因的转录本表达peak time)的表格
AP2s_transcript_loci <- read.csv("/Users/sidaye/Documents/R/AP2_all_analysis/Input/ME49_AP2_genomicposition.csv")
AP2s_list_source_id <- AP2s_transcript_loci$source_id
AP2s_transcript_loci <- transcript_bed_new[(transcript_bed_new$V4 %in% AP2s_list_source_id),]
max(AP2s_transcript_loci$V3-AP2s_transcript_loci$V2)
#21232
head(AP2s_transcript_loci) ######显示需要画的所有AP2基因的坐标轴
## [1] 21232
####6. Prepare the dataframe for y-axis of dots
####6. 第六步:这里根据每一一副图希望描述的特征和对象不同来进行修改,也就是储存基因特征的表格,这个表格是最终画图的表格
####主要包含信息是每个基因的表达时间(这个可以是用数字表示的任何特征,比如相关性等等)和每个基因的坐标
AP2s_transcript_loci$V4 <- unlist(lapply(strsplit(AP2s_transcript_loci$V4, "-"), "[[", 1))
AP2_loci <- data.frame(AP2 = AP2s_transcript_loci$V4,
start = AP2s_transcript_loci$V2,
end = AP2s_transcript_loci$V3,
chr = AP2s_transcript_loci$V1)
#####
AP2_loci2 <- AP2_loci[c(53, 18:21, 63:67, 32:33, 42:44, 54:62, 22:24, 25:31, 45:52, 7:17, 1:6, 34:41),]
AP2_Peak <- data.frame(chr = AP2s_transcript_loci$V4,
start = AP2s_transcript_loci$V2,
end = AP2s_transcript_loci$V3,
time = rep(0, 67))
AP2_Peak2<-AP2_Peak[c(53, 18:21, 63:67, 32:33, 42:44, 54:62, 22:24, 25:31, 45:52, 7:17, 1:6, 34:41),]
#AP2_Peak2$time <- runif(67, min=0, max=4)
head(AP2_Peak2)
## chr start end time
## 53 TGME49_208020 517009 529250 0
## 18 TGME49_252370 516506 523610 0
## 19 TGME49_253380 904923 917480 0
## 20 TGME49_299150 2145570 2154749 0
## 21 TGME49_299020 2241912 2249335 0
## 63 TGME49_320700 130940 133193 0
AP2_microarray <- read.xlsx("/Users/sidaye/Documents/R/AP2_all_analysis/Input/microarray_cyclic_timing2.xlsx")
AP2_microarray$GeneID<- gsub('-','_', AP2_microarray$GeneID)
AP2_microarray <- AP2_microarray[AP2_microarray$GeneID %in% AP2_Peak2$chr, ]
colnames(AP2_microarray)[1] <- "chr"
#####since the order of AP2 in AP2_Peak2 has already been adjusted to corresponding to the order of chromosom, so it should be fixed in order to make nested zooming figure
AP2_Peak_all <- left_join(AP2_Peak2, AP2_microarray, by="chr")
AP2_Peak_constitutive <- AP2_Peak_all %>% dplyr::filter(constitutive == 1)
AP2_Peak_constitutive_list <- AP2_Peak_constitutive$chr
AP2_Peak_cyclic <- AP2_Peak_all %>% dplyr::filter(cyclic == 1)
AP2_Peak_noexpressed <- AP2_Peak_all[is.na(AP2_Peak_all$expressed),]
AP2_Peak_noexpressed_list <-AP2_Peak_noexpressed$chr
###########for those genes is constitutive, even though it has a peak time, just to make it as a line across the all cell cycle
AP2_Peak2$time <- AP2_Peak_all$peak.time
##########label all the constitutive genes as peaking time "1.11111"
AP2_Peak2 <- AP2_Peak2 %>% dplyr::mutate(time=ifelse(chr %in% AP2_Peak_constitutive_list, 1.11111, time))
######Because the cell cycle is cyclic, so those AP2s have peaking time at 0 or 6 should indicate at both 0 and 6 (C1->G)
AP2_Peak_cyclic <- AP2_Peak2[AP2_Peak2$chr%in%AP2_Peak_cyclic$chr,]
AP2_Peak_GtoC <- AP2_Peak_cyclic %>% dplyr::filter(time == 0)
AP2_Peak_GtoC$time <- rep(6,nrow(AP2_Peak_GtoC))
AP2_Peak_CtoG <- AP2_Peak_cyclic %>% dplyr::filter(time == 6)
AP2_Peak_CtoG$time <- rep(0,nrow(AP2_Peak_CtoG))
AP2_Peak3 <- rbind(AP2_Peak2,AP2_Peak_GtoC,AP2_Peak_CtoG)
head(AP2_Peak3) ####准备好的最终画图表格格式
## chr start end time
## 53 TGME49_208020 517009 529250 1.11111
## 18 TGME49_252370 516506 523610 1.11111
## 19 TGME49_253380 904923 917480 5.50000
## 20 TGME49_299150 2145570 2154749 NA
## 21 TGME49_299020 2241912 2249335 NA
## 63 TGME49_320700 130940 133193 NA
####7.prepare the corresponding dataframe for nested zooming connection in the plot
####第七步.因为我们需要花nested zoom in circos plot,需要做一个另外的table提供nested zoom in的坐标信息
####前三列是最外层一轮的坐标,后三列是里面nested circos plot的坐标,因为nested circos plot外层和里层极坐标不一样
####需要设置关系表格correspondance,来表明两套极坐标的关系
correspondance<- data.frame(chr = AP2s_transcript_loci$V1,
start = AP2s_transcript_loci$V2,
end = AP2s_transcript_loci$V3,
AP2 = AP2s_transcript_loci$V4,
start.1 = AP2s_transcript_loci$V2,
end.1 = AP2s_transcript_loci$V3)
###调整顺序
correspondance2 <- correspondance[c(53, 18:21, 63:67, 32:33, 42:44, 54:62, 22:24, 25:31, 45:52, 7:17, 1:6, 34:41),]
head(correspondance2)
## chr start end AP2 start.1 end.1
## 53 chrIb 517009 529250 TGME49_208020 517009 529250
## 18 chrIII 516506 523610 TGME49_252370 516506 523610
## 19 chrIII 904923 917480 TGME49_253380 904923 917480
## 20 chrIII 2145570 2154749 TGME49_299150 2145570 2154749
## 21 chrIII 2241912 2249335 TGME49_299020 2241912 2249335
## 63 chrIV 130940 133193 TGME49_320700 130940 133193
3.绘图
初始化Circos图
########################################################Starting to plot circos
########################################################Starting to plot circos
########################################################Starting to plot circos
#clear all the previous circos plot
circos.clear()
#set color for text in the circos plot
col_text <- "black"
#set up 12 random colors
# chrIa chrIb chrII chrIII chrIV chrV chrVI chrVIIa chrVIIb chrVIII chrIX
#"#AD2D0533" "#FFC78833" "#1018E833" "#C0670E33" "#6EFFC833" "#95387433" "#E4BF8D33" "#0992C633" "#7E30CB33" "#A1884333" "#3FC7F433"
# chrX chrXI chrXII
#"#34E28D33" "#363ED033" "#78ECDE33"
#chr_bg_color = rand_color(14, transparency = 0.8)
chr_bg_color =c("#AD2D0533" , "#FFC78833", "#1018E833", "#C0670E33", "#6EFFC833", "#95387433", "#E4BF8D33", "#0992C633", "#7E30CB33", "#A1884333","#3FC7F433", "#34E28D33", "#363ED033", "#78ECDE33")
names(chr_bg_color) = ref$Chr.name
#Sectors in `f1()` and `f2()` should be in the same order, or else the connection lines may overlap
f1 = function() {
circos.par("track.height"=0.8,gap.degree=2,start.degree = 90,clock.wise = T,
cell.padding=c(0,0,0,0))
#create circos object, 14=number of chromosome
circos.initialize(factors=ref$Chr.name,
xlim=matrix(c(rep(0,14),ref$Length),ncol=2)
)
#cex,Specify the size of the title text with a numeric value of length 1
#######Track1
#set the chromosome name as track1, set the bg.col as white
#change font by cex
circos.track(ylim=c(0,1),panel.fun=function(x,y) {
Genome=CELL_META$sector.index
xlim=CELL_META$xlim
ylim=CELL_META$ylim
circos.text(mean(xlim),mean(ylim),Genome,cex=3,col=col_text,
facing="outside",niceFacing=TRUE,font =2)
},bg.col="white",bg.border=F,track.height=0.05)
#set gap between tracks
set_track_gap(mm_h(1)) # 0.5cm
#set_track_gap(mm_h(2)) # 2mm
#set track2 and add scale to the track
circos.track(ylim=c(0,1),panel.fun=function(x,y) {
Genome=CELL_META$sector.index
xlim=CELL_META$xlim
ylim=CELL_META$ylim
circos.text(mean(xlim),mean(ylim),Genome,cex=0.03,col="#808080",
facing="outside",niceFacing=TRUE)
},bg.col=chr_bg_color,bg.border=F,track.height=0.02)
#ref check the largest length of the chromosome
#max(ref$Length)
#7353086
#brk <- c(0.0, 1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0)*10^6
#line width=lwd
#circos.track(track.index = get.current.track.index(), panel.fun = function(x, y) {
#circos.axis(h="top",major.at=brk,labels=round(brk/10^6,1),labels.cex=1.2,
#col=col_text,labels.col=col_text,lwd=0.8, labels.facing="clockwise")
#},bg.border=F)
set_track_gap(mm_h(0.5))
##track3 shows the positions for all genes transcript
circos.genomicTrackPlotRegion(
transcript_bed_new, track.height = 0.05, stack = F, bg.border = NA, ylim = c(0,1),
panel.fun = function(region, value, ...) {
circos.genomicRect(region, value, col = 'black', border = NA, ...)
} )
}
circos.yaxis.scale <- as.numeric(c(0, 3, 4.7, 5, 6))
#sector.index for constitutive genes
sector.index.constitutive <-which(AP2_Peak3$chr%in%AP2_Peak_constitutive_list)
sector.index.noexpressed <- which(AP2_Peak3$chr%in%AP2_Peak_noexpressed_list)
###################To change all the GeneID in input dataframe into AP2 names
AP2_geneID2name <- read.xlsx("/Users/sidaye/Documents/R/AP2_all_analysis/Input/AP2_geneID2name.xlsx")
colnames(AP2_geneID2name)[1] <- "chr"
AP2_Peak_all2 <- left_join(AP2_Peak_all,AP2_geneID2name, by="chr")
AP2_Peak_all2$Ap2Name2 <- unlist(lapply(strsplit(AP2_Peak_all2$Ap2Name, 'P2'), '[[',2))
#gap.after is a vector means the gap between each neighboring sectors
#Total track.height =1
#Input dataframe has standardized format:chr, start, end, time
f2 = function() {
#this track shows the position of each AP2s on the genome and the nested zooming track for the peaking time during CDC
circos.par(cell.padding = c(0, 0, 0, 0), gap.after = c(rep(1, nrow(AP2_loci)-1), 10))
#To create new sectors for the second part
circos.genomicInitialize(AP2_loci2, plotType = NULL)
circos.track(ylim=c(0,1),bg.col="white",bg.border=F,track.height=0.18)
AP2_Peak3_new <- AP2_Peak3
AP2_Peak3_new <- AP2_Peak3_new %>% dplyr::mutate(x.center=round((start+end)/2))
sector_indices <- get.all.sector.index()
for (i in 1:67){
circos.text(AP2_Peak3_new$x.center[i], 0.5, AP2_Peak_all2$Ap2Name2[i],
sector.index = sector_indices[i],
track.index = get.current.track.index(),
cex=1.8,
col=ifelse(i %in% sector.index.constitutive,"#d42729" ,ifelse(i %in% sector.index.noexpressed ,"grey16" ,"#0080ff")),
font = 2,
facing="reverse.clockwise",niceFacing=TRUE)}
set_track_gap(cm_h(0.8))
#set_track_gap(mm_h(1.2))
#track.height here change the height of dot block
circos.genomicTrack(AP2_Peak3, ylim = c(0, 6),track.height=0.4,
#the panel.fun argument is used to specify a function that will be called for each sector of a circular plot
panel.fun = function(region, value, ...) {
#here, the sector.index is the name of each AP2 genes
sector_indices <- get.all.sector.index()
rec_colors <- c("#f8c09b", "#e7db9f", "#bfc9a8", "#dea9f9")
circos.rect(CELL_META$cell.xlim[1],0,CELL_META$cell.xlim[2],3, col=rec_colors[1], border = NA)
circos.rect(CELL_META$cell.xlim[1],3,CELL_META$cell.xlim[2],4.7, col=rec_colors[2], border = NA)
circos.rect(CELL_META$cell.xlim[1],4.7,CELL_META$cell.xlim[2],5, col=rec_colors[3], border = NA)
circos.rect(CELL_META$cell.xlim[1],5,CELL_META$cell.xlim[2],6, col=rec_colors[4], border = NA)
for(h in circos.yaxis.scale) {
#CELL_META is an easy way to get meta data in the current cell
#The cell.xlim element specifies the range of x-axis values that are visible within the cell.
circos.lines(CELL_META$cell.xlim, c(h, h), lty = 4, col = "grey16")
}
circos.genomicPoints(region, value,
col = ifelse(value[[1]] == 0|value[[1]] == 6, "#0080ff",ifelse(value[[1]] ==1.11111, "white", "#0080ff")),
pch = 16,
cex = ifelse(value[[1]] == 0|value[[1]] == 6, 1.8,ifelse(value[[1]] ==1.11111, 0.01, 1.8))
)
}, bg.col = "white", track.margin = c(0.02, 0))
#for (i in sector.index.constitutive){
#circos.lines(c(AP2_Peak3_new$x.center[i], AP2_Peak3_new$x.center[i]), CELL_META$cell.ylim,
#sector.index = sector_indices[i],
#track.index = get.current.track.index(),
#lty = 1,
#col ="#d42729")
#}
circos.yaxis(side = "left", at = circos.yaxis.scale,
sector.index = get.all.sector.index()[1], labels = FALSE)
}
#circos.nested shows the zooming connecting parts, the order of sector should be the same as the df "AP2_loci2" and "AP2_Peak3"
circos.nested(f1, f2, correspondance2, connection_col = chr_bg_color[correspondance2[[1]]])
#15inches X 15inches
#legend("right", inset=.02, title="Category",
# c("Constitutive","Not expressed","Cyclic"), fill=c('#d42729', 'grey16', '#377EB8'), horiz=F, cex=0.8)
circos.info()
## No sector has been created
##
## No track has been created
#rand_color : generate random colors in circlize
#circos.par: Parameters for the circular layout
circos.clear()
三、应用案例
1.基因组结构展示
通过展示染色体的排列和基因的位置,帮助理解基因组的整体结构。 例如,展示不同物种或不同个体的基因组结构,比较它们之间的相似性和差异性。
2.基因表达分析
展示基因在不同时间点的表达模式,分析基因表达的动态变化。 例如,展示 AP2 基因在细胞周期不同阶段的表达时间,研究其在细胞周期调控中的作用。
3.基因间相互作用
通过连接线展示基因间的相互作用关系,帮助理解基因调控网络。 例如,展示染色体之间的相互作用、基因之间的调控关系等。