作为服务器系统,linux包括内核及其应用程序。有了这个linux这个内核,我们还需远程终端工具(外壳shell)来帮助我们实现数据存储和处理等。finalshell和xshell都是常用的远程终端工具,都可以作为SSH连接工具,操作服务器中的各种资源!
在实际操作中,我们使用的是Xshell,所以就以Xshell为例分享远程终端工具的使用方法。Xshell是一款强大的SSH、TELNET和RLOGIN终端仿真软件,适用于Windows用户安全地访问Linux主机。使用服务器时,通过Xshell远程连接就可以操作服务器了。
第一次登录需要输入IP地址、端口、用户名和密码等,第二次及其以后的登录都比较方便。然后,我们就可以在Xshell上面创建文件,从公共网站下载数据,对下载的上游测序数据进行过滤(如去除低质量数据,去接头)和对比等操作。这次我们分析肝细胞癌和癌旁组织中MAIT细胞的表达差异。
在针对原始测序数据分析的实战中,我们发现,通过linux分析得到的差异基因数目,有时远远用多于GEO官网提供的、已经标准化的数据差异分析得来的细胞数目。当然,因为linux分析得到的差异基因,既有编码基因,也有非编码基因;而标准化的数据往往只关注编码基因,或者编码基因,或者由于序列比对算法不同而导致的。如果不进行上游分析,我们也可以下载GSE106830的标准化数据,然后进行差异分析等。
## 用别人的数据发文章,使用原始测序数据还是已经标准化处理的数据?########
## 数据来源于NCBI GEO官网,使用tidyverse包清洗数据,DESeq2算法获得差异基因
## 差异分析
{
## 下载数据,加载数据 #######################
library(tidyverse)
library(DESeq2)
library(pheatmap) # 用于制作热图
library(ggplot2) # 用于作图的包
library(clusterProfiler)
gset <- read_csv(file = "GSE106830.csv.gz", skip = 0) # 数据下载自NCBI GEO
colnames(gset) <- c("ENSEMBL","P1","T1","P2","T2","P3","T3","H1","H2")
## 探针ID转换
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)
Ensembl_ID <- gset$ENSEMBL # 查看org.Hs.eg.db 包提供的转换类型
symbol <- bitr(Ensembl_ID, fromType="ENSEMBL", # 采用bitr()函数进行转换
toType=c("SYMBOL", "ENTREZID"),
OrgDb="org.Hs.eg.db")
data <- inner_join(symbol,gset, by = "ENSEMBL")
## 筛选表达数据,用aggregate函数,取最大值
data <- aggregate( . ~ SYMBOL, data = data, max)
## 数据清洗,获得行为基因名,列为样本名的表达矩阵
rownames(data) <- data$SYMBOL
exprset <- data[,c(4:9)]
exprSet = as.data.frame(lapply(exprset,as.numeric),
stringsAsFactors = F)
rownames(exprSet) <- rownames(exprset) # str(exprSet)
countData <- exprSet[rowMeans(exprSet) > 1,] # 去除表达量过低的基因
## 样本分组信息
ctrl <- countData[,c(1,3,5)]
tumor <- countData[,c(2,4,6)]
exprSet1 <- cbind(ctrl,tumor)
condition <- factor(c(rep("ctrl",3),rep("tumor",3)), levels = c("ctrl","tumor"))
colData <- data.frame(row.names=colnames(exprSet1), condition)
## 差异分析
#第一步,构建 DESeqDataSet 对象
dds <- DESeqDataSetFromMatrix(countData = exprSet1, colData = colData, design= ~condition)
#第二步,计算差异倍数并获得 p 值
dds1 <- DESeq(dds, fitType = 'mean', minReplicatesForReplace = 7, parallel = TRUE)
res <- results(dds1, contrast = c('condition', 'tumor', 'ctrl'))
res1 = na.omit(res)
allDiff = as.data.frame(res1)
colnames(allDiff) <- c("baseMean","logFC","lfcSE","t","P.Value","padj")
## 获得差异表达矩阵
logFoldChange = 1
P = 0.05
diffSig <- allDiff[with(allDiff, (abs(logFC)>logFoldChange & P.Value < P )), ]
save(allDiff,diffSig, file = "data.Rdata")
}
## 火山图 ########
{
xMax <- max(-log10(allDiff$P.Value))
yMax <- max(abs(allDiff$logFC))
allDiff$change <- ifelse(allDiff$P.Value < 0.05 & abs(allDiff$logFC) > 1,
ifelse(allDiff$logFC > 1,'UP','DOWN'),
'NOT')
table(allDiff$change)
##数据整理和条件设置
data1 <- allDiff %>%
rownames_to_column("Genes") #行名转为Genes为列名的一列
data2 <- data1 %>%
mutate(regulate = case_when(logFC >= 1 & P.Value <= 0.05 ~ "up",
logFC <= -1 & P.Value <= 0.05 ~ "down",
TRUE ~ "NS"))
# 基本绘图
ggplot(data2,aes(logFC,-log10(P.Value)))+
geom_point()+
labs(x=expression(Log[2]*" Fold Change"),
y=expression(-Log[10]*" (p value)")) #修改坐标轴命名细节
library(ggrepel) # 注释基因
data2$selectedgene <- ifelse(data2$padj < 0.05 & abs(data2$logFC) > 1 ,data2$Genes,NA)
# 美化绘图
ggplot(data2,aes(logFC,-log10(P.Value), #分别给正负显著变化的基因在图中根据颜色、大小标注出来
color=factor(regulate),
size=factor(regulate)))+
geom_point()+
labs(x=expression(Log[2]*" Fold Change"),
y=expression(-Log[10]*" (p value)"))+
theme_grey(base_size = 15)+
scale_color_manual(values = c('blue','grey','red'))+
scale_size_manual(values = c(2,1,2))+
geom_hline(yintercept = -log10(0.05),linetype = 2,cex = 1)+ #添加辅助线
geom_vline(xintercept = c(- 1, 1),linetype = 2,cex = 1)+
theme(legend.title = element_blank(), #图例的设置参数
legend.position = "right", #标签位置为right
legend.background = element_rect(fill='transparent'))+
#用ggrepel包给选择的基因加上文本标签
geom_text_repel(aes(label=selectedgene), color="black",size= 2.5,
box.padding=unit(0.5, "lines"),
point.padding=NA,
segment.colour = "black")
}
## GO和KEGG富集分析 ########
{
library(clusterProfiler)
library(ggplot2)
library(enrichplot)
library(GOplot)
library(DOSE)
library(stringr)
#### 读取数据或直接使用前面分析的数据data1
Data <- diffSig
Data <- Data %>%
rownames_to_column("Genes")
gene <- bitr(Data$Genes,
fromType = 'SYMBOL',
toType = 'ENTREZID',
OrgDb = 'org.Hs.eg.db') #基因名ID转换,把基因名转换成ENTREZID
#### GO富集分析
GO <- enrichGO(
gene$ENTREZID,
OrgDb = 'org.Hs.eg.db',
keyType = "ENTREZID",
ont = "ALL",
pvalueCutoff = 0.01, ##p值
pAdjustMethod = "BH",
qvalueCutoff = 0.05, ##q值
minGSSize = 50, ##最少基因数目
maxGSSize = 500, ##最多基因数目
readable = TRUE)
#### KEGG富集分析
KEGG <- enrichKEGG(
gene$ENTREZID,
organism = "hsa",
keyType = "kegg",
pvalueCutoff = 0.5,
pAdjustMethod = "BH",
minGSSize = 10,
maxGSSize = 500,
qvalueCutoff = 0.5,
use_internal_data = FALSE)
#### GO富集分析绘图
barplot(GO, split="ONTOLOGY",showCategory = 5,title = 'GO Pathway')+
facet_grid(ONTOLOGY~., scale="free") +
scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 35)) ##避免字体重合
dotplot(GO, split="ONTOLOGY",showCategory = 5,title = 'GO Pathway')+
facet_grid(ONTOLOGY~., scale="free") +
scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 35))
#### KEGG富集分析绘图
barplot(KEGG,showCategory = 10,title = 'TOP10 KEGG Pathway') +
scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 35))
dotplot(KEGG,showCategory = 10,title = 'TOP10 KEGG Pathway') +
scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 35))
}
## GSEA富集分析##########
{
## 使用clusterProfiler包进行GSEA分析##########
# 加载所需的R包
library(org.Hs.eg.db) # human的OrgDB
library(clusterProfiler)
library(msigdbr)
library(enrichplot)
library(ggplot2)
# ID转化
gene_entrezid <- bitr(geneID = rownames(allDiff),
fromType = "SYMBOL",
toType = "ENTREZID", # 转成ENTREZID
OrgDb = "org.Hs.eg.db")
head(gene_entrezid)
gene_entrezid$logFC <- allDiff$logFC[match(gene_entrezid$SYMBOL, rownames(allDiff))]
genelist = gene_entrezid[,3]
names(genelist) = gene_entrezid$ENTREZID
genelist <- sort(genelist, decreasing = TRUE)
m_t2g <- msigdbr(species = "Homo sapiens", category = "H") %>%
dplyr::select(gs_name, entrez_gene)
head(m_t2g)
gsea_res <- GSEA(geneList = genelist,
TERM2GENE = m_t2g,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "BH"
)
gsea_res[[gsea_res$ID[[1]]]] # 第一个条目的所有基因
## 数据可视化,峰峦图
ridgeplot(gsea_res,
showCategory = 10,
fill = "pvalue", #填充色 "pvalue", "p.adjust", "qvalue"
core_enrichment = TRUE, #是否只使用 core_enriched gene
label_format = 30,
orderBy = "NES",
decreasing = FALSE) +
theme(axis.text.y = element_text(size = 10))
ids <- gsea_res@result$ID[1:5]
gseadist(gsea_res, # boxplot
IDs = ids,
type = "density") +
theme(legend.direction = "vertical")
## 排序
gsearank(gsea_res,
geneSetID = 1 # 要展示的基因集
)
## 绘图
gseaplot(gsea_res, geneSetID = 1, by = "runningScore",
title = gsea_res$Description[1])
gseaplot(gsea_res, geneSetID = 1, by = "preranked",
title = gsea_res$Description[1]) +
theme(plot.title = element_text(size = 10, color = "blue"))
# 取子集绘图
p <- gseaplot(gsea_res, geneSetID = 1, title = gsea_res$Description[1])
p
# 取子集进行修改
p[[1]] <- p[[1]]+theme(plot.title = element_text(size = 6))
p
# 默认subplots = 1:3,把3个图放一起
gseaplot2(gsea_res,geneSetID = 1,title = "title",
subplots = 1:3,
base_size = 10)
gseaplot2(gsea_res, geneSetID = 1, subplots = 1)
gseaplot2(gsea_res, geneSetID = 1, subplots = 1:2)
#把entrezid变为symbol
gsea_res_symbol <- setReadable(gsea_res, "org.Hs.eg.db", "ENTREZID")
p <- gseaplot2(gsea_res_symbol,geneSetID = 1,
title = gsea_res_symbol$Description[1])
p[[1]] <- p[[1]]+
theme(title = element_text(color = "red"))
p
## 展示多条通路
tmp <- as.data.frame(gsea_res_symbol)
colnames(tmp)
p <- gseaplot2(gsea_res, geneSetID = 1:6)
p
}