之前我们分享过seurat对象中counts、data、scale.data区别的区别:
seurat单细胞分析中:counts、data、scale.data区别、联系与R语言实现
今天我们来看看如何把data逆转为count矩阵?有可能实现吗?
#2在seurat_v5文件夹下安装v5---
.libPaths(
c(
# '/home/rootyll/seurat_v5/',
"/usr/local/lib/R/site-library",
"/usr/lib/R/site-library",
"/usr/lib/R/library"
)
)
library(Seurat)
library(dplyr)
dir.create("~/gzh/20240904_data转为count")
setwd('~/gzh/20240904_data转为count')
pbmc = readRDS('~/gzh/pbmc3k_final.rds')
DimPlot(pbmc,label = TRUE)
pbmc=CreateSeuratObject(counts = pbmc@assays$RNA@counts[1:100,1:100])
pbmc=NormalizeData(pbmc, normalization.method = "LogNormalize",
scale.factor = 10000,
margin = 1 #If performing CLR normalization, normalize across features (1) or cells (2)
)
#counts slot------
counts=GetAssayData(pbmc,slot = "counts") ;tail(counts)[,1:9]
data=GetAssayData(pbmc,slot = "data") ;tail(data)[,1:9]
data_lognorm=LogNormalize(counts);tail(data_lognorm)[,1:9]
# View(Seurat:::LogNorm)
# mydata= log( counts/colSums(counts) *10000 +1) ;tail(mydata)[,1:9] #这个不太对,应该+1的位置
# mydata2=log1p( counts/colSums(counts) *10000 ) ;tail(mydata2)[,1:9]
# mydata1=log1p( counts/colSums(counts) *10000 ) ;tail(mydata1)[,1:9]
# mydata =apply(counts, 2, function(x){ log( x/sum(x)*1e4 +1) }) ;tail(mydata )[,1:9] #R语言实现该过程
#换种方式实现
mydata =apply(counts, 2, function(x){ log1p( x/sum(x) *10000) }) ;tail(mydata )[,1:9] #R语言实现该过程
tail(data)[,1:9]
如何把mydata转回count呢?
##如何把mydata转回count呢?
# 假设sum_counts是之前计算出的每列总和
sum_counts <- colSums(exp(mydata) - 1) / 10000 ;tail(sum_counts )
# 还原counts
mycounts <- apply(mydata, 2, function(y) {
(exp(y) - 1) * sum_counts
}) ;tail( mycounts )[,1:9]
tail( counts )[,1:9]
#method2
{
# 假设mydata是已经经过log1p标准化后的数据矩阵
# 使用expm1进行逆运算
counts_restored <- apply(mydata, 2, function(y) {
expm1(y) * sum(expm1(y)) / 10000
})
counts_restored_sparse <-Matrix:: Matrix(counts_restored, sparse = TRUE)
tail(counts_restored_sparse)[,1:9]
tail(mycounts)[,1:9]
}
#method-
{
exp(2)
# 假设mydata已经定义
sum_counts <- colSums(exp(mydata) - 1)
# 恢复counts
counts_restored <- apply(mydata, 2, function(y) {
(exp(y) - 1) * sum_counts / 10000
})
# 稀疏矩阵
counts_restored_sparse <-Matrix:: Matrix(counts_restored, sparse = TRUE)
tail(counts_restored_sparse)[,1:9]
tail(counts)[,1:9]
tail(log1p(counts_restored_sparse))
log1p(counts_restored_sparse)[81:91,81:91]
data[81:91,81:91]
identical(log1p(counts_restored_sparse),data)
}
我们发现手动逆转的矩阵和原来的data矩阵 数值相差很大!
但是,虽然数值相差大,但是它们确是强相关性!(不知道这种强相关是否有意义)
mycounts[is.na(mycounts)] <- 0
cor(as.matrix(counts)[,1], mycounts [,1] )
colnames(mycounts)=paste0(colnames(mycounts),"_mycounts")
tail(log1p(mycounts))[ ,1:9]
tail(counts)[,1:9]
# 计算原始counts和恢复的counts之间的相关性
cor_matrix <- cor(as.matrix(counts), as.matrix(mycounts))
# 绘制热图
cor_matrix[is.na(cor_matrix)]=0
pheatmap::pheatmap(cor_matrix,
main = "Correlation between Original and Restored Counts",
)
可以看到,虽然数值不同,但是相关性极强。同时也可以看到有部分细胞的相关性也很强,有可能是同一类细胞亚群。
我得结论:data不能逆转为和原始一样的count矩阵。你有更好的办法吗?欢迎留言。