3.8 批次效应处理
刘小泽写于2020.7.7
1 前言
数据准备
# 数据下载(数据我已做好,链接在:https://share.weiyun.com/iE0VjVD0)
library(TENxPBMCData)
all.sce <- list(
pbmc3k=TENxPBMCData('pbmc3k'),
pbmc4k=TENxPBMCData('pbmc4k'),
pbmc8k=TENxPBMCData('pbmc8k')
)
# 批量质控
library(scater)
stats <- high.mito <- list()
for (n in names(all.sce)) {
current <- all.sce[[n]] #接下来的每步还是和之前一样
is.mito <- grep("MT", rowData(current)$Symbol_TENx)
stats[[n]] <- perCellQCMetrics(current, subsets=list(Mito=is.mito))
high.mito[[n]] <- isOutlier(stats[[n]]$subsets_Mito_percent, type="higher")
all.sce[[n]] <- current[,!high.mito[[n]]]
}
# 批量归一化
all.sce <- lapply(all.sce, logNormCounts)
# 批量找HVGs
library(scran)
all.dec <- lapply(all.sce, modelGeneVar)
all.hvgs <- lapply(all.dec, getTopHVGs, prop=0.1)
> length(all.hvgs[[1]])
[1] 1001
> length(all.hvgs[[2]])
[1] 1352
# 批量降维(使用了一个更高级的mapply:Apply a Function to Multiple List or Vector Arguments)
library(BiocSingular)
set.seed(10000)
all.sce <- mapply(FUN=runPCA, x=all.sce, subset_row=all.hvgs,
MoreArgs=list(ncomponents=25, BSPARAM=RandomParam()),
SIMPLIFY=FALSE)
set.seed(100000)
all.sce <- lapply(all.sce, runTSNE, dimred="PCA")
set.seed(1000000)
all.sce <- lapply(all.sce, runUMAP, dimred="PCA")
# 批量聚类
for (n in names(all.sce)) {
g <- buildSNNGraph(all.sce[[n]], k=10, use.dimred='PCA')
clust <- igraph::cluster_walktrap(g)$membership
colLabels(all.sce[[n]]) <- factor(clust)
}
# 把数据提取出来
pbmc3k <- all.sce$pbmc3k
dec3k <- all.dec$pbmc3k
pbmc4k <- all.sce$pbmc4k
dec4k <- all.dec$pbmc4k为了方便下面的批次矫正,先做几件事
2 检查批次效应
合并数据

进行PCA
再看聚类分群结果
看t-SNE结果

3 矫正批次效应之线性回归

4 MNN矫正
4.1 先了解一下这个方法
4.2 还是应用到PBMC数据
4.3 矫正后的检查

5 更为细致的检查
5.1 同一批次内clusters的比较

5.2 利用marker基因检查

6 矫正后数据的应用
TODO:对13.7 Using the corrected values继续补充
最后更新于