4.13 实战十三 | 10X | 小鼠乳腺上皮细胞
刘小泽写于2020.7.21

1 前言

数据来自Bach et al. (2017),使用的是妊娠期小鼠乳腺上皮细胞 + 10X技术建库

数据准备

1
library(scRNAseq)
2
sce.mam <- BachMammaryData(samples="G_1")
3
sce.mam
4
# class: SingleCellExperiment
5
# dim: 27998 2915
6
# metadata(0):
7
# assays(1): counts
8
# rownames: NULL
9
# rowData names(2): Ensembl Symbol
10
# colnames: NULL
11
# colData names(3): Barcode Sample Condition
12
# reducedDimNames(0):
13
# altExpNames(0):
Copied!

数据初探

1
# 样本信息
2
sapply(names(colData(sce.mam)), function(x) head(colData(sce.mam)[,x]))
3
# Barcode Sample Condition
4
# [1,] "AAACCTGAGGATGCGT-1" "G_1" "Gestation"
5
# [2,] "AAACCTGGTAGTAGTA-1" "G_1" "Gestation"
6
# [3,] "AAACCTGTCAGCATGT-1" "G_1" "Gestation"
7
# [4,] "AAACCTGTCGTCCGTT-1" "G_1" "Gestation"
8
# [5,] "AAACGGGCACGAAATA-1" "G_1" "Gestation"
9
# [6,] "AAACGGGCAGACGCTC-1" "G_1" "Gestation"
Copied!

ID转换

依然是整合行名 + 添加染色体信息
1
library(scater)
2
rownames(sce.mam) <- uniquifyFeatureNames(
3
rowData(sce.mam)$Ensembl, rowData(sce.mam)$Symbol)
4
5
library(AnnotationHub)
6
ens.mm.v97 <- AnnotationHub()[["AH73905"]]
7
rowData(sce.mam)$SEQNAME <- mapIds(ens.mm.v97, keys=rowData(sce.mam)$Ensembl,
8
keytype="GENEID", column="SEQNAME")
9
10
# 总共有13个线粒体基因
11
sum(grepl("MT",rowData(sce.mam)$SEQNAME))
12
# [1] 13
Copied!

2 质控

依然是备份一下,把unfiltered数据主要用在质控的探索上
1
unfiltered <- sce.mam
Copied!
使用线粒体信息进行过滤
1
is.mito <- rowData(sce.mam)$SEQNAME == "MT"
2
stats <- perCellQCMetrics(sce.mam, subsets=list(Mito=which(is.mito)))
3
qc <- quickPerCellQC(stats, percent_subsets="subsets_Mito_percent")
4
5
colSums(as.matrix(qc))
6
## low_lib_size low_n_features high_subsets_Mito_percent
7
## 0 0 143
8
## discard
9
## 143
Copied!
作图
1
colData(unfiltered) <- cbind(colData(unfiltered), stats)
2
unfiltered$discard <- qc$discard
3
4
gridExtra::grid.arrange(
5
plotColData(unfiltered, y="sum", colour_by="discard") +
6
scale_y_log10() + ggtitle("Total count"),
7
plotColData(unfiltered, y="detected", colour_by="discard") +
8
scale_y_log10() + ggtitle("Detected features"),
9
plotColData(unfiltered, y="subsets_Mito_percent",
10
colour_by="discard") + ggtitle("Mito percent"),
11
ncol=3
12
)
Copied!
再看看线粒体含量与文库大小的关系
1
plotColData(unfiltered, x="sum", y="subsets_Mito_percent",
2
colour_by="discard") + scale_x_log10()
Copied!
最后过滤
1
dim(unfiltered);dim(sce.mam)
2
# [1] 27998 2915
3
# [1] 27998 2772
Copied!

3 归一化

使用去卷积的方法
1
library(scran)
2
set.seed(101000110)
3
clusters <- quickCluster(sce.mam)
4
sce.mam <- computeSumFactors(sce.mam, clusters=clusters)
5
sce.mam <- logNormCounts(sce.mam)
Copied!

4 找高变异基因

这里由于是10X的数据,所以会有UMI信息,因此可以用基于泊松分布的模型构建方法
1
set.seed(00010101)
2
dec.mam <- modelGeneVarByPoisson(sce.mam)
3
top.mam <- getTopHVGs(dec.mam, prop=0.1)
Copied!
最后做个图
1
plot(dec.mam$mean, dec.mam$total, pch=16, cex=0.5,
2
xlab="Mean of log-expression", ylab="Variance of log-expression")
3
curfit <- metadata(dec.mam)
4
curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
Copied!

5 降维聚类

降维

1
library(BiocSingular)
2
set.seed(101010011)
3
sce.mam <- denoisePCA(sce.mam, technical=dec.mam, subset.row=top.mam)
4
sce.mam <- runTSNE(sce.mam, dimred="PCA")
5
6
# 检查PC的数量
7
ncol(reducedDim(sce.mam, "PCA"))
8
## [1] 15
Copied!

聚类

有一个很重要的参数是k ,含义是:the number of nearest neighbors used to construct the graph。如果k设置越大,得到的图之间联通程度越高,cluster也越大。因此这个参数也是可以不断尝试的
我们这里由于细胞数量比较多,所以设置的k就比较大,得到的cluster就少而大
1
snn.gr <- buildSNNGraph(sce.mam, use.dimred="PCA", k=25)
2
colLabels(sce.mam) <- factor(igraph::cluster_walktrap(snn.gr)$membership)
3
4
table(colLabels(sce.mam))
5
##
6
## 1 2 3 4 5 6 7 8 9 10
7
## 550 799 716 452 24 84 52 39 32 24
Copied!

作图

1
plotTSNE(sce.mam, colour_by="label")
Copied!
最近更新 1yr ago