3.15 不同R包数据的相互转换
刘小泽写于2020.7.18

1 前言

单细胞数据格式目前有这么几大派:
    Bioconductor主导的SingleCellExperiment数据格式:例如scran、scater、monocle(尽管它的对象不直接使用SingleCellExperiment,但灵感来源于SingleCellExperiment,并且操作也是类似的)
    Seurat:SeuratObject格式
    scanpy:AnnData格式
这么一来,很多分析流程就被固定在某个包中了,比如使用Seurat会一用到底,也不会去学习scater或其他R包了,但也许就错过了其他R包好用的一些功能(比如我感觉scateruniquifyFeatureNames就很好用)
既然有需求,就有开发者添加功能 ,这里Davis McCarthy 和Alex Wolf就为Seurat添加了和其他数据类型转换的函数

2 Seurat与SingleCellExperiment的相互转换

1
library(scater)
2
# devtools::install_github(repo = "satijalab/seurat", ref = "loom")
3
library(loomR)
4
library(Seurat)
5
library(patchwork)
Copied!

2.1 Seurat转SingleCellExperiment

1
# 使用Seurat内置数据
2
data("pbmc_small")
3
> pbmc_small
4
An object of class Seurat
5
230 features across 80 samples within 1 assay
6
Active assay: RNA (230 features)
7
2 dimensional reductions calculated: pca, tsne
8
9
# 一个函数即可
10
pbmc.sce <- as.SingleCellExperiment(pbmc_small)
11
> pbmc.sce
12
class: SingleCellExperiment
13
dim: 230 80
14
metadata(0):
15
assays(2): counts logcounts
16
rownames(230): MS4A1 CD79B ... SPON2 S100B
17
rowData names(5): vst.mean vst.variance
18
vst.variance.expected
19
vst.variance.standardized vst.variable
20
colnames(80): ATGCCAGAACGACT CATGGCCTGTGCAT ...
21
GGAACACTTCAGAC CTTGATTGATCTTC
22
colData names(8): orig.ident nCount_RNA ...
23
RNA_snn_res.1 ident
24
reducedDimNames(2): PCA TSNE
25
spikeNames(0):
26
altExpNames(0):
27
28
# 接下来就是scater的操作了
29
p1 <- plotExpression(pbmc.sce, features = "MS4A1", x = "ident") + theme(axis.text.x = element_text(angle = 45,
30
hjust = 1))
31
p2 <- plotPCA(pbmc.sce, colour_by = "ident")
32
p1 + p2
Copied!

2.2 SingleCellExperiment转Seurat

1
# 导入sce对象(https://scrnaseq-public-datasets.s3.amazonaws.com/scater-objects/manno_human.rds)
2
manno <- readRDS(file = "manno_human.rds")
3
> manno
4
class: SingleCellExperiment
5
dim: 20560 4029
6
metadata(0):
7
assays(2): counts logcounts
8
rownames(20560): 'MARC1' 'MARC2' ... ZZEF1 ZZZ3
9
rowData names(10): feature_symbol
10
is_feature_control ... total_counts
11
log10_total_counts
12
colnames(4029): 1772122_301_C02 1772122_180_E05
13
... 1772116-063_G02 1772099-259_H03
14
colData names(34): Species cell_type1 ...
15
pct_counts_ERCC is_cell_control
16
reducedDimNames(0):
17
altExpNames(0):
18
19
manno <- runPCA(manno)
20
# 转为seurat对象
21
manno.seurat <- as.Seurat(manno, counts = "counts", data = "logcounts")
22
23
# 看下这个函数
24
# as.Seurat(
25
# x,
26
# counts = "counts",
27
# data = "logcounts",
28
# assay = "RNA",
29
# project = "SingleCellExperiment",
30
# ...
31
# )
32
# 既然有默认参数,因此直接按下面这么写就可以:
33
manno.seurat <- as.Seurat(manno)
34
35
> manno.seurat
36
An object of class Seurat
37
20560 features across 4029 samples within 1 assay
38
Active assay: RNA (20560 features)
39
1 dimensional reduction calculated: PCA
40
41
Idents(manno.seurat) <- "cell_type1"
42
p1 <- DimPlot(manno.seurat, reduction = "PCA", group.by = "Source") + NoLegend()
43
p2 <- RidgePlot(manno.seurat, features = "ACTB", group.by = "Source")
44
p1 + p2
Copied!

3 Seurat与loom的相互转换

还记得上次在单细胞交响乐16-处理大型数据中说到:处理大型数据遇到内存不足时,可以使用这个HDF5ArrayR包(类似的还有 bigmemory, matter),它会将底层数据做成HDF5格式,用硬盘空间来存储数据,必要时再调用一部分数据到内存。loom格式就是处理HDF5使用的

3.1 Seurat转为loom

1
pbmc.loom <- as.loom(pbmc, filename = "pbmc3k.loom", verbose = FALSE)
2
pbmc.loom
3
## Class: loom
4
## Filename: /__w/1/s/output/pbmc3k.loom
5
## Access type: H5F_ACC_RDWR
6
## Attributes: version, chunks, LOOM_SPEC_VERSION, assay, last_modified
7
## Listing:
8
## name obj_type dataset.dims dataset.type_class
9
## col_attrs H5I_GROUP <NA> <NA>
10
## col_graphs H5I_GROUP <NA> <NA>
11
## layers H5I_GROUP <NA> <NA>
12
## matrix H5I_DATASET 2638 x 13714 H5T_FLOAT
13
## row_attrs H5I_GROUP <NA> <NA>
14
## row_graphs H5I_GROUP <NA> <NA>
15
16
# 最后使用完要记得关上loom对象
17
pbmc.loom$close_all()
Copied!

3.2 loom转为Seurat

首先读取:用 loomR 的connect
1
l6.immune <- connect(filename = "../data/l6_r1_immune_cells.loom", mode = "r")
2
l6.immune
3
## Class: loom
4
## Filename: /__w/1/s/data/l6_r1_immune_cells.loom
5
## Access type: H5F_ACC_RDONLY
6
## Attributes: CreationDate, last_modified
7
## Listing:
8
## name obj_type dataset.dims dataset.type_class
9
## col_attrs H5I_GROUP <NA> <NA>
10
## col_graphs H5I_GROUP <NA> <NA>
11
## layers H5I_GROUP <NA> <NA>
12
## matrix H5I_DATASET 14908 x 27998 H5T_FLOAT
13
## row_attrs H5I_GROUP <NA> <NA>
14
## row_graphs H5I_GROUP <NA> <NA>
Copied!
然后转换
1
l6.seurat <- as.Seurat(l6.immune)
2
VlnPlot(l6.seurat, features = c("Sparc", "Ftl1", "Junb", "Ccl4"), ncol = 2, pt.size = 0.1)
Copied!
最后处理完,记得关闭loom文件
1
l6.immune$close_all()
Copied!

3.3 补充

如果使用Seurat V2,还有一个自带的函数Convert
1
data("pbmc_small")
2
pbmc_small
3
pfile <- Convert(from = pbmc_small, to = "loom", filename = "pbmc_small.loom",
4
display.progress = FALSE)
5
pfile
6
## Class: loom
7
## Filename: /home/paul/Documents/Satija/pbmc_small.loom
8
## Access type: H5F_ACC_RDWR
9
## Attributes: version, chunks
10
## Listing:
11
## name obj_type dataset.dims dataset.type_class
12
## col_attrs H5I_GROUP <NA> <NA>
13
## col_graphs H5I_GROUP <NA> <NA>
14
## layers H5I_GROUP <NA> <NA>
15
## matrix H5I_DATASET 80 x 230 H5T_FLOAT
16
## row_attrs H5I_GROUP <NA> <NA>
17
## row_graphs H5I_GROUP <NA> <NA>
Copied!

4 Scanpy转Seurat

Seurat有一个函数ReadH5AD可以读取AnnData的H5AD文件
1
pbmc3k <- ReadH5AD(file = "pbmc3k.h5ad")
2
# 利用Seurat操作
3
Idents(pbmc3k) <- "louvain"
4
p1 <- DimPlot(pbmc3k, label = TRUE)
5
p2 <- VlnPlot(pbmc3k, features = c("CST3", "NKG7", "PPBP"), combine = FALSE)
6
wrap_plots(c(list(p1), p2), ncol = 2) & NoLegend()
Copied!
目前还不能直接将Seurat写成H5AD文件,因此不能之间将Seurat转为Scanpy;但是可以将loom文件作为桥梁实现Seurat转Scanpy,例如Scanpy 有一个函数scanpy.read_loom()
最近更新 1yr ago