Creating MAMS from AnnData

Introduction

Matrix and Analysis Metadata Standards (MAMS) can be used to capture the relevant information about the data matrices and annotations that are produced during common and complex analysis workflows for single-cell data.

This workflow shows how an RNA expression-based experiment stored in an AnnData object can be annotated using the rmams package. For this purpose, we use a pre-processed PBMC3K AnnData object that has been passed through a standard Python scanpy workflow.

The data object (scverse-getting-started-anndata-pbmc3k_processed.h5ad) is available from figshare [1].

This workflow focuses on producing data and results that are commonly generated during real-world analyses and that can be annotated with MAMS. For more information about MAMS, see the GitHub repository (https://github.com/single-cell-mams/mams).

Step 1: Import AnnData object

library(reticulate)
library(anndata)

# Read the processed PBMC3K AnnData file shipped in the package's extdata
# directory (path is relative to the vignette's location).
h5ad_path <- file.path("..", "inst", "extdata", "pbmc3k_annData_processed.h5ad")
adata <- anndata::read_h5ad(h5ad_path)

Step 2: Creating the MAMS object from the AnnData object

library(rmams)

# Paths of the AnnData (.h5ad) files to annotate.
file_paths <- c("../inst/extdata/pbmc3k_annData_processed.h5ad")

# Derive one name per object from its file name, dropping the ".h5ad"
# extension so the name identifies the object rather than the file.
file_names <- sub(
  pattern = "\\.h5ad$",
  replacement = "",
  x = basename(file_paths)
)

# Read every file into an AnnData object.
object_list <- lapply(file_paths, anndata::read_h5ad)

# Name the list entries with the file names generated above.
names(object_list) <- file_names

# Observation subset label (one per object) and the dataset identifier; both
# are handed to convert_AnnData_to_MAMS().
observational_subsets <- c("nonartifact")
datasetid <- "PBMC3K"

Create MAMS object

# Build the MAMS object from the named list of AnnData objects, using the
# subset label and dataset id defined earlier.
result <- convert_AnnData_to_MAMS(
  object_list = object_list,
  observation_subsets = observational_subsets,
  X_processing = "logcounts",
  dataset_id = datasetid
)

Accessing data

# Read the "dataset_id" field of the first feature-observation matrix ("fom1").
fom(result, "fom1", "dataset_id")
#> [1] "PBMC3K"

Step 3: Validating MAMS object

# Deliberately blank out dataset_id on fom1 so that the validator has a
# missing field to report; it is restored in the next step.
fom(result, "fom1", "dataset_id") <- ""

# Validate the MAMS object; missing fields and empty slots are reported as
# warnings/errors.
check_MAMS(mams_object = result)
#> Warning: Please use the setter functions to add the missing field information for the following slots: 
#> (Ex. If missing 'filepath' for fom1, then 'fom(mams = mams_object, fom_id = 'fom1', key = 'filepath') <- ...')
#> :
#> Warning in check_MAMS(mams_object = result): fom1 is missing the fields:
#>  [1] "representation_description"      "obs_unit"                       
#>  [3] "processing_description"          "analyte_description"            
#>  [5] "obs_subset"                      "obs_subset_description"         
#>  [7] "feature_subset"                  "feature_subset_description"     
#>  [9] "record_id"                       "parent_id"                      
#> [11] "parent_relationship"             "parent_relationship_description"
#> [13] "fid"                             "obs"                            
#> [15] "fea"
#> Warning in check_MAMS(mams_object = result):
#> Warning in check_MAMS(mams_object = result): fom2 is missing the fields:
#>  [1] "representation_description"      "obs_unit"                       
#>  [3] "processing"                      "processing_description"         
#>  [5] "analyte_description"             "obs_subset"                     
#>  [7] "obs_subset_description"          "feature_subset"                 
#>  [9] "feature_subset_description"      "record_id"                      
#> [11] "parent_id"                       "parent_relationship"            
#> [13] "parent_relationship_description" "fid"                            
#> [15] "obs"                             "fea"
#> Warning in check_MAMS(mams_object = result):
#> Warning in check_MAMS(mams_object = result): fom3 is missing the fields:
#>  [1] "representation_description"      "obs_unit"                       
#>  [3] "processing_description"          "analyte_description"            
#>  [5] "obs_subset_description"          "feature_subset"                 
#>  [7] "feature_subset_description"      "record_id"                      
#>  [9] "parent_id"                       "parent_relationship"            
#> [11] "parent_relationship_description" "fid"                            
#> [13] "obs"                             "fea"
#> Warning in check_MAMS(mams_object = result):
#> Warning in check_MAMS(mams_object = result): fom4 is missing the fields:
#>  [1] "representation_description"      "obs_unit"                       
#>  [3] "processing_description"          "analyte_description"            
#>  [5] "obs_subset_description"          "feature_subset"                 
#>  [7] "feature_subset_description"      "record_id"                      
#>  [9] "parent_id"                       "parent_relationship"            
#> [11] "parent_relationship_description" "fid"                            
#> [13] "obs"                             "fea"
#> Warning in check_MAMS(mams_object = result):
#> Warning in check_MAMS(mams_object = result): fom5 is missing the fields:
#>  [1] "representation_description"      "obs_unit"                       
#>  [3] "processing_description"          "analyte_description"            
#>  [5] "obs_subset_description"          "feature_subset"                 
#>  [7] "feature_subset_description"      "record_id"                      
#>  [9] "parent_id"                       "parent_relationship"            
#> [11] "parent_relationship_description" "fid"                            
#> [13] "obs"                             "fea"
#> Warning in check_MAMS(mams_object = result):
#> Error: Please use the setter functions to add the missing field information for the following slots: 
#> (Ex. If missing 'filepath' for fom1, then 'fom(mams = mams_object, fom_id = 'fom1', key = 'filepath') <- ...')
#> :
#> Warning in check_MAMS(mams_object = result): fom1 is missing the fields:
#> [1] "dataset_id"
#> Warning in check_MAMS(mams_object = result):
#> Warning in check_MAMS(mams_object = result): The following slot is empty: ONG
#> Warning in check_MAMS(mams_object = result): The following slot is empty: FEA
#> Warning in check_MAMS(mams_object = result): The following slot is empty: OBS
#> Warning in check_MAMS(mams_object = result): The following slot is empty: FID
#> Warning in check_MAMS(mams_object = result): The following slot is empty: OID
#> Warning in check_MAMS(mams_object = result): The following slot is empty: REC
#> Warning in check_MAMS(mams_object = result): The following slot is empty: FNG

Step 4: Updating fields manually

# Restore the dataset id on the first FOM.
fom(result, fom_id = "fom1", key = "dataset_id") <- "PBMC3K"

# Reduced dimensions are annotated as "Reductions" by default; label the UMAP
# as an "Embedding" instead.
# NOTE(review): the validation output above lists fom1-fom5 only -- confirm
# that "fom6" exists and is the UMAP matrix.
fom(result, fom_id = "fom6", key = "processing") <- "Embedding"

We can add some commands as well, e.g. the record for the generation of counts matrix:

# Describe the command that generated the counts matrix.
record_id <- "CellRanger.count"
record_package_name <- "CellRanger"
record_function_name <- "count"
record_package_version <- "unknown"

# Store the record in the REC slot, keyed by its record id.
result@REC[[record_id]] <- create_REC_object(
  record_package_name = record_package_name,
  record_function_name = record_function_name,
  record_package_version = record_package_version
)

Step 5: Store mams object to AnnData

# Convert the MAMS object to a list and store it under the "MAMS" key of the
# AnnData object's `uns` field.
adata$uns[["MAMS"]] <- convert_MAMS_to_list(result)

Step 6: Saving MAMS object to JSON

Here we show how you can convert the MAMS object to the JSON format for export. MAMS and JSON formats are interchangeable.

# Export the MAMS object as a JSON file in the package's extdata directory.
write_MAMS_to_output(
  MAMS = result,
  filepath = "../inst/extdata/pbmc3k_rna_adata_mams.JSON",
  format = "JSON"
)

Step 7: Saving MAMS object to YAML

# Export the same MAMS object as a YAML file in the package's extdata directory.
write_MAMS_to_output(
  MAMS = result,
  filepath = "../inst/extdata/pbmc3k_rna_adata_mams.yml",
  format = "YAML"
)

References:

[1] Lause, Jan (2023). scverse tutorial data: Getting started with AnnData. figshare. Dataset. https://doi.org/10.6084/m9.figshare.22577536.v2

Session Info

# Print the R version, platform and package versions used for this run.
sessionInfo()
#> R version 4.4.1 (2024-06-14)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
#>  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
#>  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#> [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#> 
#> time zone: UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] rmams_1.0.1       anndata_0.7.5.6   reticulate_1.38.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] Matrix_1.7-0        future.apply_1.11.2 jsonlite_1.8.8     
#>  [4] compiler_4.4.1      Rcpp_1.0.13         SeuratObject_5.0.2 
#>  [7] parallel_4.4.1      assertthat_0.2.1    jquerylib_0.1.4    
#> [10] globals_0.16.3      systemfonts_1.1.0   textshaping_0.4.0  
#> [13] png_0.1-8           yaml_2.3.10         fastmap_1.2.0      
#> [16] lattice_0.22-6      here_1.0.1          R6_2.5.1           
#> [19] generics_0.1.3      knitr_1.48          htmlwidgets_1.6.4  
#> [22] dotCall64_1.1-1     future_1.34.0       bookdown_0.40      
#> [25] desc_1.4.3          rprojroot_2.0.4     bslib_0.8.0        
#> [28] rlang_1.1.4         sp_2.1-4            cachem_1.1.0       
#> [31] xfun_0.47           fs_1.6.4            sass_0.4.9         
#> [34] cli_3.6.3           progressr_0.14.0    pkgdown_2.1.0      
#> [37] withr_3.0.1         rmdformats_1.0.4    digest_0.6.37      
#> [40] grid_4.4.1          rappdirs_0.3.3      spam_2.10-0        
#> [43] lifecycle_1.0.4     evaluate_0.24.0     listenv_0.9.1      
#> [46] codetools_0.2-20    ragg_1.3.2          parallelly_1.38.0  
#> [49] rmarkdown_2.28      tools_4.4.1         htmltools_0.5.8.1

2024-08-28