Skip to contents

Introduction

This vignette reproduces the LaminDB Introduction guide. It contains the equivalent {laminr} code; for the accompanying explanations, follow the link given at the start of each section. This vignette requires the bionty Python package to be available.

Walkthrough

See https://docs.lamin.ai/guide#walkthrough.

# Create a temporary LaminDB instance named "laminr-intro" with the bionty
# module enabled
lamin_init_temp(name = "laminr-intro", modules = c("bionty"))

NOTE: We have used lamin_init_temp() to create a temporary instance for this vignette, but in most cases you should use the regular lamin_init().

Transforms

See https://docs.lamin.ai/guide#transforms.

# Import the lamindb Python module; the message below reports the instance it
# connects to
ln <- import_module("lamindb")
#> → connected lamindb: anonymous/laminr-intro-20250310172937
# Start tracking this vignette: this creates a Transform and starts a new Run
ln$track()
#> → created Transform('k0fqNQTwFlA70000'), started new Run('uucVQFwb...') at 2025-03-10 17:29:53 UTC

# Show the records of the Transform registry as a data frame
ln$Transform$df()
#>                uid              key      description     type source_code hash
#> 1 k0fqNQTwFlA70000 introduction.Rmd introduction.Rmd notebook        <NA> <NA>
#>   reference reference_type space_id _template_id version is_latest
#> 1      <NA>           <NA>        1         <NA>    <NA>      TRUE
#>            created_at created_by_id _aux _branch_code
#> 1 2025-03-10 17:29:53             1 <NA>            1

# Show the records of the Run registry as a data frame
ln$Run$df()
#>                    uid name          started_at finished_at reference
#> 1 uucVQFwbTRzJtomI5Hvd <NA> 2025-03-10 17:29:53        <NA>      <NA>
#>   reference_type _is_consecutive _status_code space_id transform_id report_id
#> 1           <NA>            <NA>            0        1            1      <NA>
#>   _logfile_id environment_id initiated_by_run_id          created_at
#> 1        <NA>           <NA>                <NA> 2025-03-10 17:29:53
#>   created_by_id _aux _branch_code
#> 1             1 <NA>            1

Artifacts

See https://docs.lamin.ai/guide#artifacts.

# Fetch a small example dataset as a data frame. With `with_typo = TRUE` the
# printed data contain the misspelled perturbation "IFNJ" (sample2).
df <- ln$core$datasets$small_dataset1(otype = "DataFrame", with_typo = TRUE)
df
#>         ENSG00000153563 ENSG00000010610 ENSG00000170458 perturbation
#> sample1               1               3               5         DMSO
#> sample2               2               4               6         IFNJ
#> sample3               3               5               7         DMSO
#>         sample_note cell_type_by_expert cell_type_by_model
#> sample1      was ok              B cell             B cell
#> sample2  looks naah              T cell             T cell
#> sample3  pretty! 🤩              T cell             T cell

# Create an artifact from the data frame under the given key and save it,
# then print a summary of its metadata
artifact <- ln$Artifact$from_df(df, key = "my_datasets/rnaseq1.parquet")$save()
artifact$describe()
#> Artifact .parquet/DataFrame
#> └── General
#>     ├── .uid = '0QKuoYMA4v42mpaH0000'
#>     ├── .key = 'my_datasets/rnaseq1.parquet'
#>     ├── .size = 6120
#>     ├── .hash = 'cD9NSUzJ3YzfmybCQ4Ab9w'
#>     ├── .n_observations = 3
#>     ├── .path = 
#>     │   /tmp/RtmpJ8A8HB/laminr-intro-20250310172937/.lamindb/0QKuoYMA4v42mpaH000
#>     │   0.parquet
#>     ├── .created_by = anonymous
#>     ├── .created_at = 2025-03-10 17:29:54
#>     └── .transform = 'introduction.Rmd'

# cache() returns the local path of the artifact's file (printed below)
artifact$cache()
#> [1] "/tmp/RtmpJ8A8HB/laminr-intro-20250310172937/.lamindb/0QKuoYMA4v42mpaH0000.parquet"

# Open the artifact and convert the result to a data frame for printing
dataset <- artifact$open()
as.data.frame(dataset)
#> # A data frame: 3 × 8
#>   ENSG00000153563 ENSG00000010610 ENSG00000170458 perturbation sample_note
#>             <dbl>           <dbl>           <dbl> <fct>        <chr>      
#> 1               1               3               5 DMSO         was ok     
#> 2               2               4               6 IFNJ         looks naah 
#> 3               3               5               7 DMSO         pretty! 🤩 
#> # ℹ 3 more variables: cell_type_by_expert <fct>, cell_type_by_model <fct>,
#> #   `__index_level_0__` <chr>

# Load the artifact's content; it prints as the data frame that was saved
artifact$load()
#>         ENSG00000153563 ENSG00000010610 ENSG00000170458 perturbation
#> sample1               1               3               5         DMSO
#> sample2               2               4               6         IFNJ
#> sample3               3               5               7         DMSO
#>         sample_note cell_type_by_expert cell_type_by_model
#> sample1      was ok              B cell             B cell
#> sample2  looks naah              T cell             T cell
#> sample3  pretty! 🤩              T cell             T cell

artifact$view_lineage()
#> ✖ `view_lineage()` is not yet implemented. Please view the lineage in the web interface.

# Keep a copy that still contains the "IFNJ" typo; it is used further down to
# demonstrate a failing validation
df_typo <- df
# Correct the typo in `df` itself
levels(df$perturbation) <- c("DMSO", "IFNG")
df["sample2", "perturbation"] <- "IFNG"
# Saving under the same key creates a new version of the artifact (see the
# message below)
artifact <- ln$Artifact$from_df(df, key = "my_datasets/rnaseq1.parquet")$save()
#> → creating new artifact version for key='my_datasets/rnaseq1.parquet' (storage: '/tmp/RtmpJ8A8HB/laminr-intro-20250310172937')
# List all versions of this artifact as a data frame
artifact$versions$df()
#>                    uid                         key description   suffix    kind
#> 1 0QKuoYMA4v42mpaH0000 my_datasets/rnaseq1.parquet        <NA> .parquet dataset
#> 2 0QKuoYMA4v42mpaH0001 my_datasets/rnaseq1.parquet        <NA> .parquet dataset
#>       otype size                   hash n_files n_observations _hash_type
#> 1 DataFrame 6120 cD9NSUzJ3YzfmybCQ4Ab9w    <NA>              3        md5
#> 2 DataFrame 6120 O69yLgP32m9XBvvw_7WWxg    <NA>              3        md5
#>   _key_is_virtual _overwrite_versions space_id storage_id schema_id version
#> 1            TRUE               FALSE        1          1      <NA>    <NA>
#> 2            TRUE               FALSE        1          1      <NA>    <NA>
#>   is_latest run_id          created_at created_by_id _aux _branch_code
#> 1     FALSE      1 2025-03-10 17:29:54             1 <NA>            1
#> 2      TRUE      1 2025-03-10 17:29:56             1 <NA>            1

Labels

See https://docs.lamin.ai/guide#labels.

# Import the bionty Python module
bt <- import_module("bionty")

# Create a ULabel that acts as a type, and a label belonging to that type
experiment_type <- ln$ULabel(name = "Experiment", is_type = TRUE)$save()
candidate_marker_experiment <- ln$ULabel(
  name = "Candidate marker experiment", type = experiment_type
)$save()

# Attach the label to the artifact
artifact$ulabels$add(candidate_marker_experiment)

# Create a CellType record from the bionty source and attach it as well
cell_type <- bt$CellType$from_source(name = "effector T cell")$save()
artifact$cell_types$add(cell_type)

# The summary now contains a "Labels" section
artifact$describe()
#> Artifact .parquet/DataFrame
#> ├── General
#> │   ├── .uid = '0QKuoYMA4v42mpaH0001'
#> │   ├── .key = 'my_datasets/rnaseq1.parquet'
#> │   ├── .size = 6120
#> │   ├── .hash = 'O69yLgP32m9XBvvw_7WWxg'
#> │   ├── .n_observations = 3
#> │   ├── .path = 
#> │   │   /tmp/RtmpJ8A8HB/laminr-intro-20250310172937/.lamindb/0QKuoYMA4v42mpaH000
#> │   │   1.parquet
#> │   ├── .created_by = anonymous
#> │   ├── .created_at = 2025-03-10 17:29:56
#> │   └── .transform = 'introduction.Rmd'
#> └── Labels
#>     └── .cell_types         bionty.CellType    effector T cell                  
#>         .ulabels            ULabel             Candidate marker experiment

Registries

See https://docs.lamin.ai/guide#registries.

# Show the records of the ULabel registry as a data frame
ln$ULabel$df()
#>        uid                        name is_type description reference
#> 2 nfNRAt9Z Candidate marker experiment   FALSE        <NA>      <NA>
#> 1 ISDFDrug                  Experiment    TRUE        <NA>      <NA>
#>   reference_type space_id type_id run_id          created_at created_by_id _aux
#> 2           <NA>        1       1      1 2025-03-10 17:29:56             1 <NA>
#> 1           <NA>        1     NaN      1 2025-03-10 17:29:56             1 <NA>
#>   _branch_code
#> 2            1
#> 1            1

# Printing a registry lists its simple, relational and bionty fields
ln$Artifact
#> Artifact
#>   Simple fields
#>     .uid: CharField
#>     .key: CharField
#>     .description: CharField
#>     .suffix: CharField
#>     .kind: CharField
#>     .otype: CharField
#>     .size: BigIntegerField
#>     .hash: CharField
#>     .n_files: BigIntegerField
#>     .n_observations: BigIntegerField
#>     .version: CharField
#>     .is_latest: BooleanField
#>     .created_at: DateTimeField
#>     .updated_at: DateTimeField
#>   Relational fields
#>     .space: Space
#>     .storage: Storage
#>     .run: Run
#>     .schema: Schema
#>     .created_by: User
#>     .ulabels: ULabel
#>     .input_of_runs: Run
#>     .feature_sets: Schema
#>     .collections: Collection
#>     .references: Reference
#>     .projects: Project
#>   Bionty fields
#>     .organisms: bionty.Organism
#>     .genes: bionty.Gene
#>     .proteins: bionty.Protein
#>     .cell_markers: bionty.CellMarker
#>     .tissues: bionty.Tissue
#>     .cell_types: bionty.CellType
#>     .diseases: bionty.Disease
#>     .cell_lines: bionty.CellLine
#>     .phenotypes: bionty.Phenotype
#>     .pathways: bionty.Pathway
#>     .experimental_factors: bionty.ExperimentalFactor
#>     .developmental_stages: bionty.DevelopmentalStage
#>     .ethnicities: bionty.Ethnicity
#>  signature: (*args, **kwargs)

See https://docs.lamin.ai/guide#query-search.

# Fetch the Transform record with this key
transform <- ln$Transform$get(key = "introduction.Rmd")

# Filter artifacts by key prefix and show the matches as a data frame
ln$Artifact$filter(key__startswith = "my_datasets/")$df()
#>                    uid                         key description   suffix    kind
#> 1 0QKuoYMA4v42mpaH0000 my_datasets/rnaseq1.parquet        <NA> .parquet dataset
#> 2 0QKuoYMA4v42mpaH0001 my_datasets/rnaseq1.parquet        <NA> .parquet dataset
#>       otype size                   hash n_files n_observations _hash_type
#> 1 DataFrame 6120 cD9NSUzJ3YzfmybCQ4Ab9w    <NA>              3        md5
#> 2 DataFrame 6120 O69yLgP32m9XBvvw_7WWxg    <NA>              3        md5
#>   _key_is_virtual _overwrite_versions space_id storage_id schema_id version
#> 1            TRUE               FALSE        1          1      <NA>    <NA>
#> 2            TRUE               FALSE        1          1      <NA>    <NA>
#>   is_latest run_id          created_at created_by_id _aux _branch_code
#> 1     FALSE      1 2025-03-10 17:29:54             1 <NA>            1
#> 2      TRUE      1 2025-03-10 17:29:56             1 <NA>            1

# Artifacts linked to the transform fetched above
artifacts <- ln$Artifact$filter(transform = transform)$all()

# Filter on two conditions at once: a field of the related transform and an
# attached label (this replaces the previous value of `artifacts`)
artifacts <- ln$Artifact$filter(
  transform__description__icontains = "intro",
  ulabels = candidate_marker_experiment
)$all()

# Search the Transform registry and show the hits as a data frame
ln$Transform$search("intro")$df()
#>                uid              key      description     type source_code hash
#> 1 k0fqNQTwFlA70000 introduction.Rmd introduction.Rmd notebook        <NA> <NA>
#>   reference reference_type space_id _template_id version is_latest
#> 1      <NA>           <NA>        1         <NA>    <NA>      TRUE
#>            created_at created_by_id _aux _branch_code
#> 1 2025-03-10 17:29:53             1 <NA>            1
# Create lookups for the ULabel and CellType registries
ulabels <- ln$ULabel$lookup()
cell_types <- bt$CellType$lookup()

Features

See https://docs.lamin.ai/guide#features.

# Define and save a feature named "temperature" with dtype "float"
ln$Feature(name = "temperature", dtype = "float")$save()
#> Feature(uid='Y51YJDN3oiJg', name='temperature', dtype='float', array_rank=0, array_size=0, space_id=1, created_by_id=1, run_id=1, created_at=2025-03-10 17:29:58 UTC)

# Define and save a feature whose dtype is the ULabel registry; it is stored
# as 'cat[ULabel]' (see the output below)
ln$Feature(name = "experiment", dtype = ln$ULabel)$save()
#> Feature(uid='fQ9c1yaruwJu', name='experiment', dtype='cat[ULabel]', array_rank=0, array_size=0, space_id=1, created_by_id=1, run_id=1, created_at=2025-03-10 17:29:58 UTC)

# Annotate the artifact with one value per feature, keyed by feature name
feature_values <- list(
  temperature = 21.6,
  experiment = "Candidate marker experiment"
)
artifact$features$add_values(feature_values)

# The summary now contains a "Linked features" section
artifact$describe()
#> Artifact .parquet/DataFrame
#> ├── General
#> │   ├── .uid = '0QKuoYMA4v42mpaH0001'
#> │   ├── .key = 'my_datasets/rnaseq1.parquet'
#> │   ├── .size = 6120
#> │   ├── .hash = 'O69yLgP32m9XBvvw_7WWxg'
#> │   ├── .n_observations = 3
#> │   ├── .path = 
#> │   │   /tmp/RtmpJ8A8HB/laminr-intro-20250310172937/.lamindb/0QKuoYMA4v42mpaH000
#> │   │   1.parquet
#> │   ├── .created_by = anonymous
#> │   ├── .created_at = 2025-03-10 17:29:56
#> │   └── .transform = 'introduction.Rmd'
#> ├── Linked features
#> │   └── experiment          cat[ULabel]        Candidate marker experiment      
#> │       temperature         float              21.6                             
#> └── Labels
#>     └── .cell_types         bionty.CellType    effector T cell                  
#>         .ulabels            ULabel             Candidate marker experiment

# Query artifacts by the value of the "experiment" feature
ln$Artifact$features$filter(experiment__contains = "marker experiment")$df()
#>                    uid                         key description   suffix    kind
#> 2 0QKuoYMA4v42mpaH0001 my_datasets/rnaseq1.parquet        <NA> .parquet dataset
#>       otype size                   hash n_files n_observations _hash_type
#> 2 DataFrame 6120 O69yLgP32m9XBvvw_7WWxg    <NA>              3        md5
#>   _key_is_virtual _overwrite_versions space_id storage_id schema_id version
#> 2            TRUE               FALSE        1          1      <NA>    <NA>
#>   is_latest run_id          created_at created_by_id _aux _branch_code
#> 2      TRUE      1 2025-03-10 17:29:56             1 <NA>            1

Key use cases

See https://docs.lamin.ai/guide#key-use-cases.

Understand data lineage

See https://docs.lamin.ai/guide#understand-data-lineage.

# Lineage plots are not yet implemented in {laminr} (see the messages below)
artifact$view_lineage()
#> ✖ `view_lineage()` is not yet implemented. Please view the lineage in the web interface.
transform$view_lineage()
#> ✖ `view_lineage()` is not yet implemented. Please view the lineage in the web interface.
# Example only, not run: tracking calls that wrap the code of a script or
# notebook
ln <- import_module("lamindb")
ln$track()
ln$finish()

# Command-line example (not R code):
# lamin load https://lamin.ai/laminlabs/lamindata/transform/13VINnFk89PE0004

Curate datasets

See https://docs.lamin.ai/introduction#curate-datasets.

# Create a "Perturbation" label type and save "DMSO" as a label of that type
perturbation_type <- ln$ULabel(name = "Perturbation", is_type = TRUE)$save()
ln$ULabel(name = "DMSO", type = perturbation_type)$save()
#> ULabel(uid='57q8hOBE', name='DMSO', is_type=False, space_id=1, created_by_id=1, run_id=1, type_id=3, created_at=2025-03-10 17:29:59 UTC)
# Save "IFNG" as a second label of the "Perturbation" type
ln$ULabel(name = "IFNG", type = perturbation_type)$save()
#> ULabel(uid='Q3ZqXruz', name='IFNG', is_type=False, space_id=1, created_by_id=1, run_id=1, type_id=3, created_at=2025-03-10 17:29:59 UTC)

# Python's builtins give access to Python types (e.g. `float`) for use as dtypes
py_builtins <- reticulate::import_builtins()

# One float feature per gene column
# NOTE: These have dtype=int in the original guide
gene_ids <- c("ENSG00000153563", "ENSG00000010610", "ENSG00000170458")
gene_features <- lapply(gene_ids, function(gene_id) {
  ln$Feature(name = gene_id, dtype = py_builtins$float)$save()
})

# The perturbation column is validated against the ULabel registry
perturbation_feature <- ln$Feature(
  name = "perturbation", dtype = ln$ULabel
)$save()

# Schema made of the three gene features followed by the perturbation feature
schema <- ln$Schema(
  name = "My DataFrame schema",
  features = c(gene_features, list(perturbation_feature))
)$save()

# Curate `df` against the schema and save it as an artifact.
# NOTE: as the messages below show, the data hash matches the artifact saved
# earlier, so that existing artifact is returned and its key differs from the
# key passed here.
curator <- ln$curators$DataFrameCurator(df, schema)
artifact <- curator$save_artifact(key = "my_curated_dataset.parquet")
#> ✓ "perturbation" is validated against ULabel.name
#> → returning existing artifact with same hash: Artifact(uid='0QKuoYMA4v42mpaH0001', is_latest=True, key='my_datasets/rnaseq1.parquet', suffix='.parquet', kind='dataset', otype='DataFrame', size=6120, hash='O69yLgP32m9XBvvw_7WWxg', n_observations=3, space_id=1, storage_id=1, run_id=1, created_by_id=1, created_at=2025-03-10 17:29:56 UTC); to track this artifact as an input, use: ln.Artifact.get()
#> ! key my_datasets/rnaseq1.parquet on existing artifact differs from passed key my_curated_dataset.parquet
#> ✓ 4 unique terms (57.10%) are validated for name
#> ! 3 unique terms (42.90%) are not validated for name: 'sample_note', 'cell_type_by_expert', 'cell_type_by_model'
#> ✓ loaded 4 Feature records matching name: 'ENSG00000153563', 'ENSG00000010610', 'ENSG00000170458', 'perturbation'
#> ! did not create Feature records for 3 non-validated names: 'cell_type_by_expert', 'cell_type_by_model', 'sample_note'
#> → returning existing schema with same hash: Schema(uid='A2G3AnSXsO60eoiHj6d0', name='My DataFrame schema', n=4, itype='Feature', is_type=False, hash='cMIAATRcLJccyj67Jp5Jsw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=1, created_at=2025-03-10 17:29:59 UTC)
#> ! updated otype from None to DataFrame
artifact$describe()
#> Artifact .parquet/DataFrame
#> ├── General
#> │   ├── .uid = '0QKuoYMA4v42mpaH0001'
#> │   ├── .key = 'my_datasets/rnaseq1.parquet'
#> │   ├── .size = 6120
#> │   ├── .hash = 'O69yLgP32m9XBvvw_7WWxg'
#> │   ├── .n_observations = 3
#> │   ├── .path = 
#> │   │   /tmp/RtmpJ8A8HB/laminr-intro-20250310172937/.lamindb/0QKuoYMA4v42mpaH000
#> │   │   1.parquet
#> │   ├── .created_by = anonymous
#> │   ├── .created_at = 2025-03-10 17:29:56
#> │   └── .transform = 'introduction.Rmd'
#> ├── Dataset features/.feature_sets
#> │   └── columns • 4         [Feature]                                           
#> │       perturbation        cat[ULabel]        DMSO, IFNG                       
#> │       ENSG00000153563     float                                               
#> │       ENSG00000010610     float                                               
#> │       ENSG00000170458     float                                               
#> ├── Linked features
#> │   └── experiment          cat[ULabel]        Candidate marker experiment      
#> │       temperature         float              21.6                             
#> └── Labels
#>     └── .cell_types         bionty.CellType    effector T cell                  
#>         .ulabels            ULabel             Candidate marker experiment, DMS…
# Fetch the artifact that is linked to the ULabel named "IFNG"
ln$Artifact$get(ulabels__name = "IFNG")
#> Artifact(uid='0QKuoYMA4v42mpaH0001', is_latest=True, key='my_datasets/rnaseq1.parquet', suffix='.parquet', kind='dataset', otype='DataFrame', size=6120, hash='O69yLgP32m9XBvvw_7WWxg', n_observations=3, space_id=1, storage_id=1, run_id=1, schema_id=1, created_by_id=1, created_at=2025-03-10 17:29:56 UTC)

# Validate the copy that still contains the typo. Validation raises an error;
# it is caught so that only its message is printed and the vignette continues.
curator <- ln$curators$DataFrameCurator(df_typo, schema)
tryCatch(
  curator$validate(),
  error = \(err) cat(conditionMessage(err))
)
#> • mapping "perturbation" on ULabel.name
#> !   1 term is not validated: 'IFNJ'
#>     → fix typos, remove non-existent values, or save terms via .add_new_from("perturbation")
#> lamindb.errors.ValidationError: 1 term is not validated: 'IFNJ'
#>     → fix typos, remove non-existent values, or save terms via .add_new_from("perturbation")
#> Run `reticulate::py_last_error()` for details.

Manage biological registries

See https://docs.lamin.ai/introduction#manage-biological-registries.

# Access the public CellType ontology; printing it shows its source and the
# number of terms (this replaces the `cell_types` lookup created earlier)
cell_types <- bt$CellType$public()
cell_types
#> PublicOntology
#> Entity: CellType
#> Organism: all
#> Source: cl, 2024-08-16
#> #terms: 2959
# Search the public ontology and keep the first two results
cell_types$search("gamma-delta T cell") |> head(2)
#>                                  name
#> CL:0000798         gamma-delta T cell
#> CL:4033072 cycling gamma-delta T cell
#>                                                                definition
#> CL:0000798 A T Cell That Expresses A Gamma-Delta T Cell Receptor Complex.
#> CL:4033072                       A(N) Gamma-Delta T Cell That Is Cycling.
#>                                                                                          synonyms
#> CL:0000798 gamma-delta T-cell|gamma-delta T lymphocyte|gammadelta T cell|gamma-delta T-lymphocyte
#> CL:4033072                                                       proliferating gamma-delta T cell
#>                           parents
#> CL:0000798             CL:0000084
#> CL:4033072 CL:4033069, CL:0000798

# Schema for the variables: identified by Ensembl gene ID, with float dtype
var_schema <- ln$Schema(
  name = "my_var_schema",
  itype = bt$Gene$ensembl_gene_id,
  dtype = py_builtins$float
)$save()
# Schema for the observations: a single "perturbation" feature. A feature with
# this name already exists, so the existing record is returned (see the
# message below).
obs_schema <- ln$Schema(
  name = "my_obs_schema",
  features = list(
    ln$Feature(name = "perturbation", dtype = ln$ULabel)$save()
  )
)$save()
#> → returning existing Feature record with same name: 'perturbation'
# Composite schema for AnnData objects, built from the "obs" and "var" schemas
anndata_schema <- ln$Schema(
  name = "my_anndata_schema",
  otype = "AnnData",
  components = list("obs" = obs_schema, "var" = var_schema)
)$save()

library(anndata)
# Build an AnnData object: the three gene columns form the main data and the
# perturbation column (kept as a one-column data frame) forms `obs`
adata <- AnnData(
  df[c("ENSG00000153563", "ENSG00000010610", "ENSG00000170458")],
  obs = df[, "perturbation", drop = FALSE]
)
# Create a curator for the AnnData object using the composite schema
curator <- ln$curators$AnnDataCurator(adata, anndata_schema)
#> • saving validated records of 'columns'
#> ✓ added 3 records from public with Gene.ensembl_gene_id for "columns": 'ENSG00000170458', 'ENSG00000153563', 'ENSG00000010610'
# Save the curated AnnData object as an artifact with a description and no key
artifact <- curator$save_artifact(description = "my RNA-seq")
#> ✓ "perturbation" is validated against ULabel.name
#> • path content will be copied to default storage upon `save()` with key `None` ('.lamindb/7Ci5odXeEN4yuGPW0000.h5ad')
#> ✓ storing artifact '7Ci5odXeEN4yuGPW0000' at '/tmp/RtmpJ8A8HB/laminr-intro-20250310172937/.lamindb/7Ci5odXeEN4yuGPW0000.h5ad'
#> • parsing feature names of X stored in slot 'var'
#> ✓    3 unique terms (100.00%) are validated for ensembl_gene_id
#> ✓    linked: Schema(uid='8JKccgcJ5p9uSew4vYPb', n=3, dtype='float', itype='bionty.Gene', is_type=False, hash='f2UVeHefaZxXFjmUwo9Ozw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=1, created_at=<django.db.models.expressions.DatabaseDefault object at 0x7fe5e87e72c0>)
#> • parsing feature names of slot 'obs'
#> ✓    1 unique term (100.00%) is validated for name
#> →    returning existing schema with same hash: Schema(uid='oZW5uuU1TIWfXXKNz6hB', name='my_obs_schema', n=1, itype='Feature', is_type=False, hash='nHGWVy7t0lz0LQS6lpcZnA', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=1, created_at=2025-03-10 17:30:00 UTC)
#> !    updated otype from None to DataFrame
#> ✓    linked: Schema(uid='oZW5uuU1TIWfXXKNz6hB', name='my_obs_schema', n=1, itype='Feature', is_type=False, otype='DataFrame', hash='nHGWVy7t0lz0LQS6lpcZnA', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=1, created_at=2025-03-10 17:30:00 UTC)
#> ✓ saved 1 feature set for slot: 'var'
artifact$describe()
#> Artifact .h5ad/AnnData
#> ├── General
#> │   ├── .uid = '7Ci5odXeEN4yuGPW0000'
#> │   ├── .size = 19240
#> │   ├── .hash = 'gO44MDqttaaKNyBLVM-zzA'
#> │   ├── .n_observations = 3
#> │   ├── .path = 
#> │   │   /tmp/RtmpJ8A8HB/laminr-intro-20250310172937/.lamindb/7Ci5odXeEN4yuGPW000
#> │   │   0.h5ad
#> │   ├── .created_by = anonymous
#> │   ├── .created_at = 2025-03-10 17:30:02
#> │   └── .transform = 'introduction.Rmd'
#> ├── Dataset features/.feature_sets
#> │   ├── var • 3             [bionty.Gene]                                       
#> │   │   CD14                float                                               
#> │   │   CD8A                float                                               
#> │   │   CD4                 float                                               
#> │   └── obs • 1             [Feature]                                           
#> │       perturbation        cat[ULabel]        DMSO, IFNG                       
#> └── Labels
#>     └── .ulabels            ULabel             DMSO, IFNG

# Look up human genes, find the schemas that contain CD8A, and list the
# artifacts linked to any of those schemas.
# The records behind an artifact's `feature_sets` relation are `Schema` records
# (see the Artifact field listing), so they are queried via `ln$Schema`, the
# same registry used to create schemas elsewhere in this vignette.
genes <- bt$Gene$filter(organism__name = "human")$lookup()
feature_sets <- ln$Schema$filter(genes = genes$cd8a)$all()
ln$Artifact$filter(feature_sets__in = feature_sets)$df()
#>                    uid  key description suffix    kind   otype  size
#> 3 7Ci5odXeEN4yuGPW0000 <NA>  my RNA-seq  .h5ad dataset AnnData 19240
#>                     hash n_files n_observations _hash_type _key_is_virtual
#> 3 gO44MDqttaaKNyBLVM-zzA    <NA>              3        md5            TRUE
#>   _overwrite_versions space_id storage_id schema_id version is_latest run_id
#> 3               FALSE        1          1         4    <NA>      TRUE      1
#>            created_at created_by_id _aux _branch_code
#> 3 2025-03-10 17:30:02             1 <NA>            1

# Create the "neuron" cell type from the bionty source; the messages below show
# that three further CellType records are created alongside it
neuron <- bt$CellType$from_source(name = "neuron")$save()
#> ✓ created 1 CellType record from Bionty matching name: 'neuron'
#> ✓ created 3 CellType records from Bionty matching ontology_id: 'CL:0002319', 'CL:0000404', 'CL:0000393'
# Register a custom cell type record and add "neuron" as its parent
new_cell_state <- bt$CellType(
  name = "my neuron cell state", description = "explains X"
)$save()
new_cell_state$parents$add(neuron)
# View the parents of the new record (presumably up to two levels away)
new_cell_state$view_parents(distance = 2)

Scale learning

See https://docs.lamin.ai/introduction#scale-learning.

# Fetch a second example dataset and wrap it in an AnnData object in the same
# way as the first one (note the different third gene column)
df2 <- ln$core$datasets$small_dataset2(otype = "DataFrame")
adata <- AnnData(
  df2[c("ENSG00000153563", "ENSG00000010610", "ENSG00000004468")],
  obs = df2[, "perturbation", drop = FALSE]
)
# Reuse the AnnData schema defined earlier for curation
curator <- ln$curators$AnnDataCurator(adata, anndata_schema)
#> • saving validated records of 'columns'
#> ✓ added 1 record from public with Gene.ensembl_gene_id for "columns": 'ENSG00000004468'
# Save the second curated dataset as an artifact under an explicit key
artifact2 <- curator$save_artifact(key = "my_datasets/my_rnaseq2.h5ad")
#> ✓ "perturbation" is validated against ULabel.name
#> • path content will be copied to default storage upon `save()` with key 'my_datasets/my_rnaseq2.h5ad'
#> ✓ storing artifact 'j3mcBo5sbvXaNHAK0000' at '/tmp/RtmpJ8A8HB/laminr-intro-20250310172937/.lamindb/j3mcBo5sbvXaNHAK0000.h5ad'
#> • parsing feature names of X stored in slot 'var'
#> ✓    3 unique terms (100.00%) are validated for ensembl_gene_id
#> ✓    linked: Schema(uid='3Zt0X3MCIbPw7mioNAhm', n=3, dtype='float', itype='bionty.Gene', is_type=False, hash='QW2rHuIo5-eGNZbRxHMDCw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=1, created_at=<django.db.models.expressions.DatabaseDefault object at 0x7fe5e88c6b40>)
#> • parsing feature names of slot 'obs'
#> ✓    1 unique term (100.00%) is validated for name
#> →    returning existing schema with same hash: Schema(uid='oZW5uuU1TIWfXXKNz6hB', name='my_obs_schema', n=1, itype='Feature', is_type=False, hash='nHGWVy7t0lz0LQS6lpcZnA', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=1, created_at=2025-03-10 17:30:00 UTC)
#> !    updated otype from None to DataFrame
#> ✓    linked: Schema(uid='oZW5uuU1TIWfXXKNz6hB', name='my_obs_schema', n=1, itype='Feature', is_type=False, otype='DataFrame', hash='nHGWVy7t0lz0LQS6lpcZnA', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=1, created_at=2025-03-10 17:30:00 UTC)
#> ✓ saved 1 feature set for slot: 'var'

# Group both AnnData artifacts into a collection, save it and print a summary
collection <- ln$Collection(
  list(artifact, artifact2),
  key = "my-RNA-seq-collection"
)$save()
collection$describe()
#> Collection 
#> └── General
#>     ├── .uid = 'vL9IW55WPb9VoGOz0000'
#>     ├── .key = 'my-RNA-seq-collection'
#>     ├── .hash = 'DbgO9hDdS-KOydwDZDLp3g'
#>     ├── .created_by = anonymous
#>     ├── .created_at = 2025-03-10 17:30:05
#>     └── .transform = 'introduction.Rmd'
collection$view_lineage()
#> ✖ `view_lineage()` is not yet implemented. Please view the lineage in the web interface.

# Load the collection; the result is a single AnnData object with 6
# observations and an added 'artifact_uid' column in `obs`
collection$load()
#> AnnData object with n_obs × n_vars = 6 × 4
#>     obs: 'perturbation', 'artifact_uid'
# The artifacts in the collection, returned as a query set
collection$artifacts$all()
#> <QuerySet [Artifact(uid='7Ci5odXeEN4yuGPW0000', is_latest=True, description='my RNA-seq', suffix='.h5ad', kind='dataset', otype='AnnData', size=19240, hash='gO44MDqttaaKNyBLVM-zzA', n_observations=3, space_id=1, storage_id=1, run_id=1, schema_id=4, created_by_id=1, created_at=2025-03-10 17:30:02 UTC), Artifact(uid='j3mcBo5sbvXaNHAK0000', is_latest=True, key='my_datasets/my_rnaseq2.h5ad', suffix='.h5ad', kind='dataset', otype='AnnData', size=19240, hash='Ti7bcnIOlk0fIt2TFW_VAw', n_observations=3, space_id=1, storage_id=1, run_id=1, schema_id=4, created_by_id=1, created_at=2025-03-10 17:30:05 UTC)]>
# The same artifacts as a data frame
collection$artifacts$df()
#>                    uid                         key description suffix    kind
#> 3 7Ci5odXeEN4yuGPW0000                        <NA>  my RNA-seq  .h5ad dataset
#> 4 j3mcBo5sbvXaNHAK0000 my_datasets/my_rnaseq2.h5ad        <NA>  .h5ad dataset
#>     otype  size                   hash n_files n_observations _hash_type
#> 3 AnnData 19240 gO44MDqttaaKNyBLVM-zzA    <NA>              3        md5
#> 4 AnnData 19240 Ti7bcnIOlk0fIt2TFW_VAw    <NA>              3        md5
#>   _key_is_virtual _overwrite_versions space_id storage_id schema_id version
#> 3            TRUE               FALSE        1          1         4    <NA>
#> 4            TRUE               FALSE        1          1         4    <NA>
#>   is_latest run_id          created_at created_by_id _aux _branch_code
#> 3      TRUE      1 2025-03-10 17:30:02             1 <NA>            1
#> 4      TRUE      1 2025-03-10 17:30:05             1 <NA>            1
# This example might be beyond the scope of {laminr}: it is Python code using
# PyTorch, not R
from torch.utils.data import DataLoader, WeightedRandomSampler
# Map the collection into a dataset that exposes the "perturbation" obs key
dataset = collection.mapped(obs_keys=["perturbation"])
# Draw len(dataset) samples using the dataset's label weights for "perturbation"
sampler = WeightedRandomSampler(
    weights=dataset.get_label_weights("perturbation"), num_samples=len(dataset)
)
data_loader = DataLoader(dataset, batch_size=2, sampler=sampler)
# Iterate over every batch once; the loop body intentionally does nothing
for batch in data_loader:
    pass

Design

See https://docs.lamin.ai/introduction#design.

Influences

See https://docs.lamin.ai/introduction#influences.

# Finish the run that was started by ln$track() at the top of the vignette
ln$finish()
#> ! no html report found; to attach one, create an .html export for your .Rmd file and then run: lamin save introduction.Rmd
#> → finished Run('uucVQFwb') after 12s at 2025-03-10 17:30:06 UTC
#> ! calling anonymously, will miss private instances