Manage scRNA-seq datasets#
This illustrates how to manage scRNA-seq datasets in absence of a custom schema.
Show code cell content
!lamin init --storage ./test-scrna --schema bionty
import lamindb as ln
import lnschema_bionty as lb
ln.settings.verbosity = 3 # show hints
lb.settings.auto_save_parents = (
False # don't recurse through ontology hierarchies to speed up CI
)
ln.track()
Preparation: registries#
Let’s assume that this is not the first time we work with experimental entities, and hence, our registries are already pre-populated:
Show code cell content
# assume prepared registries
# strain
lb.ExperimentalFactor.from_bionty(ontology_id="EFO:0004472").save()
record = lb.ExperimentalFactor.filter(ontology_id="EFO:0004472").one()
record.add_synonym("C57BL/6N")
# developmental stage
lb.ExperimentalFactor.from_bionty(ontology_id="EFO:0001272").save()
# tissue
lb.Tissue.from_bionty(ontology_id="UBERON:0001542").save()
# cell types
ln.save(lb.CellType.from_values(["CL:0000115", "CL:0000738"], "ontology_id"))
ln.view(schema="bionty", orms=["CellType", "ExperimentalFactor", "Tissue"])
Mouse lymph node cells: Detmar22#
We’re working with mouse data:
lb.settings.species = "mouse"
Let’s look at a scRNA-seq count matrix in form of an AnnData
object:
Show code cell source
adata = ln.dev.datasets.anndata_mouse_sc_lymph_node()
# The column names are a bit lengthy, let's abbreviate them:
adata.obs.columns = (
adata.obs.columns.str.replace("Sample Characteristic", "")
.str.replace("Factor Value ", "Factor Value:", regex=True)
.str.replace("Factor Value\[", "Factor Value:", regex=True)
.str.replace(" Ontology Term\[", "ontology_id:", regex=True)
.str.strip("[]")
.str.replace("organism part", "tissue")
.str.replace("organism", "species")
.str.replace("developmental stage", "developmental_stage")
.str.replace("cell type", "cell_type")
# the last one could be interesting, too
# .str.replace("Factor Value:Ontology Term[inferred cell_type - authors labels", "cell_type_authors")
)
adata
When we create a File
object from an AnnData
, we’ll automatically link its feature sets and get information about unmapped categories:
file = ln.File.from_anndata(
adata, description="Detmar22", var_ref=lb.Gene.ensembl_gene_id
)
We’re seeing that several gene identifiers can’t be validated through Bionty. We’d like to validate all features in this dataset, hence, let’s register them:
ln.save(lb.Gene.from_values(adata.var.index, lb.Gene.ensembl_gene_id))
Similarly, for the metadata, we’d like to validate the names:
columns = [
col
for col in adata.obs.columns
if not col.startswith("ontology_id") and not col.startswith("Factor Value")
]
ln.save(ln.Feature.from_df(adata.obs[columns]))
file = ln.File.from_anndata(
adata, description="Detmar22", var_ref=lb.Gene.ensembl_gene_id
)
file.save()
The file now has two linked feature sets:
file.features
Some of the metadata can be typed using dedicated registries:
species = lb.Species.from_bionty(name="mouse")
strains = lb.ExperimentalFactor.from_values(adata.obs["strain"], "name")
dev_stages = lb.ExperimentalFactor.from_values(adata.obs["developmental_stage"], "name")
cell_types = lb.CellType.from_values(adata.obs["cell_type"], "name")
tissues = lb.Tissue.from_values(adata.obs["tissue"], "name")
We did not validate strains, hence, let’s try to map synonyms:
lb.ExperimentalFactor.map_synonyms(adata.obs["strain"], return_mapper=True)
Indeed, there is a synonym:
adata.obs["strain"] = adata.obs["strain"].map(
lb.ExperimentalFactor.map_synonyms(adata.obs["strain"], return_mapper=True)
)
Now we can validate:
strains = lb.ExperimentalFactor.from_values(adata.obs["strain"], "name")
file.features.add_labels(species, feature="species")
file.features.add_labels(strains + dev_stages + tissues + cell_types)
Metadata that doesn’t have can’t be typed with dedicated registries:
labels = ln.Label.from_values(adata.obs["sex"])
labels += ln.Label.from_values(adata.obs["age"])
labels += ln.Label.from_values(adata.obs["genotype"])
labels += ln.Label.from_values(adata.obs["immunophenotype"])
file.features.add_labels(labels)
The file is now queryable by everything we linked:
file.describe()
Human immune cells: Conde22#
lb.settings.species = "human"
conde22 = ln.dev.datasets.anndata_human_immune_cells()
ln.save(lb.Gene.from_values(conde22.var.index, lb.Gene.ensembl_gene_id))
conde22.obs.columns = conde22.obs.columns.str.replace("donor_id", "donor")
columns = [col for col in conde22.obs.columns if "ontology_term" not in col]
ln.save(ln.Feature.from_df(conde22.obs[columns]))
file = ln.File.from_anndata(
conde22, description="Conde22", var_ref=lb.Gene.ensembl_gene_id
)
file.save()
The file has the following linked features:
file.features
Let’s now link observational metadata.
cell_types = lb.CellType.from_values(conde22.obs.cell_type, field="name")
efs = lb.ExperimentalFactor.from_values(conde22.obs.assay, field="name")
tissues = lb.Tissue.from_values(conde22.obs.tissue, field="name")
file.features.add_labels([lb.settings.species], feature="species")
file.features.add_labels(cell_types + efs + tissues)
As neither the core schema nor lnschema_bionty
have a Donor
table, we’re using Label
to track donor ids:
donors = ln.Label.from_values(conde22.obs["donor"])
file.features.add_labels(donors)
file.describe()
A less well curated dataset#
Let’s now consider a dataset with less-well curated features:
pbcm68k = ln.dev.datasets.anndata_pbmc68k_reduced()
We see that this dataset is indexed by gene symbols:
pbcm68k.var.index
Because gene symbols don’t uniquely characterize an Ensembl ID, we’re linking more feature records to this file than columns in the AnnData
.
Tip
Use Ensembl Gene IDs rather than gene Symbols to index genes.
validated = lb.Gene.inspect(pbcm68k.var.index, lb.Gene.symbol)["mapped"]
pbcm68k_validated = pbcm68k[:, validated]
Link cell types:
Show code cell content
# inspect shows none of the terms are mappable
lb.CellType.inspect(pbcm68k_validated.obs["cell_type"], "name")
# here we search the cell type names from the public ontology and grab the top match
# then add the cell type names from the pbcm68k as synonyms
celltype_bt = lb.CellType.bionty()
ontology_ids = []
mapper = {}
for ct in pbcm68k_validated.obs["cell_type"].unique():
ontology_id = celltype_bt.search(ct).iloc[0].ontology_id
record = lb.CellType.from_bionty(ontology_id=ontology_id)
mapper[ct] = record.name
record.save()
record.add_synonym(ct)
pbcm68k_validated.obs["cell_type"] = pbcm68k_validated.obs["cell_type"].map(mapper)
Now, all cell types should be validated:
lb.CellType.inspect(pbcm68k_validated.obs["cell_type"], "name")
file_pbcm68k = ln.File.from_anndata(
pbcm68k_validated, description="10x reference pbmc68k", var_ref=lb.Gene.symbol
)
file_pbcm68k.save()
cell_types = lb.CellType.from_values(pbcm68k_validated.obs["cell_type"], "name")
file_pbcm68k.features.add_labels(cell_types)
file_pbcm68k.describe()
🎉 Now let’s continue with data integration: Integrate scRNA datasets based on shared features/metadata