Skip to content

Commit

Permalink
Merge pull request #741 from theislab/feature/new_data
Browse files Browse the repository at this point in the history
add pancreas dataset
  • Loading branch information
selmanozleyen authored Sep 12, 2024
2 parents f0a2bfe + bf2951d commit b516bd9
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 39 deletions.
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
"anndata": ("https://anndata.readthedocs.io/en/latest/", None),
"scanpy": ("https://scanpy.readthedocs.io/en/latest/", None),
"squidpy": ("https://squidpy.readthedocs.io/en/latest/", None),
"mudata": ("https://mudata.readthedocs.io/en/latest/", None),
}
master_doc = "index"
pygments_style = "tango"
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ dependencies = [
"ott-jax[neural]>=0.4.6",
"cloudpickle>=2.2.0",
"rich>=13.5",
"docstring_inheritance>=2.0.0"
"docstring_inheritance>=2.0.0",
"mudata>=0.2.2"
]

[project.optional-dependencies]
Expand Down
126 changes: 88 additions & 38 deletions src/moscot/datasets.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
import contextlib
import os
import pathlib
import pickle
import shutil
import tempfile
import urllib.request
from itertools import combinations
from types import MappingProxyType
from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple
from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Union

import mudata as mu

import networkx as nx
import numpy as np
import pandas as pd
from scipy.linalg import block_diag

import anndata as ad
from anndata import AnnData
from scanpy import read
from scanpy.readwrite import _check_datafile_present_and_download

from moscot._types import PathLike

Expand All @@ -36,7 +33,7 @@ def mosta(
path: PathLike = "~/.cache/moscot/mosta.h5ad",
force_download: bool = False,
**kwargs: Any,
) -> AnnData: # pragma: no cover
) -> ad.AnnData: # pragma: no cover
"""Preprocessed and extracted data as provided in :cite:`chen:22`.
Includes embryo sections `E9.5`, `E2S1`, `E10.5`, `E2S1`, `E11.5`, `E1S2`.
Expand All @@ -59,6 +56,7 @@ def mosta(
"""
return _load_dataset_from_url(
path,
file_type="h5ad",
backup_url="https://figshare.com/ndownloader/files/40569779",
expected_shape=(54134, 2000),
force_download=force_download,
Expand All @@ -70,7 +68,7 @@ def hspc(
path: PathLike = "~/.cache/moscot/hspc.h5ad",
force_download: bool = False,
**kwargs: Any,
) -> AnnData: # pragma: no cover
) -> ad.AnnData: # pragma: no cover
"""CD34+ hematopoietic stem and progenitor cells from 4 healthy human donors.
From the `NeurIPS Multimodal Single-Cell Integration Challenge
Expand All @@ -95,6 +93,7 @@ def hspc(
"""
dataset = _load_dataset_from_url(
path,
file_type="h5ad",
backup_url="https://figshare.com/ndownloader/files/37993503",
expected_shape=(4000, 2000),
force_download=force_download,
Expand All @@ -111,7 +110,7 @@ def drosophila(
spatial: bool,
force_download: bool = False,
**kwargs: Any,
) -> AnnData:
) -> ad.AnnData:
"""Embryo of Drosophila melanogaster described in :cite:`Li-spatial:22`.
Minimal pre-processing was performed, such as gene and cell filtering, as well as normalization.
Expand All @@ -135,6 +134,7 @@ def drosophila(
if spatial:
return _load_dataset_from_url(
path + "_sp.h5ad",
file_type="h5ad",
backup_url="https://figshare.com/ndownloader/files/37984935",
expected_shape=(3039, 82),
force_download=force_download,
Expand All @@ -143,6 +143,7 @@ def drosophila(

return _load_dataset_from_url(
path + "_sc.h5ad",
file_type="h5ad",
backup_url="https://figshare.com/ndownloader/files/37984938",
expected_shape=(1297, 2000),
force_download=force_download,
Expand All @@ -154,7 +155,7 @@ def c_elegans(
path: PathLike = "~/.cache/moscot/c_elegans.h5ad",
force_download: bool = False,
**kwargs: Any,
) -> Tuple[AnnData, nx.DiGraph]: # pragma: no cover
) -> Tuple[ad.AnnData, nx.DiGraph]: # pragma: no cover
"""scRNA-seq time-series dataset of C.elegans embryogenesis :cite:`packer:19`.
Contains raw counts of 46,151 cells with at least partial lineage information.
Expand All @@ -175,6 +176,7 @@ def c_elegans(
"""
adata = _load_dataset_from_url(
path,
file_type="h5ad",
backup_url="https://figshare.com/ndownloader/files/39943585",
expected_shape=(46151, 20222),
force_download=force_download,
Expand All @@ -191,7 +193,7 @@ def zebrafish(
path: PathLike = "~/.cache/moscot/zebrafish.h5ad",
force_download: bool = False,
**kwargs: Any,
) -> Tuple[AnnData, Dict[str, nx.DiGraph]]:
) -> Tuple[ad.AnnData, Dict[str, nx.DiGraph]]:
"""Lineage-traced scRNA-seq time-series dataset of Zebrafish heart regeneration :cite:`hu:22`.
Contains gene expression vectors, LINNAEUS :cite:`spanjaard:18` reconstructed lineage trees,
Expand All @@ -212,6 +214,7 @@ def zebrafish(
"""
adata = _load_dataset_from_url(
path,
file_type="h5ad",
backup_url="https://figshare.com/ndownloader/files/39951073",
expected_shape=(44014, 31466),
force_download=force_download,
Expand All @@ -230,7 +233,7 @@ def bone_marrow(
rna: bool,
force_download: bool = False,
**kwargs: Any,
) -> AnnData:
) -> ad.AnnData:
"""Multiome data of bone marrow measurements :cite:`luecken:21`.
Contains processed counts of 6,224 cells. The RNA data was filtered to 2,000 top
Expand All @@ -256,25 +259,72 @@ def bone_marrow(
if rna:
return _load_dataset_from_url(
path + "_rna.h5ad",
file_type="h5ad",
backup_url="https://figshare.com/ndownloader/files/40195114",
expected_shape=(6224, 2000),
force_download=force_download,
**kwargs,
)
return _load_dataset_from_url(
path + "_atac.h5ad",
file_type="h5ad",
backup_url="https://figshare.com/ndownloader/files/41013551",
expected_shape=(6224, 8000),
force_download=force_download,
**kwargs,
)


def pancreas_multiome(
    rna_only: bool,
    path: PathLike = "~/.cache/moscot/pancreas_multiome.h5mu",
    force_download: bool = True,
    **kwargs: Any,
) -> Union[mu.MuData, ad.AnnData]:  # pragma: no cover
    """Pancreatic endocrinogenesis dataset published with the moscot manuscript :cite:`Klein:23`.

    The dataset contains paired scRNA-seq and scATAC-seq data of pancreatic cells at embryonic days 14.5, 15.5, and
    16.5. The data was preprocessed and filtered as described in the manuscript, the raw data and the full processed
    data are available via GEO accession code GSE275562.

    Parameters
    ----------
    rna_only
        Only load the RNA data, resulting in a smaller file.
    path
        Path where to save the file.
    force_download
        Whether to force-download the data. Unlike the other dataset loaders visible in this module,
        this defaults to :obj:`True`.
    kwargs
        Keyword arguments forwarded to the dataset loader when `rna_only` is set. They are not forwarded
        when the full multiome object is requested.

    Returns
    -------
    :class:`anndata.AnnData` with RNA only if `rna_only`, else :class:`mudata.MuData` object with RNA and ATAC data.
    """
    if rna_only:
        # RNA-only variant: requested as an ``h5ad`` file, expected to be 22604 x 20242.
        return _load_dataset_from_url(
            path,
            file_type="h5ad",
            backup_url="https://figshare.com/ndownloader/files/48785320",
            expected_shape=(22604, 20242),
            force_download=force_download,
            **kwargs,
        )
    # Full multiome variant: requested as an ``h5mu`` file, expected to be 22604 x 271918.
    # ``kwargs`` is intentionally not passed on in this branch.
    return _load_dataset_from_url(
        path,
        file_type="h5mu",
        backup_url="https://figshare.com/ndownloader/files/48782332",
        expected_shape=(22604, 271918),
        force_download=force_download,
    )


def tedsim(
path: PathLike = "~/.cache/moscot/tedsim.h5ad",
force_download: bool = False,
**kwargs: Any,
) -> AnnData: # pragma: no cover
) -> ad.AnnData: # pragma: no cover
"""Dataset simulated with TedSim :cite:`pan:22`.
Simulated scRNA-seq dataset of a differentiation trajectory. For each cell, the dataset includes a (raw counts)
Expand Down Expand Up @@ -302,6 +352,7 @@ def tedsim(
"""
return _load_dataset_from_url(
path,
file_type="h5ad",
backup_url="https://figshare.com/ndownloader/files/40178644",
expected_shape=(8448, 500),
force_download=force_download,
Expand All @@ -313,7 +364,7 @@ def sciplex(
path: PathLike = "~/.cache/moscot/sciplex.h5ad",
force_download: bool = False,
**kwargs: Any,
) -> AnnData: # pragma: no cover
) -> ad.AnnData: # pragma: no cover
"""Perturbation dataset published in :cite:`srivatsan:20`.
Transcriptomes of A549, K562, and mCF7 cells exposed to 188 compounds.
Expand All @@ -334,6 +385,7 @@ def sciplex(
"""
return _load_dataset_from_url(
path,
file_type="h5ad",
backup_url="https://figshare.com/ndownloader/files/43381398",
expected_shape=(799317, 110984),
force_download=force_download,
Expand All @@ -345,7 +397,7 @@ def sim_align(
path: PathLike = "~/.cache/moscot/sim_align.h5ad",
force_download: bool = False,
**kwargs: Any,
) -> AnnData: # pragma: no cover
) -> ad.AnnData: # pragma: no cover
"""Spatial transcriptomics simulated dataset as described in :cite:`Jones-spatial:22`.
Parameters
Expand All @@ -363,6 +415,7 @@ def sim_align(
"""
return _load_dataset_from_url(
path,
file_type="h5ad",
backup_url="https://figshare.com/ndownloader/files/37984926",
expected_shape=(1200, 500),
force_download=force_download,
Expand All @@ -383,7 +436,7 @@ def simulate_data(
lin_cost_matrix: Optional[str] = None,
quad_cost_matrix: Optional[str] = None,
**kwargs: Any,
) -> AnnData:
) -> ad.AnnData:
"""Simulate data.
This function is used to generate data, mainly for the purpose of
Expand Down Expand Up @@ -424,7 +477,7 @@ def simulate_data(
"""
rng = np.random.RandomState(seed)
adatas = [
AnnData(
ad.AnnData(
X=rng.multivariate_normal(
mean=kwargs.pop("mean", np.arange(n_genes)),
cov=kwargs.pop("cov", var * np.diag(np.ones(n_genes))),
Expand Down Expand Up @@ -477,32 +530,29 @@ def simulate_data(

def _load_dataset_from_url(
    fpath: PathLike,
    file_type: Literal["h5ad", "h5mu"],
    *,
    backup_url: str,
    expected_shape: Tuple[int, int],
    force_download: bool = False,
    sparse: bool = True,
    cache: bool = True,
    **kwargs: Any,
) -> Union[ad.AnnData, mu.MuData]:
    """Load a dataset from a local file, downloading it from ``backup_url`` if needed.

    Parameters
    ----------
    fpath
        Path of the local file. ``~`` is expanded and ``.{file_type}`` is appended
        when the path does not already end with it.
    file_type
        Either ``'h5ad'`` (read with :func:`anndata.read_h5ad`) or ``'h5mu'``
        (read with :func:`mudata.read_h5mu`).
    backup_url
        URL handed to the download helper together with the resolved path.
    expected_shape
        Shape the loaded object must have.
    force_download
        If set, any existing file at the resolved path is removed before the
        download helper is invoked.
    sparse
        Accepted for backward compatibility; not used by this implementation.
    cache
        Accepted for backward compatibility; not used by this implementation.
    kwargs
        Keyword arguments for :func:`anndata.read_h5ad`. Ignored for ``'h5mu'`` files.

    Returns
    -------
    The loaded :class:`anndata.AnnData` or :class:`mudata.MuData` object.

    Raises
    ------
    ValueError
        If ``file_type`` is not supported or the loaded object does not have ``expected_shape``.
    FileNotFoundError
        If the download helper reports that the file is neither present nor downloadable.
    """
    # TODO: make nicer once https://github.com/scverse/mudata/issues/76 resolved
    # Validate with an explicit exception: an ``assert`` would be stripped under ``python -O``.
    if file_type not in ("h5ad", "h5mu"):
        raise ValueError(f"Invalid type `{file_type}`. Must be one of `['h5ad', 'h5mu']`.")

    fpath = os.path.expanduser(fpath)
    # Compare against the dotted extension so that a name merely ending in the
    # letters of the file type (without the dot) still gets a proper suffix.
    suffix = f".{file_type}"
    if not fpath.endswith(suffix):
        fpath += suffix

    if force_download:
        # EAFP: remove the stale copy if there is one; a missing file is not an error.
        try:
            os.remove(fpath)
        except FileNotFoundError:
            pass

    if not _check_datafile_present_and_download(backup_url=backup_url, path=fpath):
        raise FileNotFoundError(f"File `{fpath}` not found or download failed.")

    if file_type == "h5ad":
        data = ad.read_h5ad(filename=fpath, **kwargs)
    else:
        # ``kwargs`` is not forwarded here; the file is read fully into memory.
        data = mu.read_h5mu(filename=fpath, backed=False)

    if data.shape != expected_shape:
        data_str = "MuData" if file_type == "h5mu" else "AnnData"
        raise ValueError(f"Expected {data_str} object to have shape `{expected_shape}`, found `{data.shape}`.")

    return data


def _get_random_trees(
Expand Down

0 comments on commit b516bd9

Please sign in to comment.