Source code for tdm.dataset.dataset

"""
Base class for all datasets.

A Dataset maps cell types to features & observations.
"""

from typing import Sequence, Literal
import pandas as pd
import numpy as np


[docs] class Dataset: """ Base class for all datasets. A dataset maps cell types to features & labels. Note: A dataset is typically constructed based on one of the following sources: - Tissue: Used for direct computations on tissue cells, such as counting neighbors (see: NeighborsDataset) - Dataset: Typically used for transforming features (see: PolynomialDataset) - A list of Datasets: Used for combining datasets (see: ConcatDataset) """
[docs] def __init__(self) -> None: """ Initializes the Dataset with a dictionary mapping cell type to features and obs. dataset_dict: - key: - cell_type (str) - value: - features: dataframe with shape (n_cells, n_features) - observations: dataframe with shape (n_cells, 2) holding observations. columns: division, death """ self.dataset_dict: dict[str, tuple[pd.DataFrame, pd.DataFrame]] = self._init_dataset_dict()
def _init_dataset_dict(self) -> dict[str, tuple[pd.DataFrame, pd.DataFrame]]: """ Returns a dataset dictionary mapping cell type to features and obs: - key: cell_type (str) - value: - features: dataframe with shape (n_cells, n_features) - observations: dataframe with shape (n_cells, 2) holding observations. columns: division, death """ raise NotImplementedError
[docs] def fetch(self, cell_type: str) -> tuple[pd.DataFrame, pd.DataFrame]: """ Returns the features and observations associated with a cell type. Parameters: cell_type: a str from tdm.tissue.cell_types.CELL_TYPES_ARRAY Returns: features, observations (tuple): - features: dataframe with shape (n_cells, n_features) - observations: dataframe with shape (n_cells, 2) holding observations. columns: division, death """ return self.dataset_dict[cell_type]
[docs] def set_dataset(self, cell_type: str, features: pd.DataFrame, obs: pd.DataFrame): """Manually write the features and obs for a cell type. Args: cell_type (str): string identifier of a cell type. features (pd.DataFrame): dataframe with shape (n_cells, n_features) obs (pd.DataFrame): dataframe with shape (n_cells, 2) holding observations. columns: division, death """ self.dataset_dict[cell_type] = features, obs
[docs] def cell_types(self) -> list[str]: """ Returns the cell types in the dataset. See: tdm.tissue.cell_types.CELL_TYPES_ARRAY for possible values. """ return list(self.dataset_dict.keys())
[docs] def fetch_all(self) -> tuple[pd.DataFrame, pd.DataFrame]: """ Returns features and observations from all cell types, concatenated. Returns: features, observations (tuple): - features: dataframe with shape (n_cells, n_features) - observations: dataframe with shape (n_cells, 2) holding observations. columns: division, death """ features_and_obs = [self.fetch(c) for c in self.cell_types()] features = pd.concat([f for f, o in features_and_obs]) obs = pd.concat([o for f, o in features_and_obs]) return features, obs
[docs] def n_cells(self, cell_type: str | None = None) -> int: """ Returns the number of cells of cell_type in the dataset, or all cell types if cell_type = None """ if cell_type is None: return sum([self.dataset_dict[c][0].shape[0] for c in self.dataset_dict.keys()]) else: return self.dataset_dict[cell_type][0].shape[0]
[docs] def n_obs(self, cell_type: str, obs: Literal["division", "death"]) -> int: """ Returns the number of division or death events """ return self.fetch(cell_type)[1][obs].sum()
[docs] def construct_features_from_counts( self, cell_counts: dict[str, float | Sequence[float]], target_cell: str, **kwargs ) -> pd.DataFrame: """ Constructs features compatible with construct_polynomial_features Input is in raw values! """ raise NotImplementedError
[docs] def n_features(self) -> int: """ Returns the number of features in the dataset. warning: Fails if there are different numbers of features for different cell types """ n_features = [self.fetch(c)[0].shape[1] for c in self.cell_types()] assert len(np.unique(n_features) == 1) return n_features[0]