# Source code for tdm.preprocess.single_cell_df

"""
Validations of the main single-cell dataframe provided to the Analysis object.
"""

import pandas as pd
import warnings
from anndata import AnnData
from pandas.api.types import is_float_dtype

from tdm.utils import cprint, verbosity


X_COL = "x"
Y_COL = "y"
DIVISION_COL = "division"
CELL_TYPE_COL = "cell_type"
IMG_ID_COL = "img_id"
SUBJECT_ID_COL = "subject_id"

REQUIRED_COLUMNS = [X_COL, Y_COL, DIVISION_COL, CELL_TYPE_COL, IMG_ID_COL, SUBJECT_ID_COL]

KI67_COL = "ki67"


def _check_coordinate_column(single_cell_df: pd.DataFrame, col: str) -> bool:
    """
    Validate one spatial coordinate column (``x`` or ``y``) and print hints.

    Args:
        single_cell_df (pd.DataFrame): the single cell dataframe being validated.
        col (str): name of the coordinate column to check.

    Returns:
        bool: False if the column is missing or is not of a float dtype, True otherwise.
        The unit / origin checks only print warnings and never affect the result.
    """
    if col not in single_cell_df.columns:
        cprint(
            f"[ERROR] Missing {col} column in the dataframe."
            """
            To rename the current x,y columns try:
            >>> from tdm.preprocess.single_cell_df import X_COL, Y_COL
            >>> df = df.rename(columns={"current x column name": X_COL, "current y column name": Y_COL})
            """,
            color="red",
        )
        return False

    cprint(f"[SUCCESS] Found {col} column", color="green")
    values = single_cell_df[col]
    valid = True

    # check correct dtype:
    if not is_float_dtype(values):
        cprint(f"[ERROR] {col} column is not of type float.", color="red")
        valid = False

    # The range checks below need real numbers. Skip them for other dtypes (e.g. strings,
    # where subtraction raises TypeError) and for columns without any non-null value, so
    # that the validator reports the problem instead of crashing.
    is_real_number = is_float_dtype(values) or pd.api.types.is_integer_dtype(values)
    if is_real_number and values.notna().any():
        # Series.min / Series.max are vectorised and skip NaN, unlike the builtins.
        lowest, highest = float(values.min()), float(values.max())

        # check correct units:
        if highest - lowest > 5e-2:
            cprint(
                f"""
                Warning: the difference between maximal and minimal {col} values exceeds 5e-2 (5cm).
                Did you provide positions in standard units (e.g 1 micron = 1e-6)?
                To convert integer numbers of microns to standard units run:
                >>> from tdm.utils import microns
                >>> df[{col}] = microns(df[{col}]) # equivalent to df[{col}]*1e-6
                """,
                color="yellow",
            )

        # minimal value is within 10 microns of zero:
        if not abs(lowest) < 1e-5:
            cprint(
                f"""
                Warning: the minimal {col} position is not close to 0.
                The spatial positions within the tissue should range from (0,0) to (x max, y max)
                """,
                color="yellow",
            )

    return valid


@verbosity
def check_single_cell_df(single_cell_df: pd.DataFrame, verbose: bool = True) -> bool:
    """
    Checks that ``single_cell_df`` is preprocessed correctly and provides hints in case it isn't.

    Note:
        The ``single_cell_df`` must have:

        - ``x (float)`` and ``y (float)`` columns with spatial coordinates in standard units (e.g 1 micron = 1e-6)
        - a ``cell_type (str)`` column.
        - a ``division (bool)`` column.
        - an ``img_id (int)`` column with an identifier of the tissue section
        - a ``subject_id (int | str)`` column with an identifier of the patient (or mouse etc.)

    Args:
        single_cell_df (pd.DataFrame): the dataframe to validate.
        verbose (bool): not read in this function body; presumably consumed by the
            ``@verbosity`` decorator to control printing (confirm in ``tdm.utils``).

    Returns:
        bool: True if no ``[ERROR]`` was reported. Warnings (units, origin, large number of
        cell types) are printed but do not make the dataframe invalid.
    """
    cprint("Validating single cell dataframe...\n", color="blue")

    # 'x' and 'y' columns.
    # Both checks always run (no short-circuit) so that every hint is shown in one pass.
    x_valid = _check_coordinate_column(single_cell_df, X_COL)
    y_valid = _check_coordinate_column(single_cell_df, Y_COL)
    valid = x_valid and y_valid

    # 'cell_type' column:
    if CELL_TYPE_COL not in single_cell_df.columns:
        cprint(f"[ERROR] Missing {CELL_TYPE_COL} column in the dataframe.", color="red")
        valid = False
    else:
        cprint(f"[SUCCESS] Found {CELL_TYPE_COL} column.", color="green", new_line=False)

        # there shouldn't be more than 100 types:
        types = cell_types(single_cell_df)
        num_types = len(types)
        if num_types < 100:
            num_types_color = "green"
            cprint(f"Number of cell types: {num_types}", color=num_types_color)
        else:
            num_types_color = "yellow"
            cprint(
                f"Number of cell types: {num_types}. \nThis is a large number of cell-types, do the types look ok?",
                color=num_types_color,
            )

        # there shouldn't be any nan cell types (can cause problems in RestrictedNeighbors):
        if single_cell_df[CELL_TYPE_COL].isna().sum() > 0:
            cprint("[ERROR] Found nan values in the cell_type column.", color="red")
            valid = False

        cprint(f"\tCell types: {types}", color=num_types_color)

    # 'division' column:
    if DIVISION_COL not in single_cell_df.columns:
        cprint(f"[ERROR] Missing {DIVISION_COL} column in the dataframe.", color="red")
        valid = False
    else:
        division = single_cell_df[DIVISION_COL]

        # check correct dtype:
        if not division.dtype == bool:
            cprint(f"[ERROR] {DIVISION_COL} column is not of type bool.", color="red")
            valid = False
        else:
            avg_divisions = division.mean()
            cprint(
                f"[SUCCESS] Found {DIVISION_COL} column. Fraction of dividing cells: {avg_divisions:.3f}",
                color="green",
            )

    # 'img_id' column:
    if IMG_ID_COL not in single_cell_df.columns:
        cprint(f"[ERROR] Missing {IMG_ID_COL} column in the dataframe.", color="red")
        valid = False
    else:
        img_ids = single_cell_df[IMG_ID_COL]
        cprint(
            f"[SUCCESS] Found {IMG_ID_COL} column. Number of images found: {len(img_ids.unique())}",
            color="green",
        )

    # 'subject_id' column:
    if SUBJECT_ID_COL not in single_cell_df.columns:
        cprint(f"[ERROR] Missing {SUBJECT_ID_COL} column in the dataframe.", color="red")
        valid = False
    else:
        subject_id = single_cell_df[SUBJECT_ID_COL]
        cprint(
            f"[SUCCESS] Found {SUBJECT_ID_COL} column. Number of subjects found: {len(subject_id.unique())}",
            color="green",
        )

    if valid:
        cprint("\n[SUCCESS] Validation complete!", color="green")
    else:
        cprint("\n[FAIL] Validation complete!", color="red")

    return valid
def set_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cast the standard single-cell columns that are present in ``df`` to their expected dtypes.

    Args:
        df (pd.DataFrame): dataframe holding any subset of the standard columns.

    Returns:
        pd.DataFrame: the result of ``df.astype`` restricted to the standard columns found
        in ``df``; other columns are not part of the cast.
    """
    dtype_by_column = {
        CELL_TYPE_COL: str,
        X_COL: float,
        Y_COL: float,
        DIVISION_COL: bool,
        IMG_ID_COL: int,
    }
    # only cast columns that actually exist, astype raises on unknown keys
    present = {col: dtype for col, dtype in dtype_by_column.items() if col in df.columns}
    return df.astype(present)


def restrict_df_to_required_columns(df: pd.DataFrame, warn_missing_columns: bool = True) -> pd.DataFrame:
    """
    Restrict the dataframe to the required columns.

    Note:
        If any required columns are missing, a warning is issued and the available columns are returned.

    Args:
        df (pd.DataFrame): the dataframe to restrict.
        warn_missing_columns (bool): whether to emit a warning when required columns are missing.

    Returns:
        pd.DataFrame: ``df`` restricted to the required columns it contains, in the order of
        ``REQUIRED_COLUMNS``.
    """
    missing_cols = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing_cols and warn_missing_columns:
        warnings.warn(f"Warning: Missing required columns: {missing_cols}", stacklevel=2)

    # equals REQUIRED_COLUMNS when nothing is missing
    available_cols = [col for col in REQUIRED_COLUMNS if col in df.columns]
    return df[available_cols]


def cell_types(single_cell_df: pd.DataFrame) -> list[str]:
    """Return all cell types present in the single_cell_df.

    Args:
        single_cell_df (pd.DataFrame): the single cell dataframe, see
            :func:`~tdm.preprocess.single_cell_df.check_single_cell_df` for more details.

    Returns:
        list[str]: list of cell types.
    """
    return list(single_cell_df[CELL_TYPE_COL].unique())


def n_cells_per_type(single_cell_df: pd.DataFrame) -> pd.DataFrame:
    """Return the number of cells per cell type in the single_cell_df.

    Args:
        single_cell_df (pd.DataFrame): the single_cell_df, see:
            :func:`~tdm.preprocess.single_cell_df.check_single_cell_df`

    Returns:
        pd.DataFrame: a dataframe with columns 'cell_type' and '#', where '#' is the number of cells of that type.
    """
    return single_cell_df.groupby(CELL_TYPE_COL).size().reset_index(name="#")


def remap_dict_values(d: dict, val_map: dict) -> dict:
    """
    Remap values from d to new values given in val_map.

    Note:
        only maps values in d that appear in val_map.

    Tip:
        useful for reducing the granularity of a cell-type definitions dictionary.

    Example:
        >>> val_map = {'CD4': 'T', 'CD8':'T'}
        >>> d = {'CD4 T-cell Marker': 'CD4', 'CD8 T-cell Marker': 'CD8'}
        >>> remap_dict_values(d, val_map)
        {'CD4 T-cell Marker': 'T', 'CD8 T-cell Marker': 'T'}
    """
    # values without an entry in val_map are passed through unchanged
    return {key: val_map.get(value, value) for key, value in d.items()}


def add_and_rename_columns(df: pd.DataFrame, rename_dict: dict) -> pd.DataFrame:
    """
    Add and rename columns in the dataframe.

    Note:
        The new columns are assigned on ``df`` itself (the input is modified in place) and
        the original columns are kept alongside the renamed copies.

    Args:
        df (pd.DataFrame): the dataframe to add and rename columns in.
        rename_dict (dict): a dictionary with the original column names as keys and the new column names as values.

    Returns:
        pd.DataFrame: the dataframe with the added and renamed columns (the same object as ``df``).
    """
    # create columns with tdm names:
    # x,y, ki67, cell_type, img_num, subject_id
    for original_col, renamed_col in rename_dict.items():
        df[renamed_col] = df[original_col]

    return df


def single_cell_df_from_adata(adata: AnnData) -> pd.DataFrame:
    """
    Create a single cell dataframe from an AnnData object.

    Args:
        adata (AnnData): the AnnData object to create the single cell dataframe from.

    Note:
        The adata object is expected to be the output of the MCMICRO pipeline followed by SCIMAP phenotyping workflow.
        This means it should contain:

        * spatial coordinates as ``X_centroid`` and ``Y_centroid``
        * marker counts in ``adata.raw.X``
        * phenotype annotations in ``adata.obs['phenotype']``

    Note:
        This function doesn't define division events or add a ``division`` column.
        See tdm.preprocess.ki67 for defining division events based on Ki67 expression.

    Note:
        The ``adata`` should have a ``raw`` attribute with the original counts.

    Returns:
        pd.DataFrame: the single cell dataframe.

    Raises:
        AssertionError: if a required ``adata.obs`` column is missing or ``adata.raw`` is not set.
    """
    # assert adata has the required columns:
    obs_columns = adata.obs.columns
    assert "X_centroid" in obs_columns, "Missing X_centroid column in adata.obs"
    assert "Y_centroid" in obs_columns, "Missing Y_centroid column in adata.obs"
    assert "phenotype" in obs_columns, "Missing phenotype column in adata.obs"
    # hasattr() is not enough here: AnnData exposes ``raw`` as an attribute that is None
    # when no raw data was stored, so test the value rather than the attribute's existence.
    assert getattr(adata, "raw", None) is not None, "Missing raw attribute in adata"

    # fetch marker counts from adata:
    protein_counts = pd.DataFrame(data=adata.raw.X, index=adata.obs.index, columns=adata.raw.var.index)
    protein_counts = protein_counts.rename(columns={"KI67": KI67_COL})

    # fetch x,y positions and phenotype:
    scdf = adata.obs.copy()

    # rename columns to match required columns:
    scdf = scdf.rename(
        columns={
            "X_centroid": X_COL,
            "Y_centroid": Y_COL,
            "imageid": IMG_ID_COL,
            "phenotype": CELL_TYPE_COL,
        }
    )

    # assume single subject
    scdf[SUBJECT_ID_COL] = 1
    # NOTE: this overwrites any image id renamed from ``imageid`` above.
    scdf[IMG_ID_COL] = 1  # TODO: support string image type and fetch from adata

    # filter out some irrelevant columns:
    scdf = restrict_df_to_required_columns(scdf, warn_missing_columns=False)

    # join marker counts:
    scdf = scdf.join(protein_counts)

    scdf = set_dtypes(scdf)

    return scdf