Source code for sc_utils.meta

import anndata as ad
import pandas as pd


def _select_first(df: pd.DataFrame, col_pattern: str) -> pd.Series:
    """\
    Selects first non-empty value from columns matching ``col_pattern``

    Columns are considered in the order in which they appear in ``df``.
    The selection is positional, so it works regardless of the name of
    the index and in the presence of duplicate index labels.

    Parameters
    ----------
    df
        Pandas data frame
    col_pattern
        Pattern to match columns. It is applied with ``str.match``, i.e.
        it is anchored at the beginning of the column name only.

    Returns
    -------
    Pandas series with first non-empty value among the matching columns
    selected for each index of the dataframe. Rows where all the matching
    columns are empty get the (empty) value of the first matching column.

    Raises
    ------
    ValueError
        If no column of ``df`` matches ``col_pattern``.
    """
    mask = df.columns.str.match(col_pattern, na=False)
    matched = df.loc[:, mask]
    if matched.shape[1] == 0:
        raise ValueError(f"No columns match pattern {col_pattern!r}")

    # argmax over booleans gives the position of the first True in each
    # row; for rows without any True it gives 0, which points at a missing
    # value, so those rows stay missing in the result.
    first_pos = matched.notna().to_numpy().argmax(axis=1)

    # Object dtype keeps the original values untouched when the matching
    # columns have different dtypes (e.g. categoricals from batches).
    values = matched.to_numpy(dtype=object)
    picked = [row[pos] for row, pos in zip(values, first_pos)]
    return pd.Series(picked, index=matched.index)


def merge_gene_info(adata: ad.AnnData):
    """\
    Merges gene information from different batches

    After concatenating several datasets, the gene information dataframe
    ``adata.var`` can have a lot of duplicate columns from all the batches.

    This function merges ``gene_ids``, ``feature_types`` and ``genome``
    information from batches, inserts them in the table and removes the
    batch-associated columns.

    The merged columns are placed at the beginning of ``adata.var``, in
    the order listed above. An attribute without any batch-associated
    column (``<name>-<number>``) is skipped. ``adata.var`` is modified in
    place and nothing is returned.

    Parameters
    ----------
    adata
        Annotated data matrix.

    Example
    -------
    >>> datasets = [sc.read_h5ad(path) for path in paths]
    >>> adata = datasets[0].concatenate(datasets[1:], join="outer")
    >>> sc_utils.merge_gene_info(adata)
    """
    position = 0
    for column in ["gene_ids", "feature_types", "genome"]:
        # The same pattern must be used for selecting and for dropping;
        # it has to be an f-string so that ``column`` is interpolated.
        pattern = rf"{column}-\d+"
        batch_columns = adata.var.columns[
            adata.var.columns.str.match(pattern)
        ]
        if batch_columns.empty:
            # Nothing to merge for this attribute
            continue
        adata.var.insert(
            position,
            column,
            _select_first(adata.var, pattern)
        )
        adata.var.drop(columns=batch_columns, inplace=True)
        # Advance only after an insertion so that the merged columns stay
        # contiguous at the front of the table.
        position += 1