Source code for sc_utils.markers

import numpy as np
import pandas as pd


[docs]def get_markers( adata, groupby, key="rank_genes_groups", p_val_cutoff=0.05, logfc_cutoff=0.5 ): """\ Extract markers from adata into Seurat-like table Extracts markers after they are computed by ``scanpy``. Produces Seurat-like table with fields ``"p_val", "avg_logFC", "pct.1", "pct.2", "p_val_adj", "cluster", "gene"`` Calculates the percentage of cells that express a given gene in the target cluster (``pct.1`` field) and outside the cluster (``pct.2`` field) from ``adata.raw`` matrix. Parameters ---------- adata Annotated data matrix. groupby ``adata.obs`` field used for marker calculation key ``adata.uns`` key that has computed markers p_val_cutoff Drop all genes with adjusted p-value greater than or equal to this logfc_cutoff Drop all genes with average logFC less than or equal to this Returns ------- Returns a pandas dataframe with above listed columns, optionally subsetted on the genes that pass the cutoffs. ``p_val`` field is a copy of adjusted p-value field. Example ------- >>> sc.tl.rank_genes_groups(adata, "leiden", method="wilcoxon", n_genes=200) >>> markers = sc_utils.get_markers(adata, "leiden") >>> markers.to_csv("markers.csv") """ markers = pd.concat([ pd.DataFrame(adata.uns[key]["names"]).melt(), pd.DataFrame(adata.uns[key]["pvals_adj"]).melt(), pd.DataFrame(adata.uns[key]["logfoldchanges"]).melt() ], axis=1) markers.columns = ("cluster", "gene", "cluster2", "p_val_adj", "cluster3", "avg_logFC") markers = markers.loc[:, ["cluster", "gene", "avg_logFC", "p_val_adj"]] markers = markers.loc[markers.avg_logFC > logfc_cutoff, ] markers = markers.loc[markers.p_val_adj < p_val_cutoff, ] markers["pct.1"] = pd.Series(dtype=float) markers["pct.2"] = pd.Series(dtype=float) for cluster in markers.cluster.unique(): cells = adata.obs[groupby] == cluster in_cluster_selector = markers.cluster == cluster genes = markers.gene[in_cluster_selector] in_cluster = np.sum(adata.raw[cells, genes].X > 0, axis=0) / cells.sum() markers.loc[in_cluster_selector, "pct.1"] = in_cluster.T other_cells = adata.obs[groupby] != cluster other_clusters = np.sum(adata.raw[other_cells, genes].X > 0, axis=0) / other_cells.sum() markers.loc[in_cluster_selector, "pct.2"] = other_clusters.T markers["p_val"] = markers.p_val_adj markers = markers.loc[:, ["p_val", "avg_logFC", "pct.1", "pct.2", "p_val_adj", "cluster", "gene"]] return markers