import numpy as np
import pandas as pd
[docs]def get_markers(
adata,
groupby,
key="rank_genes_groups",
p_val_cutoff=0.05,
logfc_cutoff=0.5
):
"""\
Extract markers from adata into Seurat-like table
Extracts markers after they are computed by ``scanpy``. Produces Seurat-like
table with fields
``"p_val", "avg_logFC", "pct.1", "pct.2", "p_val_adj", "cluster", "gene"``
Calculates the percentage of cells that express a given gene
in the target cluster (``pct.1`` field) and outside the cluster
(``pct.2`` field) from ``adata.raw`` matrix.
Parameters
----------
adata
Annotated data matrix.
groupby
``adata.obs`` field used for marker calculation
key
``adata.uns`` key that has computed markers
p_val_cutoff
Drop all genes with adjusted p-value greater than or equal to this
logfc_cutoff
Drop all genes with average logFC less than or equal to this
Returns
-------
Returns a pandas dataframe with above listed columns, optionally
subsetted on the genes that pass the cutoffs.
``p_val`` field is a copy of adjusted p-value field.
Example
-------
>>> sc.tl.rank_genes_groups(adata, "leiden", method="wilcoxon", n_genes=200)
>>> markers = sc_utils.get_markers(adata, "leiden")
>>> markers.to_csv("markers.csv")
"""
markers = pd.concat([
pd.DataFrame(adata.uns[key]["names"]).melt(),
pd.DataFrame(adata.uns[key]["pvals_adj"]).melt(),
pd.DataFrame(adata.uns[key]["logfoldchanges"]).melt()
], axis=1)
markers.columns = ("cluster", "gene", "cluster2", "p_val_adj", "cluster3", "avg_logFC")
markers = markers.loc[:, ["cluster", "gene", "avg_logFC", "p_val_adj"]]
markers = markers.loc[markers.avg_logFC > logfc_cutoff, ]
markers = markers.loc[markers.p_val_adj < p_val_cutoff, ]
markers["pct.1"] = pd.Series(dtype=float)
markers["pct.2"] = pd.Series(dtype=float)
for cluster in markers.cluster.unique():
cells = adata.obs[groupby] == cluster
in_cluster_selector = markers.cluster == cluster
genes = markers.gene[in_cluster_selector]
in_cluster = np.sum(adata.raw[cells, genes].X > 0, axis=0) / cells.sum()
markers.loc[in_cluster_selector, "pct.1"] = in_cluster.T
other_cells = adata.obs[groupby] != cluster
other_clusters = np.sum(adata.raw[other_cells, genes].X > 0, axis=0) / other_cells.sum()
markers.loc[in_cluster_selector, "pct.2"] = other_clusters.T
markers["p_val"] = markers.p_val_adj
markers = markers.loc[:, ["p_val", "avg_logFC", "pct.1", "pct.2", "p_val_adj", "cluster", "gene"]]
return markers