Source code for sequana.enrichment.panther

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2021 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################

import colorlog
from tqdm import tqdm

from sequana.enrichment.ontology import Ontology
from sequana.enrichment.plot_go_terms import PlotGOTerms
from sequana.enrichment.quickgo import QuickGOGraph
from sequana.lazy import numpy as np
from sequana.lazy import pandas as pd
from sequana.lazy import pylab

logger = colorlog.getLogger(__name__)


__all__ = ["PantherEnrichment"]


[docs]class PantherEnrichment(Ontology, PlotGOTerms):
    """

    By default, we keep only the genes with a adjusted pvalue <= 0.05. The
    fold_change threshold is on a log2 scale and set to 0 (meaning no
    filtering). Only one value is requested and
    used to filter out positive and negative fold change (on the log2
    scale). In other word, a log2 fold change threshold of 2 means that we
    filter out values between -2 and 2.

    If you prefer to work in natural scale, just set the parameter
    fc_threshold, which overwrite the log2_fc_threshold parameter.

    ::

        pe = PantherEnrichment("input_file.tsv", taxon=10090, log2_fc_threshold=1)
        # compute enrichment for genes down and up
        pe.compute_enrichment()

        # Results for up case is stored in pe.enrichment
        # then, we plot the most mportat go terms
        df_up = pe.plot_go_terms("up")

        df = pe.plot_go_terms("up", pe.MF)
        pe.save_chart(df, "chart_MF_up.png")

        # all 3 main ontology
        df = pe.plot_go_terms("up")
        pe.save_chart(df, "chart_up.png")

    e.stats contains some statistics. One important is the list of unmapped
    genes. The results from the GO enrichment are stored in the attributes
    enrichment. There, we have again adjusted p-value and a fold enrichment,
    which can in turn be filtered or not.

    You can retrieve the cleaned data using the get_data method.

    You can also plot the GO terms that are significantly enriched using::

        e.plot_go_terms(['MF', 'CC', 'BP])

    This function returns the dataframe used during the plotting.

    If you want to look at the up regulated genes only::

        e.compute_enrichment(pe.mygenes["up"], 83333)
        df = e.plot_go_terms(['MF', 'CC', 'BP'],
                log=False, include_negative_enrichment=False,
                fontsize=8, sort_by='fold_enrichment',
                show_pvalues=True, fdr_threshold=0.05)


    The number of genes is limited to about 3100 depending (don't ask me
    why, this seem to be a hard-coded limitation on PantherDB website).
    In such case, you should add a filter e.g on padj or fold change


    """

    def __init__(
        self,
        gene_lists,
        taxon,
        requests_per_sec=10,
        padj_threshold=0.05,
        log2_fc_threshold=0,
        fc_threshold=None,
        enrichment_fdr=0.05,
        annot_col="Name",
    ):
        """


        rnadiff if provided, superseeds the input filename. This is useful for
        debugging
        """
        Ontology.__init__(self)
        PlotGOTerms.__init__(self)

        self.gene_lists = gene_lists
        self.enrichment_fdr = enrichment_fdr

        # users can set the fold change threshold in the log2 scale or normal
        # scale.
        assert log2_fc_threshold >= 0, "log2 fc_threshold must be >=0"
        if fc_threshold is not None:
            log2_fc_threshold = pylab.log2(fc_threshold)

        from bioservices import panther, quickgo

        self.quick_go_graph = QuickGOGraph()

        self.panther = panther.Panther(cache=True)
        self.valid_taxons = [x["taxon_id"] for x in self.panther.get_supported_genomes()]
        self.summary = {}

        self._taxon = None
        self.taxon = taxon

        self.quickgo = quickgo.QuickGO(cache=True)
        self.quickgo.requests_per_sec = requests_per_sec
        self.quickgo.services.settings.TIMEOUT = 120

        self._ancestors = {
            "MF": "GO:0003674",
            "CC": "GO:0005575",
            "BP": "GO:0008150",
            "SLIM_MF": "GO:0003674",
            "SLIM_CC": "GO:0005575",
            "SLIM_BP": "GO:0008150",
        }
        self.ontologies.extend(
            [
                "ANNOT_TYPE_ID_PANTHER_GO_SLIM_MF",
                "ANNOT_TYPE_ID_PANTHER_GO_SLIM_BP",
                "ANNOT_TYPE_ID_PANTHER_GO_SLIM_CC",
                "ANNOT_TYPE_ID_PANTHER_PC",
                "ANNOT_TYPE_ID_PANTHER_PATHWAY",
                "ANNOT_TYPE_ID_REACTOME_PATHWAY",
            ]
        )

        self.ontology_aliases.extend(
            [
                "SLIM_MF",
                "SLIM_BP",
                "SLIM_CC",
                "PROTEIN",
                "PANTHER_PATHWAY",
                "REACTOME_PATHWAY",
            ]
        )

        # panther accepts onyl ~2-3000 genes at max. Let us restrict the analysis
        # to the first 2000 genes based on their log2 fold change 2000 + and
        # 2000 negatives

        msg = "Ignoring DEGs with adjusted p-value > {} and fold change in [{}, {}]".format(
            padj_threshold, 1 / (2**log2_fc_threshold), 2**log2_fc_threshold
        )
        logger.info(msg)

        # used in report module
        self.summary["fold_change_range"] = [
            1 / (2**log2_fc_threshold),
            2**log2_fc_threshold,
        ]
        self.summary["padj_threshold"] = padj_threshold

        fc_threshold = log2_fc_threshold

        for x in sorted(gene_lists.keys()):
            N = len(gene_lists[x])
            logger.info(f"Starting with {N} genes from category '{x}'")

        self.summary["DGE_after_filtering"] = {k: len(v) for k, v in gene_lists.items()}

        self.enrichment = {}
        self.stats = {}
        self.obsolets = []

    def _set_taxon(self, taxon):
        if taxon not in self.valid_taxons:
            raise ValueError(f"taxon {taxon} not in pantherDB. please use one of {self.valid_taxons}")
        self.taxon_info = [x for x in self.panther.get_supported_genomes() if x["taxon_id"] == taxon]
        self.taxon_info = self.taxon_info[0]
        self._taxon_id = taxon

    def _get_taxon(self):
        return self._taxon_id

    taxon = property(_get_taxon, _set_taxon)

[docs]    def compute_enrichment(
        self,
        taxid=None,
        ontologies=None,
        enrichment_test="FISHER",
        correction="FDR",
    ):
        """
        :param enrichment_test: Fisher or Binomial
        :param correction: FDR or Bonferonni

        The field **number_in_reference** indicates from the reference, the number
        of genes that have a given ontolgy term. For instance, 998 genes have
        the term. This is stored in **number_in_reference**. If the reference
        contains 4391 genes, and you provided 49
        genes , the **expected** number of genes that have this ontology term is
        49*998/4391 that is 11.1369, which is stored in **"expected**.

        Now, if you actually find that 14 out of 49
        genes have the term, you need to compare the numbers 11.1369 and 14. Are
        they really different ? The ratio 14 / 11.1369 is stored in
        **fold_enrichment**. The pvalue and FDR are stored as well.

        Some genes may be missing If you provide 50 genes, you may end up with
        only 45 being mapped onto panther db database. This may explain some
        differences with the expected value.

        Fold enrichment is the number_in_list / expected ratio. Another close metric is the
        fractional difference: (observed - expected) / expected. This metric is
        slighlty less than the fold enrichment

        To get the number f genes from the database, simply take one output, and
        compute number_in_reference * number_in_list / expected

        The fold enrichment is also called odd-ratio.
        """

        for category, gene_list in self.gene_lists.items():
            logger.info(f"Computing enrichment for category {category}")
            self.enrichment[category], self.stats[category] = self._compute_enrichment(
                gene_list,
                taxid=taxid,
                ontologies=ontologies,
                enrichment_test=enrichment_test,
                correction=correction,
            )

[docs]    def get_mapping_stats(self):
        results = []
        for k in self.enrichment.keys():
            if self.enrichment[k]:
                ontologies = self.enrichment[k].keys()
                for o in ontologies:
                    M = self.enrichment[k][o]["input_list"]["mapped_count"]
                    U = self.enrichment[k][o]["input_list"]["unmapped_count"]
                    results.append([k, o, M, U])
        df = pd.DataFrame(results)
        if not df.empty:
            df[4] = 100 * df[2] / (df[3] + df[2])
            df[5] = df[2] + df[3]

            df.columns = [
                "category",
                "ontology",
                "mapped",
                "unmapped",
                "mapped_percentage",
                "total",
            ]
        return df

    def _compute_enrichment(
        self,
        mygenes,
        taxid,
        ontologies=None,
        enrichment_test="FISHER",
        correction="FDR",
    ):
        # taxid=83333 # ecoli
        if taxid is None:
            taxid = self.taxon

        # nice bug from panterdb website. if 'php' is a gene name, no process is done
        if "php" in mygenes:
            mygenes.remove("php")

        if isinstance(mygenes, list):
            mygenes = ",".join(mygenes)

        if mygenes.count(",") > 2500:  # pragma: no cover
            logger.warning("Please reduce the list input genes. may fail on pantherb otherwise")
        if len(mygenes) <= 2:  # pragma: no cover
            logger.error(f"Less than 2 genes are found for in the gene set: {mygenes}. No enrichment will be computed")
            return None, None

        if ontologies is None:
            ontologies = self.ontology_aliases
        else:
            for x in ontologies:
                assert x in self.ontology_aliases, f"{x} not valid ontology"

        # for each ontology, we will store one key/value item
        enrichment = {}

        def get_panther_ont(x):
            index = self.ontology_aliases.index(x)
            return self.ontologies[index]

        unclassified = {}
        for ontology in ontologies:
            unclassified[ontology] = 0
            logger.info(f" - Computing enrichment for {ontology}")

            try:
                del results
            except NameError:
                pass
            results = self.panther.get_enrichment(
                mygenes,
                taxid,
                get_panther_ont(ontology),
                enrichment_test=enrichment_test,
                correction=correction,
            )
            count = 0
            while count < 2 and results == 404:  # pragma: no cover
                logger.warning("Panther request failed Trying again")
                results = self.panther.get_enrichment(
                    mygenes,
                    taxid,
                    get_panther_ont(ontology),
                    enrichment_test=enrichment_test,
                    correction=correction,
                )
                count += 1

            if results == 404:  # pragma: no cover
                logger.warning("Invalid output from pantherdb (too many genes ?). skipping {}".format(ontology))
                enrichment[ontology] = None
                continue

            ##if isinstance(results["result"], dict):  # pragma: no cover
            #    results["result"] = [results["result"]]

            # extract the ID and label of the go term and save
            # as primary keys

            for i, k in enumerate(results["result"]):
                if results["result"][i]["term"]["label"] == "UNCLASSIFIED":
                    unclassified[ontology] += 1
                    # results["result"][i]['label'] = None
                else:
                    results["result"][i]["id"] = k["term"]["id"]
                    results["result"][i]["label"] = k["term"]["label"]

            # convert to dataframe and remove unclassified labels
            df = pd.DataFrame(results["result"])
            df = df[df["label"].isnull() == False]

            # compute pvalues adjusted
            pvalues = df["pValue"].copy()

            import statsmodels
            import statsmodels.stats.multitest

            if correction == "FDR":
                fdr = statsmodels.stats.multitest.multipletests(pvalues, method="fdr_bh")[1]
            elif correction.lower() == "bonferroni":
                fdr = statsmodels.stats.multitest.multipletests(pvalues, method="bonferroni")[1]

            df["fdr2"] = fdr
            if enrichment_test.lower() == "binomial":
                df["fdr"] = fdr

            # store all results
            enrichment[ontology] = results
            enrichment[ontology]["result"] = df

        stats = dict([(k, len(v["result"])) for k, v in enrichment.items()])
        stats["input_genes"] = len(mygenes.split(","))

        try:
            unmapped = enrichment[ontologies[0]]["input_list"]["unmapped_id"]
            stats["unmapped_genes"] = unmapped
            stats["N_unmapped_genes"] = len(unmapped)
        except Exception:
            stats["unmapped_genes"] = []
            stats["N_unmapped_genes"] = 0
        stats["unclassified_GO"] = unclassified

        # Here, looking at the FDr, it appears that when using bonferroni,
        # all FDR are set to zeros. Moreover, when using Fisher tests and
        # FDR (supposibly a FDR_BH, the results are noisy as compare to a
        # test from statsmodels. Moreover, when using binomial test, the FDR
        # is not computed... So, we will recompute the FDR ourself
        return enrichment, stats

[docs]    def get_data(self, category, ontologies, include_negative_enrichment=True, fdr=0.05):
        """

        From all input GO term that have been found and stored in
        enrichment[ONTOLOGY]['result'], we keep those with fdr<0.05. We also
        exclude UNCLASSIFIED entries. The final dataframe is returned

        ::

            pe.get_data(""up", MF")

        """

        df = self._get_data(category, ontologies)

        # some extra information for convenience
        df["pct_diff_expr"] = df["number_in_list"] * 100 / df["number_in_reference"]

        # could happen that all fold enrichment are set to 'NaN'
        df = df[df["fold_enrichment"] != "NaN"]
        df["log2_fold_enrichment"] = pylab.log2(df["fold_enrichment"])
        df["abs_log2_fold_enrichment"] = abs(pylab.log2(df["fold_enrichment"]))
        df["expected"] = [int(x) for x in df.expected]

        # Some user may want to include GO terms with fold enrichment
        # significanyly below 1 or not.
        if include_negative_enrichment is False:
            df = df.query("fold_enrichment>=1").copy()
            logger.info("Found {} GO terms after keeping only positive enrichment".format(len(df)))

        # filter out FDR>0.05
        df = df.query("fdr<=@fdr").copy()
        logger.info("Found {} GO terms after keeping only FDR<{}".format(len(df), fdr))

        return df

[docs]    def get_functional_classification(self, mygenes, taxon):  # pragma: no cover ; too slow
        """Mapping information from pantherDB for the lisf of genes

        We also store uniprot persistent id

        """
        logger.warning("Very slow. Please wait")
        if isinstance(mygenes, list):
            mygenes = ",".join(mygenes)

        res = self.panther.get_mapping(mygenes, taxon)
        res = res["mapped"]

        for i, item in tqdm(enumerate(res)):
            accession = item["accession"]
            res[i]["persistent_id"] = self._get_name_given_accession(accession)
        return res

    def _get_name_given_accession(self, accession):  # pragma: no cover
        from bioservices import UniProt

        self.uniprot = UniProt(cache=True)
        self.uniprot.requests_per_sec = 10

        acc = [x for x in accession.split("|") if x.startswith("UniProtKB")]
        acc = acc[0].split("=")[1]
        res = self.uniprot.get_df(acc, limit=1)
        name = res["Gene names  (primary )"][0]
        return name

[docs]    def plot_piechart(self, df):
        # Here we show the GO terms that have number in list > 0
        # Note, that this is dangerous to look only at this picture without
        # the reference plot, which data is not available thourg the pathner API
        labels = []
        for this in df.query("number_in_list!=0").label.values:
            if len(this) > 50:
                labels.append(this)
            else:
                labels.append(this[0:50] + "...")
        pylab.pie(df.query("number_in_list!=0").number_in_list, labels=labels)
        pylab.gcf().set_layout_engine("tight")

[docs]    def save_chart(self, df, filename="chart.png"):
        """

        pe = PantherEnrichment("B4052-V1.T1vsT0.complete.xls", fc_threshold=5,
            padj_threshold=0.05)
        df = pe.plot_go_terms("down", log=True, compute_levels=False)
        pe.save_chart(df, "chart.png")

        """
        self.quick_go_graph.save_chart(df, filename)

    def _get_graph(self, df, ontologies):
        return self.quick_go_graph._get_graph(df, ontologies=ontologies)