Source code for sequana.enrichment.gsea

# -*- coding: utf-8 -*-
#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2020 - Sequana Development Team
#
#  File author(s):
#      Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
#      Etienne Kornobis <etienne.kornobis@pasteur.fr>
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################

import tempfile

import colorlog
import gseapy

logger = colorlog.getLogger(__name__)

from sequana.lazy import pandas as pd

__all__ = ["GSEA"]


class GSEA:
    """Thin wrapper around :func:`gseapy.enrichr` for gene-set enrichment.

    The gene sets to test against are stored on the instance
    (:attr:`gene_sets`) and reused for every call to
    :meth:`compute_enrichment`.
    """

    def __init__(self, gene_sets=None):
        """.. rubric:: **Constructor**

        :param gene_sets: the gene sets to be checked for enrichment; passed
            unchanged to ``gseapy.enrichr`` as its ``gene_sets`` argument.
            Defaults to a new empty dictionary for each instance.

        """
        #: attribute to store the gene sets to be checked for enrichment
        # A fresh dict is created per instance: a ``{}`` default in the
        # signature would be shared by all instances built without argument.
        self.gene_sets = {} if gene_sets is None else gene_sets
        #: forwarded to ``gseapy.enrichr`` as its ``no_plot`` argument
        self.no_plot = True

    # Remove description when using v0.13.0 of gseapy API
    # NOTE(review): no ``description`` argument is passed below anymore;
    # this comment looks stale -- confirm and drop it.
    def compute_enrichment(self, gene_list, background=None, verbose=False, outdir=None):
        """Run ``gseapy.enrichr`` on *gene_list* against :attr:`gene_sets`.

        :param gene_list: list of genes (e.g. genes with significant fold change)
        :param background: expected background of the species.
            Should be number of genes for the species of interest.
        :param verbose: forwarded to ``gseapy.enrichr``.
        :param outdir: directory where reports and intermediate results are
            stored. Can be a path (``str`` or path-like object) or any object
            exposing the directory path as a ``name`` attribute (e.g. a
            :class:`tempfile.TemporaryDirectory`). If None, a temporary
            directory is created and removed before returning.
        :return: an object with a ``results`` attribute. When the enrichment
            succeeds, this is the object returned by ``gseapy.enrichr``; if
            its results are not empty, the ``Genes`` column is rewritten with
            the ``;``-separated gene names sorted, and a ``size`` column
            holding the number of genes per row is added. When
            ``gseapy.enrichr`` raises ``ValueError``, a named tuple holding an
            empty dataframe (columns ``Genes``, ``size``, ``Term``) is
            returned instead.
        """
        import os
        from collections import namedtuple

        owned_tmpdir = None
        if outdir is None:
            # We own this directory, so we are responsible for its cleanup.
            owned_tmpdir = tempfile.TemporaryDirectory()
            outdir_path = owned_tmpdir.name
        elif isinstance(outdir, (str, os.PathLike)):
            # Plain paths are used as is (documented usage).
            outdir_path = os.fspath(outdir)
        else:
            # Backward compatibility: objects carrying the path in ``.name``
            # such as a caller-owned tempfile.TemporaryDirectory.
            outdir_path = outdir.name

        try:
            enr = gseapy.enrichr(
                gene_list=gene_list,
                gene_sets=self.gene_sets,
                verbose=verbose,
                background=background,
                outdir=outdir_path,
                no_plot=self.no_plot,
            )
        except ValueError:
            # if no hits, newest gseapy version will raise a ValueError
            enrich = namedtuple("MyStruct", "results")
            enr = enrich(results=pd.DataFrame({"Genes": [], "size": [], "Term": []}))
        else:
            if len(enr.results):
                # Sort gene names within each row so the output is deterministic
                enr.results["Genes"] = [";".join(sorted(x.split(";"))) for x in enr.results["Genes"].values]
                enr.results["size"] = [len(x.split(";")) for x in enr.results.Genes]
        finally:
            if owned_tmpdir is not None:
                owned_tmpdir.cleanup()

        return enr