Source code for sequana.modules_report.panther_enrichment

#
#  This file is part of Sequana software
#
#  Copyright (c) 2020 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Module to write enrichment report"""
import os
import sys
from pathlib import Path

import colorlog
from plotly import offline

from sequana.enrichment.panther import PantherEnrichment
from sequana.lazy import pandas as pd
from sequana.lazy import pylab
from sequana.modules_report.base_module import SequanaBaseModule
from sequana.utils import config
from sequana.utils.datatables_js import DataTable

logger = colorlog.getLogger(__name__)



[docs]
class ModulePantherEnrichment(SequanaBaseModule):
    """Write HTML report of variant calling. This class takes a csv file
    generated by sequana_variant_filter.
    """

    def __init__(
        self,
        gene_lists,
        taxon,
        enrichment_params={
            "padj": 0.05,
            "log2_fc": 3,
            "max_entries": 3000,  # not used in enrichment
            "nmax": 50,
            "mapper": None,
            "plot_compute_levels": False,
            "plot_logx": True,
        },
        command="",
        ontologies=["MF", "BP", "CC"],
    ):
        """.. rubric:: constructor"""
        super().__init__()
        self.title = "Panther Enrichment"

        self.command = command
        self.gene_lists = gene_lists
        self.enrichment_params = enrichment_params
        self.nmax = enrichment_params.get("nmax", 50)
        self.csv_directory = Path(config.output_dir) / "tables"
        self.plot_directory = Path(config.output_dir) / "plots"
        self.csv_directory.mkdir(exist_ok=True)
        self.plot_directory.mkdir(exist_ok=True)

        # compute the enrichment here once for all, This may take time
        from sequana import logger

        logger.setLevel("INFO")

        logger.info(" === Module PanthEnrichment. === ")
        self.pe = PantherEnrichment(
            self.gene_lists,
            taxon,
            # max_entries=self.enrichment_params["max_entries"],
            log2_fc_threshold=self.enrichment_params["log2_fc"],
        )

        self.ontologies = ontologies

        # Compute the enrichment
        self.pe.compute_enrichment(ontologies=self.ontologies)
        self.df_stats = self.pe.get_mapping_stats()

        self.create_report_content()
        self.create_html("enrichment.html")


[docs]
    def create_report_content(self):
        self.sections = list()
        self.summary()
        self.add_go()
        self.sections.append({"name": "5 - Info", "anchor": "command", "content": self.command})



[docs]
    def summary(self):
        """Add information."""

        total_up = len(self.gene_lists["up"])
        total_down = len(self.gene_lists["down"])
        total = total_up + total_down
        log2fc = self.enrichment_params["log2_fc"]

        # create html table for taxon information
        _taxon_id = self.pe.taxon_info["taxon_id"]
        _taxon_name = self.pe.taxon_info["long_name"]

        # table of undertermined IDs
        df_stats = self.df_stats.drop_duplicates()
        datatable = DataTable(df_stats, "unmapped")
        datatable.datatable.datatable_options = {
            "scrollX": "true",
            "pageLength": 10,
            "scrollCollapse": "true",
            "dom": "Bfrtip",
            "buttons": ["copy", "csv"],
        }
        js = datatable.create_javascript_function()
        html_table = datatable.create_datatable(float_format="%E")

        self.sections.append(
            {
                "name": "1 - Summary",
                "anchor": "filters_option",
                "content": f"""

<p>In the following sections, you will find the GO
terms enrichment. The input data for those analyis is the output of the RNADiff
analysis where adjusted p-values above 0.05 are excluded. Moreover, we removed
candidates with log2 fold change below {log2fc}. Using these filters, the list of
differentially expressed genes is made of {total_up} up and {total_down} down genes (total {total})</p>
<p> In the following plots you can find the first GO terms that are enriched, keeping a
maximum of {self.nmax} identifiers. </p>

<p>The taxon used is {_taxon_name} (ID {_taxon_id}).<br>

<p> Check the following table for the percentage of mapped genes on Panther
DB</p>

{js}{html_table}
""",
            }
        )



[docs]
    def add_go(self):
        # somehow, logger used here and in add_kegg must be global. If you call
        # add_go and then add_kegg, logger becomes an unbound local variable.
        # https://stackoverflow.com/questions/10851906/python-3-unboundlocalerror-local-variable-referenced-before-assignment
        # global logger
        level = logger.level
        logger.setLevel(level)

        html_intro = """
<p>Here below is a set of plots showing the enriched GO terms using the down
regulated genes only, and then the up-regulated genes only. When possible a
graph of the found GO terms is provided. MF stands for molecular
function, CC for cellular components and BP for biological process.</p>
        </div>
        """.format(
            self.pe.summary["padj_threshold"],
            self.pe.summary["fold_change_range"][0],
            self.pe.summary["fold_change_range"][1],
            self.pe.summary["DGE_after_filtering"]["down"],
            self.pe.summary["DGE_after_filtering"]["up"],
            self.pe.summary["DGE_after_filtering"]["all"],
        )

        html = self._get_enrichment("down")
        self.sections.append(
            {
                "name": "2 - Enriched GO terms (Down cases)",
                "anchor": "go_down",
                "content": html,
            }
        )

        html = self._get_enrichment("up")
        self.sections.append(
            {
                "name": "3 - Enriched GO terms (Up cases)",
                "anchor": "go_up",
                "content": html,
            }
        )

        html = self._get_enrichment("all")
        self.sections.append(
            {
                "name": "4 - Enriched GO terms (All cases)",
                "anchor": "go_all",
                "content": html,
            }
        )


    # a utility function to create the proper html table

[docs]
    def get_html_table(self, this_df, identifier):
        df = this_df.copy()

        # depending on the Panther category, we add the link to the identifier
        links = []
        for x in df["id"]:
            if x.startswith("PC"):
                links.append(f"http://www.pantherdb.org/panther/category.do?categoryAcc={x}")
            elif x.startswith("R-"):
                links.append(f"https://reactome.org/PathwayBrowser/#/{x}")
            else:
                links.append(f"https://www.ebi.ac.uk/QuickGO/term/{x}")
        df["links"] = links

        # remove non-informative or redundant fields
        df = df.drop(
            ["term", "fdr2", "abs_log2_fold_enrichment", "pct_diff_expr"],
            errors="ignore",
            axis=1,
        )

        first_col = df.pop("id")
        df.insert(0, "id", first_col)
        df = df.sort_values(by="fold_enrichment", ascending=False)

        datatable = DataTable(pd.DataFrame(df), identifier)
        datatable.datatable.set_links_to_column("links", "id")
        datatable.datatable.datatable_options = {
            "scrollX": "true",
            "pageLength": 10,
            "scrollCollapse": "true",
            "dom": "Bfrtip",
            "buttons": ["copy", "csv"],
        }
        js = datatable.create_javascript_function()
        html_table = datatable.create_datatable(float_format="%E")
        return js + html_table


    def _get_enrichment(self, category):
        # category is in down/up/all

        style = "width:95%"
        _temp_df = {}
        _minus = {}
        _plus = {}

        if not self.pe.enrichment[category]:
            return ""

        html = ""

        for ontology in self.ontologies:
            # get dat without plotting to store the entire set of GO terms
            df, subdf = self.pe._get_plot_go_terms_data(category, ontologies=ontology, compute_levels=False)
            if df is not None and len(df):
                _temp_df[ontology] = df.copy()
                _plus[ontology] = sum(df.plus_minus == "+")
                _minus[ontology] = sum(df.plus_minus == "-")

                # now plotting but showing only some restricted GO terms
                fig = self.pe.plot_go_terms(
                    category,
                    ontologies=ontology,
                    compute_levels=self.enrichment_params["plot_compute_levels"],
                    log=self.enrichment_params["plot_logx"],
                    max_features=self.nmax,
                )
                html_scatter_plotly = offline.plot(fig, output_type="div", include_plotlyjs=False)

                _temp_df[ontology].to_csv(self.csv_directory / f"DEGs_enrichment_{category}_{ontology}.csv")
                fig.write_image(self.plot_directory / f"DEGs_enrichment_{category}_{ontology}.pdf")
                html += f"""
<h3>{category.title()} - {ontology}</h3>
<p>For {ontology}, we found {_plus[ontology]+_minus[ontology]} go terms.
Showing {self.nmax} here below (at most). The full list is downloadable from the CSV
 file hereafter.</p> {html_scatter_plotly} <br>"""
                html += self.get_html_table(_temp_df[ontology], f"GO_table_{category}_{ontology}")
            else:
                html += f"""
<h4>{category.title()} - {ontology}</h4><p>For {ontology} case, we found 0
enriched go terms. </p><br>"""

            filenames = []
            for ontology in self.ontologies:
                if "PROTEIN" in ontology or "PATHWAY" in ontology:
                    continue
                filename = self.plot_directory / f"Chart_{category}_{ontology}.png"
                if ontology in _temp_df and len(_temp_df[ontology]):
                    logger.info(f"Saving chart for case {ontology} ({category}) in {filename}")

                    self.pe.save_chart(_temp_df[ontology].iloc[0 : self.nmax], filename)
                    # Files path should be relative to the location of the enrichment.html file
                    # ie relative to self.output_dir (.parents[1] here)
                    filenames.append(filename.relative_to(filename.parents[1]))
        foto = self.add_fotorama(filenames, width=1000)
        html += f"<h4>Charts {category} -- </h4> {foto}"

        return html