Source code for sequana.modules_report.uniprot_enrichment

#
#  This file is part of Sequana software
#
#  Copyright (c) 2020 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Module to write enrichment report"""
import os
import sys
from pathlib import Path

import colorlog

from sequana.lazy import pandas as pd
from sequana.lazy import pylab
from sequana.modules_report.base_module import SequanaBaseModule
from sequana.utils import config
from sequana.utils.datatables_js import DataTable

logger = colorlog.getLogger(__name__)



[docs]
class ModuleUniprotEnrichment(SequanaBaseModule):
    """Write HTML report of variant calling. This class takes a csv file
    generated by sequana_variant_filter.
    """

    def __init__(
        self,
        gene_lists,
        summary,
        enrichment_params={
            "padj": 0.05,
            "log2_fc": 3,
            "max_entries": 3000,
            "nmax": 50,
            "plot_logx": True,
        },
        command="",
        ontologies=["MF", "BP", "CC"],
    ):
        """.. rubric:: constructor"""
        super().__init__()
        self.title = "UniProt Enrichment (GO terms)"
        self.taxon_id = summary["taxon"]
        self.taxon_name = summary["taxon_name"]
        self.compa = summary["name"]

        self.command = command
        self.gene_lists = gene_lists
        self.enrichment_params = enrichment_params
        self.nmax = enrichment_params.get("nmax", 50)

        # compute the enrichment here once for all, This may take time
        from sequana import logger

        logger.setLevel("INFO")

        logger.info(" === Module UniProtEnrichment. === ")
        self.ontologies = ontologies

        self.create_report_content()
        print(f"{config.output_dir}/{self.compa}_enrichment.html")
        # config.output_dir used internally
        self.create_html(f"{self.compa}_enrichment.html")


[docs]
    def create_report_content(self):
        self.sections = list()
        self.summary()
        self.add_go()
        self.sections.append({"name": "5 - Info", "anchor": "command", "content": self.command})



[docs]
    def summary(self):
        """Add information."""

        total_up = len(self.gene_lists["up"])
        total_down = len(self.gene_lists["down"])
        total = total_up + total_down
        log2fc = self.enrichment_params["log2_fc"]

        # create html table for taxon information
        _taxon_id = self.taxon_id
        _taxon_name = self.taxon_name

        js = ""
        html_table = ""
        self.sections.append(
            {
                "name": "1 - Summary",
                "anchor": "filters_option",
                "content": f"""

<p>In the following sections, you will find the GO
terms enrichment. The input data for those analysis is the output of the RNADiff
analysis where adjusted p-values above 0.05 are excluded. Moreover, we removed
candidates with log2 fold change below {log2fc}. Using these filters, the list of
differentially expressed genes is made of {total_up} up and {total_down} down genes (total {total})</p>
<p> In the following plots you can find the first GO terms that are enriched, keeping a
maximum of {self.nmax} identifiers. </p>

<p>The taxon used is {_taxon_name} (ID {_taxon_id}).<br>

""",
            }
        )



[docs]
    def add_go(self):
        # somehow, logger used here and in add_kegg must be global. If you call
        # add_go and then add_kegg, logger becomes an unbound local variable.
        # https://stackoverflow.com/questions/10851906/python-3-unboundlocalerror-local-variable-referenced-before-assignment
        # global logger
        level = logger.level
        logger.setLevel(level)

        html_intro = """
<p>Here below is a set of plots showing the enriched GO terms using the down
regulated genes only, and then the up-regulated genes only. When possible a
graph of the found GO terms is provided. MF stands for molecular
function, CC for cellular components and BP for biological process.</p>
        </div>
        """

        html = self._get_enrichment("down")
        self.sections.append(
            {
                "name": "2 - Enriched GO terms (Down cases)",
                "anchor": "go_down",
                "content": html,
            }
        )

        html = self._get_enrichment("up")
        self.sections.append(
            {
                "name": "3 - Enriched GO terms (Up cases)",
                "anchor": "go_up",
                "content": html,
            }
        )

        html = self._get_enrichment("all")
        self.sections.append(
            {
                "name": "4 - Enriched GO terms (All cases)",
                "anchor": "go_all",
                "content": html,
            }
        )


    # a utility function to create the proper html table

[docs]
    def get_html_table(self, this_df, identifier):
        df = this_df.copy()

        links = []
        for x in df["Term"]:
            links.append(f"https://www.ebi.ac.uk/QuickGO/term/{x}")
        df["links"] = links

        # remove non-informative or redundant fields
        df = df.drop(
            ["Gene_set", "Unnamed: 0", "id", "abs_log2_fold_enrichment"],
            errors="ignore",
            axis=1,
        )

        # we move the "Genes" to the end
        df = df[[x for x in df.columns if x != "Genes"] + ["Genes"]]

        first_col = df.pop("Term")
        df.insert(0, "Term", first_col)
        df = df.sort_values(by="fold_enrichment", ascending=False)

        datatable = DataTable(pd.DataFrame(df), identifier)
        datatable.datatable.set_links_to_column("links", "Term")
        datatable.datatable.datatable_options = {
            "scrollX": "true",
            "pageLength": 10,
            "scrollCollapse": "true",
            "dom": "Bfrtip",
            "buttons": ["copy", "csv"],
        }
        js = datatable.create_javascript_function()
        html_table = datatable.create_datatable(float_format="%E")
        return js + html_table


    def _get_enrichment(self, category):
        # category is in down/up/all

        style = "width:95%"

        html = ""

        filenames = []

        for ontology in self.ontologies:
            filename = f"{config.output_dir}/plot_{category}_{ontology}.png"

            if os.path.exists(filename):
                df = pd.read_csv(f"{config.output_dir}/{category}_{ontology}.csv")
                image = self.png_to_embedded_png(
                    filename,
                    style=style,
                )

                n_go_terms = len(df)
                html += f"""
<h3>{category.title()} - {ontology}</h3>
<p>For {ontology}, we found {n_go_terms} go terms.
Showing {self.nmax} here below (at most). The full list is downlodable from the CSV
 file hereafter.</p> {image} <br>"""
                html += self.get_html_table(df, f"GO_table_{category}_{ontology}")
                filenames.append(Path(f"chart_{category}_{ontology}.png"))
            else:
                html += f"""
<h4>{category.title()} - {ontology}</h4><p>For {ontology} case, we found 0
enriched go terms. </p><br>"""

        foto = self.add_fotorama(filenames, width=1000)
        html += f"<h4>Charts {category} -- </h4> {foto}"

        return html