Source code for sequana.modules_report.kegg_enrichment

#
#  This file is part of Sequana software
#
#  Copyright (c) 2020 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Module to write KEGG enrichment report"""
import os
import sys
from pathlib import Path

import colorlog
from plotly import offline
from tqdm import tqdm

from sequana.enrichment.kegg import KEGGPathwayEnrichment
from sequana.modules_report.base_module import SequanaBaseModule
from sequana.utils.datatables_js import DataTable

logger = colorlog.getLogger(__name__)

from sequana.utils import config


[docs] class ModuleKEGGEnrichment(SequanaBaseModule): """Write HTML report of variant calling. This class takes a csv file generated by sequana_variant_filter. """ def __init__( self, gene_lists, kegg_name, dataframe, enrichment_params={ "padj": 0.05, "log2_fc": 3, "nmax": 15, "max_entries": 3000, "kegg_background": None, "mapper": None, "preload_directory": None, "color_node_with_annotation": "Name", "plot_logx": True, }, command="", used_genes=None, ): """.. rubric:: constructor""" super().__init__() self.title = "Enrichment" self.command = command self.gene_lists = gene_lists self.enrichment_params = enrichment_params self.data = dataframe self.organism = kegg_name self.csv_directory = Path(config.output_dir) / "tables" self.plot_directory = Path(config.output_dir) / "plots" self.csv_directory.mkdir(exist_ok=True) self.plot_directory.mkdir(exist_ok=True) if self.enrichment_params["preload_directory"]: pathname = self.enrichment_params["preload_directory"] if not os.path.exists(pathname): logger.error(f"{pathname} does not exist") sys.exit(1) self.nmax = enrichment_params.get("nmax", 15) self.ke = KEGGPathwayEnrichment( self.gene_lists, self.organism, mapper=self.enrichment_params["mapper"], background=self.enrichment_params["kegg_background"], preload_directory=self.enrichment_params["preload_directory"], color_node_with_annotation=self.enrichment_params["color_node_with_annotation"], used_genes=used_genes, ) self.create_report_content() self.create_html("enrichment.html")
[docs] def create_report_content(self): self.sections = list() self.summary() self.add_kegg() self.sections.append({"name": "3 - Info", "anchor": "command", "content": self.command})
[docs] def summary(self): """Add information of filter.""" total_up = len(self.gene_lists["up"]) total_down = len(self.gene_lists["down"]) total = total_up + total_down log2fc = self.enrichment_params["log2_fc"] # table of undertermined IDs datatable = DataTable(self.ke.overlap_stats, "unmapped") datatable.datatable.datatable_options = { "scrollX": "true", "pageLength": 10, "scrollCollapse": "true", "dom": "Bfrtip", "buttons": ["copy", "csv"], } js = datatable.create_javascript_function() html_table = datatable.create_datatable(float_format="%E") # information about background bkg = self.ke.background self.sections.append( { "name": "1 - Summary", "anchor": "filters_option", "content": f""" <p>In the following sections, you will find the KEGG Pathway enrichment. The input data for those analyis is the output of the RNADiff analysis where adjusted p-values above 0.05 are excluded. Moreover, we removed candidates with log2 fold change below {log2fc}. Using these filters, the list of differentially expressed genes is made of {total_up} up and {total_down} down genes (total {total})</p> <p> In the following plots you can find the first KEGG Pathways that are enriched, keeping a maximum of {self.nmax} pathways. </p> <p>The KEGG name used is <b>{self.organism}</b>. <p>This table gives you (column N) the number of genes used when performing the enrichment on up-regulated genes (up), down-regulated genes, and when we take all up and down regulated genes (all). We also provide the total number of genes that were analysed in the differential analysis (all genes). This number may be lower that the number of genes found in the annotation if a filter was applied. Finally, using those genes, we provide the percentage of those genes that were found in the KEGG database. the Enrichment analysis relies on a background. This background was defined as {self.ke.background} </p> {js}{html_table} <br> """, } )
[docs] def add_kegg(self): logger.info("Enrichment module: kegg term") style = "width:45%" html = "" try: html_barplot_up_and_down = self.plot_barplot_up_and_down() html += f"<p>Here are the enrichment kegg pathways and the number of up and down genes found in each of them.</p>{html_barplot_up_and_down}" except Exception as err: print(err) pass for category in ["down", "up", "all"]: df = self.ke.dfs[category] if len(df) == 0 or "significative" not in df: n_enriched = 0 else: n_enriched = len(df.query("significative == True")) if n_enriched: html_barplot = self.plot_barplot(category=category) html_scatter = self.plot_scatter(category=category) js_table, html_table, fotorama = self.get_table(category) df.query("significative == True").to_csv(self.csv_directory / f"DEGs_enrichment_{category}.csv") else: html_barplot = html_barplot_up_and_down = html_scatter = js_table = html_table = fotorama = "" html += f""" <h3>2.1 - KEGG pathways enriched in {category} regulated genes</h3> <p>{n_enriched} KEGG pathways are found enriched using the {category} regulated genes</p> <br> {html_barplot} {html_scatter} <hr> {js_table} {html_table} <hr> <p>Here below are the pathways with gene colored according to their fold change. Blue colors are for down-regulated genes and Orange are for up-regulated genes. (Note that absolute log2 fold change above 4 are clipped to 4; So a gene with a log2 fold change of 4 or 40 will have the same darkest color.). These colors, when present, are superseding the original Green (core components) and White (associated, not necessarily essential components) colors. In case a FoldChange threshold has been applied for the enrichment analysis, genes not passing the threshold will not be colored.</p> {fotorama} """ self.sections.append({"name": "2 - KEGG", "anchor": "kegg", "content": html})
[docs] def plot_barplot(self, category=None): fig = self.ke.barplot(category, nmax=self.nmax) html_barplot_plotly = offline.plot(fig, output_type="div", include_plotlyjs=False) return html_barplot_plotly
[docs] def plot_scatter(self, category=None): fig = self.ke.scatterplot(category, nmax=self.nmax) html_scatter_plotly = offline.plot(fig, output_type="div", include_plotlyjs=False) return html_scatter_plotly
[docs] def plot_barplot_up_and_down(self, category=None): fig = self.ke.barplot_up_and_down(nmax=self.nmax) html_plotly = offline.plot(fig, output_type="div", include_plotlyjs=False) return html_plotly
[docs] def get_table(self, category): # Results down (pathway info) # html_before_table = """<p>Enrichment pathways summary</p>""" if not self.ke.dfs[category].empty: df = self.ke.dfs[category].query("significative == True").copy() if not df.empty: links = ["https://www.genome.jp/dbget-bin/www_bget?path:{}".format(x) for x in df["pathway_id"]] df["links"] = links df = df[ [ "pathway_id", "name", "size", "Overlap", "Odds Ratio", "Combined Score", "P-value", "Adjusted P-value", "Genes", "links", ] ] # save pathways and add fotorama logger.setLevel("WARNING") files = [] logger.info(f"Saving {len(df)} pathways") for ID in tqdm(df["pathway_id"], desc=f"Processing category {category}"): plot_file = self.plot_directory / f"{ID}.png" self.ke.save_pathway(ID, self.data, filename=plot_file) # Files path should be relative to the location of the enrichment.html file # ie relative to self.output_dir (.parents[1] here) files.append(plot_file.relative_to(plot_file.parents[1])) fotorama = self.add_fotorama(files, width=800) datatable = DataTable(df, f"kegg_{category}") datatable.datatable.set_links_to_column("links", "pathway_id") datatable.datatable.datatable_options = { "scrollX": "true", "pageLength": 20, "scrollCollapse": "true", "dom": "Bfrtip", "buttons": ["copy", "csv"], } js_table = datatable.create_javascript_function() html_table = datatable.create_datatable(float_format="%E") return (js_table, html_table, fotorama)