Source code for sequana.modules_report.rnadiff

# coding: utf-8
#
#  This file is part of Sequana software
#
#  Copyright (c) 2020 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Module to write differential regulation analysis report"""
import os

import colorlog

from sequana.lazy import pandas as pd
from sequana.lazy import pylab
from sequana.modules_report.base_module import SequanaBaseModule
from sequana.rnadiff import RNADiffResults
from sequana.utils.datatables_js import DataTable

logger = colorlog.getLogger(__name__)


__all__ = ["RNAdiffModule"]


[docs] class RNAdiffModule(SequanaBaseModule): """Write HTML report of variant calling. This class takes a csv file generated by sequana_variant_filter. """ def __init__(self, folder, gff, output_filename="summary.html", **kwargs): """.. rubric:: constructor""" super().__init__() self.title = "RNAdiff" self.folder = folder self.independent_module = True self.module_command = "--module rnadiff" self.rnadiff = RNADiffResults(folder, gff=gff, **kwargs) # nice layout for the report. Use import here to not overload # import time import seaborn seaborn.set() # any user option here self.kwargs = kwargs self._count_section = 1 self.create_main_report_content() self.create_individual_reports() self.create_command_section() self.create_html(output_filename) # Fixme not sure this is required to be imported here import matplotlib matplotlib.rc_file_defaults()
[docs] def create_command_section(self): command = self.kwargs.get("command", "") self.sections.append({"name": f"{self._count_section} - Info", "anchor": "command", "content": command}) self._count_section += 1
[docs] def create_individual_reports(self): description = """<p>The differential analysis is based on DESeq2. This tool aim at fitting one linear model per feature. Given the replicates in condition one and the replicates in condition two, a p-value is computed to indicate whether the feature (gene) is differentially expressed. Then, all p-values are corrected for multiple testing.</p> <p>It may happen that one sample seems unrelated to the rest. For every feature and every model, Cook's distance is computed. It reflects how the sample matches the model. A large value indicates an outlier count and p-values are not computed for that feature.</p> """ self.sections.append( { "name": f"{self._count_section}. DGE results", "anchor": "filters_option", "content": description, } ) self._count_section += 1 counter = 1 for name, comp in self.rnadiff.comparisons.items(): self.add_individual_report(comp, name, counter)
[docs] def create_main_report_content(self): self.sections = list() self.summary() self.add_plot_count_per_sample() self.add_cluster() self.add_normalisation() self.add_dispersion() if len(self.rnadiff.comparisons) > 1 and len(self.rnadiff.comparisons) < 7: self.add_upset_plot()
[docs] def summary(self): """Add information of filter.""" Sdefault = self.rnadiff.summary() self.rnadiff.log2_fc = 1 S1 = self.rnadiff.summary() # set options options = { "scrollX": "true", "pageLength": 20, "scrollCollapse": "true", "dom": "", "buttons": [], } if len(S1) > 20: options["dom"] = "Bfrtip" def get_local_df(Sdata, lfc=0): N = len(Sdata) if lfc == 0: message = "Number of DGE (any FC)" else: message = f"Number of DGE log2(|FC|) > {lfc} " df = pd.DataFrame( { "comparison_link": [1] * N, "comparison": Sdata.index.values, "Description": [message] * N, "Down": Sdata["down"].values, "Up": Sdata["up"].values, "Total": Sdata["all"].values, } ) df = df[["comparison", "Description", "Down", "Up", "Total", "comparison_link"]] df["comparison_link"] = [f"#{name.replace('.', '')}_stats" for name in Sdata.index] return df dt = DataTable(get_local_df(Sdefault), "dge_default") dt.datatable.set_links_to_column("comparison_link", "comparison", new_page=False) dt.datatable.datatable_options = options js_all = dt.create_javascript_function() html = dt.create_datatable(float_format="%d") dt = DataTable(get_local_df(S1, lfc=1), "dge_lfc") dt.datatable.set_links_to_column("comparison_link", "comparison", new_page=False) dt.datatable.datatable_options = options js_all += dt.create_javascript_function() html += dt.create_datatable(float_format="%d") self.sections.append( { "name": "Summary", "anchor": "filters_option", "content": f"""<p>Here below is a summary of the Differential Gene Expression (DGE) analysis. You can find two entries per comparison. The first one has no filter except for an adjusted p-value of 0.05. The second shows the expressed genes with a filter of the log2 fold change of 1 (factor 2 in a normal scale). Clicking on any of the link will lead you to the section of the comparison. {js_all} {html} </p>""", } )
[docs] def add_dispersion(self): style = "width:65%" def dispersion(filename): with pylab.ioff(): pylab.clf() self.rnadiff.plot_dispersion() pylab.savefig(filename) pylab.close() html = """<p>dispersion of the fitted data to the model</p>{}<hr>""".format( self.create_embedded_png(dispersion, "filename", style=style) ) self.sections.append({"name": f"{self._count_section}. Dispersion", "anchor": "table", "content": html}) self._count_section += 1
[docs] def add_cluster(self): style = "width:45%" def dendogram(filename, count_mode="norm", transform="log"): with pylab.ioff(): pylab.clf() import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") self.rnadiff.plot_dendogram(count_mode=count_mode, transform_method=transform) try: # Here, tight layout is not comaptible with dendogram # Yet, if we call it and plot again, we get the expected layout. # otherwise, colormap is kind of shifted and not as large as the plot itself. pylab.tight_layout() except Exception: # This does not work right now in v0.16.0 but should be used in future pylab.gcf().set_layout_engine("tight") self.rnadiff.plot_dendogram(count_mode=count_mode, transform_method=transform) pylab.savefig(filename) pylab.close() # save image in case of dendogram(f"{self.folder}/images/dendogram.png") image1 = self.create_embedded_png(dendogram, "filename", style=style) if os.path.exists(f"{self.folder}/counts/counts_vst_batch.csv"): # save image in case of dendogram(f"{self.folder}/images/dendogram_vst_batch.png", count_mode="vst_batch", transform="none") # html report html_dendogram = f"""<p>The following image shows a hierarchical clustering of the whole sample set. An euclidean distance is computed between samples. The dendogram itself is built using the <a href="https://en.wikipedia.org/wiki/Ward%27s_method"> Ward method </a>. The normed data was log-transformed first and at max 5,000 most variable features were selected.</p>""" if os.path.exists(f"{self.folder}/counts/counts_vst_batch.csv"): image2 = self.create_embedded_png( dendogram, "filename", count_mode="vst_batch", transform="none", style=style ) html_dendogram += f"""<p>A batch effect was included. Here is the corrected image</p>{image1}{image2}<hr>""" else: html_dendogram += f"""{image1}<hr>""" # =========== PCA def pca(filename, transform_method="none", count_mode="vst"): pylab.ioff() pylab.clf() variance = self.rnadiff.plot_pca( 2, fontsize=self.kwargs.get("pca_fontsize", 10), transform_method=transform_method, count_mode=count_mode, ) pylab.savefig(filename) pylab.close() image1 = self.create_embedded_png(pca, "filename", style=style) html_pca = f"""<p>The experiment variability is also represented by a principal component analysis as shown here below. The two main components are represented. We expect the first principal component (PC1) to separate samples from the different biological conditions, meaning that the biological variability is the main source of variance in the data. The data used is the normalised count matrix transformed using a VST method variance stabilization); the first 500 most variable genes were selected. </p>""" from plotly import offline html_pca_plotly = "<p>Here is a PCA showing the first 3 components in 3D.</p>" import plotly.io as pio pio.renderers.default = "iframe" # Ensure it's set to browser pio.renderers["browser"].timeout = 60 # Increa if os.path.exists(f"{self.folder}/counts/counts_vst_batch.csv"): image2 = self.create_embedded_png(pca, "filename", count_mode="vst_batch", style=style) html_pca += f"""<p>Note that a batch effect was included. </p>{image1}{image2}<hr>""" fig = self.rnadiff.plot_pca(n_components=3, plotly=True, count_mode="vst_batch") html_pca_plotly += offline.plot(fig, output_type="div", include_plotlyjs=False) else: html_pca += f"""{image1}<hr>""" fig = self.rnadiff.plot_pca(n_components=3, plotly=True) html_pca_plotly += offline.plot(fig, output_type="div", include_plotlyjs=False) self.sections.append( { "name": f"{self._count_section}. Clusterisation", "anchor": "table", "content": html_dendogram + html_pca + html_pca_plotly, } ) self._count_section += 1
[docs] def add_plot_count_per_sample(self): style = "width:65%" def plotter(filename): pylab.ioff() pylab.clf() self.rnadiff.plot_count_per_sample(rotation=45) pylab.savefig(filename) pylab.close() html1 = """<p>The following image shows the total number of counted reads for each sample. We expect counts to be similar within conditions. They may be different across conditions. Note that variation may happen (e.g., different rRNA contamination levels, library concentrations, etc).<p>{}<hr>""".format( self.create_embedded_png(plotter, "filename", style=style) ) def null_counts(filename): pylab.ioff() pylab.clf() self.rnadiff.plot_percentage_null_read_counts() pylab.savefig(filename) pylab.close() html_null = """<p>The next image shows the percentage of features with no read count in each sample (taken individually). Features with null read counts in <b>all</b> samples are not taken into account in the analysis (black dashed line). Fold-change and p-values will be set to NA in the final results</p> {}<hr>""".format( self.create_embedded_png(null_counts, "filename", style=style) ) def count_density(filename): pylab.ioff() pylab.clf() self.rnadiff.plot_density() pylab.savefig(filename) pylab.close() html_density = """<p>In the following figure, we show the distribution of read counts for each sample (log10 scale). We expect replicates to behave in a similar fashion. The mode depends on the biological conditions and organism considered.</p> {}<hr>""".format( self.create_embedded_png(count_density, "filename", style=style) ) def best_count(filename): pylab.ioff() pylab.clf() self.rnadiff.plot_most_expressed_features() pylab.savefig(filename) pylab.close() html_feature = """<p>The following figure shows for each sample (left y-axis) the gene/feature that captures the highest proportion of the reads considered. This gene/feature is indicated in the right y-axis. This should not impact the DESeq2 normalization. We expect consistency across samples within a single condition</p> {}<hr>""".format( self.create_embedded_png(best_count, "filename", style=style) ) self.sections.append( { "name": f"{self._count_section}. Diagnostic plots", "anchor": "table_count", "content": html1 + html_null + html_density + html_feature, } ) self._count_section += 1
[docs] def add_normalisation(self): style = "width:45%" def rawcount(filename): pylab.ioff() pylab.clf() self.rnadiff.plot_boxplot_rawdata() ax = pylab.gca() xticklabels = ax.get_xticklabels() ax.set_xticklabels(xticklabels, rotation=45, ha="right") try: pylab.set_engine_layout("tight") except: pass pylab.savefig(filename) pylab.close() def normedcount(filename): pylab.ioff() pylab.clf() self.rnadiff.plot_boxplot_normeddata() ax = pylab.gca() xticklabels = ax.get_xticklabels() ax.set_xticklabels(xticklabels, rotation=45, ha="right") try: pylab.set_engine_layout("tight") except: pass pylab.savefig(filename) pylab.close() html_boxplot = """<p>A normalization of the data is performed to correct the systematic technical biases due to different counts across samples. The normalization is performed with DESeq2. It relies on the hypothess that most features are not differentially expressed. It computes a scaling factor for each sample. Normalized read counts are obtained by dividing raw read counts by the scaling factor associated with the sample they belong to. Boxplots are often used as a qualitative measure of the quality of the normalization process, as they show how distributions are globally affected during this process. We expect normalization to stabilize distributions across samples. The left figure shows the raw counts while the right figure shows the normalised counts. </p>""" img1 = self.create_embedded_png(rawcount, "filename", style=style) img2 = self.create_embedded_png(normedcount, "filename", style=style) html_size_factor = "<p>The size factor is a crucial component used to normalize the raw counts of RNA-seqr data to account for differences in sequencing depth and RNA composition across samples. Each sample is assigned a size factor, which represents the relative amount of sequencing depth (or library size) needed to compare expression levels between samples fairly. This factor adjusts the raw counts by scaling them, so that differences in gene expression are due to biological variation rather than technical artifacts. The size factor is computed by taking the median ratio of each gene's count in a sample to its geometric mean across all samples. Significant departures from 1.0 suggest notable differences in sequencing depth across samples. However, extreme values (either much larger or much smaller than 1.0) might indicate potential issues such as technical variability or inconsistencies in sample preparation. </p>" options = { "scrollX": "false", "pageLength": 1, "scrollCollapse": "true", "dom": "", "buttons": [], } df = pd.read_csv(f"{self.folder}/code/size_factors.csv", index_col=0) datatable = DataTable(df.T, "table_size_factor") datatable.datatable.datatable_options = options js_all = datatable.create_javascript_function() table1 = datatable.create_datatable(float_format="%.3e") self.sections.append( { "name": f"{self._count_section}. Normalisation", "anchor": "table", "content": html_boxplot + img1 + img2 + html_size_factor + js_all + table1 + "</hr>", } ) self._count_section += 1
[docs] def add_upset_plot(self): style = "width:65%" def upsetplot(filename): pylab.ioff() pylab.clf() self.rnadiff.plot_upset() pylab.savefig(filename) pylab.close() html_upsetplot = """<p> Upset plots are an alternative to venn diagrams, easing the visualisation of DEG lists overlap between comparisons.""" img = self.create_embedded_png(upsetplot, "filename", style=style) self.sections.append( { "name": f"{self._count_section}. Upset plot", "anchor": "table", "content": html_upsetplot + img + "</hr>", } ) self._count_section += 1
[docs] def add_individual_report(self, comp, name, counter): style = "width:45%" description = """<p> When the dispersion estimation and model fitting is done, statistical testing is performed. The distribution of raw p-values computed by the statistical test is expected to be a mixture of a uniform distribution on [0, 1] and a peak around 0 corresponding to the differentially expressed features. This may not always be the case. </p>""" def plot_pvalue_hist(filename): pylab.ioff() pylab.clf() comp.plot_pvalue_hist() pylab.savefig(filename) pylab.close() def plot_padj_hist(filename): pylab.ioff() pylab.clf() comp.plot_padj_hist() pylab.savefig(filename) pylab.close() img1 = self.create_embedded_png(plot_pvalue_hist, "filename", style=style) img2 = self.create_embedded_png(plot_padj_hist, "filename", style=style) # FIXME. pvalues adjusted are not relevant so commented for now img2 = "" self.sections.append( { "name": f"{self._count_section}.{counter}.a pvalue distribution ({name})", "anchor": f"dge_summary", "content": description + img1 + img2, } ) def plot_volcano(filename): pylab.ioff() pylab.clf() comp.plot_volcano() pylab.savefig(filename) pylab.close() html_volcano = """<p>The volcano plot here below shows the differentially expressed features with a adjusted p-value below 0.05 (dashed back line). The volcano plot represents the log10 of the adjusted P value as a function of the log2 ratio of differential expression. </p>""" # img3 = self.create_embedded_png(plot_volcano, "filename", style=style) img3 = "" fig = comp.plot_volcano( plotly=True, annotations=self.rnadiff.annotation, hover_name=self.kwargs.get("hover_name", None), ) from plotly import offline plotly = offline.plot(fig, output_type="div", include_plotlyjs=False) self.sections.append( { "name": f"{self._count_section}.{counter}.b volcano plots ({name})", "anchor": f"{name}_volcano", "content": html_volcano + img3 + "<hr>" + plotly, } ) # finally, let us add the tables df = comp.df.copy() # .reset_index() # here we need to add the annotation if possible try: df = pd.concat([df, self.rnadiff.annotation.loc[[str(x) for x in comp.df.index]]], axis=1) except Exception as err: logger.critical(f"Could not add annotation. {err}. annotation skipped") df = df.reset_index() fold_change = 2 ** df["log2FoldChange"] log10padj = -pylab.log10(df["padj"]) df.insert(df.columns.get_loc("log2FoldChange") + 1, "FoldChange", fold_change) df.insert(df.columns.get_loc("padj") + 1, "log10_padj", log10padj) try: del df["dispGeneEst"] except: pass for x in ["lfcSE", "stat", "dispersion"]: try: del df[x] except: pass # set options options = { "scrollX": "true", "pageLength": 10, "scrollCollapse": "true", "dom": "Bfrtip", "buttons": ["copy", "csv"], } idname = name.replace(".", "") datatable = DataTable(df, f"{idname}_table_all") datatable.datatable.datatable_options = options js_all = datatable.create_javascript_function() html_tab_all = datatable.create_datatable(float_format="%.3e") df_sign = df.query(f"padj<=0.05 and ({comp.l2fc_name}>1 or {comp.l2fc_name}<-1)") datatable = DataTable(df_sign, f"{idname}_table_sign") datatable.datatable.datatable_options = options js_sign = datatable.create_javascript_function() html_tab_sign = datatable.create_datatable(float_format="%.3e") content = f"""<p>The following tables give all DGE results. The first table contains all significant genes (adjusted yyp-value below 0.05 and absolute fold change of at least 0.5). The following tables contains all results without any filtering. Here is a short explanation for each column: <ul> <li> baseMean: read count mean over all samples</li> <li> FoldChange: fold change in natural base</li> <li> log2FoldChange: log2 Fold Change estimated by the model. Reflects change between the condition versus the reference condition</li> <li> stat: Wald statistic for the coefficient (contrast) tested</li> <li> pvalue: raw p-value from statistical test</li> <li> padj: adjusted pvalue. Used for cutoff at 0.05 </li> <li> betaConv: convergence of the coefficients of the model </li> <li> maxCooks: maximum Cook's distance of the feature </li> <li> outlier: indicates if the feature is an outlier according to Cook's distance </li> </ul> </p> <h3>Significative only (and |l2fC| > 1) <a id="{idname}_table_sign"></a></h3> Here below is a subset of the entire table. It contains all genes below adjusted p-value of 0.05 and absolute log2 fold change above 1. {js_sign} {html_tab_sign}""" if not self.kwargs.get("split_full_table", False): content += f"""<h3>All genes<a id="{idname}_table_all"></a></h3>{js_all} {html_tab_all}""" else: # create an independent pages (lighter to have N pages rather than everything # on the same page. Could be useful for eukaryotes. link = f"{self.folder}/{name}.html" content += f"""<h3>All genes<a id="{idname}_table_all"></a></h3>""" content += f'The full list is available <a target="_blank" href="{name}.html">here</a>' r = SequanaBaseModule() r.title = "List of DGEs" r.folder = self.folder r.independent_module = True r.sections = list() r.sections.append( { "name": f"{name} comparison", "anchor": "table", "content": f"""<h3>All genes<a id="{idname}_table_all"></a></h3>{js_all} {html_tab_all}""", } ) r.create_html(f"{name}.html") self.sections.append( { "name": f"{self._count_section}.{counter}.c {name} Tables ({name})", "anchor": f"{name}_stats", "content": content, } )