Source code for sequana.modules_report.coverage

# coding: utf-8
#
#  This file is part of Sequana software
#
#  Copyright (c) 2016 - Sequana Development Team
#
#  File author(s):
#      Dimitri Desvillechabrol <dimitri.desvillechabrol@pasteur.fr>,
#          <d.desvillechabrol@gmail.com>
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Module to write coverage report"""
import os

import colorlog
import pandas as pd

from sequana import bedtools
from sequana.modules_report.base_module import SequanaBaseModule
from sequana.modules_report.summary import SummaryModule
from sequana.plots.canvasjs_linegraph import CanvasJSLineGraph
from sequana.utils import config
from sequana.utils.datatables_js import DataTable, DataTableFunction

logger = colorlog.getLogger(__name__)


__all__ = ["CoverageModule", "ChromosomeCoverageModule"]


[docs]class CoverageModule(SequanaBaseModule): """Write HTML report of coverage analysis. This class takes either a :class:`SequanaCoverage` instances or a csv file where analysis are stored. """ def __init__(self, data, region_window=200000): """.. rubric:: constructor :param data: it can be a csv filename created by sequana_coverage or a :class:`bedtools.SequanaCoverage` object. :param region_window: """ super().__init__() self.region_window = region_window if isinstance(data, bedtools.SequanaCoverage): self.bed = data else: raise TypeError if len(self.bed._html_list) == 0: try: html_list = self.create_chromosome_reports() except TypeError: msg = ( "Data must be either a csv file or a :class:`SequanaCoverage` " "instance where zscore is computed." ) raise TypeError(msg) else: html_list = list(self.bed._html_list) self.title = f"Main coverage report ({config.sample_name})" self.intro = ( f"<p>Report the coverage of your sample ({config.sample_name}) to check the " "quality of your mapping and to highlight regions of " "interest (under and over covered).</p>" ) # set the chromosome tables self.sections = list() self.create_chromosome_table(html_list) # and create final HTML self.create_html("sequana_coverage.html")
[docs] def create_chromosome_table(self, html_list): """Create table with links to chromosome reports""" df = pd.DataFrame( [ [ chrom, self.bed._basic_stats[chrom]["length"], self.bed._basic_stats[chrom]["DOC"], self.bed._basic_stats[chrom]["CV"], f"{chrom}/{chrom}.cov.html", ] for chrom in self.bed.chrom_names ], columns=["chromosome", "size", "mean_coverage", "coef_variation", "link"], ) datatable = DataTable(df, "chrom") datatable.datatable.datatable_options = { "pageLength": 15, "dom": "Bfrtip", "buttons": ["copy", "csv"], } datatable.datatable.set_links_to_column("link", "chromosome") js = datatable.create_javascript_function() html_table = datatable.create_datatable(float_format="%.3g") self.sections.append( { "name": "Chromosomes", "anchor": "chromosomes", "content": "<p>Link to coverage analysis report for each chromosome. " "Size, mean coverage and coefficient of variation are reported" " in the table below.</p>\n{0}\n{1}".format(js, html_table), } )
[docs] def create_chromosome_reports(self): """Create HTML report for each chromosome present in data.""" # We choose the first chromosome, to build a common javascript object datatable_js = CoverageModule.init_roi_datatable(self.bed[0]) chrom_output_dir = config.output_dir if not os.path.exists(chrom_output_dir): os.makedirs(chrom_output_dir) page_list = [] for chrom in self.bed: logger.info(f"Creating coverage report {chrom.chrom_name}") chrom_report = ChromosomeCoverageModule(chrom, datatable_js, region_window=self.region_window) page_list.append(chrom_report.html_page) return page_list
# a static method because we need it in the coverage standalone # to initiate the datatables
[docs] def init_roi_datatable(rois): """Initiate :class:`DataTableFunction` to create table to link each row with sub HTML report. All table will have the same appearance. We can therefore initialise the roi once for all. :param rois: can be a ROIs from ChromosomeCov instance or a simple dataframe """ # computed try: df = rois.df.copy() except: df = rois.copy() df["link"] = "" # set datatable options datatable_js = DataTableFunction(df, "roi") if "start" in df.columns: datatable_js.set_links_to_column("link", "start") if "end" in df.columns: datatable_js.set_links_to_column("link", "end") datatable_js.datatable_options = { "scrollX": "true", "pageLength": 15, "scrollCollapse": "true", "dom": "Bfrtip", "buttons": ["copy", "csv"], } return datatable_js
[docs]class ChromosomeCoverageModule(SequanaBaseModule): """Write HTML report of coverage analysis for each chromosome. It is created by CoverageModule. """ def __init__(self, chromosome, datatable, region_window=200000, options=None, command="", skip_html=False): """ :param chromosome: :param datatable: :param directory: :param int region_window: length of the sub coverage plot :param options: should contain "W", "k", "circular" """ super().__init__() self.region_window = region_window directory = chromosome.chrom_name # to define where are css and js if directory in {None, "."}: self.path = "" directory = "." else: self.path = "../" self.chromosome = chromosome self.datatable = datatable self.command = command self.command += "\nSequana version: {}".format(config.version) self.title = "Coverage analysis of chromosome {0}".format(self.chromosome.chrom_name) self.intro = "<p>The genome coverage analysis of the chromosome " "<b>{0}</b>.</p>".format( self.chromosome.chrom_name ) self.create_report_content(directory, options=options) if skip_html: return self.html_page = "{0}{1}{2}.cov.html".format(directory, os.sep, self.chromosome.chrom_name) self.create_html(self.html_page) # inform the main coverage instance that HTML is ready self.chromosome.bed._html_list = self.chromosome.bed._html_list.union([self.html_page])
[docs] def create_report_content(self, directory, options=None): """Generate the sections list to fill the HTML report.""" self.sections = list() if self.chromosome._mode == "memory": # nothing to do, all computations should be already available # and in memory rois = self.chromosome.rois elif self.chromosome._mode == "chunks": # we need to reset the data to the first chunk # and compute the median and zscore. So, first, we save the entire # data set # self.chromosome.reset() # self.chromosome.running_median(options['W'], circular=options['circular']) # self.chromosome.compute_zscore(options['k']) # We mus set the ROI manually rois = options["ROIs"] if self.chromosome.DOC > 0: self.coverage_plot() if self.chromosome._mode == "memory": links = self.subcoverage(rois, directory) else: links = None else: links = None self.basic_stats() if self.chromosome.DOC: self.regions_of_interest(rois, links) self.coverage_barplot() if "gc" in self.chromosome.df.columns: self.gc_vs_coverage() self.normalized_coverage() self.zscore_distribution() self.add_command()
[docs] def add_command(self): self.sections.append( { "name": "Command", "anchor": "command", "content": ("<p>Command used: <pre>{}</pre>.</p>".format(self.command)), } )
def _add_large_data_section(self, name, anchor): self.sections.append( { "name": name, "anchor": anchor, "content": ("<p>Large data sets (--chunk-size argument " "used), skipped plotting.</p>"), } )
[docs] def coverage_plot(self): """Coverage section.""" if self.chromosome._mode == "chunks": self._add_large_data_section("Coverage", "coverage") return image = self.create_embedded_png(self.chromosome.plot_coverage, input_arg="filename") self.sections.append( { "name": "Coverage", "anchor": "coverage", "content": "<p>The following figures shows the per-base coverage along the" " reference genome (black line). The blue line indicates the " "running median. From the normalised coverage, we estimate " "z-scores on a per-base level. The red lines indicates the " "z-scores at plus or minus N standard deviations, where N is " "chosen by the user. (default:4). Only a million point are " "shown. This may explain some visual discrepancies with. </p>\n{0}".format(image), } )
[docs] def coverage_barplot(self): """Coverage barplots section.""" if self.chromosome._mode == "chunks": self._add_large_data_section("Coverage histogram", "cov_barplot") return image1 = self.create_embedded_png(self.chromosome.plot_hist_coverage, input_arg="filename", style="width:45%") image2 = self.create_embedded_png( self.chromosome.plot_hist_coverage, input_arg="filename", style="width:45%", logx=False, ) self.sections.append( { "name": "Coverage histogram", "anchor": "cov_barplot", "content": "<p>The following figures contain the histogram of the genome " "coverage. The X and Y axis being in log scale in the left panel" "while only the Y axis is in log scale in the right panel.</p>\n" "{0}\n{1}".format(image1, image2), } )
[docs] def subcoverage(self, rois, directory): """Create subcoverage reports to have access to a zoomable line plot. :params rois: :param directory: directory name for the chromsome This method create sub reports for each region of 200,000 bases (can be changed). Usually, it starts at position 0 so reports will be stored in e.g. for a genome of 2,300,000 bases:: chromosome_name/chromosome_name_0_200000.html chromosome_name/chromosome_name_200000_400000.html ... ... chromosome_name/chromosome_name_2000000_2200000.html chromosome_name/chromosome_name_2200000_2300000.html Note that if the BED file positions does not start at zero, then names will take care of that. """ # an aliases W = self.region_window name = self.chromosome.chrom_name chrom = self.chromosome N = len(self.chromosome) # position does not always start at position zero shift = self.chromosome.df.pos.iloc[0] maxpos = self.chromosome.df.pos.iloc[-1] # create directory chrom_output_dir = os.sep.join([config.output_dir, str(directory), "subplots"]) if not os.path.exists(chrom_output_dir): os.makedirs(chrom_output_dir) # create the combobox to link toward different sub coverage # Here, we should (1) get the length of the data and (2) # figure out the boundary. Indeed, you can imagine a BED file # that does not start at position zero, but from POS1>0 to POS2 links = ["subplots/{0}_{1}_{2}.html".format(name, i, min(i + W, maxpos)) for i in range(shift, shift + N, W)] intra_links = ("{0}_{1}_{2}.html".format(name, i, min(i + W, maxpos)) for i in range(shift, shift + N, W)) combobox = self.create_combobox(links, "sub", True) combobox_intra = self.create_combobox(intra_links, "sub", False) datatable = self._init_datatable_function(rois) # break the chromosome as pieces of 200,000 bp for i in range(shift, shift + N, W): SubCoverageModule(chrom, rois, combobox_intra, datatable, i, min(i + W, maxpos), directory) self.sections.append({"name": "Subcoverage", "anchor": "subcoverage", "content": combobox}) return links
def _init_datatable_function(self, rois): """Initiate :class:`DataTableFunction` to create table to link each row with sub HTML report. All table will have the same appearance. So, let's initiate its only once. """ datatable_js = DataTableFunction(rois.df, "roi") datatable_js.datatable_options = { "scrollX": "true", "pageLength": 15, "scrollCollapse": "true", "dom": "Bfrtip", "buttons": ["copy", "csv"], } return datatable_js
[docs] def basic_stats(self): """Basics statistics section.""" li = "<li><b>{0}</b> ({1}): {2:.2f}</li>" stats = self.chromosome.get_stats() description = { "BOC": "breadth of coverage: the proportion (in %s) " + " of the genome covered by at least one read.", "CV": "the coefficient of variation.", "DOC": "the sequencing depth (Depth of Coverage), that is the " + "average the genome coverage.", "MAD": "median of the absolute median deviation defined as median(|X-median(X)|).", "Median": "Median of the coverage.", "STD": "standard deviation.", "GC": "GC content (%)", } data = [ li.format(key, description[key], stats[key]) for key in ["BOC", "CV", "DOC", "MAD", "Median", "STD", "GC"] if key in stats.keys() ] stats = "<ul>{0}</ul>".format("\n".join(data)) self.sections.append( { "name": "Basic stats", "anchor": "basic_stats", "content": "<p>Here are some basic statistics about the " "genome coverage.</p>\n{0}".format(stats), } )
[docs] def regions_of_interest(self, rois, links): """Region of interest section.""" def connect_link(x): for link in links: _, x1, x2 = link.rsplit(os.sep)[1].rstrip(".html").rsplit("_", 2) x1 = int(x1) x2 = int(x2) if x >= x1 and x <= x2: return link # for the case where the data is fully stored in memory, we must # find all events ! if self.chromosome._mode == "memory" and self.chromosome.binning == 1: raise Exception("{} position not in the range of reports".format(x)) if links: links_list = [connect_link(n) for n in rois.df["start"]] else: links_list = [None for n in rois.df["start"]] rois.df["link"] = links_list # create datatable low_roi = rois.get_low_rois() high_roi = rois.get_high_rois() datatable = CoverageModule.init_roi_datatable(low_roi) datatable.set_links_to_column("link", "chr") js = datatable.create_javascript_function() lroi = DataTable(low_roi, "lroi", datatable) hroi = DataTable(high_roi, "hroi", datatable) html_low_roi = lroi.create_datatable(float_format="%.3g") html_high_roi = hroi.create_datatable(float_format="%.3g") rois.df.drop("link", axis=1, inplace=True) roi_paragraph = ( "<p>Regions with a z-score {0}er than {1:.2f} and at " "least one base with a z-score {0}er than {2:.2f} are detected." "There are {3} {0} regions of interest." "</p>" ) low_paragraph = roi_paragraph.format( "low", self.chromosome.thresholds.low2, self.chromosome.thresholds.low, len(low_roi), ) high_paragraph = roi_paragraph.format( "high", self.chromosome.thresholds.high2, self.chromosome.thresholds.high, len(high_roi), ) self.sections.append( { "name": "Regions Of Interest (ROI)", "anchor": "roi", "content": "{4}\n" "<p>The following tables give regions of " "interest detected by sequana. Here are the definitions of the " "columns:</p>\n" "<ul><li>mean_cov: the average of coverage</li>\n" "<li>mean_rm: the average of running median</li>\n" "<li>mean_zscore: the average of zscore</li>\n" "<li>max_zscore: the higher zscore contains in the region</li>" "</ul>\n" "<h3>Low coverage region</h3>\n{0}\n{1}\n" "<h3>High coverage region</h3>\n{2}\n{3}\n".format( low_paragraph, html_low_roi, high_paragraph, html_high_roi, js ), } )
[docs] def gc_vs_coverage(self): """3 dimensional plot of GC content versus coverage.""" image = self.create_embedded_png(self.chromosome.plot_gc_vs_coverage, input_arg="filename") corr = self.chromosome.get_gc_correlation() self.sections.append( { "name": "Coverage vs GC content", "anchor": "cov_vs_gc", "content": "<p>The correlation coefficient between the coverage and GC " "content is <b>{0:.3g}</b> with a window size of {1}bp.</p>\n" "{2}\n" "<p>Note: the correlation coefficient has to be between -1.0 " "and 1.0. A coefficient of 0 means no correlation, while a " "coefficient of -1 or 1 means an existing " "correlation between GC and Coverage</p>".format(corr, self.chromosome.bed.gc_window_size, image), } )
[docs] def normalized_coverage(self): """Barplot of normalized coverage section.""" if self.chromosome._mode == "chunks": self._add_large_data_section("Normalised coverage", "normalised_coverage") return image = self.create_embedded_png(self.chromosome.plot_hist_normalized_coverage, input_arg="filename") self.sections.append( { "name": "Normalised coverage", "anchor": "normalised_coverage", "content": "<p>Distribution of the normalised coverage with predicted " "Gaussian. The red line should be followed the trend of the " "barplot.</p>\n{0}".format(image), } )
[docs] def zscore_distribution(self): """Barplot of zscore distribution section.""" if self.chromosome._mode == "chunks": self._add_large_data_section("Z-Score distribution", "zscore_distribution") return image = self.create_embedded_png(self.chromosome.plot_hist_zscore, input_arg="filename") self.sections.append( { "name": "Z-Score distribution", "anchor": "zscore_distribution", "content": "<p>Distribution of the z-score (normalised coverage); You " "should see a Gaussian distribution centered around 0. The " "estimated parameters are mu={0:.2f} and sigma={1:.2f}.</p>\n" "{2}".format( self.chromosome.best_gaussian["mu"], self.chromosome.best_gaussian["sigma"], image, ), } )
class SubCoverageModule(SequanaBaseModule): """Write HTML report of subsection of chromosome with a javascript coverage plot. """ def __init__(self, chromosome, rois, combobox, datatable, start, stop, directory): super().__init__() if directory == ".": self.path = "../" else: self.path = "../../" self.chromosome = chromosome self.rois = rois self.combobox = combobox self.datatable = datatable self.start = start self.stop = stop self.title = "Coverage analysis of chromosome {0}<br>" "positions {1} and {2}".format( self.chromosome.chrom_name, start, stop ) self.create_report_content() self.create_html( "{0}{4}subplots{4}{1}_{2}_{3}.html".format(directory, self.chromosome.chrom_name, start, stop, os.sep) ) def create_report_content(self): """Generate the sections list to fill the HTML report.""" self.sections = list() self.canvasjs_line_plot() self.regions_of_interest() def canvasjs_line_plot(self): """Create the CanvasJS line plot section.""" # set column of interest and create the csv x_col = "pos" y_col = ("cov", "mapq0", "gc") columns = self.chromosome.df.columns y_col = [n for n in y_col if n in columns] csv = self.chromosome.to_csv( start=self.start, stop=self.stop, columns=[x_col] + y_col, index=False, float_format="%.3g", ) # create CanvasJS stuff cjs = CanvasJSLineGraph(csv, "cov", x_col, y_col) # set options cjs.set_options({"zoomEnabled": "true", "zoomType": "x", "exportEnabled": "true"}) # set title cjs.set_title("Genome Coverage") # set legend cjs.set_legend( { "verticalAlign": "bottom", "horizontalAlign": "center", "cursor": "pointer", }, hide_on_click=True, ) # set axis cjs.set_axis_x( { "title": "Position (bp)", "labelAngle": 30, "minimum": self.start, "maximum": self.stop, } ) cjs.set_axis_y({"title": "Coverage (Count)"}) cjs.set_axis_y2( { "title": "GC content (ratio)", "minimum": 0, "maximum": 1, "lineColor": "#FFC425", "titleFontColor": "#FFC425", "labelFontColor": "#FFC425", } ) # set datas cjs.set_data( index=0, data_dict={ "type": "line", "name": "Coverage", "showInLegend": "true", "color": "#5BC0DE", "lineColor": "#5BC0DE", }, ) try: i = y_col.index("mapq0") cjs.set_data( index=i, data_dict={ "type": "line", "name": "Coverage 2", "showInLegend": "true", "color": "#D9534F", "lineColor": "#D9534F", }, ) except ValueError: pass try: i = y_col.index("gc") cjs.set_data( index=i, data_dict={ "type": "line", "axisYType": "secondary", "name": "GC content", "showInLegend": "true", "color": "#FFC425", "lineColor": "#FFC425", }, ) except ValueError: pass # create canvasJS html_cjs = cjs.create_canvasjs() self.sections.append( { "name": "Interactive coverage plot", "anchor": "iplot", "content": ("{0}{1}\n".format(self.combobox, html_cjs)), } ) def regions_of_interest(self): """Region of interest section.""" subseq = [self.start, self.stop] low_roi = self.rois.get_low_rois(subseq) high_roi = self.rois.get_high_rois(subseq) js = self.datatable.create_javascript_function() lroi = DataTable(low_roi, "lroi", self.datatable) hroi = DataTable(high_roi, "hroi", self.datatable) html_low_roi = lroi.create_datatable(float_format="%.3g") html_high_roi = hroi.create_datatable(float_format="%.3g") roi_paragraph = ( "<p>Regions with a z-score {0}er than {1:.2f} and at " "least one base with a z-score {0}er than {2:.2f} are detected as " "{0} coverage region. Thus, there are {3} {0} coverage regions " "between the position {4} and the position {5}</p>" ) low_paragraph = roi_paragraph.format( "low", self.chromosome.thresholds.low2, self.chromosome.thresholds.low, len(low_roi), self.start, self.stop, ) high_paragraph = roi_paragraph.format( "high", self.chromosome.thresholds.high2, self.chromosome.thresholds.high, len(high_roi), self.start, self.stop, ) self.sections.append( { "name": "Regions Of Interest (ROI)", "anchor": "roi", "content": "{4}\n" "<p>Running median is the median computed along the genome " "using a sliding window. The following tables give regions of " "interest detected by sequana. Here are some definition of the " "table's columns:</p>\n" "<ul><li><b>mean_cov</b>: the average of coverage</li>\n" "<li><b>mean_rm</b>: the average of running median</li>\n" "<li><b>mean_zscore</b>: the average of zscore</li>\n" "<li><b>max_zscore</b>: the higher zscore contains in the " "region</li>\n" "<li><b>log2_ratio</b>:log2(mean_cov/mean_rm)</li></ul>\n" "<h3>Low coverage region</h3>\n{0}\n{1}\n" "<h3>High coverage region</h3>\n{2}\n{3}\n".format( low_paragraph, html_low_roi, high_paragraph, html_high_roi, js ), } )