# Source code for sequana.modules_report.fastq_stats

# coding: utf-8
#
#  This file is part of Sequana software
#
#  Copyright (c) 2016 - Sequana Development Team
#
#  File author(s):
#      Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
#      Dimitri Desvillechabrol <dimitri.desvillechabrol@pasteur.fr>,
#          <d.desvillechabrol@gmail.com>
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Module to write the FastQ statistics report"""
import glob
import io
import os

import colorlog

from sequana.lazy import pandas as pd
from sequana.lazy import pylab
from sequana.modules_report.base_module import SequanaBaseModule
from sequana.utils import config

logger = colorlog.getLogger(__name__)


from sequana.utils.datatables_js import DataTable


class FastQStatsModule(SequanaBaseModule):
    """Write HTML report of fastq stats analysis."""

    def __init__(self, input_directory, path_to_fastqc, output_filename=None, tag_R1="_R1_"):
        """
        :param input_directory: where to find the json and boxplot image. The
            path where to find the data does not matter since the JSON and PNG
            will be embedded.
        :param path_to_fastqc: This must be provided by the user. This is the
            directory where will be found the original FastQC reports. This
            can be inferred but is prone to error so for now, we must provide
            this argument.
        :param output_filename: if not provided, the HTML is not created.
        :param tag_R1: accepted for backward compatibility; it is not used by
            any method of this class.

        ::

            from sequana.modules_report.fastq_stats import FastQStatsModule
            ff = FastQStatsModule("./SAMPLE/fastq_stats_samples",
                "fastqc_samples", "test.html")

        """
        super().__init__()
        self.path_to_fastqc = path_to_fastqc
        self.directory = input_directory
        # The report content is always built, even when no HTML file is written.
        self.create_report_content()
        if output_filename:
            self.create_html(output_filename)

    def create_report_content(self):
        """Generate the sections list to fill the HTML report."""
        self.sections = list()
        self.add_stats()

    def _get_files(self, pattern):
        """Return the sorted files matching *pattern* in the input directory.

        :param pattern: glob pattern, relative to ``self.directory``.
        :return: a ``(filenames, mode)`` tuple where *mode* is ``"se"`` for one
            file and ``"pe"`` for two files; ``None`` when no file matches or
            when more than two files match (a warning is logged in the latter
            case).
        """
        # !! need to sort the files so that R1 appears before R2
        filenames = sorted(glob.glob(self.directory + os.sep + pattern))
        if len(filenames) == 2:
            return filenames, "pe"
        if len(filenames) == 1:
            return filenames, "se"
        if len(filenames) > 2:
            logger.warning("FastQStatsModule: more than 2 files matched the pattern %s", pattern)
        return None

    def _require_files(self, pattern):
        """Same as :meth:`_get_files` but fail loudly instead of returning None.

        :raises ValueError: if the number of files matching *pattern* in the
            input directory is not 1 or 2.
        """
        found = self._get_files(pattern)
        if found is None:
            # Without this check, callers unpacking the result would fail with
            # an uninformative "cannot unpack NoneType" TypeError.
            raise ValueError(
                "FastQStatsModule: expected 1 or 2 files matching {!r} in {!r}".format(pattern, self.directory)
            )
        return found

    def get_stats(self):
        """Load the JSON statistics into a dataframe indexed by read (R1/R2).

        The A, C, G, T and N columns are converted into percentages of the
        "total bases" column.

        :return: a dataframe with one row per JSON file (``R1`` and, for
            paired data, ``R2``).
        :raises ValueError: if the input directory does not contain 1 or 2
            ``*.json`` files.
        """
        # Local import: shadows the module-level lazy ``pd`` within this method.
        import pandas as pd

        filenames, mode = self._require_files("*.json")

        if mode == "pe":
            # Relies on _get_files returning sorted names so that R1 comes first.
            df = pd.concat([pd.read_json(filenames[0]), pd.read_json(filenames[1])])
            df.index = ["R1", "R2"]
        else:
            df = pd.read_json(filenames[0])
            df.index = ["R1"]

        columns = [
            "A",
            "C",
            "G",
            "T",
            "N",
            "n_reads",
            "mean quality",
            "GC content",
            "average read length",
            "total bases",
        ]
        # Explicit copy so that the in-place updates below operate on an
        # independent frame (avoids pandas' SettingWithCopyWarning).
        df = df[columns].copy()

        for base in "ACGTN":
            df[base] = df[base] / df["total bases"] * 100
        return df

    def _figure_html(self, png_path, width, side, caption):
        """Return the ``<figure>`` markup for one boxplot image.

        :param png_path: path to a ``*_boxplot.png`` file; the link target is
            the file of the same name with ``_boxplot.png`` replaced by
            ``_fastqc.html``, located under ``self.path_to_fastqc``.
        :param width: figure width, in percent, as a string.
        :param side: CSS float side (``"left"`` or ``"right"``).
        :param caption: text of the figure caption.
        """
        report_name = os.path.split(png_path)[1].replace("_boxplot.png", "_fastqc.html")
        href = self.path_to_fastqc + os.sep + report_name
        return (
            ' <figure style="float:{}; width:{}%; padding:0px; margin:0px;">'
            ' <a href="{}">{}</a>'
            ' <figcaption style="font-style:italic">{}</figcaption>'
            " </figure>"
        ).format(side, width, href, self.png_to_embedded_png(png_path), caption)

    def _get_stats_section(self, tablename="stats"):
        """Build the HTML of the statistics section (table and quality figures).

        Stores the statistics in ``self.df_stats`` as a side effect.

        :param tablename: name given to the HTML data table.
        :return: the HTML snippet as a string.
        :raises ValueError: if the input directory does not contain 1 or 2
            ``*.json`` files, or 1 or 2 ``*boxplot.png`` files.
        """
        self.df_stats = self.get_stats()
        filenames, mode = self._require_files("*boxplot.png")

        datatable = DataTable(self.df_stats, tablename, index=True)
        datatable.datatable.datatable_options = {
            "scrollX": "300px",
            "pageLength": 15,
            "scrollCollapse": "true",
            "dom": "rtpB",
            "paging": "false",
            "buttons": ["copy", "csv"],
        }
        js = datatable.create_javascript_function()
        html_tab = datatable.create_datatable(float_format="%.3g")

        # NOTE(review): the trailing <div> opened here is never closed by the
        # markup generated in this method -- confirm whether this is intended.
        html = (
            "<p>The following table gives some basic statistics about the data before any filtering."
            " The A, C, G, T, N columns report the percentage of each bases in the overall sequences."
            " The GC content is provided in percentage as well. </p>"
            " <div>{} {}</div>"
            " <div>"
        ).format(html_tab, js)

        html += (
            " <p>The following figure(s) gives the average quality (red line) of raw reads (500,000 at max)."
            " The x-axis being the length of the reads."
            " The yellow enveloppe gives the variation of the quality (1 standard deviation).</p>"
            " <p> Click on the image to jump to a full FastQC report.</p>"
        )

        paired = len(filenames) == 2
        # Two figures sit side by side; a single one is drawn wider.
        width = "49" if paired else "65"

        html += self._figure_html(filenames[0], width, "left", "Fig1: R1 reads")
        if paired:
            html += self._figure_html(filenames[1], width, "right", "Fig2: R2 reads")
        return html

    def add_stats(self):
        """Append the statistics section to ``self.sections``."""
        html = self._get_stats_section()
        self.sections.append({"name": "Stats inputs", "anchor": "stats", "content": html})