Source code for sequana.modules_report.variant_calling

# coding: utf-8
#
#  This file is part of Sequana software
#
#  Copyright (c) 2016 - Sequana Development Team
#
#  File author(s):
#      Dimitri Desvillechabrol <dimitri.desvillechabrol@pasteur.fr>,
#          <d.desvillechabrol@gmail.com>
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Module to write variant calling report"""
import ast

from sequana.lazy import pandas as pd
from sequana.modules_report.base_module import SequanaBaseModule
from sequana.utils.datatables_js import DataTable



[docs]
class VariantCallingModule(SequanaBaseModule):
    """Write HTML report of variant calling. This class takes a csv file
    generated by sequana_variant_filter.
    """

    def __init__(self, data):
        """.. rubric:: constructor

        :param data: a CSV filename created by ``sequana.freebayes_vcf_filter``
            or a :class:`freebayes_vcf_filter.Filtered_freebayes` object.
        """
        super().__init__()
        self.title = "Variant Calling Report"
        try:
            with open(data, "r") as fp:
                self.filename = data
                line = fp.readline()
                if line.startswith("# sequana_variant_calling"):
                    string_dict = line.split(";")[-1].strip()
                    try:
                        self.filter_dict = ast.literal_eval(string_dict)
                    except SyntaxError:
                        self.filter_dict = None
                    self.df = pd.read_csv(fp)
        except FileNotFoundError:
            msg = "The csv file is not present. Please, check if your" " file is present."
            raise FileNotFoundError(msg)
        except TypeError:
            self.df = data.df
            self.filter_dict = data.vcf.filters_params
        self.create_report_content()
        self.create_html("variant_calling.html")


[docs]
    def create_report_content(self):
        self.sections = list()

        try:
            self.add_intro_download()
        except:
            pass

        if self.filter_dict:
            self.filters_information()
        try:
            self.add_overview()
        except:
            pass
        try:
            self.create_histogram_frequencies()
        except:
            pass
        try:
            self.create_histogram_frequencies2()
        except:
            pass
        try:
            self.variant_calling()
        except:
            pass



[docs]
    def add_intro_download(self):
        self.sections.append(
            {
                "name": "Data",
                "anchor": "intro_download",
                "content": f'You can download the <a href="./freebayes_vcf_filter/temp.filter.vcf">VCF here</a>. This is the filtered VCF (see parameters used in the section below). Raw VCF file can be found in the sequana structure directory (freebayes/YOUR_SAMPLE.raw.vcf)',
            }
        )



[docs]
    def filters_information(self):
        """Add information of filter."""
        self.sections.append(
            {
                "name": "Filter Options",
                "anchor": "filters_option",
                "content": "<p>All filters parameters used is presented in this list:</p>"
                "\n<ul><li>freebayes_score: {freebayes_score}</li>\n"
                "<li>frequency: {frequency}</li>\n"
                "<li>min_depth: {min_depth}</li>\n"
                "<li>forward_depth: {forward_depth}</li>\n"
                "<li>reverse_depth: {reverse_depth}</li>\n"
                "<li>strand_ratio: {strand_ratio}</li></ul>\n"
                "Note:<ul><li>frequency: alternate allele / depth</li>\n"
                "<li>min_depth: minimum alternate allele present</li>\n"
                "<li>forward_depth: minimum alternate allele present on "
                "forward strand</li>\n"
                "<li>reverse_depth: minimum alternate allele present on "
                "reverse strand</li>\n"
                "<li>strand_ratio: alternate allele forward / (alternate "
                "allele forward + alternate allele reverse)</li>"
                "</ul>".format(**self.filter_dict),
            }
        )



[docs]
    def add_overview(self):

        import os
        import tempfile
        import warnings
        from collections import defaultdict

        import plotly.express as px

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # Count variant types
            variants = defaultdict(int)
            for typ in sorted(self.df["type"].unique()):
                variants[typ] = sum(self.df["type"] == typ)

            # Prepare data
            labels = sorted(variants.keys())
            counts = [variants[x] for x in labels]
            labels_display = [f"{x.upper()} ({variants[x]})" for x in labels]

            # Create interactive pie chart
            fig = px.pie(
                names=labels_display,
                values=counts,
                title="Variant Type Distribution",
                color_discrete_sequence=px.colors.qualitative.Set3,
            )

            # Reduce margins, adjust layout
            fig.update_layout(
                title_x=0.5,
                margin=dict(t=50, b=10, l=10, r=10),
                width=700,
                height=700,
            )

            tmpfile = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
            fig.write_html(tmpfile.name, include_plotlyjs=False, full_html=False)

            # Read the HTML fragment and return it as a string
            with open(tmpfile.name, "r") as f:
                html_fragment = f.read()
            os.unlink(tmpfile.name)

            # Integration in your HTML report
            self.sections.append(
                {
                    "name": "Overview",
                    "anchor": "overview",
                    "content": "Here below is a pie plot showing the distribution of various variants found in the sample. "
                    + html_fragment,
                }
            )



[docs]
    def create_histogram_frequencies(self):

        import plotly.express as px
        import plotly.io as pio

        df = self.df.copy()
        df = df.query("type=='snp'").copy()
        df["freq"] = [float(x) for x in df["frequency"]]
        fig = px.histogram(
            df,
            x="freq",
            color="chr",
            facet_col="chr",
            facet_col_wrap=6,
            nbins=40,
            title="SNP Frequency Distribution per Chromosome",
            labels={"AF": "Allele frequency"},
        )
        # fig.update_xaxes(range=[0, 1])
        fig.update_layout(height=1000, width=1000)
        fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

        fig_html = pio.to_html(fig, full_html=False, include_plotlyjs=False)
        self.sections.append(
            {
                "name": "Histogram (frequency) per contig",
                "anchor": "histo",
                "content": "Here is the frequency distribution of SNPs for each contig. Contigs with no SNPs are not shown."
                + f"{fig_html}",
            }
        )



[docs]
    def create_histogram_frequencies2(self):

        import pandas as pd
        import plotly.graph_objects as go
        import plotly.io as pio

        # --- PARSE VCF ---
        df = self.df.copy()
        df = df.query("type in ['snp', 'del', 'ins']").copy()
        df["freq"] = [float(x) for x in df["frequency"]]

        # --- BUILD FIGURE TRACES ---
        traces = []
        variant_types = ["snp", "del", "ins"]
        for vtype in variant_types:
            sub = df[df["type"] == vtype]
            if sub.empty:
                continue
            trace = go.Histogram(
                x=sub["freq"], nbinsx=50, name=vtype, opacity=0.75, visible=True if vtype in ["snp"] else False
            )
            traces.append(trace)

        # --- CREATE DROPDOWN MENU ---
        buttons = []
        for i, vtype in enumerate(variant_types):
            visible = [False] * len(traces)
            if i < len(traces):
                visible[i] = True
            buttons.append(
                dict(
                    label=vtype,
                    method="update",
                    args=[{"visible": visible}, {"title": f"Allele Frequency Distribution — {vtype}"}],
                )
            )

        # --- FIGURE LAYOUT ---
        fig = go.Figure(data=traces)
        fig.update_layout(
            title="Allele Frequency Distribution — SNP",
            xaxis_title="Allele Frequency",
            yaxis_title="Count",
            updatemenus=[dict(buttons=buttons, direction="down", x=0.45, y=1.15, showactive=True)],
            bargap=0.1,
            template="plotly_white",
        )

        # --- EXPORT TO HTML ---
        html_code = pio.to_html(fig, full_html=False, include_plotlyjs=False)
        self.sections.append(
            {
                "name": "histogram (all contigs)",
                "anchor": "histo2",
                "content": "Here are the distribution across all contigs of some non-complex variants. variants that are combination such as SNP,SNP are not shown. "
                + f"{html_code}",
            }
        )



[docs]
    def variant_calling(self):
        """Variants detected section."""
        datatable = DataTable(self.df, "vc")
        # set options
        datatable.datatable.datatable_options = {
            "scrollX": "true",
            "pageLength": 30,
            "scrollCollapse": "true",
            "dom": "Bfrtip",
            "buttons": ["copy", "csv"],
        }
        js = datatable.create_javascript_function()
        html_tab = datatable.create_datatable(float_format="%.3f")
        self.sections.append(
            {
                "name": "Variants Detected",
                "anchor": "basic_stats",
                "content": "<p>This table gives variants detected by freebayes after "
                "filtering. The important metrics are the depth (if not enough reads supports "
                "the variant it should be ignored); the strand_balance (forward and reverse number "
                " of reads supporting the variants should be similar e.g balance of 0.5); the fisher "
                "pvalue (variants with pvalue<0.05 should be rejected since the strand balance of"
                "alternate and reference are different).</p><p>Note: the freebayes score can be"
                " understood as 1 - P(locus is homozygous given the data)</p> {0}\n{1}\n".format(js, html_tab),
            }
        )