Source code for sequana.mgi

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################


from sequana.lazy import numpy as np
from sequana.lazy import pandas as pd
from sequana.lazy import pylab



[docs]
class MGI:
    def __init__(self, filename):
        self.filename = filename
        self.df = pd.read_csv(self.filename, sep="\t", comment="#", header=None)

        self.metadata = {}
        with open(self.filename, "r") as fin:
            line = fin.readline()
            while line.startswith("#"):
                key, value = line.split(maxsplit=1)
                key = key[1:]  # skip the # character
                try:
                    self.metadata[key] = int(value.strip())
                except:
                    self.metadata[key] = value.strip()
                if key.endswith("%"):
                    try:
                        self.metadata[key] = float(value.strip())
                    except:  # pragma: no cover
                        pass
                line = fin.readline()
        try:
            ncounts = self.metadata["N_Count"]
            self.metadata["N_Count"] = int(ncounts.split("\t")[0])
            self.metadata["N_Count%"] = float(ncounts.split("\t")[1])
        except Exception as err:  # pragma no cover
            print(err)
            pass
        self.df.columns = ["Pos"] + self.metadata["Pos"].split("\t")
        del self.metadata["Pos"]
        self.df.set_index("Pos", inplace=True)


[docs]
    def plot_acgt(self):
        ACGTN = self.df[["A", "C", "G", "T", "N"]]
        N = ACGTN.sum(axis=1).iloc[0]
        assert N == self.metadata["ReadNum"]
        for this in "ACGTN":
            pylab.plot(self.df[this] / N * 100, label=this)



[docs]
    def boxplot_quality(self, color_line="r", bgcolor="grey", color="yellow", lw=4, hold=False, ax=None):
        quality = self.df[[str(x) for x in range(42)]]  # not sure why we have phred score from 0 to 41
        N = self.metadata["ReadNum"]
        proba = quality / N

        self.xmax = 150
        xmax = self.xmax + 1
        if ax:
            pylab.sca(ax)  # pragma no cover
        pylab.fill_between([0, xmax], [0, 0], [20, 20], color="red", alpha=0.3)
        pylab.fill_between([0, xmax], [20, 20], [30, 30], color="orange", alpha=0.3)
        pylab.fill_between([0, xmax], [30, 30], [41, 41], color="green", alpha=0.3)

        X = []
        Q = []
        S = []
        for pos in range(1, 151):
            qualities = [((int(k) + 1) * v) for k, v in quality.loc[pos].items()]
            mean_quality = sum(qualities) / N
            X.append(pos)
            Q.append(mean_quality)
            proba = quality.loc[pos] / N

            std = pylab.sqrt(sum([(x - mean_quality) ** 2 * y for x, y in zip(range(42), proba)]))
            S.append(std)

        print(len(X))
        print(len(Q))
        print(len(S))

        Q = np.array(Q)
        X = np.array(X)
        S = np.array(S)
        pylab.fill_between(X, Q + S, Q - S, color=color, interpolate=False)

        pylab.plot(X, Q, color=color_line, lw=lw)
        pylab.ylim([0, 41])
        pylab.xlim([0, self.xmax + 1])
        pylab.title("Quality scores across all bases")
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("Quality")
        pylab.grid(axis="x")