Source code for sequana.coverage

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
import math

import colorlog

from sequana.lazy import numpy as np
from sequana.lazy import pandas as pd

logger = colorlog.getLogger(__name__)


__all__ = ["Coverage"]


[docs]class Coverage(object): r"""Utilities related to Lander and Waterman theory We denote :math:`G` the genome length in nucleotides and :math:`L` the read length in nucleotides. These two numbers are in principle well defined since :math:`G` is defined by biology and :math:`L` by the sequencing machine. The total number of reads sequenced during an experiment is denoted :math:`N`. Therefore the total number of nucleotides is simply :math:`NL`. The depth of coverage (DOC) at a given nucleotide position is the number of times that a nucleotide is covered by a mapped read. The theoretical fold-coverage is defined as : .. math:: a = NL/G that is the average number of times each nucleotide is expected to be sequenced (in the whole genome). The fold-coverage is often denoted :math:`aX` (e.g., 50X). In the :class:`Coverage` class, :math:`G` and :math:`N` are fixed at the beginning. Then, if one changes :math:`a`, then :math:`N` is updated and vice-versa so that the relation :math:`a=NL/G` is always true:: >>> cover = Coverage(G=1000000, L=100) >>> cover.N = 100000 # number of reads >>> cover.a # What is the mean coverage 10 >>> cover.a = 50 >>> cover.N 500000 From the equation aforementionned, and assuming the reads are uniformly distributed, we can answer a few interesting questions using probabilities. In each chromosome, a read of length :math:`L` could start at any position (except the last position L-1). So in a genome :math:`G` with :math:`n_c` chromosomes, there are :math:`G - n_c (L-1)` possible starting positions. In general :math:`G >> n_c (L-1)` so the probability that one of the :math:`N` read starts at any specific nucleotide is :math:`N/G`. The probability that a read (of length :math:`L`) covers a given position is :math:`L/G`. The probability of **not** covering that location is :math:`1-L/G`. For :math:`N` fragments, we obtain the probability :math:`\left(1-L/G\right)^N`. So, the probability of covering a given location with at least one read is : .. math:: P = 1 - \left(1- \frac{L}{G}\right)^N Since in general, N>>1, we have: .. math:: P = 1- \exp^{-NL/G} From this equation, we can derive the fold-coverage required to have e.g., :math:`E=99\%` of the genome covered: .. math:: a = log(-1/(E-1) .. indeed e^x = lim (1+x/N)^N when N goes to infinite so for x -> -x.N e^-Nx = lim (1-x)^N when N goes to infinite equivalent to .. math:: a = -log(1-E) The method :meth:`get_required_coverage` uses this equation. However, for numerical reason, one should not provide :math:`E` as an argument but (1-E). See :meth:`get_required_coverage` Other information can also be derived using the methods :meth:`get_mean_number_contigs`, :meth:`get_mean_contig_length`, :meth:`get_mean_contig_length`. .. seealso:: :meth:`get_table` that provides a summary of all these quantities for a range of coverage. :reference: http://www.math.ucsd.edu/~gptesler/283/FA14/slides/shotgun_14-handout.pdf """ def __init__(self, N=None, L=None, G=None, a=None): self._a = None # coverage self._L = L # length of each read self._N = N # number of reads self._G = G # length target genome def __repr__(self): if self._N is None: N = "undefined" else: N = self._N if self._a is None: a = "undefined" else: a = self._a return "Coverage(N=%s, L=%s, G=%s, a=%s) " % (N, self.L, self.G, a) def _get_a(self): return self._N * self._L / float(self._G) def _set_a(self, value): assert value > 0 self._a = value self._N = self._a * self._G / float(self._L) a = property(_get_a, _set_a, doc="coverage defined as NL/G") def _get_L(self): return self._L def _set_L(self, value): assert value > 0 self._L = value L = property(_get_L, _set_L, doc="length of the reads") def _get_N(self): return self._N def _set_N(self, value): assert value > 0 self._N = value N = property(_get_N, _set_N, doc="number of reads defined as aG/L") def _get_G(self): return self._G def _set_G(self, value): assert value > 0 self._G = value G = property(_get_G, _set_G, doc="genome length")
[docs] def get_required_coverage(self, M=0.01): """Return the required coverage to ensure the genome is covered A general question is what should be the coverage to make sure that e.g. E=99% of the genome is covered by at least a read. The answer is: .. math:: \log^{-1/(E-1)} This equation is correct but have a limitation due to floating precision. If one provides E=0.99, the answer is 4.6 but we are limited to a maximum coverage of about 36 when one provides E=0.9999999999999999 after which E is rounded to 1 on most computers. Besides, it is no convenient to enter all those numbers. A scientific notation would be better but requires to work with :math:`M=1-E` instead of :math:`E`. .. math:: \log^{-1/ - M} So instead of asking the question what is the requested fold coverage to have 99% of the genome covered, we ask the question what is the requested fold coverage to have 1% of the genome not covered. This allows us to use :math:`M` values as low as 1e-300 that is a fold coverage as high as 690. :param float M: this is the fraction of the genome not covered by any reads (e.g. 0.01 for 1%). See note above. :return: the required fold coverage .. plot:: import pylab from sequana import Coverage cover = Coverage() misses = np.array([1e-1, 1e-2, 1e-3, 1e-4,1e-5,1e-6]) required_coverage = cover.get_required_coverage(misses) pylab.semilogx(misses, required_coverage, 'o-') pylab.ylabel("Required coverage", fontsize=16) pylab.xlabel("Uncovered genome", fontsize=16) pylab.grid() # The inverse equation is required fold coverage = [log(-1/(E - 1))] """ # What should be the fold coverage to have 99% of the genome sequenced ? # It is the same question as equating 1-e^{-(NL/G}) == 0.99, we need NL/G = 4.6 if isinstance(M, float) or isinstance(M, int): assert M < 1 assert M >= 0 else: M = np.array(M) # Here we do not use log(-1/(E-1)) but log(-1/(1-E-1)) to allow # for using float down to 1e-300 since 0.999999999999999 == 1 return np.log(-1 / (-M))
[docs] def get_mean_number_contigs(self): r"""Expected number of contigs A binomial distribution with parameters :math:`N` and :math:`p` .. math:: \left(\frac{aG}{L}\right) \exp^{-a} """ return self.G / float(self.L) * self.a * math.exp(-self.a)
[docs] def get_mean_contig_length(self): """Expected length of the contigs .. math:: \\frac{e^a-1)L}{a} """ return (math.exp(self.a) - 1) * self.L / self.a
[docs] def get_mean_reads_per_contig(self): r"""Expected number of reads per contig Number of reads divided by expected number of contigs: .. math:: N / (N\exp^{-a}) = e^a """ return math.exp(self.a)
[docs] def get_percent_genome_sequenced(self): """Return percent of the genome covered .. math:: 100 (1-\exp{-a}) """ return 100 * (1 - math.exp(-self.a))
[docs] def get_summary(self): """Return a summary (dictionary) for the current fold coverage :attr:`a`""" data = {} data["coverage"] = self.a data["reads"] = self.N data["nucleotides"] = self.a * self.G data["% genome sequenced"] = self.get_percent_genome_sequenced() data["mean number of contigs"] = self.get_mean_number_contigs() data["mean contig length"] = self.get_mean_contig_length() data["mean reads per contig"] = self.get_mean_reads_per_contig() return data
[docs] def get_table(self, coverage=None): """Return a summary dataframe for a set of fold coverage :param list coverage: if None, coverage list starts at 0.5 and ends at 10 with a step of 0.5 """ if coverage is None: X = np.arange(0.5, 10.5, 0.5) else: X = coverage a_buf = self.a results = [] for this in X: self.a = this results.append(self.get_summary()) df = pd.DataFrame.from_records(results) df.set_index("coverage", inplace=True) return df