Source code for sequana.phred

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-222 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Manipulate phred quality of reads

FastQ quality are stored as characters. The phred scales indicates the range of characters.

In general, characters goes from ! to ~ that is from 33 to 126 in an ascii
table. This convention starts at 33 because characters before ! may cause trouble
(e.g. white spaces). This scale is the Sanger scale. There are 2 other scales
that could be used ranging from 59 to 126 (illumina 1) and from 64 to 126 (illumina 1.3+).



So, here are the offset to use:

============== ============ ===============
 Name           offset       Numeric range
============== ============ ===============
 Sanger         33            0 to 93
 Solexa         64            -5 to 62
 illumina1.3+   64            0 to 62
============== ============ ===============

:reference: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2847217/


Even though dedicated tools would have better performances, we provide a set
of convenient functions. An example is provided here below to plot the quality
corresponding to a character string extracted from a FastQ read.


In this example, we use :class:`Quality` class where the default offset is 33
(Sanger). We compare the quality for another offset

.. plot::
    :include-source:

    from sequana import phred

    from sequana.phred import Quality
    q = Quality('BCCFFFFFHHHHHIIJJJJJJIIJJJJJJJJFH')
    q.plot()
    q.offset = 64
    q.plot()
    from pylab import legend
    legend(loc="best")


"""
import colorlog

from sequana.lazy import numpy as np
from sequana.lazy import pylab

logger = colorlog.getLogger(__name__)


quality = r"""!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOP"""
quality += r"""QRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"""


from math import log10

__all__ = ["Quality", "proba_to_quality_sanger", "quality_to_proba_sanger"]


[docs]def proba_to_quality_sanger(pe): """A value between 0 and 93 :param pe: the probability of error. :return: Q is the quality score. - a high probability of error (0.99) gives Q=0 - q low proba of errors (0.05) gives Q = 13 - q low proba of errors (0.01) gives Q = 20 """ if pe > 1: pe = 1 if pe < 1e-90: pe = 1e-90 Qs = -10 * log10(pe) if Qs > 93: Qs = 93 return Qs
[docs]def quality_to_proba_sanger(quality): """Quality to probability (Sanger)""" return 10 ** (quality / -10.0)
def proba_to_quality_solexa(pe): """prior v1.3 (ref: wikipedia https://en.wikipedia.org/wiki/FASTQ_format """ if pe > 1: pe = 1 return -5 if pe < 1e-90: pe = 1e-90 Qs = -10 * log10(pe / (1 - pe)) if Qs > 62: Qs = 62 if Qs < -5: Qs = -5 return Qs def quality_solexa_to_quality_sanger(qual): return 10 * log10(10 ** (qual / 10.0) + 1) def quality_sanger_to_quality_solexa(qual): """ """ return 10 * log10(10 ** (qual / 10.0) - 1) def ascii_to_quality(character, phred=33): """ASCII to Quality conversion :param int phred: offset (defaults to 33) :: >>> ascii_to_quality("!") 0 >>> ascii_to_quality("~") 93 """ return ord(character) - phred def quality_to_ascii(quality, phred=33): """Quality to ASCII conversion :param int phred: offset (defaults to 33) :: >>> quality_to_ascii(65) b """ return chr(quality + phred)
[docs]class Quality(object): """Phred quality .. plot:: :include-source: >>> from sequana.phred import Quality >>> q = Quality('BCCFFFFFHHHHHIIJJJJJJIIJJJJJJJJFH') >>> q.plot() You can access to the quality as a list using the :attr:`quality` attribute and the mean quality from the :attr:`mean_quality` attribute. """ def __init__(self, seq, offset=33): self.seq = seq self.offset = offset def _get_quality(self): # bytearray conversion is required since ord() function # does not handle bytes from py3 return [x - self.offset for x in bytearray(self.seq, "utf-8")] quality = property(_get_quality, doc="phred string into quality list") def _get_mean_quality(self): return np.mean(self.quality) mean_quality = property(_get_mean_quality, doc="return mean quality")
[docs] def plot(self, fontsize=16): """plot quality versus base position""" pylab.plot(self.quality, label="offset: %s" % self.offset) pylab.xlabel("base position", fontsize=fontsize) pylab.ylabel("Quality per base", fontsize=fontsize) pylab.grid(True) # ylim set autoscale to off so if we want to call this function several # times, we must reset autoscale to on before calling ylim pylab.autoscale() limits = pylab.ylim() pylab.ylim(max(0, limits[0] - 1), limits[1] + 1)
# this should qlso be correct for Illumina 1.8+ class QualitySanger(Quality): """Specialised :class:`Quality` class for Sanger case""" def __init__(self, seq): super(QualitySanger, self).__init__(seq, offset=33) class QualitySolexa(Quality): """Specialised :class:`Quality` class for Solexa case""" def __init__(self, seq): super(QualitySolexa, self).__init__(seq, offset=64)