Source code for sequana.cigar

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
import re
from collections import defaultdict

import colorlog

# Note: pysam could probably be used to improve the speed of this module.

logger = colorlog.getLogger(__name__)


[docs] class Cigar: """A class to handle CIGAR strings from BAM files. .. doctest:: >>> from sequana.cigar import Cigar >>> c = Cigar("2S30M1I") >>> len(c) 33 >>> c = Cigar("1S1S1S1S") >>> c.compress() >>> c.cigarstring '4S' Possible CIGAR types are: - "M" : alignment match - "I" : insertion to the reference - "D" : deletion from the reference - "N" : skipped region from the reference - "S" : soft clipping (clipped sequence present in seq) - "H" : hard clipping (sequence NOT present) - "P" : padding (silent deletion from padded reference) - "=" : sequence match - "X" : sequence mismatched - "B" : back (rare) (could be also NM ?) !!! BWA MEM get_cigar_stats returns list with 11 items Last item is !!! what is the difference between M and = ??? Last item is I + S + X !!! dans BWA, mismatch (X) not provided... should be deduced from last item - I - S .. note:: the length of the query sequence based on the CIGAR is calculated by adding the M, I, S, =, or X and other operations are ignored. source: https://stackoverflow.com/questions/39710796/infer-the-length-of-a-sequence-using-the-cigar/39812985#39812985 :reference: https://github.com/samtools/htslib/blob/develop/htslib/sam.h """ __slots__ = ("_cigarstring",) pattern = re.compile(r"(\d+)([A-Za-z])?") types = "MIDNSHP=XB" def __init__(self, cigarstring: str): """.. rubric:: Constructor :param str cigarstring: the CIGAR string. .. note:: the input CIGAR string validity is not checked. If an unknown type is found, it is ignored generally. For instance, the length of 1S100Y is 1 since Y is not correct. """ if not isinstance(cigarstring, str): raise TypeError("Cigar string must be a string.") #: the CIGAR string attribute self._cigarstring = cigarstring @property def cigarstring(self): return self._cigarstring def __str__(self): return self._cigarstring def __repr__(self): return "Cigar( {} )".format(self._cigarstring) def __len__(self): return self.get_query_length() def _decompose(self): for num, op in self.pattern.findall(self._cigarstring): if op and op in self.types: yield op, int(num)
[docs] def as_sequence(self): return "".join(op * count for op, count in self._decompose())
[docs] def as_dict(self): """Return cigar types and their count :return: dictionary Note that repeated types are added:: >>> c = Cigar('1S2M1S') >>> c.as_dict() {"S":2,"M":2} """ # !! here, we have to make sure that duplicated letters are summed up d = defaultdict(int) for op, count in self._decompose(): d[op] += count return d
[docs] def as_tuple(self): """Decompose the cigar string into tuples keeping track of repeated types :return: tuple .. doctest:: >>> from sequana import Cigar >>> c = Cigar("1S2M1S") >>> c.as_tuple() (('S', 1), ('M', 2), ('S', 1)) """ return tuple(self._decompose())
[docs] def compress(self): """1S1S should become 2S. inplace modification""" data = list(self._decompose()) if not data: return compressed = [data[0]] for op, count in data[1:]: last_op, last_count = compressed[-1] if op == last_op: compressed[-1] = (op, last_count + count) else: compressed.append((op, count)) self._cigarstring = "".join(f"{count}{op}" for op, count in compressed)
[docs] def stats(self): """Returns number of occurence for each type found in :attr:`types` :: >>> c = Cigar("1S2M1S") >>> c.stats() [2, 0, 0, 0, 2, 0, 0, 0, 0, 0] """ counts = [0] * len(self.types) # _decompose only yields ops present in self.types, so find() always hits for op, val in self.as_dict().items(): counts[self.types.index(op)] = val return counts
[docs] def get_query_length(self) -> int: """Return length consumed by the query (read).""" return sum(count for op, count in self._decompose() if op in "MIS=X")
[docs] def get_reference_length(self) -> int: """Return length consumed by the reference.""" return sum(count for op, count in self._decompose() if op in "MDN=X")
# CIGAR op codes (pysam convention): # 0 -> M alignment match 1 -> I insertion 2 -> D deletion # 3 -> N skipped/intron 4 -> S soft clipping # Insertions (op 1) do not consume the reference, so they never advance the # cursor; every other op above does. def _fetch(chrom, start, cigar, target): """Walk a list of ``(op, size)`` tuples and collect bounds for ``target`` op. The reference cursor advances by ``size`` for M (0), D (2), N (3) and S (4); insertions (1) and any other op do not advance it. When the op matches ``target`` a ``(chrom, start, end)`` bound is recorded; insertions are the exception and are recorded as ``(chrom, start, size)``. """ chrom_start = start bounds = [] for c, size in cigar: if c == target: if c == 1: # insertion: stored as (chrom, start, size), no end bounds.append((chrom, chrom_start, size)) else: bounds.append((chrom, chrom_start, chrom_start + size)) if c in (0, 2, 3, 4): chrom_start += size return bounds
[docs] def fetch_exon(chrom, start, cigar): return _fetch(chrom, start, cigar, 0)
[docs] def fetch_intron(chrom, start, cigar): return _fetch(chrom, start, cigar, 3)
[docs] def fetch_clip(chrom, start, cigar): return _fetch(chrom, start, cigar, 4)
[docs] def fetch_deletion(chrom, start, cigar): return _fetch(chrom, start, cigar, 2)
[docs] def fetch_insertion(chrom, start, cigar): # NOTE: insertions are stored as (chrom, start, size) rather than # (chrom, start, end) as in the other fetchers. return _fetch(chrom, start, cigar, 1)