Source code for sequana.cigar

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
import re

"""
Note: could use pysam most probably to improve the speed.
"""
import colorlog

logger = colorlog.getLogger(__name__)


[docs]class Cigar(object): """ .. doctest:: >>> from sequana.cigar import Cigar >>> c = Cigar("2S30M1I") >>> len(c) 33 >>> c = Cigar("1S1S1S1S") >>> c.compress() >>> c.cigarstring '4S' Possible CIGAR types are: - "M" for alignment MATCH (0) - "I" for Insertion to the reference (1) - "D" for deletion from the reference 2 - "N" for skipped region from the reference 3 - "S" for soft clipping (clipped sequence present in seq) 4 - "H" for hard clipping (clipped sequence NOT present in seq) 5 - "P" for padding (silent deletion from padded reference) - "=" for equal - "X" for diff (sequence mismatched) - "B" for back !!!! could be also NM ??? !!! BWA MEM get_cigar_stats returns list with 11 items Last item is !!! what is the difference between M and = ??? Last item is I + S + X !!! dans BWA, mismatch (X) not provided... should be deduced from last item - I - S .. note:: the length of the query sequence based on the CIGAR is calculated by adding the M, I, S, =, or X and other operations are ignored. source: https://stackoverflow.com/questions/39710796/infer-the-length-of-a-sequence-using-the-cigar/39812985#39812985 :reference: https://github.com/samtools/htslib/blob/develop/htslib/sam.h """ __slots__ = ["cigarstring"] pattern = r"(\d+)([A-Za-z])?" # could use a dictionary. would be faster #: valid CIGAR types types = "MIDNSHP=XB" def __init__(self, cigarstring): """.. rubric:: Constructor :param str cigarstring: the CIGAR string. .. note:: the input CIGAR string validity is not checked. If an unknown type is found, it is ignored generally. For instance, the length of 1S100Y is 1 since Y is not correct. """ #: the CIGAR string attribute self.cigarstring = cigarstring def __str__(self): return self.cigarstring def __repr__(self): return "Cigar( {} )".format(self.cigarstring) def __len__(self): return sum([y for x, y in self._decompose() if x in "MIS=X"]) def _decompose(self): # x is the type, y the number. Note the inversion in the tuple return ((y, int(x)) for x, y in re.findall(self.pattern, self.cigarstring))
[docs] def as_sequence(self): return "".join((y * x for x, y in self._decompose()))
[docs] def as_dict(self): """Return cigar types and their count :return: dictionary Note that repeated types are added:: >>> c = Cigar('1S2M1S') >>> c.as_dict() {"S":2,"M":2} """ # !! here, we have to make sure that duplicated letters are summed up from collections import defaultdict d = defaultdict(int) for letter, num in self._decompose(): d[letter] += num return d
[docs] def as_tuple(self): """Decompose the cigar string into tuples keeping track of repeated types :return: tuple .. doctest:: >>> from sequana import Cigar >>> c = Cigar("1S2M1S") >>> c.as_tuple() (('S', 1), ('M', 2), ('S', 1)) """ return tuple((x, y) for (x, y) in self._decompose())
[docs] def compress(self): """1S1S should become 2S. inplace modification""" if len(set((x[0] for x, y in self._decompose()))) == len(self.as_tuple()): return data = self.as_tuple() newdata = [data[0]] for i, x in enumerate(data[1:]): N = len(newdata) - 1 if newdata[N][0] == x[0]: newdata[N] = (x[0], newdata[N][1] + x[1]) else: newdata.append(x) self.cigarstring = "".join(["{}{}".format(y, x) for x, y in newdata])
[docs] def stats(self): """Returns number of occurence for each type found in :attr:`types` :: >>> c = Cigar("1S2M1S") >>> c.stats() [2, 0, 0, 0, 2, 0, 0, 0, 0, 0] """ data = [0] * len(self.types) dd = self.as_dict() for k, v in self.as_dict().items(): data[self.types.index(k)] = v return data
[docs]def fetch_exon(chrom, start, cigar): chrom_start = start exon_bound = [] for c, size in cigar: if c == 0: exon_bound.append((chrom, chrom_start, chrom_start + size)) chrom_start += size elif c == 1: continue elif c == 2: chrom_start += size elif c == 3: chrom_start += size elif c == 4: # FIXME do we want to include this in the exon chrom_start += size else: continue return exon_bound
[docs]def fetch_intron(chrom, start, cigar): # equivalence: # c = 0 -> M # c = 1 -> I # c = 2 -> D # c = 3 -> N gap/intron # c = 4 -> S soft clipping chrom_start = start intron_bound = [] for c, size in cigar: if c == 0: chrom_start += size elif c == 1: continue elif c == 2: chrom_start += size elif c == 3: intron_bound.append((chrom, chrom_start, chrom_start + size)) chrom_start += size elif c == 4: # not including soft clipping in the intron continue else: continue return intron_bound
[docs]def fetch_clip(chrom, start, cigar): chrom_start = start clip_bound = [] for c, size in cigar: if c == 0: chrom_start += size elif c == 1: continue elif c == 2: chrom_start += size elif c == 3: chrom_start += size elif c == 4: clip_bound.append((chrom, chrom_start, chrom_start + size)) chrom_start += size else: continue return clip_bound
[docs]def fetch_deletion(chrom, start, cigar): chrom_start = start deletion_bound = [] for c, size in cigar: if c == 0: chrom_start += size elif c == 1: continue elif c == 2: deletion_bound.append((chrom, chrom_start, chrom_start + size)) chrom_start += size elif c == 3: chrom_start += size elif c == 4: chrom_start += size else: continue return deletion_bound
[docs]def fetch_insertion(chrom, start, cigar): # NOTE that the returned insertions are stored as # chrom, start, size rather than chrom, start, end in other fetchers # functions chrom_start = start insertion_bound = [] for c, size in cigar: if c == 0: chrom_start += size elif c == 1: # See note above insertion_bound.append((chrom, chrom_start, size)) continue elif c == 2: chrom_start += size elif c == 3: chrom_start += size elif c == 4: chrom_start += size else: continue return insertion_bound