Source code for sequana.genbank

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
import re

import colorlog

from sequana.fasta import FastA

logger = colorlog.getLogger(__name__)


__all__ = ["GenBank"]



[docs]
class GenBank:
    """This class reads a Genbank file

    This is kept for back compatibility but most code in Sequana
    will rely on :class:`sequana.gff3.GFF3` instead. So please, transform
    your genbank to GFF3 format if possible.

    ::

        gg = GenBank()
        gg.get_types()

    """

    def __init__(self, filename, skip_types=["biological_region"]):
        self.filename = filename
        self._df = None
        self._features = None
        self.skip_types = skip_types

    # property from parent class
    def _get_features(self):
        if self._features:
            features = self._features
        else:
            records = self.genbank_features_parser()
            _types = set()
            for contig in records.keys():
                for feature in records[contig]:
                    _type = feature["type"]
                    _types.add(_type)
            self._features = sorted(_types)
        return self._features

    features = property(_get_features)


[docs]
    def get_types(self):  # pragma no cover
        logger.warning("genbank.GenBank.get_types is deprecrated. Please use features property")
        records = self.genbank_features_parser()
        _types = set()
        for contig in records.keys():
            for feature in records[contig]:
                _type = feature["type"]
                _types.add(_type)
        return sorted(_types)



[docs]
    def extract_fasta(self, fastafile, features=["rRNA"]):
        for feature in features:
            if feature not in self.features:
                raise ValueError("{} not found".format(feature))

        # fasta may have several contig/chromosome names
        # the gene bank should be compatible !!
        fasta = FastA(fastafile)
        contig_names = fasta.get_lengths_as_dict()

        # most of the times, the version is not in the gbk
        contig_names = [x.split(".")[0] for x in contig_names]

        # then we read the features from the genbank
        records = self.genbank_features_parser()

        # FIXME FastA is not very efficient for eukaryotes but is enough for now
        output = ""
        for name in records.keys():
            if name not in contig_names:
                logger.warning("{} contig from genbank not found in fasta".format(name))
                continue
            index = contig_names.index(name)
            sequence = fasta.sequences[index]

            for item in records[name]:
                if item["type"] in features:
                    start, end = item["gene_start"], item["gene_end"]
                    try:
                        info = item["product"]
                        output += ">{}_{}_{}_{} {}\n".format(name, item["type"], start, end, info)
                    except:
                        output += ">{}_{}_{}_{} {}\n".format(name, item["type"], start, end)
                    output += "{}\n".format(sequence[start:end])
        return output



[docs]
    def genbank_features_parser(self):
        """Return dictionary with features contains inside a genbank file.

        :param str input_filename: genbank formated file
        """
        new_feature = {}
        records = {}
        feature_list = []
        feature_field = False

        with open(self.filename, "r") as fp:
            for line in fp:
                # pass header and sequence fields
                if not feature_field:
                    # get contig/chrom name
                    if line.startswith("LOCUS"):
                        name = line.split()[1]
                    elif line.startswith("FEATURE"):
                        feature_field = True
                else:
                    # if feature field is finished
                    if line.startswith("ORIGIN"):
                        feature_field = False
                        records[name] = feature_list
                        feature_list = []
                        new_feature = []
                        continue

                    # if there are a word in qualifier indent (feature type)
                    # maybe we need to infer the size of indentation ???
                    if line[0:20].split():
                        if new_feature:
                            feature_list.append(new_feature)
                        split_line = line.split()
                        t = split_line[0]
                        # Handle :
                        # complement(order(1449596..1449640,1449647..1450684,
                        # 1450695..1450700))
                        positions = split_line[1]
                        if positions[0].isalpha():
                            while not line[:-1].endswith(")"):
                                line = next(fp)
                                positions += line
                        pos = [int(n) for n in re.findall(r"\d+", positions)]
                        # Handle complement(join(3773333..3774355,3774357..3774431))
                        start = pos[0]
                        end = pos[-1]
                        strand = "-" if split_line[1].startswith("c") else "+"
                        new_feature = {
                            "type": t,
                            "gene_start": start,
                            "gene_end": end,
                            "strand": strand,
                        }

                    # recover qualifier bound with feature
                    else:
                        quali_line = line.strip().replace('"', "")
                        if quali_line.startswith("/") and "=" in quali_line:
                            qualifier = quali_line.split("=")
                            key = qualifier[0][1:]
                            new_feature[key] = qualifier[1]
                        else:
                            if key == "translation":
                                new_feature[key] += quali_line
                            else:
                                new_feature[key] += " " + quali_line
        return records