Source code for sequana.genbank
#
# This file is part of Sequana software
#
# Copyright (c) 2016-2022 - Sequana Development Team
#
# Distributed under the terms of the 3-clause BSD license.
# The full license is in the LICENSE file, distributed with this software.
#
# website: https://github.com/sequana/sequana
# documentation: http://sequana.readthedocs.io
#
##############################################################################
import re
import colorlog
from sequana.fasta import FastA
logger = colorlog.getLogger(__name__)
__all__ = ["GenBank"]
[docs]
class GenBank:
"""This class reads a Genbank file
This is kept for back compatibility but most code in Sequana
will rely on :class:`sequana.gff3.GFF3` instead. So please, transform
your genbank to GFF3 format if possible.
::
gg = GenBank()
gg.get_types()
"""
def __init__(self, filename, skip_types=["biological_region"]):
self.filename = filename
self._df = None
self._features = None
self.skip_types = skip_types
# property from parent class
def _get_features(self):
if self._features:
features = self._features
else:
records = self.genbank_features_parser()
_types = set()
for contig in records.keys():
for feature in records[contig]:
_type = feature["type"]
_types.add(_type)
self._features = sorted(_types)
return self._features
features = property(_get_features)
[docs]
def get_types(self): # pragma no cover
logger.warning("genbank.GenBank.get_types is deprecrated. Please use features property")
records = self.genbank_features_parser()
_types = set()
for contig in records.keys():
for feature in records[contig]:
_type = feature["type"]
_types.add(_type)
return sorted(_types)
[docs]
def genbank_features_parser(self):
"""Return dictionary with features contains inside a genbank file.
:param str input_filename: genbank formated file
"""
new_feature = {}
records = {}
feature_list = []
feature_field = False
with open(self.filename, "r") as fp:
for line in fp:
# pass header and sequence fields
if not feature_field:
# get contig/chrom name
if line.startswith("LOCUS"):
name = line.split()[1]
elif line.startswith("FEATURE"):
feature_field = True
else:
# if feature field is finished
if line.startswith("ORIGIN"):
feature_field = False
records[name] = feature_list
feature_list = []
new_feature = []
continue
# if there are a word in qualifier indent (feature type)
# maybe we need to infer the size of indentation ???
if line[0:20].split():
if new_feature:
feature_list.append(new_feature)
split_line = line.split()
t = split_line[0]
# Handle :
# complement(order(1449596..1449640,1449647..1450684,
# 1450695..1450700))
positions = split_line[1]
if positions[0].isalpha():
while not line[:-1].endswith(")"):
line = next(fp)
positions += line
pos = [int(n) for n in re.findall(r"\d+", positions)]
# Handle complement(join(3773333..3774355,3774357..3774431))
start = pos[0]
end = pos[-1]
strand = "-" if split_line[1].startswith("c") else "+"
new_feature = {
"type": t,
"gene_start": start,
"gene_end": end,
"strand": strand,
}
# recover qualifier bound with feature
else:
quali_line = line.strip().replace('"', "")
if quali_line.startswith("/") and "=" in quali_line:
qualifier = quali_line.split("=")
key = qualifier[0][1:]
new_feature[key] = qualifier[1]
else:
if key == "translation":
new_feature[key] += quali_line
else:
new_feature[key] += " " + quali_line
return records