Source code for sequana.gtf_fixer
#
# This file is part of Sequana software
#
# Copyright (c) 2016-2022 - Sequana Development Team
#
# Distributed under the terms of the 3-clause BSD license.
# The full license is in the LICENSE file, distributed with this software.
#
# website: https://github.com/sequana/sequana
# documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Utilities for GTF files"""
from collections import defaultdict
import colorlog
logger = colorlog.getLogger(__name__)
[docs]
class GTFFixer:
"""Some GTF have syntax issues. This class should fix some of them
With exons, we want to have unique IDs.
We append the transcrpt version and ID to the exon ID
"""
def __init__(self, filename):
self.filename = filename
[docs]
def fix(self, output):
features = defaultdict(int)
logger.info("Scanning file")
count = 0
with open(self.filename, "r") as fin:
with open(output, "w") as fout:
line = fin.readline()
count += 1
while line:
if line.startswith("#") or len(line.strip()) == 0:
fout.write(line)
else:
entries = line.split(maxsplit=8)
features[entries[2]] += 1
try:
# sanity check that this is correctly written for all
# features
annotations = dict(
[x.strip().split(maxsplit=1) for x in entries[8].split(";") if len(x.strip())]
)
except IndexError:
logger.warning("warning line {}. could not parse annotations correctly".format(count))
# if exon feature, we want to make sure the ID are unique by
# appending the transcript_version if not already done
if entries[2] == "exon" and "." not in annotations["exon_id"]:
logger.info("Fixing exon ID line {} for exon ID {}".format(count, annotations["exon_id"]))
exon_id = "{}.{}.{}".format(
annotations["exon_id"].rstrip('"'),
annotations["transcript_version"].strip('"'),
annotations["transcript_id"].lstrip('"'),
)
newline = "\t".join(entries[0:8]) + "\t"
# we do not resuse the dictionaru 'annotations' just to
# keep the order
for entry in entries[8].split(";"):
if len(entry.strip()):
x, y = entry.split(maxsplit=1)
if x.strip() != "exon_id":
newline += "{} {};".format(x, y)
else:
newline += "exon_id {};".format(exon_id)
newline += ";\n"
line = newline
# ugly but fast replacement.
line = line.replace("%09", "\t").replace("%0A", "\n").replace("%0D", "\r")
line = line.replace("%25", "%").replace("%3D", "=").replace("%26", "&").replace("%2C", ",")
line = line.replace("%28", "(").replace("%29", ")") # brackets
fout.write(line)
line = fin.readline()
count += 1
if count % 100000 == 0:
print("scanned {} entries".format(count))
return features