Source code for sequana.gff3

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
import os
from collections import defaultdict

import colorlog

from sequana.errors import BadFileFormat

logger = colorlog.getLogger(__name__)

from sequana.lazy import pandas as pd
from sequana.lazy import pysam

__all__ = ["GFF3"]


[docs]class GFF3:
    """Read a GFF file, version 3


    .. seealso:: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md


    ::

        g = GFF3(filename)
        # first call is slow
        g.df
        # print info about the different feature types
        g.features
        # prints info about duplicated attributes:
        g.get_duplicated_attributes_per_genetic_type(self)

    On eukaryotes, the reading and processing of the GFF may take a while.
    On prokaryotes, it should be pretty fast (a few seconds).
    To speed up the eukaryotes case, we skip the processing biological_regions
    (50% of the data in mouse).

    """

    def __init__(self, filename, skip_types=["biological_region"]):
        self.filename = filename
        self.skip_types = skip_types
        self._df = None
        self._features = None
        self._attributes = None

    def _get_features(self):
        """Extract unique GFF feature types

        This is equivalent to awk '{print $3}' | sort | uniq to extract unique GFF
        types. No sanity check, this is suppose to be fast.

        Less than a few seconds for mammals.
        """
        # This is used by the rnaseq pipeline and should be kept fast
        count = 0
        if self._features:
            features = self._features
        else:
            features = set()
            with open(self.filename, "r") as reader:
                for line in reader:
                    # Skip metadata and comments
                    if line.startswith("#"):
                        continue
                    # Skip empty lines
                    if not line.strip():  # pragma: no cover
                        continue
                    split = line.rstrip().split("\t")
                    L = len(split)
                    if L == 9:
                        features.add(split[2])
                    count += 1
                # FIXME may be overwritten by get_df
                self._features = features
        return sorted(features)

    features = property(_get_features)

[docs]    def get_attributes(self, feature=None, sep="; "):
        """Return list of possible attributes

        If feature is provided, must be valid and used as a filter
        to keep only entries for that feature.

        ~10 seconds on mouse genome GFF file.

        sep must be "; " with extra space to cope with special cases
        where an attribute has several entries separated by ; e.g.:

            BP="GO:0006412"; MF="GO:0005524;GO:0004004;"

        """
        # This is used by the rnaseq pipeline and should be kept fast
        if self._attributes:
            return self._attributes

        attributes = set()
        with open(self.filename, "r") as reader:
            for line in reader:
                # stop once FASTA starts
                if line.startswith("##FASTA"):
                    break
                # Skip metadata and comments and empty lines
                if line.startswith("#") or not line.strip():  # pragma: no cover
                    continue

                split = line.rstrip().split("\t")
                if feature and split[2] != feature:
                    continue

                for item in split[8].split(sep):
                    item = item.strip()
                    if len(item) == 0:  # empty final string #pragma: no cover
                        continue

                    # Here, some GFF use = some others use spaces... very
                    # annoying.
                    if "=" in item:
                        item = item.split("=")[0].strip()
                    else:
                        item = item.split()[0].strip()
                    attributes.add(item)

        self._attributes = sorted(attributes)
        return self._attributes

    attributes = property(get_attributes)

[docs]    def read(self):
        """Read annotations one by one creating a generator"""

        self._features = set()

        with open(self.filename, "r") as reader:
            line = None
            for line in reader:
                # stop once FASTA starts
                if line.startswith("##FASTA"):
                    break

                # Skip metadata and comments
                if line.startswith("#"):
                    continue

                # Skip empty lines
                if not line.strip():
                    continue

                # Format checking. skip rows that do not have 9 columns since
                # it is comments or fasta sequence
                split = line.rstrip().split("\t")
                if len(split) != 9:
                    continue

                # skipping  biological_region saves lots of time
                if split[2].strip() in self.skip_types:
                    continue

                # we process main fields and attributes. This takes most of the
                # time
                self._features.add(split[2])

                # the main first 8 fields
                annotation = self._process_main_fields(split[0:8])

                # all attributes as key/values added to all annotations.
                annotation["attributes"] = self._process_attributes(split[8])
                annotation.update(annotation["attributes"])

                yield annotation

    def _get_df(self):
        if self._df is not None:
            return self._df

        logger.info("Processing GFF file. 1. Reading the input file. Please be patient")
        # ~ 30 seconds on mouse
        df = pd.DataFrame(self.read())

        self._df = df
        return self._df

    df = property(_get_df)

[docs]    def get_duplicated_attributes_per_genetic_type(self):
        results = {}
        for typ in self.features:
            results[typ] = {}
            print("{}: {} entries".format(typ, len(self.df.query("genetic_type==@typ"))))
            for attr in sorted(self.attributes):
                L = len(self.df.query("genetic_type==@typ")[attr].dropna())
                dups = self.df.query("genetic_type==@typ")[attr].dropna().duplicated().sum()
                if dups > 0:
                    print(f"  - {attr}:{dups} duplicates ({L} in total)")
                else:
                    print(f"  - {attr}:No duplicates ({L} in total)")
                results[typ][attr] = dups

        df = pd.DataFrame(results)
        return df

[docs]    def transcript_to_gene_mapping(self, feature="all", attribute="transcript_id"):
        """

        :param feature: not used yet
        :param attribute: the attribute to be usde. should be transcript_id for
            salmon compatability but could use soething different.
        """
        # entries may have transcripts set to None
        transcripts = [x for x in self.df[attribute] if x]

        # retrieve only the data with transcript id defined
        transcripts_df = self.df.set_index(attribute)
        transcripts_df = transcripts_df.loc[transcripts]
        transcripts_df = transcripts_df.reset_index()

        results = {}

        results2 = defaultdict(list)
        for _id, data in transcripts_df[["ID", "Parent"]].iterrows():
            results[data.values[0]] = data.values[1]
            results2[data.values[1]].append(data.values[0])

        return results, results2

[docs]    def save_annotation_to_csv(self, filename="annotations.csv"):
        self.df.to_csv(filename, index=False)

[docs]    def read_and_save_selected_features(self, outfile, features=["gene"]):
        count = 0
        with open(self.filename, "r") as fin, open(outfile, "w") as fout:
            for line in fin:
                # stop once FASTA starts
                if line.startswith("##FASTA"):
                    break
                split = line.rstrip().split("\t")
                # skipping  biological_region saves lots of time
                try:
                    if split[2].strip() in features:
                        fout.write(line)
                        count += 1
                except IndexError:
                    pass
        logger.info(f"Found {count} entries and saved into {outfile}")

[docs]    def save_gff_filtered(self, filename="filtered.gff", features=["gene"], replace_seqid=None):
        """

        save_gff_filtered("test.gff", features=['misc_RNA', 'rRNA'],
                replace_seqid='locus_tag')
        """
        with open(filename, "w") as fout:
            fout.write("#gff-version 3\n#Custom gff from sequana\n")
            count = 0
            from collections import defaultdict

            counter = defaultdict(int)
            for x, y in self.df.iterrows():
                if y["genetic_type"] in features:
                    if replace_seqid:
                        y["seqid"] = y["attributes"][replace_seqid]
                    fout.write(
                        "{}\tfeature\tcustom\t{}\t{}\t.\t{}\t{}\t{}\n".format(
                            y["seqid"],
                            y["start"],
                            y["stop"],
                            y["strand"],
                            y["phase"],
                            ";".join([f"{a}={b}" for a, b in y["attributes"].items()]),
                        )
                    )
                    counter[y["genetic_type"]] += 1
                    count += 1
            logger.info("# kept {} entries".format(count))
            for feature in features:
                counter[feature] += 0
                logger.info("# {}: {} entries".format(feature, counter[feature]))

    def _process_main_fields(self, fields):
        annotation = {}

        # Unique id of the sequence
        annotation["seqid"] = fields[0]

        # Optional source
        if fields[1] != ".":
            annotation["source"] = fields[1]

        # Annotation type
        annotation["genetic_type"] = fields[2]

        # Start and stop
        annotation["start"] = int(fields[3])
        annotation["stop"] = int(fields[4])

        # Optional score field
        if fields[5] != ".":
            annotation["score"] = float(fields[5])

        # Strand
        if fields[6] == "+" or fields[6] == "-" or fields[6] == "?" or fields[6] == ".":
            annotation["strand"] = fields[6]

        # Phase
        if fields[7] != ".":
            annotation["phase"] = int(fields[7]) % 3
        else:
            annotation["phase"] = fields[7]
        return annotation

    def _process_attributes(self, text):
        attributes = {}

        # some GFF/GTF use different conventions:
        # - "ID=1;DB=2"     this is the standard
        # - "ID 1;DB 2"     some gtf uses spaces but should be fine
        # - "ID=1;DB=2;Note=some text with ; character " worst case scenario
        # In the later case, there is no easy way to fix this. I believe this is
        # a non-compatible GFF file.

        # we first figure out whether this is a = or space convention
        sep = None
        text = text.strip()
        for x in text:
            if x in ["=", " "]:
                sep = x
                break

        if sep is None:
            logger.error(f"Your GFF/GTF does not seem to be correct ({text}). Expected a = or space as separator")
            sys.exit(1)

        # ugly but fast replacement.
        text = text.replace("%09", "\t").replace("%0A", "\n").replace("%0D", "\r")
        text = text.replace("%25", "%").replace("%3D", "=").replace("%26", "&").replace("%2C", ",")
        text = text.replace("%28", "(").replace("%29", ")")  # brackets
        # we do not convert the special %3B into ;  or %20 into spaces for now

        # split into mutliple attributes
        # GTF ends in ; so we need to strip it
        split = text.rstrip(";")

        # some GFF uses ; and a space as separator but usually, it is just ; with no surrounding spaces.
        if "; " in split:
            split = text.split("; ")
        else:
            split = text.split(";")

        for attr in split:
            # make sure there is no trailing spaces
            attr = attr.strip()

            # find the separator. Sometimes it is spaces, sometimes a = sign
            idx = attr.find(sep)
            value = attr[idx + 1 :]

            # replace " by nothing (GTF case)
            attributes[attr[:idx]] = value.replace('"', "").replace("%3B", ";").replace("%20", " ")
        return attributes

[docs]    def to_gtf(self, output_filename="test.gtf", mapper={"ID": "{}_id"}):
        # experimental . used by rnaseq pipeline to convert input gff to gtf,
        # used by RNA-seqc tools

        fout = open(output_filename, "w")

        with open(self.filename, "r") as reader:
            for line in reader:
                # stop once FASTA starts
                if line.startswith("##FASTA"):
                    break
                # Skip metadata and comments
                if line.startswith("#"):
                    fout.write(line)
                    continue
                # Skip empty lines
                if not line.strip():  # pragma: no cover
                    continue
                split = line.rstrip().split("\t")
                L = len(split)

                name = split[0]
                source = split[1]
                feature = split[2]
                start = split[3]
                stop = split[4]
                a = split[5]
                strand = split[6]
                b = split[7]
                attributes = split[8]

                new_attributes = ""
                for item in attributes.split(";"):
                    try:
                        key, value = item.split("=")
                        if key in mapper.keys():
                            key = mapper[key].format(feature)
                        new_attributes += '{} "{}";'.format(key, value)
                    except:
                        pass

                # Here we need some cooking due to gtf/gff clumsy conventiom
                # 1. looks like attributes' values must have "" surrounding their content
                # 2. if feature is e.g. exon, then gtf expects the exon_id attribute
                msg = f"{name}\t{source}\t{feature}\t{start}\t{stop}\t{a}\t{strand}\t{b}\t{new_attributes}\n"
                fout.write(msg)

        fout.close()

[docs]    def to_fasta(self, ref_fasta, fasta_out):
        """From a genomic FASTA file ref_fasta, extract regions stored in the
        gff. Export the corresponding regions to a FASTA file fasta_out.

        :param ref_fasta: path to genomic FASTA file to extract rRNA regions from.
        :param fasta_out: path to FASTA file where rRNA regions will be exported to.
        """

        count = 0

        with pysam.Fastafile(ref_fasta) as fas:
            with open(fasta_out, "w") as fas_out:
                for region in self.df.to_dict("records"):
                    name = f"{region['seqid']}:{region['start']}-{region['stop']}"
                    seq_record = f">{name}\n{fas.fetch(region=name)}\n"
                    fas_out.write(seq_record)
                    count += 1

        logger.info(f"{count} regions were extracted from '{ref_fasta}' to '{fasta_out}'")

[docs]    def to_bed(self, output_filename, attribute_name):
        """Experimental export to BED format to be used with rseqc scripts

        :param str attribute_name: the attribute_name name to be found in the
            GFF attributes
        """

        # rseqc expects a BED12 file. The format is not clear from the
        # documentation. The first 6 columns are clear (e.g., chromosome name
        # positions, etc) but last one are not. From the examples, it should be
        # block sizes, starts of the transcript but they recommend bedops
        # gff2bed tool that do not extract such information. For now, for
        # prokaryotes, the block sizes version have been implemented and worked
        # on a leptospira example.
        fout = open(output_filename, "w")
        with open(self.filename, "r") as reader:
            for line in reader:
                # stop once FASTA starts
                if line.startswith("##FASTA"):
                    break
                # Skip metadata and comments
                if line.startswith("#"):
                    continue
                # Skip empty lines
                if not line.strip():  # pragma: no cover
                    continue

                # a line is read and split on tabulations
                split = line.rstrip().split("\t")

                chrom_name = split[0]
                # source = split[1]    #keep this code commented for book-keeping
                feature = split[2]
                gene_start = int(split[3])
                gene_stop = int(split[4])
                cds_start = gene_start  # for prokaryotes, for now cds=gene
                cds_stop = gene_stop
                a = split[5]  # not used apparently
                strand = split[6]
                b = split[7]  # not used apparently
                attributes = split[8]  # may be required for eukaryotes

                score = 0  # in examples for rseqc, the score is always zero
                unknown = 0  # a field not documented in rseqc
                block_count = 1
                block_sizes = f"{cds_stop-cds_start},"  # fixme +1 ?
                block_starts = "0,"  # commas are important at the end. no spaces
                # according to rseqc (bed.py) code , the expected bed format is
                # chrom, chrom_start, chrom_end, gene name, score, strand, cdsStart, cdsEnd,
                # blockcount, blocksizes, blockstarts where blocksizes and blocks
                # starts are comma separated list. Here is a line example on
                # human:
                # chr1	1676716 1678658 NM_001145277 0 +    1676725 1678549 0 4	182,101,105, 0,2960,7198

                # for now only the feature 'gene' is implemented. We can
                # generalize this later on.
                if feature == "gene":
                    gene_name = None
                    for item in attributes.split(";"):
                        if item.split("=")[0].strip() == attribute_name:
                            gene_name = item.split("=")[-1]
                    assert gene_name
                    # should be the cds start/stop but for now we use the gene
                    # info start/stop
                    msg = f"{chrom_name}\t{gene_start}\t{gene_stop}\t{gene_name}\t{score}\t{strand}\t{cds_start}\t{cds_stop}\t{unknown}\t{block_count}\t{block_sizes}\t{block_starts}\n"
                    fout.write(msg)

        fout.close()

[docs]    def clean_gff_line_special_characters(self, text):
        """Simple leaner of gff lines that may contain special characters"""
        text = text.replace("%09", "\t").replace("%0A", "\n").replace("%0D", "\r")
        text = text.replace("%25", "%").replace("%3D", "=").replace("%26", "&").replace("%2C", ",")
        text = text.replace("%28", "(").replace("%29", ")")  # brackets
        return text

[docs]    def get_simplify_dataframe(self):
        """Method to simplify the gff and keep only the most informative features."""
        # Set weight for genetic type to sort them and keep only the most informative
        if self.df.empty:
            raise BadFileFormat("%s file is not a GFF3.", self.filename)
        genetype = ["tRNA", "rRNA", "ncRNA", "CDS", "exon", "gene", "tRNA"]
        worst_score = len(genetype) + 1
        weight = {k: i for i, k in enumerate(genetype)}
        # Note seems optional

        tokeep = [
            x
            for x in [
                "seqid",
                "genetic_type",
                "start",
                "stop",
                "strand",
                "gene",
                "gene_id",
                "gene_name",
                "locus_tag",
                "Note",
                "product",
            ]
            if x in self.df.columns
        ]

        df = self.df.filter(tokeep, axis=1)

        # remove region and chromosome row
        df = df.drop(df.loc[df.genetic_type.isin({"region", "chromosome"})].index)
        try:
            df["gene"] = df["gene"].fillna(df.locus_tag)
        except (KeyError, AttributeError):
            pass
        df["score"] = [weight.get(g_t, worst_score) for g_t in df.genetic_type]
        # keep most informative features if on the same region
        best_idx = df.groupby(["seqid", "start", "stop"])["score"].idxmin()
        return df.loc[best_idx].reset_index(drop=True)

[docs]    def get_features_dict(self):
        """Format feature dict for sequana_coverage."""
        df = self.get_simplify_dataframe()
        # rename column to fit for sequana_coverage
        df = df.set_index("seqid").rename(columns={"start": "gene_start", "stop": "gene_end", "genetic_type": "type"})
        return {chr: df.loc[chr].to_dict("records") for chr in df.index.unique()}