Source code for sequana.hmmtools

from io import StringIO

import pandas as pd
from tqdm import tqdm

from sequana import GFF3



[docs]
class PfamDomtblout:
    """

    parser for the output of hmmscan (domtblout format)

    ::

        hmmscan --cpu 8 --domtblout pfam.domtblout Pfam-A.hmm temp.faa > pfam.hmmscan.txt

    """

    def __init__(self, filepath):
        self.filepath = filepath
        self.df = None


[docs]
    def read(self):
        col_names = [
            "target_name",
            "target_accession",
            "tlen",
            "query_name",
            "query_accession",
            "qlen",
            "full_evalue",
            "full_score",
            "full_bias",
            "dom_num",
            "dom_total",
            "c_evalue",
            "i_evalue",
            "dom_score",
            "dom_bias",
            "hmm_start",
            "hmm_end",
            "ali_start",
            "ali_end",
            "env_start",
            "env_end",
            "acc",
            "description",
        ]

        def read_custom_space_file(filepath):
            data = []
            with open(filepath) as f:
                for line in f:
                    if line.startswith("#"):
                        continue
                    parts = line.strip().split()
                    fixed = parts[:22]
                    description = " ".join(parts[22:])
                    data.append(fixed + [description])
            return pd.DataFrame(data)

        self.df = read_custom_space_file(self.filepath)
        self.df.columns = col_names



[docs]
    def to_gff(self, output_file, augustus_gff=None, best_hit=True):
        if self.df is None:
            raise ValueError("Call read() before to_gff()")

        if augustus_gff:
            gff_aug = GFF3(augustus_gff)

        if best_hit:
            df = self.df.loc[self.df.groupby("query_name")["i_evalue"].idxmin().dropna()]
        else:
            df = self.df

        with open(output_file, "w") as gff:
            gff.write("##gff-version 3\n")
            for _, row in tqdm(df.iterrows()):

                gff_fields = {
                    "seqid": row["target_name"],
                    "source": "Pfam",
                    "type": "protein_domain",
                    "start": int(row["ali_start"]),
                    "end": int(row["ali_end"]),
                    "score": f"{row['i_evalue']}",
                    "strand": ".",
                    "phase": ".",
                    "attributes": f"ID={row['query_name']};Name={row['description'].replace(' ', '_')};Pfam={row['query_accession']}",
                }

                # populate seqid, start and stop with information from the GFF file.
                ID = row["query_name"]
                if augustus_gff:
                    subdf = gff_aug.df.query("ID==@ID")
                    seqid = subdf.seqid.values[0]
                    start = subdf.start.values[0] + int(row["ali_start"])
                    stop = subdf.start.values[0] + int(row["ali_end"])
                    gff_fields["seqid"] = seqid
                    gff_fields["start"] = start
                    gff_fields["end"] = stop

                gff_line = "\t".join(
                    str(gff_fields[k])
                    for k in ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]
                )
                gff.write(gff_line + "\n")



[docs]
    def annotate_gff(self, augustus_gff, out_gff, key="pfam"):
        """add pfam+description into original GFF file"""
        # group by name, pick up the best hit
        df = self.df.loc[self.df.groupby("query_name")["i_evalue"].idxmin().dropna()]
        self.df_temp = df
        from tqdm import tqdm

        with open(augustus_gff, "r") as fin:
            with open(out_gff, "w") as fout:
                for line in tqdm(fin.readlines()):
                    items = line.split("\t")
                    if len(items) == 9:
                        if items[2] == "gene":
                            ID = items[-1].strip().split("=")[-1]
                            ID = ID + ".t1"
                            try:
                                acc = df.query("query_name==@ID").target_accession.values[0]
                                desc = df.query("query_name==@ID").description.values[0]
                            except IndexError:
                                acc = "none"
                                desc = "none"
                            fout.write(line.strip() + f";annot={desc};{key}{acc}\n")
                        else:
                            fout.write(line)
                    else:
                        fout.write(line)