Source code for sequana.blast

#!/usr/bin/env python3
import sys

from sequana.lazy import pandas as pd



[docs]
class BLAST:
    # blast is tricky since input will differ depending on command
    # line format. here we hard-coded columns used in one of our paper.

    def __init__(self, filename):

        self.filename = filename

        self.columns = [
            "qseqid",
            "sseqid",
            "pident",
            "length",
            "mismatch",
            "gapopen",
            "qstart",
            "qend",
            "sstart",
            "send",
            "evalue",
            "bitscore",
            "taxids",
            "stitle",
            "ssciname",
        ]


[docs]
    def scan(self, columns=None):

        if columns is None:
            columns = self.columns
        df = pd.read_csv(self.filename, sep="\t", header=None, names=columns)

        # cast the taxids column is a string
        df["taxids"] = df["taxids"].astype(str)

        Ntotal = len(df)
        len_diff = (Ntotal - len(df["qseqid"].unique())) / 100

        # Percentage of unclassified reads not in blast
        print(f"percentage of input reads not classified by Blast: {len_diff}% ")

        # keep only the first taxids if there are several in the same row knowing
        # they belong to the same lineage
        # df["taxids"] = df["taxids"].str.partition(";")[0]

        # Distribution of the bitscores for each read
        # print("Distribution des bitscores pour chaque read:")
        """df_2 = pd.crosstab(df["qseqid"], df["bitscore"])

        # Keeping only the best bitscore for each read
        qseqid_set = set(df["qseqid"])
        for qseqid in qseqid_set:
            bitscore_max = df.loc[(df["qseqid"] == str(qseqid))]["bitscore"].max()
            df.drop(
                df[
                    ((df["qseqid"] == str(qseqid)) & (df["bitscore"] < int(bitscore_max)))
                ].index,
                inplace=True,
            )
        """
        return df



[docs]
    def best_hit_per_query(self, df):
        """Return one best hit per ``qseqid``.

        Priority:

        1. lowest evalue
        2. highest bitscore
        """

        # Ensure numeric types
        df = df.copy()
        df["evalue"] = pd.to_numeric(df["evalue"])
        df["bitscore"] = pd.to_numeric(df["bitscore"])

        # Sort so best hit comes first
        df = df.sort_values(by=["qseqid", "evalue", "bitscore"], ascending=[True, True, False])

        # Keep first hit per query
        best = df.groupby("qseqid", as_index=False).first()

        return best





[docs]
def blast_to_gff(blast_file, gff_file):
    """input if blast results with outfmt=6"""
    with open(blast_file) as fin, open(gff_file, "w") as fout:
        fout.write("##gff-version 3\n")
        for line in fin:
            if line.startswith("#") or not line.strip():
                continue
            (
                qseqid,
                sseqid,
                pident,
                length,
                mismatch,
                gapopen,
                qstart,
                qend,
                sstart,
                send,
                evalue,
                bitscore,
            ) = line.strip().split("\t")

            sstart, send = int(sstart), int(send)
            start = min(sstart, send)
            end = max(sstart, send)
            strand = "+" if sstart < send else "-"

            # Write a GFF3 "match" feature
            fout.write(
                f"{sseqid}\ttblastn\tmatch\t{start}\t{end}\t{bitscore}\t{strand}\t.\t"
                f"ID={qseqid}_{sseqid}_{start}_{end};Name={qseqid};Evalue={evalue};Bitscore={bitscore}\n"
            )



if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: blast_to_gff3.py results.blastn output.gff3")
        sys.exit(1)

    blast_to_gff(sys.argv[1], sys.argv[2])