# Source code for sequana.zdna

import re

import pandas as pd

from sequana import FastA



# [docs]
class ZDNA:
    """Find tandem repeats of a short motif in the sequences of a FASTA file.

    The motif is upper-cased and compiled into a regular expression that
    matches at least ``min_repeats`` consecutive copies of it (``(CG){6,}``
    with the defaults). :meth:`run` scans every sequence and stores the hits
    in :attr:`df`; :meth:`to_bed` exports them as 6-column, tab-separated
    BED lines.

    Matching is case-sensitive: the motif is upper-cased but the sequences
    are searched as they are, so lower-case bases are never matched.

    .. note:: An earlier version of this docstring read
        ``Support multiple motifs: ["CG", "CA", "GT"]``. Only a single motif
        string is handled by this implementation (a list has no ``upper()``
        method and is rejected at construction time).

    :param fasta_file: FASTA file, handed unchanged to ``FastA`` by :meth:`run`.
    :param motif: motif to search for; converted to upper case.
    :param min_repeats: minimum number of consecutive motif copies per hit.
    """

    # Column layout of ``df``; shared by ``__init__`` and ``run`` so that the
    # frame has the same schema whether or not any hit was found.
    _COLUMNS = ("seqid", "start", "end", "length", "sequence")

    def __init__(self, fasta_file, motif="CG", min_repeats=6):
        self.fasta_file = fasta_file
        self.motif = motif.upper()
        self.min_repeats = min_repeats
        # Doubled braces are literal braces in an f-string: "(CG){6,}".
        self.pattern = re.compile(f"({self.motif}){{{self.min_repeats},}}")
        self.df = pd.DataFrame(columns=list(self._COLUMNS))

    def run(self):
        """Scan every sequence of the FASTA file and store the hits in ``df``.

        Each row holds the sequence name (``seqid``), the 0-based ``start``
        and exclusive ``end`` of the match as reported by the regular
        expression, the match ``length`` and the matched ``sequence``.
        Any previous content of ``df`` is replaced. When nothing matches,
        ``df`` is empty but keeps its columns.
        """
        fa = FastA(self.fasta_file)
        results = []

        # Walk names and sequences in lockstep: this avoids a linear
        # ``names.index(name)`` search per record and keeps the right
        # sequence for records that share a name.
        for name, sequence in zip(fa.names, fa.sequences):
            for match in self.pattern.finditer(sequence):
                start, end = match.span()
                results.append(
                    {
                        "seqid": name,
                        "start": start,
                        "end": end,
                        "length": end - start,
                        "sequence": match.group(),
                    }
                )

        self.df = pd.DataFrame(results, columns=list(self._COLUMNS))

    def to_bed(self, output_file, append=True, mode="a"):
        """Write the hits stored in ``df`` as tab-separated BED lines.

        The columns written are seqid, start, end, name (the matched
        sequence), score (always 0) and strand (always ``+``); no header
        and no index are written.

        :param output_file: destination passed to ``DataFrame.to_csv``.
        :param append: if False and ``mode`` is left at its default, the
            file is overwritten instead of appended to.
        :param mode: file mode passed to ``DataFrame.to_csv``; an explicit
            value other than ``"a"`` is always used as given.
        :raises ValueError: if ``df`` is empty, i.e. :meth:`run` was not
            called or it found no match.
        """
        if self.df.empty:
            raise ValueError("Run `.run()` first.")

        # ``append`` used to be ignored; honour it without overriding a mode
        # the caller chose explicitly.
        if not append and mode == "a":
            mode = "w"

        bed = self.df[["seqid", "start", "end"]].copy()
        bed["name"] = self.df["sequence"]
        bed["score"] = 0
        bed["strand"] = "+"
        bed.to_csv(output_file, sep="\t", header=False, index=False, mode=mode)