Source code for sequana.repeats.palindromes
"""Reverse-complement palindrome detection.
A DNA *palindrome* is a sequence equal to its own reverse complement (e.g.
``GAATTC``). Such palindromes always have an even length: the central base of
an odd-length window would have to be its own complement, which never happens
for A/C/G/T. Only even window sizes are therefore scanned.
"""
import pandas as pd
from tqdm import tqdm
from sequana import FastA
from sequana.tools import reverse_complement
_COLS = ["seqid", "start", "end", "length", "sequence"]
[docs]
class Palindromes:
def __init__(self, fasta_file, min_len=4, max_len=12):
self.fasta_file = fasta_file
self.min_len = min_len
self.max_len = max_len
self.df = pd.DataFrame(columns=_COLS)
[docs]
def is_palindrome(self, seq):
# Upper-case first: the complement table preserves case, so a
# soft-masked (lower-case) palindrome would otherwise be missed.
seq = seq.upper()
return seq == reverse_complement(seq)
[docs]
def run(self):
fa = FastA(self.fasta_file)
results = []
for name, sequence in zip(tqdm(fa.names), fa.sequences):
sequence = sequence.upper()
seq_len = len(sequence)
# Even sizes only: odd-length windows can never be palindromes.
start = self.min_len + (self.min_len % 2)
for size in range(start, self.max_len + 1, 2):
for i in range(seq_len - size + 1):
subseq = sequence[i : i + size]
if subseq == reverse_complement(subseq):
results.append({"seqid": name, "start": i, "end": i + size, "length": size, "sequence": subseq})
self.df = pd.DataFrame(results, columns=_COLS)
[docs]
def to_bed(self, output_file):
if self.df.empty:
raise ValueError("No data. Run `.run()` first.")
bed = self.df[["seqid", "start", "end"]].copy()
bed["name"] = self.df["sequence"]
bed["score"] = 0
bed["strand"] = "+"
bed.to_csv(output_file, sep="\t", header=False, index=False)