# Source code for sequana.ribodesigner

#  This file is part of Sequana software
#  Copyright (c) 2016-2020 - Sequana Development Team
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
"""Ribodesigner module"""
import datetime
import json
import shutil
import subprocess
import sys
from itertools import product
from pathlib import Path

from sequana import logger
from sequana.fasta import FastA
from sequana.lazy import numpy as np
from sequana.lazy import pandas as pd
from sequana.lazy import pylab, pysam
from sequana.tools import reverse_complement


class RiboDesigner(object):
    """Design probes for ribosomes depletion.

    From a complete genome assembly FASTA file and a GFF annotation file:

    - Extract genomic sequences corresponding to the selected ``seq_type``.
    - For these selected sequences, design probes computing probe length and
      inter probe space according to the length of the ribosomal sequence.
    - Detect the highest cd-hit-est identity threshold where the number of
      probes is inferior or equal to ``max_n_probes``.
    - Report the list of probes in BED and CSV files. In the CSV, the oligo
      names are in column 1 and the oligo sequences in column 2.

    :param fasta: The FASTA file with complete genome assembly to extract ribosome sequences from.
    :param gff: GFF annotation file of the genome assembly. If falsy, :meth:`run`
        uses ``fasta`` directly as the set of ribosomal sequences.
    :param output_directory: The path to the output directory.
    :param seq_type: string describing sequence annotation type (column 3 in GFF) to select rRNA from.
    :param max_n_probes: Max number of probes to design
    :param force: If the `output_directory` already exists, overwrite it.
    :param threads: Number of threads to use in cd-hit clustering.
    :param float identity_step: step to scan the sequence identity (between 0 and 1) defaults to 0.01.
    :param kwargs: accepted for call-site compatibility; not used by this class.
    """

    # Successively wider (probe length, inter-probe space) search spaces used by
    # the greedy method. Each entry is scanned in order, longest values first.
    _GREEDY_SEARCH_SPACES = (
        (range(60, 40, -1), range(20, 10, -1)),
        (range(70, 40, -1), range(30, 10, -1)),
        (range(80, 40, -1), range(35, 10, -1)),
    )

    def __init__(
        self,
        fasta,
        gff,
        output_directory,
        seq_type="rRNA",
        max_n_probes=384,
        force=False,
        threads=4,
        identity_step=0.01,
        **kwargs,
    ):
        # Input
        self.fasta = fasta
        self.gff = gff
        self.seq_type = seq_type
        self.max_n_probes = max_n_probes
        self.threads = threads
        self.outdir = Path(output_directory)
        self.identity_step = identity_step

        if force:
            self.outdir.mkdir(exist_ok=True)
        else:
            try:
                self.outdir.mkdir()
            except FileExistsError:  # pragma: no cover
                logger.error(f"Output directory {output_directory} exists. Use --force or set force=True")
                sys.exit(1)

        # Output
        self.filtered_gff = self.outdir / "ribosome_filtered.gff"
        self.ribo_sequences_fasta = self.outdir / "ribosome_sequences.fas"
        self.probes_fasta = self.outdir / "probes_sequences.fas"
        self.clustered_probes_fasta = self.outdir / "clustered_probes.fas"
        self.clustered_probes_csv = self.outdir / "clustered_probes.csv"
        self.clustered_probes_bed = self.outdir / "clustered_probes.bed"
        self.output_json = self.outdir / "ribodesigner.json"
        # Summary written by export_to_json(); cluster_probes() adds more keys.
        self.json = {
            "max_n_probes": max_n_probes,
            "identity_step": identity_step,
            "feature": seq_type,
        }

    def get_rna_pos_from_gff(self):
        """Extract the sequences annotated as ``self.seq_type`` from the genome.

        The GFF file is read into a pandas DataFrame and filtered on its third
        column. Each selected region is fetched from ``self.fasta`` and written
        to ``self.ribo_sequences_fasta``; the filtered annotation is saved to
        ``self.filtered_gff``.
        """
        gff = pd.read_csv(
            self.gff,
            sep="\t",
            comment="#",
            names=[
                "seqid",
                "source",
                "seq_type",
                "start",
                "end",
                "score",
                "strand",
                "phase",
                "attributes",
            ],
        )
        filtered_gff = gff.loc[gff["seq_type"] == self.seq_type]

        with pysam.Fastafile(self.fasta) as fas, open(self.ribo_sequences_fasta, "w") as fas_out:
            for row in filtered_gff.itertuples():
                region = f"{row.seqid}:{row.start}-{row.end}"
                fas_out.write(f">{region}\n{fas.fetch(region=region)}\n")

        # Rows with fewer than 3 columns yield NaN feature types; drop them so
        # that the join below cannot fail on a non-string value.
        seq_types = gff.seq_type.dropna().astype(str).unique().tolist()

        logger.info(f"Genetic types found in gff: {','.join(seq_types)}")
        logger.info(f"Found {filtered_gff.shape[0]} '{self.seq_type}' entries in the annotation file.")
        logger.debug("\t" + filtered_gff.to_string().replace("\n", "\n\t"))
        filtered_gff.to_csv(self.filtered_gff)

    @staticmethod
    def _short_sequence_params(seq_len):
        """Return the fixed (probe_len, step_len) used for sequences under 100 bases, else None."""
        if seq_len < 50:
            return 50, 15
        if seq_len < 100:
            return 40, 10
        return None

    @staticmethod
    def _first_exact_tiling(seq_len, candidates):
        """Return the first (probe_len, inter_probe_space) that tiles ``seq_len`` exactly.

        A pair is valid when ``n`` probes separated by ``n - 1`` spaces cover
        the sequence end to end, i.e. when
        ``(seq_len + inter_probe_space) / (probe_len + inter_probe_space)`` is
        an integer.

        :param seq_len: length of the ribosomal sequence.
        :param candidates: iterable of (probe_len, inter_probe_space) pairs,
            scanned in order.
        :return: the first valid pair as a tuple of ints, or None.
        """
        for probe_len, space in candidates:
            if (seq_len + space) % (probe_len + space) == 0:
                return int(probe_len), int(space)
        return None

    @staticmethod
    def _spiral_positions(X, Y, x0, y0):
        """Return grid points visited by a square spiral, shifted by an offset.

        The walk starts at (0, 0), runs for ``max(X, Y) ** 2`` steps and keeps
        the points with ``-X/2 - 1 < x <= X/2`` and ``-Y/2 - 1 < y <= Y/2``.
        Each kept point is returned as ``(x + X/2 + x0, y + Y/2 + y0)``, so
        coordinates are floats.
        """
        items = []
        x = y = 0
        dx, dy = 0, -1
        for _ in range(max(X, Y) ** 2):
            if (-X / 2 - 1 < x <= X / 2) and (-Y / 2 - 1 < y <= Y / 2):
                items.append((x, y))
            # Turn at the corners of the current ring.
            if x == y or (x < 0 and x == -y) or (x > 0 and x == 1 - y):
                dx, dy = -dy, dx
            x, y = x + dx, y + dy
        return [(x + X / 2 + x0, y + Y / 2 + y0) for x, y in items]

    def _get_probe_and_step_len_greedy(self, seq):
        """Modified version of :meth:`_get_probe_and_step_len`.

        Sequences shorter than 100 bases get fixed parameters. Longer ones are
        searched in the default space (probe 60-41, space 20-11), then in two
        wider spaces; the original authors' notes report that a few percent of
        sequence lengths have no solution in the default space.

        :param seq: object exposing ``sequence`` and ``name`` attributes.
        :return: tuple (probe_len, inter_probe_space).
        :raises ValueError: if no combination tiles the sequence exactly.
        """
        seq_len = len(seq.sequence)

        short = self._short_sequence_params(seq_len)
        if short is not None:
            return short

        for probe_lens, spaces in self._GREEDY_SEARCH_SPACES:
            found = self._first_exact_tiling(seq_len, product(probe_lens, spaces))
            if found is not None:
                return found

        raise ValueError(
            f"No correct probe length/inter probe space combination was found for {seq.name}"
        )  # pragma: no cover

    def _get_probe_and_step_len(self, seq):
        """Calculates the probe_len and inter_probe_space for a ribosomal sequence.

        ribo_len = probe_len * n + (inter_probe_space * (n - 1))
        <=> n = (ribo_len + inter_probe_space) / (probe_len + inter_probe_space)

        :param seq: object exposing ``sequence`` and ``name`` attributes.
        :return: tuple (probe_len, inter_probe_space).
        :raises ValueError: if no combination tiles the sequence exactly.
        """
        seq_len = len(seq.sequence)
        found = self._first_exact_tiling(seq_len, product(range(60, 40, -1), range(20, 10, -1)))
        if found is None:
            raise ValueError(
                f"No correct probe length/inter probe space combination was found for {seq.name}"
            )  # pragma: no cover
        return found

    def _get_probe_and_step_len_simple(self, seq):
        """Return fixed parameters: (40, 10) for 50 <= length < 100, else (50, 15)."""
        short = self._short_sequence_params(len(seq.sequence))
        return short if short is not None else (50, 15)

    def _get_probe_and_step_len_spiral(self, seq):
        """Search parameters by spiralling outwards from a central value.

        Much slower than the original and greedy methods, but candidates close
        to the centre of the search space are tried first.

        :param seq: object exposing ``sequence`` and ``name`` attributes.
        :return: tuple (probe_len, inter_probe_space).
        :raises ValueError: if no combination tiles the sequence exactly.
        """
        seq_len = len(seq.sequence)

        short = self._short_sequence_params(seq_len)
        if short is not None:
            return short

        # The first candidate is (60, 15). With these arguments the bounds
        # allow probe lengths within 40-80 and spaces within 10-20.
        # NOTE(review): the original comment stated 40-60 for the probe
        # length; confirm which range is intended.
        found = self._first_exact_tiling(seq_len, self._spiral_positions(40, 10, 40, 10))
        if found is None:
            raise ValueError(
                f"No correct probe length/inter probe space combination was found for {seq.name}"
            )  # pragma: no cover
        return found

    def _get_probes_df(self, seq, probe_len, step_len, mode="generic"):
        """Generate the Dataframe with probes information.

        Design probes to have end-to-end coverage on the + strand and fill the
        inter_probe_space present on the + strand with probes designed on the
        - strand.

        :param seq: A pysam sequence object.
        :param probe_len: The length of the probes calculated by self._get_probe_and_step_len.
        :param step_len: The length of the inter-probe space calculated by self._get_probe_and_step_len.
        :param mode: with "simple", reverse probes use the same start offsets
            as the forward probes; otherwise they start halfway between two
            consecutive forward starts.
        :return: DataFrame with columns name, start, stop, strand, score,
            sequence and seq_id for both strands.
        """
        # + strand probes
        starts = list(range(0, len(seq.sequence) - probe_len + 1, probe_len + step_len))
        stops = [start + probe_len for start in starts]
        df = pd.DataFrame(
            {
                "name": seq.name,
                "start": starts,
                "stop": stops,
                "strand": "+",
                "score": 0,
            }
        )
        df["sequence"] = [seq.sequence[start:stop] for start, stop in zip(starts, stops)]
        df["seq_id"] = df["name"] + "_+_" + df["start"].astype(str) + "_" + df["stop"].astype(str)

        # - strand probes, designed on the reverse-complemented sequence
        rc_sequence = reverse_complement(seq.sequence)
        if mode == "simple":
            rev_starts = list(starts)
        else:
            # Starts reverse probes to be centered on inter_probe_space of the forward probes
            rev_starts = [(left + right) // 2 for left, right in zip(starts, starts[1:])]
        rev_stops = [start + probe_len for start in rev_starts]

        df_rev = pd.DataFrame(
            {
                "name": seq.name,
                "start": rev_starts,
                "stop": rev_stops,
                "strand": "-",
                "score": 0,
            }
        )
        df_rev["sequence"] = [rc_sequence[start:stop] for start, stop in zip(rev_starts, rev_stops)]
        # seq_id keeps the coordinates on the reverse-complemented sequence.
        df_rev["seq_id"] = df_rev["name"] + "_-_" + df_rev["start"].astype(str) + "_" + df_rev["stop"].astype(str)

        # Transform to bed coordinates: mirroring swaps the roles of start and stop.
        rc_len = len(rc_sequence)
        bed_start = rc_len - df_rev["stop"]
        bed_stop = rc_len - df_rev["start"]
        df_rev["start"] = bed_start
        df_rev["stop"] = bed_stop

        return pd.concat([df, df_rev])

    def get_all_probes(self, method="original"):
        """Run all probe design and concatenate results in a single DataFrame.

        The result is stored in ``self.probes_df``.

        :param method: one of "original", "greedy", "spiral" or "simple".
        :raises ValueError: if ``method`` is unknown or if no sequence is
            found in ``self.ribo_sequences_fasta``.
        """
        designers = {
            "greedy": (self._get_probe_and_step_len_greedy, "generic"),
            "original": (self._get_probe_and_step_len, "generic"),
            "spiral": (self._get_probe_and_step_len_spiral, "generic"),
            "simple": (self._get_probe_and_step_len_simple, "simple"),
        }
        # Validate before reading any file: an unknown method used to leave the
        # per-sequence DataFrame unbound (or reuse the previous one).
        if method not in designers:
            raise ValueError(f"Unknown method '{method}'. Choose one of: {', '.join(designers)}")
        get_lengths, mode = designers[method]

        probes_dfs = []
        with pysam.FastxFile(self.ribo_sequences_fasta) as fas:
            for seq in fas:
                probe_len, step_len = get_lengths(seq)
                probes_dfs.append(self._get_probes_df(seq, probe_len, step_len, mode=mode))

        if not probes_dfs:
            raise ValueError(f"No sequence found in {self.ribo_sequences_fasta}: cannot design probes.")

        self.probes_df = pd.concat(probes_dfs)
        self.probes_df["kept_after_clustering"] = True
        self.probes_df["bed_color"] = self.probes_df.kept_after_clustering.map(
            {True: "21,128,0", False: "128,64,0"}
        )

    def export_to_fasta(self):
        """Export the probes stored in ``self.probes_df`` to ``self.probes_fasta``."""
        with open(self.probes_fasta, "w") as fas:
            for row in self.probes_df.itertuples():
                fas.write(f">{row.seq_id}\n{row.sequence}\n")

    def clustering_needed(self, force=False):
        """Checks if a clustering is needed.

        :param force: force clustering even if unnecessary.
        :return: False when the number of probes is already within
            ``self.max_n_probes`` and ``force`` is not set, True otherwise.
        """
        n_probes = self.probes_df.shape[0]
        if force or n_probes > self.max_n_probes:
            return True

        # Do not cluster if number of probes already inferior to defined threshold
        logger.info(
            f"Number of probes {n_probes} already inferior to {self.max_n_probes}. No clustering will be performed."
        )
        return False

    def cluster_probes(self):
        """Use cd-hit-est to cluster highly similar probes.

        cd-hit-est is run for each identity threshold from 0.8 (inclusive) to
        1 (exclusive) in steps of ``self.identity_step``. The highest threshold
        giving at most ``self.max_n_probes`` probes is retained; if none
        qualifies, all probes are kept and a warning is logged.

        :return: the rows of ``self.probes_df`` kept after clustering.
        :raises subprocess.CalledProcessError: if cd-hit-est exits with an error.
        """
        timestamp = datetime.datetime.now().isoformat(timespec="seconds", sep="_")
        outdir = Path(self.clustered_probes_fasta).parent / f"cd-hit-est-{timestamp}"
        outdir.mkdir()
        log_file = outdir / "cd-hit.log"

        res_dict = {"seq_id_thres": [], "n_probes": []}
        fasta_by_thres = {}  # identity threshold -> FASTA file holding its probes

        for seq_id_thres in np.arange(0.8, 1, self.identity_step).round(3):
            tmp_fas = outdir / f"clustered_{seq_id_thres}.fas"
            # NOTE(review): self.threads is passed as '-n'. cd-hit-est documents
            # '-n' as the word length and '-T' as the number of threads; kept
            # as is because changing it alters clustering results - confirm.
            cmd = [
                "cd-hit-est",
                "-i",
                str(self.probes_fasta),
                "-o",
                str(tmp_fas),
                "-c",
                f"{seq_id_thres}",
                "-n",
                f"{self.threads}",
            ]
            logger.debug(f"Clustering probes with command: {' '.join(cmd)} (log in '{log_file}').")
            # Argument list without a shell: paths containing spaces or shell
            # metacharacters are passed through untouched.
            with open(log_file, "a") as f:
                subprocess.run(cmd, check=True, stdout=f)

            res_dict["seq_id_thres"].append(float(seq_id_thres))
            res_dict["n_probes"].append(len(FastA(tmp_fas)))
            fasta_by_thres[float(seq_id_thres)] = tmp_fas

        # Add number of probes without clustering
        res_dict["seq_id_thres"].append(1)
        res_dict["n_probes"].append(self.probes_df.shape[0])
        fasta_by_thres[1.0] = self.probes_fasta
        self.json["results"] = res_dict

        # Dataframe with number of probes for each cdhit identity threshold
        pylab.clf()
        df = pd.DataFrame(res_dict)
        import seaborn as sns  # local import to speed up imports

        p = sns.lineplot(data=df, x="seq_id_thres", y="n_probes", markers=["o"])
        p.axhline(
            self.max_n_probes,
            alpha=0.8,
            linestyle="--",
            color="red",
            label="max number of probes requested",
        )
        pylab.xlabel("Sequence identity", fontsize=16)
        pylab.ylabel("Number of probes", fontsize=16)

        # Extract the best identity threshold (NaN when no threshold qualifies)
        best_thres = df.loc[df["n_probes"] <= self.max_n_probes, "seq_id_thres"].max()

        if not np.isnan(best_thres):
            best_thres = float(best_thres)
            n_probes = int(df.loc[df["seq_id_thres"] == best_thres, "n_probes"].iloc[0])
            self.json["n_probes"] = n_probes
            self.json["best_thres"] = best_thres
            logger.info(f"Best clustering threshold: {best_thres}, with {n_probes} probes.")

            # Resolves to the unclustered FASTA when the best threshold is 1,
            # for which no 'clustered_*.fas' file is produced.
            best_fasta = fasta_by_thres[best_thres]
            shutil.copy(best_fasta, self.clustered_probes_fasta)
            kept_probes = [seq.name for seq in FastA(best_fasta)]
            self.probes_df["kept_after_clustering"] = self.probes_df.seq_id.isin(kept_probes)
            self.probes_df["bed_color"] = self.probes_df.kept_after_clustering.map(
                {True: "21,128,0", False: "128,64,0"}
            )
            self.probes_df["clustering_thres"] = best_thres

            pylab.plot(best_thres, n_probes, "o", label="Final number of probes")
            pylab.legend()
        else:
            logger.warning(
                f"No identity threshold was found to have as few as {self.max_n_probes} probes. Keep all probes. Set a valid value with --max-n-probes between {df.n_probes.min()} (min) and {df.n_probes.max()} (max)"
            )

        self.clustering_df = df.sort_values("seq_id_thres")
        return self.probes_df.query("kept_after_clustering == True")

    def export_to_csv_bed(self):
        """Export final results to CSV and BED files.

        Clustering is run first when needed. The CSV holds only the retained
        probes (``seq_id`` and ``sequence`` columns); the BED file lists every
        probe of ``self.probes_df`` with its ``bed_color``.
        """
        if self.clustering_needed():
            df = self.cluster_probes()
        else:
            df = self.probes_df

        df.to_csv(self.clustered_probes_csv, index=False, columns=["seq_id", "sequence"])

        self.probes_df.to_csv(
            self.clustered_probes_bed,
            sep="\t",
            index=False,
            header=None,
            columns=[
                "name",
                "start",
                "stop",
                "sequence",
                "score",
                "strand",
                "start",
                "stop",
                "bed_color",
            ],
        )

    def export_to_json(self):
        """Write the summary dictionary ``self.json`` to ``self.output_json``."""
        with open(self.output_json, "w") as fout:
            json.dump(self.json, fout, indent=4, sort_keys=True)

    def run(self, method="greedy"):
        """Run the full pipeline: extraction, probe design, clustering and exports.

        :param method: probe design method passed to :meth:`get_all_probes`.
        """
        if self.gff:
            self.get_rna_pos_from_gff()
        else:
            # Without annotation, every sequence of the FASTA is used as is.
            shutil.copy(self.fasta, self.ribo_sequences_fasta)
        self.get_all_probes(method=method)
        self.export_to_fasta()
        self.export_to_csv_bed()
        self.export_to_json()