Source code for sequana.assembly

#
#  This file is part of Sequana software
#
#  Copyright (c) 2018-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
import textwrap

import colorlog

from sequana.lazy import pandas as pd
from sequana.lazy import pylab

logger = colorlog.getLogger(__name__)


__all__ = ["BUSCO"]


class BUSCO:
    """Wrapper of the BUSCO output

    "BUSCO provides quantitative measures for the assessment of a genome
    assembly, gene set, transcriptome completeness, based on
    evolutionarily-informed expectations of gene content from near-universal
    single-copy orthologs selected from OrthoDB v9." -- BUSCO website 2017

    This class reads the full report generated by BUSCO and provides some
    visualisation of this report. The information is stored in a dataframe
    :attr:`df`. The score can be retrieved with the attribute :attr:`score`
    as a percentage in the range 0-100.

    :reference: http://busco.ezlab.org/

    .. note:: supports version 3.0.1 and the newer formats from v5.X
    """

    def __init__(self, filename="full_table_test.tsv"):
        """.. rubric:: constructor

        :filename: a valid BUSCO input file (full table). See example
            in sequana code source (testing)
        """
        # version 3.0.1
        try:
            self.df = pd.read_csv(filename, sep="\t", skiprows=4)
            if "Status" not in self.df.columns:  # pragma: no cover
                # version 5.2.2
                self.df = pd.read_csv(filename, sep="\t", skiprows=2)
        except pd.errors.ParserError:  # pragma: no cover
            # it may happen that the parsing is incorrect with some files
            self.df = pd.read_csv(filename, sep="\t", skiprows=2)

        self.df.rename({"# Busco id": "ID"}, inplace=True, axis=1)
    def pie_plot(self, filename=None, hold=False):
        """Pie plot of the status (complete / fragmented / missing)

        .. plot::
            :include-source:

            from sequana import BUSCO, sequana_data
            b = BUSCO(sequana_data("test_busco_full_table.tsv"))
            b.pie_plot()

        """
        if hold is False:
            pylab.clf()
        self.df.groupby("Status").count()["ID"].plot(kind="pie")
        pylab.ylabel("")
        # pylab.title("Distribution Complete/Fragmented/Missing")
        # pylab.legend()
        if filename:
            pylab.savefig(filename)
    def scatter_plot(self, filename=None, hold=False):
        """Scatter plot of the score versus length of each ortholog

        .. plot::
            :include-source:

            from sequana import BUSCO, sequana_data
            b = BUSCO(sequana_data("test_busco_full_table.tsv"))
            b.scatter_plot()

        Missing entries are not shown since they carry no contig information.
        """
        if hold is False:
            pylab.clf()
        colors = ["green", "orange", "red", "blue"]
        markers = ["o", "s", "x", "o"]
        for i, this in enumerate(["Complete", "Fragmented", "Duplicated"]):
            mask = self.df.Status == this
            if sum(mask) > 0:
                self.df[mask].plot(
                    x="Length",
                    y="Score",
                    kind="scatter",
                    color=colors[i],
                    ax=pylab.gca(),
                    marker=markers[i],
                    label=this,
                )

        pylab.legend()
        pylab.grid()
        if filename:
            pylab.savefig(filename)
    def summary(self):
        """Return summary information of the missing, complete and fragmented orthologs"""
        df = self.df.drop_duplicates(subset=["ID"])

        data = {}
        data["S"] = sum(df.Status == "Complete")
        data["F"] = sum(df.Status == "Fragmented")
        data["D"] = sum(df.Status == "Duplicated")
        data["C"] = data["S"] + data["D"]
        data["M"] = sum(df.Status == "Missing")
        data["total"] = len(df)
        data["C_pc"] = data["C"] * 100.0 / data["total"]
        data["D_pc"] = data["D"] * 100.0 / data["total"]
        data["S_pc"] = data["S"] * 100.0 / data["total"]
        data["M_pc"] = data["M"] * 100.0 / data["total"]
        data["F_pc"] = data["F"] * 100.0 / data["total"]

        return data
    def get_summary_string(self):
        data = self.summary()
        C = data["C_pc"]
        F = data["F_pc"]
        D = data["D_pc"]
        S = data["S_pc"]
        M = data["M_pc"]
        N = data["total"]
        string = "C:{:.1f}%[S:{:.1f}%,D:{:.1f}%],F:{:.1f}%,M:{:.1f}%,n:{}"
        return string.format(C, S, D, F, M, N)
    def _get_score(self):
        return self.summary()["C_pc"]

    score = property(_get_score)

    def __str__(self):
        data = self.summary()
        C = data["C"]
        F = data["F"]
        D = data["D"]
        S = data["S"]
        M = data["M"]
        N = data["total"]
        string = """# BUSCO diagnostic

{}

{} Complete BUSCOs (C)
{} Complete and single-copy BUSCOs (S)
{} Complete and duplicated BUSCOs (D)
{} Fragmented BUSCOs (F)
{} Missing BUSCOs (M)
{} Total BUSCO groups searched
"""
        return string.format(self.get_summary_string(), C, S, D, F, M, N)
    def save_core_genomes(self, contig_file, output_fasta_file="core.fasta"):
        """Save the core genome based on busco and assembly output

        The busco file must have been generated from an input contig file.
        In the example below, the busco file was obtained from the
        **data.contigs.fasta** file::

            from sequana import BUSCO
            b = BUSCO("busco_full_table.tsv")
            b.save_core_genomes("data.contigs.fasta", "core.fasta")

        If a gene from the core genome is missing, the extracted gene is made
        of 100 N's. If a gene is duplicated, only the best entry (based on the
        score) is kept.

        If there are 130 genes in the core genome, the output will be a
        multi-sequence FASTA file made of 130 sequences.
        """
        # local import due to cyclic import
        from sequana import FastA

        f = FastA(contig_file)

        # if we have duplicated hits, we group them and take the best score.
        # we then drop the ID to keep the index.
        # Note the fillna set to 0 to include 'Missing' entries
        indices = self.df.fillna(0).groupby("ID")["Score"].nlargest(1).reset_index(level=0, drop=True).index
        subdf = self.df.loc[indices]

        # we sort the entries by gene (core genome) name
        # useful if we want to merge the sequences for a multiple alignment
        with open(output_fasta_file, "w") as fout:
            for record in subdf.to_dict("records"):
                type_ = record["Status"]
                ID = record["ID"]

                if type_ == "Missing":
                    # missing genes have no contig/coordinate information;
                    # write a 100-N placeholder sequence instead
                    data = "N" * 100
                    fout.write(f">{ID}\t{type_}\n{data}\n")
                else:
                    # get gene/contig information
                    start = int(record["Gene Start"])
                    end = int(record["Gene End"])
                    seqname = record["Sequence"]

                    # save the core gene sequence
                    ctg_index = f.names.index(seqname)
                    data = f.sequences[ctg_index][start:end]
                    data = "\n".join(textwrap.wrap(data, 80))
                    fout.write(f">{ID}\t{type_}:{seqname}:{start}:{end}:{end-start}\n{data}\n")
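
# --------------------------------------------------------------------------
# Minimal usage sketch (not part of the library): illustrates the BUSCO class
# API defined above. The input paths "full_table.tsv" and "contigs.fasta" are
# placeholder names; substitute your own BUSCO full table and assembly files.
if __name__ == "__main__":  # pragma: no cover
    b = BUSCO("full_table.tsv")

    # overall completeness score (percentage of complete BUSCOs, C_pc)
    print(f"BUSCO score: {b.score:.1f}%")

    # one-line summary in the usual BUSCO notation, e.g.
    # C:95.2%[S:93.1%,D:2.1%],F:2.4%,M:2.4%,n:124
    print(b.get_summary_string())

    # dictionary with raw counts and percentages (S, D, C, F, M, total, ...)
    print(b.summary())

    # plots saved to file
    b.pie_plot(filename="busco_pie.png")
    b.scatter_plot(filename="busco_scatter.png")

    # extract the core genes from the assembly that was used to run BUSCO
    b.save_core_genomes("contigs.fasta", "core.fasta")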