Source code for sequana.isoseq

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
import glob
import os
from collections import Counter

import colorlog

from sequana import FastA, FastQ, phred
from sequana.lazy import numpy as np
from sequana.lazy import pandas as pd
from sequana.lazy import pylab
from sequana.pacbio import PacbioSubreads

logger = colorlog.getLogger(__name__)


__all__ = ["IsoSeqQC", "IsoSeqBAM"]


[docs] class IsoSeqQC(object): """ Use get_isoseq_files on smrtlink to get the proper files iso = IsoSeqQC() iso.hist_read_length_consensus_isoform() # histo CCS iso.stats() # "CCS" key is equivalent to summary metrics in CCS report todo: get CCS passes histogram . Where to get the info of passes ? """ def __init__(self, directory=".", prefix=""): self.prefix = prefix self.directory = directory self.sample_name = "undefined" # low quality isoforms filename = "all.polished_lq.fastq" self.lq_isoforms = self.get_file(filename) if self.lq_isoforms: logger.info("Reading {}".format(filename)) self.lq_sequence = FastQ(self.lq_isoforms) # high quality isoforms filename = "all.polished_hq.fastq" self.hq_isoforms = self.get_file(filename) if self.hq_isoforms: logger.info("Reading {}".format(filename)) self.hq_sequence = FastQ(self.hq_isoforms) # General info filename = "file.csv" self.csv = self.get_file(filename) if self.csv: logger.info("Reading {}".format(filename)) self.data = pd.read_csv(self.csv) # CCS fasta sequence # self.ccs = self.get_file("-ccs.tar.gz") filename = "ccs.fasta" self.ccs = self.get_file(filename, noprefix=True) if self.ccs: logger.info("Reading {}".format(filename)) self.ccs = FastA(self.ccs)
[docs] def get_file(self, tag, noprefix=False): if noprefix: filenames = glob.glob(self.directory + os.sep + tag) else: filenames = glob.glob(self.directory + os.sep + self.prefix + tag) if len(filenames) == 1: return filenames[0] elif len(filenames) > 1: print("Found several files ending in %s" % tag) else: print("No files matching %s" % tag) return None
[docs] def stats(self): results = {} if self.data is not None: logger.info("Reading strand") results["strand"] = { "+": sum(self.data.strand == "+"), "-": sum(self.data.strand == "-"), "?": sum(self.data.strand.isnull()), } results["classification"] = { "total_ccs_reads": len(self.data), "five_prime_reads": int(self.data.fiveseen.sum()), "three_prime_reads": int(self.data.threeseen.sum()), "chimera": int(self.data.chimera.sum()), "polyA_reads": int(self.data.polyAseen.sum()), } if self.lq_isoforms: logger.info("Reading LQ isoforms") results["lq_isoform"] = self.lq_sequence.stats() # number of if self.hq_isoforms: logger.info("Reading HQ isoforms") results["hq_isoform"] = self.hq_sequence.stats() # number of polished HQ isoform if self.ccs: seq = [len(read.sequence) for read in self.ccs] results["CCS"] = { "mean_length": pylab.mean(seq), "number_ccs_bases": sum(seq), "number_ccs_reads": len(seq), } self.idents_v = [] self.full_v = [] self.non_full_v = [] self.isoform_lengths = [] for read in self.lq_sequence: ident, full, non_full, length = read["identifier"].decode().split(";") self.idents_v.append(ident) self.full_v.append(int(full.split("=")[1])) self.non_full_v.append(int(non_full.split("=")[1])) self.isoform_lengths.append(int(length.split("=")[1])) return results
[docs] def to_summary(self, filename="sequana_summary_isoseq.json", data=None): """Save statistics into a JSON file :param filename: :param data: dictionary to save. If not provided, use :meth:`stats` """ from sequana.summary import Summary if data is None: data = self.stats() Summary("isoseq", self.sample_name, data=data).to_json(filename)
[docs] def hist_read_length_consensus_isoform( self, mode="all", bins=80, rwidth=0.8, align="left", fontsize=16, edgecolor="k", **kwargs ): """ mode can be all, lq, hq """ pylab.clf() L1 = [len(read["sequence"]) for read in self.lq_sequence] L2 = [len(read["sequence"]) for read in self.hq_sequence] if mode == "all": L = L1 + L2 elif mode == "lq": L = L1 else: L = L2 Y, X, _ = pylab.hist(L, bins=bins, rwidth=rwidth, align=align, ec=edgecolor, **kwargs) pylab.gca().set_ylim(bottom=0) pylab.gca().set_xlim(left=0) pylab.xlabel("Read length", fontsize=fontsize) pylab.ylabel("Number of reads", fontsize=fontsize) pylab.grid() ax_twin = pylab.gca().twinx() shift = (X[1] - X[0]) / 2 ax_twin.plot(X[0:-1] - shift, len(L) - pylab.cumsum(Y), "k") ax_twin.set_ylim(bottom=0) pylab.ylabel("CDF", fontsize=fontsize) pylab.title("Read length of Consensus isoforms reads")
[docs] def hist_average_quality(self, fontsize=16, bins=None): """ bins is from 0 to 94 """ hq_qv = [pylab.mean([ord(X) - 33 for X in read["quality"].decode()]) for read in self.hq_sequence] lq_qv = [pylab.mean([ord(X) - 33 for X in read["quality"].decode()]) for read in self.lq_sequence] if bins is None: bins = range(0, 94) Y1, X = np.histogram(hq_qv, bins=bins) Y2, X = np.histogram(lq_qv, bins=bins) pylab.bar(X[1:], Y1, width=1, label="HQ") pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ") pylab.xlim([0.5, 93.5]) pylab.xlabel("Isoform average QV") pylab.ylabel("# Isoform") pylab.legend(fontsize=fontsize) ax = pylab.twinx() N = np.sum(Y1 + Y2) ax.plot(X, [N] + list(N - np.cumsum(Y1 + Y2)), "k")
[docs] class IsoSeqBAM(object): """Reads raw IsoSeq BAM file (subreads) Creates some plots and stats """ def __init__(self, filename): self.filename = filename self._passes = None self._lengths = None @property def lengths(self): if self._lengths is None: _ = self.passes # extract passes and lengths return self._lengths @property def passes(self): from collections import Counter if self._passes is None: # for BAM of 1M reads, takes 10-15 seconds from pysam import AlignmentFile ccs = AlignmentFile(self.filename, check_sq=False) qnames = [a.qname for a in ccs] names = [int(qname.split("/")[1]) for qname in qnames] self.qnames = qnames lengths = [] for qname in qnames: a, b = qname.split("/")[2].split("_") lengths.append(int(b) - int(a)) self._lengths = lengths self._passes = list(Counter(names).values()) return self._passes
[docs] def hist_passes(self, bins=100): pylab.hist(self.passes, bins=bins)
[docs] def hist_read_length(self, bins=100): pylab.hist(self.lengths, bins=bins)
[docs] def stats(self): return { "mean_read_length": pylab.mean(self.lengths), "ZMW": len(self.passes), "N": len(self.lengths), "mean_ZMW_passes": pylab.mean(self.passes), }
class PacbioIsoSeqMappedIsoforms(object): """Here, we load a SAM/BAM file generated with minimap using as input the BAM file created with the mapping og HQ isoforms on a reference. df contains a dataframe for each read found in the SAM (and hq_isoform) we populate the GC content, the mapping flag, the reference name (-1 means no mapping i.e flag ==4). flag of 4 means unmapped and there is no ambiguity about it. In the data file example, other flags are 0, 16 (SEQ being reverse complement<F12>) , 2048 (supplementary segment). Example of minimap2 command:: minimap2 -t 4 -ax splice -uf --secondary=no SIRV-E0.fa hq_isoforms.fasta 1> hq_isoforms.fasta.sam 2> hq_isoforms.fasta.sam.log Reads a SAM file for now. BAM should work as well """ def __init__(self, filename): self.filename = filename self.bam = PacbioSubreads(self.filename) self._df = None @property def df(self): if self._df is not None: return self._df # !! for isoseq, we should be able to load everything into memory self.bam.reset() data = [a for a in self.bam.data] df = self.bam.df.copy() rnames = [self.bam.data.get_reference_name(a.rname) if a.rname != -1 else -1 for a in data] df["reference_name"] = rnames df["flags"] = [a.flag for a in data] df["mapq"] = [a.mapq for a in data] df["cigar"] = [a.cigarstring for a in data] df["qname"] = [a.qname for a in data] # Drop SNR that are not populated in the mapped BAM file. df.drop(["snr_A", "snr_C", "snr_G", "snr_T"], axis=1, inplace=True) # TODO. input could be basde on mapping of CCS in which case, the ZMW is # stored and the following does not work. could check whether the # pattern is pXXfXX try: df["full_length"] = df["qname"].apply(lambda x: int(x.split("/")[1].split("p")[0].strip("f"))) df["non_full_length"] = df["qname"].apply(lambda x: int(x.split("/")[1].split("p")[1].strip("f"))) except: pass self._df = df return self._df def hist_isoform_length_mapped_vs_unmapped(self, bins=None): df = self.df if bins is None: bins = range(0, len(df.reference_length.max()), 100) mapped = df[df.reference_name != -1] unmapped = df[df.reference_name == -1] pylab.hist( mapped.reference_length, bins=bins, alpha=0.5, label="mapped {}".format(len(mapped)), density=False, ) pylab.hist( unmapped.reference, bins=bins, alpha=0.5, label="unmapped {}".format(len(unmapped)), density=False, ) pylab.xlabel("Isoform length") pylab.legend() def hist_transcript(self, hide_unmapped=True): pylab.clf() if hide_unmapped is True: query = "reference_length>0 and reference_name!=-1" else: query = "reference_length>0" ts = self.df.query(query).groupby("reference_name").count().reference_length if len(ts) == 0: print("nothing to plot") return ts ts.plot(kind="bar", color="r") try: pylab.gcf().set_layout_engine("tight") except: pass return ts def bar_mapq(self, logy=True, xmin=0, xmax=60, fontsize=12): self.df.mapq.hist() if logy: pylab.semilogy() pylab.xlim([xmin, xmax]) pylab.xlabel("Mapping quality", fontsize=fontsize) def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1): aa = self.df.query("reference_name not in [-1, '-1']").copy() if len(aa) == 0: return pd.Series(), self.df aa["group"] = aa.reference_name.apply(lambda x: x[0:shift]) mapped = aa.query("mapq>@mapq_min").groupby("group").count()["mapq"] mapped.name = None if plot: mapped.plot(kind="bar") pylab.title(title) pylab.gcf().set_layout_engine("tight") # data.to_csv(path + "_hq_sirv_grouped.csv") return mapped, self.df class SIRVReference: """ See https://www.lexogen.com/sirvs/downloads/ to download one of the XLSX file :: ss = isoseq.SirvReference() ss.from_excel("SIRV_Set2.xls") ss.to_fasta("temp.fasta") """ def __init__(self): pass def from_excel(self, filename, nrows=100): """ skip 4 empty rows. next two are a header split on two lines """ # header is split on two rows, so we need to skip it df = pd.read_excel(filename, skiprows=4, header=None) assert df.iloc[1, 4] == "c" # this column will be used assert df.iloc[0, 2] == "SIRV ID" assert df.iloc[0, 1] == "SIRV gene" assert df.iloc[1, 23] == "length (nt)" assert df.iloc[1, 25] == "pA length (nt)" assert df.iloc[1, 26] == "Sequence" # with polyA assert df.iloc[1, 29] == "Sequence" # without polyA !! # in excel this last column is actually an equation based on the # sequence with polyA and may be empty when using from_excel method. # filter out useless columns and first two lines df = df.iloc[2:, [2, 4, 23, 25, 26, 29]] # drop useless rows !! keep this statement after the first filtering df = df[df.iloc[:, 0].isnull() == False] # drop header df.reset_index(inplace=True, drop=True) df.columns = [ "sirv_id", "annotation", "length", "length_polyA", "sequence_with_polya", "sequence", ] # remove and replace sequence column using sequence_with_poly and # removing the polyA # df.drop("sequence", inplace=True) seq = [a[0:-b] for a, b in zip(df.sequence_with_polya, df.length_polyA)] df["sequence"] = seq self.df = df def to_fasta(self, filename): data = self.df.query("annotation == 1").copy() with open(filename, "w") as fout: for _, this in data.iterrows(): fout.write(">{}\n{}\n".format(this.sirv_id, this.sequence)) class SIRV(object): def __init__(self, filename, shift=4): from sequana import FastA self.SIRV = FastA(filename) self.shift = 5 def __len__(self): return len(self.SIRV.names) @property def group_names(self): data = list(set([x[0 : self.shift] for x in self.SIRV.names])) return sorted(data) @property def group_lengths(self): data = dict(Counter(list([x[0 : self.shift] for x in self.SIRV.names]))) return data class PacbioIsoSeqMultipleIsoforms(object): """ mul.read("./180223_113019_lexo1/data/ccs_sirv.sam", "lexo1 ccs") mul.read("./180223_221457_lexo2/data/ccs_sirv.sam", "lexo2 ccs") mul.read("./180206_193114_lexo3/data/ccs_sirv.sam", "lexo3 ccs") mul.read("./180224_083446_lexo3/data/ccs_sirv.sam", "lexo3 ccs bis") """ SIRV_names = ["SIRV1", "SIRV2", "SIRV3", "SIRV4", "SIRV5", "SIRV6", "SIRV7"] SIRV_total_lengths = [7075, 6395, 12988, 5592, 16206, 15278, 13375] def __init__(self, sirv=None, sirv_shift=5): self.labels = [] self.rawdata = [] if sirv: try: self.SIRV_data = SIRV(sirv, shift=sirv_shift) self.SIRV_names = self.SIRV_data.group_names self.SIRV_lengths = self.SIRV_data.SIRV.get_lengths_as_dict() except: print("Could not read {}".format(sirv)) self.mapq_min = 0 @property def N(self): return [len(x) for x in self.rawdata] @property def sirv(self): sirv = [] shift = 5 for df in self.rawdata: aa = df.query("reference_name not in [-1, '-1']").copy() if len(aa) == 0: sirv.append(pd.Series(index=self.SIRV_names)) continue aa["group"] = aa.reference_name.apply(lambda x: x[0:shift]) # filter quality and flags # mask = np.logical_or(df.flags & 256, df.flags & 2048) # aa = aa.query("mapq>=@self.mapq_min and @mask") data = aa.groupby("group").count()["mapq"] data.name = None sirv.append(data) return sirv def filter_raw_data(self, quality=0, flags=[256, 2048]): for i, df in enumerate(self.rawdata): mask = np.logical_or(df["flags"] & 256, df["flags"] & 2048) ones = [-1, "-1"] df = df.query("mapq>=@quality and not @mask and reference_name not in @ones") self.rawdata[i] = df def read(self, filename, tag): if filename.endswith(".csv"): data = pd.read_csv(filename) data = data.query("reference_name not in [-1, '-1']") else: sam = PacbioIsoSeqMappedIsoforms(filename) mapped, data = sam.plot_sirv_by_group(tag) self.rawdata.append(data) self.labels.append(tag) def plot_corr(self): lengths = self.SIRV_data.SIRV.get_lengths_as_dict() spikes = self.spikes_found() spikes["lengths"] = [lengths[k] for k in spikes.index] corr = spikes.corr() pylab.imshow(corr) N = len(spikes.columns) pylab.xticks(range(N), spikes.columns, rotation=90) pylab.yticks(range(N), spikes.columns) pylab.clim(0, 1) pylab.colorbar() def plot_bar_grouped(self, normalise=False, ncol=2, N=None): """ :param normalise: :param ncol: columns in the legend """ if N is not None: N = np.array(N) else: N = np.array([len(x) for x in self.rawdata]) dd = pd.DataFrame(self.sirv).T if normalise: dd = dd / (N / max(N)) dd.columns = self.labels dd.plot(kind="bar") pylab.xlabel("") pylab.legend(self.labels, ncol=ncol) pylab.gcf().set_layout_engine("tight") return dd def spikes_found(self, spikes_filename=None): if spikes_filename: sirv_names = SIRV(spikes_filename).SIRV.names else: sirv_names = [] for data in self.rawdata: sirv_names.extend(data.reference_name.unique()) sirv_names = set(sirv_names) try: sirv_names.remove(-1) except: pass sirv_names = sorted(list(sirv_names)) df = pd.DataFrame(index=sirv_names, columns=self.labels) for i, data in enumerate(self.rawdata): aa = data.query("reference_name not in [-1, '-1']").copy() sirv_detected = aa.groupby("reference_name").count()["mapq"] df[self.labels[i]] = sirv_detected return df def plot_bar(self, spikes_filename=None, ratio=100): data = self.spikes_found(spikes_filename) lengths = [self.SIRV_lengths[x] for x in data.index] data.plot(kind="bar") pylab.plot(np.array(lengths) / ratio) pylab.gcf().set_layout_engine("tight") return data def get_stats(pattern, mode): data = [[], []] for filename in glob.glob(pattern + "/data/{}_sirv.sam".format(mode)): b = isoseq.PacbioIsoSeqMappedIsoforms(filename) N = Summary(filename.split("/")[0] + "/sequana_summary_isoseq.json").data["hq_isoform"]["N"] data[0].append(N) data[1].append(len(b.df.query("reference_name not in [-1, '-1']"))) print("scanned {}".format(filename)) return data class GeneCoverage: """ - column 12 should be the percentage of gene coverage """ def __init__(self, filename): self.filename = filename self.df = pd.read_csv(filename, header=None, sep="\t") self.coverage_column = 12 def __str__(self): icol = self.coverage_column L = float(len(self.df)) S0 = sum(self.df[icol] == 0) S2 = sum(self.df[icol] == 1) S1 = L - S0 - S2 S90 = sum(self.df[icol] > 0.9) S50 = sum(self.df[icol] > 0.5) S99 = sum(self.df[icol] > 0.99) txt = "Number of genes: {}\n".format(len(self.df)) txt += "Fully detected genes: {} ({:.2f}%)\n".format(S2, S2 / L * 100) txt += "Partially detected genes: {} ({:.2f}%)\n".format(S1, S1 / L * 100) txt += "Detected genes (99% covered): {} ({:.2f}%)\n".format(S99, S99 / L * 100) txt += "Detected genes (90% covered): {} ({:.2f}%)\n".format(S90, S90 / L * 100) txt += "Detected genes (50% covered): {} ({:.2f}%)\n".format(S50, S50 / L * 100) txt += "Undetected genes: {} ({:.2f}%)\n".format(S0, S0 / L * 100) return txt def plot( self, X=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 0.999, 1], fontsize=16, label=None, ): """plot percentage of genes covered (y axis) as a function of percentage of genes covered at least by X percent (x-axis). """ icol = self.coverage_column N = float(len(self.df)) X = np.array(X) Y = np.array([sum(self.df[icol] > x) / N * 100 for x in X]) if label is None: pylab.plot(X * 100, Y, "o-") else: pylab.plot(X * 100, Y, "o-", label=label) pylab.xlabel("Gene coverage (%)", fontsize=fontsize) pylab.ylabel("Percentage of genes covered", fontsize=fontsize) for this in [25, 50, 75]: pylab.axhline(this, color="r", alpha=0.5, ls="--") pylab.axvline(this, color="r", alpha=0.5, ls="--") def get_percentage_genes_covered_at_this_fraction(self, this): assert this <= 1 and this >= 0 icol = self.coverage_column X = pylab.linspace(0, 1, 101) N = float(len(self.df)) Y = np.array([sum(self.df[icol] > x) / N * 100 for x in X]) return np.interp(this, X, Y)