Source code for sequana.phantom

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2023 - Sequana Development Team
#
#  File author(s):
#      Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################

import colorlog

from sequana.lazy import numpy as np
from sequana.lazy import pandas as pd
from sequana.lazy import pylab
from sequana.utils.pandas import PandasReader

logger = colorlog.getLogger(__name__)


__all__ = ["PhantomPeaksReader", "Phantom"]


[docs] class PhantomPeaksReader: """Manipulate output of PhantomPeaks. The metrics file is tabulated: * Filename * numReads: effective sequencing depth i.e. total number of mapped reads in the input file. * estFragLen: comma separated strand cross-correlation peak(s) in decreasing order of correlation. In almost all cases, the top (first) value in the list represents the predominant fragment length. * corr estFragLen: comma separated strand cross-correlation value(s) in decreasing order (col3 follows the same order). * phantomPeak: Read length/phantom peak strand shift. * corr phantomPeak: Correlation value at phantom peak. * argmin corr: strand shift at which cross-correlation is lowest. * min corr: minimum value of cross-correlation. * Normalized strand cross-correlation coefficient (NSC) = COL4 / COL8. NSC > 1.1 is good. * Relative strand cross-correlation coefficient (RSC) = (COL4 - COL8) / (COL6 - COL8); RSC = 0 means no signal, < 1 low quality and > 1 means high enrichment. Aim for RSC > 0.8. * QualityTag: Quality tag based on thresholded RSC (codes: -2:veryLow; -1:Low; 0:Medium; 1:High; 2:veryHigh). """ _columns = [ "filename", "num_reads", "estimated_fragment_length", "corr", "phantom_peak", "corr_phantom_peak", "argmin_corr", "min_corr", "NSC", "RSC", "quality_tag", ] def __init__(self, filename): self.df = PandasReader(filename, sep="\t", header=None).df self.df.columns = self._columns
[docs] def read(self, filename): df = PandasReader(filename, sep="\t", header=None).df df.columns = self._columns self.df = pd.concat([self.df, df], axis=0)
[docs] def plot_RSC(self): self.df.RSC.plot(kind="bar") pylab.axhline(0.8, lw=2, color="r", ls="--")
[docs] class Phantom: """Read alignment files and generated phantom peaks. This class reads **align** files generated from bam files using this set of commands:: samtools view -F 0x0204 -o | awk 'BEGIN{FS="\t";OFS="\t"} {if (and($2,16) > 0) {print $3,($4-1),($4-1+length($10)),"N","1000","-"} else {print $3,($4-1),($4-1+length($10)),"N","1000","+"}}' - > test.align prints: - the reference chromosome, - the starting position-1 - position -1 + sequence length - N - 1000 - + if flag is 16 (or equivalent) otherwise prints - :: c = Phantom() c.chromosomes data = c.get_data('NC_002506.1') mask = c.remove_anomalies(data) data = data[mask] c.scc(data) X = range(-100,100) Y = [c.cor(x) for x in X] plot([x*5 for x in X], Y) c = chipseq.Phantom(binning=10, start=-500, stop=500) c.read_align("test.align") results, df = c.run() c.stats(results, df) """ def __init__(self, bamfile=None, binning=5, start=-500, stop=1500): self.start = start self.stop = stop self.binning = binning self.df = None
[docs] def read_align(self, readfile): self.df = PandasReader( readfile, sep="\t", header=None, names=["ref", "start", "end", "dummy", "quality", "strand"], dtype={ "ref": str, "start": "Int64", "end": "Int64", "dummy": str, "quality": "Int64", "strand": str, }, ).df self.read_length = round(pylab.median(self.df["end"] - self.df["start"])) self.chromosomes = self.df["ref"].unique()
[docs] def run(self): ## 10% of the time in self.get_data and 90 in cor() if self.df is None: logger.error("call read_align() method to read alignement file") return m = int(self.start / self.binning) M = int(self.stop / self.binning) # because bins is set to 5, we actually go from m*5 to M*5 X = range(m, M + 1, 1) Xreal = np.arange(m * self.binning, (M + 1) * self.binning, self.binning) results = {} for chrom in self.chromosomes: # logger.info("Processing {}".format(chrom)) data = self.get_data(chrom) L = len(data) self.scc(data) # shift correlation Y = [self.cor(x) for x in X] results[chrom] = {"data_length": L, "Y": np.array(Y), "X": Xreal} # weighted average usng orginal length of the chrmosomes weights = np.array([results[x]["data_length"] for x in self.chromosomes]) weights = weights / sum(weights) self.results = results self.weights = weights # now the weighted cross correlation df_avc = pd.DataFrame([w * results[x]["Y"] for w, x in zip(weights, self.chromosomes)]) df_avc = df_avc.T df_avc.index = Xreal return results, df_avc
[docs] def stats(self, results, df_avc, bw=1): stats = {} # some simple stars about the reads stats["read_fragments"] = len(self.df) stats["fragment_length"] = self.read_length logger.info("Read {} fragments".format(stats["read_fragments"])) logger.info("ChIP data mean length: {}".format(self.read_length)) # average cross correlation across all chromosomes # df_avc.sum(axis=1).plot() df = df_avc.sum(axis=1) corr_max = df.max() shift_max = df.idxmax() # note that in phantomPeak, they use the last value as min... not the # actual min. Not very important. corr_min = df.min() shift_min = df.idxmin() logger.info("Maximum cross-correlation value: {:.5f}".format(corr_max)) logger.info("Maximum cross-correlation shift: {}".format(shift_max)) logger.info("Minimum cross-correlation value: {:.5f}".format(corr_min)) logger.info("Minimum cross-correlation shift: {}".format(shift_min)) stats["shift_max"] = int(shift_max) # to make it json serialisable stats["corr_max"] = corr_max # original code phantomPeak but always equal to 1 it range max >5 ?? # default is 500 so sbw=1 whatsoever # sbw = 2 * floor(ceil(5/15000) / 2) + 1 sbw = 1 # here we could use a rolling mean # df.rolling(window=5, center=True).mean() # so following runnin mean is useless # cc$y <- runmean(cc$y,sbw,alg="fast") # # again, computation of bw but always equal to 1 .... # Compute cross-correlation peak # bw <- ceiling(2/iparams$sep.range[2]) # crosscorr[i] is compared to crosscorr[i+/-bw] to find peaks # bw = 1 # search for local peaks within bandwidth of bw = 1 peakidx = df.diff(periods=bw) > 0 peakidx = peakidx.astype(int).diff(periods=bw) == -1 # the final bw points are NA and filled with False peakidx = peakidx.shift(-bw).fillna(False) df_peaks = df[peakidx] # when searching for max, exclude peaks from the excluded region exclusion_range = [10, self.read_length + 10] mask = np.logical_or(df_peaks.index < exclusion_range[0], df_peaks.index > exclusion_range[1]) df_peaks = df_peaks[mask] # max_peak = df_peaks.max() shift_peak = df_peaks.idxmax() # now, we select peaks that are at least 90% of main peak and with shift # higher than main shift. why higher ? mask = np.logical_and(df_peaks > max_peak * 0.9, df_peaks.index >= shift_peak) best_df_peaks = df_peaks[mask] best = best_df_peaks.sort_values(ascending=False)[0:3] values = ",".join(["{:.5f}".format(x) for x in best.values]) pos = ",".join([str(x) for x in best.index]) logger.info("Top 3 cross-correlation values: {}".format(values)) logger.info("Top 3 estimates for fragment length: {}".format(pos)) # now the real window half size according to phantom peaks, not spp ... # min + (max-min)/3 threshold = (df_peaks.max() - corr_min) / 3 + corr_min # coming back to real cross correlation, identify peak in window # readlength +- 2*binning !! not symmetry in phantompeak # x >= ( chip.data$read.length - round(2*binning) & # x <= ( chip.data$read.length + round(1.5*binning) binning = self.binning ph_min = self.read_length - round(2 * binning) ph_max = self.read_length + round(1.5 * binning) phantom = df[np.logical_and(df.index >= ph_min, df.index <= ph_max)] logger.info("Phantom peak range detection:{}-{}".format(ph_min, ph_max)) logger.info("Phantom peak location:{}".format(phantom.idxmax())) logger.info("Phantom peak Correlation: {:.5f}".format(phantom.max())) stats["phantom_corr"] = phantom.max() stats["phantom_location"] = int(phantom.idxmax()) # for json NSC = df_peaks.max() / phantom.max() # error in phamtompeaks ?? # when computing RSC, the min is actually set as the last value on the RHS so # phantom_coeff = df_peaks.max() / df.min() # is # phantom_coeff = df_peaks.max() / df.iloc[-1] NSC_spp = df_peaks.max() / df.iloc[-1] logger.info("Normalized Strand cross-correlation coefficient (NSC): {:.5f} [{:.5f}]".format(NSC, NSC_spp)) RSC = (df_peaks.max() - df.min()) / (phantom.max() - df.min()) RSC_spp = (df_peaks.max() - df.iloc[-1]) / (phantom.max() - df.iloc[-1]) # We could use a median to pick up a smooth value # X = df_peaks.rolling(5).median().iloc[-5] # but should alos be done for other metrics # RSC_median = (df_peaks.max() - X) / (phantom.max() - X) logger.info("Relative Strand cross-correlation Coefficient (RSC): {:.5f} [{:.5f}]".format(RSC, RSC_spp)) if RSC > 0 and RSC < 0.25: tag = -2 elif RSC >= 0.25 and RSC < 0.5: tag = -1 elif RSC >= 0.5 and RSC < 1: tag = 0 elif RSC >= 1 and RSC < 1.5: tag = 1 elif RSC >= 1.5: tag = 2 logger.info("Phantom Peak Quality Tag: {}".format(tag)) pylab.clf() df.plot() ##df_peaks.plot(marker="o", lw=0) ylim = pylab.ylim() # whs = df[df > threshold].index.max() # pylab.axvline(whs, ls='--', color='k', lw=1) Y0, Y1 = pylab.ylim() pylab.plot( [phantom.idxmax(), phantom.idxmax()], [Y0, phantom.max()], ls="--", color="k", lw=1, ) pylab.plot([df.idxmax(), df.idxmax()], [Y0, df.max()], ls="--", color="r", lw=2) # pylab.fill_betweenx(ylim, 10,85, color='grey', alpha=0.5) pylab.ylim(ylim) pylab.ylabel("Cross-correlation") pylab.xlabel("strand-shift: {}bp\nNSC={:.5f}, RSC={:.5f}, Qtag={}".format(best.index[0], NSC, RSC, tag)) pylab.xlim(self.start, self.stop) pylab.grid(True, zorder=-20) try: pylab.gcf().set_layout_engine("tight") except: pass stats["NSC"] = NSC stats["RSC"] = RSC stats["Qtag"] = tag return stats
[docs] def get_data(self, chrname, remove_anomalies=True): # Could be done once for all in read_alignment df = self.df.query("ref==@chrname") # first the fragment position, shifting - strand by fragment length data = np.array([x if z == "+" else -y for x, y, z in zip(df["start"], df["end"], df["strand"])]) # sort by absolute position res = pd.DataFrame(data) res.columns = ["x"] res["abs"] = res["x"].abs() res = res.sort_values("abs") del res["abs"] if remove_anomalies: mask = self.remove_anomalies(res) res = res[mask] return res
[docs] def remove_anomalies(self, data, bin=1, trim_fraction=1e-3, z=5, return_indecies=False): zo = z * 3 x = data["x"] # the frequencies, sorted from smaller to larger tt = np.floor(x / bin).value_counts().sort_values() # sort and select first 99.9% # floor to agree with phantom stt = tt.iloc[0 : int(np.floor(len(tt) * (1 - trim_fraction)))] mtc = stt.mean() var_base = 0.1 tcd = np.sqrt(stt.var() + var_base) thr = max(1, np.ceil(mtc + z * tcd)) thr_o = max(1, np.ceil(mtc + zo * tcd)) # filter tt tt = tt[tt > thr] # get + and - tags tp = tt.index pti = set(tp[tp > 0]) pti2 = set([-x for x in tp[tp < 0]]) it = pti.intersection(pti2) it = sorted(it.union(set(tt[tt > thr_o].index))) # sit <- c(it,(-1)*it); # sit = [-x for x in it] + list(it) sit = list(it)[::-1] + [-x for x in list(it)[::-1]] if bin > 1: # From 0 to 5+1 to agree with phantom R code sit = sorted([x * 5 + y for y in range(0, 6) for x in sit]) sit = set(sit) return [False if x in sit else True for x in data["x"]]
[docs] def scc(self, data, llim=10): tt = (np.sign(data["x"]) * np.floor(data["x"].abs() / self.binning + 0.5)).value_counts() mu = tt.mean() tt = tt[tt < llim * mu] # tc = tt.index pdata = tt[tc > 0] ndata = tt[tc < 0] ntv = ndata.values ptv = pdata.values self.pdata = pdata self.ndata = ndata r1 = min(-1 * ndata.index.max(), pdata.index.min()) r2 = max(-1 * ndata.index.min(), pdata.index.max()) l = r2 - r1 + 1 self.L = l mp = ptv.sum() * self.binning / l mn = ntv.sum() * self.binning / l ptv = ptv - mp ntv = ntv - mn self.mp = mp self.mn = mn ntc = -tt.index[[True if x < 0 else False for x in tt.index]].values ptc = tt.index[[True if x > 0 else False for x in tt.index]].values self.ntc = ntc self.ptc = ptc self.ntv = ntv self.ptv = ptv ss = np.sqrt((sum(ptv**2) + (l - len(ptv)) * mp**2) * (sum(ntv**2) + (l - len(ntv)) * mn**2)) self.ss = ss self.nn = pd.DataFrame(self.ntv, index=self.ntc) logger.info("mp={}; mn={}, ss={}".format(self.mp, self.mn, self.ss)) logger.info( "ptv sum={}; ptv len ={}, ntv sum={} ntv len {}".format( sum(self.ptv), len(self.ptv), sum(self.ntv), len(self.ntv) ) ) logger.info( "ptc sum={}; ptc len ={}, ntc sum={} ntc len {}".format( sum(self.ptc), len(self.ptc), sum(self.ntc), len(self.ntc) ) )
[docs] def cor(self, s): logger.info("shift {}".format(s)) p = pd.DataFrame(self.ptv, index=self.ptc + s) n = self.nn self.X = p[p.index.isin(n.index)] self.compX = p[p.index.isin(n.index) == False] self.Y = n[n.index.isin(p.index)] self.compY = n[n.index.isin(p.index) == False] R = self.L - len(self.ptv) - len(self.ntc) + len(self.X) # using dataframe # !!!!!!!!!!!! multiplication takes into account indexing XY = (self.X * self.Y).sum()[0] XX = self.compX.sum()[0] * self.mn YY = self.compY.sum()[0] * self.mp corr = (XY - XX - YY + self.mp * self.mn * R) / self.ss return corr