#
# This file is part of Sequana software
#
# Copyright (c) 2016-2023 - Sequana Development Team
#
# File author(s):
# Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
#
# Distributed under the terms of the 3-clause BSD license.
# The full license is in the LICENSE file, distributed with this software.
#
# website: https://github.com/sequana/sequana
# documentation: http://sequana.readthedocs.io
#
##############################################################################
import colorlog
from sequana.lazy import numpy as np
from sequana.lazy import pandas as pd
from sequana.lazy import pylab
from sequana.utils.pandas import PandasReader
logger = colorlog.getLogger(__name__)
__all__ = ["PhantomPeaksReader", "Phantom"]
[docs]
class PhantomPeaksReader:
"""Manipulate output of PhantomPeaks.
The metrics file is tabulated:
* Filename
* numReads: effective sequencing depth i.e. total number of mapped reads in the input file.
* estFragLen: comma separated strand cross-correlation peak(s) in decreasing order of correlation.
In almost all cases, the top (first) value in the list represents the predominant fragment length.
* corr estFragLen: comma separated strand cross-correlation value(s) in decreasing order
(col3 follows the same order).
* phantomPeak: Read length/phantom peak strand shift.
* corr phantomPeak: Correlation value at phantom peak.
* argmin corr: strand shift at which cross-correlation is lowest.
* min corr: minimum value of cross-correlation.
* Normalized strand cross-correlation coefficient (NSC) = COL4 / COL8.
NSC > 1.1 is good.
* Relative strand cross-correlation coefficient (RSC) = (COL4 - COL8) / (COL6 - COL8);
RSC = 0 means no signal, < 1 low quality and > 1 means high enrichment. Aim for RSC > 0.8.
* QualityTag: Quality tag based on thresholded RSC (codes: -2:veryLow; -1:Low; 0:Medium; 1:High; 2:veryHigh).
"""
_columns = [
"filename",
"num_reads",
"estimated_fragment_length",
"corr",
"phantom_peak",
"corr_phantom_peak",
"argmin_corr",
"min_corr",
"NSC",
"RSC",
"quality_tag",
]
def __init__(self, filename):
self.df = PandasReader(filename, sep="\t", header=None).df
self.df.columns = self._columns
[docs]
def read(self, filename):
df = PandasReader(filename, sep="\t", header=None).df
df.columns = self._columns
self.df = pd.concat([self.df, df], axis=0)
[docs]
def plot_RSC(self):
self.df.RSC.plot(kind="bar")
pylab.axhline(0.8, lw=2, color="r", ls="--")
[docs]
class Phantom:
"""Read alignment files and generated phantom peaks.
This class reads **align** files generated from bam files using this set of commands::
samtools view -F 0x0204 -o | awk 'BEGIN{FS="\t";OFS="\t"} {if (and($2,16) > 0) {print $3,($4-1),($4-1+length($10)),"N","1000","-"} else {print $3,($4-1),($4-1+length($10)),"N","1000","+"}}' - > test.align
prints:
- the reference chromosome,
- the starting position-1
- position -1 + sequence length
- N
- 1000
- + if flag is 16 (or equivalent) otherwise prints -
::
c = Phantom()
c.chromosomes
data = c.get_data('NC_002506.1')
mask = c.remove_anomalies(data)
data = data[mask]
c.scc(data)
X = range(-100,100)
Y = [c.cor(x) for x in X]
plot([x*5 for x in X], Y)
c = chipseq.Phantom(binning=10, start=-500, stop=500)
c.read_align("test.align")
results, df = c.run()
c.stats(results, df)
"""
def __init__(self, bamfile=None, binning=5, start=-500, stop=1500):
self.start = start
self.stop = stop
self.binning = binning
self.df = None
[docs]
def read_align(self, readfile):
self.df = PandasReader(
readfile,
sep="\t",
header=None,
names=["ref", "start", "end", "dummy", "quality", "strand"],
dtype={
"ref": str,
"start": "Int64",
"end": "Int64",
"dummy": str,
"quality": "Int64",
"strand": str,
},
).df
self.read_length = round(pylab.median(self.df["end"] - self.df["start"]))
self.chromosomes = self.df["ref"].unique()
[docs]
def run(self):
## 10% of the time in self.get_data and 90 in cor()
if self.df is None:
logger.error("call read_align() method to read alignement file")
return
m = int(self.start / self.binning)
M = int(self.stop / self.binning)
# because bins is set to 5, we actually go from m*5 to M*5
X = range(m, M + 1, 1)
Xreal = np.arange(m * self.binning, (M + 1) * self.binning, self.binning)
results = {}
for chrom in self.chromosomes:
# logger.info("Processing {}".format(chrom))
data = self.get_data(chrom)
L = len(data)
self.scc(data)
# shift correlation
Y = [self.cor(x) for x in X]
results[chrom] = {"data_length": L, "Y": np.array(Y), "X": Xreal}
# weighted average usng orginal length of the chrmosomes
weights = np.array([results[x]["data_length"] for x in self.chromosomes])
weights = weights / sum(weights)
self.results = results
self.weights = weights
# now the weighted cross correlation
df_avc = pd.DataFrame([w * results[x]["Y"] for w, x in zip(weights, self.chromosomes)])
df_avc = df_avc.T
df_avc.index = Xreal
return results, df_avc
[docs]
def stats(self, results, df_avc, bw=1):
stats = {}
# some simple stars about the reads
stats["read_fragments"] = len(self.df)
stats["fragment_length"] = self.read_length
logger.info("Read {} fragments".format(stats["read_fragments"]))
logger.info("ChIP data mean length: {}".format(self.read_length))
# average cross correlation across all chromosomes
# df_avc.sum(axis=1).plot()
df = df_avc.sum(axis=1)
corr_max = df.max()
shift_max = df.idxmax()
# note that in phantomPeak, they use the last value as min... not the
# actual min. Not very important.
corr_min = df.min()
shift_min = df.idxmin()
logger.info("Maximum cross-correlation value: {:.5f}".format(corr_max))
logger.info("Maximum cross-correlation shift: {}".format(shift_max))
logger.info("Minimum cross-correlation value: {:.5f}".format(corr_min))
logger.info("Minimum cross-correlation shift: {}".format(shift_min))
stats["shift_max"] = int(shift_max) # to make it json serialisable
stats["corr_max"] = corr_max
# original code phantomPeak but always equal to 1 it range max >5 ??
# default is 500 so sbw=1 whatsoever
# sbw = 2 * floor(ceil(5/15000) / 2) + 1
sbw = 1
# here we could use a rolling mean
# df.rolling(window=5, center=True).mean()
# so following runnin mean is useless
# cc$y <- runmean(cc$y,sbw,alg="fast")
#
# again, computation of bw but always equal to 1 ....
# Compute cross-correlation peak
# bw <- ceiling(2/iparams$sep.range[2]) # crosscorr[i] is compared to crosscorr[i+/-bw] to find peaks
# bw = 1
# search for local peaks within bandwidth of bw = 1
peakidx = df.diff(periods=bw) > 0
peakidx = peakidx.astype(int).diff(periods=bw) == -1
# the final bw points are NA and filled with False
peakidx = peakidx.shift(-bw).fillna(False)
df_peaks = df[peakidx]
# when searching for max, exclude peaks from the excluded region
exclusion_range = [10, self.read_length + 10]
mask = np.logical_or(df_peaks.index < exclusion_range[0], df_peaks.index > exclusion_range[1])
df_peaks = df_peaks[mask]
#
max_peak = df_peaks.max()
shift_peak = df_peaks.idxmax()
# now, we select peaks that are at least 90% of main peak and with shift
# higher than main shift. why higher ?
mask = np.logical_and(df_peaks > max_peak * 0.9, df_peaks.index >= shift_peak)
best_df_peaks = df_peaks[mask]
best = best_df_peaks.sort_values(ascending=False)[0:3]
values = ",".join(["{:.5f}".format(x) for x in best.values])
pos = ",".join([str(x) for x in best.index])
logger.info("Top 3 cross-correlation values: {}".format(values))
logger.info("Top 3 estimates for fragment length: {}".format(pos))
# now the real window half size according to phantom peaks, not spp ...
# min + (max-min)/3
threshold = (df_peaks.max() - corr_min) / 3 + corr_min
# coming back to real cross correlation, identify peak in window
# readlength +- 2*binning !! not symmetry in phantompeak
# x >= ( chip.data$read.length - round(2*binning) &
# x <= ( chip.data$read.length + round(1.5*binning)
binning = self.binning
ph_min = self.read_length - round(2 * binning)
ph_max = self.read_length + round(1.5 * binning)
phantom = df[np.logical_and(df.index >= ph_min, df.index <= ph_max)]
logger.info("Phantom peak range detection:{}-{}".format(ph_min, ph_max))
logger.info("Phantom peak location:{}".format(phantom.idxmax()))
logger.info("Phantom peak Correlation: {:.5f}".format(phantom.max()))
stats["phantom_corr"] = phantom.max()
stats["phantom_location"] = int(phantom.idxmax()) # for json
NSC = df_peaks.max() / phantom.max()
# error in phamtompeaks ??
# when computing RSC, the min is actually set as the last value on the RHS so
# phantom_coeff = df_peaks.max() / df.min()
# is
# phantom_coeff = df_peaks.max() / df.iloc[-1]
NSC_spp = df_peaks.max() / df.iloc[-1]
logger.info("Normalized Strand cross-correlation coefficient (NSC): {:.5f} [{:.5f}]".format(NSC, NSC_spp))
RSC = (df_peaks.max() - df.min()) / (phantom.max() - df.min())
RSC_spp = (df_peaks.max() - df.iloc[-1]) / (phantom.max() - df.iloc[-1])
# We could use a median to pick up a smooth value
# X = df_peaks.rolling(5).median().iloc[-5]
# but should alos be done for other metrics
# RSC_median = (df_peaks.max() - X) / (phantom.max() - X)
logger.info("Relative Strand cross-correlation Coefficient (RSC): {:.5f} [{:.5f}]".format(RSC, RSC_spp))
if RSC > 0 and RSC < 0.25:
tag = -2
elif RSC >= 0.25 and RSC < 0.5:
tag = -1
elif RSC >= 0.5 and RSC < 1:
tag = 0
elif RSC >= 1 and RSC < 1.5:
tag = 1
elif RSC >= 1.5:
tag = 2
logger.info("Phantom Peak Quality Tag: {}".format(tag))
pylab.clf()
df.plot()
##df_peaks.plot(marker="o", lw=0)
ylim = pylab.ylim()
# whs = df[df > threshold].index.max()
# pylab.axvline(whs, ls='--', color='k', lw=1)
Y0, Y1 = pylab.ylim()
pylab.plot(
[phantom.idxmax(), phantom.idxmax()],
[Y0, phantom.max()],
ls="--",
color="k",
lw=1,
)
pylab.plot([df.idxmax(), df.idxmax()], [Y0, df.max()], ls="--", color="r", lw=2)
# pylab.fill_betweenx(ylim, 10,85, color='grey', alpha=0.5)
pylab.ylim(ylim)
pylab.ylabel("Cross-correlation")
pylab.xlabel("strand-shift: {}bp\nNSC={:.5f}, RSC={:.5f}, Qtag={}".format(best.index[0], NSC, RSC, tag))
pylab.xlim(self.start, self.stop)
pylab.grid(True, zorder=-20)
try:
pylab.gcf().set_layout_engine("tight")
except:
pass
stats["NSC"] = NSC
stats["RSC"] = RSC
stats["Qtag"] = tag
return stats
[docs]
def get_data(self, chrname, remove_anomalies=True):
# Could be done once for all in read_alignment
df = self.df.query("ref==@chrname")
# first the fragment position, shifting - strand by fragment length
data = np.array([x if z == "+" else -y for x, y, z in zip(df["start"], df["end"], df["strand"])])
# sort by absolute position
res = pd.DataFrame(data)
res.columns = ["x"]
res["abs"] = res["x"].abs()
res = res.sort_values("abs")
del res["abs"]
if remove_anomalies:
mask = self.remove_anomalies(res)
res = res[mask]
return res
[docs]
def remove_anomalies(self, data, bin=1, trim_fraction=1e-3, z=5, return_indecies=False):
zo = z * 3
x = data["x"]
# the frequencies, sorted from smaller to larger
tt = np.floor(x / bin).value_counts().sort_values()
# sort and select first 99.9%
# floor to agree with phantom
stt = tt.iloc[0 : int(np.floor(len(tt) * (1 - trim_fraction)))]
mtc = stt.mean()
var_base = 0.1
tcd = np.sqrt(stt.var() + var_base)
thr = max(1, np.ceil(mtc + z * tcd))
thr_o = max(1, np.ceil(mtc + zo * tcd))
# filter tt
tt = tt[tt > thr]
# get + and - tags
tp = tt.index
pti = set(tp[tp > 0])
pti2 = set([-x for x in tp[tp < 0]])
it = pti.intersection(pti2)
it = sorted(it.union(set(tt[tt > thr_o].index)))
# sit <- c(it,(-1)*it);
# sit = [-x for x in it] + list(it)
sit = list(it)[::-1] + [-x for x in list(it)[::-1]]
if bin > 1:
# From 0 to 5+1 to agree with phantom R code
sit = sorted([x * 5 + y for y in range(0, 6) for x in sit])
sit = set(sit)
return [False if x in sit else True for x in data["x"]]
[docs]
def scc(self, data, llim=10):
tt = (np.sign(data["x"]) * np.floor(data["x"].abs() / self.binning + 0.5)).value_counts()
mu = tt.mean()
tt = tt[tt < llim * mu]
#
tc = tt.index
pdata = tt[tc > 0]
ndata = tt[tc < 0]
ntv = ndata.values
ptv = pdata.values
self.pdata = pdata
self.ndata = ndata
r1 = min(-1 * ndata.index.max(), pdata.index.min())
r2 = max(-1 * ndata.index.min(), pdata.index.max())
l = r2 - r1 + 1
self.L = l
mp = ptv.sum() * self.binning / l
mn = ntv.sum() * self.binning / l
ptv = ptv - mp
ntv = ntv - mn
self.mp = mp
self.mn = mn
ntc = -tt.index[[True if x < 0 else False for x in tt.index]].values
ptc = tt.index[[True if x > 0 else False for x in tt.index]].values
self.ntc = ntc
self.ptc = ptc
self.ntv = ntv
self.ptv = ptv
ss = np.sqrt((sum(ptv**2) + (l - len(ptv)) * mp**2) * (sum(ntv**2) + (l - len(ntv)) * mn**2))
self.ss = ss
self.nn = pd.DataFrame(self.ntv, index=self.ntc)
logger.info("mp={}; mn={}, ss={}".format(self.mp, self.mn, self.ss))
logger.info(
"ptv sum={}; ptv len ={}, ntv sum={} ntv len {}".format(
sum(self.ptv), len(self.ptv), sum(self.ntv), len(self.ntv)
)
)
logger.info(
"ptc sum={}; ptc len ={}, ntc sum={} ntc len {}".format(
sum(self.ptc), len(self.ptc), sum(self.ntc), len(self.ntc)
)
)
[docs]
def cor(self, s):
logger.info("shift {}".format(s))
p = pd.DataFrame(self.ptv, index=self.ptc + s)
n = self.nn
self.X = p[p.index.isin(n.index)]
self.compX = p[p.index.isin(n.index) == False]
self.Y = n[n.index.isin(p.index)]
self.compY = n[n.index.isin(p.index) == False]
R = self.L - len(self.ptv) - len(self.ntc) + len(self.X)
# using dataframe
# !!!!!!!!!!!! multiplication takes into account indexing
XY = (self.X * self.Y).sum()[0]
XX = self.compX.sum()[0] * self.mn
YY = self.compY.sum()[0] * self.mp
corr = (XY - XX - YY + self.mp * self.mn * R) / self.ss
return corr