Source code for sequana.idr

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2023 - Sequana Development Team
#
#  File author(s): Sequana team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################

import colorlog

from sequana.lazy import pylab
from sequana.utils.pandas import PandasReader

logger = colorlog.getLogger(__name__)


__all__ = ["IDR"]


[docs] class IDR: """Reader for the output generated by idr package Can read the narrow or broad output transparently. Note that signalValue = rep1_signal + rep2_signal The score columns contains the scaled IDR value, min(int(log2(-125IDR), 1000). This means that peaks with an IDR of 0 have a score of 1000. A peak with an IDR of 0.05 has a score of int(-125log2(0.05)) = 540. Finally, a peaks with an IDR of 1.0 have a score of 0. Examples:: IDR, score 0.0039, 1000 0.01, 830 0.05, 540 0.1, 415 0.5, 125 This allows to differentiate those that crosses an IDR or 0.05. The final IDR is stored in global_idr """ def __init__(self, filename, threshold=0.05): self.df = PandasReader(filename, sep="\t", header=None).df self.threshold = threshold narrow_columns = [ "chrom", "start", "end", "region_name", "score", "strand", "signalValue", "pvalue", "qvalue", "summit", "local_idr", "global_idr", "rep1_chrom_start", "rep1_chrom_end", "rep1_signal", "rep1_summit", "rep2_chrom_start", "rep2_chrom_end", "rep2_signal", "rep2_summit", ] broad_columns = [ "chrom", "start", "end", "region_name", "score", "strand", "signalValue", "pvalue", "qvalue", "local_idr", "global_idr", "rep1_chrom_start", "rep1_chrom_end", "rep1_signal", "rep2_chrom_start", "rep2_chrom_end", "rep2_signal", ] try: self.df.columns = narrow_columns self._mode = "narrow" except Exception: try: self.df.columns = broad_columns self._mode = "broad" except ValueError: # pragma: no cover (empty dataframe) pass if len(self.df): # add ranks for rep1/rep2 self.df["rep1_rank"] = self.df["rep1_signal"].rank(ascending=False) self.df["rep2_rank"] = self.df["rep2_signal"].rank(ascending=False) self.df["idr"] = 10 ** -self.df["local_idr"] def __len__(self): return len(self.df) def _get_mode(self): return self._mode mode = property(_get_mode) def _get_N_significant_peaks(self): if len(self.df) == 0: # pragma: no cover return 0 else: return len(self.df.query("idr<@self.threshold")) N_significant_peaks = property(_get_N_significant_peaks)
[docs] def IDR2score(self, IDR): if IDR == 0: return 1000 return min(int(-125 * pylab.log2(IDR)), 1000)
[docs] def score2IDR(self, score): return 2 ** (score / -125)
[docs] def plot_ranks(self, filename=None, savefig=False): # ranks # the *score* columns contains the scaled IDR value, min(int(log2(-125IDR), 1000). # e.g. peaks with an IDR of 0 have a score of 1000, idr 0.05 have a score of # int(-125log2(0.05)) = 540, and idr 1.0 has a score of 0. df1 = self.df.query("score>540") df2 = self.df.query("score<=540") pylab.clf() pylab.plot(df1.rep1_rank, df1.rep2_rank, "ko", alpha=0.5, label="<0.05 IDR") pylab.plot(df2.rep1_rank, df2.rep2_rank, "ro", alpha=0.5, label=">=0.05 IDR") pylab.xlabel("Peak rank - replicate 1") pylab.ylabel("Peak rank - replicate 2") N = len(self.df) pylab.plot([0, N], [0, N], color="blue", alpha=0.5, ls="--") pylab.legend(loc="lower right") if savefig: pylab.savefig(filename)
[docs] def plot_scores(self, filename=None, savefig=False): # scores pylab.clf() pylab.plot( pylab.log10(self.df.query("score>540")["rep1_signal"]), pylab.log10(self.df.query("score>540")["rep2_signal"]), "ko", alpha=0.5, label="<0.05 IDR", ) pylab.plot( pylab.log10(self.df.query("score<540")["rep1_signal"]), pylab.log10(self.df.query("score<540")["rep2_signal"]), "ro", alpha=0.5, label=">=0.05 IDR", ) N = pylab.ylim()[1] pylab.plot([0, N], [0, N], color="blue", alpha=0.5, ls="--") pylab.xlabel("Rep1 log10 score") pylab.ylabel("Rep2 log10 score") pylab.legend(loc="lower right") if savefig: pylab.savefig(filename)
[docs] def plot_rank_vs_idr_score(self, filename=None, savefig=False): # rank versus IDR scores f, axes = pylab.subplots(2, 1) df = self.df axes[0].plot( range(len(df)), df.sort_values(by="rep1_rank", ascending=False)["local_idr"], "o", ) axes[0].set_ylabel("log10 IDR for replicate 1") axes[0].axvline(len(self.df) - self.N_significant_peaks, color="b", ls="--") axes[1].plot( range(len(df)), df.sort_values(by="rep2_rank", ascending=False)["local_idr"], "ro", ) axes[1].set_ylabel("log10 IDR for replicate 2") axes[1].axvline(len(self.df) - self.N_significant_peaks, color="b", ls="--") if savefig: pylab.savefig(filename)
[docs] def plot_idr_vs_peaks(self, filename=None, savefig=False): pylab.clf() X1 = pylab.linspace(0, self.threshold, 100) X2 = pylab.linspace(self.threshold, 1, 100) # convert local idr to proba df1 = self.df.query("idr<@self.threshold") df2 = self.df.query("idr>=@self.threshold") pylab.plot([sum(df1["idr"] < x) for x in X1], X1, "-", color="r", lw=2) shift = len(df1) pylab.plot([shift + sum(df2["idr"] < x) for x in X2], X2, "-", color="k", lw=2) pylab.xlabel("Number of significant peaks") pylab.ylabel("IDR") pylab.axhline(0.05, color="b", ls="--") pylab.axvline(self.N_significant_peaks, color="b", ls="--") if savefig: pylab.savefig(filename)