Source code for sequana.macs3

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2023 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################

import colorlog

from sequana.lazy import pandas as pd
from sequana.lazy import pylab
from sequana.utils.pandas import PandasReader
from sequana.viz.venn import plot_venn

logger = colorlog.getLogger(__name__)


__all__ = ["MACS3Reader", "PeakConsensus"]


[docs]class MACS3Reader: """This class reads output of macs3 tool :: from sequana import MACS3Reader mr = MACS3Reader(data) mr.df mr.plot_volcano() If input file ends in .xls, we assume this is a NAME_peaks.xls file. It is therefore a tabulated file where columns are: - chromosome name - start position of peak - end position of peak - length of peak region - absolute peak summit position - pileup height at peak summit - -log10(pvalue) for the peak summit (e.g. pvalue =1e-10, then this value should be 10) - fold enrichment for this peak summit against random Poisson distribution with local lambda, - -log10(qvalue) at peak summit Note that coordinates in XLS is 1-based which is different from BED format that follows. When --broad is enabled for broad peak calling, the pileup, p-value, q-value, and fold change in the XLS file will be the mean value across the entire peak region, since peak summit won't be called in broad peak calling mode. NAME_peaks.narrowPeak is BED6+4 format file which contains the peak locations together with peak summit, p-value, and q-value. The fifth column is the **score** calculate as int(-10*log10_pvalue) or int(-10*log10_pvalue) where log10-pvalue/log10-qvalue is the 8th or 9th column depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff. The 7th columns stores the fold-change at peak summit. The 8th is -log10pvalue at peak summit and the 9th is -log10qvalue at peak summit. The 10th is the relative summit position to peak start equivalent to absolute peak summit position in the xls file. NAME_peaks.broadPeak is in BED6+3 format which is similar to narrowPeak file, except for missing the 10th column for annotating peak summits. This file and the gappedPeak file will only be available when --broad is enabled. Since in the broad peak calling mode, the peak summit won't be called, the values in the 5th, and 7-9th columns are the mean value across all positions in the peak region. Refer to narrowPeak if you want to fix the value issue in the 5th column .. reference:: https://github.com/macs3-project/MACS/blob/master/docs/callpeak.md """ def __init__(self, filename): # .xls is not stuctured similarly to .narrowPeak if filename.endswith(".xls"): # self.df = PandasReader(filename, comment="#", sep="\t").df # narrow and broad files have this header in common and can be checked. for x in [ "chr", "start", "end", "abs_summit", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name", ]: # we could have also pileup depending on the user's options assert x in self.df.columns self.df["stop"] = self.df["end"] elif filename.endswith("narrowPeak"): columns = [ "chr", "start", "stop", "name", "score", "NA", "fold_enrichment", "-log10(pvalue)", "-log10(qvalue)", "abs_summit_from_start", ] self.df = PandasReader(filename, header=None, sep="\t", columns=columns).df elif filename.endswith("broadPeak"): columns = [ "chr", "start", "stop", "name", "score", "NA", "fold_enrichment", "-log10(pvalue)", "-log10(qvalue)", ] self.df = PandasReader(filename, header=None, sep="\t", columns=columns).df self.df["length"] = self.df["stop"] - self.df["start"] # def __len__(self): return len(self.df)
[docs] def plot_volcano(self, plotly=False, marker_color="b", title=""): """Plots -log10 p-value versus fold change of all peaks""" if plotly: from plotly import express as px df = self.df.copy() df["log_adj_pvalue"] = self.df["-log10(pvalue)"] df["log2FoldChange"] = pylab.log2(self.df["fold_enrichment"]) df["info"] = df["chr"] + ":" + df["start"].astype(str) + "-" + df["stop"].astype(str) fig = px.scatter( df, x="log2FoldChange", y="log_adj_pvalue", hover_name="info", log_y=False, color="length", height=600, title=title, labels={"log2_fold_enrichment": "log10 p-value"}, ) return fig pylab.scatter( pylab.log2(self.df["fold_enrichment"]), self.df["-log10(pvalue)"], marker="o", alpha=0.5, color=marker_color, lw=0, s=self.df["length"] / self.df["length"].max() * 400, ) pylab.xlabel("Fold enrichment") pylab.ylabel("log10 pvalue") pylab.title(title)
[docs]class PeakConsensus: def __init__(self, f1, f2): self.df1 = MACS3Reader(f1).df self.df1["category"] = "first" self.df2 = MACS3Reader(f2).df self.df2["category"] = "second" self.df_merged = self.merge()
[docs] def merge(self, overlap=0.2): df = pd.concat([self.df1, self.df2]).sort_values(["chr", "start"]) # if overlap at least one base, we merge the peaks and label them with # common information, otherwise we report the original peak merged = [] prev = None N1 = 0 N2 = 0 N12 = 0 skip_next = True for k, current in df.iterrows(): if skip_next: prev = current skip_next = False continue # if current overlaps the prev start or end, there is overlap # or if current included in prev there current and prev overlaps if current["start"] <= prev["start"] and current["stop"] >= prev["start"]: overlap = True N12 += 1 elif current["start"] <= prev["stop"] and current["stop"] >= prev["stop"]: overlap = True N12 += 1 elif current["start"] >= prev["start"] and current["stop"] <= prev["stop"]: overlap = True N12 += 1 else: overlap = False if prev["name"].startswith("1_vs_6_7"): N1 += 1 elif prev["name"].startswith("2_vs_6_7"): N2 += 1 if overlap: m = min(current["start"], prev["start"]) M = max(current["stop"], prev["stop"]) data = current.copy() data["start"] = m data["stop"] = M data["category"] = "both" merged.append(data) skip_next = True else: m = min(current["start"], prev["start"]) M = max(current["stop"], prev["stop"]) merged.append(prev) skip_next = False prev = current df = pd.DataFrame(merged) df = df.reset_index(drop=True) return df
[docs] def plot_venn(self, title="", labels=[]): plot_venn( ( set(self.df_merged.query("category in ['first', 'both']").index), set(self.df_merged.query("category in ['second', 'both']").index), ), labels=(("first", "second")), )
[docs] def to_saf(self, filename): """For now, all strand are categorised as strand + . strand not used in the pipeline for now.""" df = self.df_merged.reset_index() df["strand"] = "+" df["GeneID"] = df["index"] df[["GeneID", "chr", "start", "stop", "strand", "category"]].to_csv(filename, sep="\t", index=None)
[docs] def to_bed(self, filename): # output to be used by homer # annotatePeaks.pl file.bed file.fa -gid -gff file.gff # GeneID seems to have to start with Interval_ ?? df = self.df_merged.reset_index() df["strand"] = "+" df["GeneID"] = ["Interval_{}".format(x) for x in df["index"]] mapper = {"both": 3, "first": 1, "second": 2} df["score"] = [mapper[x] for x in df["category"]] df[["chr", "start", "stop", "GeneID", "score", "strand"]].to_csv(filename, sep="\t", index=None, header=False)