#
# This file is part of Sequana software
#
# Copyright (c) 2016-2021 - Sequana Development Team
#
# Distributed under the terms of the 3-clause BSD license.
# The full license is in the LICENSE file, distributed with this software.
#
# website: https://github.com/sequana/sequana
# documentation: http://sequana.readthedocs.io
#
##############################################################################
import colorlog
from tqdm import tqdm
from sequana.enrichment.ontology import Ontology
from sequana.enrichment.plot_go_terms import PlotGOTerms
from sequana.enrichment.quickgo import QuickGOGraph
from sequana.lazy import numpy as np
from sequana.lazy import pandas as pd
from sequana.lazy import pylab
logger = colorlog.getLogger(__name__)
__all__ = ["PantherEnrichment"]
[docs]class PantherEnrichment(Ontology, PlotGOTerms):
"""
By default, we keep only the genes with a adjusted pvalue <= 0.05. The
fold_change threshold is on a log2 scale and set to 0 (meaning no
filtering). Only one value is requested and
used to filter out positive and negative fold change (on the log2
scale). In other word, a log2 fold change threshold of 2 means that we
filter out values between -2 and 2.
If you prefer to work in natural scale, just set the parameter
fc_threshold, which overwrite the log2_fc_threshold parameter.
::
pe = PantherEnrichment("input_file.tsv", taxon=10090, log2_fc_threshold=1)
# compute enrichment for genes down and up
pe.compute_enrichment()
# Results for up case is stored in pe.enrichment
# then, we plot the most mportat go terms
df_up = pe.plot_go_terms("up")
df = pe.plot_go_terms("up", pe.MF)
pe.save_chart(df, "chart_MF_up.png")
# all 3 main ontology
df = pe.plot_go_terms("up")
pe.save_chart(df, "chart_up.png")
e.stats contains some statistics. One important is the list of unmapped
genes. The results from the GO enrichment are stored in the attributes
enrichment. There, we have again adjusted p-value and a fold enrichment,
which can in turn be filtered or not.
You can retrieve the cleaned data using the get_data method.
You can also plot the GO terms that are significantly enriched using::
e.plot_go_terms(['MF', 'CC', 'BP])
This function returns the dataframe used during the plotting.
If you want to look at the up regulated genes only::
e.compute_enrichment(pe.mygenes["up"], 83333)
df = e.plot_go_terms(['MF', 'CC', 'BP'],
log=False, include_negative_enrichment=False,
fontsize=8, sort_by='fold_enrichment',
show_pvalues=True, fdr_threshold=0.05)
The number of genes is limited to about 3100 depending (don't ask me
why, this seem to be a hard-coded limitation on PantherDB website).
In such case, you should add a filter e.g on padj or fold change
"""
def __init__(
self,
gene_lists,
taxon,
requests_per_sec=10,
padj_threshold=0.05,
log2_fc_threshold=0,
fc_threshold=None,
enrichment_fdr=0.05,
annot_col="Name",
):
"""
rnadiff if provided, superseeds the input filename. This is useful for
debugging
"""
Ontology.__init__(self)
PlotGOTerms.__init__(self)
self.gene_lists = gene_lists
self.enrichment_fdr = enrichment_fdr
# users can set the fold change threshold in the log2 scale or normal
# scale.
assert log2_fc_threshold >= 0, "log2 fc_threshold must be >=0"
if fc_threshold is not None:
log2_fc_threshold = pylab.log2(fc_threshold)
from bioservices import panther, quickgo
self.quick_go_graph = QuickGOGraph()
self.panther = panther.Panther(cache=True)
self.valid_taxons = [x["taxon_id"] for x in self.panther.get_supported_genomes()]
self.summary = {}
self._taxon = None
self.taxon = taxon
self.quickgo = quickgo.QuickGO(cache=True)
self.quickgo.requests_per_sec = requests_per_sec
self.quickgo.services.settings.TIMEOUT = 120
self._ancestors = {
"MF": "GO:0003674",
"CC": "GO:0005575",
"BP": "GO:0008150",
"SLIM_MF": "GO:0003674",
"SLIM_CC": "GO:0005575",
"SLIM_BP": "GO:0008150",
}
self.ontologies.extend(
[
"ANNOT_TYPE_ID_PANTHER_GO_SLIM_MF",
"ANNOT_TYPE_ID_PANTHER_GO_SLIM_BP",
"ANNOT_TYPE_ID_PANTHER_GO_SLIM_CC",
"ANNOT_TYPE_ID_PANTHER_PC",
"ANNOT_TYPE_ID_PANTHER_PATHWAY",
"ANNOT_TYPE_ID_REACTOME_PATHWAY",
]
)
self.ontology_aliases.extend(
[
"SLIM_MF",
"SLIM_BP",
"SLIM_CC",
"PROTEIN",
"PANTHER_PATHWAY",
"REACTOME_PATHWAY",
]
)
# panther accepts onyl ~2-3000 genes at max. Let us restrict the analysis
# to the first 2000 genes based on their log2 fold change 2000 + and
# 2000 negatives
msg = "Ignoring DEGs with adjusted p-value > {} and fold change in [{}, {}]".format(
padj_threshold, 1 / (2**log2_fc_threshold), 2**log2_fc_threshold
)
logger.info(msg)
# used in report module
self.summary["fold_change_range"] = [
1 / (2**log2_fc_threshold),
2**log2_fc_threshold,
]
self.summary["padj_threshold"] = padj_threshold
fc_threshold = log2_fc_threshold
for x in sorted(gene_lists.keys()):
N = len(gene_lists[x])
logger.info(f"Starting with {N} genes from category '{x}'")
self.summary["DGE_after_filtering"] = {k: len(v) for k, v in gene_lists.items()}
self.enrichment = {}
self.stats = {}
self.obsolets = []
def _set_taxon(self, taxon):
if taxon not in self.valid_taxons:
raise ValueError(f"taxon {taxon} not in pantherDB. please use one of {self.valid_taxons}")
self.taxon_info = [x for x in self.panther.get_supported_genomes() if x["taxon_id"] == taxon]
self.taxon_info = self.taxon_info[0]
self._taxon_id = taxon
def _get_taxon(self):
return self._taxon_id
taxon = property(_get_taxon, _set_taxon)
[docs] def compute_enrichment(
self,
taxid=None,
ontologies=None,
enrichment_test="FISHER",
correction="FDR",
):
"""
:param enrichment_test: Fisher or Binomial
:param correction: FDR or Bonferonni
The field **number_in_reference** indicates from the reference, the number
of genes that have a given ontolgy term. For instance, 998 genes have
the term. This is stored in **number_in_reference**. If the reference
contains 4391 genes, and you provided 49
genes , the **expected** number of genes that have this ontology term is
49*998/4391 that is 11.1369, which is stored in **"expected**.
Now, if you actually find that 14 out of 49
genes have the term, you need to compare the numbers 11.1369 and 14. Are
they really different ? The ratio 14 / 11.1369 is stored in
**fold_enrichment**. The pvalue and FDR are stored as well.
Some genes may be missing If you provide 50 genes, you may end up with
only 45 being mapped onto panther db database. This may explain some
differences with the expected value.
Fold enrichment is the number_in_list / expected ratio. Another close metric is the
fractional difference: (observed - expected) / expected. This metric is
slighlty less than the fold enrichment
To get the number f genes from the database, simply take one output, and
compute number_in_reference * number_in_list / expected
The fold enrichment is also called odd-ratio.
"""
for category, gene_list in self.gene_lists.items():
logger.info(f"Computing enrichment for category {category}")
self.enrichment[category], self.stats[category] = self._compute_enrichment(
gene_list,
taxid=taxid,
ontologies=ontologies,
enrichment_test=enrichment_test,
correction=correction,
)
[docs] def get_mapping_stats(self):
results = []
for k in self.enrichment.keys():
if self.enrichment[k]:
ontologies = self.enrichment[k].keys()
for o in ontologies:
M = self.enrichment[k][o]["input_list"]["mapped_count"]
U = self.enrichment[k][o]["input_list"]["unmapped_count"]
results.append([k, o, M, U])
df = pd.DataFrame(results)
if not df.empty:
df[4] = 100 * df[2] / (df[3] + df[2])
df[5] = df[2] + df[3]
df.columns = [
"category",
"ontology",
"mapped",
"unmapped",
"mapped_percentage",
"total",
]
return df
def _compute_enrichment(
self,
mygenes,
taxid,
ontologies=None,
enrichment_test="FISHER",
correction="FDR",
):
# taxid=83333 # ecoli
if taxid is None:
taxid = self.taxon
# nice bug from panterdb website. if 'php' is a gene name, no process is done
if "php" in mygenes:
mygenes.remove("php")
if isinstance(mygenes, list):
mygenes = ",".join(mygenes)
if mygenes.count(",") > 2500: # pragma: no cover
logger.warning("Please reduce the list input genes. may fail on pantherb otherwise")
if len(mygenes) <= 2: # pragma: no cover
logger.error(f"Less than 2 genes are found for in the gene set: {mygenes}. No enrichment will be computed")
return None, None
if ontologies is None:
ontologies = self.ontology_aliases
else:
for x in ontologies:
assert x in self.ontology_aliases, f"{x} not valid ontology"
# for each ontology, we will store one key/value item
enrichment = {}
def get_panther_ont(x):
index = self.ontology_aliases.index(x)
return self.ontologies[index]
unclassified = {}
for ontology in ontologies:
unclassified[ontology] = 0
logger.info(f" - Computing enrichment for {ontology}")
try:
del results
except NameError:
pass
results = self.panther.get_enrichment(
mygenes,
taxid,
get_panther_ont(ontology),
enrichment_test=enrichment_test,
correction=correction,
)
count = 0
while count < 2 and results == 404: # pragma: no cover
logger.warning("Panther request failed Trying again")
results = self.panther.get_enrichment(
mygenes,
taxid,
get_panther_ont(ontology),
enrichment_test=enrichment_test,
correction=correction,
)
count += 1
if results == 404: # pragma: no cover
logger.warning("Invalid output from pantherdb (too many genes ?). skipping {}".format(ontology))
enrichment[ontology] = None
continue
##if isinstance(results["result"], dict): # pragma: no cover
# results["result"] = [results["result"]]
# extract the ID and label of the go term and save
# as primary keys
for i, k in enumerate(results["result"]):
if results["result"][i]["term"]["label"] == "UNCLASSIFIED":
unclassified[ontology] += 1
# results["result"][i]['label'] = None
else:
results["result"][i]["id"] = k["term"]["id"]
results["result"][i]["label"] = k["term"]["label"]
# convert to dataframe and remove unclassified labels
df = pd.DataFrame(results["result"])
df = df[df["label"].isnull() == False]
# compute pvalues adjusted
pvalues = df["pValue"].copy()
import statsmodels
import statsmodels.stats.multitest
if correction == "FDR":
fdr = statsmodels.stats.multitest.multipletests(pvalues, method="fdr_bh")[1]
elif correction.lower() == "bonferroni":
fdr = statsmodels.stats.multitest.multipletests(pvalues, method="bonferroni")[1]
df["fdr2"] = fdr
if enrichment_test.lower() == "binomial":
df["fdr"] = fdr
# store all results
enrichment[ontology] = results
enrichment[ontology]["result"] = df
stats = dict([(k, len(v["result"])) for k, v in enrichment.items()])
stats["input_genes"] = len(mygenes.split(","))
try:
unmapped = enrichment[ontologies[0]]["input_list"]["unmapped_id"]
stats["unmapped_genes"] = unmapped
stats["N_unmapped_genes"] = len(unmapped)
except Exception:
stats["unmapped_genes"] = []
stats["N_unmapped_genes"] = 0
stats["unclassified_GO"] = unclassified
# Here, looking at the FDr, it appears that when using bonferroni,
# all FDR are set to zeros. Moreover, when using Fisher tests and
# FDR (supposibly a FDR_BH, the results are noisy as compare to a
# test from statsmodels. Moreover, when using binomial test, the FDR
# is not computed... So, we will recompute the FDR ourself
return enrichment, stats
[docs] def get_data(self, category, ontologies, include_negative_enrichment=True, fdr=0.05):
"""
From all input GO term that have been found and stored in
enrichment[ONTOLOGY]['result'], we keep those with fdr<0.05. We also
exclude UNCLASSIFIED entries. The final dataframe is returned
::
pe.get_data(""up", MF")
"""
df = self._get_data(category, ontologies)
# some extra information for convenience
df["pct_diff_expr"] = df["number_in_list"] * 100 / df["number_in_reference"]
# could happen that all fold enrichment are set to 'NaN'
df = df[df["fold_enrichment"] != "NaN"]
df["log2_fold_enrichment"] = pylab.log2(df["fold_enrichment"])
df["abs_log2_fold_enrichment"] = abs(pylab.log2(df["fold_enrichment"]))
df["expected"] = [int(x) for x in df.expected]
# Some user may want to include GO terms with fold enrichment
# significanyly below 1 or not.
if include_negative_enrichment is False:
df = df.query("fold_enrichment>=1").copy()
logger.info("Found {} GO terms after keeping only positive enrichment".format(len(df)))
# filter out FDR>0.05
df = df.query("fdr<=@fdr").copy()
logger.info("Found {} GO terms after keeping only FDR<{}".format(len(df), fdr))
return df
[docs] def get_functional_classification(self, mygenes, taxon): # pragma: no cover ; too slow
"""Mapping information from pantherDB for the lisf of genes
We also store uniprot persistent id
"""
logger.warning("Very slow. Please wait")
if isinstance(mygenes, list):
mygenes = ",".join(mygenes)
res = self.panther.get_mapping(mygenes, taxon)
res = res["mapped"]
for i, item in tqdm(enumerate(res)):
accession = item["accession"]
res[i]["persistent_id"] = self._get_name_given_accession(accession)
return res
def _get_name_given_accession(self, accession): # pragma: no cover
from bioservices import UniProt
self.uniprot = UniProt(cache=True)
self.uniprot.requests_per_sec = 10
acc = [x for x in accession.split("|") if x.startswith("UniProtKB")]
acc = acc[0].split("=")[1]
res = self.uniprot.get_df(acc, limit=1)
name = res["Gene names (primary )"][0]
return name
[docs] def plot_piechart(self, df):
# Here we show the GO terms that have number in list > 0
# Note, that this is dangerous to look only at this picture without
# the reference plot, which data is not available thourg the pathner API
labels = []
for this in df.query("number_in_list!=0").label.values:
if len(this) > 50:
labels.append(this)
else:
labels.append(this[0:50] + "...")
pylab.pie(df.query("number_in_list!=0").number_in_list, labels=labels)
pylab.gcf().set_layout_engine("tight")
[docs] def save_chart(self, df, filename="chart.png"):
"""
pe = PantherEnrichment("B4052-V1.T1vsT0.complete.xls", fc_threshold=5,
padj_threshold=0.05)
df = pe.plot_go_terms("down", log=True, compute_levels=False)
pe.save_chart(df, "chart.png")
"""
self.quick_go_graph.save_chart(df, filename)
def _get_graph(self, df, ontologies):
return self.quick_go_graph._get_graph(df, ontologies=ontologies)