Source code for sequana.kegg

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2021 - Sequana Dev Team (https://sequana.readthedocs.io)
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  Website:       https://github.com/sequana/sequana
#  Documentation: http://sequana.readthedocs.io
#  Contributors:  https://github.com/sequana/sequana/graphs/contributors
##############################################################################
import colorlog

from sequana import sequana_data
from sequana.lazy import pandas as pd

logger = colorlog.getLogger(__name__)


__all__ = ["KEGGHelper"]


class KEGGHelper:
    """A simple class to build and query KEGG organism information.

    The organism table is held in :attr:`df`, a dataframe that provides
    the columns ``ID``, ``taxon``, ``shortname`` and ``definition`` (these
    are the columns written by :meth:`build_csv` and read by
    :meth:`search`).
    """

    def __init__(self):
        """Load the ``kegg.csv`` table shipped with sequana into ``self.df``."""
        # Missing cells become empty strings so that the substring tests in
        # search() never have to deal with NaN values.
        self.df = pd.read_csv(sequana_data("kegg.csv"), index_col=0).fillna("")

    def build_csv(self, filename=None, Nmax=None):
        """rebuild the entire dataframe (1hour) and stores as attribute

        :param filename: if provided, the new dataframe is also saved as CSV
            in that file.
        :param Nmax: for testing; stop after that many organisms.
        """
        logger.info("Retrieving the kegg organisms and their definitions")
        from bioservices import KEGG

        k = KEGG()
        results = []
        definition = []
        for i, item in enumerate(k.organismIds):
            # Fetch and parse each genome entry only once: every call is a
            # network request and both fields come from the same entry.
            entry = k.parse(k.get(f"gn:{item}"))
            results.append(entry["NAME"])
            definition.append(entry["ORG_CODE"])
            logger.info("Retrieved organism %s (Nmax=%s)", i, Nmax)
            if Nmax and i + 1 >= Nmax:
                break

        # Only the first element of each NAME record is used; it is a
        # comma-separated string that is split once and reused below.
        fields = [record[0].split(",") for record in results]
        IDs = [parts[0] for parts in fields]
        taxon = [parts[-1] for parts in fields]
        # The middle field (short name) is only taken from 3-field records.
        names = [parts[1].strip() if len(parts) == 3 else None for parts in fields]

        df = pd.DataFrame({"ID": IDs, "taxon": taxon, "shortname": names, "definition": definition})
        df = df.fillna("")
        df["definition"] = [x.lower() for x in df.definition]
        df["shortname"] = [x.lower() for x in df.shortname]
        self.df = df

        if filename:
            df.to_csv(filename)

    def search(self, pattern):
        """Return the rows of the organism table that contain *pattern*.

        The pattern is converted to a string and searched (case-sensitive
        substring test) in the ``definition``, ``shortname`` and ``ID``
        columns. Only if none of them matches is the ``taxon`` column
        searched as well. Note that tables produced by :meth:`build_csv`
        hold lower-case definitions and short names.

        :param pattern: text (or number, e.g. a taxon ID) to look for.
        :return: dataframe with the matching rows. Each row appears once,
            ordered by first match (definition, then shortname, then ID).
        """
        pattern = str(pattern)

        indices = []
        for column in ("definition", "shortname", "ID"):
            mask = [pattern in value for value in self.df[column]]
            indices.extend(self.df[mask].index)

        if not indices:
            # maybe it is a taxon ID ?
            mask = [pattern in str(value) for value in self.df.taxon]
            indices = list(self.df[mask].index)

        # A row may match in several columns; keep it once while preserving
        # the order in which it was first found.
        indices = list(dict.fromkeys(indices))
        return self.df.loc[indices]