# Source code for sequana.kegg

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2021 - Sequana Dev Team (https://sequana.readthedocs.io)
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  Website:       https://github.com/sequana/sequana
#  Documentation: http://sequana.readthedocs.io
#  Contributors:  https://github.com/sequana/sequana/graphs/contributors
##############################################################################
import colorlog

from sequana import sequana_data
from sequana.lazy import pandas as pd

logger = colorlog.getLogger(__name__)


__all__ = ["KEGGHelper"]



# NOTE: a stray "[docs]" link marker (apparently left by the documentation export) was removed here.
class KEGGHelper:
    """A simple class to build kegg information

    The organism table is stored in the :attr:`df` attribute. The methods of
    this class read and write the columns ``ID``, ``taxon``, ``shortname``
    and ``definition``.

    ::

        k = KEGGHelper()
        k.search("coli")

    """

    def __init__(self):
        """Load the ``kegg.csv`` table located through ``sequana_data``."""
        self.df = pd.read_csv(sequana_data("kegg.csv"), index_col=0)
        # Empty cells would otherwise be NaN (floats) and break the substring
        # tests performed in search().
        self.df.fillna("", inplace=True)

    def build_csv(self, filename=None, Nmax=None):
        """rebuild the entire dataframe (1hour) and stores as attribute

        Each KEGG organism is queried individually (``gn:<organism>``), hence
        the long running time. Entries that cannot be retrieved are skipped
        with a warning. If the list of organisms itself cannot be retrieved,
        an error is logged and the method returns without touching
        :attr:`df`.

        :param filename: if provided, the new dataframe is also saved as a
            CSV file under that name
        :param Nmax: for testing. Stop once this number of organisms has
            been collected
        """
        logger.info("Retrieving the kegg organisms and their definitions")
        from bioservices import KEGG

        k = KEGG()

        # k.organismIds queries https://rest.kegg.jp/list/organism, which may
        # return an error (e.g. HTTP 400) and make bioservices raise instead of
        # returning a list. Resolve it defensively so build_csv degrades with a
        # clear message rather than an obscure AttributeError.
        try:
            organism_ids = k.organismIds
        except Exception as err:  # noqa: BLE001
            logger.error(f"Could not retrieve KEGG organism list: {err}")
            return

        names = []
        definitions = []
        for item in organism_ids:
            entry = k.get(f"gn:{item}")
            # On a network/server hiccup, bioservices returns an
            # HTTPResponseError (or status code) instead of a string. Skip it.
            if not isinstance(entry, str):
                logger.warning(f"Could not retrieve gn:{item} ({entry!r}); skipping")
                continue
            parsed = k.parse(entry)
            # Only the first element of the NAME field is used.
            names.append(parsed["NAME"][0])
            definitions.append(parsed["ORG_CODE"])
            if Nmax and len(names) >= Nmax:
                break

        # The NAME string is handled as a comma-separated record: first field
        # is the ID, last field is the taxon and, when there are exactly
        # three fields, the middle one is the short name. Split only once.
        # NOTE: the taxon field is stored as is, i.e. including any
        # whitespace that follows the comma.
        records = [name.split(",") for name in names]

        df = pd.DataFrame(
            {
                "ID": [fields[0] for fields in records],
                "taxon": [fields[-1] for fields in records],
                "shortname": [fields[1].strip() if len(fields) == 3 else None for fields in records],
                "definition": definitions,
            }
        )
        # A missing short name (None) becomes an empty string so that the
        # lower-casing below (and later searches) only deal with strings.
        df = df.fillna("")
        df["definition"] = [x.lower() for x in df.definition]
        df["shortname"] = [x.lower() for x in df.shortname]

        self.df = df
        if filename:
            df.to_csv(filename)

    def search(self, pattern):
        """Return the rows of :attr:`df` that contain *pattern*

        The pattern is converted to a string and searched (case-sensitive
        substring match) in the ``definition``, ``shortname`` and ``ID``
        columns. The ``taxon`` column is searched only if none of these
        columns matches.

        :param pattern: text or number to look for
        :return: dataframe with the matching rows. A row that matches in
            several columns is reported only once.
        """
        # the pattern may be a number (e.g. a taxon); compare it as a string
        pattern = str(pattern)

        indices = []
        for column in ("definition", "shortname", "ID"):
            mask = [pattern in x for x in self.df[column]]
            indices.extend(self.df[mask].index)

        if not indices:
            # maybe it is a taxon ID ?
            mask = [pattern in str(x) for x in self.df.taxon]
            indices = list(self.df[mask].index)

        # A row can match in more than one column: keep a single copy of each
        # label while preserving the order of first appearance.
        indices = list(dict.fromkeys(indices))

        results = self.df.loc[indices]
        return results