Source code for sequana.enrichment.mart

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2021 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
import io
import time

import colorlog

from sequana.lazy import pandas as pd

logger = colorlog.getLogger(__name__)


__all__ = ["Mart"]


# not tested. This is tested through bioservices and takes a long time
class Mart:  # pragma: no cover
    """Query a BioMart dataset through bioservices and save the result as CSV.

    ::

        conv = Mart(dataset="mmusculus_gene_ensembl")
        # you could choose hsapiens_gene_ensembl for instance
        df = conv.query()
        df.set_index("ensembl_gene_id")
        conv.save(df)

    The file can now be loaded in e.g.
    :class:`~sequana.enrichment.kegg.KeggPathwayEnrichment`
    as a mapper of the Ensembl identifier to external names understood by Kegg.

    For fungi (cryptococcus), you can use::

        m = Mart(host="fungi.ensembl.org", dataset="cneoformans_eg_gene", mart="fungi_mart")
        m.query(['cneoformans_eg_gene'])

    See more information on https://bioservices.readthedocs.io
    """

    # Attributes requested by :meth:`query` when the caller provides none.
    # Stored as a tuple so the default can never be mutated between calls.
    _DEFAULT_ATTRIBUTES = (
        "ensembl_gene_id",
        "go_id",
        "entrezgene_id",
        "external_gene_name",
    )

    def __init__(self, dataset, mart="ENSEMBL_MART_ENSEMBL", host=None):
        """Connect to BioMart and select a dataset.

        :param dataset: name of the dataset to use; it must appear in the
            ``name`` column of :attr:`datasets`.
        :param mart: name of the mart whose datasets are listed.
        :param host: host forwarded to :class:`bioservices.BioMart`.

        If the dataset cannot be selected, a critical message is logged and
        the instance is still returned, with :attr:`dataset` left to ``None``.
        """
        logger.info("Init Mart")
        # Imported here so that bioservices is only loaded when a Mart is built.
        from bioservices import BioMart

        self.biomart = BioMart(host=host)
        self.datasets = self.biomart.get_datasets(mart)
        self._dataset = None
        try:
            self.dataset = dataset
        except Exception:
            # Best effort: keep the instance usable for inspecting
            # self.datasets, but do not swallow KeyboardInterrupt/SystemExit
            # as a bare ``except`` would.
            logger.critical("Invalid dataset. checks datasets attributes")

    def _set_dataset(self, dataset):
        """Validate *dataset* and load its attributes and filters.

        :raises ValueError: if *dataset* is not in the ``name`` column of
            :attr:`datasets`.
        """
        if dataset not in self.datasets["name"].values:
            raise ValueError("Invalid dataset {}. Choose amongst {}".format(dataset, self.datasets))
        self._dataset = dataset
        self.attributes = self.biomart.attributes(dataset=dataset)
        self.filters = self.biomart.filters(dataset=dataset)

    def _get_dataset(self):
        """Return the name of the currently selected dataset (or ``None``)."""
        return self._dataset

    dataset = property(
        _get_dataset,
        _set_dataset,
        doc="Name of the selected dataset; setting it reloads attributes and filters.",
    )

    def query(self, attributes=None):
        """Retrieve the requested attributes of the current dataset.

        :param attributes: iterable of attribute names. Defaults to
            ``ensembl_gene_id``, ``go_id``, ``entrezgene_id`` and
            ``external_gene_name``.
        :return: a dataframe whose columns are named after *attributes*,
            in the same order.
        :raises ValueError: if one of the attributes is not available in the
            current dataset.
        """
        if attributes is None:
            attributes = self._DEFAULT_ATTRIBUTES
        # Materialise once: the names are iterated here and reused below as
        # column labels.
        attributes = list(attributes)

        logger.info("Please wait. This may take a while depending on your connection")
        self.biomart.new_query()
        self.biomart.add_dataset_to_xml(self.dataset)
        for attribute in attributes:
            if attribute not in self.attributes:
                msg = "{} not found in the dataset {}".format(attribute, self.dataset)
                logger.error(msg)
                raise ValueError(msg)
            self.biomart.add_attribute_to_xml(attribute)
        xml = self.biomart.get_xml()
        results = self.biomart.query(xml)

        # NOTE(review): read_csv consumes the first line of the response as a
        # header, and the column names are then overwritten. If the BioMart
        # response has no header line, the first record is lost here --
        # confirm against the query settings used by bioservices.
        df = pd.read_csv(io.StringIO(results), sep="\t")
        df.columns = attributes
        return df

    def save(self, df, filename=None):
        """Save the output of :meth:`~query` into a CSV file.

        :param df: dataframe to save; it is written without its index.
        :param filename: output file name. If ``None``, the name is built
            from the dataset name and the local year, month and day.
        """
        if filename is None:
            date = time.localtime()
            filename = "biomart_{}_{}_{}_{}.csv".format(self.dataset, date.tm_year, date.tm_mon, date.tm_mday)
        logger.info("Saving into {}".format(filename))
        df.to_csv(filename, index=False)