Source code for sequana.viz.clusterisation

# -*- coding: utf-8 -*-
#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2020 - Sequana Development Team
#
#  File author(s):
#      Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################

import colorlog

from sequana.lazy import numpy as np
from sequana.lazy import pylab

logger = colorlog.getLogger(__name__)


__all__ = ["Cluster"]


class Cluster:
    """Common base for plots that project samples onto a few components.

    Input must be a matrix in the form of a pandas DataFrame. Each column is a
    sample and the sample names are the columns' names.

    Colors are set to red for all samples, but the user can provide a mapping
    from columns' names to colors::

        c = Cluster(data, colors={"A": "r", "B": "g"})

    """

    def __init__(self, data, colors=None):
        """.. rubric:: constructor

        :param data: a dataframe; each column being a sample.
        :param colors: an optional mapping of column/sample name to a color.
            Samples that are absent from the mapping stay red.
        """
        self.df = data
        self.labels = data.columns

        # Red for everybody first, then the user-provided overrides.
        self.colors = {label: "r" for label in self.labels}
        if colors:
            self.colors.update(colors)

        # Kept local to the constructor, as in the original code, so that
        # importing this module does not import sklearn.
        from sklearn.preprocessing import StandardScaler

        self.scaler = StandardScaler()

    def scale_data(self, transform_method="log"):
        """Return a transformed version of the input matrix.

        Missing values are first replaced with 0 on a copy of the input; that
        NaN-free, *untransformed* copy is stored in the :attr:`data`
        attribute. The copy is then transformed as follows:

        - ``log``: zeros are replaced with 1 (avoid log issue), then log10
        - ``anscombe`` or ``vst``: ``sequana.vst.VST.anscombe`` is applied
        - ``standard``: the :attr:`scaler` attribute (standard scaler by
          default) is fitted and applied
        - ``none``: nothing else is done

        :param transform_method: one of ``log``, ``vst``, ``anscombe``,
            ``none``, ``standard``.
        :return: the transformed data. With ``standard`` this is whatever
            ``self.scaler.fit_transform`` returns rather than a DataFrame
            built here.
        :raises AssertionError: if *transform_method* is not a known method.
        """
        assert transform_method in [
            "log",
            "vst",
            "anscombe",
            "none",
            "standard",
        ], f"Scaling {transform_method} not available. must be log, vst, none, standard, anscombe (same as vst)"

        # In rare cases, with sparse feature count matrix, NA may be included.
        # fillna returns a new object, so self.df itself is left untouched.
        data = self.df.fillna(0)
        self.data = data

        if transform_method == "log":
            # replace() also returns a new object: self.data keeps its zeros
            data = np.log10(data.replace(0, 1))
        elif transform_method in ["anscombe", "vst"]:
            from sequana.vst import VST

            data = VST.anscombe(data)
        elif transform_method == "standard":
            data = self.scaler.fit_transform(data)

        return data

    def _plot(self, Xr, pca=None, pc1=0, pc2=1, colors=None, show_labels=True, fontsize=10):
        """Scatter plot of two columns of a projected matrix.

        :param Xr: 2D array indexed as ``Xr[:, component]``; rows are expected
            to follow the order of :attr:`labels`.
        :param pca: optional fitted object exposing
            ``explained_variance_ratio_``; when given, the axis labels show
            the percentage of variance of each plotted component.
        :param pc1: index of the component shown on the x-axis.
        :param pc2: index of the component shown on the y-axis.
        :param colors: optional mapping of sample name to color. Samples that
            are missing from it are drawn in red (a warning is logged). If
            not provided, the colors given to the constructor are used.
        :param show_labels: annotate the points with the sample names.
        :param fontsize: font size of the annotations; 0 disables them.
        """
        if colors is None:
            colors = [self.colors[k] for k in self.labels]
            if len(colors) != len(Xr):
                # cannot map the labels onto the rows: everything in red
                colors = ["r"] * len(Xr)
        else:
            # work on a copy so that the caller's mapping is not modified
            colors = dict(colors)
            for k in self.labels:
                if k not in colors:
                    logger.warning("No key color for this sample: %s. Set to red", k)
                    colors[k] = "r"
            colors = [colors[k] for k in self.labels]

        pylab.scatter(Xr[:, pc1], Xr[:, pc2], c=colors)
        ax = pylab.gca()

        # Add a 5% margin on each side. The margin is based on the axis range
        # so that the view is enlarged whatever the sign of the bounds.
        X1, X2 = pylab.xlim()
        dX = X2 - X1
        pylab.xlim([X1 - 0.05 * dX, X2 + 0.05 * dX])

        Y1, Y2 = pylab.ylim()
        dY = Y2 - Y1
        pylab.ylim([Y1 - 0.05 * dY, Y2 + 0.05 * dY])

        if fontsize == 0:
            show_labels = False

        if show_labels:
            # labels are slightly shifted so as not to sit on the markers
            shift_x = dX / 40
            shift_y = dY / 40
            for count, (x, y) in enumerate(zip(Xr[:, pc1], Xr[:, pc2])):
                # no more than 101 annotations
                if count > 100:
                    break
                ax.annotate(
                    self.labels[count],
                    (x + shift_x, y + shift_y),
                    color=colors[count],
                    fontsize=fontsize,
                )

        if pca:
            ratios = pca.explained_variance_ratio_
            pylab.xlabel(f"PC{pc1 + 1} ({round(ratios[pc1] * 100, 2)}%)", fontsize=12)
            pylab.ylabel(f"PC{pc2 + 1} ({round(ratios[pc2] * 100, 2)}%)", fontsize=12)
        pylab.grid(True)