Source code for sequana.viz.dendogram

# -*- coding: utf-8 -*-
#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2020 - Sequana Development Team
#
#  File author(s):
#      Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Heatmap and dendograms"""
import colormap

from sequana.lazy import numpy as np
from sequana.lazy import pylab
from sequana.viz.linkage import Linkage

__all__ = ["Dendogram"]


[docs] class Dendogram(Linkage): """dendograms of an input matrix .. plot:: :include-source: :width: 80% from sequana.viz import heatmap, dendogram df = heatmap.get_heatmap_df() h = dendogram.Dendogram(df) h.plot() You should scale the data before:: from sequana.viz.clusterisation import Clusterisation scaled, index = Clusterisation(data).scale_data() import pandas as pd df = pd.DataFrame(scaled) df.index = index df.columns = data.columns g = Dendogram(df.T) g.plot() """ def __init__( self, data=None, method="complete", metric="euclidean", cmap="yellow_black_blue", col_side_colors=None, side_colors=None, verbose=True, horizontal=True, ): """.. rubric:: constructor :param data: a dataframe or possibly a numpy matrix. :param method: complete by default :param metric: euclidean by default :param cmap: colormap. any matplotlib accepted or combo of colors as defined in colormap package (pypi) :param col_side_colors: :param side_colors: """ # should be a copy since it may be reshuffled ? try: if data is None and verbose is True: print("No data provided, please fill the `df` attribute manually") elif data is None: pass else: self._df = data.copy() except AttributeError as err: print("input must be a pandas data frame or numpy matrix") raise (err) self._method = method self._metric = metric self.horizontal = True # some default parameters self.cluster_criterion = "distance" class Params: pass self.params = Params() self.params.side_colors = ["r", "g", "b", "y", "w", "k", "m"] self.params.cmap = cmap self.category = {} if side_colors: self.params.side_colors = side_colors def _get_df(self): return self._df def _set_df(self, data): self._df = data.copy() df = property(_get_df, _set_df) frame = property(_get_df, _set_df) def _get_method(self): return self._method def _set_method(self, value): self.check_method(value) self._method = value method = property(_get_method, _set_method) def _get_metric(self): return self._metric def _set_metric(self, value): self.check_metric(value) self._metric = value metric = property(_get_metric, _set_metric)
[docs] def plot(self, num=1, cmap=None, colorbar=True, figsize=(12, 8), fontsize=None): """Render the dendogram of the input matrix. Using as input:: df = pd.DataFrame({'A':[1,0,1,1], 'B':[.9,0.1,.6,1], 'C':[.5,.2,0,1], 'D':[.5,.2,0,1]}) .. plot:: :include-source: :width: 80% from sequana.viz import heatmap, dendogram df = heatmap.get_heatmap_df() d = dendogram.Dendogram(df) d.plot() """ import matplotlib import scipy.cluster.hierarchy as hierarchy # save all parameters in a dict layout = {} if cmap is None: cmap = self.params.cmap try: cmap = colormap.cmap_builder(cmap) except: pass # keep track of row and column names for later. header = self.frame.index # FIXME something clever for the fontsize if len(header) > 100 or len(header) > 100: matplotlib.rcParams["font.size"] = 6 if len(header) > 50 or len(header) > 50: matplotlib.rcParams["font.size"] = 7 if len(header) > 30 or len(header) > 30: matplotlib.rcParams["font.size"] = 8 else: matplotlib.rcParams["font.size"] = 12 if fontsize: matplotlib.rcParams["font.size"] = fontsize # scaling min/max range # Scale the figure window size # fig = pylab.figure(num=num, figsize=figsize, layout="tight") fig.clf() Y = self.linkage(self.frame, self.method, self.metric) Z = hierarchy.dendrogram( Y, orientation="right", color_threshold=0, above_threshold_color="k", distance_sort="descending" ) ind1 = hierarchy.fcluster(Y, 0.7 * max(Y[:, 2]), self.cluster_criterion) # apply the clustering for the array-dendrograms to the actual matrix data idx1 = Z["leaves"] # Rearrange the data frame in the order of the dendogram self.frame = self.frame.iloc[idx1, :] ticks = pylab.yticks()[0] pylab.yticks(ticks, self.frame.index) # reorder the flat cluster to match the order of the leaves the dendrogram ind1 = ind1[idx1] if self.category: gca = pylab.gca() X, Y = gca.get_position().get_points() f = pylab.gcf() ax = f.add_axes([X[0], X[1], 0.02, Y[1] - X[1]]) category = [self.category[x] for x in self.df.index] dr = np.array(category, dtype=int) dr.shape = (len(category), 1) cmap_r = matplotlib.colors.ListedColormap(self.params.side_colors) ax.matshow(dr, aspect="auto", origin="lower", cmap=cmap_r) ax.set_xticks([]) ax.set_yticks([])