# -*- coding: utf-8 -*-
#
# This file is part of Sequana software
#
# Copyright (c) 2016-2020 - Sequana Development Team
#
# File author(s):
# Thomas Cokelaer <thomas.cokelaer@pasteur.fr>
#
# Distributed under the terms of the 3-clause BSD license.
# The full license is in the LICENSE file, distributed with this software.
#
# website: https://github.com/sequana/sequana
# documentation: http://sequana.readthedocs.io
#
##############################################################################
import colorlog
from sequana.lazy import numpy as np
from sequana.lazy import pylab
# Module-level logger, named after this module so records can be traced back
# to it.
logger = colorlog.getLogger(__name__)
# Public API of this module: only ``Cluster`` is exported by a star import.
__all__ = ["Cluster"]
[docs]
class Cluster:
    """Hold a sample matrix and provide scaling and 2D scatter-plot helpers.

    Input must be a matrix in the form of a pandas DataFrame. Each column is a
    sample and the sample names are the column names. Colors are set to red
    for all samples, but the user can provide a mapping between column names
    and colors.

    ::

        c = Cluster(data, colors={"A": "r", "B": "g"})

    """

    # Transformations accepted by scale_data ("vst" is an alias of "anscombe").
    _TRANSFORM_METHODS = ("log", "vst", "anscombe", "none", "standard")
    # Fraction of the axis range added on each side of the scatter plot.
    _AXIS_MARGIN = 0.05
    # Points whose index is greater than this value are not annotated.
    _MAX_LABEL_INDEX = 100

    def __init__(self, data, colors=None):
        """.. rubric:: constructor

        :param data: a dataframe; each column being a sample.
        :param colors: optional mapping of column/sample name to a color.
            Samples that are not in the mapping are drawn in red (``"r"``).
        """
        self.df = data
        self.labels = data.columns

        # Every sample is red unless the user says otherwise.
        self.colors = {label: "r" for label in self.labels}
        if colors:
            self.colors.update(colors)

        # Imported here rather than at module level so that scikit-learn is
        # only loaded when a Cluster is actually instantiated.
        from sklearn.preprocessing import StandardScaler

        self.scaler = StandardScaler()

    def scale_data(self, transform_method="log"):
        """Return a transformed copy of the input dataframe.

        Missing values are first replaced by zeros; the resulting
        (untransformed) dataframe is stored in the :attr:`data` attribute.
        The input dataframe :attr:`df` is left untouched. Then:

        - ``"log"``: zeros are replaced with 1 (avoid log issue) and the
          log10 of the data is returned;
        - ``"anscombe"`` / ``"vst"``: the data goes through
          ``sequana.vst.VST.anscombe``;
        - ``"standard"``: the data is fitted/transformed by the
          :attr:`scaler` attribute (standard scaler by default);
        - ``"none"``: the data is returned as is.

        :param transform_method: one of log, vst, anscombe, none, standard.
        :return: the transformed data. For ``"log"`` and ``"none"`` this is a
            dataframe; for the other methods it is whatever the underlying
            transformer returns.
        :raises AssertionError: if *transform_method* is not supported.
        """
        # Raised explicitly (instead of using an ``assert`` statement) so that
        # the check is still performed when Python runs with -O, while keeping
        # the exception type existing callers may rely on.
        if transform_method not in self._TRANSFORM_METHODS:
            raise AssertionError(
                f"Scaling {transform_method} not available. must be log, vst, none, standard, anscombe (same as vst)"
            )

        # In rare cases, with sparse feature count matrix, NA may be included.
        # fillna returns a new dataframe, so self.df is not modified.
        data = self.df.fillna(0)
        self.data = data

        if transform_method == "log":
            # log10(0) is undefined: zeros are mapped to 1, hence to 0 after log
            data = np.log10(data.replace(0, 1))
        elif transform_method in ("anscombe", "vst"):
            from sequana.vst import VST

            data = VST.anscombe(data)
        elif transform_method == "standard":
            data = self.scaler.fit_transform(data)
        # "none": nothing to do

        return data

    def _resolve_colors(self, colors, n_points):
        """Return the list of colors to plot, in the order of :attr:`labels`.

        :param colors: ``None`` to use the colors given to the constructor, or
            a mapping of sample name to color. The mapping is never modified.
        :param n_points: number of points that will be plotted.
        """
        if colors is None:
            resolved = [self.colors[label] for label in self.labels]
            # The stored colors do not match the data: fall back to all red.
            if len(resolved) != n_points:
                resolved = ["r"] * n_points
            return resolved

        resolved = []
        for label in self.labels:
            if label not in colors:
                logger.warning("No key color for this sample: %s. Set to red", label)
            # .get() rather than an assignment: the caller's dict stays intact
            resolved.append(colors.get(label, "r"))
        return resolved

    def _plot(self, Xr, pca=None, pc1=0, pc2=1, colors=None, show_labels=True, fontsize=10):
        """Scatter plot of two columns of *Xr* with optional sample labels.

        :param Xr: 2D array indexed as ``Xr[:, i]``; one row per plotted
            point (presumably one row per sample -- confirm against callers).
        :param pca: optional object exposing ``explained_variance_ratio_``;
            when given, axis labels show the explained variance percentages.
        :param pc1: index of the column of *Xr* used for the x-axis.
        :param pc2: index of the column of *Xr* used for the y-axis.
        :param colors: optional mapping of sample name to color. Samples
            missing from the mapping are drawn in red and a warning is logged.
        :param show_labels: annotate the points with the sample names.
        :param fontsize: font size of the annotations; 0 disables them.
        """
        point_colors = self._resolve_colors(colors, len(Xr))
        xs, ys = Xr[:, pc1], Xr[:, pc2]

        pylab.scatter(xs, ys, c=point_colors)
        ax = pylab.gca()

        # Widen both axes by a fraction of their *range*. Scaling the bounds
        # themselves would shrink the view whenever a lower bound is positive
        # or an upper bound is negative.
        x_min, x_max = pylab.xlim()
        x_span = x_max - x_min
        pylab.xlim([x_min - x_span * self._AXIS_MARGIN, x_max + x_span * self._AXIS_MARGIN])

        y_min, y_max = pylab.ylim()
        y_span = y_max - y_min
        pylab.ylim([y_min - y_span * self._AXIS_MARGIN, y_max + y_span * self._AXIS_MARGIN])

        if show_labels and fontsize != 0:
            # Small offset so that the text does not sit on top of the marker
            x_shift, y_shift = x_span / 40, y_span / 40
            annotated = zip(self.labels, xs, ys, point_colors)
            for index, (label, x, y, color) in enumerate(annotated):
                # Keep crowded plots readable: stop after the first labels
                if index > self._MAX_LABEL_INDEX:
                    break
                ax.annotate(label, (x + x_shift, y + y_shift), color=color, fontsize=fontsize)

        if pca is not None:
            ratios = pca.explained_variance_ratio_
            pylab.xlabel(f"PC{pc1 + 1} ({round(ratios[pc1] * 100, 2)}%)", fontsize=12)
            pylab.ylabel(f"PC{pc2 + 1} ({round(ratios[pc2] * 100, 2)}%)", fontsize=12)

        pylab.grid(True)