Source code for sequana.checkm

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
import colorlog

from sequana.lazy import pandas as pd

logger = colorlog.getLogger(__name__)


__all__ = ["CheckM", "MultiCheckM"]


class CheckM:
    """Read the result row of a CheckM summary table for one sample.

    The file is expected to hold a fixed-width text table in which the
    values of interest sit on the fourth line (index 3); the lines before
    it are ignored. NOTE(review): this matches the usual CheckM ``qa``
    layout (separator, column titles, separator, values) -- confirm against
    the CheckM version in use.

    The parsed row is stored in :attr:`df`, a :class:`pandas.Series` indexed
    by the column names listed in ``header`` below.

    :param filename: path to the CheckM output file.
    :raises IndexError: if the file has fewer than four lines.
    :raises ValueError: if the fourth line holds fewer fields than expected.
    """

    def __init__(self, filename):
        self.filename = filename

        # The format is not CSV or TSV, but a complex structure with different spaces...
        # The header can be just written here.
        header = [
            "sample",
            "marker_lineage",
            "#genomes",
            "#markers",
            "#marker_sets",
            "0",
            "1",
            "2",
            "3",
            "4",
            "5+",
            "Completeness",
            "Contamination",
            "Strain heterogeneity",
        ]

        with open(self.filename, "r") as fin:
            lines = fin.read().split("\n")

        try:
            row = lines[3]
        except IndexError:
            raise IndexError(
                f"{self.filename}: expected at least 4 lines, found {len(lines)}"
            ) from None

        tokens = row.split()
        if len(tokens) < len(header):
            raise ValueError(
                f"{self.filename}: expected at least {len(header)} fields "
                f"on line 4, found {len(tokens)}"
            )

        # Everything after the two leading labels is numeric and therefore
        # free of spaces, so those columns are taken from the right-hand end.
        # Whatever remains between the sample name and the numeric columns is
        # the marker lineage, which may itself contain a space
        # (e.g. "o__Lactobacillales (UID544)").
        n_numeric = len(header) - 2
        sample = tokens[0]
        lineage = " ".join(tokens[1:-n_numeric])
        numbers = [self._to_number(token) for token in tokens[-n_numeric:]]

        # The two labels are kept as text on purpose: a sample called "007"
        # must not be turned into a number.
        self.df = pd.Series([sample, lineage, *numbers], index=header)

    @staticmethod
    def _to_number(text):
        """Return *text* as an int, else a float, else unchanged."""
        # int must be tried first: float() accepts every integer literal, so
        # the reverse order would never yield an int.
        for cast in (int, float):
            try:
                return cast(text)
            except ValueError:
                continue
        return text
class MultiCheckM:
    """Gather several CheckM result files into a single table.

    Each file is parsed with :class:`CheckM`; the resulting series are
    concatenated side by side (``axis=1``) into :attr:`df`, one column per
    file that could be parsed.

    A file whose parsing raises any exception is skipped, and a warning
    naming it is logged.

    :param filenames: iterable of paths to CheckM output files.
    """

    def __init__(self, filenames):
        parsed = []
        for path in filenames:
            try:
                series = CheckM(path).df
            except Exception:
                # Best effort: one unreadable file must not abort the batch.
                logger.warning(f"Skipped {path}")
            else:
                parsed.append(series)

        self.df = pd.concat(parsed, axis=1)