Source code for sequana.salmon

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################

import colorlog
import pandas as pd

from sequana.gff3 import GFF3

logger = colorlog.getLogger(__name__)



[docs]
class Salmon:
    """Factory to read counts from salmon and create feature counts usable for deseq2"""

    def __init__(self, filename, gff_input, attribute="transcript_id"):
        self.filename = filename
        df = pd.read_csv(filename, sep="\t")

        self.df = df

        logger.info("Initialisation the GFF")
        # handling the GFF input file
        # The input gff may already be an instance off GFF
        if isinstance(gff_input, GFF3):
            self.gff = gff_input
        else:
            self.gff = GFF3(gff_input)
        # create the dataframe once for all
        self.gff.df

        # Of course, there is one gene by transcript, several transcript per gene
        # trs2genes is a dict with one transcript as key and one gene values
        # genes2trs is a dict with one gene sas key and a list of transcripts


[docs]
    def get_feature_counts(self, feature=None, attribute=None):
        if self.df.Name.iloc[0].startswith("transcript:"):
            logger.info("salmon results start with transcript: tag. Eukaryotes mode")
            logger.info("Identifying the mapping transcript vs genes")
            self.trs2genes, self.genes2trs = self.gff.transcript_to_gene_mapping(attribute="transcript_id")
            results = self.get_feature_counts_eukaryotes(feature, attribute)
        else:
            logger.info("salmon results not starting with transcript. Prokaryotes mode")
            results = self.get_feature_counts_prokaryotes(feature, attribute)
        return results



[docs]
    def get_feature_counts_eukaryotes(self, feature=None, attribute=None):
        if feature is None:
            feature = "gene"

        if attribute is None:
            attribute = "ID"

        # just to not loose the original
        df = self.df.copy()

        # Name contains the salmon entries read from gffread that uses
        # transcript_id. From this transcript id, we get the gene (parent)
        df["Gene"] = [self.trs2genes[x] for x in self.df.Name]

        # groups = df.groupby('Gene').groups
        counts_on_genes = df.groupby("Gene").NumReads.sum()

        ff = self.filename.split("/")[-1]
        results = f"\nGeneid\tChr\tStart\tEnd\tStrand\tLength\t{ff}"

        # mouse 25814 gene (feature)
        #       53715 gene_id (attribute)
        #      135181 transcript_id (attribute)
        #      133618 transcript_id from salmon
        #      135181 entries in transcript fasta (gffread)

        # gffread extact transcript_id from the gff if present
        # otherwise, extract geneID or gene_id
        logger.info("Recreating the feature counts")

        genes = {}

        dd = self.gff.df.query("ID in @counts_on_genes.index")
        dd = dd.set_index("ID")
        dd = dd.loc[counts_on_genes.index]
        self.dd = dd

        types = dd["type"].values
        starts = dd["start"].values
        stops = dd["stop"].values
        strands = dd["strand"].values
        seqids = dd["seqid"].values

        S = 0

        logger.info("Grouping")
        TPMgroup = df.groupby("Gene").apply(lambda group: group["TPM"].sum())
        efflength_null = df.groupby("Gene").apply(lambda group: group["EffectiveLength"].mean())

        groups = df.groupby("Gene")
        for i, name in tqdm.tqdm(enumerate(counts_on_genes.index)):
            # Since we use ID, there should be only one hit. we select the first
            # one to convert to a Series

            tpm_sum = TPMgroup.loc[name]
            if tpm_sum == 0:
                length = efflength_null.loc[name]
            else:
                abundances = groups.get_group(name).TPM
                efflength = groups.get_group(name).EffectiveLength
                length = sum([x * y for x, y in zip(abundances, efflength)]) / abundances.sum()
                S += abundances.sum()

            # FIXME we keep only types 'gene' to agree with output of
            # start/bowtie when working on the gene feature. What would happen
            # to compare salmon wit other type of features ?
            if types[i] == "gene":
                start = starts[i]
                stop = stops[i]
                seqid = seqids[i]
                strand = strands[i]
                NumReads = counts_on_genes.loc[name]
                length = length
                name = name.replace("gene:", "")
                results += f"\n{name}\t{seqid}\t{start}\t{stop}\t{strand}\t{length}\t{NumReads}"
            else:
                pass
        return results
        """

In [179]: genes2trs['gene:ENSMUSG00000000028']
Out[179]:
['transcript:ENSMUST00000000028',
 'transcript:ENSMUST00000096990',
 'transcript:ENSMUST00000115585']

length = 52660
sum(counts) = 24116676   OKAY if we include all features
sum(length) = 66698347   difference 66699167 of 820.1839 ???
sum(abundance) = 1e6

>head(counts)
ene:ENSMUSG00000000001 3266.000
gene:ENSMUSG00000000003    0.000
gene:ENSMUSG00000000028   68.000
gene:ENSMUSG00000000031 1113.000
gene:ENSMUSG00000000037  113.999
gene:ENSMUSG00000000049    1.000
> head(txi$length)   --> effective length average
                            [,1]
gene:ENSMUSG00000000001 3012.000
gene:ENSMUSG00000000003  549.500
gene:ENSMUSG00000000028 1541.881
gene:ENSMUSG00000000031 1131.651
gene:ENSMUSG00000000037 3297.491
gene:ENSMUSG00000000049  940.000
> head(txi$abundance)
                            [,1]
gene:ENSMUSG00000000001 8.148651
gene:ENSMUSG00000000003 0.000000
gene:ENSMUSG00000000028 0.331423
gene:ENSMUSG00000000031 7.391070
gene:ENSMUSG00000000037 0.259804
gene:ENSMUSG00000000049 0.007995



salmon:
Name	Length	EffectiveLength	TPM	NumReads
transcript:ENSMUST00000162897	4153	3903.000	0.044001	22.853
transcript:ENSMUST00000159265	2989	2739.000	0.000000	0.000
transcript:ENSMUST00000070533	3634	3384.000	0.064728	29.147

star:
    Geneid  Chr Start   End Strand  Length  PyMT-DTR_Replicate1_S1/mark_duplicates/PyMT-DTR_Replicate1_S1.bam
    ENSMUSG00000051951  1   3205901 3671498 -   465598  265
"""



[docs]
    def get_feature_counts_prokaryotes(self, feature=None, attribute=None):
        if feature is None:
            feature = "gene"
        if attribute is None:
            attribute = "ID"

        annot = self.gff.df
        annot = annot.query("genetic_type==@feature").copy()
        names = [x[attribute] for x in annot.attributes]
        identifiers = [x[attribute] for x in annot.attributes]

        annot["identifiers"] = identifiers
        annot["names"] = names
        annot = annot.set_index("identifiers")

        ff = self.filename.split("/")[-1]
        results = f"Geneid\tChr\tStart\tEnd\tStrand\tLength\t{ff}"

        for name, length in zip(self.df.Name, self.df.Length):
            try:
                dd = annot.loc[name]
            except:
                continue
            if isinstance(dd.seqid, str):
                length2 = dd.stop - dd.start + 1
                seqid = dd.seqid
                stops = dd.stop
                starts = dd.start
                strands = dd.strand
                new_name = dd["names"]
            else:
                seqid = ";".join(dd.seqid.values)
                starts = ";".join([str(x) for x in dd.start.values])
                stops = ";".join([str(x) for x in dd.stop.values])
                strands = ";".join(dd.strand.values)
                length2 = (dd.stop - dd.start).sum() + len(dd.stop)
                new_name = dd["names"].values[0]

            if abs(length - length2) > 5:
                logger.warning(f"GFF and salmon length for for {name} are quite different: {length} and {length2}")
                # raise ValueError("length in gff and quant not the same")
            NumReads = int(self.df.query("Name==@name")["NumReads"].values[0])
            if name.startswith("gene"):
                results += f"\n{name}\t{seqid}\t{starts}\t{stops}\t{strands}\t{length}\t{NumReads}"
        return results



[docs]
    def save_feature_counts(self, filename, feature="gene", attribute="ID"):
        from sequana import version

        data = self.get_feature_counts(feature=feature, attribute=attribute)
        with open(filename, "w") as fout:
            fout.write(
                f"# Program:sequana.salmon v{version}; sequana "
                + f"salmon -i {self.filename} -o {filename} -g {self.gff.filename}\n"
            )
            fout.write(data)