Source code for sequana.salmon
#
# This file is part of Sequana software
#
# Copyright (c) 2016-2022 - Sequana Development Team
#
# Distributed under the terms of the 3-clause BSD license.
# The full license is in the LICENSE file, distributed with this software.
#
# website: https://github.com/sequana/sequana
# documentation: http://sequana.readthedocs.io
#
##############################################################################
import colorlog
import pandas as pd
from sequana.gff3 import GFF3
logger = colorlog.getLogger(__name__)
[docs]
class Salmon:
"""Factory to read counts from salmon and create feature counts usable for deseq2"""
def __init__(self, filename, gff_input, attribute="transcript_id"):
self.filename = filename
df = pd.read_csv(filename, sep="\t")
self.df = df
logger.info("Initialisation the GFF")
# handling the GFF input file
# The input gff may already be an instance off GFF
if isinstance(gff_input, GFF3):
self.gff = gff_input
else:
self.gff = GFF3(gff_input)
# create the dataframe once for all
self.gff.df
# Of course, there is one gene by transcript, several transcript per gene
# trs2genes is a dict with one transcript as key and one gene values
# genes2trs is a dict with one gene sas key and a list of transcripts
[docs]
def get_feature_counts(self, feature=None, attribute=None):
if self.df.Name.iloc[0].startswith("transcript:"):
logger.info("salmon results start with transcript: tag. Eukaryotes mode")
logger.info("Identifying the mapping transcript vs genes")
self.trs2genes, self.genes2trs = self.gff.transcript_to_gene_mapping(attribute="transcript_id")
results = self.get_feature_counts_eukaryotes(feature, attribute)
else:
logger.info("salmon results not starting with transcript. Prokaryotes mode")
results = self.get_feature_counts_prokaryotes(feature, attribute)
return results
[docs]
def get_feature_counts_eukaryotes(self, feature=None, attribute=None):
if feature is None:
feature = "gene"
if attribute is None:
attribute = "ID"
# just to not loose the original
df = self.df.copy()
# Name contains the salmon entries read from gffread that uses
# transcript_id. From this transcript id, we get the gene (parent)
df["Gene"] = [self.trs2genes[x] for x in self.df.Name]
# groups = df.groupby('Gene').groups
counts_on_genes = df.groupby("Gene").NumReads.sum()
ff = self.filename.split("/")[-1]
results = f"\nGeneid\tChr\tStart\tEnd\tStrand\tLength\t{ff}"
# mouse 25814 gene (feature)
# 53715 gene_id (attribute)
# 135181 transcript_id (attribute)
# 133618 transcript_id from salmon
# 135181 entries in transcript fasta (gffread)
# gffread extact transcript_id from the gff if present
# otherwise, extract geneID or gene_id
logger.info("Recreating the feature counts")
genes = {}
dd = self.gff.df.query("ID in @counts_on_genes.index")
dd = dd.set_index("ID")
dd = dd.loc[counts_on_genes.index]
self.dd = dd
types = dd["type"].values
starts = dd["start"].values
stops = dd["stop"].values
strands = dd["strand"].values
seqids = dd["seqid"].values
S = 0
logger.info("Grouping")
TPMgroup = df.groupby("Gene").apply(lambda group: group["TPM"].sum())
efflength_null = df.groupby("Gene").apply(lambda group: group["EffectiveLength"].mean())
groups = df.groupby("Gene")
for i, name in tqdm.tqdm(enumerate(counts_on_genes.index)):
# Since we use ID, there should be only one hit. we select the first
# one to convert to a Series
tpm_sum = TPMgroup.loc[name]
if tpm_sum == 0:
length = efflength_null.loc[name]
else:
abundances = groups.get_group(name).TPM
efflength = groups.get_group(name).EffectiveLength
length = sum([x * y for x, y in zip(abundances, efflength)]) / abundances.sum()
S += abundances.sum()
# FIXME we keep only types 'gene' to agree with output of
# start/bowtie when working on the gene feature. What would happen
# to compare salmon wit other type of features ?
if types[i] == "gene":
start = starts[i]
stop = stops[i]
seqid = seqids[i]
strand = strands[i]
NumReads = counts_on_genes.loc[name]
length = length
name = name.replace("gene:", "")
results += f"\n{name}\t{seqid}\t{start}\t{stop}\t{strand}\t{length}\t{NumReads}"
else:
pass
return results
"""
In [179]: genes2trs['gene:ENSMUSG00000000028']
Out[179]:
['transcript:ENSMUST00000000028',
'transcript:ENSMUST00000096990',
'transcript:ENSMUST00000115585']
length = 52660
sum(counts) = 24116676 OKAY if we include all features
sum(length) = 66698347 difference 66699167 of 820.1839 ???
sum(abundance) = 1e6
>head(counts)
ene:ENSMUSG00000000001 3266.000
gene:ENSMUSG00000000003 0.000
gene:ENSMUSG00000000028 68.000
gene:ENSMUSG00000000031 1113.000
gene:ENSMUSG00000000037 113.999
gene:ENSMUSG00000000049 1.000
> head(txi$length) --> effective length average
[,1]
gene:ENSMUSG00000000001 3012.000
gene:ENSMUSG00000000003 549.500
gene:ENSMUSG00000000028 1541.881
gene:ENSMUSG00000000031 1131.651
gene:ENSMUSG00000000037 3297.491
gene:ENSMUSG00000000049 940.000
> head(txi$abundance)
[,1]
gene:ENSMUSG00000000001 8.148651
gene:ENSMUSG00000000003 0.000000
gene:ENSMUSG00000000028 0.331423
gene:ENSMUSG00000000031 7.391070
gene:ENSMUSG00000000037 0.259804
gene:ENSMUSG00000000049 0.007995
salmon:
Name Length EffectiveLength TPM NumReads
transcript:ENSMUST00000162897 4153 3903.000 0.044001 22.853
transcript:ENSMUST00000159265 2989 2739.000 0.000000 0.000
transcript:ENSMUST00000070533 3634 3384.000 0.064728 29.147
star:
Geneid Chr Start End Strand Length PyMT-DTR_Replicate1_S1/mark_duplicates/PyMT-DTR_Replicate1_S1.bam
ENSMUSG00000051951 1 3205901 3671498 - 465598 265
"""
[docs]
def get_feature_counts_prokaryotes(self, feature=None, attribute=None):
if feature is None:
feature = "gene"
if attribute is None:
attribute = "ID"
annot = self.gff.df
annot = annot.query("genetic_type==@feature").copy()
names = [x[attribute] for x in annot.attributes]
identifiers = [x[attribute] for x in annot.attributes]
annot["identifiers"] = identifiers
annot["names"] = names
annot = annot.set_index("identifiers")
ff = self.filename.split("/")[-1]
results = f"Geneid\tChr\tStart\tEnd\tStrand\tLength\t{ff}"
for name, length in zip(self.df.Name, self.df.Length):
try:
dd = annot.loc[name]
except:
continue
if isinstance(dd.seqid, str):
length2 = dd.stop - dd.start + 1
seqid = dd.seqid
stops = dd.stop
starts = dd.start
strands = dd.strand
new_name = dd["names"]
else:
seqid = ";".join(dd.seqid.values)
starts = ";".join([str(x) for x in dd.start.values])
stops = ";".join([str(x) for x in dd.stop.values])
strands = ";".join(dd.strand.values)
length2 = (dd.stop - dd.start).sum() + len(dd.stop)
new_name = dd["names"].values[0]
if abs(length - length2) > 5:
logger.warning(f"GFF and salmon length for for {name} are quite different: {length} and {length2}")
# raise ValueError("length in gff and quant not the same")
NumReads = int(self.df.query("Name==@name")["NumReads"].values[0])
if name.startswith("gene"):
results += f"\n{name}\t{seqid}\t{starts}\t{stops}\t{strands}\t{length}\t{NumReads}"
return results
[docs]
def save_feature_counts(self, filename, feature="gene", attribute="ID"):
from sequana import version
data = self.get_feature_counts(feature=feature, attribute=attribute)
with open(filename, "w") as fout:
fout.write(
f"# Program:sequana.salmon v{version}; sequana "
+ f"salmon -i {self.filename} -o {filename} -g {self.gff.filename}\n"
)
fout.write(data)