Source code for sequana.vcftools
#
# This file is part of Sequana software
#
# Copyright (c) 2016-2021 - Sequana Development Team
#
# Distributed under the terms of the 3-clause BSD license.
# The full license is in the LICENSE file, distributed with this software.
#
# website: https://github.com/sequana/sequana
# documentation: http://sequana.readthedocs.io
#
##############################################################################
"""
Python script to filter a VCF file
"""
import colorlog
from sequana.lazy import pylab
from sequana.utils.fisher import fisher_exact
logger = colorlog.getLogger(__name__)
__all__ = [
"strand_ratio",
"compute_frequency",
"compute_strand_balance",
"compute_fisher_strand_filter",
]
[docs]
def strand_ratio(number1, number2):
"""Compute ratio between two number. Return result between [0:0.5]."""
try:
division = float(number1) / (number1 + number2)
if division > 0.5:
division = 1 - division
except ZeroDivisionError:
return 0
return division
[docs]
def compute_frequency(record):
"""Compute frequency of alternate allele.
alt_freq = Count Alternate Allele / Depth
:param record: variant record
"""
try:
info = record.info
except:
info = record.INFO
# For freebayes:
try:
return [float(count) / info["DP"] for count in info["AO"]]
except:
# sniffles
if "VAF" in info:
return [float(info["VAF"])]
else:
return []
[docs]
def compute_fisher_strand_filter(record):
"""Fisher strand filter (FS).
Sites where the numbers of reference/non-reference reads are highly correlated
with the strands of the reads. Counting the number of reference reads on the forward
strand and on the reverse strand, and the number of alternate reads on the forward and
reverse strand should be equivalent. With these four numbers, we
construct a 2 x 2 contingency table and used the P-value from a Fisher’s exact test
to evaluate the correlation.
from sequana import freebayes_vcf_filter
v = freebayes_vcf_filter.VCF_freebayes("data/JB409847.vcf")
r = next(v)
compute_fisher_strand_filter(r)
If the pvalue is less than 0.05 we should reject the variant since the alternate and reference
do not behave in the same way. Typically found if the alternate has a poor strand balance. Used with
care if frequency of alternate or reference is low or deth of coverage is low.
.. note:: fisher terst with two-sided hypothesis"""
try:
info = record.info
except:
info = record.INFO
try:
return [
fisher_exact([[x, y], [info["SRF"], info["SRR"]]], "two-sided") for x, y in zip(info["SAF"], info["SAR"])
]
except KeyError:
return [1]
[docs]
def compute_strand_balance(record):
"""Compute strand balance of alternate allele include in [0,0.5].
strand_bal = Alt Forward / (Alt Forward + Alt Reverse)
:param record: variant record
FYI: in freebayes, the allele balance (reported under AB), strand
bias counts (SRF, SRR, SAF, SAR) and bias estimate (SAP)
can be used as well for filtering. Here, we use the strand balance
computed as SAF / (SAF + SAR)
"""
try:
info = record.info
except:
info = record.INFO
try:
return [strand_ratio(x, y) for x, y in zip(info["SAF"], info["SAR"])]
except KeyError:
return [0.5]