Source code for sequana.bed

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
import colorlog

logger = colorlog.getLogger(__name__)


__all__ = ["BED"]


[docs]class BED(object): """a structure to read and manipulate BED files (12-column file) columns are defined as chromosome name, start and end, gene_name, score, strand, CDS start and end, blcok count, block sizes, block starts: """ def __init__(self, filename): self.filename = filename def _get_line(self, line): try: if line.startswith("#") or line.startswith("track") or line.startswith("browser"): return {} else: fields = line.rstrip("\r\n").split() chrom_start = int(fields[1]) return { "chrom": fields[0], "start": chrom_start, "end": int(fields[2]), "gene_name": fields[3], "score": fields[4], "strand": fields[5], "cds_start": int(fields[6]), "cds_end": int(fields[7]), "block_count": int(fields[9]), "block_sizes": [int(x) for x in fields[10].strip(",").split(",")], "block_starts": [int(x) + chrom_start for x in fields[11].strip(",").split(",")], } except Exception as err: print(err) print("Input bed must be 12-column] skipped line {}".format(line)) return {} def __len__(self): count = 0 with open(self.filename, "r") as fin: for line in fin: if line.startswith(("#", "track", "browser")): continue count += 1 return count
[docs] def get_exons(self): """Extract exon regions from input BED file. Uses the first (chromosome name), second (chromosome start), 11th and 12th columns (exon start and size) of a 12-columns BED file. :: from sequana import sequana_data from sequana import BED b = BED(sequana_data("hg38_chr18.bed")) b.get_exons() """ exons = [] with open(self.filename, "r") as fin: for line in fin: if line.startswith("#"): continue if line.startswith("track"): continue if line.startswith("browser"): continue fields = line.rstrip("\r\n").split() assert len(fields) == 12 chrom_start = int(fields[1]) chrom_name = fields[0] block_sizes = (int(x) for x in fields[10].strip(",").split(",")) block_starts = (int(x) + chrom_start for x in fields[11].strip(",").split(",")) for start, size in zip(block_starts, block_sizes): exons.append((chrom_name, start, start + size)) return exons
[docs] def get_transcript_ranges(self): """Extract transcript from input BED file.""" with open(self.filename, "r") as fin: for line in fin: if line.startswith("#"): continue if line.startswith("track"): continue if line.startswith("browser"): continue fields = line.rstrip("\r\n").split() assert len(fields) == 12 chrom = fields[0] chrom_start = int(fields[1]) chrom_end = int(fields[2]) strand = fields[5] gene_name = fields[3] yield ( [ chrom, chrom_start, chrom_end, strand, "{}:{}:{}-{}".format(gene_name, chrom, chrom_start, chrom_end), ] )
[docs] def get_CDS_exons(self): """Extract CDS from input BED file.""" results = [] with open(self.filename, "r") as fin: for line in fin: row = self._get_line(line) if row: cds_start = row["cds_start"] cds_end = row["cds_end"] for start, size in zip(row["block_starts"], row["block_sizes"]): if (start + size) < cds_start: continue if start > cds_end: continue exon_start = max(start, cds_start) exon_end = min(start + size, cds_end) results.append([row["chrom"], exon_start, exon_end]) return results