Source code for sequana.misc

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2022 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################
""".. rubric:: misc utilities"""
import asyncio
from pathlib import Path

import aiohttp
from tqdm.asyncio import tqdm

import colorlog

from sequana.lazy import numpy as np

logger = colorlog.getLogger(__name__)


__all__ = ["textwrap", "wget", "download", "findpos", "normpdf", "multiple_downloads"]


def normpdf(x, mu, sigma):
    """Return the normal pdf evaluated at *x*; args provide *mu*, *sigma*

    .. note:: same as scipy.stats.norm but implemented here to avoid a scipy dependency
    """
    return 1.0 / (np.sqrt(2 * np.pi) * sigma) * np.exp(-0.5 * (1.0 / sigma * (x - mu)) ** 2)
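
# Usage sketch (illustrative, not part of the module): the peak of the
# standard normal density equals 1/sqrt(2*pi).
#
#     >>> float(normpdf(0, mu=0, sigma=1))
#     0.3989422804014327
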
def textwrap(text, width=80, indent=0):
    """Wrap a string into lines of a given width

    :param text: input text
    :param width: maximum line width (80 characters by default)
    :param indent: possible indentation (0 by default)
    """
    if indent == 0:
        indent = ""
    else:
        indent = " " * indent
    data = [indent + text[i * width : (i + 1) * width] for i in range(len(text) // width + 1)]
    return "\n".join(data)
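
# Usage sketch (illustrative): wrap a 10-character string into 4-character
# lines with a 2-space indent.
#
#     >>> textwrap("ACGTACGTAC", width=4, indent=2)
#     '  ACGT\n  ACGT\n  AC'
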
def wget(link, output):
    """Retrieve a file from the internet.

    :param str link: a valid URL
    :param str output: the output filename

    .. warning:: no sanity check of any kind for now
    """
    try:
        from urllib import urlretrieve
    except ImportError:
        from urllib.request import urlretrieve
    urlretrieve(link, filename=output)
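
# Usage sketch (illustrative; the URL is a placeholder):
#
#     >>> wget("https://example.com/genome.fa", "genome.fa")  # doctest: +SKIP
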
def findpos(seq, chr):
    """Find position(s) of a substring in a longer string.

    Note that this function is a generator::

        >>> list(findpos("AACCGGAAGGTT", "GG"))
        [4, 8]

    """
    N = len(chr)
    for i, dummy in enumerate(seq):
        if seq[i : i + N] == chr:
            yield i


def download(url, output):
    """Download a file from a given URL using asynchronous HTTP requests.

    :param str url: The URL from which to download the file.
    :param str output: The path and filename where the downloaded file will be saved.

    If the download is interrupted (KeyboardInterrupt) or times out
    (asyncio.TimeoutError), the partially downloaded file is removed and a
    critical message is logged.
    """
    files_to_download = [(url, output, 0)]
    try:
        # try an asynchronous download
        multiple_downloads(files_to_download)
    except (KeyboardInterrupt, asyncio.TimeoutError):  # pragma: no cover
        logger.info("The download was interrupted or the network was too slow. Removing partially downloaded files")
        for values in files_to_download:
            filename = values[1]
            Path(filename).unlink()
        logger.critical(
            "Keep going but your pipeline will probably not be fully executable since images could not be downloaded"
        )
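
# Usage sketch (illustrative; the URL is a placeholder). Unlike wget(), this
# goes through the asynchronous machinery of multiple_downloads() and cleans
# up partial files on interruption:
#
#     >>> download("https://example.com/genome.fa", "genome.fa")  # doctest: +SKIP
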
# copied from sequana_pipetools
def multiple_downloads(files_to_download, timeout=3600):
    async def download(session, url, name, position):
        async with session.get(url, timeout=timeout) as resp:
            with tqdm.wrapattr(
                open(name, "wb"),
                "write",
                miniters=1,
                desc=url.split("/")[-1],
                total=int(resp.headers.get("content-length", 0)),
                position=position,
            ) as fout:
                async for chunk in resp.content.iter_chunked(4096):
                    fout.write(chunk)

    async def download_all(files_to_download):
        """files_to_download is a list of tuples; each tuple contains the URL
        to download, its output name, and a unique position for the progress bar."""
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=10)) as session:
            await asyncio.gather(*(download(session, *data) for data in files_to_download))

    asyncio.run(download_all(files_to_download))
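
# Usage sketch (illustrative; URLs are placeholders). Each tuple holds the
# URL, the output filename, and a unique progress-bar position:
#
#     >>> files = [
#     ...     ("https://example.com/R1.fastq.gz", "R1.fastq.gz", 0),
#     ...     ("https://example.com/R2.fastq.gz", "R2.fastq.gz", 1),
#     ... ]
#     >>> multiple_downloads(files)  # doctest: +SKIP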