Source code for coaster.utils.misc

"""
Miscellaneous utilities
-----------------------
"""

from base64 import urlsafe_b64decode, urlsafe_b64encode
from datetime import datetime
from email.header import decode_header
from functools import wraps
from random import SystemRandom
from urllib.parse import urlparse
import collections.abc as abc
import email.utils
import hashlib
import re
import time
import uuid

from unidecode import unidecode
import base58
import tldextract

# Public API of this module, listed alphabetically. The legacy aliases `buid`,
# `buid2uuid` and `uuid2buid` are exported; the deprecated alias `nullunicode`
# is defined further down but is not listed here.
__all__ = [
    'base_domain_matches',
    'buid',
    'buid2uuid',
    'domain_namespace_match',
    'format_currency',
    'get_email_domain',
    'getbool',
    'is_collection',
    'make_name',
    'md5sum',
    'namespace_from_url',
    'nary_op',
    'newpin',
    'newsecret',
    'nullint',
    'nullstr',
    'require_one_of',
    'unicode_http_header',
    'uuid1mc',
    'uuid1mc_from_datetime',
    'uuid2buid',
    'uuid_b58',
    'uuid_b64',
    'uuid_from_base58',
    'uuid_from_base64',
    'uuid_to_base58',
    'uuid_to_base64',
    'valid_username',
]

# --- Common delimiters and punctuation ---------------------------------------

# Quote-like characters (ASCII quotes, backtick, typographic quotes and primes).
# make_name deletes these from within words instead of splitting on them.
_strip_re = re.compile('[\'"`‘’“”′″‴]+')
# Runs of control characters (\x00-\x1f), spaces and punctuation. make_name
# splits text into words on these runs.
_punctuation_re = re.compile(
    '[\x00-\x1f +!#$%&()*\\-/<=>?@\\[\\\\\\]^_{|}:;,.…‒–—―«»]+'
)
# Lowercase ASCII letters and digits, with hyphens permitted only between them
# (never as the first or last character). Used by valid_username.
_username_valid_re = re.compile('^[a-z0-9]([a-z0-9-]*[a-z0-9])?$')
# Dotted-quad IPv4 address with each octet limited to 0-255. Used by
# namespace_from_url to reject URLs whose host is an IP address.
_ipv4_re = re.compile(
    r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}'
    r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
)


# --- Utilities ---------------------------------------------------------------


def is_collection(item):
    """
    Tell whether ``item`` is a list-like or set-like container.

    Anything that is a :class:`~collections.abc.Set` or
    :class:`~collections.abc.Sequence` qualifies, except for ``str`` and
    ``bytes``, which are sequences but are reported as non-collections.
    Mappings are neither, so a dict is rejected while its key view is accepted.

    >>> is_collection(0)
    False
    >>> is_collection(0.1)
    False
    >>> is_collection('')
    False
    >>> is_collection(b'')
    False
    >>> is_collection({})
    False
    >>> is_collection({}.keys())
    True
    >>> is_collection([])
    True
    >>> is_collection(())
    True
    >>> is_collection(set())
    True
    >>> is_collection(frozenset())
    True
    >>> from coaster.utils import InspectableSet
    >>> is_collection(InspectableSet({1, 2}))
    True
    """
    # Text and byte strings are sequences, but are never treated as collections
    if isinstance(item, (str, bytes)):
        return False
    return isinstance(item, (abc.Set, abc.Sequence))


def uuid_b64():
    """
    Generate a new random id: a UUID4 rendered in URL-safe Base64.

    The 16 bytes of the UUID encode to 24 Base64 characters ending in ``==``
    padding. The padding is removed, so the result is always exactly 22
    characters long. See
    http://en.wikipedia.org/wiki/Base64#Variants_summary_table

    >>> len(uuid_b64())
    22
    >>> uuid_b64() == uuid_b64()
    False
    >>> isinstance(uuid_b64(), str)
    True
    >>> buid is uuid_b64
    True
    """
    raw = uuid.uuid4().bytes
    encoded = urlsafe_b64encode(raw).decode()
    return encoded.rstrip('=')


#: Legacy name
buid = uuid_b64


def uuid_b58():
    """
    Generate a new random id: a UUID4 rendered in Base58, returned as a string
    that is 21 or 22 characters long.

    >>> len(uuid_b58()) in (21, 22)
    True
    >>> uuid_b58() == uuid_b58()
    False
    >>> isinstance(uuid_b58(), str)
    True
    """
    random_bytes = uuid.uuid4().bytes
    # b58encode returns bytes; decode to get a text string
    encoded = base58.b58encode(random_bytes)
    return encoded.decode()


def uuid1mc():
    """
    Return a UUID1 with a random multicast MAC id.

    The node is 48 random bits with the multicast bit (the least significant
    bit of the first octet) set, so it cannot collide with a real network
    card's address. It is built explicitly rather than through the private,
    undocumented ``uuid._random_getnode`` helper, which the standard library
    does not promise to keep.

    :return: A version 1 UUID with a random multicast node
    :rtype: uuid.UUID

    >>> isinstance(uuid1mc(), uuid.UUID)
    True
    >>> uuid1mc().version
    1
    >>> (uuid1mc().node >> 40) & 1
    1
    """
    node = SystemRandom().getrandbits(48) | (1 << 40)
    return uuid.uuid1(node=node)


def uuid1mc_from_datetime(dt):
    """
    Return a UUID1 with a random multicast MAC id and with a timestamp
    matching the given datetime object or timestamp value.

    .. warning::
        This function does not consider the timezone, and is not guaranteed to
        return a unique UUID. Use under controlled conditions only.

    :param dt: A :class:`~datetime.datetime` (its wall-clock fields are read as
        local time) or a Unix timestamp in seconds
    :return: A version 1 UUID carrying the given time
    :rtype: uuid.UUID

    >>> dt = datetime.now()
    >>> u1 = uuid1mc()
    >>> u2 = uuid1mc_from_datetime(dt)
    >>> # Both timestamps should be very close to each other but not an exact match
    >>> u1.time > u2.time
    True
    >>> u1.time - u2.time < 5000
    True
    >>> d2 = datetime.fromtimestamp((u2.time - 0x01b21dd213814000) * 100 / 1e9)
    >>> d2 == dt
    True
    >>> u2.version
    1
    """
    # Start from a fresh UUID1 so the clock sequence and random node are kept
    fields = list(uuid1mc().fields)
    if isinstance(dt, datetime):
        timeval = time.mktime(dt.timetuple()) + dt.microsecond / 1e6
    else:
        # Assume we got an actual timestamp
        timeval = dt

    # The following code is borrowed from the UUID module source:
    nanoseconds = int(timeval * 1e9)
    # 0x01b21dd213814000 is the number of 100-ns intervals between the
    # UUID epoch 1582-10-15 00:00:00 and the Unix epoch 1970-01-01 00:00:00.
    timestamp = int(nanoseconds // 100) + 0x01B21DD213814000
    time_low = timestamp & 0xFFFFFFFF
    time_mid = (timestamp >> 32) & 0xFFFF
    time_hi_version = (timestamp >> 48) & 0x0FFF

    fields[0] = time_low
    fields[1] = time_mid
    fields[2] = time_hi_version

    # The mask above clears the four version bits of time_hi_version. Passing
    # version=1 makes the constructor set them again; without it the result
    # would report version 0 instead of being a UUID1.
    return uuid.UUID(fields=tuple(fields), version=1)


def uuid_to_base64(value):
    """
    Render a UUID object as a 22-character URL-safe Base64 string (BUID),
    with the trailing ``=`` padding removed.

    >>> uuid_to_base64(uuid.UUID('33203dd2-f2ef-422f-aeb0-058d6f5f7089'))
    'MyA90vLvQi-usAWNb19wiQ'
    """
    padded = urlsafe_b64encode(value.bytes).decode()
    return padded.rstrip('=')


#: Legacy name
uuid2buid = uuid_to_base64


def uuid_from_base64(value):
    """
    Parse a 22-character URL-safe Base64 string (BUID) back into a UUID object.

    >>> uuid_from_base64('MyA90vLvQi-usAWNb19wiQ')
    UUID('33203dd2-f2ef-422f-aeb0-058d6f5f7089')
    """
    # Restore the two padding characters that the encoder strips off
    padded = str(value) + '=='
    raw = urlsafe_b64decode(padded)
    return uuid.UUID(bytes=raw)


#: Legacy name
buid2uuid = uuid_from_base64


def uuid_to_base58(value):
    """
    Encode a UUID object's bytes in Base58 and return the result as a string.

    >>> uuid_to_base58(uuid.UUID('33203dd2-f2ef-422f-aeb0-058d6f5f7089'))
    '7KAmj837MyuJWUYPwtqAfz'
    >>> # The following UUID to Base58 encoding is from NPM uuid-base58, for comparison
    >>> uuid_to_base58(uuid.UUID('d7ce8475-e77c-43b0-9dde-56b428981999'))
    'TedLUruK7MosG1Z88urTkk'
    """
    # b58encode returns bytes; decode to get a text string
    encoded = base58.b58encode(value.bytes)
    return encoded.decode()


def uuid_from_base58(value):
    """
    Decode a Base58-encoded UUID and return it as a UUID object.

    >>> uuid_from_base58('7KAmj837MyuJWUYPwtqAfz')
    UUID('33203dd2-f2ef-422f-aeb0-058d6f5f7089')
    >>> # The following UUID to Base58 encoding is from NPM uuid-base58, for comparison
    >>> uuid_from_base58('TedLUruK7MosG1Z88urTkk')
    UUID('d7ce8475-e77c-43b0-9dde-56b428981999')
    """
    raw = base58.b58decode(str(value))
    return uuid.UUID(bytes=raw)


def newsecret():
    """
    Make a secret key for non-cryptographic use cases like email account
    verification.

    The key is two independently generated Base58-rendered UUID4s joined
    together, between 42 and 44 characters long. It contains only ASCII
    characters and so will typically not be word-wrapped by email clients.

    >>> len(newsecret()) in (42, 43, 44)
    True
    >>> newsecret() == newsecret()
    False
    """
    first_half = uuid_b58()
    second_half = uuid_b58()
    return first_half + second_half


def newpin(digits=4):
    """
    Return a random numeric string with the specified number of digits,
    default 4. The string is zero-padded on the left, so it may start with
    one or more zeroes.

    :param int digits: Number of digits in the pin, at least 1
    :return: A string of exactly ``digits`` decimal digits
    :rtype: str
    :raises ValueError: If ``digits`` is less than 1

    >>> len(newpin())
    4
    >>> len(newpin(5))
    5
    >>> newpin().isdigit()
    True
    >>> newpin(0)
    Traceback (most recent call last):
        ...
    ValueError: digits must be at least 1
    """
    if digits < 1:
        raise ValueError("digits must be at least 1")
    # randrange excludes its upper bound, so the number always fits in `digits`
    # characters and no retry loop is needed
    randnum = SystemRandom().randrange(10 ** digits)  # NOQA: S311 # nosec
    return str(randnum).zfill(digits)


def make_name(text, delim='-', maxlength=50, checkused=None, counter=2):
    """
    Generate an ASCII name slug. If a checkused filter is provided, it will
    be called with the candidate. If it returns True, make_name will add
    counter numbers starting from 2 until a suitable candidate is found.

    :param string text: Text to turn into a slug
    :param string delim: Delimiter between words, default '-'
    :param int maxlength: Maximum length of name, default 50
    :param checkused: Function to check if a generated name is available for use
    :param int counter: Starting position for name counter
    :return: The generated slug
    :rtype: str

    >>> make_name('This is a title')
    'this-is-a-title'
    >>> make_name('Invalid URL/slug here')
    'invalid-url-slug-here'
    >>> make_name('this.that')
    'this-that'
    >>> make_name('this:that')
    'this-that'
    >>> make_name("How 'bout this?")
    'how-bout-this'
    >>> make_name("How’s that?")
    'hows-that'
    >>> make_name('K & D')
    'k-d'
    >>> make_name('billion+ pageviews')
    'billion-pageviews'
    >>> make_name('हिन्दी slug!')
    'hindii-slug'
    >>> make_name('Talk in español, Kiswahili, 廣州話 and অসমীয়া too.', maxlength=250)
    'talk-in-espanol-kiswahili-guang-zhou-hua-and-asmiiyaa-too'
    >>> make_name('__name__', delim='_')
    'name'
    >>> make_name('how_about_this', delim='_')
    'how_about_this'
    >>> make_name('and-that', delim='_')
    'and_that'
    >>> make_name('Umlauts in Mötörhead')
    'umlauts-in-motorhead'
    >>> make_name('Candidate', checkused=lambda c: c in ['candidate'])
    'candidate2'
    >>> make_name('Candidate', checkused=lambda c: c in ['candidate'], counter=1)
    'candidate1'
    >>> make_name('Candidate',
    ...   checkused=lambda c: c in ['candidate', 'candidate1', 'candidate2'], counter=1)
    'candidate3'
    >>> make_name('Long title, but snipped', maxlength=20)
    'long-title-but-snipp'
    >>> len(make_name('Long title, but snipped', maxlength=20))
    20
    >>> make_name('Long candidate', maxlength=10,
    ...   checkused=lambda c: c in ['long-candi', 'long-cand1'])
    'long-cand2'
    >>> make_name('Lǝnkǝran')
    'lankaran'
    >>> make_name('example@example.com')
    'example-example-com'
    >>> make_name('trailing-delimiter', maxlength=10)
    'trailing-d'
    >>> make_name('trailing-delimiter', maxlength=9)
    'trailing'
    >>> make_name('trailing-delimiter', delim='--', maxlength=10)
    'trailing'
    >>> make_name('No delimiter', delim='')
    'nodelimiter'
    >>> make_name("quotes ' alone")
    'quotes-alone'
    >>> make_name('''test this
    ... newline''')
    'test-this-newline'
    >>> make_name("testing an emoji😁")
    'testing-an-emoji'
    >>> make_name('''testing\\t\\nmore\\r\\nslashes''')
    'testing-more-slashes'
    >>> make_name('What if a HTML <tag/>')
    'what-if-a-html-tag'
    >>> make_name('These are equivalent to \\x01 through \\x1A')
    'these-are-equivalent-to-through'
    >>> make_name("feedback;\\x00")
    'feedback'
    """
    # Literal '@' signs act as word separators
    name = text.replace('@', delim)
    # We don't know why unidecode uses '@' for 'a'-like chars; map those to 'a'
    name = unidecode(name).replace('@', 'a')
    # Remove quote characters from each word first and only then drop empty
    # words, so a word made up entirely of quotes does not leave a doubled
    # delimiter behind
    words = (
        _strip_re.sub('', word) for word in _punctuation_re.split(name.lower())
    )
    name = delim.join(word for word in words if word != '')
    candidate = name[:maxlength]
    # Truncation can leave a dangling delimiter. Remove all of it (it may be
    # longer than one character), and remove nothing when the delimiter is
    # empty, since every string ends with ''.
    if delim and candidate.endswith(delim):
        candidate = candidate[: -len(delim)]
    if checkused is None:
        return candidate
    existing = checkused(candidate)
    while existing:
        # Make room for the counter within maxlength, then try the next number
        candidate = name[: maxlength - len(str(counter))] + str(counter)
        counter += 1
        existing = checkused(candidate)
    return candidate


def format_currency(value, decimals=2):
    """
    Return a number suitably formatted for display as currency, with
    thousands separated by commas and up to ``decimals`` decimal places
    (default two). The decimal part is left out entirely when it is all
    zeroes.

    :param value: Number to format
    :param int decimals: Number of decimal places to round to, default 2
    :return: The formatted number
    :rtype: str

    >>> format_currency(1000)
    '1,000'
    >>> format_currency(100)
    '100'
    >>> format_currency(999.95)
    '999.95'
    >>> format_currency(99.95)
    '99.95'
    >>> format_currency(100000)
    '100,000'
    >>> format_currency(1000.00)
    '1,000'
    >>> format_currency(1000.41)
    '1,000.41'
    >>> format_currency(23.21, decimals=3)
    '23.210'
    >>> format_currency(1000, decimals=3)
    '1,000'
    >>> format_currency(123456789.123456789)
    '123,456,789.12'
    >>> format_currency(-100)
    '-100'
    >>> format_currency(-100000.5)
    '-100,000.50'
    >>> format_currency(1000, decimals=0)
    '1,000'
    """
    # The ',' format option groups thousands and keeps a minus sign attached to
    # the leading group, which manual slicing of the digit string got wrong
    formatted = format(value, ',.{}f'.format(decimals))
    # With decimals=0 there is no '.', so partition leaves `fraction` empty
    whole, _, fraction = formatted.partition('.')
    if fraction.strip('0'):
        return formatted
    return whole


def md5sum(data):
    """
    Return md5sum of data as a 32-character string.

    :param data: Text (hashed as its UTF-8 encoding) or bytes (hashed as is)
    :return: Hexadecimal MD5 digest
    :rtype: str

    >>> md5sum('random text')
    'd9b9bec3f4cc5482e7c5ef43143e563a'
    >>> md5sum(b'random text')
    'd9b9bec3f4cc5482e7c5ef43143e563a'
    >>> len(md5sum('random text'))
    32
    """
    # hashlib only accepts bytes-like input, so text is encoded first. Bytes
    # are passed through unchanged rather than failing on a missing .encode()
    if isinstance(data, str):
        data = data.encode('utf-8')
    return hashlib.md5(data).hexdigest()  # NOQA: S303 # skipcq: PTC-W1003 # nosec


def getbool(value):
    """
    Interpret a value as a boolean, returning None when it is not recognized.

    The value is converted to a string and lowercased, then matched against
    the known spellings of true (``1``, ``t``, ``true``, ``y``, ``yes``) and
    false (``0``, ``f``, ``false``, ``n``, ``no``). Numbers other than 0 and 1
    are therefore unrecognized.

    >>> getbool(True)
    True
    >>> getbool(1)
    True
    >>> getbool('1')
    True
    >>> getbool('t')
    True
    >>> getbool(2)
    >>> getbool(0)
    False
    >>> getbool(False)
    False
    >>> getbool('n')
    False
    """
    spellings = dict.fromkeys(('1', 't', 'true', 'y', 'yes'), True)
    spellings.update(dict.fromkeys(('0', 'f', 'false', 'n', 'no'), False))
    # dict.get returns None for anything that is not a known spelling
    return spellings.get(str(value).lower())


def nullint(value):
    """
    Coerce an optional value to an integer: return None for any falsy value,
    and ``int(value)`` for everything else.

    >>> nullint('10')
    10
    >>> nullint('') is None
    True
    """
    if not value:
        return None
    return int(value)


def nullstr(value):
    """
    Coerce an optional value to a string: return None for any falsy value,
    and ``str(value)`` for everything else.

    >>> nullstr(10) == '10'
    True
    >>> nullstr('') is None
    True
    """
    if not value:
        return None
    return str(value)


nullunicode = nullstr  # XXX: Deprecated name. Remove soon.


def require_one_of(_return=False, **kwargs):
    """
    Validator that raises :exc:`TypeError` unless one and only one parameter is
    not ``None``. Use this inside functions that take multiple parameters, but
    allow only one of them to be specified::

        def my_func(this=None, that=None, other=None):
            # Require one and only one of `this` or `that`
            require_one_of(this=this, that=that)

            # If we need to know which parameter was passed in:
            param, value = require_one_of(True, this=this, that=that)

            # Carry on with function logic
            pass

    :param _return: Return the matching parameter
    :param kwargs: Parameters, of which one and only one is mandatory
    :return: If `_return`, matching parameter name and value
    :rtype: tuple
    :raises TypeError: If the count of parameters that aren't ``None`` is not 1
    """
    # Test against None by identity, never by equality: values may define an
    # __eq__ that answers True for None or returns something that is not a
    # plain boolean, and such values must still count as supplied.
    supplied = [(name, value) for name, value in kwargs.items() if value is not None]

    if not supplied:
        raise TypeError(
            "One of these parameters is required: " + ', '.join(kwargs.keys())
        )
    if len(supplied) != 1:
        raise TypeError(
            "Only one of these parameters is allowed: " + ', '.join(kwargs.keys())
        )

    if _return:
        return supplied[0]
    return None


def unicode_http_header(value):
    r"""
    Decode an RFC 2047 encoded HTTP header value into a unicode string.

    Accepts either text or bytes. Encoded words are decoded using the charset
    they declare; byte chunks without a declared charset are read as
    ISO-8859-1.

    >>> unicode_http_header('=?iso-8859-1?q?p=F6stal?=') == 'p\xf6stal'
    True
    >>> unicode_http_header(b'=?iso-8859-1?q?p=F6stal?=') == 'p\xf6stal'
    True
    >>> unicode_http_header('p\xf6stal') == 'p\xf6stal'
    True
    """
    # email.header.decode_header expects strings, not bytes. Header bytes are
    # almost always ASCII, so decoding without naming a charset should be fine.
    if isinstance(value, bytes):
        value = value.decode()

    pieces = []
    for chunk, charset in decode_header(value):
        if isinstance(chunk, str):
            # Already text, nothing to decode
            pieces.append(chunk)
        else:
            pieces.append(str(chunk, charset or 'iso-8859-1'))
    return ''.join(pieces)


def get_email_domain(emailaddr):
    """
    Extract the domain part of an email address, or return None when the
    string cannot be parsed as an address with both a username and a domain.

    >>> get_email_domain('test@example.com')
    'example.com'
    >>> get_email_domain('test+trailing@example.com')
    'example.com'
    >>> get_email_domain('Example Address <test@example.com>')
    'example.com'
    >>> get_email_domain('foobar')
    >>> get_email_domain('foobar@')
    >>> get_email_domain('@foobar')
    """
    # parseaddr returns (realname, address); only the address matters here
    address = email.utils.parseaddr(emailaddr)[1]
    parts = address.split('@')
    # Exactly one '@' is required
    if len(parts) != 2:
        return None
    username, domain = parts
    if username and domain:
        return domain
    return None


def valid_username(candidate):
    """
    Check if a username is valid: lowercase ASCII letters and digits, with
    hyphens allowed only between them.

    :param str candidate: Username to check
    :return: True if the whole string is a valid username
    :rtype: bool

    >>> valid_username('example person')
    False
    >>> valid_username('example_person')
    False
    >>> valid_username('exampleperson')
    True
    >>> valid_username('example-person')
    True
    >>> valid_username('a')
    True
    >>> (valid_username('a-') or valid_username('ab-') or valid_username('-a') or
    ...   valid_username('-ab'))
    False
    >>> valid_username('exampleperson\\n')
    False
    """
    # fullmatch rather than search: the pattern's '$' also matches just before
    # a trailing newline, so search would accept 'name\n' as a valid username
    return _username_valid_re.fullmatch(candidate) is not None


def namespace_from_url(url):
    """
    Construct a dotted namespace string from a URL by reversing the labels of
    its hostname.

    Returns None when the URL has no hostname, or when the hostname is
    ``localhost``, ``localhost.localdomain`` or an IPv4 address. An empty label
    from a trailing dot in the hostname is dropped, as is a ``www`` label at
    the start of the hostname. The result has the same type as ``url``.

    >>> namespace_from_url('https://www.example.com/path')
    'com.example'
    >>> namespace_from_url('https://funnel.example.com')
    'com.example.funnel'
    >>> namespace_from_url('http://localhost:8000/') is None
    True
    >>> namespace_from_url('http://192.168.1.1/') is None
    True
    """
    hostname = urlparse(url).hostname
    if hostname is None:
        return None
    if hostname in ['localhost', 'localhost.localdomain']:
        return None
    if _ipv4_re.search(hostname):
        return None

    labels = hostname.split('.')[::-1]
    # A trailing dot in the hostname shows up as an empty first label
    if labels and not labels[0]:
        del labels[0]
    if labels and labels[-1] == 'www':
        del labels[-1]
    return type(url)('.'.join(labels))


def base_domain_matches(d1, d2):
    """
    Tell whether two domains share the same base domain, as determined by
    ``tldextract`` using the Public Suffix List. Subdomains are ignored.

    >>> base_domain_matches('https://hasjob.co', 'hasjob.co')
    True
    >>> base_domain_matches('hasgeek.hasjob.co', 'hasjob.co')
    True
    >>> base_domain_matches('hasgeek.com', 'hasjob.co')
    False
    >>> base_domain_matches('static.hasgeek.co.in', 'hasgeek.com')
    False
    >>> base_domain_matches('static.hasgeek.co.in', 'hasgeek.co.in')
    True
    >>> base_domain_matches('example@example.com', 'example.com')
    True
    """
    first = tldextract.extract(d1)
    second = tldextract.extract(d2)
    # Each result holds subdomain, domain and suffix; only the last two count
    return (first.domain, first.suffix) == (second.domain, second.suffix)


def domain_namespace_match(domain, namespace):
    """
    Tell whether a dotted namespace is related to a domain, which is the case
    when the namespace, read in reverse as a domain name, has the same base
    domain.

    >>> domain_namespace_match('hasgeek.com', 'com.hasgeek')
    True
    >>> domain_namespace_match('funnel.hasgeek.com', 'com.hasgeek.funnel')
    True
    >>> domain_namespace_match('app.hasgeek.com', 'com.hasgeek.peopleflow')
    True
    >>> domain_namespace_match('app.hasgeek.in', 'com.hasgeek.peopleflow')
    False
    >>> domain_namespace_match('peopleflow.local', 'local.peopleflow')
    True
    """
    labels = namespace.split(".")
    labels.reverse()
    namespace_domain = ".".join(labels)
    return base_domain_matches(domain, namespace_domain)


def nary_op(f, doc=None):
    """
    Decorator to convert a binary operator into a chained n-ary operator.

    The returned function takes one or more operands and folds them from the
    left: ``f(f(a, b), c)`` and so on. Called with a single operand, it returns
    that operand unchanged.

    :param f: Binary function to chain
    :param doc: Optional docstring for the returned function, replacing the one
        copied from ``f``
    :return: The chained n-ary function
    """

    @wraps(f)
    def chained(lhs, *others):
        result = lhs
        for operand in others:
            result = f(result, operand)
        return result

    if doc is None:
        return chained
    chained.__doc__ = doc
    return chained
Source code for coaster.utils.misc

coaster

Navigation

Related Topics