Source code for coaster.utils.text

"""
Text processing utilities
-------------------------
"""

from functools import partial
from html import unescape
from typing import List, Mapping, Set
import re
import string

from bleach.linkifier import DEFAULT_CALLBACKS, LinkifyFilter
from bleach.sanitizer import Cleaner
from markupsafe import Markup
import html5lib

__all__ = [
    'VALID_TAGS',
    'LINKIFY_SKIP_TAGS',
    'LINKIFY_CALLBACKS',
    'compress_whitespace',
    'deobfuscate_email',
    'normalize_spaces',
    'normalize_spaces_multiline',
    'sanitize_html',
    'simplify_text',
    'text_blocks',
    'ulstrip',
    'unicode_extended_whitespace',
    'urstrip',
    'ustrip',
]


#: Unicode's list of whitespace characters is missing some that were previously
#: classified as whitespace but are now considered format characters. These are
#: invisible and usually arrive via copy-paste, so we include them here as characters to
#: be replaced with spaces and stripped from the ends of text.
unicode_format_whitespace = (
    '\x85'  # NEXT LINE (NEL)
    '\xa0'  # NO-BREAK SPACE (NBSP)
    '\u1680'  # OGHAM SPACE MARK
    '\u180e'  # MONGOLIAN VOWEL SEPARATOR
    '\u2000'  # EN QUAD
    '\u2001'  # EM QUAD
    '\u2002'  # EN SPACE
    '\u2003'  # EM SPACE
    '\u2004'  # THREE-PER-EM SPACE
    '\u2005'  # FOUR-PER-EM SPACE
    '\u2006'  # SIX-PER-EM SPACE
    '\u2007'  # FIGURE SPACE
    '\u2008'  # PUNCTUATION SPACE
    '\u2009'  # THIN SPACE
    '\u200a'  # HAIR SPACE
    '\u200b'  # ZERO WIDTH SPACE (format)
    '\u200c'  # ZERO WIDTH NON-JOINER (format)
    '\u200d'  # ZERO WIDTH JOINER (format)
    '\u2028'  # LINE SEPARATOR
    '\u2029'  # PARAGRAPH SEPARATOR
    '\u202f'  # NARROW NO-BREAK SPACE (NNBSP)
    '\u205f'  # MEDIUM MATHEMATICAL SPACE (MMSP)
    '\u2060'  # WORD JOINER (format)
    '\u3000'  # IDEOGRAPHIC SPACE
    '\ufeff'  # ZERO WIDTH NO-BREAK SPACE (format)
)

unicode_extended_whitespace = (
    '\t\n\x0b\x0c\r\x1c\x1d\x1e\x1f '  # ASCII whitespace
) + unicode_format_whitespace

re_singleline_spaces = re.compile(
    '[' + unicode_extended_whitespace + ']', re.UNICODE | re.MULTILINE
)
re_multiline_spaces = re.compile(
    '[' + unicode_format_whitespace + ']', re.UNICODE | re.MULTILINE
)
re_compress_spaces = re.compile(
    r'[\s' + unicode_format_whitespace + ']+', re.UNICODE | re.MULTILINE
)

VALID_TAGS: Mapping[str, List[str]] = {
    'a': ['href', 'title', 'target', 'rel'],
    'abbr': ['title'],
    'b': [],
    'br': [],
    'blockquote': [],
    'cite': [],
    'code': [],
    'dd': [],
    'del': [],
    'dl': [],
    'dt': [],
    'em': [],
    'h3': [],
    'h4': [],
    'h5': [],
    'h6': [],
    'hr': [],
    'i': [],
    'img': ['src', 'width', 'height', 'align', 'alt'],
    'ins': [],
    'li': [],
    'mark': [],
    'p': [],
    'pre': [],
    'ol': ['start'],
    'strong': [],
    'sup': [],
    'sub': [],
    'ul': [],
}

LINKIFY_SKIP_TAGS: List = ['pre', 'code', 'kbd', 'samp', 'var']


# Adapted from https://bleach.readthedocs.io/en/latest/linkify.html#preventing-links
def dont_linkify_filenames(attrs, new=False):
    # This is an existing link, so leave it be
    if not new:
        return attrs
    # If the TLD is '.py', make sure it starts with http: or https:.
    # Use _text because that's the original text
    link_text = attrs['_text']
    if link_text.endswith('.py') and not link_text.startswith(('http:', 'https:')):
        # This looks like a Python file, not a URL. Don't make a link.
        return None
    # Everything checks out, keep going to the next callback.
    return attrs


LINKIFY_CALLBACKS = list(DEFAULT_CALLBACKS) + [dont_linkify_filenames]


[docs]def sanitize_html(value, valid_tags=None, strip=True, linkify=False):
    """Strip unwanted markup out of HTML."""
    if valid_tags is None:
        valid_tags = VALID_TAGS
    if linkify:
        filters = [
            partial(
                LinkifyFilter, callbacks=LINKIFY_CALLBACKS, skip_tags=LINKIFY_SKIP_TAGS
            )
        ]
    else:
        filters = []
    cleaner = Cleaner(
        tags=list(valid_tags.keys()),
        attributes=valid_tags,
        filters=filters,
        strip=strip,
    )
    return Markup(cleaner.clean(value))


blockish_tags: Set[str] = {
    'address',
    'article',
    'aside',
    'audio',
    'blockquote',
    'canvas',
    'dd',
    'div',
    'dl',
    'dt',
    'fieldset',
    'figcaption',
    'figure',
    'footer',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'header',
    'hgroup',
    'hr',
    'li',
    'noscript',
    'ol',
    'output',
    'p',
    'pre',
    'section',
    'table',
    'td',
    'tfoot',
    'th',
    'tr',
    'ul',
    'video',
}


[docs]def text_blocks(html_text, skip_pre=True):
    """Extracts a list of paragraphs from a given HTML string."""
    doc = html5lib.parseFragment(html_text)
    blocks = []

    def subloop(parent_tag, element, lastchild=False):
        if callable(
            element.tag
        ):  # Comments have a callable tag. TODO: Find out, anything else?
            tag = '<!-->'
            text = ''
            tail = element.tail or ''
        else:
            tag = element.tag.split('}')[
                -1
            ]  # Extract tag from namespace: {http://www.w3.org/1999/xhtml}html
            text = element.text or ''
            tail = element.tail or ''

        if tag == 'pre' and skip_pre:
            text = ''

        if tag in blockish_tags or tag == 'DOCUMENT_FRAGMENT':
            text = text.lstrip()  # Leading whitespace is insignificant in a block tag
            if not len(element):
                text = (
                    text.rstrip()
                )  # No children? Then trailing whitespace is insignificant
            # If there's text, add it.
            # If there's no text but the next element is not a block tag, add a blank
            # anyway (unless it's a pre tag and we want to skip_pre, in which case
            # ignore it again).
            if text:
                blocks.append(text)
            elif (
                len(element)
                and isinstance(element[0].tag, str)
                and element[0].tag.split('}')[-1] not in blockish_tags
                and not (skip_pre and tag == 'pre')
            ):
                blocks.append('')
        else:
            if not blocks:
                if text:
                    blocks.append(text)
            else:
                blocks[-1] += text

        if len(element) > 0 and not (skip_pre and tag == 'pre'):
            for child in element[:-1]:
                subloop(tag, child)
            subloop(tag, element[-1], lastchild=True)

        if tag in blockish_tags:
            tail = (
                tail.lstrip()
            )  # Leading whitespace is insignificant after a block tag
            if tail:
                blocks.append(tail)
        else:
            if parent_tag in blockish_tags and lastchild:
                tail = (
                    tail.rstrip()
                )  # Trailing whitespace is insignificant before a block tag end
            if not blocks:
                if tail:
                    blocks.append(tail)
            else:
                if tag == 'br' and tail:
                    blocks[-1] += '\n' + tail
                else:
                    blocks[-1] += tail

    subloop(None, doc)
    # Replace &nbsp; with ' '
    blocks = [t.replace('\xa0', ' ') for t in blocks]
    return blocks


[docs]def normalize_spaces(text):
    """Replace whitespace characters with regular spaces."""
    return re_singleline_spaces.sub(' ', text)


[docs]def normalize_spaces_multiline(text):
    """
    Replace whitespace characters with regular spaces, in multiline text.

    Line break characters like newlines are not considered whitespace.
    """
    return re_multiline_spaces.sub(' ', text)


[docs]def ulstrip(text):
    """Strip Unicode extended whitespace from the left side of a string."""
    return text.lstrip(unicode_extended_whitespace)


[docs]def urstrip(text):
    """Strip Unicode extended whitespace from the right side of a string."""
    return text.rstrip(unicode_extended_whitespace)


[docs]def ustrip(text):
    """Strip Unicode extended whitespace from a string."""
    return text.strip(unicode_extended_whitespace)


[docs]def compress_whitespace(text):
    """Reduce all space-like characters into single spaces and strip from ends."""
    return ustrip(re_compress_spaces.sub(' ', text))


# Based on http://jasonpriem.org/obfuscation-decoder/
_deobfuscate_dot1_re = re.compile(r'\W+\.\W+|\W+dot\W+|\W+d0t\W+', re.U | re.I)
_deobfuscate_dot2_re = re.compile(r'([a-z0-9])DOT([a-z0-9])')
_deobfuscate_dot3_re = re.compile(r'([A-Z0-9])dot([A-Z0-9])')
_deobfuscate_at1_re = re.compile(r'\W*@\W*|\W+at\W+', re.U | re.I)
_deobfuscate_at2_re = re.compile(r'([a-z0-9])AT([a-z0-9])')
_deobfuscate_at3_re = re.compile(r'([A-Z0-9])at([A-Z0-9])')


[docs]def deobfuscate_email(text):
    """Deobfuscate email addresses in provided text."""
    text = unescape(text)
    # Find the "dot"
    text = _deobfuscate_dot1_re.sub('.', text)
    text = _deobfuscate_dot2_re.sub(r'\1.\2', text)
    text = _deobfuscate_dot3_re.sub(r'\1.\2', text)
    # Find the "at"
    text = _deobfuscate_at1_re.sub('@', text)
    text = _deobfuscate_at2_re.sub(r'\1@\2', text)
    text = _deobfuscate_at3_re.sub(r'\1@\2', text)

    return text


[docs]def simplify_text(text):
    """
    Simplify text to allow comparison.

    >>> simplify_text("Awesome Coder wanted at Awesome Company")
    'awesome coder wanted at awesome company'
    >>> simplify_text("Awesome Coder, wanted  at Awesome Company! ")
    'awesome coder wanted at awesome company'
    >>> simplify_text("Awesome Coder, wanted  at Awesome Company! ") == (
    ...   'awesome coder wanted at awesome company')
    True
    """
    text = text.translate(text.maketrans('', '', string.punctuation)).lower()
    return ' '.join(text.split())
Source code for coaster.utils.text

coaster

Navigation

Related Topics