# Source code for coaster.utils.text

"""
Text processing utilities
-------------------------
"""

from functools import partial
from html import unescape
from typing import List, Mapping, Set
import re
import string

from bleach.linkifier import DEFAULT_CALLBACKS, LinkifyFilter
from bleach.sanitizer import Cleaner
from markupsafe import Markup
import html5lib

# Public API of this module: the names bound by ``from ... import *``.
# Module-level helpers defined in this file but not listed here
# (``unicode_format_whitespace``, ``blockish_tags``, ``dont_linkify_filenames``
# and the compiled regexes) are not part of the star-import surface.
__all__ = [
    'VALID_TAGS',
    'LINKIFY_SKIP_TAGS',
    'LINKIFY_CALLBACKS',
    'compress_whitespace',
    'deobfuscate_email',
    'normalize_spaces',
    'normalize_spaces_multiline',
    'sanitize_html',
    'simplify_text',
    'text_blocks',
    'ulstrip',
    'unicode_extended_whitespace',
    'urstrip',
    'ustrip',
]


#: Non-ASCII space-like characters. Besides the code points Unicode still classifies
#: as whitespace, this includes several invisible "format" characters (marked below)
#: that were once treated as whitespace. They typically sneak in through copy-paste,
#: so the helpers in this module turn them into plain spaces and strip them from the
#: ends of text.
unicode_format_whitespace = (
    '\x85'  # NEXT LINE (NEL)
    '\xa0'  # NO-BREAK SPACE (NBSP)
    '\u1680'  # OGHAM SPACE MARK
    '\u180e'  # MONGOLIAN VOWEL SEPARATOR
    '\u2000'  # EN QUAD
    '\u2001'  # EM QUAD
    '\u2002'  # EN SPACE
    '\u2003'  # EM SPACE
    '\u2004'  # THREE-PER-EM SPACE
    '\u2005'  # FOUR-PER-EM SPACE
    '\u2006'  # SIX-PER-EM SPACE
    '\u2007'  # FIGURE SPACE
    '\u2008'  # PUNCTUATION SPACE
    '\u2009'  # THIN SPACE
    '\u200a'  # HAIR SPACE
    '\u200b'  # ZERO WIDTH SPACE (format)
    '\u200c'  # ZERO WIDTH NON-JOINER (format)
    '\u200d'  # ZERO WIDTH JOINER (format)
    '\u2028'  # LINE SEPARATOR
    '\u2029'  # PARAGRAPH SEPARATOR
    '\u202f'  # NARROW NO-BREAK SPACE (NNBSP)
    '\u205f'  # MEDIUM MATHEMATICAL SPACE (MMSP)
    '\u2060'  # WORD JOINER (format)
    '\u3000'  # IDEOGRAPHIC SPACE
    '\ufeff'  # ZERO WIDTH NO-BREAK SPACE (format)
)

#: ASCII whitespace (tab, line breaks, the FS/GS/RS/US separators and space) followed
#: by every character in :data:`unicode_format_whitespace`.
unicode_extended_whitespace = (
    '\t\n\x0b\x0c\r\x1c\x1d\x1e\x1f ' + unicode_format_whitespace
)

# Matches any single extended-whitespace character, ASCII line breaks included.
re_singleline_spaces = re.compile(
    f'[{unicode_extended_whitespace}]', re.UNICODE | re.MULTILINE
)
# Matches a single non-ASCII space-like character only, so '\n' and friends survive.
re_multiline_spaces = re.compile(
    f'[{unicode_format_whitespace}]', re.UNICODE | re.MULTILINE
)
# Matches a run of regex whitespace (``\s``) and/or the non-ASCII characters above.
re_compress_spaces = re.compile(
    rf'[\s{unicode_format_whitespace}]+', re.UNICODE | re.MULTILINE
)

#: Default allow-list used by :func:`sanitize_html` when no ``valid_tags`` is given.
#: Each key is a tag name handed to bleach's ``Cleaner`` as an allowed tag; the value
#: is the list of attribute names handed to it as allowed for that tag.
VALID_TAGS: Mapping[str, List[str]] = {
    'a': ['href', 'title', 'target', 'rel'],
    'abbr': ['title'],
    'b': [],
    'br': [],
    'blockquote': [],
    'cite': [],
    'code': [],
    'dd': [],
    'del': [],
    'dl': [],
    'dt': [],
    'em': [],
    'h3': [],
    'h4': [],
    'h5': [],
    'h6': [],
    'hr': [],
    'i': [],
    'img': ['src', 'width', 'height', 'align', 'alt'],
    'ins': [],
    'li': [],
    'mark': [],
    'p': [],
    'pre': [],
    'ol': ['start'],
    'strong': [],
    'sup': [],
    'sub': [],
    'ul': [],
}

#: Tag names passed to bleach's ``LinkifyFilter`` as ``skip_tags`` when
#: :func:`sanitize_html` is called with ``linkify=True``.
LINKIFY_SKIP_TAGS: List[str] = ['pre', 'code', 'kbd', 'samp', 'var']


# Adapted from https://bleach.readthedocs.io/en/latest/linkify.html#preventing-links
def dont_linkify_filenames(attrs, new=False):
    """
    Linkify callback that refuses to turn bare ``*.py`` filenames into links.

    :param attrs: Attribute dict for the candidate link; the original text is read
        from its ``'_text'`` key
    :param bool new: True when the link was newly detected in text, False when it
        is an ``<a>`` tag that already existed in the markup
    :return: ``attrs`` unchanged to keep the link, or ``None`` to drop it
    """
    if new:
        # '_text' holds the text as originally written, before any URL fix-ups
        candidate = attrs['_text']
        looks_like_python_file = candidate.endswith('.py')
        has_http_scheme = candidate.startswith(('http:', 'https:'))
        if looks_like_python_file and not has_http_scheme:
            # 'setup.py' is a filename, not a domain under a '.py' TLD
            return None
    # Pre-existing links, and anything that passed the check, go to the next callback
    return attrs


#: Bleach's default linkify callbacks, followed by the filename guard defined above.
LINKIFY_CALLBACKS = [*DEFAULT_CALLBACKS, dont_linkify_filenames]


def sanitize_html(value, valid_tags=None, strip=True, linkify=False):
    """
    Strip unwanted markup out of HTML.

    :param value: HTML text to clean
    :param valid_tags: Mapping of allowed tag name to the list of attributes allowed
        on that tag; :data:`VALID_TAGS` is used when this is ``None``
    :param bool strip: Forwarded to bleach's ``Cleaner`` as its ``strip`` option
    :param bool linkify: If true, run a ``LinkifyFilter`` configured with
        :data:`LINKIFY_CALLBACKS` and :data:`LINKIFY_SKIP_TAGS` as part of cleaning
    :return: The cleaned HTML wrapped in ``Markup``
    """
    if valid_tags is None:
        valid_tags = VALID_TAGS
    if linkify:
        # Cleaner instantiates each filter itself, so pre-bind the configuration
        filters = [
            partial(
                LinkifyFilter, callbacks=LINKIFY_CALLBACKS, skip_tags=LINKIFY_SKIP_TAGS
            )
        ]
    else:
        filters = []
    cleaner = Cleaner(
        tags=list(valid_tags.keys()),
        attributes=valid_tags,
        filters=filters,
        strip=strip,
    )
    return Markup(cleaner.clean(value))
#: Tag names that :func:`text_blocks` treats as starting a new block of text.
blockish_tags: Set[str] = {
    'address',
    'article',
    'aside',
    'audio',
    'blockquote',
    'canvas',
    'dd',
    'div',
    'dl',
    'dt',
    'fieldset',
    'figcaption',
    'figure',
    'footer',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'header',
    'hgroup',
    'hr',
    'li',
    'noscript',
    'ol',
    'output',
    'p',
    'pre',
    'section',
    'table',
    'td',
    'tfoot',
    'th',
    'tr',
    'ul',
    'video',
}
def text_blocks(html_text, skip_pre=True):
    """
    Extract a list of paragraphs from a given HTML string.

    The HTML is parsed as a fragment and walked depth-first. Text belonging to a tag
    in ``blockish_tags`` starts a new entry; text belonging to any other tag is
    appended to the most recent entry.

    :param str html_text: HTML fragment to extract text from
    :param bool skip_pre: If true, the text and children of ``<pre>`` tags are
        ignored (text following the closing tag is still collected)
    :return: List of text blocks, with no-break spaces replaced by regular spaces
    """
    doc = html5lib.parseFragment(html_text)
    blocks = []

    def subloop(parent_tag, element, lastchild=False):
        """Collect text from ``element`` and its descendants into ``blocks``."""
        if callable(element.tag):
            # Comments have a callable tag. TODO: Find out, anything else?
            tag = '<!-->'
            text = ''
            tail = element.tail or ''
        else:
            # Extract tag from namespace: {http://www.w3.org/1999/xhtml}html
            tag = element.tag.split('}')[-1]
            text = element.text or ''
            tail = element.tail or ''

        if tag == 'pre' and skip_pre:
            text = ''

        if tag in blockish_tags or tag == 'DOCUMENT_FRAGMENT':
            # Leading whitespace is insignificant in a block tag
            text = text.lstrip()
            if not len(element):
                # No children? Then trailing whitespace is insignificant
                text = text.rstrip()
            # If there's text, add it.
            # If there's no text but the next element is not a block tag, add a blank
            # anyway (unless it's a pre tag and we want to skip_pre, in which case
            # ignore it again).
            if text:
                blocks.append(text)
            elif (
                len(element)
                and isinstance(element[0].tag, str)
                and element[0].tag.split('}')[-1] not in blockish_tags
                and not (skip_pre and tag == 'pre')
            ):
                blocks.append('')
        else:
            # Inline content: continue the current block, or start the first one
            if not blocks:
                if text:
                    blocks.append(text)
            else:
                blocks[-1] += text

        if len(element) > 0 and not (skip_pre and tag == 'pre'):
            for child in element[:-1]:
                subloop(tag, child)
            # The last child is flagged so its trailing whitespace can be trimmed
            subloop(tag, element[-1], lastchild=True)

        if tag in blockish_tags:
            # Leading whitespace is insignificant after a block tag
            tail = tail.lstrip()
            if tail:
                blocks.append(tail)
        else:
            if parent_tag in blockish_tags and lastchild:
                # Trailing whitespace is insignificant before a block tag end
                tail = tail.rstrip()
            if not blocks:
                if tail:
                    blocks.append(tail)
            else:
                if tag == 'br' and tail:
                    # A <br> inside a block becomes a line break within that block
                    blocks[-1] += '\n' + tail
                else:
                    blocks[-1] += tail

    subloop(None, doc)
    # Replace &nbsp; with ' '
    blocks = [t.replace('\xa0', ' ') for t in blocks]
    return blocks
def normalize_spaces(text):
    """
    Replace whitespace characters with regular spaces.

    Every character in ``unicode_extended_whitespace`` (ASCII line breaks included)
    is replaced one-for-one, so the length of the text does not change.

    :param str text: Text to normalize
    :return: Text with each whitespace character replaced by ``' '``
    """
    return re_singleline_spaces.sub(' ', text)
def normalize_spaces_multiline(text):
    """
    Replace whitespace characters with regular spaces, in multiline text.

    Only the characters in ``unicode_format_whitespace`` are replaced. ASCII
    whitespace, including line breaks such as ``\\n`` and ``\\r``, is left untouched.
    Note that the Unicode NEXT LINE, LINE SEPARATOR and PARAGRAPH SEPARATOR characters
    are part of ``unicode_format_whitespace`` and are therefore replaced.

    :param str text: Text to normalize
    :return: Text with each matching character replaced by ``' '``
    """
    return re_multiline_spaces.sub(' ', text)
def ulstrip(text):
    """
    Strip Unicode extended whitespace from the left side of a string.

    :param str text: Text to strip
    :return: Text without leading ``unicode_extended_whitespace`` characters
    """
    return text.lstrip(unicode_extended_whitespace)
def urstrip(text):
    """
    Strip Unicode extended whitespace from the right side of a string.

    :param str text: Text to strip
    :return: Text without trailing ``unicode_extended_whitespace`` characters
    """
    return text.rstrip(unicode_extended_whitespace)
def ustrip(text):
    """
    Strip Unicode extended whitespace from a string.

    :param str text: Text to strip
    :return: Text without leading or trailing ``unicode_extended_whitespace``
        characters
    """
    return text.strip(unicode_extended_whitespace)
def compress_whitespace(text):
    """
    Reduce all space-like characters into single spaces and strip from ends.

    :param str text: Text to compress
    :return: Text with each run of whitespace collapsed to one ``' '`` and no
        leading or trailing whitespace
    """
    # Collapse runs first so that at most one space is left at each end to strip
    return ustrip(re_compress_spaces.sub(' ', text))
# Based on http://jasonpriem.org/obfuscation-decoder/
# "dot" set off by non-word characters (any case), e.g. " . ", " dot ", "[d0t]"
_deobfuscate_dot1_re = re.compile(r'\W+\.\W+|\W+dot\W+|\W+d0t\W+', re.U | re.I)
# Upper-case "DOT" embedded between lower-case letters or digits
_deobfuscate_dot2_re = re.compile(r'([a-z0-9])DOT([a-z0-9])')
# Lower-case "dot" embedded between upper-case letters or digits
_deobfuscate_dot3_re = re.compile(r'([A-Z0-9])dot([A-Z0-9])')
# "@" with optional surrounding non-word characters, or "at" set off by them
_deobfuscate_at1_re = re.compile(r'\W*@\W*|\W+at\W+', re.U | re.I)
# Upper-case "AT" embedded between lower-case letters or digits
_deobfuscate_at2_re = re.compile(r'([a-z0-9])AT([a-z0-9])')
# Lower-case "at" embedded between upper-case letters or digits
_deobfuscate_at3_re = re.compile(r'([A-Z0-9])at([A-Z0-9])')


def deobfuscate_email(text):
    """
    Deobfuscate email addresses in provided text.

    HTML entities are decoded first, then spelled-out or padded "dot" and "at"
    markers are replaced with ``.`` and ``@``. The substitutions apply to the whole
    text, not just to spans that look like email addresses.

    >>> deobfuscate_email('test at example dot com')
    'test@example.com'
    >>> deobfuscate_email('testATexampleDOTcom')
    'test@example.com'

    :param str text: Text that may contain obfuscated email addresses
    :return: Text with the obfuscation markers replaced
    """
    text = unescape(text)
    # Find the "dot"
    text = _deobfuscate_dot1_re.sub('.', text)
    text = _deobfuscate_dot2_re.sub(r'\1.\2', text)
    text = _deobfuscate_dot3_re.sub(r'\1.\2', text)
    # Find the "at"
    text = _deobfuscate_at1_re.sub('@', text)
    text = _deobfuscate_at2_re.sub(r'\1@\2', text)
    text = _deobfuscate_at3_re.sub(r'\1@\2', text)
    return text
# Translation table that deletes every ASCII punctuation character; built once at
# import time instead of on every call.
_punctuation_table = str.maketrans('', '', string.punctuation)


def simplify_text(text):
    """
    Simplify text to allow comparison.

    ASCII punctuation is removed, the text is lower-cased, and every run of
    whitespace is collapsed to a single space with none left at either end.

    >>> simplify_text("Awesome Coder wanted at Awesome Company")
    'awesome coder wanted at awesome company'
    >>> simplify_text("Awesome Coder, wanted  at Awesome Company! ")
    'awesome coder wanted at awesome company'
    >>> simplify_text("Awesome Coder, wanted  at Awesome Company! ") == (
    ...     'awesome coder wanted at awesome company')
    True

    :param str text: Text to simplify
    :return: Simplified text
    """
    text = text.translate(_punctuation_table).lower()
    # split() with no arguments drops leading/trailing whitespace and merges runs
    return ' '.join(text.split())