# Source code for coaster.nlp
# -*- coding: utf-8 -*-
"""
Natural language processing
===========================
Provides a wrapper around NLTK to extract named entities from HTML text::

    from coaster.utils import text_blocks
    from coaster.nlp import extract_named_entities
    html = "<p>This is some HTML-formatted text.</p><p>In two paragraphs.</p>"
    textlist = text_blocks(html)  # Returns a list of paragraphs.
    entities = extract_named_entities(textlist)
"""
import nltk
def extract_named_entities(text_blocks):
    """
    Return the set of named entities found in the provided text blocks.

    Each text block is split into sentences, the sentences are tokenized and
    part-of-speech tagged, and the tagged sentences are passed to NLTK's named
    entity chunker. Every subtree labelled ``'NE'`` contributes one entity name.

    :param text_blocks: Iterable of text strings (for example, a list of
        paragraphs)
    :return: A ``set`` of entity names, each being the entity's tokens joined
        with single spaces. The set is empty when the input yields no sentences.
    """
    sentences = []
    for text in text_blocks:
        sentences.extend(nltk.sent_tokenize(text))
    if not sentences:
        # Nothing to tag or chunk, so return before calling the tagger and the
        # named entity chunker at all.
        return set()

    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(tokens) for tokens in tokenized_sentences]
    # binary=True is what makes the chunker use the single 'NE' label that
    # extract_entity_names looks for below.
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(t):
        """Recursively collect the names of all 'NE' subtrees under ``t``."""
        # Leaves are plain tagged tokens without a ``label`` method; they hold
        # no entities of their own.
        if not hasattr(t, 'label'):
            return []
        if t.label() == 'NE':
            # child[0] is the token text of each (token, tag) leaf.
            return [' '.join(child[0] for child in t)]
        names = []
        for child in t:
            names.extend(extract_entity_names(child))
        return names

    entity_names = set()
    for tree in chunked_sentences:
        entity_names.update(extract_entity_names(tree))
    return entity_names