Source code for preprocess.basic.normalize

#!/usr/bin/env python 3.5

"""Module for popular text normalization techniques:

    - url replacement (func: replace_urls)
    - symbols replacement (func: replace_symbols)
    - abbreviations dot marking, with '_'
    - replace punctuation and other noisy chars
    - and other functions designed for text coming from pdfMiner, pdf2txt

.. author: Abel Meneses abad
"""

import re, os
import string
from .punctuation import Replacer
from .symbols import replace as sreplace

#TODO Add decorator for importing external function's docstring

#Support for spanish texts
LETTERS = ''.join([string.ascii_letters,'ñÑáéíóúÁÉÍÓÚüÜ'])

#NORMALIZATION FUNCTIONS

# URL-like token starting with "www" or "http". The lookahead trims the match
# so that it ends right before a dot that closes a sentence (dot + whitespace,
# or dot at the very end of the text) or before a word character that is
# followed by whitespace. The trimmed tail never contains punctuation that
# should be rewritten, so the sentence delimiter is left untouched.
_URL_PATTERN = re.compile(
    r'www\S*(?=[.]+?\s+?|[.]\Z|\w\s)|http\S*(?=[.]+?\s+?|[.]\Z|\w\s)'
)

# Maps every ASCII punctuation character to an underscore.
_PUNCT_TO_UNDERSCORE = str.maketrans(dict.fromkeys(string.punctuation, '_'))


def replace_urls(text: str) -> str:
    """Replace the punctuation characters inside URLs with underscores.

    A URL is any run of non-whitespace characters starting with ``www`` or
    ``http``. Each punctuation character inside it is replaced by exactly one
    ``_``, so the returned text has the same length as the input.

    Note
    ----
    A URL is only recognised when it is followed by whitespace or by a dot
    that ends the sentence/text. A URL that is the very last token of the
    text, with nothing after it, is returned unchanged.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The text with URL punctuation underscored, e.g.
        ``'see http://a.com now'`` becomes ``'see http___a_com now'``.
    """
    # One pass over the text: a callable replacement rewrites each match in
    # place instead of rebuilding the whole string once per character.
    return _URL_PATTERN.sub(
        lambda match: match.group(0).translate(_PUNCT_TO_UNDERSCORE),
        text,
    )

def replace_symbols(text: str) -> str:
    """Replace symbols in ``text`` by delegating to ``symbols.replace``.

    The substitution rules live in the sibling ``symbols`` module (imported
    in this file as ``sreplace``); this wrapper only exposes them as part of
    the normalization API.
    """
    replaced = sreplace(text)
    return replaced

# Dot, optional whitespace, another dot, then exactly ONE character out of
# {whitespace, '|', '[', '.'} and any number of ']' characters.
# NOTE(review): this is the original expression '[.]\s*?[.]+?[\s|[.]]*' with
# its character class spelled out explicitly. The trailing part was probably
# meant to be '[\s.]*' ("any run of whitespace or dots"); as written, a
# sequence such as '....' leaves its fourth dot in place and '..' at the very
# end of the text is not matched. Matching is kept unchanged here because the
# adopted conventions are documented outside this module - confirm intent.
_DOT_SEQUENCE = re.compile(r'[.]\s*?[.]+?[\s|\[.]\]*')


def replace_dot_sequence(text: str) -> str:
    """Replace a sequence of dots by the same amount of whitespace.

    Every dot inside a matched sequence becomes a single space; every other
    character is kept. The returned text therefore always has the same length
    as the input, which matters for the alignment process that runs after
    normalization.

    Please read the project documentation carefully to see all the
    conventions adopted to replace these sequences, and how dot sentence
    delimiters are maintained for sentence tokenizers.

    Note
    ----
    A fixed replacement string cannot be used here because the amount of
    whitespace to insert depends on each match, so the replacement is
    computed per match.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        Text of identical length with the matched dots blanked out.
    """
    return _DOT_SEQUENCE.sub(
        lambda match: match.group(0).replace('.', ' '),
        text,
    )
def multipart_words(text: str) -> str:
    """Join the parts of multi-part words with underscores.

    Hyphenated words like 'end-of-line' are called multi-part words in NLP.
    Every '-', '@' or '.' that sits directly between two word characters is
    replaced by an underscore, so e-mail addresses and dotted numbers are
    joined as well ('user@example.com' becomes 'user_example_com').

    Note
    ----
    Syllable segmentation of rich-format text adds extra hyphens to a text;
    those hyphens are removed in :func:`replace_punctuation`.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        Text with the inner separators of multi-part words underscored.
    """
    # The lookahead keeps the following word character out of the match so
    # that it can start the next match ('end-of-line' needs two passes over
    # 'of').
    return re.sub(r'(\w+)[-@.](?=\w)', r'\g<1>_', text)
#----------------------CONTRACTIONS REPLACEMENT
#Contractions patterns based on NLTK Book suggestions.
# Each entry is (regex, replacement) and entries are meant to be applied in
# order: the irregular forms come first, and the "had" reading of 'd must be
# tried before the generic "would" reading.
# Patterns are lowercase and carry no flags, so "I'm" is not expanded unless
# the text is lowercased first. 's is always expanded to "is".
contractions_patterns = [
    (r"won't", 'will not'),
    (r"can't", 'can not'),
    (r"i'm", 'i am'),
    (r"isn't", 'is not'),
    (r"(\w+)'ll", r'\g<1> will'),
    (r"(\w+)n't", r'\g<1> not'),
    (r"(\w+)'ve", r'\g<1> have'),
    # 'd followed by a word ending in "ed" is read as "had" (heuristic for a
    # past participle). The lookahead must skip the whitespace between both
    # words, otherwise it can never match in normal text.
    (r"(\w+)'d(?=\s+\w+ed\b)", r'\g<1> had'),
    (r"(\w+)'s", r'\g<1> is'),
    (r"(\w+)'re", r'\g<1> are'),
    (r"(\w+)'d", r'\g<1> would'),
]
def expand_contractions(text: str, lang='en') -> str:
    """Expand English contractions (e.g. "can't" becomes "can not").

    Applies every (pattern, replacement) pair of ``contractions_patterns`` to
    the text, in list order. No regex flags are used, so matching is
    case-sensitive.

    Parameters
    ----------
    text : str
        Text to process.
    lang : str
        Not used by the current implementation; only English patterns are
        applied whatever its value.

    Returns
    -------
    str
        Text with the known contractions expanded.
    """
    for pattern, repl in contractions_patterns:
        text = re.sub(pattern, repl, text)
    return text
def replace_punctuation(text: str) -> str:
    """Replace punctuation using the patterns of the punctuation script.

    The regular expressions are ordered based on structural elements (e.g.
    syllabic division of words), then paragraph and sentence transformations.

    Note
    ----
    All syntactic and morphologic transformations that depend on punctuation
    signs must be done before calling this function. It is important to apply
    :func:`replace_symbols` first. Abbreviation recognition,
    :func:`multipart_words`, :func:`replace_dot_sequence` and
    :func:`replace_urls` also work with punctuation signs: whatever they have
    not underscored or transformed is handled here according to this
    function's own rules. For example, sentence tokenization will change in
    the presence of rare quotation marks such as “.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The value returned by ``Replacer().replace(text)``.
    """
    #TODO: program this like re.sub(pattern, repl, text).
    return Replacer().replace(text)
#TODO: extend replace_punctuation so that users can add their own regular
#expressions, as spaCy allows.
def lowercase(text: str) -> str:
    """Return ``text`` converted to lowercase.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The lowercased text.

    Raises
    ------
    TypeError
        If ``text`` is not a string. Failing here is clearer than printing a
        message and returning ``None``, which would only break a later step.
    """
    if not isinstance(text, str):
        raise TypeError('Input must be a string')
    return text.lower()
#TODO: once static type checking is in place, drop the runtime string check
#in lowercase.

#PREPROCESSING FUNCTIONS
def extraspace_for_endingpoints(text: str) -> str:
    """Separate line-ending dots from the preceding word by a whitespace.

    Every dot that is followed (after optional whitespace) by a newline is
    rewritten as: space, dot, newline, space. This gives each sentence a very
    distinctive ending point and lets the sentence parser tell it apart from
    abbreviation dots. The original objective was to preserve line breaks in
    datasets with one sentence per line (e.g. paraphrase detection, STS).

    Note
    ----
    :func:`replace_punctuation` also intends to do this, but because of the
    complexity of its regular expressions this function is what guarantees
    that line-ending dots are separated by at least one whitespace from any
    other character.

    The space before the dot is added even when one is already present, and
    whitespace between the dot and the last newline of the run (blank lines
    included) is dropped.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        Text with every line-ending dot isolated by whitespace.
    """
    return re.sub(r'[.]\s*\n', ' .\n ', text)
#TODO: see the comments in the CHANGELOG for v0.3.3
def add_doc_ending_point(text: str) -> str:
    """Add a final dot to the text when the last sentence lacks one.

    Comes from the clean_punctuation script but with fewer functionalities:
    it only adds an ending point at the end of the document.

    Note
    -----
    This function guarantees that the last sentence has an ending point, so
    the sentence tokenization process can be standardized: every sentence,
    even the last one, ends with a dot.

    Parameters
    -----------
    text : str
        Text to process.

    Returns
    -------
    text : str
        The same text but, if missing, with ' .' appended at the end.
    """
    # Later functions need the last character to be the final dot of the last
    # sentence. Look at what follows the last dot of the text (the whole text
    # when it has no dot at all, since rfind returns -1 in that case).
    last_dot = text.rfind('.')
    tail = text[last_dot + 1:]
    # Letters after the last dot mean there is an unterminated sentence.
    if any(char in LETTERS for char in tail):
        text += ' .'
    return text
def del_tokens_len_one(text: str) -> str:
    """Delete tokens with length = 1.

    This is a kind of basic stopword filtering. A token is a single word
    character (letter, digit or underscore) with whitespace on both sides;
    the token and its surrounding whitespace are replaced by one space.
    Tokens at the very start or end of the text have no whitespace on one
    side and are therefore kept.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        Text without the whitespace-delimited one-character tokens.
    """
    # Match a whole run of one-character tokens at once. Matching a single
    # token together with both neighbouring whitespaces would consume the
    # separator that the next token needs, so every second token of a run
    # would survive.
    return re.sub(r'(?:\s\w)+\s', ' ', text)
def del_digits(text: str) -> str:
    """Delete words composed only of digits.

    Each digit-only word, together with the whitespace around it, is replaced
    by a single space. Digits that are part of a longer word (e.g.
    'word2vec') are kept.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        Text without digit-only words.
    """
    # The word boundaries restrict the match to whole digit-only words;
    # without them the digits inside mixed words would be removed too.
    return re.sub(r'\s*\b\d+\b\s*', ' ', text)
# TODO: implement Deep Learning for sentence parsing. This is experimental,
# because after preProcessFlow all sentences are well defined by the '.' char.