#!/usr/bin/env python 3.5
"""Module for popular text normalization techniques:
- url replacement (func: replace_urls)
- symbols replacement (func: replace_symbols)
- abbreviations dot marking, with '_'
- replace punctuation and other noisy chars
- and other functions designed for text coming from pdfMiner, pdf2txt
.. author: Abel Meneses abad
"""
import re, os
import string
from .punctuation import Replacer
from .symbols import replace as sreplace
#TODO Add decorator for importing external function's docstring
#Support for spanish texts
LETTERS = ''.join([string.ascii_letters,'ñÑáéíóúÁÉÍÓÚüÜ'])
#NORMALIZATION FUNCTIONS
def replace_urls(text: str) -> str:
for i in re.finditer('www\S*(?=[.]+?\s+?|[.]\Z|\w\s)|http\S*(?=[.]+?\s+?|[.]\Z|\w\s)',text):
for j in range(i.start(),i.end()):
if text[j] in string.punctuation:
text = text[:j]+'_'+text[j+1:]
return text
def replace_symbols(text: str) -> str:
    """Replace symbols in *text*.

    Thin wrapper that delegates to ``replace`` from the sibling ``symbols``
    module (imported here as ``sreplace``) and returns its result unchanged.
    """
    replaced = sreplace(text)
    return replaced
# A dot followed by one or more further dots, each of them optionally
# preceded by whitespace: "..", "....", ". . .", ".\n." and so on.
_DOT_SEQUENCE_RE = re.compile(r'[.](?:\s*[.])+')


def replace_dot_sequence(text: str) -> str:
    """
    Replace a contiguous dot sequence by the same amount of
    whitespace.

    Please read carefully the documentation to see all the
    conventions adopted to replace these sequences, and how to
    maintain dot sentence delimiters for sentence tokenizers.

    Note
    ----
    Every dot of a sequence is replaced by exactly one space and any
    whitespace inside the sequence is kept as it is, so the length of the
    text never changes. This matters because the function is also used for
    the alignment process after normalization, where maintaining the
    length of the original text is important.

    A single dot (a regular sentence delimiter) is never touched; only
    runs of two or more dots, optionally separated by whitespace, are.

    Parameters
    ----------
    text : str
        text to process

    Returns
    -------
    text : str
        The text with every dot of a dot sequence turned into a space.
    """
    # The callable keeps the replacement the same length as the match.
    return _DOT_SEQUENCE_RE.sub(
        lambda match: match.group(0).replace('.', ' '),
        text,
    )
def multipart_words(text: str) -> str:
    """Join the parts of multi-part words with underscores.

    Hyphenated words like 'end-of-line' are called multi-part words in
    NLP. Every ``-``, ``@`` or ``.`` that sits directly between two word
    characters is changed to an underscore, so 'end-of-line' becomes
    'end_of_line'. A separator that is not immediately followed by a word
    character (for example a sentence-ending dot) is left alone.

    Note
    ----
    Syllable segmentation in rich-format text adds extra hyphens to
    every text; those hyphens are removed in :func:`replace_punctuation`.

    Parameters
    ----------
    text : str
        text to process

    Returns
    -------
    text : str
        The text with in-word separators replaced by underscores.
    """
    # Raw strings keep the regex escapes and the group reference valid.
    return re.sub(r'(\w+)[-@.](?=\w)', r'\g<1>_', text)
#----------------------CONTRACTIONS REPLACEMENT
#Contractions patterns based on NLTK Book suggestions
# Ordered (regex, replacement) pairs. Order matters: the specific forms
# (won't, can't, isn't) must run before the generic "n't" rule, and the
# "'d" -> "had" rule must run before the generic "'d" -> "would" rule.
contractions_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'can not'),
    (r'i\'m', 'i am'),
    (r'isn\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    # Heuristic: "'d" followed by a word ending in "ed" is read as "had"
    # ("he'd walked" -> "he had walked"). The lookahead has to skip the
    # whitespace between the contraction and the next word.
    (r'(\w+)\'d(?=\s+\w+ed\b)', r'\g<1> had'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
]
def expand_contractions(text: str, lang='en') -> str:
    """Expand English contractions.

    Every ``(pattern, replacement)`` pair of ``contractions_patterns`` is
    applied to the text, one after another, in list order.

    Parameters
    ----------
    text : str
        text to process
    lang : str
        Accepted for API compatibility; the current implementation does
        not read it.

    Returns
    -------
    text : str
        The text after all substitutions have been applied.
    """
    expanded = text
    for regex, replacement in contractions_patterns:
        expanded = re.sub(regex, replacement, expanded)
    return expanded
def replace_punctuation(text: str) -> str:
    """
    Replace all punctuation characters using the patterns contained in
    the punctuation script. The regular expressions there are ordered by
    structural elements (e.g. word syllabic division), then paragraph and
    sentence transformations.

    Note
    ----
    All syntactic and morphologic transformations that depend on
    punctuation signs must be done before applying this function.

    It is important to apply :func:`replace_symbols` before this function.
    Abbreviation recognition, multipart words, dot-sequence replacement
    and URL replacement also work with punctuation signs, so if those
    signs are not underscored or transformed beforehand, this function
    takes its own decisions with the remaining ones.
    For example, sentence tokenization will change in case of rare
    quotations: “.
    """
    #TODO: program this like re.sub(pattern, repl, text).
    # A new Replacer is created on every call and applied once.
    return Replacer().replace(text)
#TODO: program this func to permit the addition of new RE by the
#user like spacy
def lowercase(text: str) -> str:
    """Return the lowercase version of a string.

    When *text* is not a ``str`` a message is printed and nothing is
    returned (i.e. the result is ``None``).
    """
    #TODO: after adding typing check delete if/else structure
    if not isinstance(text, str):
        print('Input must be a string')
        return None
    return text.lower()
#PREPROCESSING FUNCTIONS
#TODO: look comments in CHANGELOG for v0.3.3
def add_doc_ending_point(text: str) -> str:
    """
    Add a final dot to the text.

    Comes from the clean_punctuation script but with fewer
    functionalities: it only adds an ending point at the end of the
    document.

    Note
    -----
    This function guarantees that the last sentence has an ending point.
    The sentence tokenization process can then be standardized, because
    every sentence, even the last one, has an ending point. Later
    functions need the last character to be the final dot of the last
    sentence.

    Parameters
    -----------
    text : str
        text to process

    Returns
    -------
    text: str
        The same text but, if missing, with ``' .'`` appended at the end.
    """
    # Everything after the last dot; the whole text when there is no dot,
    # because rfind() then returns -1.
    tail = text[text.rfind('.') + 1:]
    # Valid letters after the last dot mean the final sentence is not
    # terminated, so a new ending point is appended.
    if set(LETTERS) & set(tail):
        text = text + ' .'
    return text
def del_tokens_len_one(text: str) -> str:
    """Delete tokens with length = 1.

    This is kind of a basic stopword filtering. A token is a single word
    character with whitespace on both sides; the token and its surrounding
    whitespace are collapsed into one space. Runs of consecutive
    one-character tokens ("a b c") are removed as a whole.

    Note
    ----
    A one-character token at the very start or end of the text has no
    whitespace on one side and is therefore kept.

    Parameters
    ----------
    text : str
        text to process

    Returns
    -------
    text : str
        The text without the one-character tokens.
    """
    # The repeated group consumes a whole run of one-character tokens, so
    # the whitespace shared by two neighbouring tokens cannot hide the
    # second one from the search.
    return re.sub(r'\s(?:\w\s)+', ' ', text)
def del_digits(text: str) -> str:
    """Delete digit sequences from the text.

    Each run of digits, together with any whitespace directly around it,
    is replaced by a single space.

    NOTE(review): the original description says only words made up
    entirely of digits are deleted, but digits embedded in a word are
    removed too ('abc123def' becomes 'abc def'). Confirm which behavior
    is intended before tightening the pattern.

    Parameters
    ----------
    text : str
        text to process

    Returns
    -------
    text : str
        The text with digit runs replaced by a space.
    """
    return re.sub(r'\s*\d+\s*', ' ', text)
# TODO: implement Deep Learning for sentence parsing. This is experimental,
# because after preProcessFlow all sentences are well defined by char '.'.