Source code for preprocess.shallow.techniques
#!/usr/bin/env python3.6
"""
Shallow parsing techniques for NLP text
=========================================
Techniques to transform text into informed strings, not just plain sentences.
Texts are less complex if:
- stopwords are eliminated or not
- capital letters are deleted or not
- multiwords are treated as is
- TODO: end this list
This module concentrates the majority of the shallow parsing techniques
identified in Paraphrase Detection papers.
"""
__author__ = 'Abel Meneses-Abad'
from configparser import ConfigParser
import preprocess
from os.path import join, relpath
from preprocess.shallow import LANGUAGES
from string import punctuation
# TODO: decide how this module should behave when nltk is not installed.
try:
    from nltk.corpus import stopwords
    from nltk.tag import StanfordPOSTagger
    from nltk.stem import SnowballStemmer
    from nltk.stem import WordNetLemmatizer
except ImportError:
    # nltk is treated as optional at import time: the module still loads,
    # and any function that needs one of the names above fails with a
    # NameError when it is called. Only ImportError is swallowed so that
    # unrelated failures (KeyboardInterrupt, SystemExit, bugs raised while
    # importing) are no longer hidden.
    pass
# Read the Stanford tagger settings from the cfg file located under the
# installed ``preprocess`` package directory.
config = ConfigParser()
config.read(preprocess.__path__[0] + '/data/cfg/stanford.cfg')

# Directory of the Stanford POS distribution, relative to the current
# working directory at import time.
st4_pos_dir = relpath(config['POS']['stanford_dir'])

# Tagger model file per language code; only English is registered here.
stanford_pos_model = {
    'en': relpath(join(st4_pos_dir, config['POS']['stanford_eng_model'])),
}

# Jar passed to the NLTK Stanford tagger wrapper.
stanford_pos_jar = relpath(join(st4_pos_dir, config['POS']['stanford_jar']))
def pos(text, lang='en', interface='stanford', multioutput='raw_value'):
    """Part of Speech Tagging.

    Parameters
    ----------
    text : str
        String to parse, generally a sentence.
    lang : str
        Natural language of the text.
    interface : str
        Tag of one of the interfaces implemented in preprocess:
        ``'stanford'`` or ``'freeling'``.
    multioutput : str
        Format type of the output, one of
        ``['raw_value', 'tuple_list', 'raw_tag']``:

        * raw value - string format
        * tuple list - format implemented for ngram generalization of
          some token distances in the textsim package.
        * raw tag - string only with POS tags

    Returns
    -------
    parsed result : str or list
        For the ``'stanford'`` interface, whatever ``__stanford_pos``
        returns for the requested ``multioutput``: a string, a list of
        tuples ``[(token, POS tag)]`` or the POS tags substituting tokens.
        For the ``'freeling'`` interface the result is always the empty
        string (no tagging is performed in this module for it).

    Raises
    ------
    ValueError
        If ``interface`` is not one of the supported tags.

    Note
    ----
    The returned string structure is built to be used with textsim string
    and token distances.
    """
    if interface == 'stanford':
        return __stanford_pos(text, lang, multioutput)
    if interface == 'freeling':
        # Placeholder branch: nothing is tagged, an empty string is returned.
        return ''
    # Previously an unknown interface fell through and failed with an
    # UnboundLocalError on ``result``; report the real problem instead.
    raise ValueError(
        "Unknown POS interface %r; expected 'stanford' or 'freeling'"
        % (interface,)
    )
def __stanford_pos(text,lang='en',multioutput='raw_value'):
    """Tag ``text`` through the NLTK wrapper of the Stanford POS tagger.

    ``text`` is split on whitespace and the resulting tokens are tagged by
    a ``StanfordPOSTagger`` built from the model registered for ``lang`` in
    ``stanford_pos_model`` and the jar at ``stanford_pos_jar``.

    The value returned depends on ``multioutput``:

    * ``'raw_value'``  - every ``token/TAG`` pair followed by one space.
    * ``'tuple_list'`` - the ``(token, tag)`` sequence produced by the tagger.
    * ``'raw_tag'``    - every tag followed by one space.

    Any other value, or a ``multioutput`` that is not a string, yields
    ``None``.
    """
    tagger = StanfordPOSTagger(model_filename=stanford_pos_model[lang], path_to_jar=stanford_pos_jar)
    tagged_tokens = tagger.tag(text.split())

    # Both string renderings keep a trailing space after the last item.
    slashed = ''.join(token + '/' + label + ' ' for (token, label) in tagged_tokens)
    labels_only = ''.join(label + ' ' for (_, label) in tagged_tokens)

    if not isinstance(multioutput, str):
        return None
    formats = {
        'raw_value': slashed,
        'tuple_list': tagged_tokens,
        'raw_tag': labels_only,
    }
    return formats.get(multioutput)
def _load_stopword_file(stops_path, lang):
    """Return the set of stopwords stored for ``lang`` under ``stops_path``."""
    try:
        with open(join(stops_path, lang + '.txt')) as handle:
            return set(handle.read().split())
    except FileNotFoundError:
        # Earlier versions built the name without the dot ("<lang>txt");
        # keep accepting files that were named to match that behaviour.
        with open(stops_path + '/' + lang + 'txt') as handle:
            return set(handle.read().split())


def remove_stopwords(text, lang='en', stops_path='', ignore_case=True):
    """Remove stopwords, punctuation tokens and short words from ``text``.

    :Software: Based on Normalizr package remove_stop_words.

    Parameters
    ----------
    text : str
        Text to filter. It is split on single spaces.
    lang : str
        Language code. With ``stops_path`` it selects the stopword file;
        otherwise it selects the NLTK stopwords corpus.
    stops_path : str
        Directory holding one whitespace-separated stopword file per
        language, named ``<lang>.txt``. When empty, the NLTK stopwords
        corpus is used instead.
    ignore_case : bool
        When true the kept words keep their original casing; when false
        they are lower-cased. Matching against the stopword list is always
        done on the lower-cased word.

    Returns
    -------
    str
        The kept words, each one preceded by a single space (a non-empty
        result therefore starts with a space); ``''`` when nothing is kept.

    Raises
    ------
    FileNotFoundError
        If ``stops_path`` is given and holds no stopword file for ``lang``.

    Notes
    -----
    Besides stopwords, every word of three characters or fewer is dropped.
    """
    if stops_path:
        stop_words = _load_stopword_file(stops_path, lang)
    else:
        # stemming() translates the language code through LANGUAGES before
        # handing it to NLTK; do the same here and fall back to the raw
        # value so callers already passing a corpus name keep working.
        # NOTE(review): assumes LANGUAGES maps codes such as 'en' to the
        # names used by the NLTK stopwords corpus - confirm.
        try:
            corpus_name = LANGUAGES[lang]
        except KeyError:
            corpus_name = lang
        stop_words = set(stopwords.words(corpus_name))
    stop_words.update(punctuation)

    kept = [
        word for word in text.split(' ')
        if len(word) > 3 and word.lower() not in stop_words
    ]
    if not ignore_case:
        kept = [word.lower() for word in kept]
    # Each kept word is prefixed with a space, matching the historical output.
    return ''.join(' ' + word for word in kept)
def stemming(text, lang='en'):
    """Replace every whitespace-separated word of ``text`` by its stem.

    A ``SnowballStemmer`` is created for ``LANGUAGES[lang]`` and applied to
    each token; the stems are returned joined by single spaces.
    """
    snowball = SnowballStemmer(LANGUAGES[lang])
    stems = [snowball.stem(token) for token in text.split()]
    return ' '.join(stems)
# Part-of-speech names recognised by lemmatization() and the single-letter
# code handed to the lemmatizer for each of them. Tags missing from this
# table leave the word untouched there.
# NOTE(review): the tag names produced by pos() depend on the tagger model;
# confirm they actually match these keys.
POS_LIST = dict(
    ADJ='a',
    ADJ_SAT='s',
    ADV='r',
    NOUN='n',
    VERB='v',
)
def lemmatization(text, lang='en', input_type='raw_value'):
    """Lemmatize words with the WordNet lemmatizer.

    Parameters
    ----------
    text : str or iterable of (word, tag) pairs
        A plain string when ``input_type`` is ``'raw_value'``; an iterable
        of ``(word, POS tag)`` pairs when it is ``'tuple_list'``.
    lang : str
        Accepted for symmetry with the other techniques; not used here.
    input_type : str
        ``'raw_value'`` or ``'tuple_list'``.

    Returns
    -------
    str or None
        * ``'raw_value'``: the lemmas of the whitespace-separated words,
          joined by single spaces (the lemmatizer's default part of speech
          is used).
        * ``'tuple_list'``: each word followed by one space; words whose
          tag is a key of ``POS_LIST`` are lemmatized with the mapped
          code, the others are copied unchanged.
        * any other ``input_type``: ``None``.
    """
    wordnet = WordNetLemmatizer()

    if input_type == 'tuple_list':
        pieces = []
        for token, tag in text:
            if tag in POS_LIST:
                pieces.append(wordnet.lemmatize(token, POS_LIST[tag]))
            else:
                pieces.append(token)
        # Every item, including the last one, is followed by a space.
        return ''.join(piece + ' ' for piece in pieces)

    if input_type == 'raw_value':
        return ' '.join(map(wordnet.lemmatize, text.split()))

    return None
# TODO: look at how spaCy warns when it is not installed to learn how to handle a missing library.
if __name__ == '__main__':
    # Interactive demo: read one line from the user and show it POS-tagged
    # with the default interface and output format.
    sample = input("Input text A:")
    tagged = pos(sample)
    print("The inputed text can be lexicalized '%s'" % tagged)