# Source code for preprocess.shallow.techniques

#!/usr/bin/env python 3.6

"""
Shallow parsing techniques for NLP text
=========================================

Techniques to transform text into informed strings, not just the sentences.

Texts are less complex if:
- stopwords are eliminated or not
- capital letters are deleted or not
- multiwords are treated as is
- TODO: end this list

This module concentrates the majority of shallow parsing techniques identified
in Paraphrase Detection papers.

"""

__author__ = 'Abel Meneses-Abad'

from configparser import ConfigParser
import preprocess
from os.path import join, relpath
from preprocess.shallow import LANGUAGES
from string import punctuation

# TODO: decide how the module should behave when NLTK is not installed.
# NLTK is optional at import time: when it cannot be imported the names below
# stay undefined and the functions that use them fail with NameError on call.
try:
    from nltk.corpus import stopwords
    from nltk.tag import StanfordPOSTagger
    from nltk.stem import SnowballStemmer
    from nltk.stem import WordNetLemmatizer
except ImportError:
    # Only a missing NLTK installation is tolerated here. A bare ``except``
    # would also hide KeyboardInterrupt, SystemExit and genuine bugs raised
    # while importing.
    pass

# Load the Stanford tagger settings shipped inside the ``preprocess`` package.
# ConfigParser.read() silently skips a file it cannot open, so a missing
# stanford.cfg only shows up as a KeyError on config['POS'] below.
config = ConfigParser()
config.read(preprocess.__path__[0]+'/data/cfg/stanford.cfg')
# Tagger model file per language code; only 'en' is registered in this module.
stanford_pos_model = {}

# Paths come from the [POS] section and are rewritten relative to the current
# working directory at import time (relpath's default start directory).
st4_pos_dir = relpath(config['POS']['stanford_dir'])
stanford_pos_model['en'] = relpath(join(st4_pos_dir,config['POS']['stanford_eng_model']))
stanford_pos_jar = relpath(join(st4_pos_dir,config['POS']['stanford_jar']))

def pos(text, lang='en', interface='stanford', multioutput='raw_value'):
    """Part of Speech Tagging.

    Parameters
    ----------
    text: string to parse, generally a sentence.

    lang: natural language of the text.

    interface: a tag of one of the implemented interfaces in preprocess.
               string in ['stanford', 'freeling']
               * stanford - delegates to the NLTK Stanford POS Tagger wrapper.
               * freeling - placeholder, always returns an empty string.

    multioutput: Format type of the output.
                 string in ['raw_value', 'tuple_list', 'raw_tag']
                 * raw value - string format
                 * tuple list - format is implemented for ngram generalization of
                 some token distances in textsim package.
                 * raw tag - string only with POS tags

    Returns
    -------

    parsed result : string output, list of tuples [(token, POS tag)],
                    POS-tags substituting tokens.

    Raises
    ------

    ValueError : if ``interface`` is not one of the supported tags.

    Note
    ----

    The returned string structure is built to use textsim string and token
    distances.

    """

    if interface == 'stanford':
        return __stanford_pos(text, lang, multioutput)
    if interface == 'freeling':
        # No FreeLing backend is wired in; the empty string is the
        # long-standing placeholder result for this interface.
        return ''

    # Previously an unknown interface fell through to ``return result`` with
    # ``result`` never assigned, surfacing as an opaque UnboundLocalError.
    raise ValueError(
        "unknown POS interface %r; expected 'stanford' or 'freeling'" % (interface,)
    )

def __stanford_pos(text, lang='en', multioutput='raw_value'):
    """Tag ``text`` with the Stanford POS tagger through its NLTK wrapper.

    ``text`` is split on whitespace and every token is tagged using the model
    registered for ``lang`` in ``stanford_pos_model``.

    ``multioutput`` selects the rendering of the result:

    * 'raw_value'  - "token/TAG " repeated for each token (trailing space kept)
    * 'tuple_list' - the (token, tag) pairs exactly as returned by the tagger
    * 'raw_tag'    - "TAG " repeated for each token (trailing space kept)

    Any other value, or a non-string ``multioutput``, yields ``None``.
    """
    tagger = StanfordPOSTagger(
        model_filename=stanford_pos_model[lang],
        path_to_jar=stanford_pos_jar,
    )
    tagged_tokens = tagger.tag(text.split())

    # Both string renderings end every item, including the last, with a space.
    token_and_tag = ''.join(word + '/' + tag + ' ' for word, tag in tagged_tokens)
    tags_only = ''.join(tag + ' ' for _, tag in tagged_tokens)

    if not isinstance(multioutput, str):
        return None

    renderings = {
        'raw_value': token_and_tag,
        'tuple_list': tagged_tokens,
        'raw_tag': tags_only,
    }
    return renderings.get(multioutput)

def _read_stopword_file(stops_path, lang):
    """Return the whitespace-separated stop words stored for ``lang``.

    Looks in ``stops_path`` for '<lang>.txt' first and then for the legacy
    '<lang>txt' name (no dot) that this module used to open, so existing
    setups keep working. Raises FileNotFoundError when neither file exists.
    """
    for file_name in (lang + '.txt', lang + 'txt'):
        try:
            # The context manager closes the handle; the old code leaked it.
            with open(join(stops_path, file_name)) as handle:
                return set(handle.read().split())
        except FileNotFoundError:
            continue
    raise FileNotFoundError(
        "no stop-word file for language %r in %r" % (lang, stops_path)
    )


def remove_stopwords(text, lang='en', stops_path='', ignore_case = True):
    """Remove stopwords based on language.

    :Software: Based on Normalizr package remove_stop_words.

    Parameters
    ----------
    text: string to filter; it is split on single spaces.

    lang: language code used to pick the stop-word list.

    stops_path: directory holding one stop-word file per language. When
                empty, the NLTK stopwords corpus is used instead.

    ignore_case: when True the kept words are returned with their original
                 casing; when False they are lowercased. Matching against
                 the stop-word list is always done on the lowercased word.

    Returns
    -------

    string with every kept word preceded by one space (so a non-empty result
    starts with a space). Words of three characters or fewer are dropped
    together with the stop words.
    """
    if stops_path:
        stop_words = _read_stopword_file(stops_path, lang)
    else:
        # NLTK names its stop-word lists by full language name ('english'),
        # so translate the short code through LANGUAGES as stemming() does.
        # Codes missing from LANGUAGES are passed through unchanged.
        try:
            corpus_name = LANGUAGES[lang]
        except KeyError:
            corpus_name = lang
        stop_words = set(stopwords.words(corpus_name))
    stop_words.update(punctuation)

    kept = [
        word for word in text.split(' ')
        if len(word) > 3 and word.lower() not in stop_words
    ]
    if not ignore_case:
        kept = [word.lower() for word in kept]
    return ''.join(' ' + word for word in kept)
def stemming(text, lang='en'):
    """Stem every whitespace-separated word of ``text`` with Snowball.

    ``lang`` is looked up in ``LANGUAGES`` and the value found there is the
    language handed to ``SnowballStemmer``. The stems are returned joined by
    single spaces.
    """
    snowball = SnowballStemmer(LANGUAGES[lang])
    stems = [snowball.stem(token) for token in text.split()]
    return ' '.join(stems)
# Part-of-speech tag names recognised by lemmatization(), mapped to the
# single-letter code it passes to the lemmatizer as the word's POS.
# NOTE(review): tags not listed here are left unlemmatized by lemmatization();
# confirm that the tagger feeding it emits these tag names.
POS_LIST = {
    'ADJ': 'a',
    'ADJ_SAT': 's',
    'ADV': 'r',
    'NOUN': 'n',
    'VERB': 'v',
}
def lemmatization(text, lang='en', input_type='raw_value'):
    """Lemmatize words based on WordNet corpus.

    Parameters
    ----------
    text: a plain string when ``input_type`` is 'raw_value', or an iterable
          of (word, POS tag) pairs when it is 'tuple_list'.

    lang: accepted for signature symmetry with the other techniques; it is
          not used by this function.

    input_type: string in ['raw_value', 'tuple_list']
                * raw value - every whitespace-separated word is lemmatized
                  without POS information; lemmas are joined by one space.
                * tuple list - words whose tag is a key of ``POS_LIST`` are
                  lemmatized with the mapped POS code, the others are kept
                  as they are; every word is followed by one space, so the
                  result keeps a trailing space.

    Returns
    -------

    the lemmatized string, or ``None`` for any other ``input_type``.
    """
    wordnet = WordNetLemmatizer()

    if input_type == 'raw_value':
        lemmas = [wordnet.lemmatize(token) for token in text.split()]
        return ' '.join(lemmas)

    if input_type == 'tuple_list':
        pieces = []
        for token, tag in text:
            if tag in POS_LIST:
                pieces.append(wordnet.lemmatize(token, POS_LIST[tag]))
            else:
                pieces.append(token)
        return ''.join(piece + ' ' for piece in pieces)

    return None
# TODO: look at how the "spaCy not installed" warning is raised to learn how
# to handle a missing optional library.

if __name__ == '__main__':
    # Small interactive demo: tag one line read from standard input.
    s1 = input("Input text A:")
    tagged = pos(s1)
    print("The inputed text can be lexicalized '%s'" % tagged)