# Source code for preprocess.grams.ngrams

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Ngrams
=======

This module implements different kinds of n-grams.

"""

from collections import defaultdict
from nltk.tree import Tree
from collections import deque
from nltk.util import skipgrams as nltk_skipgrams
from preprocess.utils.decorators import Appender
from preprocess.utils import pipeline

# Optional dependency: the NLTK stopword corpus reader. The flag records
# whether the import succeeded so callers can fall back gracefully.
_nltk_stopwords = False
try:
    from nltk.corpus import stopwords
    _nltk_stopwords = True
except ImportError:
    # NLTK is not installed; only user-supplied stopword files can be used.
    pass

def get_dict_from_list(dicc,lista, nivel,head_nodes):
    """Walk a list of trees one level at a time, accumulating node weights.

    Both ``dicc`` and ``head_nodes`` are mutated in place. The function
    calls itself on the subtrees found at the current level until no
    subtree is left.

    Parameters
    ----------

    dicc: dict
          Counter keyed by node label / leaf string. Entries are updated
          with ``+=`` without being initialised first, so the mapping must
          supply a default (the caller in this module passes a
          ``defaultdict(int)``).

    lista: list
           Trees to process at this level. At level 1 only ``lista[0]``
           is read.

    nivel: int
           Current depth, starting at 1 for the root.

    head_nodes: dict
                Mapping ``level -> {label: parent label}``; level 1 holds
                the single key ``'ROOT'``. Sub-dictionaries are indexed
                without being created first (the caller in this module
                passes a ``defaultdict(dict)``).

    Return
    ------

    dicc, head_nodes: tuple
                      The same two objects that were passed in.

    """
    pendents = []                                     # subtrees queued for the next level

    if nivel == 1:
        root_node = lista[0].label()
        head_nodes[1]['ROOT'] = root_node                  # label of the first tree is stored as the ROOT node
        root_childs = list(lista[0])
        dicc[root_node] +=len(root_childs)            # the root is credited with its number of children (step A)

        for element in root_childs:

            if isinstance(element,str):
                dicc[element]+=1                      # leaf child: counted once (step B)

            if isinstance(element, Tree):
                node = element.label()
                dicc[node] +=1                        # subtree child: its label is counted once (step B), then queued
                pendents.append(element)
                subtree = list(element)
                # Register the root as the parent of this subtree's label.
                # NOTE(review): the loop writes the same entry once per child
                # of the subtree and writes nothing when the subtree is empty.
                for i in range(len(subtree)):
                    subelement = element.label()
                    head_nodes[nivel+1][subelement] = root_node

    if nivel > 1:
        for subtree in lista:
            node = subtree.label()
            subtree_list = list(subtree)

            # Propagate this subtree's size to every ancestor recorded in
            # head_nodes, walking from the current level up to level 2.
            if node in head_nodes[nivel].keys():          # only when this label was registered by the previous level
                child = node
                for level in range(nivel,1,-1):           # climb one level per iteration
                    parent = head_nodes[level][child]

                    dicc[parent] += len(subtree_list)     # ancestor gains the number of children of this subtree
                    child = parent

            # Queue nested subtrees and remember the current node as their parent.
            # NOTE(review): head_nodes is keyed by label, so two nodes with
            # the same label on one level share a single entry.
            for i in range(len(subtree_list)):
                subelement = subtree_list[i]

                if isinstance(subelement, Tree):
                    head_nodes[nivel+1][subelement.label()] = node # parent lookup used by the next level
                    pendents.append(subelement) # processed by the recursive call below

            dicc[node] += len(subtree_list)

            # Every child, leaf or subtree, is counted once (step B)
            for element in subtree_list:
                if isinstance(element, str):
                    dicc[element] += 1
                else:
                    dicc[element.label()]+=1
    nivel += 1

    # The return value of the recursive call is ignored; its effect reaches
    # the caller through the in-place mutation of dicc and head_nodes.
    if len(pendents) > 0:
        get_dict_from_list(dicc,pendents, nivel, head_nodes)

    return dicc, head_nodes

def get_j_from_list(j, bi_grams):
    """Return the first element of the first pair whose second element is ``j``.

    ``bi_grams`` is scanned in order; ``None`` is returned when no pair
    matches.
    """
    matches = (pair[0] for pair in bi_grams if j == pair[1])
    return next(matches, None)

def sngrams(st, text, N=2):
    """Syntactic Ngrams

    It is a novel technique that combines ngrams with dependency trees
    [Sidorov2012]_.

    Parameters
    ----------

    st: parser
        Object exposing ``raw_parse(text)``; each parse it yields must
        provide ``tree()`` and ``triples()`` (e.g. the Stanford dependency
        parser wrapper shipped with NLTK).

    text: str
          text to process

    N: int
       Length of the gram

    Return
    ------

    sn_grams: list
              For ``N == 2`` the list of (head, dependent) word tuples of
              the first parse; otherwise a list of word lists of length N.
              The list is empty when no sn-gram of that length can be
              built, in which case a message is printed.

    References
    ----------

    .. [Sidorov2012] Grigori Sidorov et al. (2012). Syntactic N-grams as Machine
        Learning Features for Natural Language Processing.
        Journal Expert Systems with Applications, 4(3): 853-860. Elsevier.

    """
    # Parse once and reuse the result for both the triples and the trees.
    parses = list(st.raw_parse(text))

    # Syntactic bigrams: first word of each triple paired with its last word.
    sbigram = [(triple[0][0], triple[2][0]) for triple in parses[0].triples()]
    if N == 2:
        return sbigram

    # Weight every node of the syntactic tree and record parent links.
    syntactic_trees = [parse.tree() for parse in parses]
    weights = defaultdict(int)
    head_nodes = defaultdict(dict)
    weights, head_nodes = get_dict_from_list(weights, syntactic_trees, 1, head_nodes)
    root = head_nodes[1]['ROOT']

    sn_grams = []
    if len(head_nodes) + 1 < N:
        print('There is not any possible sn-gram, n have to be lower than', len(head_nodes)+2)
        return sn_grams

    # Sorted by descending weight so that pop() serves the lightest nodes
    # first, i.e. the walk goes from the foot nodes towards ROOT.
    pendent_words = sorted(zip(weights.values(), weights.keys()), reverse=True)
    while len(pendent_words) > 1:
        j = pendent_words.pop()[1]

        # Climb from the word through its heads until N words are collected
        # or ROOT is reached.
        gram = []
        while len(gram) < N:
            gram.append(j)
            x = get_j_from_list(j, sbigram)

            if x == root and len(gram) < N:
                gram.append(x)
                break

            j = x

        # Exception for repeated words in different levels.
        # First: count how many bigrams end in the starting word.
        count = 0
        if len(gram) > 1:
            count = sum(1 for bigram in sbigram if bigram[1] == gram[0])

        # Second: drop the bigram that was just used so the next pass over
        # the same word follows a different head.
        # NOTE(review): the list is popped while being enumerated, so the
        # element following a removed bigram is skipped - confirm intended.
        if count > 1:
            for i, bigram in enumerate(sbigram):
                if bigram[1] == gram[0] and bigram[0] == gram[1]:
                    sbigram.pop(i)

        if len(gram) == N:
            sn_grams.append(gram)

        # The word still has other heads: queue it again.
        if count > 1:
            pendent_words.append((1, gram[0]))

    return sn_grams

#TODO: optimization of this script making experimentation inside the Notebook of my NLP course "Synt..."

# Set of util functions for n-gram generation
def _make_ngrams(l, n):
    """Return an iterator of n-gram tuples over the sequence ``l``.

    The sequence is sliced into ``n`` shifted views which are then zipped
    together, so every tuple holds ``n`` consecutive items.
    """
    last = n - 1
    shifted = []
    for offset in range(last):
        # View starting at ``offset`` and trimmed so all views end together.
        shifted.append(l[offset:offset - last])
    shifted.append(l[last:])
    return zip(*shifted)

def _ngram_split(text,n):
    """Yield the word n-grams of ``text`` as strings.

    Words are obtained with ``text.split()``. Every word in a yielded gram
    is followed by a single space, so each gram ends with a trailing space.
    Nothing is yielded when ``n`` is lower than 1 or greater than the
    number of words.
    """
    if n < 1:
        return
    words = text.split()
    for start in range(len(words) - n + 1):
        yield ' '.join(words[start:start + n]) + ' '

def _ngrams(text,n):
    """Return the grams yielded by ``_ngram_split(text, n)`` as a list."""
    return list(_ngram_split(text, n))

def _chargrams(s,n):
    """Return every contiguous slice of length ``n`` of ``s``, in order."""
    grams = []
    last_start = len(s) - n
    for start in range(last_start + 1):
        grams.append(s[start:start + n])
    return grams

def ngrams(text,n=2,gram_type='tokens',multioutput='raw_value'):
    """Generate the list of n-grams.

    Parameters
    ----------

    text : str
           string to parse, generally a sentence.

    n : int
        Length of each gram.

    gram_type : str
                Select the type of grams.
                ``'char'`` or ``'chars'`` builds character n-grams; any
                other value (``'tokens'`` by default) builds n-grams of
                whitespace-separated words.

    multioutput : str
                  Format type of the output. String in ['raw_value', 'tuple_list']
                    * raw value - list of n-grams in string format. Eg: 'a b c'
                    * tuple list - deque of n-grams in tuple format. Eg: ('a','b','c')

    Return
    ------

    grams : list or deque
            A list for ``'raw_value'``, a ``deque`` for ``'tuple_list'``.
            ``None`` is returned for any other ``multioutput`` value.

    Raises
    ------

    ValueError
        If ``n`` is greater than the number of words (or of characters for
        character grams) in ``text``.

    """
    # Both spellings are accepted: the documentation used 'chars' while the
    # implementation only recognised 'char'.
    by_chars = gram_type in ('char', 'chars')

    # The length guard must count the unit the grams are made of.
    units = text if by_chars else text.split()
    if len(units) < n:
        raise ValueError(
            "Not possible, n must not be greater than the total number of {}.".format(
                'characters' if by_chars else 'words'))

    if multioutput == 'raw_value':
        if by_chars:
            return _chargrams(text, n)
        return _ngrams(text, n)
    if multioutput == 'tuple_list':
        return deque(_make_ngrams(units, n))
    # Unknown output format: kept as a silent None for backward compatibility.
    return None

#TODO: here Appender must be used to add examples to ngrams func

# Thin wrapper around ``nltk.util.skipgrams``. No docstring is written here
# on purpose: the decorator is handed NLTK's own docstring.
@Appender(nltk_skipgrams.__doc__)
def skipgrams(text,n,k, gram_type='tokens'):
    # 'tokens' builds the skip-grams over whitespace-separated words; any
    # other value passes the string itself, i.e. its characters, to NLTK.
    if gram_type == 'tokens':
        return nltk_skipgrams(text.split(),n,k)
    return nltk_skipgrams(text,n,k)

def contextual_ngrams(text,n,multioutput='raw_value'):
    """Generates a special kind of ngrams also called CTnG.

    These ngrams are formed by first sorting the words, then removing
    stopwords and tokens of length one, stemming, and finally sorting the
    resulting ngrams [RdguezTorrejon2010b]_.

    Parameters
    ----------

    text : str
           string to parse, generally a sentence.

    n : int
        Length of each gram.

    multioutput : str
                  Output format, forwarded unchanged to ``ngrams``.

    Return
    ------

    grams : list
            The sorted ngrams.

    References
    -----------

    .. [RdguezTorrejon2010b] Diego A. Rodríguez Torrejon &
        José Manuel Martín Ramos. (2010b).
        Detección de plagio en documentos. Sistema externo monolingüe de
        altas prestaciones basado en n-gramas contextuales.
        Procesamiento del Lenguaje Natural, 45:49–57

    """
    # Sort the words before any filtering takes place.
    text = ' '.join(sorted(text.split()))
    flow = ['remove_stopwords','del_tokens_len_one','stemming']
    text = pipeline(text,flow)
    return sorted(ngrams(text,n,multioutput=multioutput))

def stopword_ngrams(text,n, lang='en', stops_path='',multioutput='raw_value'):
    """Ngrams obtained filtering all non stopwords also called SWNG
    [Stamatatos2011b]_.

    Parameters
    ----------

    text : str
           string to parse, generally a sentence.

    n : int
        Length of each gram.

    lang : str
           Language of the stopword list. It names the file
           ``<stops_path>/<lang>.txt`` and, for the NLTK fallback, is
           translated from common ISO 639-1 codes to the corpus names
           NLTK uses (``'en'`` -> ``'english'``); other values are passed
           to NLTK unchanged.

    stops_path : str
                 Directory holding the stopword files. When empty, or when
                 the file cannot be read, the NLTK corpus is used instead.

    multioutput : str
                  Output format, forwarded unchanged to ``ngrams``.

    Return
    ------

    grams : list, deque or None
            The ngrams built from the stopwords of ``text`` only, or
            ``None`` (after printing a message) when no stopword list is
            available.

    References
    -----------

    ..  [Stamatatos2011b] Stamatatos, Efstathios (2011).
        Plagiarism Detection Using Stopword n-grams.
        Journal of the American Society for Information Science
        and Technology, 62(12):2512–2527.

    """
    import os

    stop_words = set()

    # 1) User supplied stopword file.
    if stops_path:
        try:
            with open(os.path.join(stops_path, lang + '.txt'), encoding='utf-8') as handle:
                stop_words = set(handle.read().split())
        except (OSError, UnicodeDecodeError):
            # Missing or unreadable file: fall back to NLTK below.
            pass

    # 2) NLTK corpus, only when the file did not provide any stopword.
    if not stop_words and _nltk_stopwords:
        nltk_names = {
            'ar': 'arabic', 'da': 'danish', 'de': 'german', 'el': 'greek',
            'en': 'english', 'es': 'spanish', 'fi': 'finnish', 'fr': 'french',
            'hu': 'hungarian', 'it': 'italian', 'nl': 'dutch', 'no': 'norwegian',
            'pt': 'portuguese', 'ro': 'romanian', 'ru': 'russian',
            'sv': 'swedish', 'tr': 'turkish',
        }
        try:
            stop_words = set(stopwords.words(nltk_names.get(lang, lang)))
        except (LookupError, OSError):
            # Corpus not downloaded or language not shipped with NLTK.
            pass

    if not stop_words:
        print('There are not stopword corpus available.')
        return

    return ngrams(' '.join(
        word for word in text.split(' ') if word.lower() in stop_words),n,multioutput=multioutput)

# TODO: add a global variable (in preprocess/__init__.py) so that the
# stopword files can be located from every module