Source code for preprocess.grams.collocations

#!/usr/bin/env python 3.6

"""
The collocations script includes some functions to preprocess 
collocations.
"""

from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder,\
								QuadgramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures,\
							QuadgramAssocMeasures

from preprocess.shallow import remove_stopwords

class Collocations:
	"""Find and rank the collocations of a text.

	Collocations are a kind of n-gram: a pair or group of words that are
	habitually juxtaposed, e.g. 'strong coffee', 'black night'.

	The class builds an NLTK collocation finder over the tokens of the
	input and exposes methods that return the n-grams ranked by the
	likelihood-ratio association measure.

	Parameters
	----------

	text: str or list
		  Either the text itself (str) or a list of file names whose
		  contents are read and concatenated. Any other type leaves the
		  token list empty.
	ngrams: int
		    Number of grams your collocation must have [2,3,4]. Any other
		    value raises ``KeyError``.
	stopwords: bool
			   If True (default), stop words are removed before the
			   collocations are searched.
	lang: str
		  Language of the texts ['en', 'es'], forwarded to
		  :func:`remove_stopwords`.

	Attributes
	----------

	text: str or list
		  The input exactly as received.
	words: list
		   Tokens (whitespace split) the finder was built from.
	score_fn: callable
			  The ``likelihood_ratio`` measure matching ``ngrams``.
	list: nltk collocation finder
		  Finder built with ``from_words(self.words)``. Despite its name
		  it is not a Python list; use :meth:`head` / :meth:`tail`.

	Examples
	--------

	>>> from preprocess.grams import Collocations
	>>> from preprocess.demo import preProcessFlow
	>>> from preprocess.data import load_culturalibre
	>>> book = load_culturalibre()
	>>> txt = preProcessFlow(book)
	>>> collocations = Collocations(txt)

	Show the first 10 collocations:

	>>> collocations.head(10)
	[('Cultura', 'libre'),
	('disponible', 'enlace'),
	('dominio', 'público'),
	('Tribunal', 'Supremo'),
	('propiedad', 'intelectual'),
	('propiedad', 'creativa'),
	('dueño', 'copyright'),
	('dueños', 'copyright'),
	('sentido', 'común'),
	('Creative', 'Commons')]

	The resulting collocation list is more understandable after
	executing the whole preprocessing pipeline.

	This class internally uses the function :func:`remove_stopwords`.
	"""

	def __init__(self, text, ngrams=2, stopwords=True, lang='en'):
		# Advise about text length. Only meaningful for raw text: on a list
		# of file names, count(' ') would count items equal to ' ' instead
		# of spaces and the warning would fire spuriously.
		if isinstance(text, str) and text.count(' ') < 100:
			print('This text is to short for proper collocations!')
		self.text = text
		self.words = []
		self.ngrams = ngrams
		self.lang = lang

		self.grams = {
			2: BigramCollocationFinder,
			3: TrigramCollocationFinder,
			4: QuadgramCollocationFinder,
		}

		self.measures = {
			2: BigramAssocMeasures,
			3: TrigramAssocMeasures,
			4: QuadgramAssocMeasures,
		}

		# Resolve both lookups before touching any file so an unsupported
		# `ngrams` value fails fast (KeyError, as before).
		self.score_fn = self.measures[ngrams].likelihood_ratio
		finder_cls = self.grams[ngrams]

		# Message kept byte-for-byte (including the embedded whitespace).
		notice = ("Removing stop words active, to change behavior run:\n "
			"\t\t\t\t\tCollocations(txt,stopwords=False)")

		if isinstance(text, str):
			if stopwords:
				print(notice)
			self.words = self._tokenize(text, stopwords)
		elif isinstance(text, list):
			# Print the notice once, not once per file.
			if stopwords and text:
				print(notice)
			for file_name in text:
				with open(file_name) as doc:
					self.words.extend(self._tokenize(doc.read(), stopwords))

		self.list = finder_cls.from_words(self.words)

	def _tokenize(self, raw, stopwords):
		"""Split `raw` on whitespace, removing stop words first if asked."""
		if stopwords:
			raw = remove_stopwords(raw, lang=self.lang)
		return raw.split()

	def _ranked(self):
		"""Return all n-gram tuples in the order given by
		``self.list.score_ngrams(self.score_fn)``, scores dropped.
		"""
		return [ngram for ngram, _score in self.list.score_ngrams(self.score_fn)]

	def write(self, path: str):
		"""Write the collocation tuples to a text file, one per line.

		The tuples are written in the same order :meth:`head` returns
		them. The file at `path` is created or overwritten and encoded
		as UTF-8.
		"""
		# Mode 'w' is required: the default read mode cannot be written to.
		with open(path, 'w', encoding='utf-8') as doc:
			doc.writelines(str(ngram) + '\n' for ngram in self._ranked())

	def head(self, N: int):
		"""Show the first N elements of the collocation list based
		on the score function "likelihood_ratio".
		"""
		return self._ranked()[:N]

	def tail(self, N: int):
		"""Show the last N elements of the collocation list based
		on the score function "likelihood_ratio".

		Returns an empty list when N is zero or negative.
		"""
		# Guard needed because a [-0:] slice would return the whole list.
		if N <= 0:
			return []
		return self._ranked()[-N:]

	# TODO: build the pipeline with `from toolz.curried import compose`, e.g.
	# find_collocations = compose(collocations(), remove_stopwords())
	# This would remove the need to handle too many if/else branches and to
	# manipulate the parameters of both functions.