#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

import codecs
import itertools
import os
import re
import sys
from functools import lru_cache

import morfessor

from indicnlp import common
from indicnlp import langinfo
from indicnlp.tokenize import indic_tokenize
# Unsupervised Morphological Analyser for Indian languages.
#
# @author Anoop Kunchukuttan
#


class MorphAnalyzerI(object):
    """
    Interface for a morphological analyzer
    """

    def morph_analyze(self, word):
        pass

    def morph_analyze_document(self, tokens):
        pass


class UnsupervisedMorphAnalyzer(MorphAnalyzerI):
    """
    Unsupervised morphological analyzer built using Morfessor 2.0
    """

    def __init__(self, lang, add_marker=False):
        self.lang = lang
        self.add_marker = add_marker

        # Load the pre-trained Morfessor model for the language from the
        # Indic NLP resources directory.
        io = morfessor.MorfessorIO()
        self._morfessor_model = io.read_any_model(
            os.path.join(common.INDIC_RESOURCES_PATH, 'morph', 'morfessor',
                         '{}.model'.format(lang)))

        # Regular expression matching words written entirely in the
        # language's script.
        self._script_range_pat = r'^[{}-{}]+$'.format(
            chr(langinfo.SCRIPT_RANGES[lang][0]),
            chr(langinfo.SCRIPT_RANGES[lang][1]))
        self._script_check_re = re.compile(self._script_range_pat)

    def _contains_number(self, text):
        if self.lang in langinfo.SCRIPT_RANGES:
            for c in text:
                offset = ord(c) - langinfo.SCRIPT_RANGES[self.lang][0]
                if langinfo.NUMERIC_OFFSET_START <= offset <= langinfo.NUMERIC_OFFSET_END:
                    return True
        return False

    def _morphanalysis_needed(self, word):
        return self._script_check_re.match(word) and not self._contains_number(word)
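
    # A sketch of how the script check gates segmentation (the language code
    # 'hi' is illustrative; its SCRIPT_RANGES entry covers the Devanagari
    # block, which also contains the Devanagari digits, hence the separate
    # number check):
    #
    #   analyzer = UnsupervisedMorphAnalyzer('hi')
    #   analyzer._morphanalysis_needed('नमस्ते')  # truthy: all-Devanagari word
    #   analyzer._morphanalysis_needed('hello')    # falsy: outside the script range
    #   analyzer._morphanalysis_needed('१२३')      # falsy: contains native digits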
    def morph_analyze(self, word):
        """
        Morphanalyzes a single word and returns a list of component morphemes.

        @param word: string input word
        """
        m_list = []
        if self._morphanalysis_needed(word):
            val = self._morfessor_model.viterbi_segment(word)
            m_list = val[0]
            if self.add_marker:
                # Tag the first morpheme as the root (_R_) and subsequent
                # morphemes as suffixes (_S_).
                m_list = ['{}_S_'.format(m) if i > 0 else '{}_R_'.format(m)
                          for i, m in enumerate(m_list)]
        else:
            # The word is outside the language's script or contains digits:
            # pass it through unsegmented, optionally tagged as _E_.
            if self.add_marker:
                word = '{}_E_'.format(word)
            m_list = [word]
        return m_list
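
    # A minimal usage sketch for morph_analyze (the sample word and its
    # segmentation are illustrative; actual output depends on the trained
    # Morfessor model):
    #
    #   analyzer = UnsupervisedMorphAnalyzer('hi', add_marker=True)
    #   analyzer.morph_analyze('लड़कियों')
    #   # e.g. ['लड़की_R_', 'यों_S_']: first morpheme tagged as root,
    #   # the rest as suffixes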
    def morph_analyze_document(self, tokens):
        """
        Morphanalyzes a document, represented as a list of tokens.
        Each word is morphanalyzed, and the result is the list of morphemes
        constituting the document.

        @param tokens: sequence of word tokens
        @return list of segments in the document after morph analysis
        """
        out_tokens = []
        for token in tokens:
            morphs = self.morph_analyze(token)
            out_tokens.extend(morphs)
        return out_tokens
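
# A minimal end-to-end sketch combining tokenization and morph analysis
# (assumes common.INDIC_RESOURCES_PATH already points at a resources checkout
# containing morph/morfessor/<lang>.model; the language code 'hi' is
# illustrative):
#
#   tokens = indic_tokenize.trivial_tokenize(line)
#   analyzer = UnsupervisedMorphAnalyzer('hi')
#   segments = analyzer.morph_analyze_document(tokens)
#   print(' '.join(segments))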

if __name__ == '__main__':

    # Four positional arguments are required (argv[1]..argv[4]), so argv
    # must have at least 5 entries.
    if len(sys.argv) < 5:
        print("Usage: python unsupervised_morph.py <infile> <outfile> <language> <indic_resources_path> [<add_marker>]")
        sys.exit(1)

    language = sys.argv[3]
    common.INDIC_RESOURCES_PATH = sys.argv[4]

    add_marker = False
    if len(sys.argv) == 6:
        add_marker = (sys.argv[5] == 'True')

    print('Loading morph analyser for ' + language)
    analyzer = UnsupervisedMorphAnalyzer(language, add_marker)
    print('Loaded morph analyser for ' + language)

    with codecs.open(sys.argv[1], 'r', 'utf-8') as ifile:
        with codecs.open(sys.argv[2], 'w', 'utf-8') as ofile:
            for line in ifile:
                line = line.strip()
                tokens = indic_tokenize.trivial_tokenize(line)
                morph_tokens = analyzer.morph_analyze_document(tokens)
                ofile.write(' '.join(morph_tokens))
                ofile.write('\n')
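
# Example invocation (file paths and the language code are illustrative):
#
#   python unsupervised_morph.py input.txt output.txt hi /path/to/indic_nlp_resources True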