#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

import codecs
import itertools
import os
import re
import sys
from functools import lru_cache

import morfessor

from indicnlp import common
from indicnlp import langinfo
from indicnlp.tokenize import indic_tokenize
# Unsupervised Morphological Analyser for Indian languages.
#
# @author Anoop Kunchukuttan
#


class MorphAnalyzerI(object):
    """
    Interface for a morphological analyzer
    """

    def morph_analyze(self, word):
        pass

    def morph_analyze_document(self, tokens):
        pass


class UnsupervisedMorphAnalyzer(MorphAnalyzerI):
    """
    Unsupervised morphological analyzer built using Morfessor 2.0
    """

    def __init__(self, lang, add_marker=False):
        self.lang = lang
        self.add_marker = add_marker

        # Load the pre-trained Morfessor model for the language from the
        # Indic NLP resources directory.
        io = morfessor.MorfessorIO()
        self._morfessor_model = io.read_any_model(
            os.path.join(common.INDIC_RESOURCES_PATH, 'morph', 'morfessor',
                         '{}.model'.format(lang)))

        # Regular expression matching words written entirely in the
        # language's script.
        self._script_range_pat = r'^[{}-{}]+$'.format(
            chr(langinfo.SCRIPT_RANGES[lang][0]),
            chr(langinfo.SCRIPT_RANGES[lang][1]))
        self._script_check_re = re.compile(self._script_range_pat)

    def _contains_number(self, text):
        if self.lang in langinfo.SCRIPT_RANGES:
            for c in text:
                offset = ord(c) - langinfo.SCRIPT_RANGES[self.lang][0]
                if langinfo.NUMERIC_OFFSET_START <= offset <= langinfo.NUMERIC_OFFSET_END:
                    return True
        return False

    def _morphanalysis_needed(self, word):
        return self._script_check_re.match(word) and not self._contains_number(word)
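
    # A sketch of how the script check gates segmentation (the language code
    # 'hi' is illustrative; its SCRIPT_RANGES entry covers the Devanagari
    # block, which also contains the Devanagari digits, hence the separate
    # number check):
    #
    #   analyzer = UnsupervisedMorphAnalyzer('hi')
    #   analyzer._morphanalysis_needed('नमस्ते')  # truthy: all-Devanagari word
    #   analyzer._morphanalysis_needed('hello')    # falsy: outside the script range
    #   analyzer._morphanalysis_needed('१२३')      # falsy: contains native digits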
    def morph_analyze(self, word):
        """
        Morphanalyzes a single word and returns a list of component morphemes.

        @param word: string input word
        """
        m_list = []
        if self._morphanalysis_needed(word):
            val = self._morfessor_model.viterbi_segment(word)
            m_list = val[0]
            if self.add_marker:
                # Tag the first morpheme as the root (_R_) and subsequent
                # morphemes as suffixes (_S_).
                m_list = ['{}_S_'.format(m) if i > 0 else '{}_R_'.format(m)
                          for i, m in enumerate(m_list)]
        else:
            # The word is outside the language's script or contains digits:
            # pass it through unsegmented, optionally tagged as _E_.
            if self.add_marker:
                word = '{}_E_'.format(word)
            m_list = [word]
        return m_list
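
    # A minimal usage sketch for morph_analyze (the sample word and its
    # segmentation are illustrative; actual output depends on the trained
    # Morfessor model):
    #
    #   analyzer = UnsupervisedMorphAnalyzer('hi', add_marker=True)
    #   analyzer.morph_analyze('लड़कियों')
    #   # e.g. ['लड़की_R_', 'यों_S_']: first morpheme tagged as root,
    #   # the rest as suffixes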
    def morph_analyze_document(self, tokens):
        """
        Morphanalyzes a document, represented as a list of tokens.
        Each word is morphanalyzed, and the result is the list of morphemes
        constituting the document.

        @param tokens: sequence of word tokens
        @return list of segments in the document after morph analysis
        """
        out_tokens = []
        for token in tokens:
            morphs = self.morph_analyze(token)
            out_tokens.extend(morphs)
        return out_tokens
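
# A minimal end-to-end sketch combining tokenization and morph analysis
# (assumes common.INDIC_RESOURCES_PATH already points at a resources checkout
# containing morph/morfessor/<lang>.model; the language code 'hi' is
# illustrative):
#
#   tokens = indic_tokenize.trivial_tokenize(line)
#   analyzer = UnsupervisedMorphAnalyzer('hi')
#   segments = analyzer.morph_analyze_document(tokens)
#   print(' '.join(segments))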

if __name__ == '__main__':

    # Four positional arguments are required (argv[1]..argv[4]), so argv
    # must have at least 5 entries.
    if len(sys.argv) < 5:
        print("Usage: python unsupervised_morph.py <infile> <outfile> <language> <indic_resources_path> [<add_marker>]")
        sys.exit(1)

    language = sys.argv[3]
    common.INDIC_RESOURCES_PATH = sys.argv[4]

    add_marker = False
    if len(sys.argv) == 6:
        add_marker = (sys.argv[5] == 'True')

    print('Loading morph analyser for ' + language)
    analyzer = UnsupervisedMorphAnalyzer(language, add_marker)
    print('Loaded morph analyser for ' + language)

    with codecs.open(sys.argv[1], 'r', 'utf-8') as ifile:
        with codecs.open(sys.argv[2], 'w', 'utf-8') as ofile:
            for line in ifile:
                line = line.strip()
                tokens = indic_tokenize.trivial_tokenize(line)
                morph_tokens = analyzer.morph_analyze_document(tokens)
                ofile.write(' '.join(morph_tokens))
                ofile.write('\n')
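
# Example invocation (file paths and the language code are illustrative):
#
#   python unsupervised_morph.py input.txt output.txt hi /path/to/indic_nlp_resources True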