Bindings to MorphoDiTa library
Project description
ufal.morphodita
The ufal.morphodita package is a Python binding to the MorphoDiTa library <http://ufal.mff.cuni.cz/morphodita>.
The bindings are a straightforward conversion of the C++ bindings API. In Python 2, strings can be both unicode and UTF-8 encoded str, and the library always produces unicode. In Python 3, strings must be only str.
Wrapped C++ API
The C++ API being wrapped follows. For an API reference of the original C++ API, see <http://ufal.mff.cuni.cz/morphodita/api-reference>.
Helper Structures
-----------------

  typedef vector<string> Forms;

  struct TaggedForm {
    string form;
    string tag;
  };
  typedef vector<TaggedForm> TaggedForms;

  struct TaggedLemma {
    string lemma;
    string tag;
  };
  typedef vector<TaggedLemma> TaggedLemmas;

  struct TaggedLemmaForms {
    string lemma;
    TaggedForms forms;
  };
  typedef vector<TaggedLemmaForms> TaggedLemmasForms;

  struct TokenRange {
    size_t start;
    size_t length;
  };
  typedef vector<TokenRange> TokenRanges;

Main Classes
------------

  class Version {
   public:
    unsigned major;
    unsigned minor;
    unsigned patch;

    static Version current();
  };

  class Tokenizer {
   public:
    virtual void setText(const char* text);
    virtual bool nextSentence(Forms* forms, TokenRanges* tokens);

    static Tokenizer* newVerticalTokenizer();
    static Tokenizer* newCzechTokenizer();
    static Tokenizer* newEnglishTokenizer();
    static Tokenizer* newGenericTokenizer();
  };

  class Morpho {
   public:
    static Morpho* load(const char* fname);

    enum { NO_GUESSER = 0, GUESSER = 1 };

    virtual int analyze(const char* form, int guesser, TaggedLemmas& lemmas) const;
    virtual int generate(const char* lemma, const char* tag_wildcard, int guesser, TaggedLemmasForms& forms) const;
    virtual string rawLemma(const char* lemma) const;
    virtual string lemmaId(const char* lemma) const;
    virtual string rawForm(const char* form) const;

    virtual Tokenizer* newTokenizer() const;
  };

  class Tagger {
   public:
    static Tagger* load(const char* fname);

    virtual const Morpho* getMorpho() const;
    virtual void tag(Forms& forms, TaggedLemmas& tags) const;

    Tokenizer* newTokenizer() const;
  };

  class TagsetConverter {
   public:
    static TagsetConverter* newIdentityConverter();
    static TagsetConverter* newPdtToConll2009Converter();
    static TagsetConverter* newStripLemmaCommentConverter(const Morpho& morpho);
    static TagsetConverter* newStripLemmaIdConverter(const Morpho& morpho);

    virtual void convert(TaggedLemma& lemma) const;
    virtual void convertAnalyzed(TaggedLemmas& lemmas) const;
    virtual void convertGenerated(TaggedLemmasForms& forms) const;
  };
Examples
run_morpho_cli
Simple example performing morphological analysis and generation:
import sys


def main():
    """Run morphological analysis or generation on lines read from stdin.

    Usage: ``run_morpho_cli dict_file``

    Each input line is split on tab characters:

    * one field  -- the field is a word form; it is analyzed and every
      resulting lemma/tag pair is printed;
    * two fields -- the fields are a lemma and a tag wildcard; all matching
      forms are generated and printed, grouped by lemma.

    Lines with any other number of fields produce no output.
    Exits with status 1 when the dictionary argument is missing or the
    dictionary cannot be loaded.
    """
    # Import only the names this script uses; keeping the import here lets the
    # module be imported without the native library being installed.
    from ufal.morphodita import Morpho, TaggedLemmas, TaggedLemmasForms

    # In Python2, wrap sys.stdin and sys.stdout to work with unicode.
    if sys.version_info[0] < 3:
        import codecs
        import locale
        encoding = locale.getpreferredencoding()
        sys.stdin = codecs.getreader(encoding)(sys.stdin)
        sys.stdout = codecs.getwriter(encoding)(sys.stdout)

    if len(sys.argv) < 2:
        sys.stderr.write('Usage: %s dict_file\n' % sys.argv[0])
        sys.exit(1)

    sys.stderr.write('Loading dictionary: ')
    morpho = Morpho.load(sys.argv[1])
    if not morpho:
        sys.stderr.write("Cannot load dictionary from file '%s'\n" % sys.argv[1])
        sys.exit(1)
    sys.stderr.write('done\n')

    # Output containers are allocated once and refilled by every call.
    lemmas = TaggedLemmas()
    lemmas_forms = TaggedLemmasForms()

    line = sys.stdin.readline()
    while line:
        tokens = line.rstrip('\r\n').split('\t')
        if len(tokens) == 1:
            # analyze
            result = morpho.analyze(tokens[0], morpho.GUESSER, lemmas)

            # The return value tells whether the guesser produced the result.
            guesser = "Guesser " if result == morpho.GUESSER else ""
            for lemma in lemmas:
                sys.stdout.write('%sLemma: %s %s\n' % (guesser, lemma.lemma, lemma.tag))
        elif len(tokens) == 2:
            # generate
            result = morpho.generate(tokens[0], tokens[1], morpho.GUESSER, lemmas_forms)

            guesser = "Guesser " if result == morpho.GUESSER else ""
            for lemma_forms in lemmas_forms:
                sys.stdout.write('%sLemma: %s\n' % (guesser, lemma_forms.lemma))
                for form in lemma_forms.forms:
                    sys.stdout.write(' %s %s\n' % (form.form, form.tag))

        line = sys.stdin.readline()


if __name__ == '__main__':
    main()
run_tagger
Simple example performing tokenization and PoS tagging:
import sys


def encode_entities(text):
    """Return *text* with XML special characters replaced by entities.

    Escapes ``&``, ``<``, ``>`` and ``"``. The ampersand is replaced first so
    that the ampersands introduced by the other replacements are not escaped
    a second time.
    """
    return (text.replace('&', '&amp;')
                .replace('<', '&lt;')
                .replace('>', '&gt;')
                .replace('"', '&quot;'))


def main():
    """Tokenize and tag text from stdin, writing XML-like markup to stdout.

    Usage: ``run_tagger tagger_file``

    Input is read in blocks terminated by an empty line or end of input.
    Every token is wrapped in ``<token lemma="..." tag="...">`` and every
    sentence in ``<sentence>``; the text between tokens is copied through
    with XML special characters escaped.

    Exits with status 1 when the model argument is missing, the model cannot
    be loaded, or the model defines no tokenizer.
    """
    # Import only the names this script uses; keeping the import here lets the
    # module be imported without the native library being installed.
    from ufal.morphodita import Forms, TaggedLemmas, Tagger, TokenRanges

    # In Python2, wrap sys.stdin and sys.stdout to work with unicode.
    if sys.version_info[0] < 3:
        import codecs
        import locale
        encoding = locale.getpreferredencoding()
        sys.stdin = codecs.getreader(encoding)(sys.stdin)
        sys.stdout = codecs.getwriter(encoding)(sys.stdout)

    if len(sys.argv) == 1:
        sys.stderr.write('Usage: %s tagger_file\n' % sys.argv[0])
        sys.exit(1)

    sys.stderr.write('Loading tagger: ')
    tagger = Tagger.load(sys.argv[1])
    if not tagger:
        sys.stderr.write("Cannot load tagger from file '%s'\n" % sys.argv[1])
        sys.exit(1)
    sys.stderr.write('done\n')

    # Output containers are allocated once and refilled by every call.
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tokenizer = tagger.newTokenizer()
    if tokenizer is None:
        sys.stderr.write("No tokenizer is defined for the supplied model!")
        sys.exit(1)

    not_eof = True
    while not_eof:
        text = ''

        # Read block
        while True:
            line = sys.stdin.readline()
            not_eof = bool(line)
            if not not_eof:
                break
            line = line.rstrip('\r\n')
            text += line
            text += '\n'
            if not line:
                break

        # Tag
        tokenizer.setText(text)
        t = 0  # offset in text of the first character not yet written out
        while tokenizer.nextSentence(forms, tokens):
            tagger.tag(forms, lemmas)

            for i in range(len(lemmas)):
                lemma = lemmas[i]
                token = tokens[i]
                sys.stdout.write('%s%s<token lemma="%s" tag="%s">%s</token>%s' % (
                    # text between the previous token and this one
                    encode_entities(text[t : token.start]),
                    "<sentence>" if i == 0 else "",
                    encode_entities(lemma.lemma),
                    encode_entities(lemma.tag),
                    encode_entities(text[token.start : token.start + token.length]),
                    "</sentence>" if i + 1 == len(lemmas) else "",
                ))
                t = token.start + token.length

        # Copy through whatever follows the last token of the block.
        sys.stdout.write(encode_entities(text[t : ]))


if __name__ == '__main__':
    main()
COPYRIGHT AND LICENCE
Copyright 2014 by Institute of Formal and Applied Linguistics, Faculty of Mathematics and Physics, Charles University in Prague, Czech Republic.
MorphoDiTa is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
MorphoDiTa is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License along with MorphoDiTa. If not, see <http://www.gnu.org/licenses/>.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.