ufal.morphodita
===============
``ufal.morphodita`` is a Python binding to the MorphoDiTa library
<http://ufal.mff.cuni.cz/morphodita>.

The binding is a straightforward conversion of the ``C++`` bindings API.
In Python 2, strings can be passed either as ``unicode`` or as UTF-8 encoded
``str``, and the library always produces ``unicode``. In Python 3, strings
must be ``str``.
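For instance, the following minimal sketch needs no model file; it assumes
only that the static factory methods of the wrapped classes are exposed in
Python exactly as in the C++ declarations below. It tokenizes a plain Python
string and receives the forms back as ``str`` (``unicode`` in Python 2)::

  import sys
  from ufal.morphodita import Forms, TokenRanges, Tokenizer

  # Tokenizers created by the static factories need no model file.
  tokenizer = Tokenizer.newEnglishTokenizer()
  forms, tokens = Forms(), TokenRanges()

  tokenizer.setText('The bindings accept plain Python strings.')
  while tokenizer.nextSentence(forms, tokens):
      for form in forms:
          sys.stdout.write('%s\n' % form)  # each form is str (unicode in Python 2)
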
Wrapped C++ API
---------------
The C++ API being wrapped follows. For an API reference of the original
C++ API, see <http://ufal.mff.cuni.cz/morphodita/api-reference>.
::
  Helper Structures
  -----------------

    typedef vector<string> Forms;

    struct TaggedForm {
      string form;
      string tag;
    };
    typedef vector<TaggedForm> TaggedForms;

    struct TaggedLemma {
      string lemma;
      string tag;
    };
    typedef vector<TaggedLemma> TaggedLemmas;

    struct TaggedLemmaForms {
      string lemma;
      TaggedForms forms;
    };
    typedef vector<TaggedLemmaForms> TaggedLemmasForms;

    struct TokenRange {
      size_t start;
      size_t length;
    };
    typedef vector<TokenRange> TokenRanges;
  Main Classes
  ------------

    class Version {
     public:
      unsigned major;
      unsigned minor;
      unsigned patch;

      static Version current();
    };

    class Tokenizer {
     public:
      virtual void setText(const char* text);
      virtual bool nextSentence(Forms* forms, TokenRanges* tokens);

      static Tokenizer* newVerticalTokenizer();
      static Tokenizer* newCzechTokenizer();
      static Tokenizer* newEnglishTokenizer();
      static Tokenizer* newGenericTokenizer();
    };

    class Morpho {
     public:
      static Morpho* load(const char* fname);

      enum { NO_GUESSER = 0, GUESSER = 1 };

      virtual int analyze(const char* form, int guesser, TaggedLemmas& lemmas) const;
      virtual int generate(const char* lemma, const char* tag_wildcard, int guesser, TaggedLemmasForms& forms) const;
      virtual string rawLemma(const char* lemma) const;
      virtual string lemmaId(const char* lemma) const;
      virtual string rawForm(const char* form) const;

      virtual Tokenizer* newTokenizer() const;
    };

    class Tagger {
     public:
      static Tagger* load(const char* fname);

      virtual const Morpho* getMorpho() const;
      virtual void tag(Forms& forms, TaggedLemmas& tags) const;

      Tokenizer* newTokenizer() const;
    };

    class TagsetConverter {
     public:
      static TagsetConverter* newIdentityConverter();
      static TagsetConverter* newPdtToConll2009Converter();

      virtual void convert(TaggedLemma& lemma) const;
      virtual void convertAnalyzed(TaggedLemmas& lemmas) const;
      virtual void convertGenerated(TaggedLemmasForms& forms) const;
    };
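In the Python binding, these classes keep the same names, methods, and
static factories. As a minimal sketch (the ``czech.dict`` model path is
hypothetical and the example assumes a PDT-style Czech morphological
dictionary), the dictionary and the tag set converter can be combined like
this::

  import sys
  from ufal.morphodita import Morpho, TaggedLemmas, TagsetConverter, Version

  version = Version.current()
  sys.stdout.write('MorphoDiTa %d.%d.%d\n' % (version.major, version.minor, version.patch))

  morpho = Morpho.load('czech.dict')  # hypothetical model path
  if not morpho:
      sys.exit("Cannot load the dictionary 'czech.dict'")

  lemmas = TaggedLemmas()
  morpho.analyze('kočkami', morpho.NO_GUESSER, lemmas)

  # Convert the PDT tags of the analyses to CoNLL 2009 tags in place.
  converter = TagsetConverter.newPdtToConll2009Converter()
  converter.convertAnalyzed(lemmas)
  for lemma in lemmas:
      sys.stdout.write('%s %s\n' % (lemma.lemma, lemma.tag))
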
Examples
========
run_morpho_cli
--------------
Simple example performing morphological analysis and generation::
  import sys

  from ufal.morphodita import *

  # In Python 2, wrap sys.stdin and sys.stdout to work with unicode.
  if sys.version_info[0] < 3:
      import codecs
      import locale
      encoding = locale.getpreferredencoding()
      sys.stdin = codecs.getreader(encoding)(sys.stdin)
      sys.stdout = codecs.getwriter(encoding)(sys.stdout)

  if len(sys.argv) < 2:
      sys.stderr.write('Usage: %s dict_file\n' % sys.argv[0])
      sys.exit(1)

  sys.stderr.write('Loading dictionary: ')
  morpho = Morpho.load(sys.argv[1])
  if not morpho:
      sys.stderr.write("Cannot load dictionary from file '%s'\n" % sys.argv[1])
      sys.exit(1)
  sys.stderr.write('done\n')

  lemmas = TaggedLemmas()
  lemmas_forms = TaggedLemmasForms()

  # Process standard input, one request per line.
  line = sys.stdin.readline()
  while line:
      tokens = line.rstrip('\r\n').split('\t')
      if len(tokens) == 1:  # analyze the given form
          result = morpho.analyze(tokens[0], morpho.GUESSER, lemmas)
          guesser = "Guesser " if result == morpho.GUESSER else ""
          for lemma in lemmas:
              sys.stdout.write('%sLemma: %s %s\n' % (guesser, lemma.lemma, lemma.tag))
      elif len(tokens) == 2:  # generate forms of the lemma matching the tag wildcard
          result = morpho.generate(tokens[0], tokens[1], morpho.GUESSER, lemmas_forms)
          guesser = "Guesser " if result == morpho.GUESSER else ""
          for lemma_forms in lemmas_forms:
              sys.stdout.write('%sLemma: %s\n' % (guesser, lemma_forms.lemma))
              for form in lemma_forms.forms:
                  sys.stdout.write('  %s %s\n' % (form.form, form.tag))
      line = sys.stdin.readline()
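The script reads its requests from standard input: a line containing a
single form runs morphological analysis of that form, while a line
containing a lemma and a tag wildcard separated by a tab generates all
matching forms of that lemma. The particular lemmas, tags, and tag wildcards
depend entirely on the loaded morphological dictionary.
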
run_tagger
----------
Simple example performing tokenization and PoS tagging::
  import sys

  from ufal.morphodita import *

  def encode_entities(text):
      return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')

  # In Python 2, wrap sys.stdin and sys.stdout to work with unicode.
  if sys.version_info[0] < 3:
      import codecs
      import locale
      encoding = locale.getpreferredencoding()
      sys.stdin = codecs.getreader(encoding)(sys.stdin)
      sys.stdout = codecs.getwriter(encoding)(sys.stdout)

  if len(sys.argv) == 1:
      sys.stderr.write('Usage: %s tagger_file\n' % sys.argv[0])
      sys.exit(1)

  sys.stderr.write('Loading tagger: ')
  tagger = Tagger.load(sys.argv[1])
  if not tagger:
      sys.stderr.write("Cannot load tagger from file '%s'\n" % sys.argv[1])
      sys.exit(1)
  sys.stderr.write('done\n')

  forms = Forms()
  lemmas = TaggedLemmas()
  tokens = TokenRanges()
  tokenizer = tagger.newTokenizer()
  if tokenizer is None:
      sys.stderr.write("No tokenizer is defined for the supplied model!\n")
      sys.exit(1)

  not_eof = True
  while not_eof:
      text = ''

      # Read a block of input, until an empty line or end of file.
      while True:
          line = sys.stdin.readline()
          not_eof = bool(line)
          if not not_eof: break
          line = line.rstrip('\r\n')
          text += line
          text += '\n'
          if not line: break

      # Tokenize and tag the block, wrapping tokens and sentences in markup.
      tokenizer.setText(text)
      t = 0
      while tokenizer.nextSentence(forms, tokens):
          tagger.tag(forms, lemmas)

          for i in range(len(lemmas)):
              lemma = lemmas[i]
              token = tokens[i]
              sys.stdout.write('%s%s<token lemma="%s" tag="%s">%s</token>%s' % (
                  encode_entities(text[t : token.start]),
                  "<sentence>" if i == 0 else "",
                  encode_entities(lemma.lemma),
                  encode_entities(lemma.tag),
                  encode_entities(text[token.start : token.start + token.length]),
                  "</sentence>" if i + 1 == len(lemmas) else "",
              ))
              t = token.start + token.length
      sys.stdout.write(encode_entities(text[t : ]))
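The tagger's own tokenizer is not the only option. For input that is already
tokenized, the vertical tokenizer from the wrapped API can be used instead;
the following minimal sketch assumes a hypothetical ``model.tagger`` file and
the vertical tokenizer's convention of one token per line with sentences
separated by empty lines::

  import sys
  from ufal.morphodita import Forms, TaggedLemmas, TokenRanges, Tagger, Tokenizer

  tagger = Tagger.load('model.tagger')  # hypothetical model file
  if not tagger:
      sys.exit('Cannot load the tagger')

  tokenizer = Tokenizer.newVerticalTokenizer()  # one token per line
  forms, lemmas, tokens = Forms(), TaggedLemmas(), TokenRanges()

  tokenizer.setText('I\nsaw\na\ncat\n.\n')
  while tokenizer.nextSentence(forms, tokens):
      tagger.tag(forms, lemmas)
      for form, lemma in zip(forms, lemmas):
          sys.stdout.write('%s\t%s\t%s\n' % (form, lemma.lemma, lemma.tag))
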
AUTHORS
=======
Milan Straka <straka@ufal.mff.cuni.cz>
Jana Straková <strakova@ufal.mff.cuni.cz>
COPYRIGHT AND LICENCE
=====================
Copyright 2014 by Institute of Formal and Applied Linguistics, Faculty of
Mathematics and Physics, Charles University in Prague, Czech Republic.
MorphoDiTa is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.
MorphoDiTa is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with MorphoDiTa. If not, see <http://www.gnu.org/licenses/>.