Dictionary incorporation for machine translation
Project description
Dependencies
unidecode
emoji
*kenlm
fuzzy
scikit-learn
pyxdameraulevenshtein
pygtrie
numpy
*Install the kenlm wrapper from GitHub:
pip install https://github.com/kpu/kenlm/archive/master.zip
Usage
# NOTE(review): this snippet shows no imports. `partial`, `pickle`, `kenlm`,
# `fuzzy`, `debug` and the project helpers used below must already be in
# scope -- `partial` is presumably functools.partial; confirm against the
# package before running.

# Load the English vocabulary; a second source is merged in with update().
english_vocab = load_english_vocab(...)
english_vocab.update(load_english_vocab(...))

# Load the bilingual lexicon dictionary.
foreign_dict = load_lexicon_norm(...)

# Load the target language model.
lm = kenlm.Model(...)

# Train an n-gram model if needed.
# ngram_train(foreign_dict, 'hin-tfidf-ngram_algo')

# Ulf's romanizer
romanizer = partial(romanize,
                    romanization_path=...,
                    language_code="hin")

soundex_inst = fuzzy.DMetaphone()


def soundex_algo(x):
    """Return the first code produced for `x`, or `x` itself if it is None."""
    # Encode once and reuse the result instead of calling the encoder twice.
    primary = soundex_inst(x)[0]
    return primary.decode('utf-8') if primary is not None else x


# Pre-compute the phonetic code of every non-empty vocabulary entry.
english_encoded_vocab = {e: soundex_algo(e) for e in english_vocab if e}

# Load the n-gram model.
# NOTE: unpickling can execute arbitrary code -- only load model files that
# come from a trusted source.
with open(..., "rb") as model_file:
    ngram_algo = pickle.load(model_file)

# Build the similarity chain: each model receives the next one as `backup`
# (exact -> ngram -> lev -> soundex).
soundex_model = partial(soundex_similarity,
                        encoded_english_vocab=english_encoded_vocab,
                        romanizer=romanizer,
                        soundex=soundex_algo)
lev_model = partial(lev_similarity, backup=soundex_model)
ngram_model = partial(ngram_similarity, model=ngram_algo, backup=lev_model)
final_model = partial(exact_similarity, backup=ngram_model)

# NOTE(review): the original indentation of this loop was lost; the nesting
# below (summary lines inside the `if`, after the per-OOV loop) is a
# reconstruction -- confirm against the upstream README.
with open(...) as pairs:
    for line in pairs:
        # Each line is split on a tab into exactly two fields.
        source, target = line.strip('\n').split('\t')
        oovs = extract_oov(target, source, english_vocab=english_vocab, romanization=True)
        best, mods = translate_oov(target, oovs, foreign_dict, final_model, lm.score)
        if best != target:
            for oov in oovs:
                # Take the first key stored for this OOV and its value.
                alt = next(iter(mods[oov]))
                trans = mods[oov][alt]
                debug.debug(f"{romanizer(oov)} -> {romanizer(alt)} : {list(trans)}")
            debug.debug(best)
            debug.debug("*"*100)
Or, from the command line:
python -m elisa_patch --help
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
elisa-patch-0.3.7.tar.gz
(6.6 kB
view details)
File details
Details for the file elisa-patch-0.3.7.tar.gz
.
File metadata
- Download URL: elisa-patch-0.3.7.tar.gz
- Upload date:
- Size: 6.6 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/1.13.0 pkginfo/1.5.0.1 requests/2.22.0 setuptools/41.0.1 requests-toolbelt/0.9.1 tqdm/4.32.2 CPython/3.6.3
File hashes
Algorithm | Hash digest |
---|---|
SHA256 | 5e27cc7da606f4f2c17b70bca4647f2a2336e5bdd44ca24b13cc101c387c655d |
MD5 | 162b47e68bad3c88cc0968353d536330 |
BLAKE2b-256 | 8ae129d93e1d6c9f65d90c3f8957e9168c923ea886297c1336474346df748178 |