A portable document embedding implementation using SWEM.
Project description
SWEM
Implementation of SWEM (Simple Word-Embedding-based Models), proposed in:
Baseline Needs More Love: On Simple Word-Embedding-Based Models and Associated Pooling Mechanisms (ACL 2018)
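SWEM builds a document vector by pooling pretrained word embeddings, with no trained encoder on top. As a rough illustration of the pooling operations described in the paper (plain NumPy, not this package's API), consider:

import numpy as np

def swem_aver(word_vectors: np.ndarray) -> np.ndarray:
    # Average pooling: element-wise mean over the token axis.
    return word_vectors.mean(axis=0)

def swem_max(word_vectors: np.ndarray) -> np.ndarray:
    # Max pooling: element-wise maximum over the token axis.
    return word_vectors.max(axis=0)

def swem_concat(word_vectors: np.ndarray) -> np.ndarray:
    # Concatenation of the average- and max-pooled vectors (2 * dim).
    return np.concatenate([swem_aver(word_vectors), swem_max(word_vectors)])

vectors = np.random.rand(4, 200)   # e.g. 4 tokens, 200-dimensional embeddings
print(swem_concat(vectors).shape)  # (400,)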
Installation
pip install swem
Example
Examples are available in the examples directory.
Functional API
from typing import List
import numpy as np
import swem
from gensim.models import KeyedVectors
if __name__ == '__main__':
    kv: KeyedVectors = KeyedVectors(vector_size=200)
    tokens: List[str] = ['I', 'have', 'a', 'pen']

    embed: np.ndarray = swem.infer_vector(
        tokens=tokens, kv=kv, method='concat'
    )
    print(embed.shape)
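The snippet above constructs an empty KeyedVectors only to demonstrate the call and the output shape; for meaningful embeddings you would load pretrained vectors first, for example with gensim (a sketch; the file name below is a placeholder):

from gensim.models import KeyedVectors

# Placeholder path: any word2vec-format vector file will do.
kv = KeyedVectors.load_word2vec_format('pretrained_vectors.bin', binary=True)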
Japanese
from typing import List
import MeCab
import swem
from gensim.models import KeyedVectors
def tokenize_ja(text: str, args: str = '-O wakati') -> List[str]:
    tagger = MeCab.Tagger(args)
    return tagger.parse(text).strip().split(' ')


if __name__ == '__main__':
    kv = KeyedVectors.load('wiki_mecab-ipadic-neologd.kv')
    swem_embed = swem.SWEM(kv, tokenize_ja)

    doc = 'すもももももももものうち'
    embed = swem_embed.infer_vector(doc, method='max')
    print(embed.shape)
Results
(200,)
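The method argument selects the pooling strategy; 'max' and 'concat' are the ones used in this README. Per the paper, concatenation joins the average- and max-pooled vectors, so it yields a longer vector. Continuing the Japanese example above, the two can be compared directly (a small sketch reusing swem_embed and doc from that example):

embed_max = swem_embed.infer_vector(doc, method='max')
embed_concat = swem_embed.infer_vector(doc, method='concat')
print(embed_max.shape, embed_concat.shape)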
English
from typing import List
import swem
from gensim.models import KeyedVectors
def tokenize_en(text: str) -> List[str]:
    text_processed = text.replace('.', ' .').replace(',', ' ,')
    return text_processed.replace('?', ' ?').replace('!', ' !').split()


if __name__ == '__main__':
    kv = KeyedVectors.load('wiki_mecab-ipadic-neologd.kv')
    swem_embed = swem.SWEM(kv, tokenizer=tokenize_en)

    doc = 'This is an implementation of SWEM.'
    embed = swem_embed.infer_vector(doc, method='max')
    print(embed.shape)
Results
(200,)
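Inferred vectors are ordinary NumPy arrays, so documents can be compared with standard vector operations. A small sketch using cosine similarity (it assumes the swem_embed object from the English example above is still in scope):

import numpy as np

docs = ['This is an implementation of SWEM.',
        'SWEM is a simple word-embedding-based model.']
vecs = [swem_embed.infer_vector(d, method='max') for d in docs]

a, b = vecs
# Cosine similarity between the two document vectors.
similarity = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print(similarity)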