Python bindings for general-sam and some utilities
Project description
general-sam-py
Python bindings for general-sam and some utilities.
flowchart LR
init((ε))
a((a))
b((b))
ab((ab))
bc(((bc)))
abc((abc))
abcb((abcb))
abcbc(((abcbc)))
init -- a --> a
init -- b --> b
a -- b --> ab
b -- c --> bc
init -- c --> bc
ab -- c --> abc
bc -- b --> abcb
abc -- b --> abcb
abcb -- c --> abcbc
The suffix automaton of abcbc.
Installation
pip install general-sam
Usage
GeneralSAM
# Example: test whether byte strings are suffixes of b'abcbc'.
from general_sam import GeneralSAM
# Build a GeneralSAM from the bytes of the text.
sam = GeneralSAM.from_bytes(b'abcbc')
# "cbc" is a suffix of "abcbc"
# Each query below starts again from a fresh root state.
state = sam.get_root_state()
state.feed_bytes(b'cbc')
assert state.is_accepting()
# "bcb" is not a suffix of "abcbc"
state = sam.get_root_state()
state.feed_bytes(b'bcb')
assert not state.is_accepting()
# Example: feed a single state incrementally, character by character.
from general_sam import GeneralSAM
sam = GeneralSAM.from_chars('abcbc')
# The same state object is reused for every feed below, so the fed text
# accumulates: "b" -> "bc" -> "bcbc" -> "bcbcbc".
state = sam.get_root_state()
# "b" is not a suffix but at least a substring of "abcbc"
state.feed_chars('b')
assert not state.is_accepting()
# "bc" is a suffix of "abcbc"
state.feed_chars('c')
assert state.is_accepting()
# "bcbc" is a suffix of "abcbc"
state.feed_chars('bc')
assert state.is_accepting()
# "bcbcbc" is not a substring, much less a suffix of "abcbc"
# In that case the state is reported as nil in addition to not accepting.
state.feed_chars('bc')
assert not state.is_accepting() and state.is_nil()
# Example: build a GeneralSAM over several words at once via a trie.
from general_sam import GeneralSAM, GeneralSAMState, build_trie_from_chars

# build_trie_from_chars returns a pair; only the trie itself is needed here.
trie, _ = build_trie_from_chars(['hello', 'Chielo'])
sam = GeneralSAM.from_trie(trie)


def fetch_state(s: str) -> GeneralSAMState:
    """Return the state reached by feeding ``s`` from the root of ``sam``."""
    state = sam.get_root_state()
    state.feed_chars(s)
    return state


# 'lo', 'ello' and 'elo' are each a suffix of 'hello' or 'Chielo'.
assert fetch_state('lo').is_accepting()
assert fetch_state('ello').is_accepting()
assert fetch_state('elo').is_accepting()

# 'el' occurs inside a word but is not a suffix of either word.
state = fetch_state('el')
assert not state.is_accepting() and not state.is_nil()

# 'bye' does not occur in either word, so the state is nil.
state = fetch_state('bye')
assert not state.is_accepting() and state.is_nil()
VocabPrefixAutomaton
# Example: find vocabulary entries by a prefix that is built up right-to-left.
from general_sam import VocabPrefixAutomaton, CountInfo
vocab = ['歌曲', '聆听歌曲', '播放歌曲', '歌词', '查看歌词']
# NOTE(review): bytes_or_chars='chars' presumably makes the automaton work on
# characters rather than bytes -- confirm against the package documentation.
automaton = VocabPrefixAutomaton(vocab, bytes_or_chars='chars')
# NOTE: CountInfo instances are actually related to the sorted `vocab`:
# The list below is `vocab` in sorted order; it is shown for reference only
# and is bound to `_` because nothing reads it.
_ = ['播放歌曲', '查看歌词', '歌曲', '歌词', '聆听歌曲']
# Case 1:
# 一起 | 聆 | 听 | 歌
# The pieces are prepended one at a time from right to left, so the string
# being matched grows as '歌' -> '听歌' -> '聆听歌' -> '一起聆听歌'.
state = automaton.get_root_state()
# prepend '歌'
cnt_info = automaton.prepend_feed(state, '歌')
# NOTE(review): tot_cnt_lower/tot_cnt_upper look like a half-open index range
# into the sorted vocab (here 2..4 covers '歌曲' and '歌词') -- confirm.
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=2, tot_cnt_lower=2, tot_cnt_upper=4
)
# found '歌曲' at the index 0 and '歌词' at the index 3 prefixed with '歌'
# get_order_slice yields indices into the original (unsorted) `vocab`.
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {0, 3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {'歌曲', '歌词'}
# prepend 听
cnt_info = automaton.prepend_feed(state, '听')
# found nothing prefixed with '听歌'
assert cnt_info is None
# The state is still not nil, and prepending more text below matches again.
assert not state.is_nil()
# prepend 聆
cnt_info = automaton.prepend_feed(state, '聆')
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=4, tot_cnt_upper=5
)
# found '聆听歌曲' at the index 1 prefixed with '聆听歌'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {1}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {'聆听歌曲'}
# prepend 一起
assert not state.is_nil()
# found nothing prefixed with '一起聆听歌'
# The returned cnt_info is not checked here; only the nil state is asserted.
cnt_info = automaton.prepend_feed(state, '一起')
assert state.is_nil()
# Case 2:
# 来 | 查看 | 歌词
# The pieces are prepended from right to left, so the string being matched
# grows as '歌词' -> '查看歌词' -> '来查看歌词'.
state = automaton.get_root_state()
# prepend 歌词
cnt_info = automaton.prepend_feed(state, '歌词')
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=3, tot_cnt_upper=4
)
# found '歌词' at the index 3 prefixed with '歌词'
# get_order_slice yields indices into the original (unsorted) `vocab`.
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {'歌词'}
# prepend 查看
cnt_info = automaton.prepend_feed(state, '查看')
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=1, tot_cnt_upper=2
)
# found '查看歌词' at the index 4 prefixed with '查看歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {4}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {'查看歌词'}
# prepend 来
assert not state.is_nil()
# found nothing prefixed with '来查看歌词'
# The returned cnt_info is not checked here; only the nil state is asserted.
cnt_info = automaton.prepend_feed(state, '来')
assert state.is_nil()
GreedyTokenizer
# Example: greedy tokenization of a string against a fixed vocabulary.
from general_sam import GeneralSAM, GreedyTokenizer, build_trie_from_chars

vocab = ['a', 'ab', 'b', 'bc', 'c', 'd', 'e', 'f', 'cd', 'abcde']

# The second return value is used below as "trie node id of vocab[i]".
trie, token_to_trie_node = build_trie_from_chars(vocab)

# Invert that mapping: trie node id -> index into `vocab`.
# Nodes that are not assigned a token keep the placeholder -1.
trie_node_to_token = [-1] * trie.num_of_nodes()
for i, j in enumerate(token_to_trie_node):
    trie_node_to_token[j] = i

sam = GeneralSAM.from_trie(trie)
tokenizer = GreedyTokenizer.from_sam_and_trie(sam, trie)


def tokenize(s: str):
    """Tokenize ``s`` and return a list of ``(vocab index, n)`` pairs.

    The tokenizer yields pairs whose first element is a trie node id; it is
    translated to an index into ``vocab`` here. In the examples below the
    second element equals the length of the matched token.
    """
    return [(trie_node_to_token[i], j) for i, j in tokenizer.tokenize_str(s)]


# 'abcde' is itself a vocabulary entry (index 9).
assert tokenize('abcde') == [(9, 5)]
# 'abcdf' -> 'ab' (1), 'cd' (8), 'f' (7)
assert tokenize('abcdf') == [(1, 2), (8, 2), (7, 1)]
# 'abca' -> 'ab' (1), 'c' (4), 'a' (0)
assert tokenize('abca') == [(1, 2), (4, 1), (0, 1)]
License
- © 2023 Chielo Newctle <ChieloNewctle@gmail.com>
- © 2023 ModelTC Team
This project is licensed under either of the Apache License, Version 2.0, or the MIT License, at your option.
The SPDX license identifier for this project is MIT OR Apache-2.0.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
general_sam-0.5.3.tar.gz
(21.4 kB
view hashes)
Built Distributions
general_sam-0.5.3-cp38-abi3-win32.whl
(199.8 kB
view hashes)
Close
Hashes for general_sam-0.5.3-cp38-abi3-win_amd64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | d3feeffd0c3be6f202e73f241d3fb8d8c66b923a5d29557963d0a20ec0519da1 |
|
MD5 | 496a2c62f6bfb1617383d17ab1033bb3 |
|
BLAKE2b-256 | 7ef1e979fe8fa6ac941d38d388ca8a90f5b6c6875f0a9b72f3af3bd4ebc4496f |
Close
Hashes for general_sam-0.5.3-cp38-abi3-win32.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 5f1aab1aead502f3df2c6dfa40cc8c96833b7f025e359e3cb6894201e0ec8d8d |
|
MD5 | e53bad17c7d4d71470a4e46c4d52dcec |
|
BLAKE2b-256 | ffe5b28bacf646e6da7805cd9d419bbf2e9c97df6a1065ca53ecfb0c5f669ff3 |
Close
Hashes for general_sam-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | dc451bf74b615668b2b7aba81dd7ab4b9f93efa68c2de8dfe76f5fb63c1d3eb0 |
|
MD5 | 2c792f0b48b843e2103b5c5206d7cef9 |
|
BLAKE2b-256 | 35d86cdb08fb86f2d54f64d992f9b59ecf095d00eea27ee5e83b2c8da24adcd4 |
Close
Hashes for general_sam-0.5.3-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 015d3b671c6a22335de4bf9a666f881d87340be498e89640501fa85a571fe217 |
|
MD5 | 76e17ce03ce2d2ebe8d02264fad24675 |
|
BLAKE2b-256 | f722921f45520199f8ddec2c1c78f90d6d99a574ced25e6e1bc20ee0ac748cd5 |
Close
Hashes for general_sam-0.5.3-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | a1f0baaca66b0f7d19b203342eba1022a12b08be91a0eac05da6cf18a5652331 |
|
MD5 | ba9bac95dd9f671e05fb513a68c30ddb |
|
BLAKE2b-256 | 12bfc2a808e3d7bea4c4c61f6b7199b5c72aa756da7bd6e64a40cef4f5a80552 |
Close
Hashes for general_sam-0.5.3-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 5efd78cf9090e86da77e4bc1605c9fa3c1814e291fb004bfa48dbf5cb5cca6d6 |
|
MD5 | 0cd173c2bb23847f89c1895e1a6fc110 |
|
BLAKE2b-256 | 805f29280c9dcb7923d53adda32c0dd5b5662287bff568ff60d06dd4b65d1114 |
Close
Hashes for general_sam-0.5.3-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 0bb05470896e32be038c8e0a21db8122a982b15a086322946415efb9ddc1e299 |
|
MD5 | 4482a14734a1f3c143e68a75df8bc7d8 |
|
BLAKE2b-256 | 8ba95c7aaae8030707935016d8a353918fd21254b1a063fbfb95e4d4810d493b |
Close
Hashes for general_sam-0.5.3-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 2fd7b823162bd60be057c7524479eb35a3da638cb47b2e7aa38cc4c84e466b1e |
|
MD5 | 634174cb2fbec6e19dc8e1ee528db9e7 |
|
BLAKE2b-256 | 765bb2991f7fd8fe7bc2dfc9645751221e26c549461242e47c0dbda1081b9af9 |
Close
Hashes for general_sam-0.5.3-cp38-abi3-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 2cb3a7801decd1974e77f4364e29e3155f493a086b5c6355af43242decefe708 |
|
MD5 | efd06a7cf736a065e322a8f1074d2749 |
|
BLAKE2b-256 | 9e338c1431ef5fe68558f9286f1ffde1a9c4b55a1ffaa598e1dc11ee97f2db86 |
Close
Hashes for general_sam-0.5.3-cp38-abi3-macosx_10_7_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 4578e528625c1e8614e43c6686c7af0350dffd58d15549de0694286e8109ee78 |
|
MD5 | e3b4cd873e2c21c728ce828f551d8b47 |
|
BLAKE2b-256 | bbe38e85a06a303be9dc36c53c163a5b6f93794814b94d743258d4f54e868ac1 |