Python bindings for general-sam and some utilities
Project description
general-sam-py
Python bindings for general-sam
and some utilities.
flowchart LR
init((ε))
a((a))
b((b))
ab((ab))
bc(((bc)))
abc((abc))
abcb((abcb))
abcbc(((abcbc)))
init -- a --> a
init -- b --> b
a -- b --> ab
b -- c --> bc
init -- c --> bc
ab -- c --> abc
bc -- b --> abcb
abc -- b --> abcb
abcb -- c --> abcbc
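The suffix automaton of abcbc. The double-circled nodes (bc and abcbc) are the accepting states, i.e. the states reached by feeding a non-empty suffix of abcbc.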
Installation
pip install general-sam
Usage
GeneralSAM
from general_sam import GeneralSAM

sam = GeneralSAM.from_bytes(b"abcbc")

# Every query starts from a fresh root state; the walk ends on an accepting
# state exactly when the query is a suffix of b"abcbc".
suffix_checks = (
    (b"cbc", True),   # "cbc" is a suffix of "abcbc"
    (b"bcb", False),  # "bcb" is not a suffix of "abcbc"
)
for query, expect_suffix in suffix_checks:
    state = sam.get_root_state()
    state.feed_bytes(query)
    assert bool(state.is_accepting()) is expect_suffix
from general_sam import GeneralSAM

sam = GeneralSAM.from_chars("abcbc")
state = sam.get_root_state()

# The same state object is fed chunk by chunk, so each step extends the text
# consumed so far: "b" -> "bc" -> "bcbc" -> "bcbcbc".
incremental_steps = (
    ("b", False),   # "b" is not a suffix but at least a substring of "abcbc"
    ("c", True),    # "bc" is a suffix of "abcbc"
    ("bc", True),   # "bcbc" is a suffix of "abcbc"
    ("bc", False),  # "bcbcbc" is not a substring, much less a suffix
)
for chunk, expect_suffix in incremental_steps:
    state.feed_chars(chunk)
    assert bool(state.is_accepting()) is expect_suffix

# Having left the automaton entirely, the state is now nil.
assert state.is_nil()
from general_sam import GeneralSAM, GeneralSAMState, build_trie_from_chars

# Build a single automaton from a trie holding several words, so that a
# suffix of any of the words is accepted.
trie, _ = build_trie_from_chars(["hello", "Chielo"])
sam = GeneralSAM.from_trie(trie)


def fetch_state(s: str) -> GeneralSAMState:
    """Return the state reached by feeding ``s`` from the root of ``sam``."""
    state = sam.get_root_state()
    state.feed_chars(s)
    return state


# "lo" and "ello" are suffixes of "hello"; "elo" is a suffix of "Chielo".
assert fetch_state("lo").is_accepting()
assert fetch_state("ello").is_accepting()
assert fetch_state("elo").is_accepting()

# "el" occurs inside "hello" but is not a suffix of either word:
# the state is neither accepting nor nil.
state = fetch_state("el")
assert not state.is_accepting() and not state.is_nil()

# "bye" does not occur in any word, so the state becomes nil.
state = fetch_state("bye")
assert not state.is_accepting() and state.is_nil()
VocabPrefixAutomaton
# Example: find vocabulary entries that start with a growing prefix, where the
# prefix is built right-to-left by prepending pieces to an automaton state.
from general_sam import CountInfo, VocabPrefixAutomaton

vocab = ["歌曲", "聆听歌曲", "播放歌曲", "歌词", "查看歌词"]
automaton = VocabPrefixAutomaton(vocab, bytes_or_chars="chars")

# NOTE: CountInfo instances are actually related to the sorted `vocab`:
_ = ["播放歌曲", "查看歌词", "歌曲", "歌词", "聆听歌曲"]
# In the assertions below, tot_cnt_lower/tot_cnt_upper line up with positions
# in this sorted list (presumably a half-open range; confirm against the
# API docs), while get_order_slice() yields indices into the original `vocab`.

# Case 1:
# 一起 | 聆 | 听 | 歌
# The pieces are prepended from right to left onto the same `state`.
state = automaton.get_root_state()

# prepend '歌'
cnt_info = automaton.prepend_feed(state, "歌")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=2, tot_cnt_lower=2, tot_cnt_upper=4
)

# found '歌曲' at the index 0 and '歌词' at the index 3 prefixed with '歌'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {0, 3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"歌曲", "歌词"}

# prepend 听
cnt_info = automaton.prepend_feed(state, "听")
# found nothing prefixed with '听歌'
assert cnt_info is None
# The state is still usable even though no entry starts with '听歌'
# (presumably because '听歌' still occurs inside some entry; verify).
assert not state.is_nil()

# prepend 聆
cnt_info = automaton.prepend_feed(state, "聆")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=1, tot_cnt_lower=4, tot_cnt_upper=5
)

# found '聆听歌曲' at the index 1 prefixed with '聆听歌'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {1}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"聆听歌曲"}

# prepend 一起
assert not state.is_nil()
# found nothing prefixed with '一起聆听歌'
cnt_info = automaton.prepend_feed(state, "一起")
# This time the state itself becomes nil.
assert state.is_nil()

# Case 2:
# 来 | 查看 | 歌词
# Start over from a fresh root state.
state = automaton.get_root_state()

# prepend 歌词
cnt_info = automaton.prepend_feed(state, "歌词")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=1, tot_cnt_lower=3, tot_cnt_upper=4
)

# found '歌词' at the index 3 prefixed with '歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"歌词"}

# prepend 查看
cnt_info = automaton.prepend_feed(state, "查看")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=1, tot_cnt_lower=1, tot_cnt_upper=2
)

# found '查看歌词' at the index 4 prefixed with '查看歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {4}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"查看歌词"}

# prepend 来
assert not state.is_nil()
# found nothing prefixed with '来查看歌词'
cnt_info = automaton.prepend_feed(state, "来")
assert state.is_nil()
GreedyTokenizer
from general_sam import GeneralSAM, GreedyTokenizer, build_trie_from_chars

vocab = ["a", "ab", "b", "bc", "c", "d", "e", "f", "cd", "abcde"]
trie, token_to_trie_node = build_trie_from_chars(vocab)

# Invert the token-index -> trie-node mapping returned above. Trie nodes that
# are not listed in `token_to_trie_node` keep the placeholder -1.
trie_node_to_token = [-1] * trie.num_of_nodes()
for i, j in enumerate(token_to_trie_node):
    trie_node_to_token[j] = i

sam = GeneralSAM.from_trie(trie)
tokenizer = GreedyTokenizer.from_sam_and_trie(sam, trie)


def tokenize(s: str):
    """Tokenize ``s`` and return ``(index into vocab, matched length)`` pairs.

    The tokenizer reports trie node ids; they are mapped back to positions in
    ``vocab`` through ``trie_node_to_token``.
    """
    return [(trie_node_to_token[i], j) for i, j in tokenizer.tokenize_str(s)]


# The whole input is itself a token ("abcde" is vocab[9], 5 characters long).
assert tokenize("abcde") == [(9, 5)]
# "ab" + "cd" + "f"
assert tokenize("abcdf") == [(1, 2), (8, 2), (7, 1)]
# "ab" + "c" + "a"
assert tokenize("abca") == [(1, 2), (4, 1), (0, 1)]
License
- © 2023 Chielo Newctle <ChieloNewctle@gmail.com>
- © 2023 ModelTC Team
This project is licensed under either of the Apache License, Version 2.0, or the MIT license, at your option.
The SPDX license identifier for this project is MIT OR Apache-2.0.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
general_sam-0.6.1.post0.tar.gz
(21.5 kB
view hashes)
Built Distributions
Close
Hashes for general_sam-0.6.1.post0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | a8432702ee8972fac61c70e4655022e22cfb21748ee2ae89099ea621ac535655 |
|
MD5 | a025b284b3a03f7b38e74c393b89debd |
|
BLAKE2b-256 | 50e6d0bb2d28d51d076bf5a91855b0a49e11a04fd39b3bff9e88da9941f29321 |
Close
Hashes for general_sam-0.6.1.post0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 08664b80797595fe6ef4f364c71d7348e8175185cf3995a30563a44b77d46e69 |
|
MD5 | e9d06c00bda7e66e78b1c8ecf2227bad |
|
BLAKE2b-256 | 3303735a1e4f2f06c3ca89b40e38685d4729b9c3be94a7711ba19d083cfb0dfb |
Close
Hashes for general_sam-0.6.1.post0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 27b75fcbe91954880b047d268f907f9463f5828205f860e5d6518d92f5fd4adf |
|
MD5 | e957a7729735bd25603b75c187364f89 |
|
BLAKE2b-256 | c7e83c1f50605195bbd06277cf35395c4e682ed5080593e876cd6b15decb4c04 |
Close
Hashes for general_sam-0.6.1.post0-pp310-pypy310_pp73-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 000bae155c1959cf0fcfdfeed390e7ae00d9585b47bb22fc6027e5bec4e7f93f |
|
MD5 | 5bb85e391624b26599ee0fe40ad952f8 |
|
BLAKE2b-256 | 04b2245a3222f6ada8994852e9623d3e182e6377caf734f2fcbdbc9f8f2ecbca |
Close
Hashes for general_sam-0.6.1.post0-pp310-pypy310_pp73-macosx_10_7_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | a8c5e934ca68f27f08b4e96c6c772103c520e3d751c072a0f888da0cfa2c08d6 |
|
MD5 | 5baed1bf26fc5b203f9e73ae16c10bc2 |
|
BLAKE2b-256 | fa5fbc082f31e3ea75300ed81deecd4f1e96e9e6010c981bcf836b666d6ebf42 |
Close
Hashes for general_sam-0.6.1.post0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 50848973683eef8b1ec1dbc059c97b60276272abb85aa09f90c0924ac9542c9f |
|
MD5 | e29d7402351868a1fd5e76caafe97ce3 |
|
BLAKE2b-256 | d7815dc6cb75ad3bf07fa2924f9da9d0e06343b208eb56b20e26a1de3607ee1e |
Close
Hashes for general_sam-0.6.1.post0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 19abfbfa2e256805ed0cb94143dcbe44ac55fcfa0de3b4e3f5ec670bf0006d21 |
|
MD5 | 7cd0523386daedbe566f5067a155bb39 |
|
BLAKE2b-256 | d84aae4b88ee508ee2ab15c0d3355bd1fa28d07dcd97c7765ffcaf2145ad0e1f |
Close
Hashes for general_sam-0.6.1.post0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 3b460f44d7e593c7c46dc9ff15def5e7ddd59031c297ceb2aeab08a1976a3ac1 |
|
MD5 | ed216a0a6fbfd7011ca7a5c2bc52c3dc |
|
BLAKE2b-256 | 72b5315159eec610d6c8d1e3b4265d1d3a053239b706c103703f68bfc074d9e6 |
Close
Hashes for general_sam-0.6.1.post0-pp39-pypy39_pp73-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 34030d685b03bd01872b9279858975807e46a2f9d9738246f37612dfbaf57d22 |
|
MD5 | 1cde1e014ad10c4f326cd7a9651729c2 |
|
BLAKE2b-256 | d36f8be39bea59943e03244fdb7e41d92dfea390aab5b00bb63ab3bfe750ff4e |
Close
Hashes for general_sam-0.6.1.post0-pp39-pypy39_pp73-macosx_10_7_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 8a81702b2eadf89ae2ae9bedc9d182d0744ad883aca9eee1806a387b30247ba1 |
|
MD5 | 475d9a6a83f877c4d57e8d24b825a2cc |
|
BLAKE2b-256 | 8b4158a4f43b07b31575d0514bf809b4b969807263947f8438a0d198737f28f2 |
Close
Hashes for general_sam-0.6.1.post0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 4af730c571079e67b3f4f8abf05972ec078adb76d8d389dcbda2d00f42dcf446 |
|
MD5 | 2b1bd3a12534c261f2ab7f6a8cbf36f7 |
|
BLAKE2b-256 | a5a803ce27fb8f61a3c246949c6d1b60f2e17dfca29c9e8d08a968a9065bf85b |
Close
Hashes for general_sam-0.6.1.post0-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 000f4e7ce7f65a66dc03e8cf64cceb70cc1ec96e6cf7d7919ad250e7a4627507 |
|
MD5 | 867c06f6a991ddf04fa6e980ce2b7fa9 |
|
BLAKE2b-256 | 4dea5bef122ab9bb877960b82b6c7715c199bd433a39616d429eff605698d3f1 |
Close
Hashes for general_sam-0.6.1.post0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | db13afce3324d33cbcaef33f3caf3ab6dfdc24b63e73c22de3d8655de1cd2d98 |
|
MD5 | 61bd1f22ffb4e4806606773a9a4ca38e |
|
BLAKE2b-256 | 12b9b8691a5b6dc4747fb6c59ade91e13977ca3ba73765d55c7bcd1d8bca1632 |
Close
Hashes for general_sam-0.6.1.post0-pp38-pypy38_pp73-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 47fa5d086336b888abd5fe593ed652c3cfe047a6a86c9cf31ebbf99968fe584e |
|
MD5 | d9dfcb876444e6c69cba0774cec3255b |
|
BLAKE2b-256 | 24c83cd84be986655f4b9c0d62b703dde29aab981ed0cc30988c869808aa876c |
Close
Hashes for general_sam-0.6.1.post0-pp38-pypy38_pp73-macosx_10_7_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 20014c8a27e69bbd4a5a72696e4c631f35f9fd7a8cc349c0b6ad0e7bcdd0336f |
|
MD5 | 18e64fb5b50a483a75af10a834748eaf |
|
BLAKE2b-256 | 49f646cef9fa59c92df88fa09e356af6ae0a4bd0df7a13f7a0045e2c1d3b4dd6 |
Close
Hashes for general_sam-0.6.1.post0-cp38-abi3-win_amd64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | eab7c4a900f8b57d5f2fecd59cb60033bde34b209a47e4b7e2d9be6532cbc43e |
|
MD5 | 7ff6159eb837d00fd613d8cd01ec0c6c |
|
BLAKE2b-256 | c00eba4010124dc757b0812001b8795fa34069d4c47bd04bfc6e27dcf9c8b95b |
Close
Hashes for general_sam-0.6.1.post0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | a10a9ff594486c6ec09514329bb50cf784eec52217d0d0750cc54d543b37e1cc |
|
MD5 | f9ed499e4accfa3412d07fc8f52fbcb2 |
|
BLAKE2b-256 | b5de858837c8e78051b040d94801816f057295b323f703788b4ebfd4ae3413c6 |
Close
Hashes for general_sam-0.6.1.post0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 659c8cf2011ab4283d4952f5dbe3c12db276c052c3fbebcc7f051481beae43c9 |
|
MD5 | 3f56b088cb0fc4cf1e4f1746ddab31a1 |
|
BLAKE2b-256 | aff4a1d307ec443541cb840fc9e29198d82546830e3f9db48aca3e8c6823dd0a |
Close
Hashes for general_sam-0.6.1.post0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 91c1e8fd814f5329fcbb12b7f84ff44458e9ca4a7631981311f022fec0efaec0 |
|
MD5 | 381e972930e909cbfc1527a824b6d23f |
|
BLAKE2b-256 | ff1690ba81f81c47fc43ce7c8d7a52c1779c97e8771579c75c687f77f056c851 |
Close
Hashes for general_sam-0.6.1.post0-cp38-abi3-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 6e95e85adeddafab707cc39a2072e1c33b74d92c3c287e657dcbaeec4c14c8f6 |
|
MD5 | a48f71e36220def6feba70d3c07fd699 |
|
BLAKE2b-256 | 376b21c4a4c158becaabb01a2f7a69bedcec0c5e097adf7d4d5001cede7edb1d |
Close
Hashes for general_sam-0.6.1.post0-cp38-abi3-macosx_10_7_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 92e8187ef2886846d260d1856882f9f8cc77f79bb0140f460db61d3a89ad4c0a |
|
MD5 | c6bf3b3c6799c389e52d41c1ba72cdbd |
|
BLAKE2b-256 | c99ce683e1e3d33867e4e51a77bace048c7906da72d635e27c3354de4cff3df2 |