Python bindings for general-sam and some utilities
Project description
general-sam-py
Python bindings for general-sam and some utilities.
flowchart LR
%% Suffix automaton of "abcbc".
%% Each node is labelled with the longest string that reaches it;
%% init is the root and stands for the empty string.
%% Triple parentheses draw a double circle: bc and abcbc are the accepting
%% states, i.e. the ones reached by reading a suffix of "abcbc".
init((ε))
a((a))
b((b))
ab((ab))
bc(((bc)))
abc((abc))
abcb((abcb))
abcbc(((abcbc)))
%% Transitions: "source -- character --> target".
init -- a --> a
init -- b --> b
a -- b --> ab
b -- c --> bc
%% "c" and "bc" share one state, so the root's c-edge also lands on bc.
init -- c --> bc
ab -- c --> abc
bc -- b --> abcb
abc -- b --> abcb
abcb -- c --> abcbc
The suffix automaton of abcbc.
Installation
pip install general-sam
Usage
GeneralSam
from general_sam import GeneralSam
sam = GeneralSam.from_bytes(b"abcbc")

# Every query restarts from the root; is_accepting() then reports whether the
# bytes fed so far form a suffix of b"abcbc".
for query, expect_suffix in (
    (b"cbc", True),  # "cbc" is a suffix of "abcbc"
    (b"bcb", False),  # "bcb" is not a suffix of "abcbc"
):
    state = sam.get_root_state()
    state.feed_bytes(query)
    assert state.is_accepting() == expect_suffix
from general_sam import GeneralSam
sam = GeneralSam.from_chars("abcbc")
state = sam.get_root_state()

# The same state is fed incrementally, so every chunk extends the text read so
# far; the flag tells whether that accumulated text is a suffix of "abcbc".
steps = (
    ("b", False),  # "b" is not a suffix but at least a substring of "abcbc"
    ("c", True),  # "bc" is a suffix of "abcbc"
    ("bc", True),  # "bcbc" is a suffix of "abcbc"
    ("bc", False),  # "bcbcbc" is not a substring, much less a suffix
)
for chunk, expect_suffix in steps:
    state.feed_chars(chunk)
    assert state.is_accepting() == expect_suffix

# After reading text that is not even a substring, the state is nil.
assert state.is_nil()
from general_sam import GeneralSam, GeneralSamState, build_trie_from_chars
# Build one automaton over several words by going through a trie first.
trie, _ = build_trie_from_chars(["hello", "Chielo"])
sam = GeneralSam.from_trie(trie)


def fetch_state(s: str) -> GeneralSamState:
    """Return the state reached by feeding `s` from the root of `sam`."""
    state = sam.get_root_state()
    state.feed_chars(s)
    return state


# Suffixes of any inserted word are accepted:
# "lo" ends both words, "ello" ends "hello" and "elo" ends "Chielo".
assert fetch_state("lo").is_accepting()
assert fetch_state("ello").is_accepting()
assert fetch_state("elo").is_accepting()

# "el" occurs inside the words but ends neither: not accepting, yet not nil.
state = fetch_state("el")
assert not state.is_accepting() and not state.is_nil()

# "bye" occurs in neither word, so the state is nil.
state = fetch_state("bye")
assert not state.is_accepting() and state.is_nil()
VocabPrefixAutomaton
from general_sam import CountInfo, VocabPrefixAutomaton
# Example: find vocabulary entries that start with a given text while that
# text is grown from right to left (each prepend_feed call puts new
# characters in front of what has been fed to `state` so far).
# Glosses: 歌曲 "song", 聆听歌曲 "listen to songs", 播放歌曲 "play songs",
# 歌词 "lyrics", 查看歌词 "view lyrics".
vocab = ["歌曲", "聆听歌曲", "播放歌曲", "歌词", "查看歌词"]
automaton = VocabPrefixAutomaton(vocab, bytes_or_chars="chars")
# NOTE: CountInfo instances are actually related to the sorted `vocab`:
_ = ["播放歌曲", "查看歌词", "歌曲", "歌词", "聆听歌曲"]
# NOTE(review): in the assertions below, tot_cnt_lower/tot_cnt_upper line up
# with a half-open index range into this sorted list and str_cnt with its
# length -- confirm against the library documentation.
# Case 1:
# 一起 | 聆 | 听 | 歌
# (the pieces are fed from the last one to the first one)
state = automaton.get_root_state()
# prepend '歌'
cnt_info = automaton.prepend_feed(state, "歌")
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=2, tot_cnt_lower=2, tot_cnt_upper=4
)
# found '歌曲' at the index 0 and '歌词' at the index 3 prefixed with '歌'
# (get_order_slice yields indices into the original, unsorted `vocab`)
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {0, 3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"歌曲", "歌词"}
# prepend 听
cnt_info = automaton.prepend_feed(state, "听")
# found nothing prefixed with '听歌'
assert cnt_info is None
# ... but the state is still alive, so a longer text may match again
assert not state.is_nil()
# prepend 聆
cnt_info = automaton.prepend_feed(state, "聆")
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=4, tot_cnt_upper=5
)
# found '聆听歌曲' at the index 1 prefixed with '聆听歌'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {1}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"聆听歌曲"}
# prepend 一起
# (the state is checked to be alive before the feed and nil after it)
assert not state.is_nil()
# found nothing prefixed with '一起聆听歌'
cnt_info = automaton.prepend_feed(state, "一起")
assert state.is_nil()
# Case 2:
# 来 | 查看 | 歌词
# (a fresh root state; pieces are again fed from the last to the first)
state = automaton.get_root_state()
# prepend 歌词
cnt_info = automaton.prepend_feed(state, "歌词")
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=3, tot_cnt_upper=4
)
# found '歌词' at the index 3 prefixed with '歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"歌词"}
# prepend 查看
cnt_info = automaton.prepend_feed(state, "查看")
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=1, tot_cnt_upper=2
)
# found '查看歌词' at the index 4 prefixed with '查看歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {4}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"查看歌词"}
# prepend 来
# (the state is checked to be alive before the feed and nil after it)
assert not state.is_nil()
# found nothing prefixed with '来查看歌词'
cnt_info = automaton.prepend_feed(state, "来")
assert state.is_nil()
GreedyTokenizer
from general_sam import GeneralSam, GreedyTokenizer, build_trie_from_chars
vocab = ["a", "ab", "b", "bc", "c", "d", "e", "f", "cd", "abcde"]
trie, token_to_trie_node = build_trie_from_chars(vocab)

# Invert the token -> trie node mapping so that tokenizer output can be
# reported as indices into `vocab`; nodes that are not a token keep -1.
trie_node_to_token = [-1] * trie.num_of_nodes()
for i, j in enumerate(token_to_trie_node):
    trie_node_to_token[j] = i

sam = GeneralSam.from_trie(trie)
tokenizer = GreedyTokenizer.from_sam_and_trie(sam, trie)


def tokenize(s: str):
    """Greedily tokenize `s` and return `(vocab index, length)` pairs.

    The tokenizer reports trie node ids; they are mapped back to indices
    into `vocab` here. The second element is passed through unchanged; in
    the assertions below it equals the number of characters the token covers.
    """
    return [(trie_node_to_token[i], j) for i, j in tokenizer.tokenize_str(s)]


# The whole input is a single token: "abcde" is vocab[9] and spans 5 chars.
assert tokenize("abcde") == [(9, 5)]
# Longest match first: "ab" (1), "cd" (8), "f" (7).
assert tokenize("abcdf") == [(1, 2), (8, 2), (7, 1)]
# "ab" (1), then "c" (4) because "ca" is not a token, then "a" (0).
assert tokenize("abca") == [(1, 2), (4, 1), (0, 1)]
License
- © 2023 Chielo Newctle <ChieloNewctle@gmail.com>
- © 2023 ModelTC Team
This project is licensed under either of the Apache License, Version 2.0 or the MIT license, at your option.
The SPDX license identifier for this project is MIT OR Apache-2.0.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
general_sam-1.0.0.tar.gz
(21.6 kB
view hashes)
Built Distributions
Close
Hashes for general_sam-1.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 0bbc9c8026515a4bdf8a240efb8884802b144030d4d1d12ffb3086581ebfedb6 |
|
MD5 | caae4b11ac03d689726647918a09c3fa |
|
BLAKE2b-256 | 35c1f5da1afa0cab6a57f6c12a8e41258ed640e44153af8284d7b6254af6db3c |
Close
Hashes for general_sam-1.0.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 90f0558e8df04de74ff3abd271ed4399a12c12fd9b447c8c06d0d8daab7299bb |
|
MD5 | 83e733299b5f47dbb285351a6fd92f1b |
|
BLAKE2b-256 | 56479339eb8d5ea97529019e89d94e4a18a486071d2a1b1fc22d2c05f5d2fea6 |
Close
Hashes for general_sam-1.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 636308548bf89bd77f4257cd21e785f1d1c4ca0a40834692a0e6d4116cb04a2d |
|
MD5 | a2f18eab6290930b34f4de8fa68d417c |
|
BLAKE2b-256 | eb2ad36dc3ae70cd1f6f82c1036bf637196471551f9692c18686443753128ef8 |
Close
Hashes for general_sam-1.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | e103bc96591d4098da3d6f7353a19c8f30ff016503b923be2ce01780492a8547 |
|
MD5 | 5447de6fc9ee670527136020a0b5ba8f |
|
BLAKE2b-256 | c6f6ab262bf80958eff273878441e36abc17963adce79995ddf09df767e0d96c |
Close
Hashes for general_sam-1.0.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | e8bf965979874c390f15d7d97a1c5a0649fdfa12a1e8865b178116dd778d075d |
|
MD5 | 76132d3eb0cd187cd577272c3ce42d42 |
|
BLAKE2b-256 | de8c5d16d9576a5956d2c264d11df5c702f94f40e4fa71744f3ca25684093480 |
Close
Hashes for general_sam-1.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | afda8c7c97a06be7bde2c4ed80162cf81fb179dea0bdbf76802707bd17cc89f2 |
|
MD5 | b48c0acd618793021426a7c99503ad39 |
|
BLAKE2b-256 | 47d9b9d121606a9d61f46b1b7f43e742a81adc99230ec22cf37e76ed19f356a9 |
Close
Hashes for general_sam-1.0.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | ec5f9121f0ed99d28de92fbf4113fc672dc5b6b8699a53cfe0eb509005c649a1 |
|
MD5 | edfe9e377edb36907e0bc6b9a04e1379 |
|
BLAKE2b-256 | 7f956a780f94c457e73656b078f8980e1c5d139ff44a7af2d119e63ab246f114 |
Close
Hashes for general_sam-1.0.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | ccbfce59ccf0018564f4893a1089179040b0e21b43f0dca3d29e8122da800870 |
|
MD5 | acd151652a62a322eb1060d48e9e8e83 |
|
BLAKE2b-256 | ac5cfaad6f478d1a48dc1b88ce730819401b58291dc7b600e1011d24acb794c3 |
Close
Hashes for general_sam-1.0.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 8bcd01f5b3378099e44dc62731c0d42d4c17554b2416eaba32df44fda53c2a9f |
|
MD5 | 72d726d5f8c4775c203c058b903b49ba |
|
BLAKE2b-256 | f41d5292cbbc8467709b3519273da56f7d99133f8f87cad66583abaaa439a5cc |
Close
Hashes for general_sam-1.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 22e9ef2c146e3283ac21483a3da9c317784aa9618f7251c6004b5488696d4d1d |
|
MD5 | 053f1d683fc6f6a5d971376d60b0409a |
|
BLAKE2b-256 | 28aa8757cef0714c947641e170a1687be964e63e4eb98acbb157da8f98f8c334 |
Close
Hashes for general_sam-1.0.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | eb6b8f495822f0f70682421d888ed1e17396640bc8973e02a24fe0852c8228c2 |
|
MD5 | 8e1b0a50e81c4c6bc1f3f04108187a49 |
|
BLAKE2b-256 | 180ea6b6ea8ebaa70a0ec957c2c4babfac46bfc7a4a460fe837283ffa61b3328 |
Close
Hashes for general_sam-1.0.0-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | cd33cf285d5fe189968f45a708cf45bb9e241f1482075390f4c29dd1582cc9d3 |
|
MD5 | ef211055e932c79daf4fc840eceff204 |
|
BLAKE2b-256 | 7652092e035718a282c78b0a3957ac54274d553eefda80a47f6e4c9650899cd3 |
Close
Hashes for general_sam-1.0.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 4c09b0fe0971d18e401aa3284487220fc9a94827d5466a8b6fadfacf1d7d5611 |
|
MD5 | 1f06bd53f38fcfcf6f364a2ab38ed589 |
|
BLAKE2b-256 | 2c0946c14156d99e3d18a97acba818a0af0c64a48d39e42aa2a5ec2ea7b1422b |
Close
Hashes for general_sam-1.0.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | ff6d7055bab46506e8ae247256595207b1e13dc031c07ca12c18ca4d4a7fdc52 |
|
MD5 | 40e06503c32c703c3c4eb08b40e5d156 |
|
BLAKE2b-256 | a93ce5bdce70d7690c6c0a4744ad2c7dd550db685642db1c00626035b59dab9d |
Close
Hashes for general_sam-1.0.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 0ddea3ece0bafe54f331c7cf0526dd3ec579681599b7a99208a6f31e45940c94 |
|
MD5 | 11cb0fc879242c09c42ccf6369939bdd |
|
BLAKE2b-256 | c52c2e54820d1f3da46a43f7217ff2878512186ae4236dd554b28b442d5c79fa |
Close
Hashes for general_sam-1.0.0-cp38-abi3-win_amd64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 7dbb63fece1c16e9ce4040075c1120377ee3ec6212d19547100ba5092391d367 |
|
MD5 | dfff3ae71255b74ea50db4e31f090614 |
|
BLAKE2b-256 | 22eb46ece8a2660e03322ccec90de23653cba30240dc625460d21917ccbb21d3 |
Close
Hashes for general_sam-1.0.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | afe317ddcaba3c79ba0be593e757396dca7448b4a3a8c928bd7e2c2fc0fe032e |
|
MD5 | f6d60167e7954f49b57b92b7bffd1051 |
|
BLAKE2b-256 | ab183b1554476c74d45f69cda236d0cf18f99fcd35ae99df6033a24e6304c66e |
Close
Hashes for general_sam-1.0.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 9295486f4f632048fe8f88fd857c1ea503029d9edecdcc9a2722ba97d6367aee |
|
MD5 | 5b6e789d1007c714d962113404e4f007 |
|
BLAKE2b-256 | 223e357154ba2ef19116cbc648147ec13dfa51252fd2af339df3f454b41c45ee |
Close
Hashes for general_sam-1.0.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 0b0dddbca1ee9a45abe6fd661213228a9018c3c67cbacbd3d9820523931f5363 |
|
MD5 | 1a8d25aff1f10eed06bd05e0ca1d869d |
|
BLAKE2b-256 | 35d0031d06bc794914e380aa42ff13e97fd8fa671fa4e9cc85c564ca4309f87c |
Close
Hashes for general_sam-1.0.0-cp38-abi3-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 6a8afd3e609816dca8ccd4cb69991fee8bc8f8db3d9805391f1cb49ed2595183 |
|
MD5 | 2ddc52584c4e9dbc401b2f650881a2ca |
|
BLAKE2b-256 | f58b4d976252f9254e871fb244a73d4bf442e8b1531ba053126fdffb1850fde0 |
Close
Hashes for general_sam-1.0.0-cp38-abi3-macosx_10_12_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 66fc852c20d79f3407009fa85bbe7b8b85c4c0ad466978c76928853007a7cae0 |
|
MD5 | e992ee1a1bdc98480f31e55048d9b029 |
|
BLAKE2b-256 | b5d846f3d20754acc033335c4ab242264d4f0ac4b92f4753d05867f051601b56 |