Python bindings for general-sam and some utilities
Project description
general-sam-py
Python bindings for general-sam and some utilities.
flowchart LR
init((ε))
a((a))
b((b))
ab((ab))
bc(((bc)))
abc((abc))
abcb((abcb))
abcbc(((abcbc)))
init -- a --> a
init -- b --> b
a -- b --> ab
b -- c --> bc
init -- c --> bc
ab -- c --> abc
bc -- b --> abcb
abc -- b --> abcb
abcb -- c --> abcbc
The suffix automaton of abcbc.
Installation
pip install general-sam
Usage
GeneralSAM
from general_sam import GeneralSAM
# Suffix automaton over the bytes of "abcbc".
sam = GeneralSAM.from_bytes(b"abcbc")

# Each query starts from a fresh root state. A state is accepting only when
# the bytes fed so far form a suffix of "abcbc":
#   "cbc" is a suffix of "abcbc"
#   "bcb" is not a suffix of "abcbc"
for query, expected in ((b"cbc", True), (b"bcb", False)):
    state = sam.get_root_state()
    state.feed_bytes(query)
    assert bool(state.is_accepting()) is expected
from general_sam import GeneralSAM
# Suffix automaton over the characters of "abcbc".
sam = GeneralSAM.from_chars("abcbc")
state = sam.get_root_state()

# The same state is fed chunk by chunk; each row pairs the next chunk with
# whether the text accumulated so far is a suffix of "abcbc":
#   "b"      is not a suffix but at least a substring of "abcbc"
#   "bc"     is a suffix of "abcbc"
#   "bcbc"   is a suffix of "abcbc"
#   "bcbcbc" is not a substring, much less a suffix of "abcbc"
steps = (("b", False), ("c", True), ("bc", True), ("bc", False))
for chunk, expected in steps:
    state.feed_chars(chunk)
    assert bool(state.is_accepting()) is expected

# Once the accumulated text is no longer a substring, the state is nil.
assert state.is_nil()
from general_sam import GeneralSAM, GeneralSAMState, build_trie_from_chars
# Build one automaton over several strings by going through a trie first.
trie, _ = build_trie_from_chars(["hello", "Chielo"])
sam = GeneralSAM.from_trie(trie)


def fetch_state(s: str) -> GeneralSAMState:
    """Return the state reached by feeding the characters of ``s`` from the root of ``sam``."""
    state = sam.get_root_state()
    state.feed_chars(s)
    return state


# "lo" and "ello" are suffixes of "hello"; "elo" is a suffix of "Chielo".
assert fetch_state("lo").is_accepting()
assert fetch_state("ello").is_accepting()
assert fetch_state("elo").is_accepting()

# "el" occurs inside "hello" but ends neither string: not accepting, not nil.
state = fetch_state("el")
assert not state.is_accepting() and not state.is_nil()

# "bye" occurs in neither string, so the walk ends in the nil state.
state = fetch_state("bye")
assert not state.is_accepting() and state.is_nil()
VocabPrefixAutomaton
from general_sam import CountInfo, VocabPrefixAutomaton
# Vocabulary to index; `bytes_or_chars="chars"` selects character-wise
# (rather than byte-wise) processing.
vocab = ["歌曲", "聆听歌曲", "播放歌曲", "歌词", "查看歌词"]
automaton = VocabPrefixAutomaton(vocab, bytes_or_chars="chars")
# NOTE: CountInfo instances are actually related to the sorted `vocab`:
_ = ["播放歌曲", "查看歌词", "歌曲", "歌词", "聆听歌曲"]
# Case 1:
# 一起 | 聆 | 听 | 歌
# The pieces above are fed from right to left, so every `prepend_feed` call
# extends the text tracked by `state` at its front.
state = automaton.get_root_state()
# prepend '歌'
cnt_info = automaton.prepend_feed(state, "歌")
# Two entries match; the bounds 2 and 4 line up with positions 2..3 of the
# sorted list above ('歌曲', '歌词') -- presumably a half-open range into the
# sorted vocab, TODO confirm against the library documentation.
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=2, tot_cnt_lower=2, tot_cnt_upper=4
)
# found '歌曲' at the index 0 and '歌词' at the index 3 prefixed with '歌'
# (`get_order_slice` yields positions in the original, unsorted `vocab`)
selected_idx = automaton.get_order_slice(cnt_info)
# Compared as sets, so the order of the returned indices is not checked.
assert frozenset(selected_idx) == {0, 3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"歌曲", "歌词"}
# prepend 听
cnt_info = automaton.prepend_feed(state, "听")
# found nothing prefixed with '听歌'
assert cnt_info is None
# "No match" is reported as None, yet the state is not nil and can still be
# extended ('听歌' does occur inside '聆听歌曲').
assert not state.is_nil()
# prepend 聆
cnt_info = automaton.prepend_feed(state, "聆")
# One entry matches; the bounds 4 and 5 line up with position 4 of the
# sorted list ('聆听歌曲').
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=4, tot_cnt_upper=5
)
# found '聆听歌曲' at the index 1 prefixed with '聆听歌'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {1}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"聆听歌曲"}
# prepend 一起
assert not state.is_nil()
# found nothing prefixed with '一起聆听歌'
# The return value is not inspected here; only the state turning nil is checked.
cnt_info = automaton.prepend_feed(state, "一起")
assert state.is_nil()
# Case 2:
# 来 | 查看 | 歌词
# As in the first case, the pieces are fed from right to left, this time
# with multi-character pieces, starting again from a fresh root state.
state = automaton.get_root_state()
# prepend 歌词
cnt_info = automaton.prepend_feed(state, "歌词")
# One entry matches; the bounds 3 and 4 line up with position 3 ('歌词') of
# the sorted vocab ["播放歌曲", "查看歌词", "歌曲", "歌词", "聆听歌曲"].
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=3, tot_cnt_upper=4
)
# found '歌词' at the index 3 prefixed with '歌词'
# (`get_order_slice` yields positions in the original, unsorted `vocab`)
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"歌词"}
# prepend 查看
cnt_info = automaton.prepend_feed(state, "查看")
# One entry matches; the bounds 1 and 2 line up with position 1 ('查看歌词')
# of the sorted vocab.
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=1, tot_cnt_upper=2
)
# found '查看歌词' at the index 4 prefixed with '查看歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {4}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"查看歌词"}
# prepend 来
assert not state.is_nil()
# found nothing prefixed with '来查看歌词'
# The return value is not inspected here; only the state turning nil is checked.
cnt_info = automaton.prepend_feed(state, "来")
assert state.is_nil()
GreedyTokenizer
from general_sam import GeneralSAM, GreedyTokenizer, build_trie_from_chars
vocab = ["a", "ab", "b", "bc", "c", "d", "e", "f", "cd", "abcde"]
trie, token_to_trie_node = build_trie_from_chars(vocab)

# Invert the token -> trie-node mapping so trie nodes reported by the
# tokenizer can be translated back to positions in `vocab`. Nodes that do not
# correspond to a token keep the placeholder -1.
trie_node_to_token = [-1] * trie.num_of_nodes()
for token_id, node_id in enumerate(token_to_trie_node):
    trie_node_to_token[node_id] = token_id

sam = GeneralSAM.from_trie(trie)
tokenizer = GreedyTokenizer.from_sam_and_trie(sam, trie)


def tokenize(s: str):
    """Tokenize ``s`` and return a list of ``(token index, count)`` pairs.

    The tokenizer reports trie nodes; they are mapped back to indices into
    ``vocab`` here. In the examples below the second element equals the
    number of characters covered by the token.
    """
    return [
        (trie_node_to_token[node_id], count)
        for node_id, count in tokenizer.tokenize_str(s)
    ]


# The whole input is a single vocabulary entry.
assert tokenize("abcde") == [(9, 5)]
# "ab" + "cd" + "f"
assert tokenize("abcdf") == [(1, 2), (8, 2), (7, 1)]
# "ab" + "c" + "a"
assert tokenize("abca") == [(1, 2), (4, 1), (0, 1)]
License
- © 2023 Chielo Newctle <ChieloNewctle@gmail.com>
- © 2023 ModelTC Team
This project is licensed under either of
- Apache License, Version 2.0
- MIT license
at your option.
The SPDX license identifier for this project is MIT OR Apache-2.0.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
general_sam-0.7.0.tar.gz
(21.6 kB
view hashes)
Built Distributions
Close
Hashes for general_sam-0.7.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 3a6815c819ed8ed692657156b9675908773177e9df7727cdeb965b14ec429b2f |
|
MD5 | 698eed9a026f50ae3b19cc4dfba9c276 |
|
BLAKE2b-256 | cd757d7ce13eda265b89c4ede0e258f727aa19e36c33531ba65ee35951d64a7a |
Close
Hashes for general_sam-0.7.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 67f395b3c47a175a8c786dfc6cc428cf1a4f1e4b70bb603f016b5143c22f302b |
|
MD5 | 69c2b2d16d6d49e0a5d4a5e94f998356 |
|
BLAKE2b-256 | 75ca1a4df2d253ec49aae154f471704d14d6a0f874583e335bfbc4c3e6744be9 |
Close
Hashes for general_sam-0.7.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | a385b5a1eaf82971ff5402b07c55e74dcac8cd92b82d800e31c732aa8822522d |
|
MD5 | c71b1f109b263edaf349dc79ffd56cdd |
|
BLAKE2b-256 | 10ee5a1f3e09e6370e0bf834b76e5f2bec06325f7584555bdc847d4b66d4ce02 |
Close
Hashes for general_sam-0.7.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | bb51d96440451d5983e388f55a19dad5f8b7346d4f794ac712d2ca824baca000 |
|
MD5 | 9fcb45d4e3b5dd3116afd8ba5200f210 |
|
BLAKE2b-256 | 20ab918d3378da01308c8577830650ef294a25118f83ada66fd37fdf428506f3 |
Close
Hashes for general_sam-0.7.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | b3f1f726696dd0096e7660c7e057c96f8caba82ea46fe90122084bc3d2c604ff |
|
MD5 | c2414ad02c9cc79ff2ab742e95d3bef8 |
|
BLAKE2b-256 | 4bd6be866c410e13c347690a62efafd8a6aad33289ed3e86e4e3d956a175d653 |
Close
Hashes for general_sam-0.7.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 59bd88b36b8c80f9c3d84424144a5b37aa16fa4a42d7630ab84eeb2e85922509 |
|
MD5 | 91ce17550c0407700de5713e65db15f0 |
|
BLAKE2b-256 | eaca81059fa834fa014cf350aee9def16ab960873a572d24291b96d8f84e1ad8 |
Close
Hashes for general_sam-0.7.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 02941536249ce8a76923c79ecafb3940d4d4a94a2a78fdd326e13c709a8b8fcd |
|
MD5 | 096102f76f559036aa0824a9c73a7ec1 |
|
BLAKE2b-256 | 17a77e1b26667f0aa13b9dafb723e8cde31c36cc8430d70cafcb6461cf55ac3a |
Close
Hashes for general_sam-0.7.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 228b5a58d0e9cb572faceffa5a7f14b1de4c3326ecc3571f79218740b8298d5a |
|
MD5 | 45128524e89f3226c14ee09702628261 |
|
BLAKE2b-256 | 4a3c553c5c73fbecac0fb90b8c516c742b2e3699caba446b4bb1dd42cffc7dd3 |
Close
Hashes for general_sam-0.7.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 0a298f00b12849c46886208921cb687a11cf2bca22c418b71cbc84d0dac0dc13 |
|
MD5 | 5711186673d42bfb3f372d804a000f75 |
|
BLAKE2b-256 | 4d7b2e50f650c729c4741e91c3538dd57e11c90590b6ad12acdafc274521df7d |
Close
Hashes for general_sam-0.7.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 2b1ad18cca33047225c7bcbb2761570b3b2f0540986aade649aaf55c8a24192c |
|
MD5 | 9018ade9feaa589f86bd3afe8ac0ea19 |
|
BLAKE2b-256 | 9bef7dc5e968f2716238314a7815b4a0e1bf6afc3ba39de81d72fd2e8a461d96 |
Close
Hashes for general_sam-0.7.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 4661cf21ebc3cb878ce1c2616821382036b500e91e5994c28b0c4fc738d364c4 |
|
MD5 | de4f5280193b132bf0e61f074bb78d56 |
|
BLAKE2b-256 | edb8eb307a635b2e7f019eb257bc6a7f3a44496465029b74cc5bbb33e267c84d |
Close
Hashes for general_sam-0.7.0-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 7fd5ca9e6ab6f280216ce4510e73526db1b2df6020a75cfaace81940887205c0 |
|
MD5 | 58bb89c68c2ac4d909c0b6a335bcea74 |
|
BLAKE2b-256 | d83086c6f319b6b61ffd3ec262167963af96a33269daee56a64967a416ab0a6b |
Close
Hashes for general_sam-0.7.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | a808c8fb7871b8c6028817ef12f82f96ab0f49477ed3cdc70da08b979fece6c7 |
|
MD5 | c87f8195024e56516700402d4b563731 |
|
BLAKE2b-256 | dfea6eaed1286ce9b54f06f0fc482e7187cbf2ea73ba692763d8d2dce709f85f |
Close
Hashes for general_sam-0.7.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 504d8ff3ccd83a1d99cc1354b1a008d0494da96fb44700d0328cd652a6a84749 |
|
MD5 | a3b6e84fd57aa89a6bfaf9ea18a6d4c2 |
|
BLAKE2b-256 | 1c50e68f7bafd512928d10019a1f4c75696eba2a83b575c119b7b1f944449166 |
Close
Hashes for general_sam-0.7.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | c61ea2bac1439fc81d9d8f2747cf84e27b66e54af8bdb143d8eb090765b215af |
|
MD5 | 6f26b07ae4326999b6c006f1ca40e5d7 |
|
BLAKE2b-256 | c8039c49f227ddec8b80a84fe550a8f8a0ca4700dc6de9ddc579eb35e43084b3 |
Close
Hashes for general_sam-0.7.0-cp38-abi3-win_amd64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 4b51c73ea6428a960f5b8866425bded48ba5f22932f3e2ee986e31cd93ae78f7 |
|
MD5 | a2a9e3034e6542a5f3d6427969a1e542 |
|
BLAKE2b-256 | b32c187831a414ecff4945951022ae7cc4fd6e16b1456bf84554145ac39e8612 |
Close
Hashes for general_sam-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 1d5abb2946d2c191170d28865c6b81331796da171d3e2df533c09e35358da2d5 |
|
MD5 | d7f4771d315ccdb48b1a69dbac8b8e84 |
|
BLAKE2b-256 | b4fbaeda2499a5302e119ab974f615e0e9be56c8af1e898bacbe4ec8c494d952 |
Close
Hashes for general_sam-0.7.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 1069fa667cd060063d600dfb06f9870c53d0843f6fb50c1335ff1d6de3dfb91f |
|
MD5 | e87621171d2eb4a4e2dcfb2a9d6f9a46 |
|
BLAKE2b-256 | 22228db602ce4ce5b5f8bd17bd944dbc10cd137fff02fe954d736379521156ea |
Close
Hashes for general_sam-0.7.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | b9db7687833017070324633a57fd00993fe4f9012633ffa32260c2e17d9e2f21 |
|
MD5 | 7542dd7f28b1f414a5d8d7ecb63213be |
|
BLAKE2b-256 | fd7c74f82af7eebebf3b95d92ff6e5393e4c33329ffb8ced9a792020a8b1ea9d |
Close
Hashes for general_sam-0.7.0-cp38-abi3-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 19b36e869532fd867cce377904ca755a95d200b22ead089901a507853e71ed96 |
|
MD5 | f7a254bc1b9ce322ddeac5a90a3faaad |
|
BLAKE2b-256 | 1fe93a43cc31b8de6a24a434fed0000e097340d970122eb429e144a46dab66a6 |
Close
Hashes for general_sam-0.7.0-cp38-abi3-macosx_10_12_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 24e544cb7dc1f8699812b5dad69b764724d6716cc2bebb8027f6b8622ff8f948 |
|
MD5 | 1bea0b95ffdf5faf2e1ec1b4c779071a |
|
BLAKE2b-256 | 9a66fa91279f784f3b512cd9474ef466aa388389c769f60b198ce0083cdeafaf |