Python bindings for general-sam and some utilities
Project description
general-sam-py
Python bindings for general-sam and some utilities.
flowchart LR
init((ε))
a((a))
b((b))
ab((ab))
bc(((bc)))
abc((abc))
abcb((abcb))
abcbc(((abcbc)))
init -- a --> a
init -- b --> b
a -- b --> ab
b -- c --> bc
init -- c --> bc
ab -- c --> abc
bc -- b --> abcb
abc -- b --> abcb
abcb -- c --> abcbc
The suffix automaton of abcbc.
Installation
pip install general-sam
Usage
GeneralSAM
from general_sam import GeneralSAM
# Build the suffix automaton of the byte string b'abcbc'.
sam = GeneralSAM.construct_from_bytes(b'abcbc')

# Each query is walked from a fresh root state and the resulting state is
# checked for acceptance:
#   b'cbc' is a suffix of b'abcbc'      -> accepting
#   b'bcb' isn't a suffix of b'abcbc'   -> not accepting
for query, is_suffix in ((b'cbc', True), (b'bcb', False)):
    cursor = sam.get_root_state()
    cursor.feed_bytes(query)
    assert bool(cursor.is_accepting()) == is_suffix
from general_sam import GeneralSAM
# Build the suffix automaton of the string 'abcbc'.
sam = GeneralSAM.construct_from_chars('abcbc')

# A single state is advanced chunk by chunk; after each chunk we check
# whether everything fed so far is a suffix of 'abcbc'.
cursor = sam.get_root_state()
walk = (
    ('b', False),   # "b" is not a suffix but a substring.
    ('c', True),    # "bc" is a suffix.
    ('bc', True),   # "bcbc" is also a suffix.
    ('bc', False),  # "bcbcbc" is not a substring.
)
for chunk, is_suffix in walk:
    cursor.feed_chars(chunk)
    assert bool(cursor.is_accepting()) == is_suffix

# "bcbcbc" is not a substring of 'abcbc', so the state is now nil.
assert cursor.is_nil()
from general_sam import GeneralSAM, GeneralSAMState, construct_trie_from_chars
# Build a trie over two words, then a general suffix automaton from the trie.
trie, _ = construct_trie_from_chars(['hello', 'Chielo'])
sam = GeneralSAM.construct_from_trie(trie)


def fetch_state(s: str) -> GeneralSAMState:
    """Return the state reached by feeding the characters of ``s`` from the root."""
    state = sam.get_root_state()
    state.feed_chars(s)
    return state


# 'lo', 'ello' and 'elo' are each a suffix of 'hello' or 'Chielo'.
assert fetch_state('lo').is_accepting()
assert fetch_state('ello').is_accepting()
assert fetch_state('elo').is_accepting()

# 'el' occurs inside 'hello' but is not a suffix of either word:
# the state is neither accepting nor nil.
state = fetch_state('el')
assert not state.is_accepting() and not state.is_nil()

# 'bye' does not occur in either word at all: the state is nil.
state = fetch_state('bye')
assert not state.is_accepting() and state.is_nil()
VocabPrefixAutomaton
from general_sam import VocabPrefixAutomaton, CountInfo
# Demonstrates VocabPrefixAutomaton: text is fed piece by piece from the END
# towards the beginning (each piece is *prepended*), and after every step the
# automaton reports which vocabulary entries start with the text fed so far.
vocab = ['歌曲', '聆听歌曲', '播放歌曲', '歌词', '查看歌词']
automaton = VocabPrefixAutomaton(vocab, bytes_or_chars='chars')

# NOTE: CountInfo instances are actually related to the sorted `vocab`,
# spelled out here for reference only (the value is discarded):
_ = ['播放歌曲', '查看歌词', '歌曲', '歌词', '聆听歌曲']
# NOTE(review): in the assertions below, str_cnt equals the number of matches
# and [tot_cnt_lower, tot_cnt_upper) lines up with their positions in the
# sorted list above -- inferred from these examples; confirm against the
# library documentation.

# Case 1:
# 一起 | 聆 | 听 | 歌
# (the pieces are prepended right to left: '歌', '听', '聆', '一起')
state = automaton.get_root_state()

# Prepend '歌': the text so far is '歌'.
cnt_info = automaton.prepend_feed(state, '歌')
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=2, tot_cnt_lower=2, tot_cnt_upper=4
)

# Found '歌曲' at index 0 and '歌词' at index 3 of the original `vocab`,
# both prefixed with '歌'.
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {0, 3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {'歌曲', '歌词'}

# Prepend '听': the text so far is '听歌'.
cnt_info = automaton.prepend_feed(state, '听')
# Nothing in `vocab` is prefixed with '听歌', so no CountInfo is returned;
# the state is nevertheless not nil and can still be extended.
assert cnt_info is None
assert not state.is_nil()

# Prepend '聆': the text so far is '聆听歌'.
cnt_info = automaton.prepend_feed(state, '聆')
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=4, tot_cnt_upper=5
)

# Found '聆听歌曲' at index 1 of the original `vocab`, prefixed with '聆听歌'.
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {1}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {'聆听歌曲'}

# Before prepending '一起' the state is still not nil.
assert not state.is_nil()
# Prepend '一起': the text so far is '一起聆听歌'. Nothing in `vocab` is
# prefixed with it, and this time the state becomes nil.
cnt_info = automaton.prepend_feed(state, '一起')
assert state.is_nil()

# Case 2:
# 来 | 查看 | 歌词
# (the pieces are prepended right to left: '歌词', '查看', '来')
state = automaton.get_root_state()

# Prepend '歌词': the text so far is '歌词'.
cnt_info = automaton.prepend_feed(state, '歌词')
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=3, tot_cnt_upper=4
)

# Found '歌词' at index 3 of the original `vocab`, prefixed with '歌词'.
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {'歌词'}

# Prepend '查看': the text so far is '查看歌词'.
cnt_info = automaton.prepend_feed(state, '查看')
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=1, tot_cnt_upper=2
)

# Found '查看歌词' at index 4 of the original `vocab`, prefixed with '查看歌词'.
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {4}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {'查看歌词'}

# Before prepending '来' the state is still not nil.
assert not state.is_nil()
# Prepend '来': the text so far is '来查看歌词'. Nothing in `vocab` is
# prefixed with it, and the state becomes nil.
cnt_info = automaton.prepend_feed(state, '来')
assert state.is_nil()
License
- © 2023 Chielo Newctle <ChieloNewctle@gmail.com>
- © 2023 ModelTC Team
This project is licensed under either of
- Apache License, Version 2.0
- MIT license

at your option.
The SPDX license identifier for this project is MIT OR Apache-2.0.
Project details
Release history | Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
general_sam-0.2.0.tar.gz (17.8 kB, view hashes)
Built Distributions
general_sam-0.2.0-cp38-abi3-win32.whl (175.2 kB, view hashes)
Close
Hashes for general_sam-0.2.0-cp38-abi3-win_amd64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | ae03a336ccdc6e522fab0204d28a466e8d8cca20231c02a194bdb325ab7c47ec | |
MD5 | 2f9c60fb93afc1a74be6158e034f19c3 | |
BLAKE2b-256 | 8ddafb111226f04cc8923193ef2b44cfd26d8a1bbc4c5a73aacb3cae072f1c94 | |
Close
Hashes for general_sam-0.2.0-cp38-abi3-win32.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 3e3b1f2d1b6aa0a7bb006e107f4903ca0ba51f770df63d7f41507748ffe413f8 | |
MD5 | 3033dee8733ba1b674a097b5ac2c44a2 | |
BLAKE2b-256 | 1c46a6f1ce56eeffc0b35a2087d0232b37e8e35e77aaf0191a28323707130cc6 | |
Close
Hashes for general_sam-0.2.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | aeab5a6938b15f66dcdc80e6023a67348af7a33c89aec48c707254066d8276ff | |
MD5 | 38f392989a911cad9b93f38ead964390 | |
BLAKE2b-256 | 0f45c94051dcfe69e60053e66f725b64ee66f7ce05cc2bf21ae1de619a41852c | |
Close
Hashes for general_sam-0.2.0-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 2c7f4b53d2266707b9d2eee6fc9f7360f40a476b392292a982ee87930e990d9a | |
MD5 | 2579aaff5a3a3e8db6e3ec7fcda0ba06 | |
BLAKE2b-256 | 93efd20eb5520d7735a5e8a4bc35da5aeb87b6a9f970e406dfb0f80bc0c35eb5 | |
Close
Hashes for general_sam-0.2.0-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 0180e677504685cb9bd0933b3b0f4a72e2fc9862ffda73ffaacf9dc16d99dd73 | |
MD5 | 813e9b17f2c5e2994e4c611f01d88dc1 | |
BLAKE2b-256 | 5bf4362d30074299c080613b1342039bacab6b13154eab2edb593ccff4fd81de | |
Close
Hashes for general_sam-0.2.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | f485f41284ce60ca9ed70dcf1ffb26ebb3ac3b6313d4b8fdbcedcfb61f697844 | |
MD5 | 2821fbe40f7b11d9588dd93f2321ae66 | |
BLAKE2b-256 | 53934afae68d69d6c4e0fa72f1bd5dc620624fe40fa2a470cae7e06ba902659c | |
Close
Hashes for general_sam-0.2.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 1fdf85f4bfaeffd591f2b13a0ca7f7d08e34e5a39c238fa033fe105c2f4ba156 | |
MD5 | e681caa07520afb59fe0be0d7b292949 | |
BLAKE2b-256 | 6994e04011acf7c6e6f6fe530f4dae907790fa3e47c34eaa65ae087afcebafe2 | |
Close
Hashes for general_sam-0.2.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | f549631e0d55fa73640a012adf3032711035fae21bd7c0e3b5b3d82815ffaa07 | |
MD5 | 0c95d047a63b22fd673351df217beeeb | |
BLAKE2b-256 | df31556c044874bd42bedabdf5379d5c26f91e2cbcd89d202b36b5f837cea35e | |
Close
Hashes for general_sam-0.2.0-cp38-abi3-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 3167acca71cbbea6c057b3366ec7f25e6a4f874bcdef0ef49afec7df449d71e1 | |
MD5 | 89f10b844ee7c57973afd15cb2012002 | |
BLAKE2b-256 | 52663cd1afa976e08d21eff2b24ba7d7bf01c3221ea91f1f3cdd2c8e6cadb1fc | |
Close
Hashes for general_sam-0.2.0-cp38-abi3-macosx_10_7_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 5e8b1bd090f877385cdfe6bebac7b273c14266f8bcea3a8f2444283fa2b04ce4 | |
MD5 | 217538ece47368b87020b90e973595a1 | |
BLAKE2b-256 | 0c5ad3a4625ffa105891df5e11c3d1e517ab32aa456bfe7fcc69c8cd9fe3360e | |