Python bindings for general-sam and some utilities
Project description
general-sam-py
Python bindings for general-sam and some utilities.
flowchart LR
init((ε))
a((a))
b((b))
ab((ab))
bc(((bc)))
abc((abc))
abcb((abcb))
abcbc(((abcbc)))
init -- a --> a
init -- b --> b
a -- b --> ab
b -- c --> bc
init -- c --> bc
ab -- c --> abc
bc -- b --> abcb
abc -- b --> abcb
abcb -- c --> abcbc
The suffix automaton of abcbc.
Installation
pip install general-sam
Usage
GeneralSAM
from general_sam import GeneralSAM
sam = GeneralSAM.construct_from_bytes(b'abcbc')
# "cbc" is a suffix of "abcbc"
state = sam.get_root_state()
state.feed_bytes(b'cbc')
assert state.is_accepting()
# "bcb" is not a suffix of "abcbc"
state = sam.get_root_state()
state.feed_bytes(b'bcb')
assert not state.is_accepting()
from general_sam import GeneralSAM
sam = GeneralSAM.construct_from_chars('abcbc')
state = sam.get_root_state()
# "b" is not a suffix but at least a substring of "abcbc"
state.feed_chars('b')
assert not state.is_accepting()
# "bc" is a suffix of "abcbc"
state.feed_chars('c')
assert state.is_accepting()
# "bcbc" is a suffix of "abcbc"
state.feed_chars('bc')
assert state.is_accepting()
# "bcbcbc" is not a substring, much less a suffix of "abcbc"
state.feed_chars('bc')
assert not state.is_accepting() and state.is_nil()
from general_sam import GeneralSAM, GeneralSAMState, construct_trie_from_chars
trie, _ = construct_trie_from_chars(['hello', 'Chielo'])
sam = GeneralSAM.construct_from_trie(trie)
def fetch_state(s: str) -> GeneralSAMState:
    state = sam.get_root_state()
    state.feed_chars(s)
    return state
assert fetch_state('lo').is_accepting()
assert fetch_state('ello').is_accepting()
assert fetch_state('elo').is_accepting()
state = fetch_state('el')
assert not state.is_accepting() and not state.is_nil()
state = fetch_state('bye')
assert not state.is_accepting() and state.is_nil()
VocabPrefixAutomaton
from general_sam import VocabPrefixAutomaton, CountInfo
vocab = ['歌曲', '聆听歌曲', '播放歌曲', '歌词', '查看歌词']
automaton = VocabPrefixAutomaton(vocab, bytes_or_chars='chars')
# NOTE: the half-open range [tot_cnt_lower, tot_cnt_upper) of a CountInfo indexes into the sorted `vocab`:
_ = ['播放歌曲', '查看歌词', '歌曲', '歌词', '聆听歌曲']
# Case 1:
# 一起 | 聆 | 听 | 歌
state = automaton.get_root_state()
# prepend '歌'
cnt_info = automaton.prepend_feed(state, '歌')
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=2, tot_cnt_lower=2, tot_cnt_upper=4
)
# found '歌曲' at the index 0 and '歌词' at the index 3 prefixed with '歌'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {0, 3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {'歌曲', '歌词'}
# prepend 听
cnt_info = automaton.prepend_feed(state, '听')
# found nothing prefixed with '听歌'
assert cnt_info is None
assert not state.is_nil()
# prepend 聆
cnt_info = automaton.prepend_feed(state, '聆')
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=4, tot_cnt_upper=5
)
# found '聆听歌曲' at the index 1 prefixed with '聆听歌'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {1}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {'聆听歌曲'}
# prepend 一起
assert not state.is_nil()
# found nothing prefixed with '一起聆听歌'
cnt_info = automaton.prepend_feed(state, '一起')
assert state.is_nil()
# Case 2:
# 来 | 查看 | 歌词
state = automaton.get_root_state()
# prepend 歌词
cnt_info = automaton.prepend_feed(state, '歌词')
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=3, tot_cnt_upper=4
)
# found '歌词' at the index 3 prefixed with '歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {'歌词'}
# prepend 查看
cnt_info = automaton.prepend_feed(state, '查看')
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=1, tot_cnt_upper=2
)
# found '查看歌词' at the index 4 prefixed with '查看歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {4}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {'查看歌词'}
# prepend 来
assert not state.is_nil()
# found nothing prefixed with '来查看歌词'
cnt_info = automaton.prepend_feed(state, '来')
assert state.is_nil()
License
- © 2023 Chielo Newctle <ChieloNewctle@gmail.com>
- © 2023 ModelTC Team
This project is licensed under either of
- Apache License, Version 2.0
- MIT license
at your option.
The SPDX license identifier for this project is MIT OR Apache-2.0.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
general_sam-0.3.0.tar.gz
(18.2 kB
view hashes)
Built Distributions
general_sam-0.3.0-cp38-abi3-win32.whl
(175.5 kB
view hashes)
Close
Hashes for general_sam-0.3.0-cp38-abi3-win_amd64.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | dbf4fa9ed7a0a23ed8e5d0d2614b4607e589a919ff3b728690a15e510b9726d4 |
| MD5 | da19c4b24f0fd2d979dae5df5151d6ed |
| BLAKE2b-256 | 44a6d90c98ad9ab4d36cdf4136eba58025e8b1e7c22f5c8413bbc0b74c36f0be |
Close
Hashes for general_sam-0.3.0-cp38-abi3-win32.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | 30f27594e6ee5f02bb098e3450e61d3a959ab3a94c4c9a42c3ca7fce1fab6d74 |
| MD5 | 1e81931ade20699177e277bd7c4dbe12 |
| BLAKE2b-256 | de3f55cf6149d4b146e6534e1bc63fb62284d0dcaa1f44c6f75dc510df5eb695 |
Close
Hashes for general_sam-0.3.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | f6f3c6e20e39dc4127c6c2e02780c9d6b93f8ac075436125688c6744c39c1868 |
| MD5 | 489485022dce27ab3e5fd3fdb5ed54c8 |
| BLAKE2b-256 | ef4e23757bbae78d8791e440cb5f5b8aefe29a4b10f55d453eec6d81d4563ae9 |
Close
Hashes for general_sam-0.3.0-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | c4885309bdc374f8fea2b5a9cebdbdde256968cad6f0e7c66f71e92c5190f41a |
| MD5 | dc8f2e236d0722b8d07e199442e05db2 |
| BLAKE2b-256 | e54e58909dd566ca0dc3c544f31325202bbb0424af95ae1937a2d2305f6204ba |
Close
Hashes for general_sam-0.3.0-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | fbcb117a37651e02e364e0edfff68d3b3218563b64f8357a2b6e4174debb4ee0 |
| MD5 | 30f752d504e80bfd262ff3bc827c4eef |
| BLAKE2b-256 | 26023e4e4c3c0852d9e2cdcc12cf31cb6edf9c98d59d953148a4319bcadf6e93 |
Close
Hashes for general_sam-0.3.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | 7fdc930d3388cfe769ca602f2a5b57f047e0953078b0feacb938ccb6dcc7c767 |
| MD5 | 4cc8e2d33c5f009ffb8526b873d1c7ca |
| BLAKE2b-256 | 2d8127acad87f7332d2b617804cbe543c858fe2aacea2ad90551b1d509397afb |
Close
Hashes for general_sam-0.3.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | 0a949c0484d04470cf089f2115002b11cd9f131e7031677410200a64e234e543 |
| MD5 | 00278d39b4ca53f7f179180f40cd9e6d |
| BLAKE2b-256 | 76dad106658ca16391bee09e68f99e3ed62575b3e9742beadaa4dc5151d96215 |
Close
Hashes for general_sam-0.3.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | c6804b241941fa01f609019e11ee36277201da67d4a08b27847c839a8eab5eca |
| MD5 | 01212d10d8942d1ee8a4e218144a4cf3 |
| BLAKE2b-256 | dc9c2a6cf02a6b563dea4099af41dc6a10254cee0d7078c9ee1d428efe30bfee |
Close
Hashes for general_sam-0.3.0-cp38-abi3-macosx_11_0_arm64.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | 9300f67d8d3635529767439217ba58e3a222ec14fb5f2af20c2dd1f10f6733ac |
| MD5 | 5b51ffe60d2b68102365c1853bb5c28b |
| BLAKE2b-256 | 82ce7a621dcffad1ef1336863c39bb66341c28aff7d869e20553bfaafb785db1 |
Close
Hashes for general_sam-0.3.0-cp38-abi3-macosx_10_7_x86_64.whl
| Algorithm | Hash digest |
|---|---|
| SHA256 | e041cd38e09f105bbe5498f9c46fd4bb121ff866a94e128b5b84124e71712bb3 |
| MD5 | 38cbbb21e7ff0eb9b421f20dc5cb76e0 |
| BLAKE2b-256 | c21da7783cac476d11fd6201fbba5237d9e0f22b3f7e6a9cdec68f8a96381c45 |