Python bindings for general-sam and some utilities
Project description
general-sam-py
Python bindings for general-sam and some utilities.
flowchart LR
init((ε))
a((a))
b((b))
ab((ab))
bc(((bc)))
abc((abc))
abcb((abcb))
abcbc(((abcbc)))
init -- a --> a
init -- b --> b
a -- b --> ab
b -- c --> bc
init -- c --> bc
ab -- c --> abc
bc -- b --> abcb
abc -- b --> abcb
abcb -- c --> abcbc
The suffix automaton of "abcbc". Double-circled nodes are accepting states, i.e. states reached by reading a suffix of "abcbc".
Installation
pip install general-sam
Usage
GeneralSAM
from general_sam import GeneralSAM

# Build the suffix automaton of the byte string b"abcbc".
sam = GeneralSAM.construct_from_bytes(b'abcbc')

# Feeding "cbc" from the root ends in an accepting state,
# because "cbc" is a suffix of "abcbc".
suffix_state = sam.get_root_state()
suffix_state.feed_bytes(b'cbc')
assert suffix_state.is_accepting()

# Feeding "bcb" from a fresh root state does not,
# because "bcb" is not a suffix of "abcbc".
non_suffix_state = sam.get_root_state()
non_suffix_state.feed_bytes(b'bcb')
assert not non_suffix_state.is_accepting()
from general_sam import GeneralSAM

sam = GeneralSAM.construct_from_chars('abcbc')
cursor = sam.get_root_state()

# Fed so far: "b" -- a substring of "abcbc", but not a suffix.
cursor.feed_chars('b')
assert not cursor.is_accepting()

# Fed so far: "bc" -- a suffix of "abcbc".
cursor.feed_chars('c')
assert cursor.is_accepting()

# Fed so far: "bcbc" -- still a suffix of "abcbc".
cursor.feed_chars('bc')
assert cursor.is_accepting()

# Fed so far: "bcbcbc" -- not even a substring of "abcbc",
# so the state is neither accepting nor non-nil any more.
cursor.feed_chars('bc')
assert not cursor.is_accepting()
assert cursor.is_nil()
from general_sam import GeneralSAM, GeneralSAMState, construct_trie_from_chars

# Build one automaton over several strings by going through a trie first.
# The second value returned by `construct_trie_from_chars` is not needed here.
trie, _ = construct_trie_from_chars(['hello', 'Chielo'])
sam = GeneralSAM.construct_from_trie(trie)


def fetch_state(s: str) -> GeneralSAMState:
    """Return the state reached by feeding the characters of `s` from the root."""
    state = sam.get_root_state()
    state.feed_chars(s)
    return state


# "lo" is a suffix of both words, "ello" of "hello", and "elo" of "Chielo".
assert fetch_state('lo').is_accepting()
assert fetch_state('ello').is_accepting()
assert fetch_state('elo').is_accepting()

# "el" occurs inside both words but is a suffix of neither.
state = fetch_state('el')
assert not state.is_accepting() and not state.is_nil()

# "bye" does not occur in either word at all.
state = fetch_state('bye')
assert not state.is_accepting() and state.is_nil()
VocabPrefixAutomaton
from general_sam import VocabPrefixAutomaton, CountInfo

vocab = ['歌曲', '聆听歌曲', '播放歌曲', '歌词', '查看歌词']
automaton = VocabPrefixAutomaton(vocab, bytes_or_chars='chars')

# NOTE: the counts in a CountInfo refer to positions in the *sorted* `vocab`,
# which is listed here for reference only:
_ = ['播放歌曲', '查看歌词', '歌曲', '歌词', '聆听歌曲']

# Case 1:
# 一起 | 聆 | 听 | 歌   (pieces are prepended from right to left)
cursor = automaton.get_root_state()

# Prepend '歌'.
info = automaton.prepend_feed(cursor, '歌')
assert info is not None
assert info == CountInfo(str_cnt=2, tot_cnt_lower=2, tot_cnt_upper=4)
# Two entries are prefixed with '歌': '歌曲' (index 0) and '歌词' (index 3).
hit_indices = automaton.get_order_slice(info)
assert frozenset(hit_indices) == {0, 3}
hit_words = [vocab[idx] for idx in hit_indices]
assert frozenset(hit_words) == {'歌曲', '歌词'}

# Prepend '听'.
info = automaton.prepend_feed(cursor, '听')
# No entry is prefixed with '听歌', yet the state itself is still alive.
assert info is None
assert not cursor.is_nil()

# Prepend '聆'.
info = automaton.prepend_feed(cursor, '聆')
assert info is not None
assert info == CountInfo(str_cnt=1, tot_cnt_lower=4, tot_cnt_upper=5)
# One entry is prefixed with '聆听歌': '聆听歌曲' (index 1).
hit_indices = automaton.get_order_slice(info)
assert frozenset(hit_indices) == {1}
hit_words = [vocab[idx] for idx in hit_indices]
assert frozenset(hit_words) == {'聆听歌曲'}

# Prepend '一起'.
# The state is alive before the call and nil afterwards,
# since nothing is prefixed with '一起聆听歌'.
assert not cursor.is_nil()
info = automaton.prepend_feed(cursor, '一起')
assert cursor.is_nil()

# Case 2:
# 来 | 查看 | 歌词   (pieces are prepended from right to left)
cursor = automaton.get_root_state()

# Prepend '歌词'.
info = automaton.prepend_feed(cursor, '歌词')
assert info is not None
assert info == CountInfo(str_cnt=1, tot_cnt_lower=3, tot_cnt_upper=4)
# One entry is prefixed with '歌词': '歌词' (index 3).
hit_indices = automaton.get_order_slice(info)
assert frozenset(hit_indices) == {3}
hit_words = [vocab[idx] for idx in hit_indices]
assert frozenset(hit_words) == {'歌词'}

# Prepend '查看'.
info = automaton.prepend_feed(cursor, '查看')
assert info is not None
assert info == CountInfo(str_cnt=1, tot_cnt_lower=1, tot_cnt_upper=2)
# One entry is prefixed with '查看歌词': '查看歌词' (index 4).
hit_indices = automaton.get_order_slice(info)
assert frozenset(hit_indices) == {4}
hit_words = [vocab[idx] for idx in hit_indices]
assert frozenset(hit_words) == {'查看歌词'}

# Prepend '来'.
# The state is alive before the call and nil afterwards,
# since nothing is prefixed with '来查看歌词'.
assert not cursor.is_nil()
info = automaton.prepend_feed(cursor, '来')
assert cursor.is_nil()
License
- © 2023 Chielo Newctle <ChieloNewctle@gmail.com>
- © 2023 ModelTC Team
This project is licensed under either of
- the MIT license, or
- the Apache License, Version 2.0,
at your option.
The SPDX license identifier for this project is MIT OR Apache-2.0.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
general_sam-0.4.2.tar.gz
(18.4 kB
view hashes)
Built Distributions
general_sam-0.4.2-cp38-abi3-win32.whl
(174.0 kB
view hashes)
Close
Hashes for general_sam-0.4.2-cp38-abi3-win_amd64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 4db9704db1b376329acf5b21c43faee539fb1a93ec31d83ff4304cd559af72c8 |
|
MD5 | 5f529922201df4dd933e96df1ce344e9 |
|
BLAKE2b-256 | 87e503dd3a02a5ecba707a991ec719accf8578322c0901ce520696c990458dbc |
Close
Hashes for general_sam-0.4.2-cp38-abi3-win32.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 5caa79cbc5d6b618df8d58194528b1fe6a73d32a0355a71f90e3981573a31c25 |
|
MD5 | 1cc28015e7bcb01e2ac605c717e115d7 |
|
BLAKE2b-256 | 6a25830376fde6c792df479d1dd63c2743cde62b1c7991dd91f0b22e672b34e5 |
Close
Hashes for general_sam-0.4.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | e7ca60fe59c9d629c94dca1e58d532a14096788343128f3c7256f0d3c0d9b262 |
|
MD5 | b34027266886eaa6c57281bcffc5d55d |
|
BLAKE2b-256 | a206f26a9542fc556eaddb17998963fd5635203bfd550b185d28fa7853e670fa |
Close
Hashes for general_sam-0.4.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 382f2ec6badfc3e07099adc1173bab6b8840f784039c3ff8ad19b39b1921a047 |
|
MD5 | fa9fa73bd388bce9b3d258d43b55dad3 |
|
BLAKE2b-256 | 7dfea6e510e5002a93bdb81577f1480cec7f620caeb0126d2afcb1275afea48a |
Close
Hashes for general_sam-0.4.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | cc642d1e69f78798b7056e8bb203bb9ad6d2d8ee5ddfd924bb79a569e049eef8 |
|
MD5 | 884383ef270c4bd0d1fb40bae8696d2f |
|
BLAKE2b-256 | c42f766f5e1ba8edf9978e99c7ad5e59779ebd258883218304544900110f692a |
Close
Hashes for general_sam-0.4.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 91aa0a912f0803e7b3f3607f5a6a192838255b08dbfa1e72c7eb9b632c88bf49 |
|
MD5 | 3bf8848f4662f783b419cc20290d2842 |
|
BLAKE2b-256 | e90afb05bcc7dfb313d7a0d50dd28b30ea52a25a049a6e3a2026c21a8bd6eb1d |
Close
Hashes for general_sam-0.4.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 4ee8c39eab180045b03904a671054890d39ed633968345289d5f6642b8cfcc19 |
|
MD5 | ecc9186a98c6be6dfbe462557b0a71a4 |
|
BLAKE2b-256 | 47e8e7cc15e7739223f1e4fa1d40538923e648be2e8d53ac87aad71d76402018 |
Close
Hashes for general_sam-0.4.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 7fe5d3529d99703f6aacf8bfc2ff602e75f347a618971896fe439e889e49ecc1 |
|
MD5 | 4b434cef21fbae6976ce6d956476049d |
|
BLAKE2b-256 | 6d2b33ab31152e99ff74d59292d8cb666e99cc154fab538406cb66acfb834580 |
Close
Hashes for general_sam-0.4.2-cp38-abi3-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 76c03cc318b8d26bb2479108cb9d936c5ea772db97126b1253c3e39402939633 |
|
MD5 | 4f83f367f59d51110a05ee8514f4b8b0 |
|
BLAKE2b-256 | c3d7e18bcc1d850d7e1bb78ae45c95972982397ed498944aa3b4ceb6dd9340a1 |
Close
Hashes for general_sam-0.4.2-cp38-abi3-macosx_10_7_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 623190c82458bb69635e5967f4bc119ff034805d10d84b41c54def1903dffd94 |
|
MD5 | a789b74b5904b9cdff378a332154e991 |
|
BLAKE2b-256 | 6f27975cf22fde31663f0e4be6d4d6ee55f891514d369796685a83db58c39b05 |