Python bindings for general-sam and some utilities
Project description
general-sam-py
Python bindings for general-sam and some utilities.
flowchart LR
init((ε))
a((a))
b((b))
ab((ab))
bc(((bc)))
abc((abc))
abcb((abcb))
abcbc(((abcbc)))
init -- a --> a
init -- b --> b
a -- b --> ab
b -- c --> bc
init -- c --> bc
ab -- c --> abc
bc -- b --> abcb
abc -- b --> abcb
abcb -- c --> abcbc
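The suffix automaton of "abcbc". Double-circled states (bc and abcbc) are accepting, i.e. the strings that reach them are suffixes of "abcbc".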
Installation
pip install general-sam
Usage
GeneralSAM
from general_sam import GeneralSAM
sam = GeneralSAM.from_bytes(b"abcbc")
# "cbc" is a suffix of "abcbc"
state = sam.get_root_state()
state.feed_bytes(b"cbc")
assert state.is_accepting()
# "bcb" is not a suffix of "abcbc"
state = sam.get_root_state()
state.feed_bytes(b"bcb")
assert not state.is_accepting()
from general_sam import GeneralSAM
sam = GeneralSAM.from_chars("abcbc")
state = sam.get_root_state()
# "b" is not a suffix but at least a substring of "abcbc"
state.feed_chars("b")
assert not state.is_accepting()
# "bc" is a suffix of "abcbc"
state.feed_chars("c")
assert state.is_accepting()
# "bcbc" is a suffix of "abcbc"
state.feed_chars("bc")
assert state.is_accepting()
# "bcbcbc" is not a substring, much less a suffix of "abcbc"
state.feed_chars("bc")
assert not state.is_accepting() and state.is_nil()
from general_sam import GeneralSAM, GeneralSAMState, build_trie_from_chars
trie, _ = build_trie_from_chars(["hello", "Chielo"])
sam = GeneralSAM.from_trie(trie)
def fetch_state(s: str) -> GeneralSAMState:
    state = sam.get_root_state()
    state.feed_chars(s)
    return state
assert fetch_state("lo").is_accepting()
assert fetch_state("ello").is_accepting()
assert fetch_state("elo").is_accepting()
state = fetch_state("el")
assert not state.is_accepting() and not state.is_nil()
state = fetch_state("bye")
assert not state.is_accepting() and state.is_nil()
VocabPrefixAutomaton
from general_sam import CountInfo, VocabPrefixAutomaton
vocab = ["歌曲", "聆听歌曲", "播放歌曲", "歌词", "查看歌词"]
automaton = VocabPrefixAutomaton(vocab, bytes_or_chars="chars")
# NOTE: the range [tot_cnt_lower, tot_cnt_upper) of a CountInfo indexes into
# the sorted `vocab` shown below, not into the original `vocab` order:
_ = ["播放歌曲", "查看歌词", "歌曲", "歌词", "聆听歌曲"]
# Case 1:
# 一起 | 聆 | 听 | 歌
state = automaton.get_root_state()
# prepend '歌'
cnt_info = automaton.prepend_feed(state, "歌")
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=2, tot_cnt_lower=2, tot_cnt_upper=4
)
# found '歌曲' at index 0 and '歌词' at index 3, both prefixed with '歌'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {0, 3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"歌曲", "歌词"}
# prepend 听
cnt_info = automaton.prepend_feed(state, "听")
# found nothing prefixed with '听歌'
assert cnt_info is None
assert not state.is_nil()
# prepend 聆
cnt_info = automaton.prepend_feed(state, "聆")
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=4, tot_cnt_upper=5
)
# found '聆听歌曲' at the index 1 prefixed with '聆听歌'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {1}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"聆听歌曲"}
# prepend 一起
assert not state.is_nil()
# found nothing prefixed with '一起聆听歌'
cnt_info = automaton.prepend_feed(state, "一起")
assert state.is_nil()
# Case 2:
# 来 | 查看 | 歌词
state = automaton.get_root_state()
# prepend 歌词
cnt_info = automaton.prepend_feed(state, "歌词")
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=3, tot_cnt_upper=4
)
# found '歌词' at the index 3 prefixed with '歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"歌词"}
# prepend 查看
cnt_info = automaton.prepend_feed(state, "查看")
assert cnt_info is not None and cnt_info == CountInfo(
str_cnt=1, tot_cnt_lower=1, tot_cnt_upper=2
)
# found '查看歌词' at the index 4 prefixed with '查看歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {4}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"查看歌词"}
# prepend 来
assert not state.is_nil()
# found nothing prefixed with '来查看歌词'
cnt_info = automaton.prepend_feed(state, "来")
assert state.is_nil()
GreedyTokenizer
from general_sam import GeneralSAM, GreedyTokenizer, build_trie_from_chars
vocab = ["a", "ab", "b", "bc", "c", "d", "e", "f", "cd", "abcde"]
trie, token_to_trie_node = build_trie_from_chars(vocab)
trie_node_to_token = [-1] * trie.num_of_nodes()
for i, j in enumerate(token_to_trie_node):
    trie_node_to_token[j] = i
sam = GeneralSAM.from_trie(trie)
tokenizer = GreedyTokenizer.from_sam_and_trie(sam, trie)
def tokenize(s: str):
    return [(trie_node_to_token[i], j) for i, j in tokenizer.tokenize_str(s)]
assert tokenize("abcde") == [(9, 5)]
assert tokenize("abcdf") == [(1, 2), (8, 2), (7, 1)]
assert tokenize("abca") == [(1, 2), (4, 1), (0, 1)]
License
- © 2023 Chielo Newctle <ChieloNewctle@gmail.com>
- © 2023 ModelTC Team
This project is licensed under either of
- the Apache License, Version 2.0, or
- the MIT license,
at your option.
The SPDX license identifier for this project is MIT OR Apache-2.0.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
general_sam-0.7.0.post0.tar.gz
(21.6 kB
view hashes)
Built Distributions
Close
Hashes for general_sam-0.7.0.post0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 0998e42fd606f65d7fde561bf4bb3c71a04584ecb6971aff0c20b3f82d64272a |
|
MD5 | 232ffd68ce4d03fd02d76c37f731c225 |
|
BLAKE2b-256 | 76e766e7a75cb0bfdb8a7d2535cb32e1ddaa85ff71493c3ff66da4697e6a0200 |
Close
Hashes for general_sam-0.7.0.post0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | d77cbe45bbe4001b137626166cf3e430f6add1d74e7044eea51bb0724f6a6e8f |
|
MD5 | ec83015bf9ba400dbfa522eaa62a243b |
|
BLAKE2b-256 | 789c42f010be35a8ae5fdb567ce26aa0ccc5699acf4152747fa2eff966e8e3e6 |
Close
Hashes for general_sam-0.7.0.post0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 1ed0c46f3761a25cd2416de04e3150f61d4243a13074cc6b5a1f1b8d31d57f02 |
|
MD5 | 3e6a775245380c6da855ce2382cbe792 |
|
BLAKE2b-256 | 942d036f66858306b8bf83b504107663beb4d4cbbf25c450086c2c6b374b0836 |
Close
Hashes for general_sam-0.7.0.post0-pp310-pypy310_pp73-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | b87c8e330ae009eb8f3540773cdf898baeeae7aaa6b3d6ff847c18e205f331fd |
|
MD5 | 02d0f87cc353f730e80927d6194617b9 |
|
BLAKE2b-256 | f347e8d57241b34bb456d5cb22893bbd2cb3491e17a9616463bb084616f9ae36 |
Close
Hashes for general_sam-0.7.0.post0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 0b25d1fc4b7ed1dbf4b70768e70eb7c5cf1addea30a2c10140ffec9e1c503d63 |
|
MD5 | 11cc1ec7a206faa335d876116f7e441d |
|
BLAKE2b-256 | e85f9c26aca4f87c62bdd754613163ca68b8e2b7cac420c20900c0b8d47c5736 |
Close
Hashes for general_sam-0.7.0.post0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | fa5a98b681aeb5060a71caeb444166540e97704b978a5be1559fe9f846072b3d |
|
MD5 | 6cf38992a97db290ce8051498f9796e7 |
|
BLAKE2b-256 | 8e3fd07859e83f6770cf903fdaf23c1f18cb63241886fe6660dffef3e6209c3f |
Close
Hashes for general_sam-0.7.0.post0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 69823337361b2a066380d03d21d3da8a8fb8e0667f8aff58067f36bb309af0c5 |
|
MD5 | 881a8a39682090f1298f48256dfba1fa |
|
BLAKE2b-256 | cfe522d71e282e01e672f84b2ae3b067d541842abb51b333e32240c18e6296e1 |
Close
Hashes for general_sam-0.7.0.post0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 989ca191df3308ad8ae2d3589c9e98c0432a2aadf6dadd6d5f047f656ae586d8 |
|
MD5 | 2e99d0e6993318843e98e9ff50899ec9 |
|
BLAKE2b-256 | 799369f61506abf441fdddd66c7baff6a672f8324a176ccb89c3f654c36bd8f3 |
Close
Hashes for general_sam-0.7.0.post0-pp39-pypy39_pp73-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 9702d159749abcd3dd651c167cc2950bfe564d309aa873330d8840a1b6c143c3 |
|
MD5 | a000a1b3cc97a977409699a7aea6e87d |
|
BLAKE2b-256 | dc710aea6975e1977d75e826e1aa4c50eeef7e835c61a09484c2994b2af81a53 |
Close
Hashes for general_sam-0.7.0.post0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | cea1e0fcfd66eb78186158f12f8b189a792a8c761c3dbd853094c9c43561ac2f |
|
MD5 | 6162a9d2cd0393d1bd1e6b310682dc1b |
|
BLAKE2b-256 | b6929aa4d6bb9e5d539fb7567c40b4458588d26a6ffcb509b75bb0ee94c66fec |
Close
Hashes for general_sam-0.7.0.post0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 4a59e0d2da46b7f08232d5915140f36d741fc9c37c2cc6480ba4f612d8a135e4 |
|
MD5 | 6a0459b2615b62f5a9be2c145f2da5bb |
|
BLAKE2b-256 | 495d90e0afa17fe43c3319d1c60509a9468aec0a6f9734c7932bd0ec36cf2e92 |
Close
Hashes for general_sam-0.7.0.post0-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 448cf369ea8134f3416d4808caf033e34b3a3a7fc7491aa437a567cca04f6bf4 |
|
MD5 | 933a5881300beacc1942de7db728d8fe |
|
BLAKE2b-256 | b61a4e3fe1e5be34f7325559be48a373782636dddb43011163fbb45ed7fd86e2 |
Close
Hashes for general_sam-0.7.0.post0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 892c800d942769bf75ca81e6bd4723ea214ddf5990cd857dd84ac37926590655 |
|
MD5 | d7ee0c5ab4b3c1428d0bf79f248c17f7 |
|
BLAKE2b-256 | 453c6c28c900fbdecfea2195ee06486c3cb0e3a2a80636547295e3f3bc069871 |
Close
Hashes for general_sam-0.7.0.post0-pp38-pypy38_pp73-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | ca425093a0c4447fc8715e22615deb36845b2a7fd93690ead9327a518417d195 |
|
MD5 | 7481173e8e8aa9297f82eda46c1d9cce |
|
BLAKE2b-256 | 3e9ae8770461228b53aafec9d03f466ae15ee9015b5f7b85d6db7dfbb8ce01c7 |
Close
Hashes for general_sam-0.7.0.post0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | e775fbebd3141e301ea9cf46d0e4c624385b2d03cecde792e96bbf2e9c040d91 |
|
MD5 | dce02400e1b08f5f3cbde144ff15fb00 |
|
BLAKE2b-256 | a538a513ec506520fdef00923d2211fe3ad32f3762495e977e8fa424007de941 |
Close
Hashes for general_sam-0.7.0.post0-cp38-abi3-win_amd64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | c2dc8ac6b918cbb37fe48e532f55a7f79aaa24137f7c985b91d0585a26a8e4bf |
|
MD5 | 2a7e61c83367853913f94fcdea647527 |
|
BLAKE2b-256 | c20ba360c4dfaf2a1ebe5fa48b664f895aeb6ad9104945a8c7c0e566092f3902 |
Close
Hashes for general_sam-0.7.0.post0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 9c95755d758264ae29cbf1f470e891d2ce98c171cf536f647fc66998344b357f |
|
MD5 | 16c9c5928bd24006c5bfb46512f57602 |
|
BLAKE2b-256 | eb64af9ae0538c9a0adedc4deb37b888a0d215f22742ab7d6525dd423f6abaad |
Close
Hashes for general_sam-0.7.0.post0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 71357ddec0f3e258d3681136ebc7fe5a139583f9c6680c1e5cfb14b5032bed87 |
|
MD5 | 1b07e777c050cd69c1dbf3572a9c29b6 |
|
BLAKE2b-256 | 358e91cef21606d6695a82176d173cbbdef0a5aa7fdfcdc05ef6fa22bb29c571 |
Close
Hashes for general_sam-0.7.0.post0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | cbffdf50eb2421c352feb2ff773f6dd3468bfdb53eb6b3a49f0f9a05d5148d48 |
|
MD5 | 831152ed799375e40110511eb19b9396 |
|
BLAKE2b-256 | 7e355256b0413a89f27186dbdd8cf616aedc95f728b9da06e0c3d97b090065f4 |
Close
Hashes for general_sam-0.7.0.post0-cp38-abi3-macosx_11_0_arm64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | efdde916036e0abbc443f80b5ca47ea7d907068463773eedd98e050e873b5cd2 |
|
MD5 | 4eb15ef26f6fab7dd12a8590aa109ef4 |
|
BLAKE2b-256 | fc0c5fcbfac3e082b3e802bd075e99940b2c17f91eb298ef25da14d0120dda9c |
Close
Hashes for general_sam-0.7.0.post0-cp38-abi3-macosx_10_12_x86_64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | aa7e1e481686d2fa92da0cb5d4880825c8510033afe7bfd17f7b4d087478a710 |
|
MD5 | 128e4ecc8f798c1ffce9d4f627ba6e6d |
|
BLAKE2b-256 | cd8af624ee52a3b9de812f29c76570b723dcc9ec2458fc976bb677ba905a8253 |