Skip to main content

Python bindings for general-sam and some utilities

Project description

general-sam-py

PyPI version License Build status

Python bindings for general-sam and some utilities.

flowchart LR
  init((ε))
  a((a))
  b((b))
  ab((ab))
  bc(((bc)))
  abc((abc))
  abcb((abcb))
  abcbc(((abcbc)))

  init -- a --> a
  init -- b --> b
  a -- b --> ab
  b -- c --> bc
  init -- c --> bc
  ab -- c --> abc
  bc -- b --> abcb
  abc -- b --> abcb
  abcb -- c --> abcbc

The suffix automaton of abcbc.

Installation

pip install general-sam

Usage

GeneralSam

from general_sam import GeneralSam

# Build the suffix automaton of the byte string b"abcbc".
sam = GeneralSam.from_bytes(b"abcbc")

# Every query starts from a fresh root state; after feeding the query, the
# state is accepting exactly when the query is a suffix of b"abcbc".
for query, is_suffix in (
    (b"cbc", True),  # "cbc" is a suffix of "abcbc"
    (b"bcb", False),  # "bcb" is not a suffix of "abcbc"
):
    state = sam.get_root_state()
    state.feed_bytes(query)
    assert state.is_accepting() if is_suffix else not state.is_accepting()
from general_sam import GeneralSam

sam = GeneralSam.from_chars("abcbc")
state = sam.get_root_state()

# The same state is fed piece by piece, so the text it has consumed grows:
#   "b"    -> not a suffix, but at least a substring of "abcbc"
#   "bc"   -> a suffix of "abcbc"
#   "bcbc" -> a suffix of "abcbc"
for piece, is_suffix in (("b", False), ("c", True), ("bc", True)):
    state.feed_chars(piece)
    assert state.is_accepting() if is_suffix else not state.is_accepting()

# One more "bc" gives "bcbcbc", which is not a substring, much less a suffix
# of "abcbc": the state is no longer accepting and has become nil.
state.feed_chars("bc")
assert not state.is_accepting()
assert state.is_nil()
from general_sam import GeneralSam, GeneralSamState, build_trie_from_chars

# Put both words into one trie and build a single automaton from it.
# Only the trie is needed here; the second return value is discarded.
trie, _ = build_trie_from_chars(["hello", "Chielo"])
sam = GeneralSam.from_trie(trie)


def fetch_state(s: str) -> GeneralSamState:
    """Return the state reached by feeding `s` to a fresh root state of `sam`."""
    cursor = sam.get_root_state()
    cursor.feed_chars(s)
    return cursor


# "lo" ends both words, "ello" ends "hello" and "elo" ends "Chielo":
# each of them leads to an accepting state.
for suffix in ("lo", "ello", "elo"):
    assert fetch_state(suffix).is_accepting()

# "el" occurs inside the words but ends neither of them:
# the state is neither accepting nor nil.
state = fetch_state("el")
assert not (state.is_accepting() or state.is_nil())

# "bye" does not occur in any word: the state is nil (and not accepting).
state = fetch_state("bye")
assert not state.is_accepting()
assert state.is_nil()

VocabPrefixAutomaton

from general_sam import CountInfo, VocabPrefixAutomaton

vocab = ["歌曲", "聆听歌曲", "播放歌曲", "歌词", "查看歌词"]
automaton = VocabPrefixAutomaton(vocab, bytes_or_chars="chars")

# NOTE: CountInfo instances refer to positions in the *sorted* `vocab`, not to
# the original order. For reference, the sorted vocabulary is:
_ = ["播放歌曲", "查看歌词", "歌曲", "歌词", "聆听歌曲"]
# In the checks below, tot_cnt_lower/tot_cnt_upper line up with a half-open
# range into this sorted list and str_cnt with the length of that range
# (presumably by design -- confirm against the library documentation).

# Case 1:
#   一起 | 聆 | 听 | 歌
# The pieces are consumed from right to left: each one is prepended to the
# text already fed to the same `state`.
state = automaton.get_root_state()

# prepend '歌'
cnt_info = automaton.prepend_feed(state, "歌")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=2, tot_cnt_lower=2, tot_cnt_upper=4
)

# found '歌曲' at index 0 and '歌词' at index 3, both prefixed with '歌'
# (get_order_slice yields indices into the original, unsorted `vocab`)
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {0, 3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"歌曲", "歌词"}

# prepend '听'
cnt_info = automaton.prepend_feed(state, "听")
# found nothing prefixed with '听歌', so no CountInfo is returned ...
assert cnt_info is None
# ... but the state is not nil yet: prepending more text can still match
assert not state.is_nil()

# prepend '聆'
cnt_info = automaton.prepend_feed(state, "聆")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=1, tot_cnt_lower=4, tot_cnt_upper=5
)

# found '聆听歌曲' at index 1, prefixed with '聆听歌'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {1}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"聆听歌曲"}

# prepend '一起'
assert not state.is_nil()
# found nothing prefixed with '一起聆听歌': the state becomes nil
# (the returned cnt_info is not inspected here, only the state is checked)
cnt_info = automaton.prepend_feed(state, "一起")
assert state.is_nil()

# Case 2:
#   来 | 查看 | 歌词
# Again consumed from right to left, starting from a fresh root state.
state = automaton.get_root_state()

# prepend '歌词'
cnt_info = automaton.prepend_feed(state, "歌词")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=1, tot_cnt_lower=3, tot_cnt_upper=4
)

# found '歌词' at index 3, prefixed with '歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"歌词"}

# prepend '查看'
cnt_info = automaton.prepend_feed(state, "查看")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=1, tot_cnt_lower=1, tot_cnt_upper=2
)

# found '查看歌词' at index 4, prefixed with '查看歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {4}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"查看歌词"}

# prepend '来'
assert not state.is_nil()
# found nothing prefixed with '来查看歌词': the state becomes nil
# (the returned cnt_info is not inspected here, only the state is checked)
cnt_info = automaton.prepend_feed(state, "来")
assert state.is_nil()

GreedyTokenizer

from general_sam import GeneralSam, GreedyTokenizer, build_trie_from_chars

vocab = ["a", "ab", "b", "bc", "c", "d", "e", "f", "cd", "abcde"]
trie, token_to_trie_node = build_trie_from_chars(vocab)

# Invert the token -> trie node mapping so that tokenizer output can be
# reported as indices into `vocab`; nodes that end no token keep -1.
trie_node_to_token = [-1] * trie.num_of_nodes()
for token_id, node_id in enumerate(token_to_trie_node):
    trie_node_to_token[node_id] = token_id

sam = GeneralSam.from_trie(trie)
tokenizer = GreedyTokenizer.from_sam_and_trie(sam, trie)


def tokenize(s: str):
    """Tokenize `s` and translate each trie node id into its index in `vocab`.

    The second item of every pair is passed through unchanged from
    `tokenizer.tokenize_str`; in the checks below it equals the number of
    characters covered by the token.
    """
    pairs = []
    for node_id, span in tokenizer.tokenize_str(s):
        pairs.append((trie_node_to_token[node_id], span))
    return pairs


expected_tokens = {
    "abcde": [(9, 5)],  # "abcde"
    "abcdf": [(1, 2), (8, 2), (7, 1)],  # "ab" + "cd" + "f"
    "abca": [(1, 2), (4, 1), (0, 1)],  # "ab" + "c" + "a"
}
for text, pairs in expected_tokens.items():
    assert tokenize(text) == pairs

License

This project is licensed under either of

  • Apache License, Version 2.0
  • MIT license

at your option.

The SPDX license identifier for this project is MIT OR Apache-2.0.

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

general_sam-1.0.2.tar.gz (21.0 kB view details)

Uploaded Source

Built Distributions

general_sam-1.0.2-cp310-abi3-win_amd64.whl (201.6 kB view details)

Uploaded CPython 3.10+Windows x86-64

general_sam-1.0.2-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (290.6 kB view details)

Uploaded CPython 3.10+manylinux: glibc 2.17+ x86-64

general_sam-1.0.2-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl (287.5 kB view details)

Uploaded CPython 3.10+manylinux: glibc 2.17+ ARMv7l

general_sam-1.0.2-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (273.0 kB view details)

Uploaded CPython 3.10+manylinux: glibc 2.17+ ARM64

general_sam-1.0.2-cp310-abi3-macosx_11_0_arm64.whl (258.2 kB view details)

Uploaded CPython 3.10+macOS 11.0+ ARM64

general_sam-1.0.2-cp310-abi3-macosx_10_12_x86_64.whl (275.6 kB view details)

Uploaded CPython 3.10+macOS 10.12+ x86-64

File details

Details for the file general_sam-1.0.2.tar.gz.

File metadata

  • Download URL: general_sam-1.0.2.tar.gz
  • Upload date:
  • Size: 21.0 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: maturin/1.8.3

File hashes

Hashes for general_sam-1.0.2.tar.gz
Algorithm Hash digest
SHA256 4dea007a24a4a8bc58ed53b83f733033ac23dca7024954cf46781004cf9e9872
MD5 d64d51ddc353c5f200990d129bbeb71a
BLAKE2b-256 ae87db49d67bf4ff29451ec0a0d0b591968176798dbdc1f5a6e81f5d67711ca6

See more details on using hashes here.

File details

Details for the file general_sam-1.0.2-cp310-abi3-win_amd64.whl.

File metadata

File hashes

Hashes for general_sam-1.0.2-cp310-abi3-win_amd64.whl
Algorithm Hash digest
SHA256 b548c9e6743cd41d75f640c17d6f8b2d320d70ef110f536f3bfbf7c3777a2797
MD5 523f0c25442bab91ec6cebaf1d2d5a30
BLAKE2b-256 246e7d1bebfa31a0d5702411a099a2a3f98d148f42d46bcff1f9dd22543e7d2b

See more details on using hashes here.

File details

Details for the file general_sam-1.0.2-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.

File metadata

File hashes

Hashes for general_sam-1.0.2-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm Hash digest
SHA256 78ad6352e75c580b82c1c822b6e9bb155e50f7abd4066717716b94e802f1fe61
MD5 94b245f7d598cc57fe62d8494c0f2b7f
BLAKE2b-256 aeb93ea3813be5cf3d527c47d9faf1fa393797738d538dfec0b5bc7d2c74d617

See more details on using hashes here.

File details

Details for the file general_sam-1.0.2-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl.

File metadata

File hashes

Hashes for general_sam-1.0.2-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm Hash digest
SHA256 adc848fac5c51d1a338bcc1c5c733f5f5165be30d6c331c72f3629df7036207f
MD5 46587fa811ceed8f38c6bfc218436345
BLAKE2b-256 90d27dfd22b72163e1cee6840e497d512276aaeec54459a13568516b3b1b46b9

See more details on using hashes here.

File details

Details for the file general_sam-1.0.2-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.

File metadata

File hashes

Hashes for general_sam-1.0.2-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm Hash digest
SHA256 1b5826d644940174d1bb3a9f738c11bd0245cffa8c15e49b7ff553dfe30839bc
MD5 3be19e19b7e90195a5a928c816e8016e
BLAKE2b-256 d9a326e79dbaca77dad06f6914580ffdbec503ce55b020912b6e152b25f44236

See more details on using hashes here.

File details

Details for the file general_sam-1.0.2-cp310-abi3-macosx_11_0_arm64.whl.

File metadata

File hashes

Hashes for general_sam-1.0.2-cp310-abi3-macosx_11_0_arm64.whl
Algorithm Hash digest
SHA256 e2789f3214b8f391613e46247ee0ae51c898f0a80b09a6313ec6449353195092
MD5 e1d19463edae0bf6d30764e418dddeba
BLAKE2b-256 3d05b325bd50e27cb2ac1a4e9283d6ecf5600da4fd6186bba5e2b8ff7b4aa0da

See more details on using hashes here.

File details

Details for the file general_sam-1.0.2-cp310-abi3-macosx_10_12_x86_64.whl.

File metadata

File hashes

Hashes for general_sam-1.0.2-cp310-abi3-macosx_10_12_x86_64.whl
Algorithm Hash digest
SHA256 8bb5f1624f369abcfb495a0e9a7f84738243681de4243234671258d23d411b11
MD5 137064dd0fce96b74e2296a820ac2130
BLAKE2b-256 f74855379fa7a737b1153c7b4fa2ed080984ec610122043388a914de3f4b78bc

See more details on using hashes here.

Supported by

AWS Cloud computing and Security Sponsor Datadog Monitoring Fastly CDN Google Download Analytics Pingdom Monitoring Sentry Error logging StatusPage Status page