Skip to main content

Python bindings for general-sam and some utilities

Project description

general-sam-py

PyPI version License Build status

Python bindings for general-sam and some utilities.

flowchart LR
  init((ε))
  a((a))
  b((b))
  ab((ab))
  bc(((bc)))
  abc((abc))
  abcb((abcb))
  abcbc(((abcbc)))

  init -- a --> a
  init -- b --> b
  a -- b --> ab
  b -- c --> bc
  init -- c --> bc
  ab -- c --> abc
  bc -- b --> abcb
  abc -- b --> abcb
  abcb -- c --> abcbc

The suffix automaton of abcbc.

Installation

pip install general-sam

Usage

GeneralSAM

from general_sam import GeneralSAM

# Build the suffix automaton over the byte string b"abcbc".
sam = GeneralSAM.from_bytes(b"abcbc")

# "cbc" is a suffix of "abcbc"
# Each check starts from a fresh root state, feeds the candidate bytes, and
# then asks whether the state that was reached is accepting.
state = sam.get_root_state()
state.feed_bytes(b"cbc")
assert state.is_accepting()

# "bcb" is not a suffix of "abcbc"
# A new root state is fetched so the bytes fed above do not carry over.
state = sam.get_root_state()
state.feed_bytes(b"bcb")
assert not state.is_accepting()
from general_sam import GeneralSAM

# Build the automaton from a `str` and feed it characters instead of bytes.
sam = GeneralSAM.from_chars("abcbc")
# One state is reused for every step below: each feed_chars call continues
# from where the previous one stopped, so the fed text accumulates
# ("b" -> "bc" -> "bcbc" -> "bcbcbc").
state = sam.get_root_state()

# "b" is not a suffix but at least a substring of "abcbc"
state.feed_chars("b")
assert not state.is_accepting()

# "bc" is a suffix of "abcbc"
state.feed_chars("c")
assert state.is_accepting()

# "bcbc" is a suffix of "abcbc"
state.feed_chars("bc")
assert state.is_accepting()

# "bcbcbc" is not a substring, much less a suffix of "abcbc"
# Once the fed text is no longer a substring, the state becomes nil.
state.feed_chars("bc")
assert not state.is_accepting() and state.is_nil()
from general_sam import GeneralSAM, GeneralSAMState, build_trie_from_chars

# Build a trie over the two words, then build the automaton from that trie.
# The second value returned by build_trie_from_chars is not needed here.
trie, _ = build_trie_from_chars(["hello", "Chielo"])
sam = GeneralSAM.from_trie(trie)


def fetch_state(s: str) -> GeneralSAMState:
    """Return the state reached by feeding *s* from a fresh root state of `sam`."""
    reached = sam.get_root_state()
    reached.feed_chars(s)
    return reached


# Suffixes of either word are accepted:
# "lo" (both words), "ello" ("hello") and "elo" ("Chielo").
assert fetch_state("lo").is_accepting()
assert fetch_state("ello").is_accepting()
assert fetch_state("elo").is_accepting()

# "el" occurs inside the words but is not a suffix of either one:
# the state is reachable (not nil) yet not accepting.
state = fetch_state("el")
assert not state.is_accepting() and not state.is_nil()

# "bye" does not occur in either word, so the state is nil.
state = fetch_state("bye")
assert not state.is_accepting() and state.is_nil()

VocabPrefixAutomaton

from general_sam import CountInfo, VocabPrefixAutomaton

vocab = ["歌曲", "聆听歌曲", "播放歌曲", "歌词", "查看歌词"]
automaton = VocabPrefixAutomaton(vocab, bytes_or_chars="chars")

# NOTE: CountInfo instances are actually related to the sorted `vocab`:
_ = ["播放歌曲", "查看歌词", "歌曲", "歌词", "聆听歌曲"]

# Case 1:
#   一起 | 聆 | 听 | 歌
# The pieces are fed right-to-left into one shared state: each prepend_feed
# call extends the text already fed on its left-hand side.
state = automaton.get_root_state()

# prepend '歌'
cnt_info = automaton.prepend_feed(state, "歌")
# tot_cnt_lower / tot_cnt_upper look like a half-open range into the sorted
# vocab (here [2, 4) -> '歌曲', '歌词') with str_cnt as its length.
# TODO confirm against the library documentation.
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=2, tot_cnt_lower=2, tot_cnt_upper=4
)

# found '歌曲' at the index 0 and '歌词' at the index 3 prefixed with '歌'
# The indices from get_order_slice point into the original (unsorted) `vocab`.
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {0, 3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"歌曲", "歌词"}

# prepend '听'
cnt_info = automaton.prepend_feed(state, "听")
# found nothing prefixed with '听歌'
# No vocab entry starts with the current text, so there is no CountInfo, but
# the state is still alive and can be extended further.
assert cnt_info is None
assert not state.is_nil()

# prepend '聆'
cnt_info = automaton.prepend_feed(state, "聆")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=1, tot_cnt_lower=4, tot_cnt_upper=5
)

# found '聆听歌曲' at the index 1 prefixed with '聆听歌'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {1}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"聆听歌曲"}

# prepend '一起'
assert not state.is_nil()
# found nothing prefixed with '一起聆听歌'
# This time the state itself becomes nil after the feed.
cnt_info = automaton.prepend_feed(state, "一起")
assert state.is_nil()

# Case 2:
#   来 | 查看 | 歌词
# A fresh root state is used; the pieces are again fed right-to-left, each
# prepend_feed call extending the text already fed on its left-hand side.
state = automaton.get_root_state()

# prepend '歌词'
cnt_info = automaton.prepend_feed(state, "歌词")
# The lower/upper counts refer to positions in the sorted vocab, not in
# `vocab` itself; get_order_slice below yields indices into `vocab`.
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=1, tot_cnt_lower=3, tot_cnt_upper=4
)

# found '歌词' at the index 3 prefixed with '歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {3}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"歌词"}

# prepend '查看'
cnt_info = automaton.prepend_feed(state, "查看")
assert cnt_info is not None and cnt_info == CountInfo(
    str_cnt=1, tot_cnt_lower=1, tot_cnt_upper=2
)

# found '查看歌词' at the index 4 prefixed with '查看歌词'
selected_idx = automaton.get_order_slice(cnt_info)
assert frozenset(selected_idx) == {4}
selected_vocab = [vocab[i] for i in selected_idx]
assert frozenset(selected_vocab) == {"查看歌词"}

# prepend '来'
assert not state.is_nil()
# found nothing prefixed with '来查看歌词'
# The state becomes nil after this feed.
cnt_info = automaton.prepend_feed(state, "来")
assert state.is_nil()

GreedyTokenizer

from general_sam import GeneralSAM, GreedyTokenizer, build_trie_from_chars

# A token's id is its position in this list.
vocab = ["a", "ab", "b", "bc", "c", "d", "e", "f", "cd", "abcde"]
trie, token_to_trie_node = build_trie_from_chars(vocab)

# Invert the token id -> trie node id mapping so that tokenizer output can be
# translated back to token ids; nodes that do not end a token keep -1.
trie_node_to_token = [-1 for _ in range(trie.num_of_nodes())]
for token_id, node_id in enumerate(token_to_trie_node):
    trie_node_to_token[node_id] = token_id

sam = GeneralSAM.from_trie(trie)
tokenizer = GreedyTokenizer.from_sam_and_trie(sam, trie)


def tokenize(s: str):
    """Tokenize *s* and return a list of (token id, second tokenizer value) pairs.

    The first element of every pair produced by `tokenizer.tokenize_str` is
    mapped through `trie_node_to_token`; the second element is passed through
    unchanged (it appears to be the matched length — TODO confirm).
    """
    pairs = []
    for node_id, span in tokenizer.tokenize_str(s):
        pairs.append((trie_node_to_token[node_id], span))
    return pairs


# Each pair starts with the token's index in `vocab`; in every case shown here
# the second element equals that token's length.
# "abcde" is itself token 9.
assert tokenize("abcde") == [(9, 5)]
# "abcdf" splits into "ab" (1), "cd" (8) and "f" (7).
assert tokenize("abcdf") == [(1, 2), (8, 2), (7, 1)]
# "abca" splits into "ab" (1), "c" (4) and "a" (0).
assert tokenize("abca") == [(1, 2), (4, 1), (0, 1)]

License

This project is licensed under either of

  • MIT license
  • Apache License, Version 2.0

at your option.

The SPDX license identifier for this project is MIT OR Apache-2.0.

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

general_sam-0.7.0.post0.tar.gz (21.6 kB view details)

Uploaded Source

Built Distributions

general_sam-0.7.0.post0-pp310-pypy310_pp73-macosx_11_0_arm64.whl (318.8 kB view details)

Uploaded PyPy macOS 11.0+ ARM64

general_sam-0.7.0.post0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl (331.3 kB view details)

Uploaded PyPy macOS 10.12+ x86-64

general_sam-0.7.0.post0-pp39-pypy39_pp73-macosx_11_0_arm64.whl (319.0 kB view details)

Uploaded PyPy macOS 11.0+ ARM64

general_sam-0.7.0.post0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl (331.8 kB view details)

Uploaded PyPy macOS 10.12+ x86-64

general_sam-0.7.0.post0-pp38-pypy38_pp73-macosx_11_0_arm64.whl (319.0 kB view details)

Uploaded PyPy macOS 11.0+ ARM64

general_sam-0.7.0.post0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl (332.0 kB view details)

Uploaded PyPy macOS 10.12+ x86-64

general_sam-0.7.0.post0-cp38-abi3-win_amd64.whl (207.8 kB view details)

Uploaded CPython 3.8+ Windows x86-64

general_sam-0.7.0.post0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB view details)

Uploaded CPython 3.8+ manylinux: glibc 2.17+ x86-64

general_sam-0.7.0.post0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl (1.1 MB view details)

Uploaded CPython 3.8+ manylinux: glibc 2.17+ ARMv7l

general_sam-0.7.0.post0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (1.1 MB view details)

Uploaded CPython 3.8+ manylinux: glibc 2.17+ ARM64

general_sam-0.7.0.post0-cp38-abi3-macosx_11_0_arm64.whl (322.6 kB view details)

Uploaded CPython 3.8+ macOS 11.0+ ARM64

general_sam-0.7.0.post0-cp38-abi3-macosx_10_12_x86_64.whl (336.1 kB view details)

Uploaded CPython 3.8+ macOS 10.12+ x86-64

File details

Details for the file general_sam-0.7.0.post0.tar.gz.

File metadata

  • Download URL: general_sam-0.7.0.post0.tar.gz
  • Upload date:
  • Size: 21.6 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: maturin/1.5.1

File hashes

Hashes for general_sam-0.7.0.post0.tar.gz
Algorithm Hash digest
SHA256 500b4c0717f6057e066d161533ed3681db26ceb24ab13b3b2857b56d26542c0a
MD5 11275c9b615bda286c767c5f51d381dd
BLAKE2b-256 60f55d2bdd5ba8074a26db72081d88afad7e040ff9b9af52927b48406451c515

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm Hash digest
SHA256 0998e42fd606f65d7fde561bf4bb3c71a04584ecb6971aff0c20b3f82d64272a
MD5 232ffd68ce4d03fd02d76c37f731c225
BLAKE2b-256 76e766e7a75cb0bfdb8a7d2535cb32e1ddaa85ff71493c3ff66da4697e6a0200

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm Hash digest
SHA256 d77cbe45bbe4001b137626166cf3e430f6add1d74e7044eea51bb0724f6a6e8f
MD5 ec83015bf9ba400dbfa522eaa62a243b
BLAKE2b-256 789c42f010be35a8ae5fdb567ce26aa0ccc5699acf4152747fa2eff966e8e3e6

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm Hash digest
SHA256 1ed0c46f3761a25cd2416de04e3150f61d4243a13074cc6b5a1f1b8d31d57f02
MD5 3e6a775245380c6da855ce2382cbe792
BLAKE2b-256 942d036f66858306b8bf83b504107663beb4d4cbbf25c450086c2c6b374b0836

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp310-pypy310_pp73-macosx_11_0_arm64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp310-pypy310_pp73-macosx_11_0_arm64.whl
Algorithm Hash digest
SHA256 b87c8e330ae009eb8f3540773cdf898baeeae7aaa6b3d6ff847c18e205f331fd
MD5 02d0f87cc353f730e80927d6194617b9
BLAKE2b-256 f347e8d57241b34bb456d5cb22893bbd2cb3491e17a9616463bb084616f9ae36

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl
Algorithm Hash digest
SHA256 0b25d1fc4b7ed1dbf4b70768e70eb7c5cf1addea30a2c10140ffec9e1c503d63
MD5 11cc1ec7a206faa335d876116f7e441d
BLAKE2b-256 e85f9c26aca4f87c62bdd754613163ca68b8e2b7cac420c20900c0b8d47c5736

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm Hash digest
SHA256 fa5a98b681aeb5060a71caeb444166540e97704b978a5be1559fe9f846072b3d
MD5 6cf38992a97db290ce8051498f9796e7
BLAKE2b-256 8e3fd07859e83f6770cf903fdaf23c1f18cb63241886fe6660dffef3e6209c3f

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm Hash digest
SHA256 69823337361b2a066380d03d21d3da8a8fb8e0667f8aff58067f36bb309af0c5
MD5 881a8a39682090f1298f48256dfba1fa
BLAKE2b-256 cfe522d71e282e01e672f84b2ae3b067d541842abb51b333e32240c18e6296e1

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm Hash digest
SHA256 989ca191df3308ad8ae2d3589c9e98c0432a2aadf6dadd6d5f047f656ae586d8
MD5 2e99d0e6993318843e98e9ff50899ec9
BLAKE2b-256 799369f61506abf441fdddd66c7baff6a672f8324a176ccb89c3f654c36bd8f3

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp39-pypy39_pp73-macosx_11_0_arm64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp39-pypy39_pp73-macosx_11_0_arm64.whl
Algorithm Hash digest
SHA256 9702d159749abcd3dd651c167cc2950bfe564d309aa873330d8840a1b6c143c3
MD5 a000a1b3cc97a977409699a7aea6e87d
BLAKE2b-256 dc710aea6975e1977d75e826e1aa4c50eeef7e835c61a09484c2994b2af81a53

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl
Algorithm Hash digest
SHA256 cea1e0fcfd66eb78186158f12f8b189a792a8c761c3dbd853094c9c43561ac2f
MD5 6162a9d2cd0393d1bd1e6b310682dc1b
BLAKE2b-256 b6929aa4d6bb9e5d539fb7567c40b4458588d26a6ffcb509b75bb0ee94c66fec

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm Hash digest
SHA256 4a59e0d2da46b7f08232d5915140f36d741fc9c37c2cc6480ba4f612d8a135e4
MD5 6a0459b2615b62f5a9be2c145f2da5bb
BLAKE2b-256 495d90e0afa17fe43c3319d1c60509a9468aec0a6f9734c7932bd0ec36cf2e92

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm Hash digest
SHA256 448cf369ea8134f3416d4808caf033e34b3a3a7fc7491aa437a567cca04f6bf4
MD5 933a5881300beacc1942de7db728d8fe
BLAKE2b-256 b61a4e3fe1e5be34f7325559be48a373782636dddb43011163fbb45ed7fd86e2

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm Hash digest
SHA256 892c800d942769bf75ca81e6bd4723ea214ddf5990cd857dd84ac37926590655
MD5 d7ee0c5ab4b3c1428d0bf79f248c17f7
BLAKE2b-256 453c6c28c900fbdecfea2195ee06486c3cb0e3a2a80636547295e3f3bc069871

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp38-pypy38_pp73-macosx_11_0_arm64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp38-pypy38_pp73-macosx_11_0_arm64.whl
Algorithm Hash digest
SHA256 ca425093a0c4447fc8715e22615deb36845b2a7fd93690ead9327a518417d195
MD5 7481173e8e8aa9297f82eda46c1d9cce
BLAKE2b-256 3e9ae8770461228b53aafec9d03f466ae15ee9015b5f7b85d6db7dfbb8ce01c7

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl
Algorithm Hash digest
SHA256 e775fbebd3141e301ea9cf46d0e4c624385b2d03cecde792e96bbf2e9c040d91
MD5 dce02400e1b08f5f3cbde144ff15fb00
BLAKE2b-256 a538a513ec506520fdef00923d2211fe3ad32f3762495e977e8fa424007de941

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-cp38-abi3-win_amd64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-cp38-abi3-win_amd64.whl
Algorithm Hash digest
SHA256 c2dc8ac6b918cbb37fe48e532f55a7f79aaa24137f7c985b91d0585a26a8e4bf
MD5 2a7e61c83367853913f94fcdea647527
BLAKE2b-256 c20ba360c4dfaf2a1ebe5fa48b664f895aeb6ad9104945a8c7c0e566092f3902

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Algorithm Hash digest
SHA256 9c95755d758264ae29cbf1f470e891d2ce98c171cf536f647fc66998344b357f
MD5 16c9c5928bd24006c5bfb46512f57602
BLAKE2b-256 eb64af9ae0538c9a0adedc4deb37b888a0d215f22742ab7d6525dd423f6abaad

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl
Algorithm Hash digest
SHA256 71357ddec0f3e258d3681136ebc7fe5a139583f9c6680c1e5cfb14b5032bed87
MD5 1b07e777c050cd69c1dbf3572a9c29b6
BLAKE2b-256 358e91cef21606d6695a82176d173cbbdef0a5aa7fdfcdc05ef6fa22bb29c571

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
Algorithm Hash digest
SHA256 cbffdf50eb2421c352feb2ff773f6dd3468bfdb53eb6b3a49f0f9a05d5148d48
MD5 831152ed799375e40110511eb19b9396
BLAKE2b-256 7e355256b0413a89f27186dbdd8cf616aedc95f728b9da06e0c3d97b090065f4

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-cp38-abi3-macosx_11_0_arm64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-cp38-abi3-macosx_11_0_arm64.whl
Algorithm Hash digest
SHA256 efdde916036e0abbc443f80b5ca47ea7d907068463773eedd98e050e873b5cd2
MD5 4eb15ef26f6fab7dd12a8590aa109ef4
BLAKE2b-256 fc0c5fcbfac3e082b3e802bd075e99940b2c17f91eb298ef25da14d0120dda9c

See more details on using hashes here.

File details

Details for the file general_sam-0.7.0.post0-cp38-abi3-macosx_10_12_x86_64.whl.

File metadata

File hashes

Hashes for general_sam-0.7.0.post0-cp38-abi3-macosx_10_12_x86_64.whl
Algorithm Hash digest
SHA256 aa7e1e481686d2fa92da0cb5d4880825c8510033afe7bfd17f7b4d087478a710
MD5 128e4ecc8f798c1ffce9d4f627ba6e6d
BLAKE2b-256 cd8af624ee52a3b9de812f29c76570b723dcc9ec2458fc976bb677ba905a8253

See more details on using hashes here.

Supported by

  • AWS: Cloud computing and Security Sponsor
  • Datadog: Monitoring
  • Fastly: CDN
  • Google: Download Analytics
  • Microsoft: PSF Sponsor
  • Pingdom: Monitoring
  • Sentry: Error logging
  • StatusPage: Status page