Skip to main content

A high-performance, multi-functional word matcher

Project description

PyO3 binding for the Rust implementation of Matcher

Usage

Python usage is in the test.ipynb file.

Matcher

import msgspec
import numpy as np

from matcher_py import Matcher, SimpleMatcher # type: ignore
from extension_types import MatchTableType, SimpleMatchType, MatchTable

# Encoder used to turn the match-table configuration into MessagePack bytes.
msgpack_encoder = msgspec.msgpack.Encoder()

# The same flag combination is used for both the word list and the exemption list.
fanjian_delete_normalize = (
    SimpleMatchType.MatchFanjian | SimpleMatchType.MatchDeleteNormalize
)

# One match table registered under the key "test".
match_table_map = {
    "test": [
        MatchTable(
            table_id=1,
            match_table_type=MatchTableType.Simple,
            simple_match_type=fanjian_delete_normalize,
            word_list=["蔔", "你好"],
            exemption_simple_match_type=fanjian_delete_normalize,
            exemption_word_list=[],
        )
    ]
}

# Matcher is constructed from the encoded bytes, not from the Python objects.
matcher = Matcher(msgpack_encoder.encode(match_table_map))

# Single-text queries.
matcher.is_match(r"卜")

matcher.word_match(r"你,好")

matcher.word_match_as_string("你好")

# Batch query over a plain Python list.
matcher.batch_word_match_as_string(["你好", "你好", "你真棒"])

sample_paragraph = "Laborum eiusmod anim aliqua non veniam laboris officia dolor. Adipisicing sit est irure Lorem duis adipisicing exercitation. Cillum excepteur non anim ipsum eiusmod deserunt veniam. Nulla veniam sunt sint ad velit occaecat in deserunt nulla nisi excepteur. Cillum veniam Lorem aute eu. Nisi voluptate laboris quis sint pariatur ullamco minim pariatur officia non anim nisi nulla ipsum ad. Veniam pariatur ut occaecat ut veniam velit aliquip commodo culpa elit eu eiusmod."


def make_text_array():
    """Return a new object-dtype array holding 10000 copies of the sample paragraph."""
    repeated_texts = [sample_paragraph] * 10000
    return np.array(repeated_texts, dtype=np.dtype("object"))


# Array query with the default arguments.
text_array = make_text_array()
matcher.numpy_word_match_as_string(text_array)

# A fresh array is built for the inplace=True call; presumably that call
# overwrites the array's elements, which is why the array is shown afterwards
# (confirm against the matcher_py documentation).
text_array = make_text_array()
matcher.numpy_word_match_as_string(text_array, inplace=True)
text_array

Simple Matcher

import msgspec
import numpy as np

from matcher_py import Matcher, SimpleMatcher # type: ignore
from extension_types import MatchTableType, SimpleMatchType, MatchTable

# Encoder used to turn the word tables into MessagePack bytes.
msgpack_encoder = msgspec.msgpack.Encoder()

# Raw string so the backslashes are kept literally; reused in the queries below.
obfuscated_phrase = r"It's /\/\y duty"

# Word tables, one per flag combination. Each maps an integer id to a word entry.
fanjian_delete_normalize_words = {
    1: "无,法,无,天",
    2: "xxx",
    3: "你好",
    6: obfuscated_phrase,
    4: "xxx,yyy",
}
fanjian_words = {
    4: "xxx,yyy",
}
plain_words = {
    5: "xxxxx,xxxxyyyyxxxxx",
}

simple_match_config = {
    SimpleMatchType.MatchFanjian
    | SimpleMatchType.MatchDeleteNormalize: fanjian_delete_normalize_words,
    SimpleMatchType.MatchFanjian: fanjian_words,
    SimpleMatchType.MatchNone: plain_words,
}

# SimpleMatcher is constructed from the encoded bytes, not from the Python dict.
simple_matcher = SimpleMatcher(msgpack_encoder.encode(simple_match_config))

# Single-text queries.
simple_matcher.is_match("xxx")

simple_matcher.simple_process(obfuscated_phrase)

# Batch query over a plain Python list.
simple_matcher.batch_simple_process([obfuscated_phrase, "你好", "xxxxxxx"])

sample_paragraph = "Laborum eiusmod anim aliqua non veniam laboris officia dolor. Adipisicing sit est irure Lorem duis adipisicing exercitation. Cillum excepteur non anim ipsum eiusmod deserunt veniam. Nulla veniam sunt sint ad velit occaecat in deserunt nulla nisi excepteur. Cillum veniam Lorem aute eu. Nisi voluptate laboris quis sint pariatur ullamco minim pariatur officia non anim nisi nulla ipsum ad. Veniam pariatur ut occaecat ut veniam velit aliquip commodo culpa elit eu eiusmod."


def make_text_array():
    """Return a new object-dtype array holding 10000 copies of the sample paragraph."""
    repeated_texts = [sample_paragraph] * 10000
    return np.array(repeated_texts, dtype=np.dtype("object"))


# Array query with the default arguments.
text_array = make_text_array()
simple_matcher.numpy_simple_process(text_array)

# A fresh array is built for the inplace=True call; presumably that call
# overwrites the array's elements, which is why the array is shown afterwards
# (confirm against the matcher_py documentation).
text_array = make_text_array()
simple_matcher.numpy_simple_process(text_array, inplace=True)
text_array

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

matcher_py-0.1.1.tar.gz (406.5 kB view details)

Uploaded Source

Built Distribution

matcher_py-0.1.1-cp310-abi3-macosx_11_0_arm64.whl (1.1 MB view details)

Uploaded CPython 3.10+ macOS 11.0+ ARM64

File details

Details for the file matcher_py-0.1.1.tar.gz.

File metadata

  • Download URL: matcher_py-0.1.1.tar.gz
  • Upload date:
  • Size: 406.5 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: maturin/1.6.0

File hashes

Hashes for matcher_py-0.1.1.tar.gz
Algorithm Hash digest
SHA256 94b3d7719fb640f3eb3ab51cedb077f7758d6bd9a63b6dec52ce060a06dfb224
MD5 3d491bf7d8989be789c389a402abba8c
BLAKE2b-256 4c292195aef52183c5be470b28d81bf490b0729ff778ffc848749db4981a301d

See more details on using hashes here.

File details

Details for the file matcher_py-0.1.1-cp310-abi3-macosx_11_0_arm64.whl.

File metadata

File hashes

Hashes for matcher_py-0.1.1-cp310-abi3-macosx_11_0_arm64.whl
Algorithm Hash digest
SHA256 b1bc351318abf0706e77e25ab009aa8a71318ff0c1886571a7fcf8f3ae6cfbcb
MD5 362ddbae8a7f711789e4e356c71d2680
BLAKE2b-256 222ac8dd4d13edd4efe5228c30c985d02b58e2410dec28a20b4d94510e124a7a

See more details on using hashes here.

Supported by

AWS AWS Cloud computing and Security Sponsor Datadog Datadog Monitoring Fastly Fastly CDN Google Google Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Sentry Sentry Error logging StatusPage StatusPage Status page