Skip to main content

A Khmer Text Normalization and Verbalization Toolkit.

Project description

Tha (ថា)

Khmer Text Normalization and Verbalization Toolkit.

Install

pip install tha
import tha.normalize
import tha.phone_numbers
import tha.urls
import tha.datetime
import tha.hashtags
import tha.ascii_lines
import tha.license_plate
import tha.cardinals
import tha.decimals
import tha.ordinals
import tha.currency
import tha.parenthesis
import tha.repeater

## Normalize
# "\u200b" is a zero-width space. In this example it is dropped from the
# output and the spelling "ឲ្យ" comes back as "ឱ្យ".
assert tha.normalize.processor("មិន\u200bឲ្យ") == "មិនឱ្យ"

## Phone Numbers
# In these examples the leading "0" and the following two digits are split off
# first; the remaining digits are grouped into chunks of `chunk_size`, and all
# parts are joined with "▁".
# NOTE(review): in the last example the 7 remaining digits become "123" and
# "1234", so a short remainder appears to be merged into the final chunk --
# confirm against tha.phone_numbers.
assert tha.phone_numbers.processor("010123123", chunk_size=2) == "0▁10▁12▁31▁23"
assert tha.phone_numbers.processor("010123123", chunk_size=3) == "0▁10▁123▁123"
assert tha.phone_numbers.processor("0961231234", chunk_size=3) == "0▁96▁123▁1234"

## URLs and emails
# "@" is verbalized as "at" and "." as "dot"; a leading "http://" or
# "https://" scheme does not appear in the output.
# Some labels come back split with "▁" ("g▁mail", "k▁h") -- presumably so
# they are spelled out letter by letter; confirm against tha.urls.
assert tha.urls.processor("example@gmail.com") == "example at g▁mail dot com"
assert tha.urls.processor("https://google.com") == "google dot com"
assert tha.urls.processor("http://google.com") == "google dot com"
assert tha.urls.processor("google.com") == "google dot com"
assert tha.urls.processor("google.gov.kh") == "google dot gov dot k▁h"
assert tha.urls.processor("google.com.kh") == "google dot com dot k▁h"

## Time
# The ":" between hours and minutes becomes a space; the AM/PM suffix is
# attached to the minutes with "▁" and its two letters are separated by "▁".
# The digits themselves are left as digits by this processor.
assert tha.datetime.time_processor("10:23AM") == "10 23▁A▁M"
assert tha.datetime.time_processor("10:23PM") == "10 23▁P▁M"
assert tha.datetime.time_processor("1:23PM") == "1 23▁P▁M"

## Date
# Dash-separated dates keep their field order and digits; only the "-"
# separators are replaced by spaces (shown for both YYYY-MM-DD and a
# year-last form).
assert tha.datetime.date_processor("2024-01-02") == "2024 01 02"
assert tha.datetime.date_processor("01-02-2034") == "01 02 2034"

## Hashtags
# Hashtags are removed whether they are ASCII, Khmer, or Khmer followed by
# digits. The spaces on both sides of the tag are kept, which is why the
# expected strings contain two consecutive spaces.
assert (
  tha.hashtags.processor("Hello world #this_will_remove hello") == "Hello world  hello"
)
assert tha.hashtags.processor("Hello world #លុប hello") == "Hello world  hello"
assert tha.hashtags.processor("Hello world #លុប1234 hello") == "Hello world  hello"

## ASCII Lines
# Decorative runs such as "---" and "###" are deleted. The whitespace or
# newlines that surrounded them are kept, leaving a double space or an empty
# line in the result.
assert tha.ascii_lines.processor("Remove --- asdasd") == "Remove  asdasd"
assert tha.ascii_lines.processor("Remove\n###\nasdasd") == "Remove\n\nasdasd"

## Cambodia License Plate
# The series prefix is split into digit and letter ("1A" -> "1 A") and the
# four-digit number is grouped in pairs joined with "▁".
# When all four digits are identical, the output is "ការ៉េ" followed by that
# digit instead of the paired form (second example).
assert tha.license_plate.processor("1A 1234") == "1 A 12▁34"
assert tha.license_plate.processor("1A 4444") == "1 A ការ៉េ4"

## Number - Cardinals
# Digit runs are rewritten as Khmer number words, with "▁" between the
# thousands / hundreds / tens-and-units groups.
# A "▁" already present in the input is preserved between the converted parts,
# and a leading "-" is verbalized as "ដក".
# "." and "," are left untouched by this processor: the digits on each side
# are converted independently as whole numbers (see the last three examples).
assert tha.cardinals.processor("1234") == "មួយពាន់▁ពីររយ▁សាមសិបបួន"
assert tha.cardinals.processor("1") == "មួយ"
assert tha.cardinals.processor("1▁2") == "មួយ▁ពីរ"
assert tha.cardinals.processor("-1") == "ដក▁មួយ"
assert tha.cardinals.processor("10") == "ដប់"
assert tha.cardinals.processor("15") == "ដប់ប្រាំ"
assert tha.cardinals.processor("100") == "មួយរយ"
assert tha.cardinals.processor("10000") == "មួយម៉ឺន"
assert tha.cardinals.processor("10000.234") == "មួយម៉ឺន.ពីររយ▁សាមសិបបួន"
assert tha.cardinals.processor("-10000.234") == "ដក▁មួយម៉ឺន.ពីររយ▁សាមសិបបួន"
assert tha.cardinals.processor("-10000,234") == "ដក▁មួយម៉ឺន,ពីររយ▁សាមសិបបួន"

## Number - Decimals
# The separator itself is verbalized: "." becomes "ចុច" and "," becomes
# "ក្បៀស". A leading "-" becomes "ដក".
# In the fractional part each leading zero is read out as "សូន្យ" and the
# remaining digits are read as one whole number ("0012" -> zero, zero, twelve).
assert tha.decimals.processor("123.324") == "មួយរយ▁ម្ភៃបី▁ចុច▁បីរយ▁ម្ភៃបួន"
assert tha.decimals.processor("123.001") == "មួយរយ▁ម្ភៃបី▁ចុច▁សូន្យ▁សូន្យ▁មួយ"
assert tha.decimals.processor("-123.0012") == "ដក▁មួយរយ▁ម្ភៃបី▁ចុច▁សូន្យ▁សូន្យ▁ដប់ពីរ"
assert tha.decimals.processor("-123,0012") == "ដក▁មួយរយ▁ម្ភៃបី▁ក្បៀស▁សូន្យ▁សូន្យ▁ដប់ពីរ"

## Number - Ordinals
# A number with an English ordinal suffix (st / rd / th) becomes "ទី" + "▁" +
# the Khmer cardinal word. A bare number without a suffix is returned
# unchanged (last example).
assert tha.ordinals.processor("5th") == "ទី▁ប្រាំ"
assert tha.ordinals.processor("3rd") == "ទី▁បី"
assert tha.ordinals.processor("1st") == "ទី▁មួយ"
assert tha.ordinals.processor("10th") == "ទី▁ដប់"
assert tha.ordinals.processor("10") == "10"

## Number - Currency
# "$" is verbalized as "ដុល្លារ" and "៛" as "រៀល"; the symbol may precede or
# follow the amount.
# For "$100.01" the fraction is read as cents ("មួយសេន"); for riel amounts the
# fraction is read like a decimal, using "ចុច" and per-digit leading zeros.
# NOTE(review): "$100" yields "មួយរយ▁ដុល្លារ" (with "▁") while "100$" and
# "$100.01" yield "មួយរយដុល្លារ" (without) -- confirm this difference is the
# library's real output and not a typo in the example.
assert tha.currency.processor("$100.01") == "មួយរយដុល្លារ▁មួយសេន"
assert tha.currency.processor("$100") == "មួយរយ▁ដុល្លារ"
assert tha.currency.processor("100$") == "មួយរយដុល្លារ"
assert tha.currency.processor("100៛") == "មួយរយរៀល"
assert tha.currency.processor("100.32៛") == "មួយរយ▁ចុច▁សាមសិបពីររៀល"
assert tha.currency.processor("100.0032៛") == "មួយរយ▁ចុច▁សូន្យ▁សូន្យ▁សាមសិបពីររៀល"

## Parenthesis
# The parenthesized span is removed together with its brackets, and a single
# space is left between the surrounding words.
assert tha.parenthesis.processor("Hello (this will be ignored) world") == "Hello world"


## Iteration Mark
def fake_tokenizer(_):
  """Stub word tokenizer used for demonstration.

  The argument is ignored; every call returns a fresh list holding the same
  five Khmer tokens, in the same order.
  """
  fixed_tokens = ("គាត់", "បាន", "ទៅ", "បន្តិច", "ម្ដង")
  return list(fixed_tokens)


# The Khmer repetition sign "ៗ" in the input is replaced by "▁" followed by a
# copy of the text being repeated. With the tokens supplied by
# `fake_tokenizer`, the repeated text is "បន្តិចម្ដង", i.e. the last two tokens.
# NOTE(review): how the processor decides how many tokens to repeat is not
# visible here -- confirm against tha.repeater.
assert (
  tha.repeater.processor("គាត់បានទៅបន្តិចម្ដងៗហើយ", tokenizer=fake_tokenizer)
  == "គាត់បានទៅបន្តិចម្ដង▁បន្តិចម្ដងហើយ"
)

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

tha-0.1.1.tar.gz (18.9 kB view details)

Uploaded Source

File details

Details for the file tha-0.1.1.tar.gz.

File metadata

  • Download URL: tha-0.1.1.tar.gz
  • Upload date:
  • Size: 18.9 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/5.1.0 CPython/3.8.19

File hashes

Hashes for tha-0.1.1.tar.gz
Algorithm Hash digest
SHA256 b12bd820b4913c2b1f879a52ccd9766d344c52703594f5cb516ab8595e022a21
MD5 81a85f07b5ca07378877b7b7b4ce92c6
BLAKE2b-256 1b002538b4c3b5a9a585b4391bc67546a720e78f8e7e64d44ee63eb419527351

See more details on using hashes here.

Provenance

Supported by

AWS (Cloud computing and Security Sponsor), Datadog (Monitoring), Fastly (CDN), Google (Download Analytics), Microsoft (PSF Sponsor), Pingdom (Monitoring), Sentry (Error logging), StatusPage (Status page)