Skip to main content

A Khmer Text Normalization and Verbalization Toolkit.

Project description

Tha (ថា)

Khmer Text Normalization and Verbalization Toolkit.

Install

pip install tha
import tha.normalize
import tha.phone_numbers
import tha.urls
import tha.datetime
import tha.hashtags
import tha.ascii_lines
import tha.license_plate
import tha.cardinals
import tha.decimals
import tha.ordinals
import tha.currency
import tha.parenthesis
import tha.repeater

## Normalize
assert tha.normalize.processor("មិន\u200bឲ្យ") == "មិនឱ្យ"

## Phone Numbers
# As the expected outputs below show, the leading "0" and the next two digits
# are emitted as their own groups, and the remaining digits are split into
# groups of `chunk_size`; in the last example the leftover digit is folded
# into the final group ("1234") rather than emitted alone.
assert tha.phone_numbers.processor("010123123", chunk_size=2) == "0▁10▁12▁31▁23"
assert tha.phone_numbers.processor("010123123", chunk_size=3) == "0▁10▁123▁123"
assert tha.phone_numbers.processor("0961231234", chunk_size=3) == "0▁96▁123▁1234"

## URLs and emails
assert tha.urls.processor("example@gmail.com") == "example at g▁mail dot com"
assert tha.urls.processor("https://google.com") == "google dot com"
assert tha.urls.processor("http://google.com") == "google dot com"
assert tha.urls.processor("google.com") == "google dot com"
assert tha.urls.processor("google.gov.kh") == "google dot gov dot k▁h"
assert tha.urls.processor("google.com.kh") == "google dot com dot k▁h"

## Time
assert tha.datetime.time_processor("10:23AM") == "10 23▁A▁M"
assert tha.datetime.time_processor("10:23PM") == "10 23▁P▁M"
assert tha.datetime.time_processor("1:23PM") == "1 23▁P▁M"

## Date
assert tha.datetime.date_processor("2024-01-02") == "2024 01 02"
assert tha.datetime.date_processor("01-02-2034") == "01 02 2034"

## Hashtags
assert (
  tha.hashtags.processor("Hello world #this_will_remove hello") == "Hello world  hello"
)
assert tha.hashtags.processor("Hello world #លុប hello") == "Hello world  hello"
assert tha.hashtags.processor("Hello world #លុប1234 hello") == "Hello world  hello"

## ASCII Lines
assert tha.ascii_lines.processor("Remove --- asdasd") == "Remove  asdasd"
assert tha.ascii_lines.processor("Remove\n###\nasdasd") == "Remove\n\nasdasd"

## Cambodia License Plate
assert tha.license_plate.processor("1A 1234") == "1 A 12▁34"
assert tha.license_plate.processor("1A 4444") == "1 A ការ៉េ4"

## Number - Cardinals
assert tha.cardinals.processor("1234") == "មួយពាន់▁ពីររយ▁សាមសិបបួន"
assert tha.cardinals.processor("1") == "មួយ"
assert tha.cardinals.processor("1▁2") == "មួយ▁ពីរ"
assert tha.cardinals.processor("-1") == "ដក▁មួយ"
assert tha.cardinals.processor("10") == "ដប់"
assert tha.cardinals.processor("15") == "ដប់ប្រាំ"
assert tha.cardinals.processor("100") == "មួយរយ"
assert tha.cardinals.processor("10000") == "មួយម៉ឺន"
assert tha.cardinals.processor("10000.234") == "មួយម៉ឺន.ពីររយ▁សាមសិបបួន"
assert tha.cardinals.processor("-10000.234") == "ដក▁មួយម៉ឺន.ពីររយ▁សាមសិបបួន"
assert tha.cardinals.processor("-10000,234") == "ដក▁មួយម៉ឺន,ពីររយ▁សាមសិបបួន"

## Number - Decimals
assert tha.decimals.processor("123.324") == "មួយរយ▁ម្ភៃបី▁ចុច▁បីរយ▁ម្ភៃបួន"
assert tha.decimals.processor("123.001") == "មួយរយ▁ម្ភៃបី▁ចុច▁សូន្យ▁សូន្យ▁មួយ"
assert tha.decimals.processor("-123.0012") == "ដក▁មួយរយ▁ម្ភៃបី▁ចុច▁សូន្យ▁សូន្យ▁ដប់ពីរ"
assert tha.decimals.processor("-123,0012") == "ដក▁មួយរយ▁ម្ភៃបី▁ក្បៀស▁សូន្យ▁សូន្យ▁ដប់ពីរ"

## Number - Ordinals
assert tha.ordinals.processor("5th") == "ទី▁ប្រាំ"
assert tha.ordinals.processor("3rd") == "ទី▁បី"
assert tha.ordinals.processor("1st") == "ទី▁មួយ"
assert tha.ordinals.processor("10th") == "ទី▁ដប់"
assert tha.ordinals.processor("10") == "10"

## Number - Currency
assert tha.currency.processor("$100.01") == "មួយរយដុល្លារ▁មួយសេន"
assert tha.currency.processor("$100") == "មួយរយ▁ដុល្លារ"
assert tha.currency.processor("100$") == "មួយរយដុល្លារ"
assert tha.currency.processor("100៛") == "មួយរយរៀល"
assert tha.currency.processor("100.32៛") == "មួយរយ▁ចុច▁សាមសិបពីររៀល"
assert tha.currency.processor("100.0032៛") == "មួយរយ▁ចុច▁សូន្យ▁សូន្យ▁សាមសិបពីររៀល"

## Parenthesis
assert tha.parenthesis.processor("Hello (this will be ignored) world") == "Hello world"


## Iteration Mark
def fake_tokenizer(_):
  """Stub tokenizer for the example: ignores its argument and returns a fixed token list."""
  tokens = ("គាត់", "បាន", "ទៅ", "បន្តិច", "ម្ដង")
  # Build a new list on every call so callers never share one mutable object.
  return list(tokens)


# The tokenizer is supplied by the caller through `tokenizer=`; this example
# passes the fixed-output stub `fake_tokenizer` so the result is deterministic.
# Per the expected output, the iteration mark "ៗ" is removed and the text in
# front of it appears twice, joined by "▁".
assert (
  tha.repeater.processor("គាត់បានទៅបន្តិចម្ដងៗហើយ", tokenizer=fake_tokenizer)
  == "គាត់បានទៅបន្តិចម្ដង▁បន្តិចម្ដងហើយ"
)

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

tha-0.1.4.tar.gz (18.7 kB view details)

Uploaded Source

Built Distribution

tha-0.1.4-py3-none-any.whl (20.8 kB view details)

Uploaded Python 3

File details

Details for the file tha-0.1.4.tar.gz.

File metadata

  • Download URL: tha-0.1.4.tar.gz
  • Upload date:
  • Size: 18.7 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/5.1.0 CPython/3.8.19

File hashes

Hashes for tha-0.1.4.tar.gz
Algorithm Hash digest
SHA256 99d2b24f486ada99093ae76119bd6612fbb1b9c56fb726b6f390d0cacc2ced10
MD5 f1d1182ff0ceb961f8046ebb28161974
BLAKE2b-256 7abf2894b3da61b6bd0c47e87134feacae37f4d51bd19473393d033f4a6c4c66

See more details on using hashes here.

File details

Details for the file tha-0.1.4-py3-none-any.whl.

File metadata

  • Download URL: tha-0.1.4-py3-none-any.whl
  • Upload date:
  • Size: 20.8 kB
  • Tags: Python 3
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/5.1.0 CPython/3.8.19

File hashes

Hashes for tha-0.1.4-py3-none-any.whl
Algorithm Hash digest
SHA256 b3ab5fcdc980eefe2e9a9c4cb7c53a8ffeb967da02ea226849f473e2d198629c
MD5 02282262368e63a0129bb4d31d69d9dd
BLAKE2b-256 08cce4af05d25ca325082a33c4b37812733ab89aca0fd5c753b820d753bc0130

See more details on using hashes here.

Supported by

AWS (Cloud computing and Security Sponsor), Datadog (Monitoring), Fastly (CDN), Google (Download Analytics), Microsoft (PSF Sponsor), Pingdom (Monitoring), Sentry (Error logging), StatusPage (Status page)