Skip to main content

A Python library for normalizing Dhivehi text and converting numbers to Dhivehi text format, supporting written, spoken and year forms

Project description

dv-normalize

A Python library for normalizing Dhivehi text by converting numbers to Dhivehi and standardizing sentence endings.

Features

  • Converts numbers to Dhivehi text (both written and spoken forms)
  • Handles years
  • Handles decimal numbers
  • Normalizes formal sentence endings to colloquial form
  • Preserves proper spacing and punctuation
  • Converts time to Dhivehi format
  • Processes full text with mixed content (numbers, years, times, and sentences)

Installation

pip install dv-normalize

Usage

The library provides several main components for different types of conversions:

1. Number Conversion

from dv_normalize.dv_numbers import DhivehiNumberConverter

# Basic number conversion
result = DhivehiNumberConverter.convert(232)

# Large numbers
result = DhivehiNumberConverter.convert(7878787874151545121545454)

# Negative numbers
result = DhivehiNumberConverter.convert(-21)

2. Time Conversion

from dv_normalize.dv_time import DhivehiTimeConverter

# Convert time to Dhivehi
result = DhivehiTimeConverter.convert("14:30")
result = DhivehiTimeConverter.convert("01:30")
result = DhivehiTimeConverter.convert("00:00")

3. Year Conversion

from dv_normalize.dv_years import DhivehiYearConverter

# Convert years to Dhivehi
result = DhivehiYearConverter.convert(1960)
result = DhivehiYearConverter.convert(2023)

4. Text Processing

from dv_normalize.dv_sentence import DhivehiTextProcessor

# Create processor instance
processor = DhivehiTextProcessor()

# Process individual sentences
result = processor.spoken_dv("ވަކި ލާރިން ވެސް 232.23 ލާރި ހޯދައެވެ")

# Process full text with mixed content
sample_text = """
އިބްރާހިމް އަކީ 1982 އިން 2024 ވަނަ އަހަރާއި ހަމައަށް ވަޒީފާގައި އުޅުން މީހެކެވެ.
"""
result = processor.process_full_text(sample_text)

# Process text as separate sentences
sentence_results = processor.process_text(sample_text)
for result in sentence_results:
    print(f"Raw: {result.raw}")
    print(f"Processed: {result.processed}")
    print(f"Length: {result.length}")

Test Cases

""" Dhivehi Number Converter """
from dv_normalize.dv_numbers import DhivehiNumberConverter
# Test cases for numbers
if __name__ == "__main__":
    test_cases = [
        (0, "ސުމެއް"),
        (1, "އެކެއް"),
        (10, "ދިހައެއް"),
        (15, "ފަނަރަ"),
        (20, "ވިހި"),
        (21, "އެކާވީސް"),
        (22, "ބާވީސް"),
        (25, "ފަންސަވީސް"),
        (29, "ނަވާވީސް"),
        (30, "ތިރީސް"),
        (100, "އެއް ސަތޭކަ"),
        (101, "އެއް ސަތޭކަ އެކެއް"),
        (110, "އެއް ސަތޭކަ ދިހައެއް"),
        (115, "އެއް ސަތޭކަ ފަނަރަ"),
        (121, "އެއް ސަތޭކަ އެކާވީސް"),
        (1000, "އެއް ހާސް"),
        (1001, "އެއް ހާސް އެކެއް"),
        (1021, "އެއް ހާސް އެކާވީސް"),
        (8988, "އަށް ހާސް ނުވަ ސަތޭކަ އައްޑިހަ އަށެއް"),
        (100000, "އެއް ލައްކަ"),
        (200000, "ދެ ލައްކަ"),
        (955545102, "ނުވަ ސަތޭކަ ފަންސާސް ފަސް މިލިއަން ފަސް ލައްކަ ސާޅީސް ފަސް ހާސް އެއް ސަތޭކަ ދޭއް"),
        (7878787874151545121545454,"ހަތެއް އަށެއް ހަތެއް އަށެއް ހަތެއް އަށެއް ހަތެއް އަށެއް ހަތެއް ހަތަރެއް އެކެއް ފަހެއް އެކެއް ފަހެއް ހަތަރެއް ފަހެއް އެކެއް ދޭއް އެކެއް ފަހެއް ހަތަރެއް ފަހެއް ހަތަރެއް ފަހެއް ހަތަރެއް"),
        (-5, "މައިނަސް ފަހެއް"),
        (-21, "މައިނަސް އެކާވީސް")
    ]
    
    print("Testing Dhivehi number converter with known cases:")
    for number, expected in test_cases:
        result = DhivehiNumberConverter.convert(number)
        print(f"{number}: {result}")
        if result != expected:
            print(f"  Expected: '{expected}'")
            print(f"  Got: '{result}'")
    
""" Dhivehi Time Converter """
from dv_normalize.dv_time import DhivehiTimeConverter

# Test cases for times
if __name__ == "__main__":
    # Comprehensive test with various time formats
    print("Testing Dhivehi time converter (Comprehensive Test):")
    
    # First, test the existing cases
    existing_test_cases = [
        ("14:30", "ސާދަ ގަޑި ތިރީސް"),
        ("01:30", "އެއް ގަޑި ތިރީސް"),
        ("01:21", "އެއް ގަޑި އެކާވީސް"),
        ("00:00", "ބާރަ ގަޑި ސުމެއް"),  # Auto-detected as 12-hour format
        ("23:59", "ތޭވީސް ގަޑި ފަންސާސް ނުވައެއް"),
        ("12:00", "ބާރަ ގަޑި ސުމެއް"),
        ("00:01", "ބާރަ ގަޑި އެކެއް"),  # Auto-detected as 12-hour
        ("05:15", "ފަސް ގަޑި ފަނަރަ"),
        ("10:45", "ދިހަ ގަޑި ސާޅީސް ފަހެއް"),
        ("17:05", "ސަތާރަ ގަޑި ފަހެއް"),
        ("20:30", "ވިހި ގަޑި ތިރީސް"),
        ("18:25", "އަށާރަ ގަޑި ފަންސަވީސް"),
    ]
    
    print("\n=== Testing Original Cases ===")
    for time_str, expected in existing_test_cases:
        result = DhivehiTimeConverter.convert(time_str)
        print(f"{time_str}: {result}")
        if result != expected:
            print(f"  Expected: '{expected}'")
            print(f"  Got: '{result}'")
    
    # Test all hours with 00 minutes
    print("\n=== Testing All Hours with 00 Minutes ===")
    for hour in range(24):
        time_str = f"{hour:02d}:00"
        result = DhivehiTimeConverter.convert(time_str)
        print(f"{time_str}: {result}")
    
    # Test hour 12 with all minutes (covers standard hour)
    print("\n=== Testing Hour 12 with Various Minutes ===")
    for minute in range(0, 60, 5):  # Test every 5 minutes for brevity
        time_str = f"12:{minute:02d}"
        result = DhivehiTimeConverter.convert(time_str)
        print(f"{time_str}: {result}")
    
    # Test special minute values (covering all tens places)
    print("\n=== Testing Various Hours with Special Minute Values ===")
    special_minutes = [0, 1, 10, 11, 19, 20, 21, 25, 29, 30, 40, 50, 55, 59]
    for minute in special_minutes:
        # Test with different hours to cover both 12-hour and 24-hour formats
        for hour in [0, 1, 12, 23]:
            time_str = f"{hour:02d}:{minute:02d}"
            result = DhivehiTimeConverter.convert(time_str)
            print(f"{time_str}: {result}")
    
    # Test for invalid inputs
    print("\n=== Testing Invalid Inputs ===")
    invalid_inputs = ["24:00", "12:60", "abc", "12:xx", "24:60", "-1:30"]
    for invalid in invalid_inputs:
        result = DhivehiTimeConverter.convert(invalid)
        print(f"{invalid}: {result}")
    
    print("\nTime Testing completed!")

""" Dhivehi Year Converter """
from dv_normalize.dv_years import DhivehiYearConverter

# Test cases for years
if __name__ == "__main__":
    year_test_cases = [
        (1960, "ނަވާރަސަތޭކަ ފަސްދޮޅަސް"),
        (2000, "ދެހާސް"),
        (2023, "ދެހާސް ތޭވީސް"),
        (1492, "ސާދަސަތޭކަ ނުވަދިހަ ދޭއް"),
        (1985, "ނަވާރަސަތޭކަ އައްޑިހަ ފަސް"),
        (1800, "އަށާރަސަތޭކަ"),
        (1234, "ބާރަސަތޭކަ ތިރީސް ހަތަރު"),
        (2525, "ދެހާސް ފަސްސަތޭކަ ފަންސަވީސް"),
        (1066, "އެއްހާސް ފަސްދޮޅަސް ހަ"),
        (622, "ހަސަތޭކަ ބާވީސް"),
        (50, "ފަންސާސް"),
        (-44, "ސާޅީސް ހަތަރު ކުރީގެ"),
    ]
    
    print("Testing Dhivehi year converter:")
    for year, expected in year_test_cases:
        result = DhivehiYearConverter.convert(year)
        print(f"{year}: {result}")
        if result != expected:
            print(f"  Expected: '{expected}'")
            print(f"  Got: '{result}'")
    
""" Dhivehi Sentence Converter """
from dv_normalize.dv_sentence import DhivehiTextProcessor

# Test cases for sentences

"""Demo function to test the processor."""
# Sample Dhivehi text for testing
sample_text = """
އިބްރާހިމް އަކީ 1982 އިން 2024 ވަނަ އަހަރާއި ހަމައަށް ވަޒީފާގައި އުޅުން މީހެކެވެ. މިގޮތުން ކޮންމެ ދުވަހަކު ހެދުނު 08:00 އިން ފަށައިގެ ގޮސް ހަވީރު 16:00 އަށް އޮފީހުގައި އުޅުމަށް ފަހު ގެޔަށް ދިޔުމަށް ނުކުމެއެވެ. އަދި ހެލެމެޓް ގަދަކަމުން ބޮލަށް ފައްތާލައިގެން، ބާރު ސްޕީޑްގައި ގެޔަށް ނައްޓާލައެވެ. ބައެއް ފަހަރުގަ ކ.އަތޮޅު ވިލިނގިލިން ނައްޓައިލަގެން ގުޅީފަޅަށް 15 މިނެޓްތެރޭ ދާއިރު ސްޕީޑް ހުންނަނީ 120 ކިލޯ މީޓަރު ބާރު މިނުގައެވެ. މިސޮރު މިހެން އުޅެ ކޮންމެ މަހަކު 52،1092 ރުފިޔާ ގެޔަށް ގެންދެއެވެ. އިތުރު މަސައްކަތް ކޮށްގެން 9982711 ރ މިވަރަށް ހޯދާ ކަމަށް ވެއެވެ. ވަކި ލާރިން ވެސް 232.23 ލާރި ހޯދައެވެ
"""

# Create processor
processor = DhivehiTextProcessor()

# Process as sentences
print("Processing as sentences:")
print("-" * 50)
sentence_results = processor.process_text(sample_text)
print(f"Processed {len(sentence_results)} sentences:")
for idx, result in enumerate(sentence_results, 1):
    print(f"\nSentence {idx}:")
    print(f"Raw: {result.raw}")
    print(f"Processed: {result.processed}")
    print(f"Length: {result.length}")

# Process as full text
print("\nProcessing as full text:")
print("-" * 50)
full_text_result = processor.process_full_text(sample_text)
print("Processed full text:")
print(full_text_result)


# Seprate alias test
print("Testing alias functions:")
print("-" * 50)
print("Testing spoken_dv:")
print(processor.spoken_dv("ވަކި ލާރިން ވެސް 232.23 ލާރި ހޯދައެވެ"))
print(processor.spoken_dv("ވަކި ލާރިން ވެސް 232,23 ލާރި ހޯދައެވެ"))
print(processor.spoken_dv("މި މަހުގެ 11ވަނަ ދުވަހު ބާއްވަން ނިންމައިފި"))
print(processor.spoken_dv("ޣައްޒާގެ މުސްލިމުންނާ ދެކޮޅަށް މީގެ ތޭރަ މަސް ކުރިން ޔަހޫދީ ފައުޖުން ފެށި ލާއިންސާނީ ޢުދުވާނުގައި ޝަހީދު ކޮށްލާފައިވާ މީހުންގެ އަދަދު ވަނީ 20:00އަށްވުރެ މައްޗަށް އުފުލިފައިވޭ"))
print(processor.spoken_dv("މީގެ ކުރިން އެ މަގާމުގައި ހުންނެވީ ޖެނުއަރީ  ނުވައެއް  2024ގައި އެ މަގާމަށް އައްޔަނުކުރެއްވި އެ ކުންފުނީގެ މެނޭޖިން ޑިރެކްޓަރުކަން ވެސް ކުރައްވާފައިވާ މުހައްމަދު މިހާދު"))

print("Testing int_to_dv:")
print(processor.int_to_dv("232"))
print(processor.int_to_dv("7878787874151545121545454"))

Example Outputs

Numbers

  • 0 → ސުމެއް
  • 1 → އެކެއް
  • 10 → ދިހައެއް
  • 100 → އެއް ސަތޭކަ
  • 1000 → އެއް ހާސް

Time

  • 14:30 → ސާދަ ގަޑި ތިރީސް
  • 01:30 → އެއް ގަޑި ތިރީސް
  • 00:00 → ބާރަ ގަޑި ސުމެއް

Years

  • 1960 → ނަވާރަސަތޭކަ ފަސްދޮޅަސް
  • 2023 → ދެހާސް ތޭވީސް
  • 1492 → ސާދަސަތޭކަ ނުވަދިހަ ދޭއް

Known Issues

  • Not all sentence endings are normalized.

License

This project is licensed under the MIT License.

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

dv-normalizer-0.1.2.tar.gz (20.0 kB view details)

Uploaded Source

Built Distribution

If you're not sure about the file name format, learn more about wheel file names.

dv_normalizer-0.1.2-py3-none-any.whl (17.9 kB view details)

Uploaded Python 3

File details

Details for the file dv-normalizer-0.1.2.tar.gz.

File metadata

  • Download URL: dv-normalizer-0.1.2.tar.gz
  • Upload date:
  • Size: 20.0 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/5.1.1 CPython/3.10.14

File hashes

Hashes for dv-normalizer-0.1.2.tar.gz
Algorithm Hash digest
SHA256 352593383d4615a7f8cbe9d6ec77faee9031ad660bffd2144b38f697ecb976ac
MD5 6d372f5a5bab62e785269d92b535f0ca
BLAKE2b-256 550981b79c1736fb7839e94fd2b62ccd898d2bbdc49ecfd7a345ab4193dbfec1

See more details on using hashes here.

File details

Details for the file dv_normalizer-0.1.2-py3-none-any.whl.

File metadata

  • Download URL: dv_normalizer-0.1.2-py3-none-any.whl
  • Upload date:
  • Size: 17.9 kB
  • Tags: Python 3
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/5.1.1 CPython/3.10.14

File hashes

Hashes for dv_normalizer-0.1.2-py3-none-any.whl
Algorithm Hash digest
SHA256 52d8e3141b05b9c92440bba1ecbe3e40e65319f556be062e2358473b4f3a0a96
MD5 59f83942c5017e2f4267e8ac4d962783
BLAKE2b-256 ade177909a1c343a2b29b91c7c567da25ef3e7955663c70f510182c95e32f972

See more details on using hashes here.

Supported by

AWS Cloud computing and Security Sponsor Datadog Monitoring Depot Continuous Integration Fastly CDN Google Download Analytics Pingdom Monitoring Sentry Error logging StatusPage Status page