Python bindings for ICU4C
Project description
icupy
Python bindings for ICU4C using pybind11.
Changes from ICU4C
-
Naming Conventions
Renamed functions, methods, and enums to conform to PEP 8.
- Function Names: use `lower_case_with_underscores` style.
- Method Names: use `lower_case_with_underscores` style. Also, use one leading underscore only for protected methods.
- C++ Enum Member Names: use `UPPER_CASE_WITH_UNDERSCORES` style without a leading "k". (e.g., `kDateOffset` → `DATE_OFFSET`)
- APIs that match Python reserved words: e.g., `with()` → `with_()`
-
Error Handling
-
ICU C/C++ API errors are raised as `icupy.icu.ICUError` exceptions. The underlying `UErrorCode` can be retrieved from the `error_code` attribute of the exception.
For example:
from icupy import icu try: pass # Call ICU API here... except icu.ICUError as e: print(e.error_code) # → icu.ErrorCode print(e.error_code.get()) # → icu.UErrorCode
-
Examples
-
icu::UnicodeString with predefined error callback function
# from Unicode to codepage from icupy import icu cnv = icu.ucnv_open("iso8859-1") context = icu.ConstVoidPtr(icu.UCNV_ESCAPE_C) # \uXXXX action = icu.UConverterFromUCallback(icu.UCNV_FROM_U_CALLBACK_ESCAPE, context) old_action = icu.ucnv_set_from_u_call_back(cnv, action) s = icu.UnicodeString("A€B") s.extract(cnv) # → b'A\\u20ACB'
# from codepage to Unicode from icupy import icu cnv = icu.ucnv_open("Shift-JIS") context = icu.ConstVoidPtr(icu.UCNV_ESCAPE_XML_HEX) # &#xXXXX; action = icu.UConverterToUCallback(icu.UCNV_TO_U_CALLBACK_ESCAPE, context) old_action = icu.ucnv_set_to_u_call_back(cnv, action) src = b"\x61\xeb\x40\x62" # 0xeb 0x40: UNASSIGNED SEQUENCE s = icu.UnicodeString(src, -1, cnv) str(s) # → 'aë@b'
-
icu::UnicodeString with custom error callback function
# from Unicode to codepage from icupy import icu from icupy.utils import gc def from_unicode_cb( options: object, args: icu.UConverterFromUnicodeArgs, code_units: str, length: int, code_point: int, reason: icu.UConverterCallbackReason, error_code: icu.ErrorCode, ) -> None: _ = options, length, code_point # unused if reason in [icu.UCNV_UNASSIGNED, icu.UCNV_ILLEGAL, icu.UCNV_IRREGULAR]: error_code.set(icu.U_ZERO_ERROR) source = "".join(f"\\u{ord(c):04x}" for c in code_units) icu.ucnv_cb_from_u_write_bytes(args, source, len(source), 0) with gc(icu.ucnv_open("iso8859-1"), icu.ucnv_close) as cnv: action = icu.UConverterFromUCallback(from_unicode_cb) old_action = icu.ucnv_set_from_u_call_back(cnv, action) s = icu.UnicodeString("A€B") s.extract(cnv) # → b'A\\u20acB'
# from codepage to Unicode from icupy import icu from icupy.utils import gc def to_unicode_cb( options: object, args: icu.UConverterToUnicodeArgs, code_units: bytes, length: int, reason: icu.UConverterCallbackReason, error_code: icu.ErrorCode, ) -> None: _ = options, length # unused if reason in [icu.UCNV_UNASSIGNED, icu.UCNV_ILLEGAL, icu.UCNV_IRREGULAR]: error_code.set(icu.U_ZERO_ERROR) source = "".join(f"%{b:02X}" for b in code_units) icu.ucnv_cb_to_u_write_uchars(args, source, len(source), 0) with gc(icu.ucnv_open("Shift-JIS"), icu.ucnv_close) as cnv: action = icu.UConverterToUCallback(to_unicode_cb) old_action = icu.ucnv_set_to_u_call_back(cnv, action) src = b"\x61\xeb\x40\x62" # 0xeb 0x40: UNASSIGNED SEQUENCE s = icu.UnicodeString(src, -1, cnv) str(s) # → 'a%EB%40b'
-
icu::BreakIterator for word-breaks
from icupy import icu bi = icu.BreakIterator.create_word_instance("en_US") src = icu.UnicodeString("Alice was beginning to get very tired of sitting by her sister on the bank.") bi.set_text(src) result = [] start = bi.first() while (end := bi.next()) != icu.BreakIterator.DONE: if bi.get_rule_status() != icu.UBRK_WORD_NONE: result.append(src[start:end]) start = end # result: ['Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank']
-
Natural sort (human-friendly sorting)
from icupy import icu coll = icu.Collator.create_instance("en_US") coll.set_attribute(icu.UCOL_NUMERIC_COLLATION, icu.UCOL_ON) data = ["file1.txt", "file10.txt", "file2.txt", "file20.txt", "file3.txt"] sorted(data, key=coll.get_sort_key) # ['file1.txt', 'file2.txt', 'file3.txt', 'file10.txt', 'file20.txt']
-
icu::IDNA (UTS #46)
from icupy import icu uts46 = icu.IDNA.create_uts46_instance(icu.UIDNA_DEFAULT | icu.UIDNA_CHECK_BIDI | icu.UIDNA_CHECK_CONTEXTJ) dest = icu.UnicodeString() info = icu.IDNAInfo() # a + ZERO WIDTH NON-JOINER + b.com uts46.name_to_ascii("a\u200cb.com", dest, info) # → 'xn--ab-j1t.com' bool(info.get_errors() & icu.UIDNA_ERROR_BIDI) # → False bool(info.get_errors() & icu.UIDNA_ERROR_CONTEXTJ) # → True
-
icu::number::NumberFormatter (ICU 60+)
from icupy import icu from icupy.icu import number template = ( number.NumberFormatter.with_() .notation(number.Notation.compact_short()) .unit(icu.CurrencyUnit("EUR")) .precision(number.Precision.max_significant_digits(2)) ) template.locale("en_US").format_int(1234).to_string() # "€1.2K" in en-US
-
icu::RegexMatcher::find with custom callback function
from icupy import icu src = icu.UnicodeString("aaaaaaaaaaaaaaaaaaab") matcher = icu.RegexMatcher("((.)\\2)x", src, 0) def progress_callback(options: dict[str, int], match_index: int) -> bool: if not isinstance(options, dict): return False calls = options.get("numCalls", 0) + 1 options["numCalls"] = calls options["lastIndex"] = match_index max_calls = options.get("maxCalls", -1) return True if max_calls < 0 else calls < max_calls info = {} context = icu.ConstVoidPtr(info) callback = icu.URegexFindProgressCallback(progress_callback, context) matcher.set_find_progress_callback(callback) matcher.find(0) # → False # info: {'numCalls': 18, 'lastIndex': 18} info.clear() info["maxCalls"] = 5 matcher.find(0) # → ICUError: U_REGEX_STOPPED_BY_CALLER # info: {'maxCalls': 5, 'numCalls': 5, 'lastIndex': 5}
-
icu::number::SimpleNumberFormatter (ICU 73+)
from icupy import icu from icupy.icu import number fmt = number.SimpleNumberFormatter.for_locale_and_grouping_strategy("de-CH", icu.UNUM_GROUPING_ON_ALIGNED) fmtval = fmt.format_int64(1234567) fmtval.to_string() # → "1'234'567"
Installation
Prerequisites
- Python >=3.10
- ICU4C (ICU - The International Components for Unicode) (>=70 recommended)
- C++17 compatible compiler (see Supported Compilers)
- CMake >=3.15
Installing prerequisites
-
Windows:
Install the following dependencies:
- Python >=3.10
- Pre-built ICU4C binary package (>=70 recommended)
- C++17 compatible compiler. Visual Studio 2022 or newer recommended
- CMake >=3.15
- Note: Add CMake to the system PATH.
-
Linux:
To install dependencies, run the following command:
-
Ubuntu/Debian:
sudo apt install g++ cmake libicu-dev python3-dev python3-pip
-
Fedora:
sudo dnf install gcc-c++ cmake icu libicu-devel python3-devel
Note: If your system's ICU is out of date, consider building ICU4C from source or installing a pre-built ICU4C binary package.
-
Installing icupy
-
Configuring environment variables
-
Windows:
-
Set the `ICU_ROOT` environment variable to the root of the ICU installation. For example, if ICU is located in `C:\icu4c`:
in PowerShell:
$env:ICU_ROOT = "C:\icu4c"
or in Command Prompt:
set ICU_ROOT=C:\icu4c
-
To verify the settings using `icuinfo` (64-bit):
in PowerShell:
& $env:ICU_ROOT\bin64\icuinfo
or in Command Prompt:
%ICU_ROOT%\bin64\icuinfo
-
-
Linux:
-
If ICU is installed in a non-standard location, set the `PKG_CONFIG_PATH` and `LD_LIBRARY_PATH` environment variables. For example, if ICU is located in `/usr/local`:
export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-
To verify the settings using `pkg-config`:
pkg-config --cflags --libs icu-uc # -I/usr/local/include -L/usr/local/lib -licuuc
-
-
-
Installing from PyPI
pip install icupy
Optionally, CMake environment variables are available. For example, using the Ninja build system and Clang:
CMAKE_GENERATOR=Ninja CXX=clang++ pip install icupy
Alternatively, install the development version from the Git repository:
pip install git+https://github.com/miute/icupy.git
Usage
-
Configuring environment variables
-
Windows:
-
Set the `ICU_ROOT` environment variable to the root of the ICU installation (default is `C:\icu`). For example, if ICU is located in `C:\icu4c`:
in PowerShell:
$env:ICU_ROOT = "C:\icu4c"
or in Command Prompt:
set ICU_ROOT=C:\icu4c
-
-
Linux:
-
If ICU is installed in a non-standard location, set the `LD_LIBRARY_PATH` environment variable. For example, if ICU is located in `/usr/local`:
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-
-
-
Using icupy
import icupy.icu as icu # or from icupy import icu
License
This project is licensed under the MIT License.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
File details
Details for the file icupy-0.23.0.tar.gz.
File metadata
- Download URL: icupy-0.23.0.tar.gz
- Upload date:
- Size: 507.4 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/6.2.0 CPython/3.14.0
File hashes
| Algorithm | Hash digest |
|---|---|
| SHA256 | 1bad65795cfb5550fe06825003f4fd9ef353f7a5ab63b5dd24bc4dc4770de50b |
| MD5 | 2174d81b63a4099d39222519a5e39427 |
| BLAKE2b-256 | c914ac38f657a1f062da721e5d4c95da983c417650b048d797823864b5e4a02f |