A local, NLTK-based text processing worker for the cjm-plugin-system that provides sentence splitting and tokenization with character-level span tracking.
Project description
cjm-text-plugin-nltk
Install
pip install cjm_text_plugin_nltk
Project Structure
nbs/
├── meta.ipynb # Metadata introspection for the NLTK text plugin used by cjm-ctl to generate the registration manifest.
└── plugin.ipynb # Plugin implementation for NLTK-based text processing with character-level span tracking
Total: 2 notebooks
Module Dependencies
graph LR
meta[meta<br/>Metadata]
plugin[plugin<br/>NLTK Plugin]
No cross-module dependencies detected.
CLI Reference
No CLI commands found in this project.
Module Overview
Detailed documentation for each module in the project:
Metadata (meta.ipynb)
Metadata introspection for the NLTK text plugin used by cjm-ctl to generate the registration manifest.
Import
from cjm_text_plugin_nltk.meta import (
get_plugin_metadata
)
Functions
def get_plugin_metadata() -> Dict[str, Any]: # Plugin metadata for manifest generation
"""Return metadata required to register this plugin with the PluginManager."""
# Fallback base path (current behavior for backward compatibility)
base_path = os.path.dirname(os.path.dirname(sys.executable))
# Use CJM config if available, else fallback to env-relative paths
cjm_data_dir = os.environ.get("CJM_DATA_DIR")
# Plugin data directory
plugin_name = "cjm-text-plugin-nltk"
if cjm_data_dir
"Return metadata required to register this plugin with the PluginManager."
NLTK Plugin (plugin.ipynb)
Plugin implementation for NLTK-based text processing with character-level span tracking
Import
from cjm_text_plugin_nltk.plugin import (
NLTKPluginConfig,
NLTKPlugin
)
Classes
@dataclass
class NLTKPluginConfig:
"Configuration for NLTK text processing plugin."
tokenizer: str = field(...)
language: str = field(...)
class NLTKPlugin:
"NLTK-based text processing plugin with character-level span tracking."
def __init__(self):
"""Initialize the NLTK plugin."""
self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")
self.config: NLTKPluginConfig = None
"Initialize the NLTK plugin."
def name(self) -> str: # Plugin name identifier
"""Get the plugin name identifier."""
return "nltk_text"
"Get the plugin name identifier."
@property
def version(self) -> str: # Plugin version string
"""Get the plugin version string."""
return "1.0.0"
"Get the plugin version string."
def get_current_config(self) -> Dict[str, Any]: # Current configuration as dictionary
"""Return current configuration state."""
if not self.config
"Return current configuration state."
def get_config_schema(self) -> Dict[str, Any]: # JSON Schema for configuration
"""Return JSON Schema for UI generation."""
return dataclass_to_jsonschema(NLTKPluginConfig)
"Return JSON Schema for UI generation."
@staticmethod
def get_config_dataclass() -> NLTKPluginConfig: # Configuration dataclass
"""Return dataclass describing the plugin's configuration options."""
return NLTKPluginConfig
"Return dataclass describing the plugin's configuration options."
def _ensure_nltk_data(self) -> None
def initialize(
self,
config: Optional[Any] = None # Configuration dataclass, dict, or None
) -> None
"Initialize or re-configure the plugin (idempotent)."
def execute(
self,
action: str = "split_sentences", # Operation: 'split_sentences'
**kwargs
) -> Dict[str, Any]: # JSON-serializable result
"Execute a text processing operation."
def split_sentences(
self,
text: str, # Input text to split into sentences
**kwargs
) -> TextProcessResult: # Result with TextSpan objects containing character indices
"Split text into sentence spans with accurate character positions."
def cleanup(self) -> None
"Clean up resources."
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Filter files by name, interpreter, ABI, and platform.
If you're not sure about the file name format, learn more about wheel file names.
Copy a direct link to the current filters
File details
Details for the file cjm_text_plugin_nltk-0.0.1.tar.gz.
File metadata
- Download URL: cjm_text_plugin_nltk-0.0.1.tar.gz
- Upload date:
- Size: 11.8 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/6.2.0 CPython/3.11.14
File hashes
| Algorithm | Hash digest | |
|---|---|---|
| SHA256 | 4f3671f9535ca795ad40a071d512bc1f850071740bf5e44c5ce685750cd35d40 | |
| MD5 | a968bfbfadb8e6cc8e8396db36653859 | |
| BLAKE2b-256 | 24894d1d8e9ea22f0e40edd8457fe665c431f3438386525341812c1c4c8a2792 | |
File details
Details for the file cjm_text_plugin_nltk-0.0.1-py3-none-any.whl.
File metadata
- Download URL: cjm_text_plugin_nltk-0.0.1-py3-none-any.whl
- Upload date:
- Size: 11.9 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/6.2.0 CPython/3.11.14
File hashes
| Algorithm | Hash digest | |
|---|---|---|
| SHA256 | c5db3dc34bd262e1ea0d28ad73fee72df527aaa217cbd6bd949313790b3d05ab | |
| MD5 | 4dd7c983cb9c1d7263ca1a1126378838 | |
| BLAKE2b-256 | 2d9f80b72fd5754348ae17b17200c53f38edf28c145d75cbd2a14aa8bac9f312 | |