Tools for dkpro cassis
Project description
Dkpro cassis tools
Toolkit for managing UIMA CAS XMI files.
Install
pip install -U git+https://github.com/pangeamt/dkpro_cassis_tools
Load cas from a zip file
from dkpro_cassis_tools import load_cas_from_zip_file
with open('cas.zip', 'rb') as f:
    cas = load_cas_from_zip_file(f)
Save cas to a zip file
from dkpro_cassis_tools import dump_cas_to_zip_file
with open('cas.zip', 'wb') as f:
    dump_cas_to_zip_file(cas, f)
Restore cas segmentation by newline
from dkpro_cassis_tools import load_cas_from_zip_file
from dkpro_cassis_tools import restore_cas_segmentation_by_newline
from dkpro_cassis_tools import dump_cas_to_zip_file
# Open the cas
with open('cas.zip', 'rb') as f:
    cas = load_cas_from_zip_file(f)
# Restore segmentation
re_segmented_cas = restore_cas_segmentation_by_newline(cas)
# Save it
with open('re_segmented_cas.zip', 'wb') as f:
    dump_cas_to_zip_file(re_segmented_cas, f)
Combine sentences from one or more cas
from dkpro_cassis_tools import load_cas_from_zip_file
from dkpro_cassis_tools import dump_cas_to_zip_file
from dkpro_cassis_tools import create_cas_from_sentences
from dkpro_cassis_tools import SENTENCE_NS
sentences = []
# Extract some sentences from cas1
with open('cas1.zip', 'rb') as f:
    cas1 = load_cas_from_zip_file(f)
for sentence in cas1.select(SENTENCE_NS):
    if len(sentence.get_covered_text()) > 10:
        sentences.append((cas1, sentence))
# Extract some sentences from cas2
with open('cas2.zip', 'rb') as f:
    cas2 = load_cas_from_zip_file(f)
for sentence in cas2.select(SENTENCE_NS):
    if len(sentence.get_covered_text()) > 10:
        sentences.append((cas2, sentence))
# Create the new cas
new_cas = create_cas_from_sentences(sentences)
# Save it
with open('new_cas.zip', 'wb') as f:
    dump_cas_to_zip_file(new_cas, f)
Tokenize cas
from typing import List
import MeCab
from dkpro_cassis_tools import load_cas_from_zip_file
from dkpro_cassis_tools import tokenize_cas
wakati = MeCab.Tagger("-Owakati")
def tokenize(text: str) -> List[str]:
    return wakati.parse(text).split()
with open('data/cas_tokenize.zip', 'rb') as f:
    cas = load_cas_from_zip_file(f)
mecab_tokenized_cas = tokenize_cas(cas, tokenize)
Project details
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Close
Hashes for dkpro_cassis_tools-0.0.1-py3-none-any.whl

Algorithm | Hash digest
---|---
SHA256 | f04c993fd2ac3604e1e960c751e9f0dc2d20576060b772392aed89c200ac5181
MD5 | 157de4bdf8e21c68be3bb4b95668535f
BLAKE2b-256 | 18d43cb2c96c31706f88afbef9b6620fb25bf9c2311128184b2708987ac5e231