Python package with CText C++ extension
Project description
CText
Modern C++ text processing library
https://github.com/antonmilev/CText
Python Reference
To install CText:
pip install ctextlib
To use CText in a Python script:
from ctextlib import CTextA as text
a = text("Hello World")
print(a)
Python methods reference:
addToFileName
a = text("C:\\Temp\\Temp2\\File.bmp")
a.addToFileName("_mask")
print(a)
C:\Temp\Temp2\File_mask.bmp
append
a = text("Hello ")
a.append("World")
Hello World
a = text("123")
a.append('4',4)
1234444
a = text("")
a.append(['Hello', ' ', 'World'])
Hello World
appendRange
a = text()
a.appendRange('a','z').appendRange('0','9')
abcdefghijklmnopqrstuvwxyz0123456789
between
a = text('The quick brown fox jumps over the lazy dog')
a.between('q','d')
print(a)
uick brown fox jumps over the lazy
a = text('The quick brown fox jumps over the lazy dog')
a.between('quick','lazy')
print(a)
brown fox jumps over the
contain
a = text('The quick brown fox jumps over the lazy dog')
if a.contain('quick') :
print("contain 'quick'")
contain 'quick'
Case-insensitive
a = text('The quick brown fox jumps over the lazy dog')
if a.contain('Quick', False) :
print("contain 'quick'")
contain 'quick'
a = text('The quick brown fox jumps over the lazy dog')
if a.contain(['slow','fast','quick']):
print("contain 'quick'")
contain 'quick'
containAny
a = text('Hello World')
a.containAny('abcd')
True
containOnly
a = text('4365767')
a.containOnly('0123456789')
True
convertToHex
a = text("Hello World")
a.convertToHex()
print(a)
48 65 6C 6C 6F 20 57 6F 72 6C 64
count
a = text('The quick brown fox jumps over the lazy dog')
a.count('the', False)
2
countWordFrequencies
from ctextlib import CTextA as text
a = text("The quick brown fox jumps over the lazy dog")
a.countWordFrequencies(False)
[(2, 'the'), (1, 'brown'), (1, 'dog'), (1, 'fox'), (1, 'jumps'), (1, 'lazy'), (1, 'over'), (1, 'quick')]
cutAfterFirst
s = text('The quick brown fox jumps over the lazy dog')
s.cutAfterFirst('o')
The quick br
cutAfterLast
s = text('The quick brown fox jumps over the lazy dog')
s.cutAfterLast('o')
The quick brown fox jumps over the lazy d
cutBeforeFirst
s = text('The quick brown fox jumps over the lazy dog')
s.cutBeforeFirst('o')
own fox jumps over the lazy dog
cutEnds
s = text('The quick brown fox jumps over the lazy dog')
s.cutEnds(4)
quick brown fox jumps over the lazy
cutLeft
s = text("Hello World")
s.cutLeft(6)
World
cutRight
s = text("Hello World")
s.cutRight(6)
Hello
enclose
a = text("Hello World")
a.enclose('<','>')
<Hello World>
endsWith
a = text("Hello World")
if a.endsWith('World'):
print("ends with 'World'")
ends with 'World'
With case-insensitive search:
a = text("Hello World")
if a.endsWith('world', False):
print("ends with 'world'")
ends with 'world'
endsWithAny
if(a.endsWithAny(['cat','dog'])):
print('end to animal...')
end to animal...
erase
a = text('The quick brown fox jumps over the lazy dog')
a.erase(8, 10)
print(a)
The quicx jumps over the lazy dog
equal
a = text()
a.equal('A',10)
AAAAAAAAAA
find
a = text('The quick brown fox jumps over the lazy dog')
a.find('brown')
'brown fox jumps over the lazy dog'
With case-insensitive search:
a = text('The quick brown fox jumps over the lazy dog')
a.find('Brown', False)
'brown fox jumps over the lazy dog'
fromArray
a = text()
a.fromArray([1,2,3,4])
print(a)
1 2 3 4
a = text()
a.fromArray([1,2,3,4], '|')
print(a)
1|2|3|4
a = text()
a.fromArray([1,2,3,4], '')
print(a)
1234
Array of floats
a = text()
a.fromArray([1.1,2.2,3.3,4.4])
print(a)
1.1 2.2 3.3 4.4
Array of strings
a = text()
a.fromArray(['hello','world'])
print(a)
hello world
import numpy as np
a = text()
a.fromArray(np.array(["hello","world"]))
print(a)
hello world
fromArrayAsHex
a = text()
a.fromArrayAsHex([10,20,30,40])
print(a)
0A 14 1E 28
Use without separator
a.fromArrayAsHex([10,20,30,40],2,'')
print(a)
0A141E28
a = text()
a.fromArrayAsHex([1000,2000,3000,4000])
print(a)
3E8 7D0 BB8 FA0
a = text()
a.fromArrayAsHex([1000,2000,3000,4000], 4, ',')
print(a)
03E8,07D0,0BB8,0FA0
fromBinary
a = text()
a.fromBinary(12345)
print(a)
00000000000000000011000000111001
fromDouble
a = text()
a.fromDouble(3.333338478)
print(a)
a.fromDouble(3.33989, 4)
print(a)
a.fromDouble(3.333338478, 10)
3.333338
3.3399
3.3333384780
fromHex
a = text()
a.fromHex(1234567)
a.fromHex('a')
0012D687
61
fromInteger
a = text()
a.fromInteger(358764)
print(a)
358764
fromMatrix
from ctextlib import CTextA as text
import numpy as np
x = np.array([[10, 20, 30], [40, 50, 60]])
a = text()
a.fromMatrix(x)
print(a)
10 20 30
40 50 60
from ctextlib import CTextA as text
import numpy as np
x = np.array([[10, 20, 30], [40, 50, 60]])
a = text()
a.fromMatrix(x, ',')
10,20,30
40,50,60
fromMatrixAsHex
from ctextlib import CTextA as text
import numpy as np
x = np.array([[10, 20, 30], [40, 50, 60]])
a = text()
a.fromMatrixAsHex(x)
print(a)
0A 14 1E
28 32 3C
from ctextlib import CTextA as text
import numpy as np
x = np.array([[1000, 2000, 3000], [4000, 5000, 6000]])
a = text()
a.fromMatrixAsHex(x,4)
print(a)
03E8 07D0 0BB8
0FA0 1388 1770
getDir
a = text("D:\\Folder\\SubFolder\\TEXT\\file.dat")
a.getDir()
D:\Folder\SubFolder\TEXT\
getExtension
a = text("D:\\Folder\\SubFolder\\TEXT\\file.dat")
a.getExtension()
'.dat'
getFileName
a = text("D:\\Folder\\SubFolder\\TEXT\\file.dat")
a.getFileName()
'file.dat'
hash
s.hash()
9257130453210036571
indexOf
a = text("The quick brown fox jumps over the lazy dog.")
a.indexOf("brown")
10
indexOfAny
a = text("The quick brown fox jumps over the lazy dog.")
a.indexOfAny(["fox", "dog"])
16
indexOfAny
a = text("The quick brown fox jumps over the lazy dog.")
a.indexOfAny("abc")
7
insert
a = text("abc")
a.insert(1,'d',2)
addbc
a = text("The quick jumps over the lazy dog.")
a.insert(10,"fox ")
The quick fox jumps over the lazy dog.
insertAtBegin
insertAtEnd
a = text("Hello")
a.insertAtBegin("<begin>")
a.insertAtEnd("</begin>")
<begin>Hello</begin>
isAlpha
a = text("Abcd")
a.isAlpha()
True
isBinary
a = text("01111011100001")
a.isBinary()
True
isEmpty
a = text()
a.isEmpty()
True
isHexNumber
a = text("12AB56FE")
a.isHexNumber()
True
isNumber
a = text("123456")
a.isNumber()
True
isLower
a = text("hello world")
a.isLower()
True
isUpper
a = text("HELLO WORLD")
a.isUpper()
True
isPalindrome
a = text("racecar")
a.isPalindrome()
True
keep
s = text("Hello World").keep(3,5)
lo Wo
keepLeft
a = text("The quick jumps over the lazy dog.")
a.keepLeft(10)
The quick
keepRight
a = text("The quick jumps over the lazy dog.")
a.keepRight(10)
lazy dog.
lastIndexOf
s = text("Hello World")
s.lastIndexOf('l')
9
lines
a = text("L1\nL2\n\nL3\nL4\n \n\nL5")
a.lines()
['L1', 'L2', 'L3', 'L4', 'L5']
linesCount
a = text("L1\nL2\n\nL3\nL4\n \n\nL5")
a.linesCount()
7
linesRemoveEmpty
a = text("L1\nL2\n\nL3\nL4\n \n\nL5")
a.linesRemoveEmpty()
print(a)
L1
L2
L3
L4
L5
Several per-line methods
linesAppend
linesInsertAtBegin
linesSort
linesPaddRight
linesTrim
Example of opening a text file, sorting all lines, and saving it under another name
from ctextlib import CTextA as text
s = text()
s.readFile('Unordered.txt')
s.linesSort()
s.writeFile('Sorted_python.txt')
limit
s = text("Hello World")
s.limit(6)
Hello
lower
s = text("Hello World")
s.lower()
hello world
makeUnique
a = text()
a.appendRange('a','z').appendRange('a','z')
abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
a.makeUnique()
print(a)
abcdefghijklmnopqrstuvwxyz
mid
a = text("Hello World").mid(3)
lo Wo
nextLine
# Example of iterating all lines
from ctextlib import CTextA as text
a = text("Line1\nLine2\nLine3")
line = text()
pos = 0
while(pos >= 0):
pos = a.nextLine(pos,line)
print(line)
Line1
Line2
Line3
nextWord
# Example of iterating all words
from ctextlib import CTextA as text
a = text('The quick brown fox jumps over the lazy dog')
word = text()
pos = 0
while(pos >= 0):
pos = a.nextWord(pos,word)
print(word)
The
quick
brown
fox
jumps
over
the
lazy
dog
paddLeft
s = text("Abra")
s.paddLeft('.', 16)
............Abra
paddRight
s = text("Abra")
s.paddRight('.', 16)
Abra............
pathCombine
a = text("C:\\Temp")
a.pathCombine("..\\Folder")
C:\Folder
quote
a = text("Hello")
a.quote()
"Hello"
random
a = text()
a.random()
"P1kAlMiG2Kb7FzP5"
a.sort()
"1257AFGKMPPbiklz"
a.shuffle()
"k2lF7KAPG5M1Pzbi"
a.random(32)
P1kAlMiG2Kb7FzP5tM1QBI6DSS92c31A
randomAlpha
s = text()
s.randomAlpha()
IkEffmzNiMKKASVW
randomNumber
s = text()
s.randomNumber()
3892795431
s.randomNumber(32)
33341138742779319865028602486509
readFile
# demonstrates how to read a whole text file
from ctextlib import CTextA as text
a = text()
a.readFile('test.txt')
print(a)
Hello World
regexMatch
s = text("+336587890078")
if(s.regexMatch("(\\+|-)?[[:digit:]]+")):
print("it is a number")
it is a number
regexLines
animals.txt
------------
Cat
Dog
Giraffe
Lion
Llama
Monkey
Mouse
Parrot
Poodle
Scorpion
Snake
Weasel
# collect all lines starting with given characters
from ctextlib import CTextA as text
a = text()
a.readFile("animals.txt")
a.regexLines("^[A-G][a-z]+")
['Cat', 'Dog', 'Giraffe']
regexReplace
from ctextlib import CTextA as text
a = text("there is sub-sequence in the sub-way string")
a.regexReplace("\\b(sub)([^ ]*)", "sub-$2")
there is sub--sequence in the sub--way string
regexSearch
# collect all words using regex
from ctextlib import CTextA as text
a = text("The quick brown fox jumps over the lazy dog")
a.regexSearch("\\w+")
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
regexWords
# collect all words starting with given characters
from ctextlib import CTextA as text
a = text("The quick brown fox jumps over the lazy dog")
a.regexWords("^[a-n][a-z]+")
['brown', 'fox', 'jumps', 'lazy', 'dog']
remove
a = text('we few, we happy few, we band of brothers.')
a.remove('we')
a.reduceChain()
a.trim()
few happy few band of brothers
removeAny
from ctextlib import CTextA as text
a = text('The quick brown fox jumps over the lazy dog')
a.removeAny(['brown','quick','lazy'])
a.reduceChain()
The fox jumps over the dog
removeExtension
a = text("D:\\Folder\\SubFolder\\TEXT\\File.dat")
a.removeExtension()
D:\Folder\SubFolder\TEXT\File
removeFileName
a = text("D:\\Folder\\SubFolder\\TEXT\\File.dat")
a.removeFileName()
D:\Folder\SubFolder\TEXT\
removeWhileBegins
a = text("Some text ending with something")
a.removeWhileBegins("Some text ")
print(a)
ending with something
removeWhileEnds
a = text("Some text ending with something")
a.removeWhileEnds(" something")
print(a)
Some text ending with
replace
a = text("The quick brown fox jumps over the lazy dog")
a.replace("fox", "cat")
print(a)
The quick brown cat jumps over the lazy dog
a = text("The quick brown fox jumps over the lazy dog")
a.replace(["fox", "cat","dog","quick"], "-")
The ----- brown --- jumps over the lazy ---
replaceAny
a = text("The quick brown fox jumps over the lazy dog")
a.replaceAny(["fox", "cat","dog"], "***")
print(a)
The quick brown *** jumps over the lazy ***
a = text("The quick brown fox jumps over the lazy dog")
a.replaceAny(["fox", "dog"], ["dog", "fox"])
The quick brown dog jumps over the lazy fox
reverse
a = text("Hello")
a.reverse()
olleH
right
a = text("Hello World")
a.right(5)
World
rotate
a = text("Hello World")
a.rotateLeft(2)
a.rotateRight(4)
Output
llo WorldHe
ldHello Wor
split
# by default split uses the standard separators (" \t\r\n")
a = text("The quick brown fox jumps over the lazy dog")
a.split()
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
# split can be used with any list of separator characters
a = text("The quick, brown....fox,,, ,jumps over,the lazy.dog")
a.split(",. ")
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
toBinary
bOk = False
a = text("100001")
a.toBinaryNumber(bOk)
33
toHexNumber
bOk = False
a = text("1E1E")
a.toHexNumber(bOk)
7710
trim
a = text(" \t\n lazy dog \t\n ")
a.trim()
lazy dog
a = text("000000000000101")
a.trimLeft("0")
101
a = text("101000000000000")
a.trimRight('0')
101
a = text("0000000101000000000")
a.trim("0")
101
upper
s = text("Hello World")
s.upper()
HELLO WORLD
words
a = text("The quick brown fox jumps over the lazy dog")
a.words()
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
a = text("The|quick|brown|fox|jumps|over|the|lazy|dog")
a.words('|')
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
wordsCapitalize
a = text("The quick brown fox jumps over the lazy dog")
a.wordsCapitalize()
The Quick Brown Fox Jumps Over The Lazy Dog
wordsCount
a = text('The quick brown fox jumps over the lazy dog')
a.wordsCount()
9
wordsEnclose
a = text("The quick brown fox jumps over the lazy dog")
a.wordsEnclose('[',']')
[The] [quick] [brown] [fox] [jumps] [over] [the] [lazy] [dog]
wordsReverse
a = text("The quick brown fox jumps over the lazy dog")
a.wordsReverse()
ehT kciuq nworb xof spmuj revo eht yzal god
wordsSort
a = text('The quick brown fox jumps over the lazy dog')
a.wordsSort()
Output
The brown dog fox jumps lazy over quick the
writeFile
# demonstrates how to write a whole text file
from ctextlib import CTextA as text
a = text("Hello World")
a.writeFile('test.txt')
print(a)
For the full list, type help(ctextlib).
Performance Tests
Compared to Python's built-in text handling, CText methods are faster in many cases, sometimes 2-3 times faster. When compared with Python regular expressions, the difference is even bigger.
Several Python performance tests and the results obtained are given below.
Test 1 - words enclose
from time import perf_counter
from ctextlib import CTextA as text
import re
import urllib.request
# download words.txt from https://github.com/dwyl/english-words
print("download words.txt.....")
url = 'https://github.com/dwyl/english-words/raw/master/words.txt'
urllib.request.urlretrieve(url, 'words.txt')
a = text()
if(a.readFile("words.txt") == False):
print("error openning file")
exit()
start = perf_counter()
a.wordsEnclose('[',']')
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt CText wordsEnclose", duration * 1000))
if(a.readFile("words.txt") == False):
print("error openning file")
exit()
s = a.str()
start = perf_counter()
b = ' '.join('[{}]'.format(word) for word in s.split('\n'))
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt Python split-join", duration * 1000))
s = a.str()
start = perf_counter()
s = re.sub(r'(\w+)',r'[\1]',s)
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt Python re.sub", duration * 1000))
Output
words.txt CText wordsEnclose took 92.083 ms
words.txt Python split-join took 186.377 ms
words.txt Python re.sub took 601.214 ms
Test 2 - words reverse
from time import perf_counter
from ctextlib import CTextA as text
import re
import urllib.request
# download words.txt from https://github.com/dwyl/english-words
print("download words.txt.....")
url = 'https://github.com/dwyl/english-words/raw/master/words.txt'
urllib.request.urlretrieve(url, 'words.txt')
a = text()
if(a.readFile("words.txt") == False):
print("error openning file")
exit()
start = perf_counter()
a.wordsReverse()
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt CText wordsReverse", duration * 1000))
if(a.readFile("words.txt") == False):
print("error openning file")
exit()
s = a.str()
start = perf_counter()
b = ' '.join( word[::-1] for word in (s.split('\n')))
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt Python reverse split-join", duration * 1000))
start = perf_counter()
words = ' '.join( word[::-1] for word in ( re.findall('\w+|[:;,.!?]', s)))
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt Python reverse re.findall", duration * 1000))
words.txt CText wordsReverse took 78.501 ms
words.txt Python reverse split-join took 130.286 ms
words.txt Python reverse re.findall took 609.706 ms
Test 3 - remove empty lines
from time import perf_counter
from ctextlib import CTextA as text
import re
import urllib.request
# download from https://www.gutenberg.org/files/2600/2600-0.txt
print("download 2600-0.txt.....")
url = 'https://www.gutenberg.org/files/2600/2600-0.txt'
urllib.request.urlretrieve(url, '2600-0.txt')
a = text()
if(a.readFile("2600-0.txt") == False):
print("error opening file")
exit()
s = a.str()
start = perf_counter()
a.linesRemoveEmpty()
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("linesRemoveEmpty", duration * 1000))
#print(a)
start = perf_counter()
b = '\n'.join(line for line in s.split('\n') if line.strip() != '')
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("line.strip 18362.txt", duration * 1000))
linesRemoveEmpty took 11.599 ms
line.strip took 31.567 ms
When comparing CText word-list operations with Python regular expressions, the performance gap becomes much bigger. For example, the CText wordsReplaceAny function is compared with regex.sub below. For managing large word lists, CText uses optimized character tries, and thus the search time is a linear function of the number of words. For replacing the 500 most common English words with a single fixed string in the book War and Peace by Leo Tolstoy (Gutenberg EBook), CText needs 27 times less time than regular expressions; for 1000 words CText becomes more than 50 times faster!
from time import perf_counter
from ctextlib import CTextA as text
import re
import urllib.request
url = 'https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt'
urllib.request.urlretrieve(url, 'words1000.txt')
with open('words1000.txt', 'r') as f:
words = f.read().split('\n')
print(words)
print("replace using CText.....")
a = text()
if(a.readFile("2600-0.txt") == False):
print("error opening file")
exit()
s = a.str()
start = perf_counter()
a.wordsReplaceAny(words, "***")
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("replace 1000 words with CText wordsReplaceAny", duration * 1000))
start = perf_counter()
regex = re.compile(r'\b%s\b' % r'\b|\b'.join(map(re.escape, words)))
s_new = regex.sub("***", s)
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("replace 1000 words with regex.sub", duration * 1000))
replace 1000 words with CText wordsReplaceAny took 77.058 ms
replace 1000 words with regex.sub took 4445.524 ms
Similarly, for wordsReplaceWithChar the difference compared to re.sub is more than 60 times:
# replaces words from the given list with a single character with same words length
from time import perf_counter
from ctextlib import CTextA as text
import re
import urllib.request
url = 'https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt'
urllib.request.urlretrieve(url, 'words1000.txt')
with open('words1000.txt', 'r') as f:
words = f.read().split('\n')[:1000]
print("replace using CText.....")
a = text()
if(a.readFile("2600-0.txt") == False):
print("error opening file")
exit()
s = a.str()
start = perf_counter()
a.wordsReplaceWithChar(words, "-")
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("replace 1000 words with CText wordsReplaceWithChar", duration * 1000))
#print(a)
def repl(m):
return '-' * len(m.group())
start = perf_counter()
regex = re.compile(r'\b%s\b' % r'\b|\b'.join(map(re.escape, words)))
s_new = regex.sub(repl, s)
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("replace 1000 words with regex.sub", duration * 1000))
replace 1000 words with CText wordsReplaceWithChar took 69.136 ms
replace 1000 words with regex.sub took 4225.293 ms
TODO List
- More methods for words, lines, sentences and complex expressions: There are many more methods that can be added to support different NLP and lexical tasks.
- Further improve containers abstraction: CText needs more conversion routines to/from STL and other containers and generic data structures.
- Regular Expressions: Partial or full support for regular expressions.
- Other char types: Character types like char32_t can also be supported.
- Mini Text Editor: This is a text editor based on CText that I plan to port to Modern C++.
- Export to Python: I want to export the CText library to Python 3.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Hashes for ctextlib-1.0.7-cp37-cp37m-win_amd64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | b0ad77e79118d96a919a79d2c9c1cf4398cb4f3aa150270dfafad865451f0166 |
|
MD5 | 01542117e9464c323ebe13a4f64708b9 |
|
BLAKE2b-256 | 373b674c93bac64a0bd7aae16d9dca65f2ef78b2c1fb48356d8b390b1a19c5c4 |