Python package with CText C++ extension
Project description
CText
Modern C++ text processing library
https://github.com/antonmilev/CText
Python Reference
To install CText:
pip install ctextlib
To use CText in Python script:
from ctextlib import CTextA as text
a = text("Hello World")
print(a)
Python methods reference:
addToFileName
a = text("C:\\Temp\\Temp2\\File.bmp")
a.addToFileName("_mask")
print(a)
C:\Temp\Temp2\File_mask.bmp
append
a = text("Hello ")
a.append("World")
Hello World
a = text("123")
a.append('4',4)
1234444
a = text("")
a.append(['Hello', ' ', 'World'])
Hello World
appendRange
a = text()
a.appendRange('a','z').appendRange('0','9')
abcdefghijklmnopqrstuvwxyz0123456789
between
a = text('The quick brown fox jumps over the lazy dog')
a.between('q','d')
print(a)
uick brown fox jumps over the lazy
a = text('The quick brown fox jumps over the lazy dog')
a.between('quick','lazy')
print(a)
brown fox jumps over the
contain
a = text('The quick brown fox jumps over the lazy dog')
if a.contain('quick') :
print("contain 'quick'")
contain 'quick'
Case-insensitive
a = text('The quick brown fox jumps over the lazy dog')
if a.contain('Quick', False) :
print("contain 'quick'")
contain 'quick'
a = text('The quick brown fox jumps over the lazy dog')
if a.contain(['slow','fast','quick']):
print("contain 'quick'")
contain 'quick'
containAny
a = text('Hello World')
a.containAny('abcd')
True
containOnly
a = text('4365767')
a.containOnly('0123456789')
True
convertToHex
a = text("Hello World")
a.convertToHex()
print(a)
48 65 6C 6C 6F 20 57 6F 72 6C 64
count
a = text('The quick brown fox jumps over the lazy dog')
a.count('the', False)
2
countWordFrequencies
from ctextlib import CTextA as text
a = text("The quick brown fox jumps over the lazy dog")
a.countWordFrequencies(False)
[(2, 'the'), (1, 'brown'), (1, 'dog'), (1, 'fox'), (1, 'jumps'), (1, 'lazy'), (1, 'over'), (1, 'quick')]
cutAfterFirst
a = text('The quick brown fox jumps over the lazy dog')
a.cutAfterFirst('o')
The quick br
cutAfterLast
a = text('The quick brown fox jumps over the lazy dog')
a.cutAfterLast('o')
The quick brown fox jumps over the lazy d
cutBeforeFirst
a = text('The quick brown fox jumps over the lazy dog')
a.cutBeforeFirst('o')
own fox jumps over the lazy dog
cutEnds
a = text('The quick brown fox jumps over the lazy dog')
a.cutEnds(4)
quick brown fox jumps over the lazy
cutLeft
s = text("Hello World")
s.cutLeft(6)
World
cutRight
s = text("Hello World")
s.cutRight(6)
Hello
enclose
a = text("Hello World")
a.enclose('<','>')
<Hello World>
endsWith
a = text("Hello World")
if a.endsWith('World'):
print("ends with 'World'")
ends with 'World'
With case-insensitive search:
a = text("Hello World")
if a.endsWith('world', False):
print("ends with 'world'")
ends with 'world'
endsWithAny
if(a.endsWithAny(['cat','dog'])):
print('end to animal...')
end to animal...
erase
a = text('The quick brown fox jumps over the lazy dog')
a.erase(8, 10)
print(a)
The quicx jumps over the lazy dog
equal
a = text()
a.equal('A',10)
AAAAAAAAAA
find
a = text('The quick brown fox jumps over the lazy dog')
a.find('brown')
'brown fox jumps over the lazy dog'
With case-insensitive search:
a = text('The quick brown fox jumps over the lazy dog')
a.find('Brown', False)
'brown fox jumps over the lazy dog'
fromArray
a = text()
a.fromArray([1,2,3,4])
print(a)
1 2 3 4
a = text()
a.fromArray([1,2,3,4], '|')
print(a)
1|2|3|4
a = text()
a.fromArray([1,2,3,4], '')
print(a)
1234
Array of floats
a = text()
a.fromArray([1.1,2.2,3.3,4.4])
print(a)
1.1 2.2 3.3 4.4
Array of strings
a = text()
a.fromArray(['hello','world'])
print(a)
hello world
import numpy as np
a = text()
a.fromArray(np.array(["hello","world"]))
print(a)
hello world
fromArrayAsHex
a = text()
a.fromArrayAsHex([10,20,30,40])
print(a)
0A 14 1E 28
Use without separator
a.fromArrayAsHex([10,20,30,40],2,'')
print(a)
0A141E28
a = text()
a.fromArrayAsHex([1000,2000,3000,4000])
print(a)
3E8 7D0 BB8 FA0
a = text()
a.fromArrayAsHex([1000,2000,3000,4000], 4, ',')
print(a)
03E8,07D0,0BB8,0FA0
fromBinary
a = text()
a.fromBinary(12345)
print(a)
00000000000000000011000000111001
fromDouble
a = text()
a.fromDouble(3.333338478)
print(a)
a.fromDouble(3.33989, 4)
print(a)
a.fromDouble(3.333338478, 10)
3.333338
3.3399
3.3333384780
fromHex
a = text()
a.fromHex(1234567)
a.fromHex('a')
0012D687
61
fromInteger
a = text()
a.fromInteger(358764)
print(a)
358764
fromMatrix
from ctextlib import CTextA as text
import numpy as np
x = np.array([[10, 20, 30], [40, 50, 60]])
a = text()
a.fromMatrix(x)
print(a)
10 20 30
40 50 60
from ctextlib import CTextA as text
import numpy as np
x = np.array([[10, 20, 30], [40, 50, 60]])
a = text()
a.fromMatrix(x, ',')
10,20,30
40,50,60
fromMatrixAsHex
from ctextlib import CTextA as text
import numpy as np
x = np.array([[10, 20, 30], [40, 50, 60]])
a = text()
a.fromMatrixAsHex(x)
print(a)
0A 14 1E
28 32 3C
from ctextlib import CTextA as text
import numpy as np
x = np.array([[1000, 2000, 3000], [4000, 5000, 6000]])
a = text()
a.fromMatrixAsHex(x,4)
print(a)
03E8 07D0 0BB8
0FA0 1388 1770
getDir
a = text("D:\\Folder\\SubFolder\\TEXT\\file.dat")
a.getDir()
D:\Folder\SubFolder\TEXT\
getExtension
a = text("D:\\Folder\\SubFolder\\TEXT\\file.dat")
a.getExtension()
'.dat'
getFileName
a = text("D:\\Folder\\SubFolder\\TEXT\\file.dat")
a.getFileName()
'file.dat'
hash
s.hash()
9257130453210036571
indexOf
a = text("The quick brown fox jumps over the lazy dog.")
a.indexOf("brown")
10
indexOfAny
a = text("The quick brown fox jumps over the lazy dog.")
a.indexOfAny(["fox", "dog"])
16
indexOfAny
a = text("The quick brown fox jumps over the lazy dog.")
a.indexOfAny("abc")
7
insert
a = text("abc")
a.insert(1,'d',2)
addbc
a = text("The quick jumps over the lazy dog.")
a.insert(10,"fox ")
The quick fox jumps over the lazy dog.
insertAtBegin
insertAtEnd
a = text("Hello")
a.insertAtBegin("<begin>")
a.insertAtEnd("</begin>")
<begin>Hello</begin>
isAlpha
a = text("Abcd")
a.isAlpha()
True
isBinary
a = text("01111011100001")
a.isBinary()
True
isEmpty
a = text()
a.isEmpty()
True
isHexNumber
a = text("12AB56FE")
a.isHexNumber()
True
isNumber
a = text("123456")
a.isNumber()
True
isLower
a = text("hello world")
a.isLower()
True
isUpper
a = text("HELLO WORLD")
a.isUpper()
True
isPalindrome
a = text("racecar")
a.isPalindrome()
True
keep
s = text("Hello World").keep(3,5)
lo Wo
keepLeft
a = text("The quick jumps over the lazy dog.")
a.keepLeft(10)
The quick
keepRight
a = text("The quick jumps over the lazy dog.")
a.keepRight(10)
lazy dog.
lastIndexOf
s = text("Hello World")
s.lastIndexOf('l')
9
lines
a = text("L1\nL2\n\nL3\nL4\n \n\nL5")
a.lines()
['L1', 'L2', 'L3', 'L4', 'L5']
linesCount
a = text("L1\nL2\n\nL3\nL4\n \n\nL5")
a.linesCount()
7
linesRemoveEmpty
a = text("L1\nL2\n\nL3\nL4\n \n\nL5")
a.linesRemoveEmpty()
print(a)
L1
L2
L3
L4
L5
Several per-line methods
linesAppend
linesInsertAtBegin
linesSort
linesPaddRight
linesTrim
Example of opening a text file, sorting all of its lines, and saving it under another name
from ctextlib import CTextA as text
s = text()
s.readFile('Unordered.txt')
s.linesSort()
s.writeFile('Sorted_python.txt')
limit
s = text("Hello World")
s.limit(6)
Hello
lower
s = text("Hello World")
s.lower()
hello world
makeUnique
a = text()
a.appendRange('a','z').appendRange('a','z')
abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
a.makeUnique()
print(a)
abcdefghijklmnopqrstuvwxyz
mid
a = text("Hello World").mid(3)
lo Wo
nextLine
# Example of iterating all lines
from ctextlib import CTextA as text
a = text("Line1\nLine2\nLine3")
line = text()
pos = 0
while(pos >= 0):
pos = a.nextLine(pos,line)
print(line)
Line1
Line2
Line3
nextWord
# Example of iterating all words
from ctextlib import CTextA as text
a = text('The quick brown fox jumps over the lazy dog')
word = text()
pos = 0
while(pos >= 0):
pos = a.nextWord(pos,word)
print(word)
The
quick
brown
fox
jumps
over
the
lazy
dog
paddLeft
s = text("Abra")
s.paddLeft('.', 16)
............Abra
paddRight
s = text("Abra")
s.paddRight('.', 16)
Abra............
pathCombine
a = text("C:\\Temp")
a.pathCombine("..\\Folder")
C:\Folder
quote
a = text("Hello")
a.quote()
"Hello"
random
a = text()
a.random()
"P1kAlMiG2Kb7FzP5"
a.sort()
"1257AFGKMPPbiklz"
a.shuffle()
"k2lF7KAPG5M1Pzbi"
a.random(32)
P1kAlMiG2Kb7FzP5tM1QBI6DSS92c31A
randomAlpha
s = text()
s.randomAlpha()
IkEffmzNiMKKASVW
randomNumber
s = text()
s.randomNumber()
3892795431
s.randomNumber(32)
33341138742779319865028602486509
readFile
# demonstrates how to read a whole text file
from ctextlib import CTextA as text
a = text()
a.readFile('test.txt')
print(a)
Hello World
regexMatch
s = text("+336587890078")
if(s.regexMatch("(\\+|-)?[[:digit:]]+")):
print("it is a number")
it is a number
regexLines
animals.txt
------------
Cat
Dog
Giraffe
Lion
Llama
Monkey
Mouse
Parrot
Poodle
Scorpion
Snake
Weasel
# collect all lines starting with given characters
from ctextlib import CTextA as text
a = text()
a.readFile("animals.txt")
a.regexLines("^[A-G][a-z]+")
['Cat', 'Dog', 'Giraffe']
regexReplace
from ctextlib import CTextA as text
a = text("there is sub-sequence in the sub-way string")
a.regexReplace("\\b(sub)([^ ]*)", "sub-$2")
there is sub--sequence in the sub--way string
regexSearch
# collect all words using regex
from ctextlib import CTextA as text
a = text("The quick brown fox jumps over the lazy dog")
a.regexSearch("\\w+")
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
regexWords
# collect all words starting with given characters
from ctextlib import CTextA as text
a = text("The quick brown fox jumps over the lazy dog")
a.regexWords("^[a-n][a-z]+")
['brown', 'fox', 'jumps', 'lazy', 'dog']
remove
a = text('we few, we happy few, we band of brothers.')
a.remove('we')
a.reduceChain()
a.trim()
few happy few band of brothers
removeAny
from ctextlib import CTextA as text
a = text('The quick brown fox jumps over the lazy dog')
a.removeAny(['brown','quick','lazy'])
a.reduceChain()
The fox jumps over the dog
removeExtension
a = text("D:\\Folder\\SubFolder\\TEXT\\File.dat")
a.removeExtension()
D:\Folder\SubFolder\TEXT\File
removeFileName
a = text("D:\\Folder\\SubFolder\\TEXT\\File.dat")
a.removeFileName()
D:\Folder\SubFolder\TEXT\
removeWhileBegins
a = text("Some text ending with something")
a.removeWhileBegins("Some text ")
print(a)
ending with something
removeWhileEnds
a = text("Some text ending with something")
a.removeWhileEnds(" something")
print(a)
Some text ending with
replace
a = text("The quick brown fox jumps over the lazy dog")
a.replace("fox", "cat")
print(a)
The quick brown cat jumps over the lazy dog
a = text("The quick brown fox jumps over the lazy dog")
a.replace(["fox", "cat","dog","quick"], "-")
The ----- brown --- jumps over the lazy ---
replaceAny
a = text("The quick brown fox jumps over the lazy dog")
a.replaceAny(["fox", "cat","dog"], "***")
print(a)
The quick brown *** jumps over the lazy ***
a = text("The quick brown fox jumps over the lazy dog")
a.replaceAny(["fox", "dog"], ["dog", "fox"])
The quick brown dog jumps over the lazy fox
reverse
a = text("Hello")
a.reverse()
olleH
right
a = text("Hello World")
a.right(5)
World
rotate
a = text("Hello World")
a.rotateLeft(2)
a.rotateRight(4)
Output
llo WorldHe
ldHello Wor
split
# by default split uses the standard separators (" \t\r\n")
a = text("The quick brown fox jumps over the lazy dog")
a.split()
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
# split can be used with any list of separator characters
a = text("The quick, brown....fox,,, ,jumps over,the lazy.dog")
a.split(",. ")
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
toBinary
bOk = False
a = text("100001")
a.toBinaryNumber(bOk)
33
toHexNumber
bOk = False
a = text("1E1E")
a.toHexNumber(bOk)
7710
trim
a = text(" \t\n lazy dog \t\n ")
a.trim()
lazy dog
a = text("000000000000101")
a.trimLeft("0")
101
a = text("101000000000000")
a.trimRight('0')
101
a = text("0000000101000000000")
a.trim("0")
101
upper
s = text("Hello World")
s.upper()
HELLO WORLD
words
a = text("The quick brown fox jumps over the lazy dog")
a.words()
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
a = text("The|quick|brown|fox|jumps|over|the|lazy|dog")
a.words('|')
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
wordsCapitalize
a = text("The quick brown fox jumps over the lazy dog")
a.wordsCapitalize()
The Quick Brown Fox Jumps Over The Lazy Dog
wordsCount
a = text('The quick brown fox jumps over the lazy dog')
a.wordsCount()
9
wordsEnclose
a = text("The quick brown fox jumps over the lazy dog")
a.wordsEnclose('[',']')
[The] [quick] [brown] [fox] [jumps] [over] [the] [lazy] [dog]
wordsReverse
a = text("The quick brown fox jumps over the lazy dog")
a.wordsReverse()
ehT kciuq nworb xof spmuj revo eht yzal god
wordsSort
a = text('The quick brown fox jumps over the lazy dog')
a.wordsSort()
Output
The brown dog fox jumps lazy over quick the
writeFile
# demonstrates how to write a whole text file
from ctextlib import CTextA as text
a = text("Hello World")
a.writeFile('test.txt')
print(a)
For the full list type help(ctextlib).
Performance Tests
Compared to Python's built-in text handling, CText methods are in many cases faster, sometimes 2-3 times faster. Compared to Python regular expressions, the difference is even bigger.
Several Python performance tests and the results obtained are given below.
Test 1 - words enclose
from time import perf_counter
from ctextlib import CTextA as text
import re
import urllib.request
# download words.txt from https://github.com/dwyl/english-words
print("download words.txt.....")
url = 'https://github.com/dwyl/english-words/raw/master/words.txt'
urllib.request.urlretrieve(url, 'words.txt')
a = text()
if(a.readFile("words.txt") == False):
print("error openning file")
exit()
start = perf_counter()
a.wordsEnclose('[',']')
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt CText wordsEnclose", duration * 1000))
if(a.readFile("words.txt") == False):
print("error openning file")
exit()
s = a.str()
start = perf_counter()
b = ' '.join('[{}]'.format(word) for word in s.split('\n'))
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt Python split-join", duration * 1000))
s = a.str()
start = perf_counter()
s = re.sub(r'(\w+)',r'[\1]',s)
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt Python re.sub", duration * 1000))
Output
words.txt CText wordsEnclose took 92.083 ms
words.txt Python split-join took 186.377 ms
words.txt Python re.sub took 601.214 ms
Test 2 - words reverse
from time import perf_counter
from ctextlib import CTextA as text
import re
import urllib.request
# download words.txt from https://github.com/dwyl/english-words
print("download words.txt.....")
url = 'https://github.com/dwyl/english-words/raw/master/words.txt'
urllib.request.urlretrieve(url, 'words.txt')
a = text()
if(a.readFile("words.txt") == False):
print("error openning file")
exit()
start = perf_counter()
a.wordsReverse()
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt CText wordsReverse", duration * 1000))
if(a.readFile("words.txt") == False):
print("error openning file")
exit()
s = a.str()
start = perf_counter()
b = ' '.join( word[::-1] for word in (s.split('\n')))
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt Python reverse split-join", duration * 1000))
start = perf_counter()
words = ' '.join( word[::-1] for word in ( re.findall('\w+|[:;,.!?]', s)))
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt Python reverse re.findall", duration * 1000))
words.txt CText wordsReverse took 78.501 ms
words.txt Python reverse split-join took 130.286 ms
words.txt Python reverse re.findall took 609.706 ms
Test 3 - remove empty lines
from time import perf_counter
from ctextlib import CTextA as text
import re
import urllib.request
# download from https://www.gutenberg.org/files/2600/2600-0.txt
print("download 2600-0.txt.....")
url = 'https://www.gutenberg.org/files/2600/2600-0.txt'
urllib.request.urlretrieve(url, '2600-0.txt')
a = text()
if(a.readFile("2600-0.txt") == False):
print("error opening file")
exit()
s = a.str()
start = perf_counter()
a.linesRemoveEmpty()
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("linesRemoveEmpty", duration * 1000))
#print(a)
start = perf_counter()
b = '\n'.join(line for line in s.split('\n') if line.strip() != '')
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("line.strip 18362.txt", duration * 1000))
linesRemoveEmpty took 11.599 ms
line.strip took 31.567 ms
TODO List
- More methods for words, lines, sentences and complex expressions: There are many more methods that can be added to support different NLP and lexical tasks.
- Further improve containers abstraction: CText needs more conversion routines to/from STL and other containers and generic data structures.
- Regular Expressions: Partial or full support for regular expressions.
- Other char types: Character types like char_32 can also be supported.
- Mini Text Editor: This is a text editor based on CText that I plan to port to Modern C++.
- Export to Python: I want to export the CText library to Python 3.
- Performance Test: Add performance tests comparing with STL string.
- Use Tries for large keyword lists: This will greatly optimize search in large lists of words.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Hashes for ctextlib-1.0.5-cp37-cp37m-win_amd64.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 36ba550febbcbc6166ef34ba75a770400d075616a41ca55be548986f8a718b4d |
|
MD5 | 271537246b7dce4e0f83fd84ab3a39aa |
|
BLAKE2b-256 | dca4565d2936d1481b043ed7efd8b7b4868c28bac0e7a2d16ffa8d3d033ab232 |