Skip to main content

Python package with CText C++ extension

Project description

CText

Modern C++ text processing library

https://github.com/antonmilev/CText

Python Reference

To install CText:

pip install ctextlib

To use CText in Python script:

from ctextlib import CTextA as text
a = text("Hello World")
print(a)

Python methods reference:

addToFileName

a = text("C:\\Temp\\Temp2\\File.bmp")
a.addToFileName("_mask")
print(a)
C:\Temp\Temp2\File_mask.bmp

append

a = text("Hello ")
a.append("World")
Hello World
a = text("123")
a.append('4',4)
1234444
a = text("")
a.append(['Hello', ' ', 'World'])
Hello World

appendRange

a = text()
a.appendRange('a','z').appendRange('0','9')
abcdefghijklmnopqrstuvwxyz0123456789

between

a = text('The quick brown fox jumps over the lazy dog')
a.between('q','d')
print(a)
uick brown fox jumps over the lazy
a = text('The quick brown fox jumps over the lazy dog')
a.between('quick','lazy')
print(a)
 brown fox jumps over the

contain

a = text('The quick brown fox jumps over the lazy dog')
if a.contain('quick') :
    print("contain 'quick'")
contain 'quick'

Case-insensitive

a = text('The quick brown fox jumps over the lazy dog')
if a.contain('Quick', False) :
    print("contain 'quick'")
contain 'quick'
a = text('The quick brown fox jumps over the lazy dog')
if a.contain(['slow','fast','quick']):
    print("contain 'quick'")
contain 'quick'

containAny

a = text('Hello World')
a.containAny('abcd')
True

containOnly

a = text('4365767')
a.containOnly('0123456789')
True

convertToHex

a = text("Hello World")
a.convertToHex()
print(a)
48 65 6C 6C 6F 20 57 6F 72 6C 64

count

a = text('The quick brown fox jumps over the lazy dog')
a.count('the', False)
2

countWordFrequencies

from ctextlib import CTextA as text
a = text("The quick brown fox jumps over the lazy dog")
a.countWordFrequencies(False)
[(2, 'the'), (1, 'brown'), (1, 'dog'), (1, 'fox'), (1, 'jumps'), (1, 'lazy'), (1, 'over'), (1, 'quick')]

cutAfterFirst

s = text('The quick brown fox jumps over the lazy dog')
s.cutAfterFirst('o')
The quick br

cutAfterLast

s = text('The quick brown fox jumps over the lazy dog')
s.cutAfterLast('o')
The quick brown fox jumps over the lazy d

cutBeforeFirst

s = text('The quick brown fox jumps over the lazy dog')
s.cutBeforeFirst('o')
own fox jumps over the lazy dog

cutEnds

s = text('The quick brown fox jumps over the lazy dog')
s.cutEnds(4)
quick brown fox jumps over the lazy

cutLeft

s = text("Hello World")
s.cutLeft(6)
World

cutRight

s = text("Hello World")
s.cutRight(6)
Hello

enclose

a = text("Hello World")
a.enclose('<','>')
<Hello World>

endsWith

a = text("Hello World")
if a.endsWith('World'):
    print("ends with 'World'")
ends with 'World'

With case-insensitive search:

a = text("Hello World")
if a.endsWith('world', False):
    print("ends with 'world'")
ends with 'world'

endsWithAny

if(a.endsWithAny(['cat','dog'])):
    print('end to animal...')
end to animal...

erase

a = text('The quick brown fox jumps over the lazy dog')
a.erase(8, 10)
print(a)
The quicx jumps over the lazy dog

equal

a = text()
a.equal('A',10)
AAAAAAAAAA

find

a = text('The quick brown fox jumps over the lazy dog')
a.find('brown')
'brown fox jumps over the lazy dog'

With case-insensitive search:

a = text('The quick brown fox jumps over the lazy dog')
a.find('Brown', False)
'brown fox jumps over the lazy dog'

fromArray

a = text()
a.fromArray([1,2,3,4])
print(a)
1 2 3 4
a = text()
a.fromArray([1,2,3,4], '|')
print(a)
1|2|3|4
a = text()
a.fromArray([1,2,3,4], '')
print(a)
1234

Array of floats

a = text()
a.fromArray([1.1,2.2,3.3,4.4])
print(a)
1.1 2.2 3.3 4.4

Array of strings

a = text()
a.fromArray(['hello','world'])
print(a)
hello world
import numpy as np
a = text()
a.fromArray(np.array(["hello","world"]))
print(a)
hello world

fromArrayAsHex

a = text()
a.fromArrayAsHex([10,20,30,40])
print(a)
0A 14 1E 28

Use without separator

a.fromArrayAsHex([10,20,30,40],2,'')
print(a)
0A141E28
a = text()
a.fromArrayAsHex([1000,2000,3000,4000])
print(a)
3E8 7D0 BB8 FA0
a = text()
a.fromArrayAsHex([1000,2000,3000,4000], 4, ',')
print(a)
03E8,07D0,0BB8,0FA0

fromBinary

a = text()
a.fromBinary(12345)
print(a)
00000000000000000011000000111001

fromDouble

a = text()
a.fromDouble(3.333338478)
print(a)
a.fromDouble(3.33989, 4)
print(a)
a.fromDouble(3.333338478, 10)
3.333338
3.3399
3.3333384780

fromHex

a = text()
a.fromHex(1234567)
a.fromHex('a')
0012D687
61

fromInteger

a = text()
a.fromInteger(358764)
print(a)
358764

fromMatrix

from ctextlib import CTextA as text
import numpy as np
x = np.array([[10, 20, 30], [40, 50, 60]])
a = text()
a.fromMatrix(x)
print(a)
10 20 30
40 50 60
from ctextlib import CTextA as text
import numpy as np
x = np.array([[10, 20, 30], [40, 50, 60]])
a = text()
a.fromMatrix(x, ',')
10,20,30
40,50,60

fromMatrixAsHex

from ctextlib import CTextA as text
import numpy as np
x = np.array([[10, 20, 30], [40, 50, 60]])
a = text()
a.fromMatrixAsHex(x)
print(a)
0A 14 1E
28 32 3C
from ctextlib import CTextA as text
import numpy as np
x = np.array([[1000, 2000, 3000], [4000, 5000, 6000]])
a = text()
a.fromMatrixAsHex(x,4)
print(a)
03E8 07D0 0BB8
0FA0 1388 1770

getDir

a = text("D:\\Folder\\SubFolder\\TEXT\\file.dat")
a.getDir()
D:\Folder\SubFolder\TEXT\

getExtension

a = text("D:\\Folder\\SubFolder\\TEXT\\file.dat")
a.getExtension()
'.dat'

getFileName

a = text("D:\\Folder\\SubFolder\\TEXT\\file.dat")
a.getFileName()
'file.dat'

hash

s.hash()
9257130453210036571

indexOf

a = text("The quick brown fox jumps over the lazy dog.")
a.indexOf("brown")
10

indexOfAny

a = text("The quick brown fox jumps over the lazy dog.")
a.indexOfAny(["fox", "dog"])
16

indexOfAny

a = text("The quick brown fox jumps over the lazy dog.")
a.indexOfAny("abc")
7

insert

a = text("abc")
a.insert(1,'d',2)
addbc
a = text("The quick jumps over the lazy dog.")
a.insert(10,"fox ")
The quick fox jumps over the lazy dog.

insertAtBegin
insertAtEnd

a = text("Hello")
a.insertAtBegin("<begin>")
a.insertAtEnd("</begin>")
<begin>Hello</begin>

isAlpha

a = text("Abcd")
a.isAlpha()
True

isBinary

a = text("01111011100001")
a.isBinary()
True

isEmpty

a = text()
a.isEmpty()
True

isHexNumber

a = text("12AB56FE")
a.isHexNumber()
True

isNumber

a = text("123456")
a.isNumber()
True

isLower

a = text("hello world")
a.isLower()
True

isUpper

a = text("HELLO WORLD")
a.isUpper()
True

isPalindrome

a = text("racecar")
a.isPalindrome()
True

keep

s = text("Hello World").keep(3,5)
lo Wo

keepLeft

a = text("The quick jumps over the lazy dog.")
a.keepLeft(10)
The quick

keepRight

a = text("The quick jumps over the lazy dog.")
a.keepRight(10)
 lazy dog.

lastIndexOf

s = text("Hello World")
s.lastIndexOf('l')
9

lines

a = text("L1\nL2\n\nL3\nL4\n  \n\nL5")
a.lines()
['L1', 'L2', 'L3', 'L4', 'L5']

linesCount

a = text("L1\nL2\n\nL3\nL4\n  \n\nL5")
a.linesCount()
7

linesRemoveEmpty

a = text("L1\nL2\n\nL3\nL4\n  \n\nL5")
a.linesRemoveEmpty()
print(a)
L1
L2
L3
L4
L5

Several per-line methods:
linesAppend
linesInsertAtBegin
linesSort
linesPaddRight
linesTrim
Example of opening a text file, sorting all lines, and saving the result under another name:

from ctextlib import CTextA as text
s = text()
s.readFile('Unordered.txt')
s.linesSort()
s.writeFile('Sorted_python.txt')

limit

s = text("Hello World")
s.limit(6)
Hello

lower

s = text("Hello World")
s.lower()
hello world

makeUnique

a = text()
a.appendRange('a','z').appendRange('a','z')
abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
a.makeUnique()
print(a)
abcdefghijklmnopqrstuvwxyz

mid

a = text("Hello World").mid(3)
lo Wo

nextLine

# Example of iterating all lines
from ctextlib import CTextA as text
a = text("Line1\nLine2\nLine3")
line = text()
pos = 0
while(pos >= 0):
    pos = a.nextLine(pos,line)
    print(line)
Line1
Line2
Line3

nextWord

# Example of iterating all words
from ctextlib import CTextA as text
a = text('The quick brown fox jumps over the lazy dog')
word = text()
pos = 0
while(pos >= 0):
    pos = a.nextWord(pos,word)
    print(word)
The
quick
brown
fox
jumps
over
the
lazy
dog

paddLeft

s = text("Abra")
s.paddLeft('.', 16)
............Abra

paddRight

s = text("Abra")
s.paddRight('.', 16)
Abra............

pathCombine

a = text("C:\\Temp")
a.pathCombine("..\\Folder")
C:\Folder

quote

a = text("Hello")
a.quote()
"Hello"

random

a = text()
a.random()
"P1kAlMiG2Kb7FzP5"
a.sort()
"1257AFGKMPPbiklz"
a.shuffle()
"k2lF7KAPG5M1Pzbi"
a.random(32)
P1kAlMiG2Kb7FzP5tM1QBI6DSS92c31A

randomAlpha

s = text()
s.randomAlpha()
IkEffmzNiMKKASVW

randomNumber

s = text()
s.randomNumber()
3892795431
s.randomNumber(32)
33341138742779319865028602486509

readFile

# demonstrates how to read a whole text file
from ctextlib import CTextA as text
a = text()
a.readFile('test.txt')
print(a)
Hello World

regexMatch

s = text("+336587890078")
if(s.regexMatch("(\\+|-)?[[:digit:]]+")):
    print("it is a number")
it is a number

regexLines

animals.txt
------------
Cat
Dog
Giraffe
Lion
Llama
Monkey
Mouse
Parrot
Poodle
Scorpion
Snake
Weasel
# collect all lines starting with given characters
from ctextlib import CTextA as text
a = text()
a.readFile("animals.txt")
a.regexLines("^[A-G][a-z]+")
['Cat', 'Dog', 'Giraffe']

regexReplace

from ctextlib import CTextA as text
a = text("there is sub-sequence in the sub-way string")
a.regexReplace("\\b(sub)([^ ]*)", "sub-$2")
there is sub--sequence in the sub--way string

regexSearch

# collect all words using regex
from ctextlib import CTextA as text
a = text("The quick brown fox jumps over the lazy dog")
a.regexSearch("\\w+")
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

regexWords

# collect all words starting with given characters
from ctextlib import CTextA as text
a = text("The quick brown fox jumps over the lazy dog")
a.regexWords("^[a-n][a-z]+")
['brown', 'fox', 'jumps', 'lazy', 'dog']   

remove

a = text('we few, we happy few, we band of brothers.')
a.remove('we')
a.reduceChain()
a.trim()
few happy few band of brothers

removeAny

from ctextlib import CTextA as text
a = text('The quick brown fox jumps over the lazy dog')
a.removeAny(['brown','quick','lazy'])
a.reduceChain()
The fox jumps over the dog

removeExtension

a = text("D:\\Folder\\SubFolder\\TEXT\\File.dat")
a.removeExtension()
D:\Folder\SubFolder\TEXT\File

removeFileName

a = text("D:\\Folder\\SubFolder\\TEXT\\File.dat")
a.removeFileName()
D:\Folder\SubFolder\TEXT\

removeWhileBegins

a = text("Some text ending with something")
a.removeWhileBegins("Some text ")
print(a)
ending with something

removeWhileEnds

a = text("Some text ending with something")
a.removeWhileEnds(" something")
print(a)
Some text ending with

replace

a = text("The quick brown fox jumps over the lazy dog")
a.replace("fox", "cat")
print(a)
The quick brown cat jumps over the lazy dog
a = text("The quick brown fox jumps over the lazy dog")
a.replace(["fox", "cat","dog","quick"], "-")
The ----- brown --- jumps over the lazy ---

replaceAny

a = text("The quick brown fox jumps over the lazy dog")
a.replaceAny(["fox", "cat","dog"], "***")
print(a)
The quick brown *** jumps over the lazy ***
a = text("The quick brown fox jumps over the lazy dog")
a.replaceAny(["fox", "dog"], ["dog", "fox"])
The quick brown dog jumps over the lazy fox

reverse

a = text("Hello")
a.reverse()
olleH

right

a = text("Hello World")
a.right(5)
World

rotate

a = text("Hello World")
a.rotateLeft(2)
a.rotateRight(4)

Output

llo WorldHe
ldHello Wor

split

# by default split uses the standard separators (" \t\r\n")
a = text("The quick brown fox jumps over the lazy dog")
a.split()
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
# split can be used with any list of separator characters
a = text("The quick, brown....fox,,, ,jumps over,the  lazy.dog")
a.split(",. ")
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

toBinary

bOk = False
a = text("100001")
a.toBinaryNumber(bOk)
33

toHexNumber

bOk = False
a = text("1E1E")
a.toHexNumber(bOk)
7710

trim

a = text(" \t\n   lazy dog  \t\n   ")
a.trim()
lazy dog
a = text("000000000000101")
a.trimLeft("0")
101
a = text("101000000000000")
a.trimRight('0')
101
a = text("0000000101000000000")
a.trim("0")
101

upper

s = text("Hello World")
s.upper()
HELLO WORLD

words

a = text("The quick brown fox jumps over the lazy dog")
a.words()
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
a = text("The|quick|brown|fox|jumps|over|the|lazy|dog")
a.words('|')
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

wordsCapitalize

a = text("The quick brown fox jumps over the lazy dog")
a.wordsCapitalize()
The Quick Brown Fox Jumps Over The Lazy Dog

wordsCount

a = text('The quick brown fox jumps over the lazy dog')
a.wordsCount()
9

wordsEnclose

a = text("The quick brown fox jumps over the lazy dog")
a.wordsEnclose('[',']')
[The] [quick] [brown] [fox] [jumps] [over] [the] [lazy] [dog]

wordsReverse

a = text("The quick brown fox jumps over the lazy dog")
a.wordsReverse()
ehT kciuq nworb xof spmuj revo eht yzal god

wordsSort

a = text('The quick brown fox jumps over the lazy dog')
a.wordsSort()

Output

The brown dog fox jumps lazy over quick the

writeFile

# demonstrates how to write a whole text file
from ctextlib import CTextA as text
a = text("Hello World")
a.writeFile('test.txt')
print(a)

For the full list type help(ctextlib).

Performance Tests

Compared to Python's built-in string handling, CText methods are faster in many cases, sometimes 2-3 times faster. Compared to Python regular expressions, the difference is even bigger.

Below are several Python performance tests and the results obtained.

Test 1 - words enclose

from time import perf_counter
from ctextlib import CTextA as text
import re
import urllib.request

# download words.txt from https://github.com/dwyl/english-words
print("download words.txt.....")
url = 'https://github.com/dwyl/english-words/raw/master/words.txt'
urllib.request.urlretrieve(url, 'words.txt')

a = text()
if(a.readFile("words.txt") == False):
    print("error openning file")
    exit()
start = perf_counter()
a.wordsEnclose('[',']')
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt CText wordsEnclose", duration * 1000))

if(a.readFile("words.txt") == False):
    print("error openning file")
    exit()

s = a.str()

start = perf_counter()
b = ' '.join('[{}]'.format(word) for word in s.split('\n'))
duration = perf_counter() - start

print('{} took {:.3f} ms'.format("words.txt Python split-join", duration * 1000))

s = a.str()

start = perf_counter()
s = re.sub(r'(\w+)',r'[\1]',s)
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt Python re.sub", duration * 1000))

Output

words.txt CText wordsEnclose took 92.083 ms
words.txt Python split-join took 186.377 ms
words.txt Python re.sub took 601.214 ms

Test 2 - words reverse

from time import perf_counter
from ctextlib import CTextA as text
import re
import urllib.request

# download words.txt from https://github.com/dwyl/english-words
print("download words.txt.....")
url = 'https://github.com/dwyl/english-words/raw/master/words.txt'
urllib.request.urlretrieve(url, 'words.txt')

a = text()
if(a.readFile("words.txt") == False):
    print("error openning file")
    exit()
start = perf_counter()
a.wordsReverse()
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt CText wordsReverse", duration * 1000))

if(a.readFile("words.txt") == False):
    print("error openning file")
    exit()

s = a.str()

start = perf_counter()
b = ' '.join( word[::-1] for word in (s.split('\n')))
duration = perf_counter() - start

print('{} took {:.3f} ms'.format("words.txt Python reverse split-join", duration * 1000))

start = perf_counter()
words = ' '.join( word[::-1] for word in ( re.findall('\w+|[:;,.!?]', s)))
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("words.txt Python reverse re.findall", duration * 1000))
words.txt CText wordsReverse took 78.501 ms
words.txt Python reverse split-join took 130.286 ms
words.txt Python reverse re.findall took 609.706 ms

Test 3 - remove empty lines

from time import perf_counter
from ctextlib import CTextA as text
import re
import urllib.request

# download from https://www.gutenberg.org/files/2600/2600-0.txt
print("download 2600-0.txt.....")
url = 'https://www.gutenberg.org/files/2600/2600-0.txt'
urllib.request.urlretrieve(url, '2600-0.txt')

a = text()
if(a.readFile("2600-0.txt") == False):
    print("error opening file")
    exit()

s = a.str()

start = perf_counter()
a.linesRemoveEmpty()
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("linesRemoveEmpty", duration * 1000))
#print(a)

start = perf_counter()    
b = '\n'.join(line for line in s.split('\n') if line.strip() != '')
duration = perf_counter() - start
print('{} took {:.3f} ms'.format("line.strip 18362.txt", duration * 1000))
linesRemoveEmpty took 11.599 ms
line.strip took 31.567 ms

CText Performance

TODO List

  • More methods for words, lines, sentences and complex expressions: There are many more methods that can be added to support different NLP and lexical tasks.
  • Further improve containers abstraction: CText needs more conversion routines to/from STL and other containers and generic data structures.
  • Regular Expressions: - Partial or full support to regular expressions.
  • Other char types: - Character types like char32_t can also be supported.
  • Mini Text Editor: - This is a text editor based on CText that I plan to port to Modern C++.
  • Export to Python: - I want to export CText library to Python-3
  • Performance Test: - Add performance tests comparing with STL string.
  • Use Tries for large keyword lists: - This will greatly optimize search in large lists of words.

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

ctextlib-1.0.5.tar.gz (198.5 kB view hashes)

Uploaded Source

Built Distribution

ctextlib-1.0.5-cp37-cp37m-win_amd64.whl (196.4 kB view hashes)

Uploaded CPython 3.7m Windows x86-64

Supported by

AWS AWS Cloud computing and Security Sponsor Datadog Datadog Monitoring Fastly Fastly CDN Google Google Download Analytics Microsoft Microsoft PSF Sponsor Pingdom Pingdom Monitoring Sentry Sentry Error logging StatusPage StatusPage Status page