extracts structured data from text using user-defined delimiters (strings or regex)
Project description
extracts structured data from text using user-defined delimiters (strings or regex)
Tested against Windows / Python 3.11 / Anaconda
pip install parifinder
parifinder extracts structured data from text using user-defined delimiters (strings or regex), making it versatile for data processing.
Advantages
Flexibility:
The `parse_pairs` function covers a wide range of parsing scenarios: it accepts a single pair or multiple pairs of delimiters, and each delimiter can be either a simple string or a complex regular expression. This flexibility makes it suitable for many different use cases.
Scalability:
It can parse multiple pairs of delimiters within a given text, which is especially useful when dealing with documents or data containing nested elements.
Pure Python:
It uses only Python's standard library.
from parifinder import parse_pairs
from pprint import pprint
# --- Example 0: one pair of plain-string delimiters on nested brackets ------
# Per the sample output recorded below, the result is a dict keyed by tuples
# of indices (apparently positions in the input string); each value is a dict
# with 'start', 'end', 'size', 'text', 'parents' and 'children' entries, where
# 'parents'/'children' hold the index tuples of enclosing/enclosed matches.
text_0 = """[[1, 2, 2], [5], [2, 3]], 12: [[4, 4, 4], [12, 0], [6, 6]], 3: [[1, 2]][[1, 2, 2], [5], [2, 3]], 12: [[4, 4, 4], [12, 0], [6, 6]], 3: [[1, 2]]"""
s1_0 = "["  # opening delimiter
s2_0 = "]"  # closing delimiter
# str_regex=False: the delimiters are given as literal strings (contrast with
# Example 2, which passes regex patterns and sets str_regex=True).
r0 = parse_pairs(string=text_0, s1=s1_0, s2=s2_0, str_regex=False)
print("r0-----------------------------------------------------------------")
# width=1 makes pprint break nested containers onto as many lines as it can,
# which is why the recorded output below is so tall.
pprint(r0, indent=1, width=1)
# --- Example 1: multi-character string delimiters (HTML paragraph tags) -----
# The sample output below shows one entry per <p>...</p> span, each with empty
# 'parents' and 'children' lists because the paragraphs are not nested.
text_1 = "<body><p>a</p><p>a</p><p>The HTML <code>button</code> tag defines a clickable button.</p><p>x</p><p>The CSS <code>background-color</code> property defines the background color of an element.</p></body></html>"
s1_1 = "<p>"
s2_1 = "</p>"
r1 = parse_pairs(string=text_1, s1=s1_1, s2=s2_1, str_regex=False)
print("r1-----------------------------------------------------------------")
pprint(r1, indent=1, width=1)
# --- Example 2: regular-expression delimiters --------------------------------
# The patterns match "[<digit>" and "/<digit>]". Per the sample output below,
# the result is grouped by (opener, closer) text pairs and includes every
# combination of matched opener and closer, e.g. ('[1', '/2]') as well as
# ('[1', '/1]').
text_2 = "[1bla[2bla/2]/1]"
s1_2 = r"\[\d"
s2_2 = r"/\d]"
r2 = parse_pairs(string=text_2, s1=s1_2, s2=s2_2, str_regex=True)
print("r2-----------------------------------------------------------------")
pprint(r2, indent=1, width=1)
# --- Example 3: several explicit pairs as a list of (open, close) tuples -----
# All pairs are passed through s1 and s2 is None. Per the sample output below,
# only the listed pairs are reported (no cross combinations as in Example 2).
text_3 = "[1bla[2bla/2]/1]"
s1_3 = [("[1", "/1]"), ("[2", "/2]")]
s2_3 = None
r3 = parse_pairs(string=text_3, s1=s1_3, s2=s2_3, str_regex=False)
print("r3-----------------------------------------------------------------")
pprint(r3, indent=1, width=1)
# --- Example 4: several pairs as two parallel lists ---------------------------
# Openers go in s1 and closers in s2, matched up by position. The sample output
# below shows the same result as Example 3.
text_4 = "[1bla[2bla/2]/1]"
s1_4 = ["[1", "[2"]
s2_4 = ["/1]", "/2]"]
r4 = parse_pairs(string=text_4, s1=s1_4, s2=s2_4, str_regex=False)
print("r4-----------------------------------------------------------------")
pprint(r4, indent=1, width=1)
# r0-----------------------------------------------------------------
# {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23): {'children': [(1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9),
# (17,
# 18,
# 19,
# 20,
# 21,
# 22),
# (12,
# 13,
# 14)],
# 'end': 23,
# 'parents': [],
# 'size': 23,
# 'start': 0,
# 'text': '[[1, '
# '2, '
# '2], '
# '[5], '
# '[2, '
# '3]]'},
# (1, 2, 3, 4, 5, 6, 7, 8, 9): {'children': [],
# 'end': 9,
# 'parents': [(0,
# 1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9,
# 10,
# 11,
# 12,
# 13,
# 14,
# 15,
# 16,
# 17,
# 18,
# 19,
# 20,
# 21,
# 22,
# 23)],
# 'size': 8,
# 'start': 1,
# 'text': '[1, '
# '2, '
# '2]'},
# (12, 13, 14): {'children': [],
# 'end': 14,
# 'parents': [(0,
# 1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9,
# 10,
# 11,
# 12,
# 13,
# 14,
# 15,
# 16,
# 17,
# 18,
# 19,
# 20,
# 21,
# 22,
# 23)],
# 'size': 2,
# 'start': 12,
# 'text': '[5]'},
# (17, 18, 19, 20, 21, 22): {'children': [],
# 'end': 22,
# 'parents': [(0,
# 1,
# 2,
# 3,
# 4,
# 5,
# 6,
# 7,
# 8,
# 9,
# 10,
# 11,
# 12,
# 13,
# 14,
# 15,
# 16,
# 17,
# 18,
# 19,
# 20,
# 21,
# 22,
# 23)],
# 'size': 5,
# 'start': 17,
# 'text': '[2, '
# '3]'},
# (30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57): {'children': [(31,
# 32,
# 33,
# 34,
# 35,
# 36,
# 37,
# 38,
# 39),
# (42,
# 43,
# 44,
# 45,
# 46,
# 47,
# 48),
# (51,
# 52,
# 53,
# 54,
# 55,
# 56)],
# 'end': 57,
# 'parents': [],
# 'size': 27,
# 'start': 30,
# 'text': '[[4, '
# '4, '
# '4], '
# '[12, '
# '0], '
# '[6, '
# '6]]'},
# (31, 32, 33, 34, 35, 36, 37, 38, 39): {'children': [],
# 'end': 39,
# 'parents': [(30,
# 31,
# 32,
# 33,
# 34,
# 35,
# 36,
# 37,
# 38,
# 39,
# 40,
# 41,
# 42,
# 43,
# 44,
# 45,
# 46,
# 47,
# 48,
# 49,
# 50,
# 51,
# 52,
# 53,
# 54,
# 55,
# 56,
# 57)],
# 'size': 8,
# 'start': 31,
# 'text': '[4, '
# '4, '
# '4]'},
# (42, 43, 44, 45, 46, 47, 48): {'children': [],
# 'end': 48,
# 'parents': [(30,
# 31,
# 32,
# 33,
# 34,
# 35,
# 36,
# 37,
# 38,
# 39,
# 40,
# 41,
# 42,
# 43,
# 44,
# 45,
# 46,
# 47,
# 48,
# 49,
# 50,
# 51,
# 52,
# 53,
# 54,
# 55,
# 56,
# 57)],
# 'size': 6,
# 'start': 42,
# 'text': '[12, '
# '0]'},
# (51, 52, 53, 54, 55, 56): {'children': [],
# 'end': 56,
# 'parents': [(30,
# 31,
# 32,
# 33,
# 34,
# 35,
# 36,
# 37,
# 38,
# 39,
# 40,
# 41,
# 42,
# 43,
# 44,
# 45,
# 46,
# 47,
# 48,
# 49,
# 50,
# 51,
# 52,
# 53,
# 54,
# 55,
# 56,
# 57)],
# 'size': 5,
# 'start': 51,
# 'text': '[6, '
# '6]'},
# (63, 64, 65, 66, 67, 68, 69, 70): {'children': [(64,
# 65,
# 66,
# 67,
# 68,
# 69)],
# 'end': 70,
# 'parents': [],
# 'size': 7,
# 'start': 63,
# 'text': '[[1, '
# '2]]'},
# (64, 65, 66, 67, 68, 69): {'children': [],
# 'end': 69,
# 'parents': [(63,
# 64,
# 65,
# 66,
# 67,
# 68,
# 69,
# 70)],
# 'size': 5,
# 'start': 64,
# 'text': '[1, '
# '2]'},
# (71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94): {'children': [(72,
# 73,
# 74,
# 75,
# 76,
# 77,
# 78,
# 79,
# 80),
# (88,
# 89,
# 90,
# 91,
# 92,
# 93),
# (83,
# 84,
# 85)],
# 'end': 94,
# 'parents': [],
# 'size': 23,
# 'start': 71,
# 'text': '[[1, '
# '2, '
# '2], '
# '[5], '
# '[2, '
# '3]]'},
# (72, 73, 74, 75, 76, 77, 78, 79, 80): {'children': [],
# 'end': 80,
# 'parents': [(71,
# 72,
# 73,
# 74,
# 75,
# 76,
# 77,
# 78,
# 79,
# 80,
# 81,
# 82,
# 83,
# 84,
# 85,
# 86,
# 87,
# 88,
# 89,
# 90,
# 91,
# 92,
# 93,
# 94)],
# 'size': 8,
# 'start': 72,
# 'text': '[1, '
# '2, '
# '2]'},
# (83, 84, 85): {'children': [],
# 'end': 85,
# 'parents': [(71,
# 72,
# 73,
# 74,
# 75,
# 76,
# 77,
# 78,
# 79,
# 80,
# 81,
# 82,
# 83,
# 84,
# 85,
# 86,
# 87,
# 88,
# 89,
# 90,
# 91,
# 92,
# 93,
# 94)],
# 'size': 2,
# 'start': 83,
# 'text': '[5]'},
# (88, 89, 90, 91, 92, 93): {'children': [],
# 'end': 93,
# 'parents': [(71,
# 72,
# 73,
# 74,
# 75,
# 76,
# 77,
# 78,
# 79,
# 80,
# 81,
# 82,
# 83,
# 84,
# 85,
# 86,
# 87,
# 88,
# 89,
# 90,
# 91,
# 92,
# 93,
# 94)],
# 'size': 5,
# 'start': 88,
# 'text': '[2, '
# '3]'},
# (101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128): {'children': [(102,
# 103,
# 104,
# 105,
# 106,
# 107,
# 108,
# 109,
# 110),
# (113,
# 114,
# 115,
# 116,
# 117,
# 118,
# 119),
# (122,
# 123,
# 124,
# 125,
# 126,
# 127)],
# 'end': 128,
# 'parents': [],
# 'size': 27,
# 'start': 101,
# 'text': '[[4, '
# '4, '
# '4], '
# '[12, '
# '0], '
# '[6, '
# '6]]'},
# (102, 103, 104, 105, 106, 107, 108, 109, 110): {'children': [],
# 'end': 110,
# 'parents': [(101,
# 102,
# 103,
# 104,
# 105,
# 106,
# 107,
# 108,
# 109,
# 110,
# 111,
# 112,
# 113,
# 114,
# 115,
# 116,
# 117,
# 118,
# 119,
# 120,
# 121,
# 122,
# 123,
# 124,
# 125,
# 126,
# 127,
# 128)],
# 'size': 8,
# 'start': 102,
# 'text': '[4, '
# '4, '
# '4]'},
# (113, 114, 115, 116, 117, 118, 119): {'children': [],
# 'end': 119,
# 'parents': [(101,
# 102,
# 103,
# 104,
# 105,
# 106,
# 107,
# 108,
# 109,
# 110,
# 111,
# 112,
# 113,
# 114,
# 115,
# 116,
# 117,
# 118,
# 119,
# 120,
# 121,
# 122,
# 123,
# 124,
# 125,
# 126,
# 127,
# 128)],
# 'size': 6,
# 'start': 113,
# 'text': '[12, '
# '0]'},
# (122, 123, 124, 125, 126, 127): {'children': [],
# 'end': 127,
# 'parents': [(101,
# 102,
# 103,
# 104,
# 105,
# 106,
# 107,
# 108,
# 109,
# 110,
# 111,
# 112,
# 113,
# 114,
# 115,
# 116,
# 117,
# 118,
# 119,
# 120,
# 121,
# 122,
# 123,
# 124,
# 125,
# 126,
# 127,
# 128)],
# 'size': 5,
# 'start': 122,
# 'text': '[6, '
# '6]'},
# (134, 135, 136, 137, 138, 139, 140, 141): {'children': [(135,
# 136,
# 137,
# 138,
# 139,
# 140)],
# 'end': 141,
# 'parents': [],
# 'size': 7,
# 'start': 134,
# 'text': '[[1, '
# '2]]'},
# (135, 136, 137, 138, 139, 140): {'children': [],
# 'end': 140,
# 'parents': [(134,
# 135,
# 136,
# 137,
# 138,
# 139,
# 140,
# 141)],
# 'size': 5,
# 'start': 135,
# 'text': '[1, '
# '2]'}}
# r1-----------------------------------------------------------------
# {(6, 7, 8, 9, 10, 11, 12, 13, 14): {'children': [],
# 'end': 14,
# 'parents': [],
# 'size': 9,
# 'start': 6,
# 'text': '<p>a</p>'},
# (14, 15, 16, 17, 18, 19, 20, 21, 22): {'children': [],
# 'end': 22,
# 'parents': [],
# 'size': 9,
# 'start': 14,
# 'text': '<p>a</p>'},
# (22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89): {'children': [],
# 'end': 89,
# 'parents': [],
# 'size': 68,
# 'start': 22,
# 'text': '<p>The '
# 'HTML '
# '<code>button</code> '
# 'tag '
# 'defines '
# 'a '
# 'clickable '
# 'button.</p>'},
# (89, 90, 91, 92, 93, 94, 95, 96, 97): {'children': [],
# 'end': 97,
# 'parents': [],
# 'size': 9,
# 'start': 89,
# 'text': '<p>x</p>'},
# (97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194): {'children': [],
# 'end': 194,
# 'parents': [],
# 'size': 98,
# 'start': 97,
# 'text': '<p>The '
# 'CSS '
# '<code>background-color</code> '
# 'property '
# 'defines '
# 'the '
# 'background '
# 'color '
# 'of '
# 'an '
# 'element.</p>'}}
# r2-----------------------------------------------------------------
# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
# 'end': 16,
# 'parents': [],
# 'size': 17,
# 'start': 0,
# 'text': '[1bla[2bla/2]/1]'}},
# ('[1', '/2]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
# 'end': 13,
# 'parents': [],
# 'size': 14,
# 'start': 0,
# 'text': '[1bla[2bla/2]'}},
# ('[2', '/1]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
# 'end': 16,
# 'parents': [],
# 'size': 12,
# 'start': 5,
# 'text': '[2bla/2]/1]'}},
# ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
# 'end': 13,
# 'parents': [],
# 'size': 9,
# 'start': 5,
# 'text': '[2bla/2]'}}}
# r3-----------------------------------------------------------------
# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
# 'end': 16,
# 'parents': [],
# 'size': 17,
# 'start': 0,
# 'text': '[1bla[2bla/2]/1]'}},
# ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
# 'end': 13,
# 'parents': [],
# 'size': 9,
# 'start': 5,
# 'text': '[2bla/2]'}}}
# r4-----------------------------------------------------------------
# {('[1', '/1]'): {(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16): {'children': [],
# 'end': 16,
# 'parents': [],
# 'size': 17,
# 'start': 0,
# 'text': '[1bla[2bla/2]/1]'}},
# ('[2', '/2]'): {(5, 6, 7, 8, 9, 10, 11, 12, 13): {'children': [],
# 'end': 13,
# 'parents': [],
# 'size': 9,
# 'start': 5,
# 'text': '[2bla/2]'}}}
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
File details
Details for the file parifinder-0.10.tar.gz.
File metadata
- Download URL: parifinder-0.10.tar.gz
- Upload date:
- Size: 29.9 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/4.0.2 CPython/3.11.5
File hashes
Algorithm | Hash digest
---|---
SHA256 | 1443ce07cef731f57e74c55744614fa1a52b6e1f8873f24be2ee1c82543f3906
MD5 | 1e67f1daa3046ced3830d334fa44d3f0
BLAKE2b-256 | 51b129731142c02192070af86a7c792d3eb7d2f5b90f6188a28df91d84b75a98
File details
Details for the file parifinder-0.10-py3-none-any.whl.
File metadata
- Download URL: parifinder-0.10-py3-none-any.whl
- Upload date:
- Size: 22.3 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/4.0.2 CPython/3.11.5
File hashes
Algorithm | Hash digest
---|---
SHA256 | 8303552a9b79fa03f37b765dd3ba1f75e94faf5c18d1509e48b25a53904bfc9f
MD5 | a70d380d498974b173caf0ff6506db38
BLAKE2b-256 | 9137989178c4f74ba41d23a6d99af5eabbaf1ddab92b6e8f9c29ad2485f950a4