
tfrecords: fast and simple reader and writer


tfrecords

A simplified, transplanted implementation of TFRecord reading and writing, plus LevelDB, LMDB, Arrow, and Parquet table support.

Update history

    2023-07-01:  Added Arrow and Parquet support
    2022-10-30:  Added LMDB and LevelDB readers/writers, plus batch record writing
    2022-10-17:  Added a shared-memory read mode for faster record reading
    2022-02-01:  Simplified and transplanted the TFRecord dataset
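
Installation (wheels only; no source distribution is published -- see the built distributions at the end of this page):

pip install tfrecords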

1. TFRecord read and write demo; the with_share_memory flag accelerates reading

# -*- coding: utf-8 -*-
# @Time    : 2022/9/8 15:49

import tfrecords

options = tfrecords.TFRecordOptions(compression_type=tfrecords.TFRecordCompressionType.NONE)


def test_write(filename, N=3, context='aaa'):
    with tfrecords.TFRecordWriter(filename, options=options) as file_writer:
        batch_data = []
        for i in range(N):
            d = context + '____' + str(i)
            batch_data.append(d)
            if (i + 1) % 100 == 0:  # flush a full batch every 100 records
                file_writer.write_batch(batch_data)
                batch_data.clear()
        if len(batch_data):  # flush the remainder
            file_writer.write_batch(batch_data)
            batch_data.clear()


def test_record_iterator(example_paths):
    print('test_record_iterator')
    for example_path in example_paths:
        iterator = tfrecords.tf_record_iterator(example_path, options=options, skip_bytes=0, with_share_memory=True)
        offset_list = iterator.read_offsets(0)  # (offset, length) pairs for every record
        count = iterator.read_count(0)          # total number of records
        print(count)
        num = 0
        for record in iterator:
            num += 1
            print(record)


def test_random_reader(example_paths):
    print('test_random_reader')
    for example_path in example_paths:
        file_reader = tfrecords.tf_record_random_reader(example_path, options=options, with_share_memory=True)
        last_pos = 0
        while True:
            try:
                x, pos = file_reader.read(last_pos)  # read the record at byte offset last_pos
                print(x, pos)
                last_pos = pos
            except Exception:
                break  # past the last record


def test_random_reader2(example_paths):
    print('test_random_reader2')
    for example_path in example_paths:
        file_reader = tfrecords.tf_record_random_reader(example_path, options=options, with_share_memory=True)
        skip_bytes = 0
        offset_list = file_reader.read_offsets(skip_bytes)  # (offset, length) for each record
        for offset, length in offset_list:
            x, _ = file_reader.read(offset)
            print(x)


test_write('d:/example.tfrecords0', 3, 'file0')

example_paths = tfrecords.glob('d:/example.tfrecords*')
print(example_paths)
test_record_iterator(example_paths)
print()
test_random_reader(example_paths)
print()
test_random_reader2(example_paths)
print()
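
The demo above disables compression. Assuming the transplanted options mirror TensorFlow's record API, TFRecordCompressionType should also define ZLIB and GZIP; a minimal sketch under that assumption, continuing the demo above:

# Sketch only: assumes TFRecordCompressionType mirrors TensorFlow's
# NONE / ZLIB / GZIP values. Verify against your installed version.
gzip_options = tfrecords.TFRecordOptions(compression_type=tfrecords.TFRecordCompressionType.GZIP)

with tfrecords.TFRecordWriter('d:/example_gzip.tfrecords', options=gzip_options) as f:
    f.write_batch(['record_0', 'record_1'])

for record in tfrecords.tf_record_iterator('d:/example_gzip.tfrecords', options=gzip_options,
                                           skip_bytes=0, with_share_memory=True):
    print(record)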

2. LevelDB read and write demo

# -*- coding: utf-8 -*-
# @Time    : 2022/9/8 15:49

from tfrecords import LEVELDB

db_path = 'd:/example_leveldb'


def test_write(db_path):
    options = LEVELDB.LeveldbOptions(create_if_missing=True, error_if_exists=False)
    file_writer = LEVELDB.Leveldb(db_path, options)

    keys, values = [], []
    for i in range(30):
        keys.append(b"input_" + str(i).encode())
        keys.append(b"label_" + str(i).encode())
        values.append(b"xiaoming" + str(i).encode())
        values.append(b"zzs" + str(i).encode())
        if (i + 1) % 1000 == 0:
            file_writer.put_batch(keys, values)
            keys.clear()
            values.clear()
    if len(keys):
        file_writer.put_batch(keys, values)
        keys.clear()
        values.clear()

    file_writer.close()


def test_read(db_path):
    options = LEVELDB.LeveldbOptions(create_if_missing=False, error_if_exists=False)
    reader = LEVELDB.Leveldb(db_path, options)

    def show():
        it = reader.get_iterater(reverse=False)
        i = 0
        for item in it:
            print(i, item)
            i += 1

    def test_find(key):
        value = reader.get(key)
        print('find', type(value), value)

    show()

    test_find(b'input_0')
    test_find(b'input_5')
    test_find(b'input_10')

    reader.close()


test_write(db_path)
test_read(db_path)
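
The iterator also accepts reverse=True (it is called with reverse=False above); a short sketch, reusing the same API, that walks the database from the last key back to the first:

options = LEVELDB.LeveldbOptions(create_if_missing=False, error_if_exists=False)
reader = LEVELDB.Leveldb(db_path, options)
for item in reader.get_iterater(reverse=True):  # iterate keys in descending order
    print(item)
reader.close()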

3. LMDB read and write demo

# -*- coding: utf-8 -*-
# @Time    : 2022/9/8 15:49

from tfrecords import LMDB

db_path = 'd:/example_lmdb'


def test_write(db_path):
    options = LMDB.LmdbOptions(env_open_flag=0,
                               env_open_mode=0o664,  # octal file mode
                               txn_flag=0,
                               dbi_flag=0,
                               put_flag=0)
    file_writer = LMDB.Lmdb(db_path, options, map_size=1024 * 1024 * 10)
    keys, values = [], []
    for i in range(30):
        keys.append(b"input_" + str(i).encode())
        keys.append(b"label_" + str(i).encode())
        values.append(b"xiaoming_" + str(i).encode())
        values.append(b"zzs_" + str(i).encode())
        if (i + 1) % 1000 == 0:
            file_writer.put_batch(keys, values)
            keys.clear()
            values.clear()
    if len(keys):
        file_writer.put_batch(keys, values)
    file_writer.close()


def test_read(db_path):
    options = LMDB.LmdbOptions(env_open_flag=LMDB.LmdbFlag.MDB_RDONLY,
                               env_open_mode=0o664,  # octal file mode
                               txn_flag=0,  # or LMDB.LmdbFlag.MDB_RDONLY
                               dbi_flag=0,
                               put_flag=0)
    reader = LMDB.Lmdb(db_path, options, map_size=0)

    def show():
        it = reader.get_iterater(reverse=False)
        i = 0
        for item in it:
            print(i, item)
            i += 1

    def test_find(key):
        value = reader.get(key)
        print('find', type(value), value)

    show()
    test_find(b'input_0')
    test_find(b'input_5')
    test_find(b'input_10')
    reader.close()


test_write(db_path)
test_read(db_path)
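
A compact round-trip helper, built only from the calls shown above. map_size caps the database size when writing, while map_size=0 on read reuses the existing map (standard LMDB behaviour, assumed to carry over here):

def lmdb_roundtrip(path, pairs):
    write_opts = LMDB.LmdbOptions(env_open_flag=0, env_open_mode=0o664,
                                  txn_flag=0, dbi_flag=0, put_flag=0)
    writer = LMDB.Lmdb(path, write_opts, map_size=1024 * 1024 * 10)
    writer.put_batch([k for k, _ in pairs], [v for _, v in pairs])
    writer.close()

    read_opts = LMDB.LmdbOptions(env_open_flag=LMDB.LmdbFlag.MDB_RDONLY,
                                 env_open_mode=0o664, txn_flag=0,
                                 dbi_flag=0, put_flag=0)
    reader = LMDB.Lmdb(path, read_opts, map_size=0)
    for k, _ in pairs:
        print(k, reader.get(k))
    reader.close()

lmdb_roundtrip('d:/example_lmdb2', [(b'k1', b'v1'), (b'k2', b'v2')])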

4. Arrow demo

Stream

from tfrecords.python.io.arrow import IPC_Writer, IPC_StreamReader, arrow

path_file = "d:/tmp/data.arrow"

def test_write():
    schema = arrow.schema([
        arrow.field('id', arrow.int32()),
        arrow.field('text', arrow.utf8())
    ])

    # Build each column with an Arrow builder; Finish().Value() yields the array.
    a = arrow.Int32Builder()
    a.AppendValues([0, 1, 4])
    a = a.Finish().Value()

    b = arrow.StringBuilder()
    b.AppendValues(["aaaa", "你是谁", "张三"])
    b = b.Finish().Value()

    table = arrow.Table.Make(schema=schema, arrays=[a, b])
    fs = IPC_Writer(path_file, schema, with_stream=True)  # with_stream=True -> IPC stream format
    fs.write_table(table)
    fs.close()

def test_read():
    fs = IPC_StreamReader(path_file)
    table = fs.read_all()
    fs.close()
    print(table)

    col = table.GetColumnByName('text')
    text_list = col.chunk(0)  # first (and here only) chunk of the column
    for i in range(text_list.length()):
        x = text_list.Value(i)
        print(type(x), x)


test_write()
test_read()
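
The 'id' column can be read back the same way; a sketch assuming the chunk/Value accessors behave for Int32 exactly as they do for strings above:

fs = IPC_StreamReader(path_file)
table = fs.read_all()
fs.close()

ids = table.GetColumnByName('id').chunk(0)
for i in range(ids.length()):
    print(ids.Value(i))  # 0, 1, 4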

File

from tfrecords.python.io.arrow import IPC_Writer, IPC_StreamReader, IPC_MemoryMappedFileReader, arrow

path_file = "d:/tmp/data.arrow"

def test_write():
    schema = arrow.schema([
        arrow.field('id', arrow.int32()),
        arrow.field('text', arrow.utf8())
    ])

    a = arrow.Int32Builder()
    a.AppendValues([0, 1, 4])
    a = a.Finish().Value()

    b = arrow.StringBuilder()
    b.AppendValues(["aaaa", "你是谁", "张三"])
    b = b.Finish().Value()

    table = arrow.Table.Make(schema=schema, arrays=[a, b])
    fs = IPC_Writer(path_file, schema, with_stream=False)  # with_stream=False -> random-access file format
    fs.write_table(table)
    fs.close()


def test_read():
    fs = IPC_MemoryMappedFileReader(path_file)
    for i in range(fs.num_record_batches()):
        batch = fs.read_batch(i)
        print(batch)
    fs.close()


test_write()
test_read()
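
Unlike the stream reader, the memory-mapped file reader exposes batch indexing, so a single batch can be fetched directly; a sketch built from the calls shown above:

fs = IPC_MemoryMappedFileReader(path_file)
last = fs.num_record_batches() - 1
print(fs.read_batch(last))  # fetch only the final batch
fs.close()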

5. Parquet demo

from tfrecords.python.io.arrow import ParquetWriter, ParquetReader, arrow

path_file = "d:/tmp/data.parquet"

def test_write():
    schema = arrow.schema([
        arrow.field('id', arrow.int32()),
        arrow.field('text', arrow.utf8())
    ])

    a = arrow.Int32Builder()
    a.AppendValues([0, 1, 4, 5])
    a = a.Finish().Value()

    b = arrow.StringBuilder()
    b.AppendValues(["aaaa", "你是谁", "张三", "李赛"])
    b = b.Finish().Value()

    table = arrow.Table.Make(schema=schema, arrays=[a, b])

    fs = ParquetWriter(path_file, schema)
    fs.write_table(table)
    fs.close()

def test_read():
    fs = ParquetReader(path_file, options=dict(buffer_size=2))
    table = fs.read_table()
    fs.close()
    table = table.Flatten().Value()  # flatten any struct columns, per the Arrow Table API
    print(table)

    col = table.GetColumnByName('text')
    text_list = col.chunk(0)
    for i in range(text_list.length()):
        x = text_list.Value(i)
        print(type(x), x)


test_write()
test_read()
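
Reading a single column back mirrors the Arrow demo; a short sketch under the same assumptions:

fs = ParquetReader(path_file, options=dict(buffer_size=2))
table = fs.read_table().Flatten().Value()
fs.close()

ids = table.GetColumnByName('id').chunk(0)
print([ids.Value(i) for i in range(ids.length())])  # [0, 1, 4, 5]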


Built Distributions

tfrecords-0.3.0 wheels are published for CPython 3.10-3.13: win_amd64 (8.6 MB) and manylinux2014_x86_64 (18.2 MB) for every version, plus manylinux2014_aarch64 (14.3-14.4 MB) for CPython 3.10-3.12. No source distribution is available.
