This package enables users to store, query, and delete
a large number of key-value pairs on disk.
This is especially useful when the data cannot fit into RAM.
If you have hundreds of GBs or many TBs of key-value data to store
and query from, this is the package for you.
Installation
This package is built for macOS (x86/arm), Windows 64/32, and Linux x86.
It can be installed from PyPI with `pip install rocksdict`.
Plans
set, get, del
multi get
support string, float, int, bytes
support other python objects through pickle
support BigInt
compare BigInt by value size
keys, values, items iterator
options, read options, write options, all options
SstFileWriter and bulk ingest
column families
write batch
delete range
open as secondary, with-ttl, read-only
support merge
Introduction
Below is a code example that shows how to do the following:
Create Rdict
Store something on disk
Close Rdict
Open Rdict again
Check Rdict elements
Iterate from Rdict
Batch get
Delete storage
import numpy as np
import pandas as pd
from rocksdict import Rdict

path = "./test_dict"

# Entries that can be verified with a plain `==` after reloading.
# They are kept as a list of pairs rather than a dict literal because
# the keys 1.0 and 1 would collapse into one entry in a Python dict.
plain_entries = [
    (1.0, 1),
    (1, 1.0),
    ("huge integer", 2343546543243564534233536434567543),
    ("good", True),
    ("bad", False),
    ("bytes", b"bytes"),
    ("this is a list", [1, 2, 3]),
    ("store a dict", {0: 1}),
]

# Entries whose `==` is element-wise and has to be reduced with np.all().
array_entries = [
    (b"numpy", np.array([1, 2, 3])),
    ("a table", pd.DataFrame({"a": [1, 2], "b": [2, 1]})),
]

# Create an Rdict with default options at `path` and store everything.
db = Rdict(path)
for key, value in plain_entries + array_entries:
    db[key] = value

# Close the Rdict.
db.close()

# Open the Rdict again from disk and check every element.
db = Rdict(path)
for key, expected in plain_entries:
    assert db[key] == expected
for key, expected in array_entries:
    assert np.all(db[key] == expected)

# Iterate through all elements.
for k, v in db.items():
    print(f"{k} -> {v}")

# Batch get: indexing with a list of keys returns a list of values.
print(db[["good", "bad", 1.0]])  # [True, False, 1]

# Close the Rdict and delete its storage from disk.
db.close()
Rdict.destroy(path)
Supported types:
key: int, float, bool, str, bytes
value: int, float, bool, str, bytes and anything that
supports pickle.
Rocksdb Options
Since the backend is implemented using RocksDB,
most RocksDB options are supported:
Example of tuning
import os

from rocksdict import Options, PlainTableFactoryOptions, Rdict, SliceTransform


def db_options() -> Options:
    """Build the tuned ``Options`` used to open the example database.

    Returns:
        An ``Options`` instance configured with the settings commented
        step by step below.
    """
    opt = Options()
    # create the database if it is missing
    opt.create_if_missing(True)
    # allow more background jobs: one per CPU. os.cpu_count() returns
    # None when the count cannot be determined, so fall back to 1
    # instead of passing None to the setter.
    opt.set_max_background_jobs(os.cpu_count() or 1)
    # configure mem-table to a large value (256 MB)
    opt.set_write_buffer_size(0x10000000)
    opt.set_level_zero_file_num_compaction_trigger(4)
    # configure l0 and l1 size, let them have the same size (1 GB)
    opt.set_max_bytes_for_level_base(0x40000000)
    # 256 MB file size
    opt.set_target_file_size_base(0x10000000)
    # use a smaller compaction multiplier
    opt.set_max_bytes_for_level_multiplier(4.0)
    # use 8-byte prefix (2 ^ 64 is far enough for transaction counts)
    opt.set_prefix_extractor(SliceTransform.create_max_len_prefix(8))
    # set to plain-table for better performance
    opt.set_plain_table_factory(PlainTableFactoryOptions())
    return opt


db = Rdict("./some_path", db_options())
Example of Column Families
import random

from rocksdict import Options, PlainTableFactoryOptions, Rdict, SliceTransform

path = "tmp"
cf1_name = "cf1"
cf2_name = "cf2"
num_ints = 10000  # number of integer keys written to cf1

# set cf2 as a plain table
cf2_opt = Options()
cf2_opt.set_prefix_extractor(SliceTransform.create_max_len_prefix(8))
p_opt = PlainTableFactoryOptions()
p_opt.user_key_length = 200  # same length as the random keys generated below
cf2_opt.set_plain_table_factory(p_opt)

# create column families if missing
opt = Options()  # create_if_missing=True by default
opt.create_missing_column_families(True)
db = Rdict(
    path,
    options=opt,
    column_families={cf1_name: Options(), cf2_name: cf2_opt},
)

# add column families
db_cf1 = db.get_column_family(cf1_name)
db_cf2 = db.get_column_family(cf2_name)
db_cf3 = db.create_column_family("cf3")  # with default Options
db_cf4 = db.create_column_family("cf4", cf2_opt)  # custom options

# remove column families
db.drop_column_family("cf3")
db.drop_column_family("cf4")
del db_cf3, db_cf4

# insert into column families
for i in range(num_ints):
    db_cf1[i] = i**2

rand_bytes = [random.randbytes(200) for _ in range(100000)]
for b in rand_bytes:
    db_cf2[b] = b

# close database
db_cf1.close()
db_cf2.close()
db.close()

# reopen db
db = Rdict(path, column_families={cf1_name: Options(), cf2_name: cf2_opt})
db_cf1 = db.get_column_family(cf1_name)
db_cf2 = db.get_column_family(cf2_name)

# check keys: cf1 must yield 0, 1, 2, ... in order with squared values
count = 0
for k, v in db_cf1.items():
    assert k == count
    assert v == count**2
    count += 1
# without this check an empty or truncated cf1 would pass the loop above
assert count == num_ints

rand_bytes.sort()
assert list(db_cf2.keys()) == rand_bytes

# delete db
db.close()
db_cf1.close()
db_cf2.close()
Rdict.destroy(path)
Example of Bulk Ingestion By SstFileWriter
import random

from rocksdict import Options, Rdict, SstFileWriter

# Generate two batches of random 200-byte strings; each batch is sorted
# before it is written out (presumably SST files need ascending keys).
first_batch = sorted(random.randbytes(200) for _ in range(100000))
second_batch = sorted(random.randbytes(200) for _ in range(100000))

# Write the first batch to file1.sst, using each blob as key and value.
sst_writer = SstFileWriter()
sst_writer.open("file1.sst")
for blob in first_batch:
    sst_writer[blob] = blob
sst_writer.finish()

# Write the second batch to file2.sst, passing explicit Options this time.
sst_writer = SstFileWriter(Options())
sst_writer.open("file2.sst")
for blob in second_batch:
    sst_writer[blob] = blob
sst_writer.finish()

# Create a new Rdict with default options and ingest both files.
ingested = Rdict("tmp")
ingested.ingest_external_file(["file1.sst", "file2.sst"])
ingested.close()

# Reopen and check that every key-value pair is there.
ingested = Rdict("tmp")
for blob in second_batch + first_batch:
    assert ingested[blob] == blob
ingested.close()

# Delete the database stored at "tmp".
Rdict.destroy("tmp")
Example of BatchWrite
from rocksdict import Options, Rdict, WriteBatch

db_path = "tmp"
first_cf = "batch_test_1"
second_cf = "batch_test_2"

# Create the db together with two new column families.
create_opts = Options()
create_opts.create_missing_column_families(True)
cf_options = {first_cf: Options(), second_cf: Options()}
db = Rdict(db_path, column_families=cf_options, options=create_opts)

# Method 1: name the target column family on every put().
batch = WriteBatch()
for n in range(100):
    batch.put(n, n**2, db.get_column_family_handle(first_cf))
db.write(batch)

# Method 2: change the batch's default column family, then use item
# assignment.
batch = WriteBatch()
batch.set_default_column_family(db.get_column_family_handle(second_cf))
for n in range(100, 200):
    batch[n] = n**2
db.write(batch)

# Reopen the DB.
db.close()
db = Rdict(db_path, column_families=cf_options)

# Read back both column families: the first must hold exactly the keys
# 0..99 and the second exactly 100..199, in order, each mapped to its square.
first_items = [(k, v) for k, v in db.get_column_family(first_cf).items()]
assert first_items == [(n, n**2) for n in range(100)]

second_items = [(k, v) for k, v in db.get_column_family(second_cf).items()]
assert second_items == [(n, n**2) for n in range(100, 200)]

db.close()
Rdict.destroy(db_path, create_opts)
Example of delete range
from rocksdict import Options, Rdict

db_dir = "tmp"

# Open the database and add one extra column family named "c1".
default_cf = Rdict(db_dir)
extra_cf = default_cf.create_column_family("c1", Options())

# Write the keys 0..99 to both column families.
for n in range(100):
    default_cf[n] = n
    extra_cf[n] = n

# Delete the lower half from the default column family and the upper
# half from "c1".
default_cf.delete_range(0, 50)
extra_cf.delete_range(50, 100)

# The end of each deleted range is exclusive: key 50 survives in the
# default column family, and "c1" keeps exactly 0..49.
assert [*default_cf.keys()] == [*range(50, 100)]
assert [*extra_cf.keys()] == [*range(50)]

extra_cf.close()
default_cf.close()
Rdict.destroy(db_dir)
Contribution
This project is still in an early stage of development. People are welcome
to add tests, benchmarks and new features.