Python interface to Doris
Project description
Apache Doris Python Client
An Apache Doris client for the Python programming language.
Apache Doris is a high-performance, real-time analytical database based on MPP architecture, known for its extreme speed and ease of use. It returns query results over massive datasets with sub-second response times, and supports both high-concurrency point queries and high-throughput complex analysis. All this makes Apache Doris an ideal tool for scenarios including report analysis, ad-hoc queries, unified data warehousing, and data lake query acceleration. On Apache Doris, users can build various applications, such as user behavior analysis, A/B testing platforms, log retrieval analysis, user profile analysis, and order analysis.
Installation
pip install pydoris-client
DorisClient Usage
from pydoris.doris_client import *
from pydoris.util.generate_test_data import *
# Connection settings passed to the shared DorisClient below.
fe_host = "127.0.0.1"
fe_http_port = "8040"
fe_query_port = "9030"
username = 'root'
passwd = ""
db = "test"

# Single client instance used by every example function in this script.
doris_client = DorisClient(
    fe_host=fe_host,
    fe_query_port=fe_query_port,
    fe_http_port=fe_http_port,
    username=username,
    password=passwd,
    db=db,
)
def test_create_database():
    """Create the ``pydoris_client_test`` database and return the client's result."""
    database_name = 'pydoris_client_test'
    outcome = doris_client.create_database(database_name)
    return outcome
def test_create_table():
    """Create ``pydoris_client_test.write_test`` if it does not already exist."""
    # The statement text stays flush-left so the SQL sent to Doris is unchanged.
    create_stmt = """create table if not exists pydoris_client_test.write_test(
f_id int,
f_decimal decimal(18,6),
f_timestamp bigint,
f_datetime datetime(6),
f_str string,
f_float float,
f_boolean boolean
)duplicate key(`f_id`)
distributed by hash(`f_id`) buckets 1
properties("replication_allocation" = "tag.location.default: 1");"""
    doris_client.execute(create_stmt)
def test_get_table_columns():
    """Print what ``get_table_columns`` returns for ``pydoris_client_test.write_test``."""
    columns = doris_client.get_table_columns('pydoris_client_test', 'write_test')
    print(columns)
def gen_test_data(num):
    """Build ``num`` rows of generated test data.

    Each row is a tuple in the column order used by the examples in this
    script: ``(f_id, f_decimal, f_timestamp, f_datetime, f_str, f_float,
    f_boolean)``. ``f_id`` is the row index; the remaining values come from
    the ``generate_*`` helpers.

    Args:
        num: Number of rows to generate. Zero or a negative value yields an
            empty list.

    Returns:
        A list of ``num`` tuples.
    """
    start_date = datetime(2023, 1, 1)
    end_date = datetime(2023, 12, 31)
    # A comprehension replaces the append loop and avoids a local named
    # ``list``, which shadowed the built-in type.
    return [
        (
            i,
            generate_decimal(),
            generate_timestamp(),
            generate_random_datetime(start_date, end_date),
            generate_random_string(20),
            generate_float(),
            generate_boolean(),
        )
        for i in range(num)
    ]
# If your data needs a custom line delimiter, set it with options.set_line_delimiter(delimiter)
def test_write_csv():
    """Write 100000 generated rows to ``write_test`` as header-less CSV text."""
    column_names = ['f_id', 'f_decimal', 'f_timestamp', 'f_datetime', 'f_str', 'f_float', 'f_boolean']
    frame = pd.DataFrame(gen_test_data(100000), columns=column_names)
    # Optional tuning, left disabled here:
    # doris_client.options.set_csv_format(",").set_auto_uuid_label().set_line_delimiter("\\n")
    payload = frame.to_csv(header=False, index=False)
    doris_client.write("pydoris_client_test.write_test", payload)
# 1. To insert data into Doris in JSON format, you must set the JSON format explicitly,
#    because the default format is CSV.
# 2. When your JSON data is an array of objects ([{},{}]), set strip_outer_array=true.
# 3. You can customize the import label with options.set_label(your_label).
def test_write_json():
    """Write 100000 generated rows to ``write_test`` using the JSON format."""
    column_names = ['f_id', 'f_decimal', 'f_timestamp', 'f_datetime', 'f_str', 'f_float', 'f_boolean']
    frame = pd.DataFrame(gen_test_data(100000), columns=column_names)
    # orient='records' serializes the frame as a JSON array of objects,
    # which is why strip_outer_array is enabled below.
    records_json = frame.to_json(orient='records')

    write_options = WriteOptions()
    write_options.set_json_format()
    write_options.set_option("strip_outer_array", "true")
    doris_client.write("pydoris_client_test.write_test", records_json, options=write_options)
# data_df (pd.DataFrame), table_name (str) and table_model (str) are required.
# When repeat_replacement=True, an existing table with the same name will be deleted, so be careful.
def test_write_from_df():
    """Write a generated DataFrame to ``df_write_test`` through ``write_from_df``."""
    column_names = ['f_id', 'f_decimal', 'f_timestamp', 'f_datetime', 'f_str', 'f_float', 'f_boolean']
    frame = pd.DataFrame(gen_test_data(100000), columns=column_names)
    doris_client.write_from_df(
        frame,
        "pydoris_client_test.df_write_test",
        "UNIQUE",
        ['f_id'],
        distributed_hash_key=["f_id"],
        buckets=1,
        field_mapping=[("f_decimal", "Decimal(18,6)")],
        table_properties={"replication_allocation": "tag.location.default: 1"},
        repeat_replacement=False,
    )
def test_read_to_df():
    """Query up to 1000 rows of ``write_test`` into a DataFrame and print it in full."""
    query = "select * from pydoris_client_test.write_test limit 1000"
    column_names = ['f_id', 'f_decimal', 'f_timestamp', 'f_datetime',
                    'f_str', 'f_float', 'f_boolean']
    dataframe = doris_client.query_to_dataframe(query, column_names)
    # Temporarily lift pandas' display limits so no rows are elided and wide
    # frames are not wrapped.
    with pd.option_context('expand_frame_repr', False, 'display.max_rows', None):
        print(dataframe)
def test_query():
    """Run a full-table query on ``write_test`` and print elapsed seconds and result length.

    The elapsed time is printed as a float number of seconds, followed by
    ``len(result)`` of whatever ``doris_client.query`` returns.
    """
    # Local import: only this function needs the timing helper.
    import time

    # perf_counter is monotonic, so the measurement is unaffected by
    # wall-clock adjustments.
    started = time.perf_counter()
    result = doris_client.query("select * from pydoris_client_test.write_test")
    elapsed = time.perf_counter() - started
    # Report the full duration; a timedelta's ``.seconds`` attribute would
    # truncate sub-second runs to 0 and ignore whole days.
    print(elapsed)
    print(len(result))
def test_list_tables():
    """Print the tables the client lists for the ``pydoris_client_test`` database."""
    print(doris_client.list_tables("pydoris_client_test"))
def test_drop_table():
    """Drop both example tables, printing the table list before and after."""
    database = 'pydoris_client_test'
    print(doris_client.list_tables(database))
    # Same order as the tables were created by the write examples.
    for table_name in ('write_test', 'df_write_test'):
        doris_client.drop_table(database, table_name)
    print(doris_client.list_tables(database))
if __name__ == '__main__':
    # Run every example in order: create the database and table, write data,
    # read it back, then drop the tables.
    for example in (
        test_create_database,
        test_create_table,
        test_get_table_columns,
        test_write_csv,
        test_write_json,
        test_write_from_df,
        test_read_to_df,
        test_query,
        test_list_tables,
        test_drop_table,
    ):
        example()
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
File details
Details for the file pydoris-client-1.0.4.tar.gz.
File metadata
- Download URL: pydoris-client-1.0.4.tar.gz
- Upload date:
- Size: 19.2 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/4.0.2 CPython/3.11.5
File hashes
Algorithm | Hash digest
---|---
SHA256 | 11933db2f4d15219d9e94ed85e78edc5f30996356723da758225bedcb88a2d72
MD5 | a012758abd880782c78680f8aeb26cc0
BLAKE2b-256 | 56d277dca9c6dc6b2062377367d7956e705c89aba29005248ef07008db8bdbbd