
A simple way to work with datasets on DSM.


DSM Library

DataNode

  1. init DataNode
from dsmlibrary.datanode import DataNode 

data = DataNode(
  token="<token>",
  apikey="<apikey>",
  dataplatform_api_uri="<dataplatform_api_uri>", 
  object_storage_uri="<object_storage_uri>",
  use_env=<True/False (default: True)>
)
  2. upload file
data.upload_file(directory_id=<directory_id>, file_path='<file_path>', description="<description(optional)>")
  3. download file
data.download_file(file_id=<file_id>, download_path="<path to save the downloaded file> (default: ./dsm.tmp)")
  4. get file
meta, file = data.get_file(file_id="<file_id>")
# meta -> dict of file metadata
# file -> file-like bytes object

# example: read a CSV file with pandas
import pandas as pd

meta, file = data.get_file(file_id="<file_id>")
df = pd.read_csv(file)
...
  5. read df
df = data.read_df(file_id="<file_id>")
# df is returned as a pandas DataFrame
  6. read ddf
  • .parquet files must be read with this function
ddf = data.read_ddf(file_id="<file_id>")
# ddf is returned as a Dask DataFrame
  7. write parquet file
df = ... # pandas dataframe or dask dataframe

data.write(df=df, directory=<directory_id>, name="<save_file_name>", description="<description>", replace=<replace if file exists. default False>, datadict=<True or False default False>, profiling=<True or False default False>, lineage=<list of file id. eg [1,2,3]>)
  8. writeListDataNode
df = ... # pandas dataframe or dask dataframe
data.writeListDataNode(df=df, directory_id=<directory_id>, name="<save_file_name>", description="<description>", replace=<replace if file exists. default False>, datadict=<True or False default False>, profiling=<True or False default False>, lineage=<list of file id. eg [1,2,3]>)
  9. get file id
file_id = data.get_file_id(name=<file name>, directory_id=<directory id>)
# file_id is returned as an int file ID
  10. get directory id
directory_id = data.get_directory_id(parent_dir_id=<directory id>, name=<directory name>)
# directory_id is returned as an int directory ID
  11. get file version (used for listDataNode)
fileVersion = data.get_file_version(file_id=<file id>)
# returns a dict with `file_id` and `timestamp`
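
Putting the DataNode steps above together, here is a minimal end-to-end sketch. It is illustrative only: the token, directory id, file name, and transformation are assumed placeholder values, not part of the library.

from dsmlibrary.datanode import DataNode

data = DataNode(token="<token>")  # or apikey="<apikey>"

# assumed: a CSV named "sales.csv" already exists in directory 10
file_id = data.get_file_id(name="sales.csv", directory_id=10)
df = data.read_df(file_id=file_id)

# simple example transformation
df = df.dropna()

# write the result back as a parquet file, recording lineage to the source file
data.write(df=df, directory=10, name="sales_clean", description="cleaned sales data", replace=True, lineage=[file_id])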

ClickHouse

  1. import data to ClickHouse
from dsmlibrary.clickhouse import ClickHouse

ddf = ... # pandas dataframe or dask dataframe

## to warehouse
table_name = <your_table_name>
partition_by = <your_partition_by>

connection = { 
  'host': '', 
  'port': <port>, 
  'database': '', 
  'user': '', 
  'password': '', 
  'settings':{ 
     'use_numpy': True 
  }, 
  'secure': False 
}

warehouse = ClickHouse(connection=connection)

tableName = warehouse.get_or_createTable(ddf=ddf, tableName=table_name, partition_by=partition_by)
warehouse.write(ddf=ddf, tableName=tableName)
  2. query data from ClickHouse
query = f""" 
    SELECT * FROM {tableName} LIMIT 10 
""" 
warehouse.read(sqlQuery=query)
  3. drop table
warehouse.dropTable(tableName=table_name)
  • optional: custom config for inserting data into ClickHouse
config = {
  'n_partition_per_block': 10,
  'n_row_per_loop': 1000
}
warehouse = ClickHouse(connection=connection, config=config)
  4. truncate table
warehouse.truncateTable(tableName=table_name)
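
The calls above combine as in the following sketch; the connection values, table name, and sample data are assumptions for illustration only.

import pandas as pd
import dask.dataframe as dd
from dsmlibrary.clickhouse import ClickHouse

# assumed connection values for a local, unsecured ClickHouse server
connection = {
  'host': 'localhost',
  'port': 9000,
  'database': 'default',
  'user': 'default',
  'password': '',
  'settings': {'use_numpy': True},
  'secure': False
}
warehouse = ClickHouse(connection=connection)

# small sample dataset, converted to a Dask DataFrame
df = pd.DataFrame({'id': [1, 2, 3], 'year': [2021, 2021, 2022]})
ddf = dd.from_pandas(df, npartitions=1)

# create (or reuse) the table, insert the data, then read it back
tableName = warehouse.get_or_createTable(ddf=ddf, tableName='sales', partition_by='year')
warehouse.write(ddf=ddf, tableName=tableName)
print(warehouse.read(sqlQuery=f"SELECT * FROM {tableName} LIMIT 10"))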

API

dsmlibrary

dsmlibrary.datanode.DataNode

  • upload_file
  • download_file
  • get_file
  • read_df
  • read_ddf
  • write
  • writeListDataNode
  • get_file_id
  • get_directory_id
  • get_file_version

dsmlibrary.clickhouse.ClickHouse

  • get_or_createTable
  • write
  • read
  • dropTable
  • truncateTable

Use for pipeline

data = DataNode(apikey="<APIKEY>")

Use an API key to authenticate.
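
For example, a scheduled pipeline step might keep the key in an environment variable (DSM_APIKEY below is an assumed name) and use only the DataNode calls documented above:

import os
from dsmlibrary.datanode import DataNode

# assumed: DSM_APIKEY is provided by the pipeline's environment
data = DataNode(apikey=os.environ["DSM_APIKEY"])

input_file_id = 123        # assumed id of the pipeline's input file
output_directory_id = 45   # assumed id of the output directory

# read an input file, transform it, and write the result with lineage back to the input
df = data.read_df(file_id=input_file_id)
df = df.drop_duplicates()
data.write(df=df, directory=output_directory_id, name="pipeline_output", description="pipeline output", replace=True, lineage=[input_file_id])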

MDM

  1. semantic similarity
pip install "dsmlibrary[mdm]"

see example here

Gendatadict PDF

from dsmlibrary.datadict import GenerateDatadict
gd = GenerateDatadict(
  token="<token>",
  apikey="<apikey>",
  dataplatform_api_uri="<dataplatform_api_uri>", 
  object_storage_uri="<object_storage_uri>"
)
gd.generate_datadict(name="<NAME>", directory_id=<DIR_ID for datadict file>, file_ids=[<FILE_ID>, <FILE_ID>, ...])
  • use either token or apikey
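
A hedged example: generating one data dictionary PDF covering two existing files (the name and ids below are assumed placeholders).

from dsmlibrary.datadict import GenerateDatadict

# authenticate with an apikey; a token works the same way, per the note above
gd = GenerateDatadict(apikey="<apikey>")

# assumed: directory 42 will hold the generated datadict; files 101 and 102 are described in it
gd.generate_datadict(name="sales_datadict", directory_id=42, file_ids=[101, 102])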

