Dataset SDK for consistent read/write [batch, online, streaming] data.
Project description
Welcome to @datasets
TODO
import pandas as pd
from metaflow import FlowSpec, Parameter, current, step
from datasets import DatasetType, Mode
# Can also invoke from CLI:
# > python datasets/tutorials/0_hello_dataset_flow.py run \
# --hello_dataset '{"name": "foo", "partition_by": "region", "mode": "Write"}'
class HelloDatasetFlow(FlowSpec):
    """Tutorial flow that writes a small DataFrame to a dataset and reads it back.

    ``start`` builds the sample frame and writes it through ``hello_dataset``;
    ``end`` loads the data again with ``to_pandas`` and stores the dataset as
    the ``output_dataset`` artifact.
    """

    # Dataset handle exposed as a flow parameter. The default describes a
    # dataset named "HelloDataset", partitioned by "region", in write mode.
    hello_dataset = Parameter(
        "hello_dataset",
        type=DatasetType,
        default={
            "name": "HelloDataset",
            "partition_by": "region",
            "mode": Mode.Write,
        },
    )

    @step
    def start(self):
        """Create a six-row sample frame and write it to ``hello_dataset``."""
        sample = pd.DataFrame(
            {
                "region": ["A"] * 3 + ["B"] * 3,
                "zpid": list(range(1, 7)),
            }
        )
        print("saving df: \n", sample.to_string(index=False))

        # Example of writing to a dataset
        print(f"{self.hello_dataset.program_name=}")
        self.hello_dataset.write(sample)

        self.next(self.end)

    @step
    def end(self):
        """Load the dataset as a DataFrame, print it, and keep it as an artifact."""
        print(f"I have dataset \n{self.hello_dataset=}")

        # Read the dataset back, passing the id of the current run.
        loaded: pd.DataFrame = self.hello_dataset.to_pandas(run_id=current.run_id)
        print("self.hello_dataset.to_pandas():\n", loaded.to_string(index=False))

        # Save this as an output dataset.
        self.output_dataset = self.hello_dataset
if __name__ == "__main__":
    # Instantiating the flow class is the script entry point when this file
    # is executed directly (e.g. `python <file> run`).
    HelloDatasetFlow()
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
zdatasets-0.0.11.tar.gz
(47.7 kB
view hashes)
Built Distribution
zdatasets-0.0.11-py3-none-any.whl
(75.1 kB
view hashes)
Close
Hashes for zdatasets-0.0.11-py3-none-any.whl
Algorithm | Hash digest |
---|---|
SHA256 | e1331a7a668d35ab0bc1f24a803db8cc2f73d12b533182b9b34a3df89e007337 |
MD5 | d66dfbb4431d7d30836b6294ddd4c417 |
BLAKE2b-256 | f22ad86364490d0f3dcfbfb1e7def05530f3a70534a3dd00405eaaa2f0d0819d |