learningOrchestra python client
Project description
pythonClient
Python client for learningOrchestra.
Installation
Requires Python 3.x
pip install learning-orchestra-client
Usage
Each functionality in learningOrchestra is contained in its own class. Check the python client docs for all the available functionality.
Example
Shown below is an example usage of learning-orchestra-client using the Titanic Dataset:
# Example: end-to-end Titanic pipeline against a learningOrchestra cluster.
#
# NOTE(review): the original example imported the *submodules*
# (dataset, builder, transform) but then referenced the classes
# Dataset, Projection, DataType and Builder directly, which would raise
# NameError. Import the classes from their submodules instead —
# TODO confirm the exact module paths against the client docs.
from learning_orchestra_client.dataset import Dataset
from learning_orchestra_client.transform import Projection, DataType
from learning_orchestra_client.builder import Builder

# Address of the learningOrchestra cluster that executes every request.
cluster_ip = "34.95.187.26"

# Insert the raw Titanic CSVs into the cluster. The *_sync variants block
# until the dataset is fully stored before returning.
dataset = Dataset(cluster_ip)
print(dataset.insert_dataset_sync(
    "titanic_training",
    "https://filebin.net/rpfdy8clm5984a4c/titanic_training.csv?t=gcnjz1yo"))
print(dataset.insert_dataset_sync(
    "titanic_testing",
    "https://filebin.net/mguee52ke97k0x9h/titanic_testing.csv?t=ub4nc1rc"))
print(dataset.search_all_datasets())

# Project only the columns the model needs. The testing set has no
# "Survived" column, so it is removed before the second projection.
projection = Projection(cluster_ip)
required_columns = [
    "PassengerId",
    "Pclass",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Name",
    "Sex",
    "Embarked",
    "Survived"
]
print(projection.insert_dataset_attributes_sync(
    "titanic_training",
    "titanic_training_projection",
    required_columns))
required_columns.remove("Survived")
print(projection.insert_dataset_attributes_sync(
    "titanic_testing",
    "titanic_testing_projection",
    required_columns))

# Cast the numeric columns, which are stored as text after CSV insertion.
# "Survived" only exists in the training projection, so it is added to the
# mapping after the testing projection has been converted.
data_type_handler = DataType(cluster_ip)
type_fields = {
    "Age": "number",
    "Fare": "number",
    "Parch": "number",
    "PassengerId": "number",
    "Pclass": "number",
    "SibSp": "number"
}
print(data_type_handler.update_dataset_types(
    "titanic_testing_projection",
    type_fields))
type_fields["Survived"] = "number"
print(data_type_handler.update_dataset_types(
    "titanic_training_projection",
    type_fields))

# PySpark preprocessing code shipped to the cluster as-is. It cleans both
# dataframes (title extraction, age imputation, family features, string
# indexing) and produces the features_* dataframes the builder expects.
modeling_code = '''
from pyspark.ml import Pipeline
from pyspark.sql.functions import (
    mean, col, split,
    regexp_extract, when, lit)
from pyspark.ml.feature import (
    VectorAssembler,
    StringIndexer
)
TRAINING_DF_INDEX = 0
TESTING_DF_INDEX = 1
training_df = training_df.withColumnRenamed('Survived', 'label')
testing_df = testing_df.withColumn('label', lit(0))
datasets_list = [training_df, testing_df]
for index, dataset in enumerate(datasets_list):
    dataset = dataset.withColumn(
        "Initial",
        regexp_extract(col("Name"), "([A-Za-z]+)\.", 1))
    datasets_list[index] = dataset
misspelled_initials = [
    'Mlle', 'Mme', 'Ms', 'Dr',
    'Major', 'Lady', 'Countess',
    'Jonkheer', 'Col', 'Rev',
    'Capt', 'Sir', 'Don'
]
correct_initials = [
    'Miss', 'Miss', 'Miss', 'Mr',
    'Mr', 'Mrs', 'Mrs',
    'Other', 'Other', 'Other',
    'Mr', 'Mr', 'Mr'
]
for index, dataset in enumerate(datasets_list):
    dataset = dataset.replace(misspelled_initials, correct_initials)
    datasets_list[index] = dataset
initials_age = {"Miss": 22,
                "Other": 46,
                "Master": 5,
                "Mr": 33,
                "Mrs": 36}
for index, dataset in enumerate(datasets_list):
    for initial, initial_age in initials_age.items():
        dataset = dataset.withColumn(
            "Age",
            when((dataset["Initial"] == initial) &
                 (dataset["Age"].isNull()), initial_age).otherwise(
                    dataset["Age"]))
        datasets_list[index] = dataset
for index, dataset in enumerate(datasets_list):
    dataset = dataset.na.fill({"Embarked": 'S'})
    datasets_list[index] = dataset
for index, dataset in enumerate(datasets_list):
    dataset = dataset.withColumn("Family_Size", col('SibSp')+col('Parch'))
    dataset = dataset.withColumn('Alone', lit(0))
    dataset = dataset.withColumn(
        "Alone",
        when(dataset["Family_Size"] == 0, 1).otherwise(dataset["Alone"]))
    datasets_list[index] = dataset
text_fields = ["Sex", "Embarked", "Initial"]
for column in text_fields:
    for index, dataset in enumerate(datasets_list):
        dataset = StringIndexer(
            inputCol=column, outputCol=column+"_index").\
                fit(dataset).\
                transform(dataset)
        datasets_list[index] = dataset
non_required_columns = ["Name", "Embarked", "Sex", "Initial"]
for index, dataset in enumerate(datasets_list):
    dataset = dataset.drop(*non_required_columns)
    datasets_list[index] = dataset
training_df = datasets_list[TRAINING_DF_INDEX]
testing_df = datasets_list[TESTING_DF_INDEX]
assembler = VectorAssembler(
    inputCols=training_df.columns[:],
    outputCol="features")
assembler.setHandleInvalid('skip')
features_training = assembler.transform(training_df)
(features_training, features_evaluation) =\
    features_training.randomSplit([0.8, 0.2], seed=33)
features_testing = assembler.transform(testing_df)
'''

# Run the preprocessing code and train the requested classifiers
# (logistic regression, decision tree, gradient boosting, random forest,
# naive bayes) on the projected datasets.
builder = Builder(cluster_ip)
print(builder.run_builder_sync(
    "titanic_training_projection",
    "titanic_testing_projection",
    modeling_code,
    ["lr", "dt", "gb", "rf", "nb"]))
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Close
Hashes for learning_orchestra_client-2.0.0.tar.gz
Algorithm | Hash digest
---|---
SHA256 | 8ae7a13735a1e77c0dc5b15823f03135e74ae3b3f9732903f37778f55fd7c79b
MD5 | 0e84b6793b701359ef9f2e37523cd4a0
BLAKE2b-256 | 37dfa097711fb2f99ad6ca0d3b06de66472956684a618beea03bc0fab3f3818b
Close
Hashes for learning_orchestra_client-2.0.0-py3-none-any.whl
Algorithm | Hash digest
---|---
SHA256 | 0c29e2b3c0f5cfe8d79aff26baa0454fb4422ff66e37abb1d1411a9e26fda228
MD5 | f0c79cf5112045d3c3878934c99ff1e8
BLAKE2b-256 | 7840fb4822deeb7b4a5563550d25630455827792192123ebe266c2b9ad51f298