HyperFrame
The aim of this project is to provide a high-dimensional analogue of the two-dimensional pandas DataFrame.
It lets users organise data in which the interaction of several factors is of interest.
A HyperFrame makes it easy to set and save data for storage, and to quickly and interactively create two-dimensional pandas DataFrames from any combination of two factors for data exploration.
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from hyperframe import HyperFrame
from sklearn.model_selection import train_test_split
from demo.helpers import metrics, X, y
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.33, random_state=42)
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=5, p=2,
weights='uniform')
Initialisation
dimension_labels = ["train_test", "species", "metric"]
index_labels = {"train_test": ["train", "test"],
"species": ["setosa", "versicolor", "virginica"],
"metric": ["precision", "recall", "f1"]}
scores = HyperFrame(dimension_labels, index_labels)
Setting data
yhat = clf.predict(X_train)
#iset alternative 1
scores.iset(metrics(y_train, yhat), "train", "", "")
<hyperframe.HyperFrame at 0x7ff4d4241320>
yhat = clf.predict(X_test)
#iset alternative 2
scores.iset(metrics(y_test, yhat), train_test="test")
<hyperframe.HyperFrame at 0x7ff4d4241320>
Getting data
#iget alternative 1
scores.iget("train", "", "", return_type="pandas").round(2)
|            | precision | recall | f1   |
|------------|-----------|--------|------|
| setosa     | 0.89      | 1.00   | 0.94 |
| versicolor | 0.71      | 0.71   | 0.71 |
| virginica  | 0.80      | 0.71   | 0.75 |
#iget alternative 2
scores.iget(species="versicolor", return_type="pandas").round(2)
|       | precision | recall | f1   |
|-------|-----------|--------|------|
| train | 0.71      | 0.71   | 0.71 |
| test  | 0.70      | 0.47   | 0.56 |
#iget alternative 3
scores.iget0("species", "train_test", return_type="pandas").round(2)
{'metric': 'precision'}
|       | setosa | versicolor | virginica |
|-------|--------|------------|-----------|
| train | 0.89   | 0.71       | 0.80      |
| test  | 0.95   | 0.70       | 0.65      |
Initialising a second HyperFrame
scores_lr = HyperFrame(dimension_labels, index_labels)
clf = LogisticRegression(penalty="none", max_iter=1000)
clf.fit(X_train, y_train)
yhat = clf.predict(X_train)
scores_lr.iset(metrics(y_train, yhat), "train", "", "")
yhat = clf.predict(X_test)
scores_lr.iset(metrics(y_test, yhat), "test", "", "")
<hyperframe.HyperFrame at 0x7ff4d4231588>
Merging
print("scores shape: {}".format(scores.shape))
print("scores_lr shape: {}".format(scores_lr.shape))
scores shape: (2, 3, 3)
scores_lr shape: (2, 3, 3)
scores_models = scores.merge(scores_lr, "model", ["knn", "logistic regression"])
scores_models.iget("test", "", "f1", "", return_type="pandas").round(2)
|            | knn  | logistic regression |
|------------|------|---------------------|
| setosa     | 0.97 | 0.95                |
| versicolor | 0.56 | 0.58                |
| virginica  | 0.72 | 0.72                |
scores_models.iget("", "", "f1", "logistic regression", return_type="pandas").round(2)
|       | setosa | versicolor | virginica |
|-------|--------|------------|-----------|
| train | 0.92   | 0.74       | 0.78      |
| test  | 0.95   | 0.58       | 0.72      |
Initialising a third HyperFrame
scores_rf = HyperFrame(dimension_labels, index_labels)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
yhat = clf.predict(X_train)
scores_rf.iset(metrics(y_train, yhat), "train", "", "")
yhat = clf.predict(X_test)
scores_rf.iset(metrics(y_test, yhat), "test", "", "")
<hyperframe.HyperFrame at 0x7ff4d41cd978>
scores_rf.iget("test", "", "", return_type="pandas").round(2)
|            | precision | recall | f1   |
|------------|-----------|--------|------|
| setosa     | 0.95      | 0.95   | 0.95 |
| versicolor | 0.75      | 0.40   | 0.52 |
| virginica  | 0.61      | 0.88   | 0.72 |
Expanding a HyperFrame
print("scores_models shape: {}".format(scores_models.shape))
print("scores_rf shape: {}".format(scores_rf.shape))
scores_models shape: (2, 3, 3, 2)
scores_rf shape: (2, 3, 3)
scores_models = scores_models.expand(scores_rf, "model", "random forest")
scores_models.iget("test", "", "f1", "", return_type="pandas").round(2)
|            | knn  | logistic regression | random forest |
|------------|------|---------------------|---------------|
| setosa     | 0.97 | 0.95                | 0.95          |
| versicolor | 0.56 | 0.58                | 0.52          |
| virginica  | 0.72 | 0.72                | 0.72          |
Simple Mathematical Operations
scores.max("train_test").iget("", "", return_type="pandas")
|            | precision | recall   | f1       |
|------------|-----------|----------|----------|
| setosa     | 0.950000  | 1.000000 | 0.974359 |
| versicolor | 0.714286  | 0.714286 | 0.714286 |
| virginica  | 0.800000  | 0.812500 | 0.750000 |
scores.min("train_test", "metric").iget("", return_type="pandas")
setosa 0.885714
versicolor 0.466667
virginica 0.650000
dtype: float64
scores.mean("train_test", "species", "metric")
0.7810886435641339
scores.sum()
14.059595584154408
Writing to file
scores_models.write_file("./demo/scores_models")
Reading from file
scores_models = scores_models.read_file("./demo/scores_models")