Skip to main content

Labeled high-dimensional matrix access

Project description

HyperFrame

The aim of this project is to provide a high-dimensional analogue to the two-dimensional pandas DataFrame.

This lets users organise data in which the interaction of several factors is of interest.

The HyperFrame makes it easy to set and save data for storage, and to quickly and interactively create two-dimensional pandas DataFrames from any combination of two factors for data exploration.

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from hyperframe import HyperFrame
from sklearn.model_selection import train_test_split
from demo.helpers import metrics, X, y
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.33, random_state=42)
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

Initialisation

dimension_labels = ["train_test", "species", "metric"]

index_labels = {"train_test": ["train", "test"],
                "species": ["setosa", "versicolor", "virginica"],
                "metric": ["precision", "recall", "f1"]}

scores = HyperFrame(dimension_labels, index_labels)

Setting data

yhat = clf.predict(X_train)
#iset alternative 1
scores.iset(metrics(y_train, yhat), "train", "", "")
<hyperframe.HyperFrame at 0x7ff4d4241320>
yhat = clf.predict(X_test)
#iset alternative 2
scores.iset(metrics(y_test, yhat), train_test="test")
<hyperframe.HyperFrame at 0x7ff4d4241320>

Getting data

#iget alternative 1
scores.iget("train", "", "", return_type="pandas").round(2)
precision recall f1
setosa 0.89 1.00 0.94
versicolor 0.71 0.71 0.71
virginica 0.80 0.71 0.75
#iget alternative 2
scores.iget(species="versicolor", return_type="pandas").round(2)
precision recall f1
train 0.71 0.71 0.71
test 0.70 0.47 0.56
#iget alternative 3
scores.iget0("species", "train_test", return_type="pandas").round(2)
{'metric': 'precision'}
setosa versicolor virginica
train 0.89 0.71 0.80
test 0.95 0.70 0.65

Initialising a second HyperFrame

scores_lr = HyperFrame(dimension_labels, index_labels)
clf = LogisticRegression(penalty="none", max_iter=1000)
clf.fit(X_train, y_train)

yhat = clf.predict(X_train)
scores_lr.iset(metrics(y_train, yhat), "train", "", "")

yhat = clf.predict(X_test)
scores_lr.iset(metrics(y_test, yhat), "test", "", "")
<hyperframe.HyperFrame at 0x7ff4d4231588>

Merging

print("scores shape: {}".format(scores.shape))
print("scores_lr shape: {}".format(scores_lr.shape))
scores shape: (2, 3, 3)
scores_lr shape: (2, 3, 3)
scores_models = scores.merge(scores_lr, "model", ["knn", "logistic regression"])
scores_models.iget("test", "", "f1", "", return_type="pandas").round(2)
knn logistic regression
setosa 0.97 0.95
versicolor 0.56 0.58
virginica 0.72 0.72
scores_models.iget("", "", "f1", "logistic regression", return_type="pandas").round(2)
setosa versicolor virginica
train 0.92 0.74 0.78
test 0.95 0.58 0.72

Initialising a third HyperFrame

scores_rf = HyperFrame(dimension_labels, index_labels)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

yhat = clf.predict(X_train)
scores_rf.iset(metrics(y_train, yhat), "train", "", "")

yhat = clf.predict(X_test)
scores_rf.iset(metrics(y_test, yhat), "test", "", "")
<hyperframe.HyperFrame at 0x7ff4d41cd978>
scores_rf.iget("test", "", "", return_type="pandas").round(2)
precision recall f1
setosa 0.95 0.95 0.95
versicolor 0.75 0.40 0.52
virginica 0.61 0.88 0.72

Expanding a HyperFrame

print("scores_models shape: {}".format(scores_models.shape))
print("scores_rf shape: {}".format(scores_rf.shape))
scores_models shape: (2, 3, 3, 2)
scores_rf shape: (2, 3, 3)
scores_models = scores_models.expand(scores_rf, "model", "random forest")
scores_models.iget("test", "", "f1", "", return_type="pandas").round(2)
knn logistic regression random forest
setosa 0.97 0.95 0.95
versicolor 0.56 0.58 0.52
virginica 0.72 0.72 0.72

Simple Mathematical Operations

scores.max("train_test").iget("", "", return_type="pandas")
precision recall f1
setosa 0.950000 1.000000 0.974359
versicolor 0.714286 0.714286 0.714286
virginica 0.800000 0.812500 0.750000
scores.min("train_test", "metric").iget("", return_type="pandas")
setosa        0.885714
versicolor    0.466667
virginica     0.650000
dtype: float64
scores.mean("train_test", "species", "metric")
0.7810886435641339
scores.sum()
14.059595584154408

Writing to file

scores_models.write_file("./demo/scores_models")

Reading from file

scores_models = scores_models.read_file("./demo/scores_models")

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

hyperframes-0.0.1.tar.gz (10.4 kB view hashes)

Uploaded Source

Built Distribution

hyperframes-0.0.1-py3-none-any.whl (8.6 kB view hashes)

Uploaded Python 3

Supported by

AWS (Cloud computing and Security Sponsor) · Datadog (Monitoring) · Fastly (CDN) · Google (Download Analytics) · Microsoft (PSF Sponsor) · Pingdom (Monitoring) · Sentry (Error logging) · StatusPage (Status page)