A fresh nbdev project for kmodel.
Project description
kmodel
kmodel provides reusable machine learning and deep learning helpers for multi-output modeling workflows. It covers tabular model training, scoring, post-processing, prediction, and fastai-based deep learning utilities through runnable examples derived from the project notebooks.
Installation
pip install kmodel
Quick start
The examples below follow the notebooks under nbs/ in order. Each
function example lives in its own cell and starts with a short comment
derived from the function docstring.
01 ML
from kmodel.ml import get_splits, split_data, train_ml, train_ml_cv, post_process, post_process_oof, predict_ml
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
df = sns.load_dataset("penguins").dropna(
subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
df.shape
(342, 10)
# Split samples in a dataframe with stratified, grouped, or stratified-grouped K-fold logic.
splits = get_splits(df, stratified="species", nfold=3)
split0 = splits[0]
len(split0[0]), len(split0[1])
StratifiedKFold(n_splits=3, random_state=123, shuffle=True)
# species in train set: 3
# species in test set: 3
(228, 114)
# Given a split tuple, return X_train, y_train, X_test, and y_test.
X_train, y_train, X_test, y_test = split_data(df, feat_col, target_col, split0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((228, 4), (228, 3), (114, 4), (114, 3))
# Fit and predict with a sklearn model, returning validation targets and predictions.
model = LinearRegression()
target, pred = train_ml(df, feat_col, target_col, split0, model)
pred.head()
| | species_Adelie | species_Chinstrap | species_Gentoo |
|---|---|---|---|
| 0 | 0.993427 | 0.137000 | -0.130427 |
| 3 | 1.064457 | 0.046586 | -0.111043 |
| 9 | 0.839056 | 0.118838 | 0.042105 |
| 11 | 0.669557 | 0.423417 | -0.092974 |
| 14 | 1.050863 | -0.073914 | 0.023052 |
# Run cross-validation through the given splits.
oof = train_ml_cv(df, feat_col, target_col, splits=splits, model=LinearRegression())
oof.head()
| | species_Adelie | species_Chinstrap | species_Gentoo | nfold |
|---|---|---|---|---|
| 0 | 0.993427 | 0.137000 | -0.130427 | 0 |
| 1 | 0.790344 | 0.103762 | 0.105894 | 1 |
| 2 | 0.673088 | 0.317647 | 0.009265 | 2 |
| 3 | 1.064457 | 0.046586 | -0.111043 | 0 |
| 4 | 1.122991 | 0.154406 | -0.277398 | 1 |
# Clip negatives and renormalize probability-like predictions.
post_process(pred.head())
| | species_Adelie | species_Chinstrap | species_Gentoo |
|---|---|---|---|
| 0 | 0.878807 | 1.211930e-01 | 8.846216e-09 |
| 3 | 0.958070 | 4.192990e-02 | 9.000554e-09 |
| 9 | 0.839056 | 1.188384e-01 | 4.210543e-02 |
| 11 | 0.612601 | 3.873988e-01 | 9.149350e-09 |
| 14 | 0.978535 | 9.311731e-09 | 2.146502e-02 |
# Post-process prediction columns in an out-of-fold dataframe.
oof = post_process_oof(oof, target_col)
oof[target_col].head()
| | species_Adelie | species_Chinstrap | species_Gentoo |
|---|---|---|---|
| 0 | 0.878807 | 0.121193 | 8.846216e-09 |
| 1 | 0.790344 | 0.103762 | 1.058942e-01 |
| 2 | 0.673088 | 0.317647 | 9.264531e-03 |
| 3 | 0.958070 | 0.041930 | 9.000554e-09 |
| 4 | 0.879124 | 0.120876 | 7.828416e-09 |
# Predict from a saved sklearn model.
model_path = Path("_tmp/penguins_ml.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
_ = train_ml(df, feat_col, target_col, split0, LinearRegression(), save=model_path)
predict_ml(df.iloc[split0[1]], feat_col, target_col, model_pth=model_path).head()
| | species_Adelie | species_Chinstrap | species_Gentoo |
|---|---|---|---|
| 0 | 0.993427 | 0.137000 | -0.130427 |
| 3 | 1.064457 | 0.046586 | -0.111043 |
| 9 | 0.839056 | 0.118838 | 0.042105 |
| 11 | 0.669557 | 0.423417 | -0.092974 |
| 14 | 1.050863 | -0.073914 | 0.023052 |
02 DNN
from kmodel.dnn import seed_everything, GeneralDataset, MLP, lin_wn, CNN1D, PSSM_model, init_weights, CE, KLD, JSD, train_dl, predict_dl, train_dl_cv
import fastcore.all as fc
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastai.vision.all import *
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
# Set up the objects used by the examples below.
seed_everything(123)
df = sns.load_dataset("penguins").dropna(
subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
n_feature = len(feat_col)
n_target = len(target_col)
n_aa = len(target_col)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
splits = list(skf.split(df.index, df["species"]))
split0 = splits[0]
ds = GeneralDataset(df, feat_col, target_col, A=n_aa)
xb, yb = next(iter(DataLoader(ds, batch_size=8, shuffle=True)))
logits = PSSM_model(n_feature, n_target, A=n_aa, model="MLP")(xb)
df.shape
(342, 10)
# Feed-forward model for tabular inputs.
mlp = MLP(n_feature, n_target)
mlp(xb).shape
torch.Size([8, 3])
# Weight-normalized linear block.
lin_wn(10, 3)
Sequential(
(0): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): Dropout(p=0.1, inplace=False)
(2): ParametrizedLinear(
in_features=10, out_features=3, bias=True
(parametrizations): ModuleDict(
(weight): ParametrizationList(
(0): _WeightNorm()
)
)
)
(3): SiLU()
)
# Initialize convolution layers with Kaiming normal weights.
cnn = CNN1D(n_feature, n_target).apply(init_weights)
cnn(xb).shape
torch.Size([8, 3])
# Cross-entropy with soft labels.
CE(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average KL divergence across positions between target_probs and softmax(logits).
KLD(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average Jensen-Shannon divergence across positions between target_probs and softmax(logits).
JSD(logits, yb)
tensor(0.3023, grad_fn=<MeanBackward0>)
# Train a deep learning model with the fastai learner stack.
get_mlp = lambda: PSSM_model(n_feature, n_target, A=n_aa, model='MLP')
target, pred = train_dl(
df,
feat_col,
target_col,
split0,
model_func=get_mlp,
A=n_aa,
n_epoch=1,
bs=16,
lr=3e-3,
save='model',
)
pred.head()
lr in training is 0.003
<style>
progress { appearance: none; border: none; border-radius: 4px; width: 300px;
height: 20px; vertical-align: middle; background: #e0e0e0; }
progress::-webkit-progress-bar { background: #e0e0e0; border-radius: 4px; }
progress::-webkit-progress-value { background: #2196F3; border-radius: 4px; }
progress::-moz-progress-bar { background: #2196F3; border-radius: 4px; }
progress:not([value]) {
background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px); }
progress.progress-bar-interrupted::-webkit-progress-value { background: #F44336; }
progress.progress-bar-interrupted::-moz-progress-value { background: #F44336; }
progress.progress-bar-interrupted::-webkit-progress-bar { background: #F44336; }
progress.progress-bar-interrupted::-moz-progress-bar { background: #F44336; }
progress.progress-bar-interrupted { background: #F44336; }
table.fastprogress { border-collapse: collapse; margin: 1em 0; font-size: 0.9em; }
table.fastprogress th, table.fastprogress td { padding: 8px 12px; border: 1px solid #ddd; text-align: left; }
table.fastprogress thead tr { background: #f8f9fa; font-weight: bold; }
table.fastprogress tbody tr:nth-of-type(even) { background: #f8f9fa; }
</style>
<div>
<table class="fastprogress">
<thead>
<tr>
<th>epoch</th>
<th>train_loss</th>
<th>valid_loss</th>
<th>KLD</th>
<th>JSD</th>
<th>time</th>
</tr>
</thead>
<tbody>
<tr>
<td>0</td>
<td>0.740750</td>
<td>3.037915</td>
<td>3.037915</td>
<td>0.376177</td>
<td>00:00</td>
</tr>
</tbody>
</table>
</div>
| | species_Adelie | species_Chinstrap | species_Gentoo |
|---|---|---|---|
| 0 | 0.009204 | 0.009344 | 0.981452 |
| 3 | 0.063467 | 0.055438 | 0.881095 |
| 9 | 0.182922 | 0.156108 | 0.660971 |
| 11 | 0.294663 | 0.286624 | 0.418712 |
| 14 | 0.011959 | 0.011384 | 0.976657 |
# Predict a dataframe given a deep learning model saved by fastai.
test_pred = predict_dl(
df.iloc[split0[1]].copy(),
feat_col,
target_col,
model_func=get_mlp,
model_pth='model',
A=n_aa,
)
test_pred
| | species_Adelie | species_Chinstrap | species_Gentoo |
|---|---|---|---|
| 0 | 9.204363e-03 | 9.344031e-03 | 0.981452 |
| 3 | 6.346702e-02 | 5.543802e-02 | 0.881095 |
| 9 | 1.829216e-01 | 1.561077e-01 | 0.660971 |
| 11 | 2.946635e-01 | 2.866240e-01 | 0.418712 |
| 14 | 1.195943e-02 | 1.138383e-02 | 0.976657 |
| ... | ... | ... | ... |
| 328 | 5.574878e-09 | 1.360372e-08 | 1.000000 |
| 334 | 2.227252e-10 | 7.630787e-10 | 1.000000 |
| 335 | 5.731530e-07 | 1.071595e-06 | 0.999998 |
| 339 | 5.872652e-10 | 1.706496e-09 | 1.000000 |
| 340 | 5.236147e-08 | 1.102807e-07 | 1.000000 |
114 rows × 3 columns
# Cross-validation training loop for deep learning models.
oof = train_dl_cv(
df,
feat_col,
target_col,
splits=splits,
model_func=get_mlp,
A=n_aa,
n_epoch=1,
bs=16,
lr=3e-3,
)
oof.nfold.value_counts()
------fold0------
lr in training is 0.003
<div>
<table class="fastprogress">
<thead>
<tr>
<th>epoch</th>
<th>train_loss</th>
<th>valid_loss</th>
<th>KLD</th>
<th>JSD</th>
<th>time</th>
</tr>
</thead>
<tbody>
<tr>
<td>0</td>
<td>0.713906</td>
<td>2.601878</td>
<td>2.601878</td>
<td>0.355676</td>
<td>00:00</td>
</tr>
</tbody>
</table>
</div>
------fold1------
lr in training is 0.003
<div>
<table class="fastprogress">
<thead>
<tr>
<th>epoch</th>
<th>train_loss</th>
<th>valid_loss</th>
<th>KLD</th>
<th>JSD</th>
<th>time</th>
</tr>
</thead>
<tbody>
<tr>
<td>0</td>
<td>0.699992</td>
<td>2.666008</td>
<td>2.666008</td>
<td>0.354615</td>
<td>00:00</td>
</tr>
</tbody>
</table>
</div>
------fold2------
lr in training is 0.003
<div>
<table class="fastprogress">
<thead>
<tr>
<th>epoch</th>
<th>train_loss</th>
<th>valid_loss</th>
<th>KLD</th>
<th>JSD</th>
<th>time</th>
</tr>
</thead>
<tbody>
<tr>
<td>0</td>
<td>0.633100</td>
<td>2.994696</td>
<td>2.994696</td>
<td>0.354643</td>
<td>00:00</td>
</tr>
</tbody>
</table>
</div>
nfold
0 114
1 114
2 114
Name: count, dtype: int64
Project details
Release history | Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Filter files by name, interpreter, ABI, and platform.
If you're not sure about the file name format, learn more about wheel file names.
Copy a direct link to the current filters
File details
Details for the file kmodel-0.0.2.tar.gz.
File metadata
- Download URL: kmodel-0.0.2.tar.gz
- Upload date:
- Size: 18.6 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/6.2.0 CPython/3.12.12
File hashes
| Algorithm | Hash digest |
|---|---|
| SHA256 | 0eca1c1f1ec36e5e2e97589aecef61fddc16bad70b1b889e3c7b90fa3d735152 |
| MD5 | 28cd687f19bc39a481b40cae7a6dc958 |
| BLAKE2b-256 | 60391f3c8a5b9d7c9796742b2684ac7a8b501bcfecc5a0268b7ea63b9b4644e0 |
File details
Details for the file kmodel-0.0.2-py3-none-any.whl.
File metadata
- Download URL: kmodel-0.0.2-py3-none-any.whl
- Upload date:
- Size: 16.5 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/6.2.0 CPython/3.12.12
File hashes
| Algorithm | Hash digest |
|---|---|
| SHA256 | d2ece9d73a4e60fa214a60a8e0217286d7c8e7b685ca30b600b30b405ffe447a |
| MD5 | 77dde94d9a70cc3a282721f3d706b026 |
| BLAKE2b-256 | 81894ae469ca5d41352a12e362fce87ed7d308ac3cb14b712f346838cfcb9c07 |