Skip to main content

A fresh nbdev project for kmodel.

Project description

kmodel

kmodel provides reusable machine learning and deep learning helpers for multi-output modeling workflows. It covers tabular model training, scoring, post-processing, prediction, and fastai-based deep learning utilities through runnable examples derived from the project notebooks.

Installation

pip install kmodel

Quick start

The examples below follow the notebooks under nbs/ in order. Each function example lives in its own cell and starts with a short comment derived from the function docstring.

01 ML

from kmodel.ml import get_splits, split_data, train_ml, train_ml_cv, post_process, post_process_oof, predict_ml
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
df = sns.load_dataset("penguins").dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
df.shape
(342, 10)
# Split samples in a dataframe with stratified, grouped, or stratified-grouped K-fold logic.
splits = get_splits(df, stratified="species", nfold=3)
split0 = splits[0]
len(split0[0]), len(split0[1])
StratifiedKFold(n_splits=3, random_state=123, shuffle=True)
# species in train set: 3
# species in test set: 3

(228, 114)
# Given a split tuple, return X_train, y_train, X_test, and y_test.
X_train, y_train, X_test, y_test = split_data(df, feat_col, target_col, split0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((228, 4), (228, 3), (114, 4), (114, 3))
# Fit and predict with a sklearn model, returning validation targets and predictions.
model = LinearRegression()
target, pred = train_ml(df, feat_col, target_col, split0, model)
pred.head()
species_Adelie species_Chinstrap species_Gentoo
0 0.993427 0.137000 -0.130427
3 1.064457 0.046586 -0.111043
9 0.839056 0.118838 0.042105
11 0.669557 0.423417 -0.092974
14 1.050863 -0.073914 0.023052
# Run cross-validation through the given splits.
oof = train_ml_cv(df, feat_col, target_col, splits=splits, model=LinearRegression())
oof.head()
species_Adelie species_Chinstrap species_Gentoo nfold
0 0.993427 0.137000 -0.130427 0
1 0.790344 0.103762 0.105894 1
2 0.673088 0.317647 0.009265 2
3 1.064457 0.046586 -0.111043 0
4 1.122991 0.154406 -0.277398 1
# Clip negatives and renormalize probability-like predictions.
post_process(pred.head())
species_Adelie species_Chinstrap species_Gentoo
0 0.878807 1.211930e-01 8.846216e-09
3 0.958070 4.192990e-02 9.000554e-09
9 0.839056 1.188384e-01 4.210543e-02
11 0.612601 3.873988e-01 9.149350e-09
14 0.978535 9.311731e-09 2.146502e-02
# Post-process prediction columns in an out-of-fold dataframe.
oof = post_process_oof(oof, target_col)
oof[target_col].head()
species_Adelie species_Chinstrap species_Gentoo
0 0.878807 0.121193 8.846216e-09
1 0.790344 0.103762 1.058942e-01
2 0.673088 0.317647 9.264531e-03
3 0.958070 0.041930 9.000554e-09
4 0.879124 0.120876 7.828416e-09
# Predict from a saved sklearn model.
model_path = Path("_tmp/penguins_ml.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
_ = train_ml(df, feat_col, target_col, split0, LinearRegression(), save=model_path)
predict_ml(df.iloc[split0[1]], feat_col, target_col, model_pth=model_path).head()
species_Adelie species_Chinstrap species_Gentoo
0 0.993427 0.137000 -0.130427
3 1.064457 0.046586 -0.111043
9 0.839056 0.118838 0.042105
11 0.669557 0.423417 -0.092974
14 1.050863 -0.073914 0.023052

02 DNN

from kmodel.dnn import seed_everything, GeneralDataset, MLP, lin_wn, CNN1D, PSSM_model, init_weights, CE, KLD, JSD, train_dl, predict_dl, train_dl_cv
import fastcore.all as fc
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from fastai.vision.all import *
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
# Set up the objects used by the examples below.
seed_everything(123)
df = sns.load_dataset("penguins").dropna(
    subset=["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "species"]
).reset_index(drop=True)
feat_col = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
target_df = pd.get_dummies(df["species"], prefix="species", dtype=float)
target_col = target_df.columns.tolist()
df[target_col] = target_df
n_feature = len(feat_col)
n_target = len(target_col)
n_aa = len(target_col)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
splits = list(skf.split(df.index, df["species"]))
split0 = splits[0]
ds = GeneralDataset(df, feat_col, target_col, A=n_aa)
xb, yb = next(iter(DataLoader(ds, batch_size=8, shuffle=True)))
logits = PSSM_model(n_feature, n_target, A=n_aa, model="MLP")(xb)
df.shape
(342, 10)
# Feed-forward model for tabular inputs.
mlp = MLP(n_feature, n_target)
mlp(xb).shape
torch.Size([8, 3])
# Weight-normalized linear block.
lin_wn(10, 3)
Sequential(
  (0): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (1): Dropout(p=0.1, inplace=False)
  (2): ParametrizedLinear(
    in_features=10, out_features=3, bias=True
    (parametrizations): ModuleDict(
      (weight): ParametrizationList(
        (0): _WeightNorm()
      )
    )
  )
  (3): SiLU()
)
# Initialize convolution layers with Kaiming normal weights.
cnn = CNN1D(n_feature, n_target).apply(init_weights)
cnn(xb).shape
torch.Size([8, 3])
# Cross-entropy with soft labels.
CE(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average KL divergence across positions between target_probs and softmax(logits).
KLD(logits, yb)
tensor(1.0681, grad_fn=<MeanBackward0>)
# Average Jensen-Shannon divergence across positions between target_probs and softmax(logits).
JSD(logits, yb)
tensor(0.3023, grad_fn=<MeanBackward0>)
# Train a deep learning model with the fastai learner stack.
get_mlp = lambda: PSSM_model(n_feature, n_target, A=n_aa, model='MLP')
target, pred = train_dl(
    df,
    feat_col,
    target_col,
    split0,
    model_func=get_mlp,
    A=n_aa,
    n_epoch=1,
    bs=16,
    lr=3e-3,
    save='model',
)
pred.head()
lr in training is 0.003
epoch  train_loss  valid_loss  KLD       JSD       time
0      0.740750    3.037915    3.037915  0.376177  00:00
species_Adelie species_Chinstrap species_Gentoo
0 0.009204 0.009344 0.981452
3 0.063467 0.055438 0.881095
9 0.182922 0.156108 0.660971
11 0.294663 0.286624 0.418712
14 0.011959 0.011384 0.976657
# Predict a dataframe given a deep learning model saved by fastai.
test_pred = predict_dl(
    df.iloc[split0[1]].copy(),
    feat_col,
    target_col,
    model_func=get_mlp,
    model_pth='model',
    A=n_aa,
)
test_pred
species_Adelie species_Chinstrap species_Gentoo
0 9.204363e-03 9.344031e-03 0.981452
3 6.346702e-02 5.543802e-02 0.881095
9 1.829216e-01 1.561077e-01 0.660971
11 2.946635e-01 2.866240e-01 0.418712
14 1.195943e-02 1.138383e-02 0.976657
... ... ... ...
328 5.574878e-09 1.360372e-08 1.000000
334 2.227252e-10 7.630787e-10 1.000000
335 5.731530e-07 1.071595e-06 0.999998
339 5.872652e-10 1.706496e-09 1.000000
340 5.236147e-08 1.102807e-07 1.000000

114 rows × 3 columns

# Cross-validation training loop for deep learning models.
oof = train_dl_cv(
    df,
    feat_col,
    target_col,
    splits=splits,
    model_func=get_mlp,
    A=n_aa,
    n_epoch=1,
    bs=16,
    lr=3e-3,
)
oof.nfold.value_counts()
------fold0------
lr in training is 0.003
epoch  train_loss  valid_loss  KLD       JSD       time
0      0.713906    2.601878    2.601878  0.355676  00:00
------fold1------
lr in training is 0.003
epoch  train_loss  valid_loss  KLD       JSD       time
0      0.699992    2.666008    2.666008  0.354615  00:00
------fold2------
lr in training is 0.003
epoch  train_loss  valid_loss  KLD       JSD       time
0      0.633100    2.994696    2.994696  0.354643  00:00
nfold
0    114
1    114
2    114
Name: count, dtype: int64

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

kmodel-0.0.2.tar.gz (18.6 kB view details)

Uploaded Source

Built Distribution

If you're not sure about the file name format, learn more about wheel file names.

kmodel-0.0.2-py3-none-any.whl (16.5 kB view details)

Uploaded Python 3

File details

Details for the file kmodel-0.0.2.tar.gz.

File metadata

  • Download URL: kmodel-0.0.2.tar.gz
  • Upload date:
  • Size: 18.6 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.2.0 CPython/3.12.12

File hashes

Hashes for kmodel-0.0.2.tar.gz
Algorithm Hash digest
SHA256 0eca1c1f1ec36e5e2e97589aecef61fddc16bad70b1b889e3c7b90fa3d735152
MD5 28cd687f19bc39a481b40cae7a6dc958
BLAKE2b-256 60391f3c8a5b9d7c9796742b2684ac7a8b501bcfecc5a0268b7ea63b9b4644e0

See more details on using hashes here.

File details

Details for the file kmodel-0.0.2-py3-none-any.whl.

File metadata

  • Download URL: kmodel-0.0.2-py3-none-any.whl
  • Upload date:
  • Size: 16.5 kB
  • Tags: Python 3
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.2.0 CPython/3.12.12

File hashes

Hashes for kmodel-0.0.2-py3-none-any.whl
Algorithm Hash digest
SHA256 d2ece9d73a4e60fa214a60a8e0217286d7c8e7b685ca30b600b30b405ffe447a
MD5 77dde94d9a70cc3a282721f3d706b026
BLAKE2b-256 81894ae469ca5d41352a12e362fce87ed7d308ac3cb14b712f346838cfcb9c07

See more details on using hashes here.

Supported by

AWS Cloud computing and Security Sponsor Datadog Monitoring Depot Continuous Integration Fastly CDN Google Download Analytics Pingdom Monitoring Sentry Error logging StatusPage Status page