Skip to main content

A machine learning and deep learning utilities library.

Project description

GDML

A machine learning and deep learning utilities library, together with documentation for machine learning projects.

Overview

PyPI Downloads PyPI Downloads PyPI Downloads

This project contains resources, guides, and documentation to support machine learning workflows.

Features

  • Project setup instructions
  • Usage examples
  • Best practices
  • Reference materials

Sample Code

from gdml.Data import DataReader, DataSplitter
# NOTE: hard-coded absolute Windows path; point this at your own copy of the CSV.
data_reader = DataReader(filepath='D:\\Projects\\ML_DOC\\california_housing_train.csv')
data = data_reader.read()
print(data.head())
# Feature/target views of the frame. The splitter below is given the full
# frame plus the target column name, not these two variables.
x = data.drop(columns=['median_house_value'])
y = data['median_house_value']

# presumably test_size=0.2 holds out 20% of the rows and random_state seeds the
# shuffle (scikit-learn naming) — confirm against the gdml documentation.
data_splitter = DataSplitter(data=data, target_column='median_house_value', test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = data_splitter.get_all_splits()

# Regression

from gdml.ML import ML_regression
ml_regression = ML_regression(model='LinearRegression', dataset={'X_train': X_train, 'y_train': y_train})
ml_regression.plot(X_test, y_test)
print("Regression score:", ml_regression.score(X_test, y_test))

from sklearn.datasets import load_iris
import pandas as pd

# as_frame=True makes load_iris return pandas objects for 'data' and 'target'.
iris = load_iris(as_frame=True)
iris_data = iris['data']
iris_target = iris['target']
# Join features and labels column-wise into one frame; the label column is named 'target'.
iris_df = pd.concat([iris_data, iris_target.rename('target')], axis=1)

# Same DataSplitter call pattern as for the housing data, with 'target' as the label column.
iris_splitter = DataSplitter(data=iris_df, target_column='target', test_size=0.2, random_state=42)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = iris_splitter.get_all_splits()

# Classification

from gdml.ML import ML_classification
ml_classification = ML_classification(model='SVC_classifier', dataset={'X_train': X_train_cls, 'y_train': y_train_cls})
ml_classification.plot(X_test_cls, y_test_cls)
print("Classification score:", ml_classification.score(X_test_cls, y_test_cls))

# Clustering
# Reuses the iris split from the classification example; only X_train is
# supplied in the dataset dict (no labels).
from gdml.ML import ML_Clustering
ml_clustering = ML_Clustering(model='KMeans', dataset={'X_train': X_train_cls})
ml_clustering.plot(X_test_cls)
# NOTE(review): score() is passed the true labels even though the model was
# built without y_train — confirm which metric gdml computes here.
print("Clustering score:", ml_clustering.score(X_test_cls, y_test_cls))

# Time Series

from gdml.ML import ML_TimeSeriesToolkit
# The CSV is fetched over HTTP each time this runs; `pd` is the pandas import
# from the iris example earlier in this script.
data_url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/airline-passengers.csv'
df = pd.read_csv(data_url)

# 'Passengers' and 'Month' are expected to be column names in the CSV — TODO confirm against the dataset header.
toolkit = ML_TimeSeriesToolkit(data=df, value_col='Passengers', date_col='Month', freq='MS')
toolkit.plot_series(title='Airline Passengers Over Time')
toolkit.decompose(model='multiplicative')
toolkit.check_stationarity()
toolkit.plot_autocorrelations()

# Three models are fitted in turn, each followed by a 36-step forecast plot.
# presumably plot_forecast() uses the most recently fitted model — confirm.
toolkit.fit_sarima(order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
toolkit.plot_forecast(steps=36)
toolkit.fit_ets(seasonal='mul', seasonal_periods=12)
toolkit.plot_forecast(steps=36)
toolkit.fit_prophet()
toolkit.plot_forecast(steps=36)

# Reinforcement Learning

from gdml.ML import ML_RLAgent
agent = ML_RLAgent(env_name="FrozenLake-v1", algorithm="q_learning", alpha=0.5, gamma=0.99)
agent.train(episodes=500)
agent.plot_progress()
agent.test(episodes=3)

# Image Classification

from gdml.Data import ImageDataLoader
image_data_loader = ImageDataLoader(file_path='D:\\Projects\\ML_DOC\\img', batch_size=32, image_size=(224, 224))
images, labels, class_names = image_data_loader.load_data()
images = image_data_loader.preprocess_data(images)
images_augmented = image_data_loader.augment_data(images)
print(images_augmented, labels.shape, class_names)
ml_classification = ML_classification(model='Logistic_regression', dataset={'X_train': images_augmented, 'y_train': labels})
print("Classification score:", ml_classification.score(images_augmented, labels))


# CNN

import torch
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
from gdml.DL import DLModel
import numpy as np

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# download=True fetches MNIST into ./data when it is not already present.
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
val_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
# NOTE: the test set is the very same object as the validation set, so the
# "test" results computed from test_loader are not measured on data that was
# held out from validation / early stopping.
test_dataset = val_dataset

# Only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model = DLModel(
    model_type="CNN",
    input_shape=(1, 28, 28),
    num_classes=10,
    task="classification",
    lr=0.001,
    optimizer="adam",
    loss_fn="cross_entropy",
    scheduler="step",
    scheduler_kwargs={"step_size": 10, "gamma": 0.1},
    gradient_clip=1.0,
    base=32,
    num_conv_layers=2,
    dropout=0.2,
    activation="relu",
    batch_norm=True,
    global_pool="avg"
)

model.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=10,
    print_every=2,
    early_stopping=5
)

results = model.evaluate(test_loader)
print(f"Test Results: {results}")

sample_images, _ = next(iter(test_loader))
sample_predictions = model.predict(sample_images[:10])
print(f"Sample predictions shape: {sample_predictions.shape}")
print(f"Predicted classes: {sample_predictions.argmax(dim=1)}")

model.save_model("mnist_cnn_classifier.pth")

import time

class TrainingCallback:
    """Track wall-clock training time and the best validation accuracy seen.

    The three hooks are meant to be called by a training loop:
    ``on_train_begin`` once before the first epoch, ``on_epoch_end`` after
    every epoch and ``on_train_end`` once after the last epoch.
    """

    def __init__(self):
        # Set by on_train_begin(); stays None until training has started.
        self.start_time = None
        self.best_val_acc = 0

    def on_train_begin(self):
        """Record the start time and announce that training has begun."""
        self.start_time = time.time()
        print("Training started!")

    def on_epoch_end(self, epoch, train_metrics, val_metrics):
        """Update the best validation accuracy after an epoch.

        Args:
            epoch: Epoch number (not used by this callback).
            train_metrics: Training metrics (not used by this callback).
            val_metrics: Mapping that may contain an ``'accuracy'`` entry.
                ``None`` or a mapping without that key counts as accuracy 0.
        """
        # Tolerate None so a loop without validation data cannot crash here.
        val_acc = (val_metrics or {}).get('accuracy', 0)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            print(f"New best validation accuracy: {val_acc:.4f}")

    def on_train_end(self):
        """Print the elapsed training time and the best validation accuracy."""
        # If on_train_begin() was never called there is no start time to
        # subtract from; report zero instead of raising TypeError.
        if self.start_time is None:
            elapsed = 0.0
        else:
            elapsed = time.time() - self.start_time
        print(f"Training completed in {elapsed:.2f} seconds")
        print(f"Best validation accuracy: {self.best_val_acc:.4f}")

def train_with_callbacks(model, train_loader, val_loader, epochs, callback=None):
    """Run a manual training loop and notify an optional callback.

    Args:
        model: Object exposing ``_train_one_epoch(loader)`` and
            ``_eval_one_epoch(loader)`` (each returning a metrics dict), a
            ``history`` dict with ``"loss"``, ``"metric"``, ``"val_loss"`` and
            ``"val_metric"`` lists, and a ``scheduler`` attribute (may be falsy).
        train_loader: Loader passed to ``model._train_one_epoch``.
        val_loader: Loader passed to ``model._eval_one_epoch``, or ``None`` to
            skip validation.
        epochs: Number of epochs to run (epochs are numbered from 1).
        callback: Optional object with ``on_train_begin()``,
            ``on_epoch_end(epoch, train_metrics, val_metrics)`` and
            ``on_train_end()``.
    """
    # Compare against None explicitly: truthiness of a loader goes through
    # __len__, which is 0 for an empty loader and raises for loaders that
    # have no length.
    has_val = val_loader is not None

    if callback:
        callback.on_train_begin()
    for epoch in range(1, epochs + 1):
        train_metrics = model._train_one_epoch(train_loader)
        val_metrics = model._eval_one_epoch(val_loader) if has_val else {}
        # Missing losses are recorded as 0 so the history lists stay aligned.
        model.history["loss"].append(train_metrics.get("loss", 0))
        model.history["metric"].append(train_metrics)
        model.history["val_loss"].append(val_metrics.get("loss", 0))
        model.history["val_metric"].append(val_metrics)
        print(f"Epoch {epoch:03d} | Train Loss: {train_metrics.get('loss', 0):.4f} | Train Acc: {train_metrics.get('accuracy', 0):.4f} | Val Loss: {val_metrics.get('loss', 0):.4f} | Val Acc: {val_metrics.get('accuracy', 0):.4f}")
        if callback:
            callback.on_epoch_end(epoch, train_metrics, val_metrics)
        if model.scheduler:
            if isinstance(model.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                # The plateau scheduler needs a real signal. Without a
                # validation loss, monitor the training loss rather than a
                # constant 0, which would look like a permanent plateau.
                monitored = val_metrics.get('loss', train_metrics.get('loss', 0))
                model.scheduler.step(monitored)
            else:
                model.scheduler.step()
    if callback:
        callback.on_train_end()

def create_image_dataset(n_samples, image_size, n_classes):
    """Generate random 3-channel square images with random integer labels.

    Args:
        n_samples: Number of images (and labels) to generate.
        image_size: Height and width of each image.
        n_classes: Labels are drawn from ``0`` to ``n_classes - 1``.

    Returns:
        Tuple ``(images, labels)``: ``images`` has shape
        ``(n_samples, 3, image_size, image_size)`` and is drawn with
        ``torch.randn``; ``labels`` has shape ``(n_samples,)`` and is drawn
        with ``torch.randint``.
    """
    image_shape = (n_samples, 3, image_size, image_size)
    images = torch.randn(image_shape)
    labels = torch.randint(low=0, high=n_classes, size=(n_samples,))
    return images, labels

train_X, train_y = create_image_dataset(800, 32, 5)
val_X, val_y = create_image_dataset(200, 32, 5)

train_dataset = TensorDataset(train_X, train_y)
val_dataset = TensorDataset(val_X, val_y)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

model = DLModel(
    model_type="CNN",
    input_shape=(3, 32, 32),
    num_classes=5,
    task="classification",
    lr=0.001,
    optimizer="adamw",
    scheduler="cosine_warm",
    scheduler_kwargs={"T_0": 10, "T_mult": 2}
)

callback = TrainingCallback()
train_with_callbacks(model, train_loader, val_loader, epochs=25, callback=callback)

import matplotlib.pyplot as plt

# One figure with two side-by-side panels: loss on the left, accuracy on the right.
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(model.history['loss'], label='Train Loss')
plt.plot(model.history['val_loss'], label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')

plt.subplot(1, 2, 2)
# history['metric'] / history['val_metric'] hold one metrics dict per epoch;
# an epoch whose dict has no 'accuracy' entry is plotted as 0.
train_acc = [m.get('accuracy', 0) for m in model.history['metric']]
val_acc = [m.get('accuracy', 0) for m in model.history['val_metric']]
plt.plot(train_acc, label='Train Accuracy')
plt.plot(val_acc, label='Val Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Training and Validation Accuracy')
plt.tight_layout()
plt.show()

from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=3,
    random_state=42
)

scaler = StandardScaler()
X = scaler.fit_transform(X)
X = torch.FloatTensor(X)
y = torch.LongTensor(y)

# 70% train / 20% validation; whatever remains becomes the test set.
train_size = int(0.7 * len(X))
val_size = int(0.2 * len(X))

# Contiguous slices in the existing row order — no extra shuffling is done here.
train_X, train_y = X[:train_size], y[:train_size]
val_X, val_y = X[train_size:train_size+val_size], y[train_size:train_size+val_size]
test_X, test_y = X[train_size+val_size:], y[train_size+val_size:]

train_dataset = TensorDataset(train_X, train_y)
val_dataset = TensorDataset(val_X, val_y)
test_dataset = TensorDataset(test_X, test_y)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# MLP

model = DLModel(
    model_type="MLP",
    input_shape=(20,),
    num_classes=3,
    task="classification",
    lr=0.01,
    optimizer="sgd",
    optimizer_kwargs={"momentum": 0.9, "weight_decay": 1e-4},
    loss_fn="focal",
    loss_kwargs={"alpha": 1, "gamma": 2},
    scheduler="multistep",
    scheduler_kwargs={"milestones": [10, 20], "gamma": 0.1},
    hidden_sizes=(256, 128, 64),
    dropout=0.3,
    activation="gelu",
    batch_norm=True,
    bias=True
)

model.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=30,
    print_every=5,
    early_stopping=10
)

results = model.evaluate(test_loader)
print(f"Test Results: {results}")

def get_feature_importance(model, X_sample):
    """Estimate per-feature importance as the mean absolute input gradient.

    For each sample the score of the predicted class is differentiated with
    respect to the input; the absolute gradients are averaged over the batch
    dimension.

    Args:
        model: Wrapper exposing ``device`` and the underlying network as
            ``model.model``. The network is put into eval mode and left there.
        X_sample: Input batch tensor; the network output for it must be 2-D
            (one row of class scores per sample).

    Returns:
        NumPy array of mean absolute gradients, one value per input feature
        position (the batch dimension is averaged away).
    """
    # Work on a detached copy: the caller's tensor must not be switched to
    # requires_grad, and repeated calls must not accumulate into its .grad.
    inputs = X_sample.detach().clone().to(model.device).requires_grad_(True)
    model.model.eval()
    output = model.model(inputs)
    pred_class = output.argmax(dim=1)
    # Sum of each sample's predicted-class score.
    score = output[range(len(pred_class)), pred_class].sum()
    # autograd.grad returns d(score)/d(inputs) directly and, unlike
    # backward(), does not populate .grad on the network's parameters.
    (input_grad,) = torch.autograd.grad(score, inputs)
    importance = input_grad.abs().mean(dim=0)
    return importance.cpu().numpy()

importance = get_feature_importance(model, test_X[:10])
print(f"Feature importance: {importance}")

def create_autoencoder_dataset(n_samples=1000, img_size=64):
    """Generate noisy single-channel images that each contain one bright disc.

    Every image starts as ``torch.randn`` noise; pixels inside a randomly
    placed circle are raised by 2.0, and the whole batch is then squashed
    with a sigmoid.

    Args:
        n_samples: Number of images to generate.
        img_size: Height and width of each image. Circle centres are drawn
            from ``[10, img_size - 10)``, so this must be greater than 20.

    Returns:
        Tensor of shape ``(n_samples, 1, img_size, img_size)``.
    """
    X = torch.randn(n_samples, 1, img_size, img_size)
    # The coordinate grids are identical for every image, so build them once.
    # indexing='ij' spells out the layout the original call relied on
    # implicitly (first output varies along rows) and avoids the deprecation
    # warning for calling meshgrid without it.
    coords = torch.arange(img_size)
    rows, cols = torch.meshgrid(coords, coords, indexing='ij')
    for i in range(n_samples):
        center_x, center_y = np.random.randint(10, img_size - 10, 2)
        radius = np.random.randint(5, 15)
        inside_circle = ((cols - center_x)**2 + (rows - center_y)**2) < radius**2
        X[i, 0][inside_circle] += 2.0
    return torch.sigmoid(X)

train_X = create_autoencoder_dataset(800, 64)
val_X = create_autoencoder_dataset(200, 64)

train_dataset = TensorDataset(train_X)
val_dataset = TensorDataset(val_X)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# ConvAutoencoder

model = DLModel(
    model_type="ConvAutoencoder",
    input_shape=(1, 64, 64),
    task="autoencoder",
    lr=0.001,
    optimizer="adam",
    loss_fn="mse",
    scheduler="step",
    scheduler_kwargs={"step_size": 15, "gamma": 0.5},
    latent_dim=128,
    base=32,
    num_layers=3,
    activation="relu",
    use_skip_connections=False
)

model.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=30,
    print_every=3
)

# Compare the first 8 validation images with the network's output for them.
with torch.no_grad():
    model.model.eval()
    sample_images = val_X[:8]
    # Only the copy fed to the network is moved to model.device;
    # sample_images itself is left where val_X lives.
    reconstructed = model.model(sample_images.to(model.device))
    # Top row: originals. Bottom row: reconstructions.
    fig, axes = plt.subplots(2, 8, figsize=(16, 4))
    for i in range(8):
        axes[0, i].imshow(sample_images[i, 0], cmap='gray')
        axes[0, i].set_title('Original')
        axes[0, i].axis('off')
        # assumes the network output is indexable as [sample, channel] like the input — TODO confirm
        axes[1, i].imshow(reconstructed[i, 0].cpu(), cmap='gray')
        axes[1, i].set_title('Reconstructed')
        axes[1, i].axis('off')
    plt.tight_layout()
    plt.show()

def create_time_series_dataset(n_samples=1000, seq_len=50, n_features=5):
    """Generate random sequences with a noisy scalar regression target.

    Args:
        n_samples: Number of sequences.
        seq_len: Number of time steps per sequence.
        n_features: Number of features per time step.

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(n_samples, seq_len, n_features)``
        drawn with ``torch.randn``; ``y`` has shape ``(n_samples,)`` and is
        the sum of all features over the last 10 time steps of each sequence
        plus ``torch.randn`` noise scaled by 0.1.
    """
    sequences = torch.randn(n_samples, seq_len, n_features)
    # Target signal: everything in the final 10 steps, summed per sequence.
    tail_total = sequences[:, -10:, :].sum(dim=(1, 2))
    noise = torch.randn(n_samples) * 0.1
    targets = tail_total + noise
    return sequences, targets

train_X, train_y = create_time_series_dataset(800, 50, 3)
val_X, val_y = create_time_series_dataset(200, 50, 3)
test_X, test_y = create_time_series_dataset(100, 50, 3)

train_dataset = TensorDataset(train_X, train_y)
val_dataset = TensorDataset(val_X, val_y)
test_dataset = TensorDataset(test_X, test_y)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# LSTM

model = DLModel(
    model_type="LSTM",
    input_shape=(50, 3),
    task="regression",
    output_dim=1,
    lr=0.001,
    optimizer="adam",
    loss_fn="mse",
    scheduler="plateau",
    scheduler_kwargs={"patience": 5, "factor": 0.5},
    hidden=128,
    num_layers=2,
    bidirectional=True,
    dropout=0.2,
    cell_type="LSTM",
    pooling="last",
    attention=True
)

model.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=25,
    print_every=3,
    early_stopping=7
)

results = model.evaluate(test_loader)
print(f"Test Results: {results}")

predictions = model.predict(test_X[:10])
print(f"Sample predictions: {predictions.squeeze()}")
print(f"Actual values: {test_y[:10]}")

Getting Started

  1. Clone the repository:
    git clone https://github.com/gokulraj0906/gdml.git
    
  2. Explore the documentation files for guidance.

Contributing

Contributions are welcome! Please open issues or submit pull requests.

License

This project is licensed under the License.

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

gdml-1.5.0.tar.gz (40.4 kB view details)

Uploaded Source

Built Distribution

If you're not sure about the file name format, learn more about wheel file names.

gdml-1.5.0-py3-none-any.whl (36.5 kB view details)

Uploaded Python 3

File details

Details for the file gdml-1.5.0.tar.gz.

File metadata

  • Download URL: gdml-1.5.0.tar.gz
  • Upload date:
  • Size: 40.4 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.1.0 CPython/3.13.7

File hashes

Hashes for gdml-1.5.0.tar.gz
Algorithm Hash digest
SHA256 d5dcb72e43f1b4bb28b14774ba06b83f5a07ec45b8ce3657dd30aa883e78bb69
MD5 326c9074e6d6e29473f37f00084c819d
BLAKE2b-256 5732ea4d78e1d0090d03762a3e833bf2c36535565964ee12b06b2271ccbfff54

See more details on using hashes here.

File details

Details for the file gdml-1.5.0-py3-none-any.whl.

File metadata

  • Download URL: gdml-1.5.0-py3-none-any.whl
  • Upload date:
  • Size: 36.5 kB
  • Tags: Python 3
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.1.0 CPython/3.13.7

File hashes

Hashes for gdml-1.5.0-py3-none-any.whl
Algorithm Hash digest
SHA256 11395ed89c2ac6bed35817714d67e68fb4d668440ddc39a9ed5c129c0f3274c3
MD5 3fb9127b4250fa209ae4403a88ba4222
BLAKE2b-256 0adbc3a377a48a180cb11119d0dc7f6d751179c8c515f41a9dbfd056c03142b6

See more details on using hashes here.

Supported by

AWS Cloud computing and Security Sponsor Datadog Monitoring Depot Continuous Integration Fastly CDN Google Download Analytics Pingdom Monitoring Sentry Error logging StatusPage Status page