Skip to main content

No project description provided

Project description

GDML

A documentation repository for machine learning projects.

Overview

This project contains resources, guides, and documentation to support machine learning workflows.

Features

  • Project setup instructions
  • Usage examples
  • Best practices
  • Reference materials

Sample Code

from gdml.Data import DataReader, DataSplitter
data_reader = DataReader(filepath='D:\\Projects\\ML_DOC\\california_housing_train.csv')
data = data_reader.read()
print(data.head())
x = data.drop(columns=['median_house_value'])
y = data['median_house_value']

data_splitter = DataSplitter(data=data, target_column='median_house_value', test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = data_splitter.get_all_splits()

# Regression

from gdml.ML import ML_regression
ml_regression = ML_regression(model='LinearRegression', dataset={'X_train': X_train, 'y_train': y_train})
ml_regression.plot(X_test, y_test)
print("Regression score:", ml_regression.score(X_test, y_test))

from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris(as_frame=True)
iris_data = iris['data']
iris_target = iris['target']
iris_df = pd.concat([iris_data, iris_target.rename('target')], axis=1)

iris_splitter = DataSplitter(data=iris_df, target_column='target', test_size=0.2, random_state=42)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = iris_splitter.get_all_splits()

# Classification

from gdml.ML import ML_classification
ml_classification = ML_classification(model='SVC_classifier', dataset={'X_train': X_train_cls, 'y_train': y_train_cls})
ml_classification.plot(X_test_cls, y_test_cls)
print("Classification score:", ml_classification.score(X_test_cls, y_test_cls))

from gdml.ML import ML_Clustering
ml_clustering = ML_Clustering(model='KMeans', dataset={'X_train': X_train_cls})
ml_clustering.plot(X_test_cls)
print("Clustering score:", ml_clustering.score(X_test_cls, y_test_cls))

# Time Series

from gdml.ML import ML_TimeSeriesToolkit
data_url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/airline-passengers.csv'
df = pd.read_csv(data_url)

toolkit = ML_TimeSeriesToolkit(data=df, value_col='Passengers', date_col='Month', freq='MS')
toolkit.plot_series(title='Airline Passengers Over Time')
toolkit.decompose(model='multiplicative')
toolkit.check_stationarity()
toolkit.plot_autocorrelations()

toolkit.fit_sarima(order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
toolkit.plot_forecast(steps=36)
toolkit.fit_ets(seasonal='mul', seasonal_periods=12)
toolkit.plot_forecast(steps=36)
toolkit.fit_prophet()
toolkit.plot_forecast(steps=36)

# Reinforcement Learning

from gdml.ML import ML_RLAgent
agent = ML_RLAgent(env_name="FrozenLake-v1", algorithm="q_learning", alpha=0.5, gamma=0.99)
agent.train(episodes=500)
agent.plot_progress()
agent.test(episodes=3)

# Image Classification

from gdml.Data import ImageDataLoader
image_data_loader = ImageDataLoader(file_path='D:\\Projects\\ML_DOC\\img', batch_size=32, image_size=(224, 224))
images, labels, class_names = image_data_loader.load_data()
images = image_data_loader.preprocess_data(images)
images_augmented = image_data_loader.augment_data(images)
print(images_augmented, labels.shape, class_names)
ml_classification = ML_classification(model='Logistic_regression', dataset={'X_train': images_augmented, 'y_train': labels})
print("Classification score:", ml_classification.score(images_augmented, labels))


# CNN

import torch
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
from gdml.DL import DLModel
import numpy as np

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
val_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
test_dataset = val_dataset

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model = DLModel(
    model_type="CNN",
    input_shape=(1, 28, 28),
    num_classes=10,
    task="classification",
    lr=0.001,
    optimizer="adam",
    loss_fn="cross_entropy",
    scheduler="step",
    scheduler_kwargs={"step_size": 10, "gamma": 0.1},
    gradient_clip=1.0,
    base=32,
    num_conv_layers=2,
    dropout=0.2,
    activation="relu",
    batch_norm=True,
    global_pool="avg"
)

model.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=10,
    print_every=2,
    early_stopping=5
)

results = model.evaluate(test_loader)
print(f"Test Results: {results}")

sample_images, _ = next(iter(test_loader))
sample_predictions = model.predict(sample_images[:10])
print(f"Sample predictions shape: {sample_predictions.shape}")
print(f"Predicted classes: {sample_predictions.argmax(dim=1)}")

model.save_model("mnist_cnn_classifier.pth")

import time

class TrainingCallback:
    def __init__(self):
        self.start_time = None
        self.best_val_acc = 0

    def on_train_begin(self):
        self.start_time = time.time()
        print("Training started!")

    def on_epoch_end(self, epoch, train_metrics, val_metrics):
        val_acc = val_metrics.get('accuracy', 0)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            print(f"New best validation accuracy: {val_acc:.4f}")

    def on_train_end(self):
        elapsed = time.time() - self.start_time
        print(f"Training completed in {elapsed:.2f} seconds")
        print(f"Best validation accuracy: {self.best_val_acc:.4f}")

def train_with_callbacks(model, train_loader, val_loader, epochs, callback=None):
    if callback:
        callback.on_train_begin()
    for epoch in range(1, epochs + 1):
        train_metrics = model._train_one_epoch(train_loader)
        val_metrics = model._eval_one_epoch(val_loader) if val_loader else {}
        model.history["loss"].append(train_metrics.get("loss", 0))
        model.history["metric"].append(train_metrics)
        model.history["val_loss"].append(val_metrics.get("loss", 0))
        model.history["val_metric"].append(val_metrics)
        print(f"Epoch {epoch:03d} | Train Loss: {train_metrics.get('loss', 0):.4f} | Train Acc: {train_metrics.get('accuracy', 0):.4f} | Val Loss: {val_metrics.get('loss', 0):.4f} | Val Acc: {val_metrics.get('accuracy', 0):.4f}")
        if callback:
            callback.on_epoch_end(epoch, train_metrics, val_metrics)
        if model.scheduler:
            if isinstance(model.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                model.scheduler.step(val_metrics.get('loss', 0))
            else:
                model.scheduler.step()
    if callback:
        callback.on_train_end()

def create_image_dataset(n_samples, image_size, n_classes):
    X = torch.randn(n_samples, 3, image_size, image_size)
    y = torch.randint(0, n_classes, (n_samples,))
    return X, y

train_X, train_y = create_image_dataset(800, 32, 5)
val_X, val_y = create_image_dataset(200, 32, 5)

train_dataset = TensorDataset(train_X, train_y)
val_dataset = TensorDataset(val_X, val_y)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

model = DLModel(
    model_type="CNN",
    input_shape=(3, 32, 32),
    num_classes=5,
    task="classification",
    lr=0.001,
    optimizer="adamw",
    scheduler="cosine_warm",
    scheduler_kwargs={"T_0": 10, "T_mult": 2}
)

callback = TrainingCallback()
train_with_callbacks(model, train_loader, val_loader, epochs=25, callback=callback)

import matplotlib.pyplot as plt

plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(model.history['loss'], label='Train Loss')
plt.plot(model.history['val_loss'], label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')

plt.subplot(1, 2, 2)
train_acc = [m.get('accuracy', 0) for m in model.history['metric']]
val_acc = [m.get('accuracy', 0) for m in model.history['val_metric']]
plt.plot(train_acc, label='Train Accuracy')
plt.plot(val_acc, label='Val Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Training and Validation Accuracy')
plt.tight_layout()
plt.show()

from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=3,
    random_state=42
)

scaler = StandardScaler()
X = scaler.fit_transform(X)
X = torch.FloatTensor(X)
y = torch.LongTensor(y)

train_size = int(0.7 * len(X))
val_size = int(0.2 * len(X))

train_X, train_y = X[:train_size], y[:train_size]
val_X, val_y = X[train_size:train_size+val_size], y[train_size:train_size+val_size]
test_X, test_y = X[train_size+val_size:], y[train_size+val_size:]

train_dataset = TensorDataset(train_X, train_y)
val_dataset = TensorDataset(val_X, val_y)
test_dataset = TensorDataset(test_X, test_y)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# MLP

model = DLModel(
    model_type="MLP",
    input_shape=(20,),
    num_classes=3,
    task="classification",
    lr=0.01,
    optimizer="sgd",
    optimizer_kwargs={"momentum": 0.9, "weight_decay": 1e-4},
    loss_fn="focal",
    loss_kwargs={"alpha": 1, "gamma": 2},
    scheduler="multistep",
    scheduler_kwargs={"milestones": [10, 20], "gamma": 0.1},
    hidden_sizes=(256, 128, 64),
    dropout=0.3,
    activation="gelu",
    batch_norm=True,
    bias=True
)

model.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=30,
    print_every=5,
    early_stopping=10
)

results = model.evaluate(test_loader)
print(f"Test Results: {results}")

def get_feature_importance(model, X_sample):
    X_sample = X_sample.to(model.device)
    X_sample.requires_grad_()
    model.model.eval()
    output = model.model(X_sample)
    pred_class = output.argmax(dim=1)
    loss = output[range(len(pred_class)), pred_class].sum()
    loss.backward()
    importance = X_sample.grad.abs().mean(dim=0)
    return importance.cpu().numpy()

importance = get_feature_importance(model, test_X[:10])
print(f"Feature importance: {importance}")

def create_autoencoder_dataset(n_samples=1000, img_size=64):
    X = torch.randn(n_samples, 1, img_size, img_size)
    for i in range(n_samples):
        center_x, center_y = np.random.randint(10, img_size-10, 2)
        radius = np.random.randint(5, 15)
        y, x = torch.meshgrid(torch.arange(img_size), torch.arange(img_size))
        mask = ((x - center_x)**2 + (y - center_y)**2) < radius**2
        X[i, 0][mask] += 2.0
    X = torch.sigmoid(X)
    return X

train_X = create_autoencoder_dataset(800, 64)
val_X = create_autoencoder_dataset(200, 64)

train_dataset = TensorDataset(train_X)
val_dataset = TensorDataset(val_X)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# ConvAutoencoder

model = DLModel(
    model_type="ConvAutoencoder",
    input_shape=(1, 64, 64),
    task="autoencoder",
    lr=0.001,
    optimizer="adam",
    loss_fn="mse",
    scheduler="step",
    scheduler_kwargs={"step_size": 15, "gamma": 0.5},
    latent_dim=128,
    base=32,
    num_layers=3,
    activation="relu",
    use_skip_connections=False
)

model.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=30,
    print_every=3
)

with torch.no_grad():
    model.model.eval()
    sample_images = val_X[:8]
    reconstructed = model.model(sample_images.to(model.device))
    fig, axes = plt.subplots(2, 8, figsize=(16, 4))
    for i in range(8):
        axes[0, i].imshow(sample_images[i, 0], cmap='gray')
        axes[0, i].set_title('Original')
        axes[0, i].axis('off')
        axes[1, i].imshow(reconstructed[i, 0].cpu(), cmap='gray')
        axes[1, i].set_title('Reconstructed')
        axes[1, i].axis('off')
    plt.tight_layout()
    plt.show()

def create_time_series_dataset(n_samples=1000, seq_len=50, n_features=5):
    X = torch.randn(n_samples, seq_len, n_features)
    y = X[:, -10:, :].sum(dim=(1, 2)) + torch.randn(n_samples) * 0.1
    return X, y

train_X, train_y = create_time_series_dataset(800, 50, 3)
val_X, val_y = create_time_series_dataset(200, 50, 3)
test_X, test_y = create_time_series_dataset(100, 50, 3)

train_dataset = TensorDataset(train_X, train_y)
val_dataset = TensorDataset(val_X, val_y)
test_dataset = TensorDataset(test_X, test_y)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# LSTM

model = DLModel(
    model_type="LSTM",
    input_shape=(50, 3),
    task="regression",
    output_dim=1,
    lr=0.001,
    optimizer="adam",
    loss_fn="mse",
    scheduler="plateau",
    scheduler_kwargs={"patience": 5, "factor": 0.5},
    hidden=128,
    num_layers=2,
    bidirectional=True,
    dropout=0.2,
    cell_type="LSTM",
    pooling="last",
    attention=True
)

model.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=25,
    print_every=3,
    early_stopping=7
)

results = model.evaluate(test_loader)
print(f"Test Results: {results}")

predictions = model.predict(test_X[:10])
print(f"Sample predictions: {predictions.squeeze()}")
print(f"Actual values: {test_y[:10]}")

Getting Started

  1. Clone the repository:
    git clone https://github.com/gokulraj0906/gdml.git
    
  2. Explore the documentation files for guidance.

Contributing

Contributions are welcome! Please open issues or submit pull requests.

License

This project is licensed under the License.

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

gdml-0.1.0.tar.gz (35.9 kB view details)

Uploaded Source

Built Distribution

If you're not sure about the file name format, learn more about wheel file names.

gdml-0.1.0-py3-none-any.whl (37.7 kB view details)

Uploaded Python 3

File details

Details for the file gdml-0.1.0.tar.gz.

File metadata

  • Download URL: gdml-0.1.0.tar.gz
  • Upload date:
  • Size: 35.9 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.1.0 CPython/3.12.9

File hashes

Hashes for gdml-0.1.0.tar.gz
Algorithm Hash digest
SHA256 fc06bfdca5d12a3385d0905659bb22afcef183a22d121de7bfb04beaecc87c59
MD5 fc6ed35bc0ab4052bccb19246ae6daec
BLAKE2b-256 5222e5488582154a528477d109da15302f53d09d8724a491525bb66703bdc715

See more details on using hashes here.

File details

Details for the file gdml-0.1.0-py3-none-any.whl.

File metadata

  • Download URL: gdml-0.1.0-py3-none-any.whl
  • Upload date:
  • Size: 37.7 kB
  • Tags: Python 3
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.1.0 CPython/3.12.9

File hashes

Hashes for gdml-0.1.0-py3-none-any.whl
Algorithm Hash digest
SHA256 bd73f28d5864f8842c9aea68f95f78e99b638a1861ce83717b183e51e083be2c
MD5 85c3634e0c323ea99cc2ebd3bd1f7981
BLAKE2b-256 f90bab158f177804bd431238c531b14e916f585258d912172871b2ad24a96175

See more details on using hashes here.

Supported by

AWS Cloud computing and Security Sponsor Datadog Monitoring Depot Continuous Integration Fastly CDN Google Download Analytics Pingdom Monitoring Sentry Error logging StatusPage Status page