
A program for selecting the n best features for your machine learning model.

Project description

Example of use

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split
from genetic_feature_selection.genetic_search import GeneticSearch
from genetic_feature_selection.f_score_generator import FScoreSoftmaxInitPop
import pandas as pd


X, y = make_classification(n_samples=1000, n_informative=20, n_redundant=0)
X = pd.DataFrame(X)
y = pd.Series(y, name="y", dtype=int)
# name the columns before splitting so X_train and X_test keep the new names
X.columns = [f"col_{i}" for i in X.columns]
X_train, X_test, y_train, y_test = train_test_split(X, y)


gsf = GeneticSearch(
    # the search will do 5 iterations
    iterations = 5,
    # each generation will have 4 possible solutions
    sol_per_pop = 4,
    # every iteration will go through 15 generations
    generations = 15,
    # in each generation the 4 best individuals will be kept
    keep_n_best_individuals = 4,
    # we want to find the 5 features that optimize the average precision score
    select_n_features = 5,
    # 4 parents will mate: the 4 best solutions in each generation are
    # combined to form the basis of the next generation
    num_parents_mating = 4,
    X_train = X_train,
    y_train = y_train,
    X_test = X_test,
    y_test = y_test,
    # classifier class and parameters used to evaluate each candidate feature set
    clf = LogisticRegression,
    clf_params = dict(max_iter=15),
    # score predicted probabilities rather than hard class labels
    probas = True,
    scorer = average_precision_score,
    # seed the initial population by sampling features with probabilities
    # from a softmax over their F-scores, with temperature tau
    gen_pop = FScoreSoftmaxInitPop(
        X_train, y_train, tau = 50
    )
)


best_cols = gsf.search()
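
The selected columns can then be used to fit a final model. A minimal sketch, assuming search() returns the names of the selected columns:

# refit on the selected subset and evaluate on the held-out split
# (assumes best_cols is a list of column names such as ["col_3", ...])
final_clf = LogisticRegression(max_iter=1000)
final_clf.fit(X_train[best_cols], y_train)
preds = final_clf.predict_proba(X_test[best_cols])[:, 1]
print(average_precision_score(y_test, preds))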

Example of use with F-score initialization and a custom fitness function

.......

class FitnessFunc:
    # the train/test splits from above, captured as class attributes
    X_train, X_test, y_train, y_test = X_train, X_test, y_train, y_test
    clf_params = {}
    clf = LogisticRegression

    def __call__(self, soln):
        # get_features selects the columns encoded by the candidate solution
        X_train_soln = get_features(self.X_train, soln)
        X_val_soln = get_features(self.X_test, soln)

        clf = self.clf(**self.clf_params)
        clf.fit(X_train_soln, self.y_train)

        preds = clf.predict_proba(X_val_soln)[:,1]

        return average_precision_score(self.y_test, preds)


fitness_func = FitnessFunc()
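
Because the fitness function is just a callable object, it carries the data and classifier configuration with it; GeneticSearch only needs something it can call with a candidate solution to get a score back.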

gsf = GeneticSearch(
    iterations = 10,
    sol_per_pop = 4,
    generations = 15,
    keep_n_best_individuals = 4,
    select_n_features = 5,
    num_parents_mating = 4,
    X_train = X_train,
    y_train = y_train,
    X_test = X_test,
    y_test = y_test,
    gen_pop = FScoreSoftmaxInitPop(
        X_train, y_train, tau = 50
    ),
    fitness_func=fitness_func
)

gsf.search()

Example of a custom fitness function where some features should be used regardless

.....
# separate the columns that must always be used from those the search selects among
keep_cols = ["col_0", "col_10"]
X_train_keep_cols = X_train[keep_cols]
X_train = X_train[[c for c in X_train.columns if c not in keep_cols]]
X_test_keep_cols = X_test[keep_cols]
X_test = X_test[[c for c in X_test.columns if c not in keep_cols]]

# setup_logger is imported with the setup elided above; to_file=True writes logs to a file
logger = setup_logger(to_file=True)


class FitnessFunc:
    def __init__(
        self,
        X_train, y_train,
        X_test, y_test,
        X_train_keep_cols,
        X_test_keep_cols,
    ) -> None:
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test
        self.X_train_keep_cols = X_train_keep_cols
        self.X_test_keep_cols = X_test_keep_cols
        self.clf_params = {}
        self.clf = LogisticRegression
        self.keep_cols = keep_cols

    def __call__(self, soln):
        X_train_soln = get_features(self.X_train, soln)
        X_val_soln = get_features(self.X_test, soln)
        # always add the kept columns back in before fitting
        X_train_soln = pd.concat([X_train_soln, self.X_train_keep_cols], axis=1)
        X_val_soln = pd.concat([X_val_soln, self.X_test_keep_cols], axis=1)
        clf = self.clf(**self.clf_params)
        clf.fit(X_train_soln, self.y_train)
        preds = clf.predict_proba(X_val_soln)[:,1]
        return average_precision_score(self.y_test, preds)
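
Wiring this up mirrors the previous example; a minimal sketch, reusing the GeneticSearch parameters from above:

fitness_func = FitnessFunc(
    X_train, y_train,
    X_test, y_test,
    X_train_keep_cols,
    X_test_keep_cols,
)

gsf = GeneticSearch(
    iterations = 10,
    sol_per_pop = 4,
    generations = 15,
    keep_n_best_individuals = 4,
    # 5 features are selected in addition to the always-kept columns
    select_n_features = 5,
    num_parents_mating = 4,
    X_train = X_train,
    y_train = y_train,
    X_test = X_test,
    y_test = y_test,
    gen_pop = FScoreSoftmaxInitPop(
        X_train, y_train, tau = 50
    ),
    fitness_func = fitness_func,
)

best_cols = gsf.search()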



Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

genetic_feature_selection-0.1.8.tar.gz (9.5 kB)

Built Distribution

genetic_feature_selection-0.1.8-py3-none-any.whl (14.5 kB)

File details

Details for the file genetic_feature_selection-0.1.8.tar.gz.

File metadata

File hashes

Hashes for genetic_feature_selection-0.1.8.tar.gz
Algorithm Hash digest
SHA256 3fb67a7f2af2c0f3aabe3f853c6e17c01efcbaf85851bbf3158c99a451a524ae
MD5 268e5045c659c093033284641c7d8a22
BLAKE2b-256 ba33839f4321f921c2d16ff1907730f29626579a9be1092d2fd1615ed601e911

See more details on using hashes here.
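
To check a downloaded file against a published digest, a minimal sketch in Python using the source distribution and its SHA256 value listed above:

import hashlib

# compute the SHA256 of the downloaded archive and compare it to the
# published digest
with open("genetic_feature_selection-0.1.8.tar.gz", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

expected = "3fb67a7f2af2c0f3aabe3f853c6e17c01efcbaf85851bbf3158c99a451a524ae"
assert digest == expected, "hash mismatch: file may be corrupted or tampered with"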

File details

Details for the file genetic_feature_selection-0.1.8-py3-none-any.whl.

File metadata

File hashes

Hashes for genetic_feature_selection-0.1.8-py3-none-any.whl
Algorithm Hash digest
SHA256 81482f91c1fa599632c9d9bba0530c8ab57d3394582af489cc0a46b41e517d30
MD5 063d22cfd1b21324fc92a0dd57fc2601
BLAKE2b-256 6eade844c19c21f36193a5220bab2a6fcf8837a97446c9efc1ef6c80dd43d49d

See more details on using hashes here.
