Feature engineering using Polars

PolarsFE is a Python package of helper functions for feature engineering, built on Polars. The code examples below should be enough to get you started.

Installation

pip install PolarsFE

or 

pip install git+https://github.com/AdrianAntico/PolarsFE.git#egg=PolarsFE

Feature Engineering Code Examples


Categorical

Dummy Variables

Click for code example
import polars as pl
from PolarsFE import character

df = pl.DataFrame({
    "Category": ["A", "B", "A", "C", "B", "C", "A", "B", "D"],
    "Color": ["Red", "Blue", "Green", "Red", "Green", "Blue", "Red", "Red", "Green"],
    "Value": [10, 20, 30, 40, 50, 60, 70, 80, 90]  # Numeric column (won't be affected)
})
        
# Create dummies for 'Category' and 'Color' and keep the original columns
df_dummies, levels_used = character.dummy_variables(
    df,
    columns=["Category", "Color"],
    levels=None,  # e.g. {"Category": ["A", "B"], "Color": ["Red", "Blue"]} to restrict levels
    top_n=None,   # e.g. {"Category": 2, "Color": 3} to keep only the most frequent levels
    keep_original=True,
    return_levels=True)

print(df_dummies)
print(levels_used)
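
The levels and top_n arguments (left as None above) can restrict which dummy columns are created, per the commented hints. A minimal sketch of the top_n form, reusing the same df (exact tie-breaking among equally frequent levels is up to the library):

# Keep dummies only for the 2 most frequent levels of each column.
df_top, levels_top = character.dummy_variables(
    df,
    columns=["Category", "Color"],
    levels=None,
    top_n={"Category": 2, "Color": 2},
    keep_original=False,
    return_levels=True)

print(df_top)
print(levels_top)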

Categorical Encoding

Click for code example
# ----------------------------------------------------------------------------------
# Classification target version
# ----------------------------------------------------------------------------------

import numpy as np
import polars as pl
from PolarsFE import character

# Set a seed for reproducibility
np.random.seed(42)

# Define parameters for fake data
num_rows = 1000
num_factors = 10
categories = ["A", "B", "C", "D", "E"]

# Build fake data for factors
fake_data = {f"Factor_{i}": np.random.choice(categories, size=num_rows)
             for i in range(1, num_factors + 1)}

# Create a binary target column ("Adrian")
fake_data["Adrian"] = np.random.binomial(1, 0.5, size=num_rows)

# Convert to a Polars DataFrame
df = pl.DataFrame(fake_data)

print("=== Fake Data Sample ===")
print(df.head())

# List of factor columns to encode
factor_columns = [f"Factor_{i}" for i in range(1, num_factors + 1)]

# --- Test Target Encoding ---
print("\n=== Testing Target Encoding ===")
result_target = character.categorical_encoding(
    data=df,
    ML_Type="classification",
    group_variables=factor_columns,
    target_variable="Adrian",
    method="target_encoding",
    save_path=None,          # No file saving for this test
    scoring=False,           # Training mode
    keep_original_factors=False,
    debug=True,              # Enable debug prints
)

if isinstance(result_target, dict):
    encoded_df_target = result_target["data"]
else:
    encoded_df_target = result_target

print("\n--- Target Encoding Result Sample ---")
print(encoded_df_target.head())

# --- Test James–Stein Encoding ---
print("\n=== Testing James–Stein Encoding ===")
result_js = character.categorical_encoding(
    data=df,
    ML_Type="classification",
    group_variables=factor_columns,
    target_variable="Adrian",
    method="james-stein",
    save_path=None,          # No file saving for this test
    scoring=False,           # Training mode
    keep_original_factors=True,
    debug=True,              # Enable debug prints
)

if isinstance(result_js, dict):
    encoded_df_js = result_js["data"]
else:
    encoded_df_js = result_js

print("\n--- James–Stein Encoding Result Sample ---")
print(encoded_df_js.head())


import numpy as np
import polars as pl
from PolarsFE import character

# ------------------------------------------------------------------------------
# Regression target version
# ------------------------------------------------------------------------------

# Create a fake regression dataset
np.random.seed(42)

num_rows = 1000
num_factors = 5
categories = ["A", "B", "C", "D", "E"]

# Build fake data for categorical factors
data_dict = {f"Factor_{i}": np.random.choice(categories, size=num_rows)
             for i in range(1, num_factors + 1)}

# Create a continuous target variable (e.g., normally distributed)
data_dict["target"] = np.random.normal(loc=50, scale=10, size=num_rows)

# Convert the dictionary into a Polars DataFrame
df_reg = pl.DataFrame(data_dict)

print("=== Regression Data Sample ===")
print(df_reg.head())

# List of factor columns to encode
factor_columns = [f"Factor_{i}" for i in range(1, num_factors + 1)]

# --- Test Target Encoding for Regression ---
print("\n=== Testing Target Encoding for Regression ===")
result_target_reg = character.categorical_encoding(
    data=df_reg,
    ML_Type="regression",
    group_variables=factor_columns,
    target_variable="target",
    method="target_encoding",
    save_path=None,          # Not saving to disk in this test
    scoring=False,           # Training mode
    keep_original_factors=True,
    debug=True,              # Enable debug prints
)

# If the function returns a dict (with mapping components), extract the data
if isinstance(result_target_reg, dict):
    encoded_df_target_reg = result_target_reg["data"]
else:
    encoded_df_target_reg = result_target_reg

print("\n--- Target Encoding (Regression) Result Sample ---")
print(encoded_df_target_reg.head())

# --- Test James–Stein Encoding for Regression ---
print("\n=== Testing James–Stein Encoding for Regression ===")
result_js_reg = character.categorical_encoding(
    data=df_reg,
    ML_Type="regression",
    group_variables=factor_columns,
    target_variable="target",
    method="james-stein",
    save_path=None,          # Not saving to disk in this test
    scoring=False,           # Training mode
    keep_original_factors=False,
    debug=True,              # Enable debug prints
)

if isinstance(result_js_reg, dict):
    encoded_df_js_reg = result_js_reg["data"]
else:
    encoded_df_js_reg = result_js_reg

print("\n--- James–Stein Encoding (Regression) Result Sample ---")
print(encoded_df_js_reg.head())


import numpy as np
import polars as pl
from PolarsFE import character

# ------------------------------------------------------------------------------
# MultiClass target version
# ------------------------------------------------------------------------------

# Create a fake multiclass dataset
np.random.seed(42)

num_rows = 1000
num_factors = 5
# For our categorical factors, use 5 possible levels.
factor_categories = ["A", "B", "C", "D", "E"]

# Build fake data for factors
data_dict = {f"Factor_{i}": np.random.choice(factor_categories, size=num_rows)
             for i in range(1, num_factors + 1)}

# Create a categorical target variable with more than 2 levels.
target_categories = ["class1", "class2", "class3"]
# Optionally, you can set probabilities for each class.
data_dict["target_class"] = np.random.choice(target_categories, size=num_rows, p=[0.3, 0.4, 0.3])

# Convert the dictionary into a Polars DataFrame
df_multi = pl.DataFrame(data_dict)

print("=== Multiclass Data Sample ===")
print(df_multi.head())

# List of factor columns to encode
factor_columns = [f"Factor_{i}" for i in range(1, num_factors + 1)]

# --- Test Target Encoding for Multiclass ---
print("\n=== Testing Target Encoding for Multiclass ===")
result_target_multi = character.categorical_encoding(
    data=df_multi,
    ML_Type="multiclass",
    group_variables=factor_columns,
    target_variable="target_class",
    method="target_encoding",
    save_path=None,          # Not saving to disk in this test
    scoring=False,           # Training mode
    keep_original_factors=False,
    debug=True,              # Enable debug prints
)

# If the function returns a dict (with mapping components), extract the data.
if isinstance(result_target_multi, dict):
    encoded_df_target_multi = result_target_multi["data"]
else:
    encoded_df_target_multi = result_target_multi

print("\n--- Target Encoding (Multiclass) Result Sample ---")
print(encoded_df_target_multi.head())

# --- Test James–Stein Encoding for Multiclass ---
print("\n=== Testing James–Stein Encoding for Multiclass ===")
result_js_multi = character.categorical_encoding(
    data=df_multi,
    ML_Type="multiclass",
    group_variables=factor_columns,
    target_variable="target_class",
    method="james-stein",
    save_path=None,          # Not saving to disk in this test
    scoring=False,           # Training mode
    keep_original_factors=False,
    debug=True,              # Enable debug prints
)

if isinstance(result_js_multi, dict):
    encoded_df_js_multi = result_js_multi["data"]
else:
    encoded_df_js_multi = result_js_multi

print("\n--- James–Stein Encoding (Multiclass) Result Sample ---")
print(encoded_df_js_multi.head())
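
Conceptually, both methods replace a category level with a statistic of the target: target encoding uses the level's mean target, while James–Stein shrinks that mean toward the global mean, shrinking more for rare levels. A plain-Polars sketch of the shrinkage idea (not PolarsFE's exact implementation; k is a hypothetical smoothing strength):

import numpy as np
import polars as pl

np.random.seed(42)
toy = pl.DataFrame({
    "Factor_1": np.random.choice(["A", "B", "C"], size=100),
    "Adrian": np.random.binomial(1, 0.5, size=100),
})

global_mean = toy["Adrian"].mean()
k = 20.0  # higher k -> more shrinkage toward the global mean

encoded = toy.with_columns(
    ((pl.col("Adrian").mean().over("Factor_1") * pl.len().over("Factor_1")
      + global_mean * k)
     / (pl.len().over("Factor_1") + k)).alias("Factor_1_encoded")
)
print(encoded.head())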

Numeric

Standardization

Click for code example
import numpy as np
import polars as pl
from PolarsFE import numeric

# Set seed for reproducibility
np.random.seed(42)
n = 100

# Create a fake dataset with a grouping variable "Group" and two numeric variables.
groups = np.random.choice(["A", "B", "C"], size=n)
# Generate Value1 with different means per group.
value1 = np.where(groups == "A", np.random.normal(50, 5, size=n),
                  np.where(groups == "B", np.random.normal(60, 5, size=n),
                           np.random.normal(70, 5, size=n)))
# Generate Value2 as a normally distributed variable.
value2 = np.random.normal(100, 10, size=n)

df = pl.DataFrame({
    "Group": groups,
    "Value1": value1,
    "Value2": value2
})

print("=== Original Dataset ===")
print(df.head())

# -------------------------------
# TRAINING MODE: Compute standardization parameters by Group
# -------------------------------
# This call computes group-wise means and standard deviations for Value1 and Value2,
# creates standardized columns, and returns a score table.
transformed_train, score_tbl = numeric.standardize(
    data=df,
    col_names=["Value1", "Value2"],
    group_vars=["Group"],
    center=True,
    scale=True,
    score_table=True,
    mode="train",
    debug=True
)

print("\n=== Transformed Training Data ===")
print(transformed_train.head())

print("\n=== Score Table (Group-wise Means and SDs) ===")
print(score_tbl)

# -------------------------------
# APPLICATION MODE: Apply standardization to new data using the score table
# -------------------------------
# Here we simulate new data by cloning the original dataset.
# The new data does not have the standardized columns.
new_data = df.clone()

transformed_apply = numeric.standardize(
    data=new_data,
    col_names=["Value1", "Value2"],
    group_vars=["Group"],
    center=True,
    scale=True,
    mode="apply",
    score_table_data=score_tbl,
    debug=True
)

print("\n=== Transformed New Data (Standardized) ===")
print(transformed_apply.head())

# -------------------------------
# BACKTRANSFORMATION MODE: Reverse the standardization on the new data
# -------------------------------
# This reverses the standardized values back to their original scale.
backtransformed = numeric.standardize(
    data=transformed_apply,
    col_names=["Value1", "Value2"],
    group_vars=["Group"],
    center=True,
    scale=True,
    mode="backtransform",
    score_table_data=score_tbl,
    debug=True
)

print("\n=== Backtransformed Data (Reversed Standardization) ===")
print(backtransformed.head())
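
For reference, the train-mode computation above is the usual group-wise z-score. The same idea in plain Polars (output column names here are illustrative, not necessarily what standardize produces):

manual = df.with_columns([
    ((pl.col(c) - pl.col(c).mean().over("Group")) / pl.col(c).std().over("Group"))
    .alias(f"{c}_standardized")
    for c in ["Value1", "Value2"]
])
print(manual.head())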

Percent Rank

Click for code example
import numpy as np
import polars as pl
from PolarsFE import numeric

# Set seed for reproducibility
np.random.seed(42)
n = 100

# Create a fake dataset with:
# - A grouping variable "Group" (levels: "A", "B", "C")
# - Two numeric columns "Value1" and "Value2"
groups = np.random.choice(["A", "B", "C"], size=n)
value1 = np.random.normal(50, 10, size=n)
value2 = np.random.normal(100, 20, size=n)

df = pl.DataFrame({
    "Group": groups,
    "Value1": value1,
    "Value2": value2
})

print("=== Original Training Data ===")
print(df.head())

# --------------
# TRAINING MODE: Compute percent ranks by Group for Value1 and Value2.
# --------------
transformed_train, score_tbl = numeric.percent_rank(
    data=df,
    col_names=["Value1", "Value2"],
    group_vars=["Group"],
    granularity=0.001,
    mode="train",
    score_table=True,
    debug=True
)

print("\n=== Transformed Training Data with Percent Ranks ===")
print(transformed_train.head())

print("\n=== Score Table ===")
print(score_tbl)

# --------------
# APPLY MODE: Use the score table to assign percent ranks to new data.
# --------------
# Simulate new data.
new_groups = np.random.choice(["A", "B", "C"], size=n)
new_value1 = np.random.normal(50, 10, size=n)
new_value2 = np.random.normal(100, 20, size=n)
new_df = pl.DataFrame({
    "Group": new_groups,
    "Value1": new_value1,
    "Value2": new_value2
})

print("\n=== Original New Data ===")
print(new_df.head())

transformed_new = numeric.percent_rank(
    data=new_df,
    col_names=["Value1", "Value2"],
    group_vars=["Group"],
    granularity=0.001,
    mode="apply",
    score_table_data=score_tbl,
    roll_direction="nearest",
    debug=True
)

print("\n=== Transformed New Data with Percent Ranks (Applied) ===")
print(transformed_new.head())

# --------------
# BACKTRANSFORM MODE: Reverse the percent rank transformation to recover original values.
# --------------
# For demonstration, use the new data with percent rank columns (from the apply mode).
backtransformed = numeric.percent_rank(
    data=transformed_new,
    col_names=["Value1", "Value2"],
    group_vars=["Group"],
    granularity=0.001,
    mode="backtransform",
    score_table_data=score_tbl,
    roll_direction="nearest",
    debug=True
)

print("\n=== Backtransformed Data (Recovered Original Values) ===")
print(backtransformed.head())
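
The percent rank itself is just a value's rank divided by its group size. A plain-Polars sketch (column names illustrative; PolarsFE additionally builds the score table so ranks can be applied to, and reversed on, new data):

manual_pr = df.with_columns([
    (pl.col(c).rank(method="average").over("Group") / pl.len().over("Group"))
    .alias(f"{c}_pct_rank")
    for c in ["Value1", "Value2"]
])
print(manual_pr.head())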

Numeric Transformations

Click for code example
import numpy as np
import polars as pl
from PolarsFE import numeric

# Create a fake dataset.
np.random.seed(42)
n = 10
df = pl.DataFrame({
    "Positive": np.random.uniform(5, 100, size=n),    # for BoxCox, Log, LogPlusA, Sqrt
    "AnyValue": np.random.uniform(-50, 50, size=n),     # for YeoJohnson
    "Angle": np.random.uniform(-1, 1, size=n),          # for Asin (input should be in [-1,1])
    "Probability": np.random.uniform(0.01, 0.99, size=n)  # for Logit (values in (0,1))
})

print("=== Original Data ===")
print(df)

# --------------------------
# Log Transformation
# --------------------------
df_log = numeric.numeric_transform(
    df,
    col_names=["Positive"],
    transformation="Log",
    mode="apply", debug=True
)
print("\n=== Log Applied ===")
print(df_log.select(["Positive", "Positive_log"]))

df_log_back = numeric.numeric_transform(
    df_log,
    col_names=["Positive_log"],
    transformation="Log",
    mode="backtransform", debug=True
)
print("\n=== Log Backtransformed ===")
print(df_log_back.select(["Positive_log", "Positive_log_back"]))

# --------------------------
# LogPlusA Transformation
# --------------------------
df_logplusa = numeric.numeric_transform(
    df,
    col_names=["Positive"],
    transformation="LogPlusA",
    mode="apply",
    A=None,
    debug=True
)
print("\n=== LogPlusA Applied ===")
print(df_logplusa.select(["Positive", "Positive_logplusa"]))

# For backtransformation, you must supply the same A. Compute it from the original column.
min_val = df.select(pl.col("Positive")).min().item()
A_val = max(1, 1 - min_val)
df_logplusa_back = numeric.numeric_transform(
    df_logplusa,
    col_names=["Positive_logplusa"],
    transformation="LogPlusA",
    mode="backtransform",
    A=A_val,
    debug=True
)
print("\n=== LogPlusA Backtransformed ===")
print(df_logplusa_back.select(["Positive_logplusa", "Positive_logplusa_back"]))

# --------------------------
# Sqrt Transformation
# --------------------------
df_sqrt = numeric.numeric_transform(
    df,
    col_names=["Positive"],
    transformation="Sqrt",
    mode="apply",
    debug=True
)
print("\n=== Sqrt Applied ===")
print(df_sqrt.select(["Positive", "Positive_sqrt"]))

df_sqrt_back = numeric.numeric_transform(
    df_sqrt,
    col_names=["Positive_sqrt"],
    transformation="Sqrt",
    mode="backtransform",
    debug=True
)
print("\n=== Sqrt Backtransformed ===")
print(df_sqrt_back.select(["Positive_sqrt", "Positive_sqrt_back"]))

# --------------------------
# Asin Transformation
# --------------------------
df_asin = numeric.numeric_transform(
    df,
    col_names=["Angle"],
    transformation="Asin",
    mode="apply",
    debug=True
)
print("\n=== Asin Applied ===")
print(df_asin.select(["Angle", "Angle_asin"]))

df_asin_back = numeric.numeric_transform(
    df_asin,
    col_names=["Angle"],
    transformation="Asin",
    mode="backtransform",
    debug=True
)
print("\n=== Asin Backtransformed ===")
print(df_asin_back.select(["Angle_asin", "Angle_asin_back"]))

# --------------------------
# Logit Transformation
# --------------------------
df_logit = numeric.numeric_transform(
    df,
    col_names=["Probability"],
    transformation="Logit",
    mode="apply",
    debug=True
)
print("\n=== Logit Applied ===")
print(df_logit.select(["Probability", "Probability_logit"]))

df_logit_back = numeric.numeric_transform(
    df_logit,
    col_names=["Probability"],
    transformation="Logit",
    mode="backtransform",
    debug=True
)
print("\n=== Logit Backtransformed ===")
print(df_logit_back.select(["Probability_logit", "Probability_logit_back"]))
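
For reference, Logit maps a probability p in (0, 1) to ln(p / (1 - p)), and the backtransform is the sigmoid. A quick plain-Polars check of that round trip (the manual_* names are illustrative):

check = df.with_columns(
    (pl.col("Probability") / (1 - pl.col("Probability"))).log().alias("manual_logit")
).with_columns(
    (pl.lit(1.0) / (1.0 + (-pl.col("manual_logit")).exp())).alias("manual_back")
)
print(check.select(["Probability", "manual_logit", "manual_back"]))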

Calendar

Calendar Features

Click for code example
import polars as pl
from PolarsFE import calendar
import datetime

# Create a fake dataset with a datetime column.
dates = [datetime.datetime(2023, 1, 1) + datetime.timedelta(days=i) for i in range(10)]
df = pl.DataFrame({
    "date": dates,
    "value": [i * 10 for i in range(10)]
})

print("=== Original DataFrame ===")
print(df)

# Test 1: Only extract 'year', 'month', and 'day'.
df_partial = calendar.calendar_features(df, "date", features=["year", "month", "day"])
print("\n=== DataFrame with 'year', 'month', and 'day' Only ===")
print(df_partial)

Cyclic Features

Click for code example
import polars as pl
from PolarsFE import calendar  # Assumes both functions are in the "calendar" module
import datetime

# Create a sample DataFrame with a date column.
df = pl.DataFrame({
    "date": [
        datetime.date(2023, 1, 1),
        datetime.date(2023, 1, 2),
        datetime.date(2023, 1, 3),
        datetime.date(2023, 1, 4)
    ],
    "value": [10, 20, 30, 40]
})

print("=== Original DataFrame ===")
print(df)

# Step 1: Compute calendar features.
# For example, here we extract "day_of_week" and "month". The resulting columns will be named
# "date_day_of_week" and "date_month".
df_cal = calendar.calendar_features(data=df, date_col="date", features=["day_of_week", "month"])
print("\n=== DataFrame with Calendar Features ===")
print(df_cal)

# Step 2: Transform the cyclic features.
# Now, use the cyclic_features function to transform "date_day_of_week" and "date_month" into sine and cosine components.
df_cyclic = calendar.cyclic_features(
    data=df_cal,
    date_col="date",
    columns=["date_day_of_week", "date_month"],
    drop_original=False
)
print("\n=== DataFrame with Transformed Cyclic Features ===")
print(df_cyclic)

# Optionally, if you wish to drop the original cyclic feature columns after transformation:
df_cyclic_drop = calendar.cyclic_features(
    data=df_cal,
    date_col="date",
    columns=["date_day_of_week", "date_month"],
    drop_original=True
)
print("\n=== DataFrame with Transformed Cyclic Features (Originals Dropped) ===")
print(df_cyclic_drop)
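
Under the hood, a cyclic encoding maps a feature x with period P to sin(2πx/P) and cos(2πx/P), so values that wrap around (e.g. Sunday and Monday, December and January) land close together. A plain-Polars sketch for day-of-week with period 7 (output names illustrative):

import math

manual_cyc = df_cal.with_columns([
    (2 * math.pi * pl.col("date_day_of_week") / 7).sin().alias("dow_sin"),
    (2 * math.pi * pl.col("date_day_of_week") / 7).cos().alias("dow_cos"),
])
print(manual_cyc)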

Holiday Features

Click for code example
import polars as pl
from PolarsFE import calendar
import datetime

# Option 1: Use dynamic holiday generation for the US.
df = pl.DataFrame({
    "date": [datetime.date(2023, 1, 1) + datetime.timedelta(days=i) for i in range(-3, 5)]
})
df_holidays = calendar.holiday_features(df, date_col="date", pre_window=2, post_window=2, add_holiday_name=True)
print("=== DataFrame with Holiday Features (Dynamic Holidays) ===")
print(df_holidays)

# Option 2: Supply specific years.
df2 = pl.DataFrame({
    "date": [datetime.date(2022, 12, 30) + datetime.timedelta(days=i) for i in range(10)]
})
df2_holidays = calendar.holiday_features(df2, date_col="date", country="US", years=[2022, 2023], pre_window=1, post_window=1)
print("\n=== DataFrame with Holiday Features (Supplied Years) ===")
print(df2_holidays)

Window

Lags

Click for code example
import polars as pl
from PolarsFE import window

# Create a sample DataFrame.
df = pl.DataFrame({
    "date": ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05", "2023-01-06"],
    "sales": [100, 150, 200, 250, 300, 350],
    "store": ["A", "A", "B", "B", "A", "B"]
})

print("=== Original DataFrame ===")
print(df)

# Create lag features for "sales" with lags 1 and 2.
# Compute lags within each store.
df_lags = window.lags(df, date_col="date", columns=["sales"], lags=[1,2], group_vars=["store"], fill_value=0)

print("\n=== DataFrame with Lag Features ===")
print(df_lags)
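
In plain Polars, a grouped lag is a shift within a window; a sketch equivalent in spirit to the call above (sorting by date first, since lags require a deterministic row order; output names illustrative):

manual_lags = df.sort("date").with_columns([
    pl.col("sales").shift(1).over("store").fill_null(0).alias("sales_lag_1"),
    pl.col("sales").shift(2).over("store").fill_null(0).alias("sales_lag_2"),
])
print(manual_lags)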

Rolling Features

Click for code example
import polars as pl
from PolarsFE import window
      
# Create a sample DataFrame.
df = pl.DataFrame({
    "date": ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05", "2023-01-06"],
    "sales": [100, 150, 200, 250, 300, 350],
    "store": ["A", "A", "B", "B", "A", "B"]
})

print("=== Original DataFrame ===")
print(df)

# Example 1: Compute rolling mean for "sales" with window sizes 2 and 3, grouped by "store".
df_roll_mean = window.rolling_features(
    data=df,
    date_col="date",
    columns=["sales"],
    window=[2, 3],
    agg="mean",
    group_vars=["store"],
    fill_value=0,
    is_sorted=False,
    min_samples=1,
    center=False
)
print("\n=== DataFrame with Rolling Mean Features ===")
print(df_roll_mean)

# Example 2: Compute rolling standard deviation for "sales" with window size 3, grouped by "store".
df_roll_std = window.rolling_features(
    data=df,
    date_col="date",
    columns=["sales"],
    window=3,
    agg="std",
    group_vars=["store"],
    fill_value=0,
    is_sorted=False,
    min_samples=1,
    center=False
)
print("\n=== DataFrame with Rolling Standard Deviation Features ===")
print(df_roll_std)
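
The same grouped rolling mean can be written directly in recent Polars versions (min_samples mirrors the argument above; output name illustrative):

manual_roll = df.sort("date").with_columns(
    pl.col("sales")
    .rolling_mean(window_size=2, min_samples=1)
    .over("store")
    .alias("sales_roll_mean_2")
)
print(manual_roll)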

Differencing

Click for code example
import polars as pl
from PolarsFE import window
      
# Create a sample DataFrame.
df = pl.DataFrame({
    "date": ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05"],
    "sales": [100, 150, 200, 250, 300],
    "store": ["A", "A", "B", "B", "A"]
})

print("=== Original DataFrame ===")
print(df)

# Example 1: Compute difference with a single integer (lag=1) for "sales" within each store.
df_diff1 = window.differences(
    data=df,
    date_col="date",
    columns=["sales"],
    diffs=1,
    group_vars=["store"],
    fill_value=0,
    is_sorted=False
)
print("\n=== DataFrame with Sales Difference (lag=1) ===")
print(df_diff1)

# Example 2: Compute difference using a tuple (e.g., difference between lag 1 and lag 2) for "sales" within each store.
df_diff2 = window.differences(
    data=df,
    date_col="date",
    columns=["sales"],
    diffs=(1, 2),
    group_vars=["store"],
    fill_value=0,
    is_sorted=False
)
print("\n=== DataFrame with Sales Difference (lag 1 - lag 2) ===")
print(df_diff2)

# Example 3: Multiple difference specifications (lag 1 and tuple (1,2)) for "sales".
df_diff3 = window.differences(
    data=df,
    date_col="date",
    columns=["sales"],
    diffs=[1, (1, 2)],
    group_vars=["store"],
    fill_value=0,
    is_sorted=False
)
print("\n=== DataFrame with Multiple Sales Differences ===")
print(df_diff3)
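
In plain Polars, a lag-1 difference is diff(1) within the group, and the tuple form corresponds to subtracting two shifted copies; a sketch (output names illustrative):

manual_diff = df.sort("date").with_columns([
    pl.col("sales").diff(1).over("store").fill_null(0).alias("sales_diff_1"),
    (pl.col("sales").shift(1) - pl.col("sales").shift(2)).over("store")
    .fill_null(0).alias("sales_diff_1_2"),
])
print(manual_diff)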

Datasets

Partition Random

Click for code example
import numpy as np
import polars as pl
from PolarsFE import datasets

df = pl.DataFrame({
    "id": np.arange(1, 101),
    "value": np.random.rand(100) * 100,
    "category": np.random.choice(["A", "B", "C"], size=100)
})
print("=== Original Dataset ===")
print(df.head(10))
print(f"Total rows: {df.height}\\n")

# Partition into 3 equally sized parts with seed=42.
parts_equal = datasets.partition_random(
    data=df,
    num_partitions=3,
    seed=42
)
for idx, part in enumerate(parts_equal, start=1):
    print(f"--- Equal Partition {idx} (rows: {part.height}) ---")
    print(part)
    print()

# Partition into 3 parts using percentages (30%, 30%, 40%).
parts_pct = datasets.partition_random(
    data=df,
    num_partitions=3,
    seed=42,
    percentages=[0.3, 0.3, 0.4]
)
for idx, part in enumerate(parts_pct, start=1):
    print(f"--- Percentage Partition {idx} (rows: {part.height}) ---")
    print(part)
print()
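
For intuition, a seeded random partition is just a shuffle followed by slicing; a minimal plain-Polars sketch of the equal-split case (how remainder rows are assigned is up to the library):

shuffled = df.sample(fraction=1.0, shuffle=True, seed=42)
size = shuffled.height // 3
manual_parts = [shuffled.slice(i * size, size) for i in range(3)]
print([p.height for p in manual_parts])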

Partition Time

Click for code example
import numpy as np
import polars as pl
from PolarsFE import datasets
import datetime

# Create a DataFrame with dates spanning 100 days.
dates = [datetime.datetime(2023, 1, 1) + datetime.timedelta(days=i) for i in range(100)]
df = pl.DataFrame({
    "date": dates,
    "value": np.random.rand(100)
})

# Partition into 4 equal time intervals.
parts_equal = datasets.partition_time(
    df,
    time_col="date",
    num_partitions=4
)
for idx, part in enumerate(parts_equal, start=1):
    print(f"--- Equal Partition {idx} (rows: {part.height}) ---")
    print(part)

# Partition into 4 parts using percentages: 10%, 20%, 30%, 40%.
parts_pct = datasets.partition_time(
    df,
    time_col="date",
    num_partitions=4,
    percentages=[0.1, 0.2, 0.3, 0.4]
)
for idx, part in enumerate(parts_pct, start=1):
    print(f"--- Percentage Partition {idx} (rows: {part.height}) ---")
    print(part)

Partition Timeseries

Click for code example
import numpy as np
import polars as pl
from PolarsFE import datasets
import datetime

# Create a fake dataset with dates spanning 100 days and a panel column.
dates = [datetime.datetime(2023, 1, 1) + datetime.timedelta(days=i) for i in range(100)]
df = pl.DataFrame({
    "date": dates,
    "value": np.random.rand(100) * 100,
    "panel": np.random.choice(["A", "B", "C"], size=100)
})

print("=== Original Dataset (first 10 rows) ===")
print(df.head(10))
print(f"Total rows: {df.height}\n")

# --- Test 1: Equal-Time Partitions ---
print("=== Equal-Time Partitions ===")
parts_equal = datasets.partition_timeseries(
    df,
    time_col="date",
    panel_vars=["panel"],
    num_partitions=4
)
for idx, part in enumerate(parts_equal, start=1):
    print(f"--- Equal Partition {idx} (rows: {part.height}) ---")
    print(part)
    print()

# --- Test 2: Percentage-Based Partitions ---
# For example, partition into 4 parts using percentages [0.1, 0.2, 0.3, 0.4].
print("=== Percentage-Based Partitions ===")
parts_pct = datasets.partition_timeseries(
    df,
    time_col="date",
    panel_vars=["panel"],
    num_partitions=4,
    percentages=[0.1, 0.2, 0.3, 0.4]
)
for idx, part in enumerate(parts_pct, start=1):
    print(f"--- Percentage Partition {idx} (rows: {part.height}) ---")
    print(part)
    print()

Stratified Sample

Click for code example
import numpy as np
import polars as pl
from PolarsFE import datasets
import datetime

# Create a fake dataset with a datetime column and a stratification (panel) column.
dates = [datetime.datetime(2023, 1, 1) + datetime.timedelta(days=i) for i in range(100)]
df = pl.DataFrame({
    "date": dates,
    "value": np.random.rand(100) * 100,
    "panel": np.random.choice(["A", "B", "C"], size=100)
})

print("=== Original Dataset (first 10 rows) ===")
print(df.head(10))
print(f"Total rows: {df.height}\n")

# Test 1: Stratified sampling on a single column ("panel").
sample_df = datasets.stratified_sample(df, stratify_by="panel", frac=0.2)
print("=== Stratified Sample (20% from each panel) ===")
print(sample_df)
print(f"Sample rows: {sample_df.height}\n")

# Test 2: Stratified sampling on multiple columns.
# Create a dataset with two stratification variables.
df2 = pl.DataFrame({
    "id": np.arange(1, 201),
    "group": np.random.choice(["A", "B"], size=200),
    "region": np.random.choice(["North", "South"], size=200),
    "value": np.random.rand(200) * 100
})
print("=== Original Dataset with Multiple Stratification Columns ===")
print(df2.head(10))
print(f"Total rows: {df2.height}\n")

sample_df2 = datasets.stratified_sample(df2, stratify_by=["group", "region"], frac=0.15)
print("=== Stratified Sample with 'group' and 'region' (15% from each stratum) ===")
print(sample_df2)
print(f"Sample rows: {sample_df2.height}")

Impute Missing Values

Click for code example
import polars as pl
from PolarsFE import datasets

df = pl.DataFrame({
    "A": [1, None, 3, None],
    "B": [None, 2, None, 4],
    "C": ["x", None, "y", "z"],
    "group": ["G1", "G1", "G2", "G2"]
})

# Constant imputation for columns A and B with 0.
imputed_const = datasets.impute_missing(
    df,
    method="constant",
    value=0,
    columns=["A", "B"]
)

# Global mean imputation for numeric columns A and B.
imputed_mean = datasets.impute_missing(
    df,
    method="mean",
    columns=["A", "B"]
)

# Group-based median imputation for columns A and B.
imputed_median_group = datasets.impute_missing(
    df,
    method="median",
    columns=["A", "B"],
    group_vars=["group"]
)

# Forward-fill imputation globally.
imputed_ffill = datasets.impute_missing(
    df,
    method="ffill"
)

# Group-based forward-fill imputation for all columns.
imputed_ffill_group = datasets.impute_missing(
    df,
    method="ffill",
    group_vars=["group"]
)
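
The group-based variants map onto fill_null with window expressions in plain Polars; a sketch of the group-median case for column A:

manual_impute = df.with_columns(
    pl.col("A").fill_null(pl.col("A").median().over("group"))
)
print(manual_impute)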
