
Polars for Data Science

Discord | Documentation | User Guide | Want to Contribute?
pip install polars-ds

PDS (polars_ds)

PDS is a modern data science package that

  1. is fast and furious
  2. is small and lean, with minimal dependencies
  3. has an intuitive and concise API (if you know Polars already)
  4. has a dataframe-friendly design
  5. covers a wide variety of data science topics: simple statistics, linear regression, string edit distances, tabular data transforms, feature extraction, traditional modelling pipelines, model evaluation metrics, and more

It stands on the shoulders of the great Polars dataframe. You can see more examples in the project's example notebooks. Here are some highlights!
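
The snippets on this page operate on a Polars dataframe df. As a minimal, hypothetical setup for this first example (built with the pds.random_* helpers that appear later on this page; the column names are simply what the snippet expects):

import polars as pl
import polars_ds as pds

# Hypothetical toy data: binary labels, random scores, and an "a"/"b" segment.
df = pds.random_data(size=100_000, n_cols=0).select(
    pds.random_int(0, 2).alias("actual"),
    pds.random(0.0, 1.0).alias("predicted"),
    pds.random_int(0, 2).cast(pl.String).replace({"0": "a", "1": "b"}).alias("segments"),
)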

import polars as pl
import polars_ds as pds
# Parallel evaluation of multiple ML metrics on different segments of data
df.lazy().group_by("segments").agg( 
    pds.query_roc_auc("actual", "predicted").alias("roc_auc"),
    pds.query_log_loss("actual", "predicted").alias("log_loss"),
).collect()

shape: (2, 3)
┌──────────┬──────────┬──────────┐
│ segments │ roc_auc  │ log_loss │
│ ---      │ ---      │ ---      │
│ str      │ f64      │ f64      │
╞══════════╪══════════╪══════════╡
│ a        │ 0.497745 │ 1.006438 │
│ b        │ 0.498801 │ 0.997226 │
└──────────┴──────────┴──────────┘

Tabular Machine Learning Data Transformation Pipeline

import polars as pl
import polars.selectors as cs
from polars_ds.pipeline import Pipeline, Blueprint

bp = (
    # If we specify a target, it will be excluded from all transformations.
    Blueprint(df, name = "example", target = "approved") 
    .lowercase() # lowercase all columns
    .select(cs.numeric() | cs.by_name(["gender", "employer_category1", "city_category"]))
    .linear_impute(features = ["var1", "existing_emi"], target = "loan_period") 
    .impute(["existing_emi"], method = "median")
    .append_expr( # generate some features
        pl.col("existing_emi").log1p().alias("existing_emi_log1p"),
        pl.col("loan_amount").log1p().alias("loan_amount_log1p"),
        pl.col("loan_amount").sqrt().alias("loan_amount_sqrt"),
        pl.col("loan_amount").shift(-1).alias("loan_amount_lag_1") # any kind of lag transform
    )
    .scale( 
        cs.numeric().exclude(["var1", "existing_emi_log1p"]), method = "standard"
    ) # Scale the columns up to this point. The columns below won't be scaled
    .append_expr( # Add missing flags
        pl.col("employer_category1").is_null().cast(pl.UInt8).alias("employer_category1_is_missing")
    )
    .one_hot_encode("gender", drop_first=True)
    .woe_encode("city_category")
    .target_encode("employer_category1", min_samples_leaf = 20, smoothing = 10.0) # another target-based categorical encoding
)

pipe: Pipeline = bp.materialize()
# Check out the result in our example notebooks! (examples/pipeline.ipynb)
df_transformed = pipe.transform(df)
df_transformed.head()
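
For intuition, the median-impute step above boils down to plain Polars operations like the following (a sketch of the idea, not the library's internals; the pipeline learns the statistic once at materialize() time and reuses it on every frame passed to transform()):

import polars as pl

# Sketch: learn the median once at materialize() time...
median_emi = df.select(pl.col("existing_emi").median()).item()
# ...then fill nulls with it on any incoming frame at transform() time.
df_filled = df.with_columns(pl.col("existing_emi").fill_null(median_emi))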

Get all neighbors within radius r, call them best friends, and count them
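
A hypothetical input frame for this query, built with the pds.random helpers (the id / var1 / var2 / var3 names are simply what the snippet below expects):

import polars as pl
import polars_ds as pds

# Hypothetical 3D point cloud with a u32 row id, as required by `index`.
df = pds.random_data(size=2_000, n_cols=0).select(
    pds.random(0.0, 1.0).alias("var1"),
    pds.random(0.0, 1.0).alias("var2"),
    pds.random(0.0, 1.0).alias("var3"),
).with_row_index("id")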

df.select(
    pl.col("id"),
    pds.query_radius_ptwise(
        pl.col("var1"), pl.col("var2"), pl.col("var3"), # Columns used as the coordinates in 3d space
        index = pl.col("id"),
        r = 0.1, 
        dist = "sql2", # squared l2
        parallel = True
    ).alias("best friends"),
).with_columns( # -1 to remove the point itself
    (pl.col("best friends").list.len() - 1).alias("best friends count")
).head()

shape: (5, 3)
┌─────┬───────────────────┬────────────────────┐
│ id  │ best friends      │ best friends count │
│ --- │ ---               │ ---                │
│ u32 │ list[u32]         │ u32                │
╞═════╪═══════════════════╪════════════════════╡
│ 0   │ [0, 811, … 1435]  │ 152                │
│ 1   │ [1, 953, … 1723]  │ 159                │
│ 2   │ [2, 355, … 835]   │ 243                │
│ 3   │ [3, 102, … 1129]  │ 110                │
│ 4   │ [4, 1280, … 1543] │ 226                │
└─────┴───────────────────┴────────────────────┘

Run a linear regression on each category:

df = pds.random_data(size=5_000, n_cols=0).select(
    pds.random(0.0, 1.0).alias("x1"),
    pds.random(0.0, 1.0).alias("x2"),
    pds.random(0.0, 1.0).alias("x3"),
    pds.random_int(0, 3).alias("categories")
).with_columns(
    y = pl.col("x1") * 0.5 + pl.col("x2") * 0.25 - pl.col("x3") * 0.15 + pds.random() * 0.0001
)

df.group_by("categories").agg(
    pds.query_lstsq(
        "x1", "x2", "x3", 
        target = "y",
        method = "l2",
        l2_reg = 0.05,
        add_bias = False
    ).alias("coeffs")
) 

shape: (3, 2)
┌────────────┬─────────────────────────────────┐
│ categories │ coeffs                          │
│ ---        │ ---                             │
│ i32        │ list[f64]                       │
╞════════════╪═════════════════════════════════╡
│ 0          │ [0.499912, 0.250005, -0.149846] │
│ 1          │ [0.499922, 0.250004, -0.149856] │
│ 2          │ [0.499923, 0.250004, -0.149855] │
└────────────┴─────────────────────────────────┘
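
For reference, method = "l2" fits ridge regression. With add_bias = False there is no intercept term, and the coefficients have the standard closed form

$$\hat{\beta} = (X^\top X + \lambda I)^{-1} X^\top y, \qquad \lambda = \texttt{l2\_reg} = 0.05,$$

which is why the recovered coefficients sit close to the (0.5, 0.25, -0.15) used to generate y: the noise term is tiny.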

Various String Edit Distances
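
A hypothetical input for the snippet below (any string column works; return_sim = True asks for a normalized similarity score instead of the raw distance):

import polars as pl

# Hypothetical words to compare against the literals below.
df = pl.DataFrame({"word": ["apple", "apples", "banana"]})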

df.select( # compares column "word" against the string in pl.lit(); column-vs-column comparison is also supported
    pds.str_leven("word", pl.lit("asasasa"), return_sim=True).alias("Levenshtein"),
    pds.str_osa("word", pl.lit("apples"), return_sim=True).alias("Optimal String Alignment"),
    pds.str_jw("word", pl.lit("apples")).alias("Jaro-Winkler"),
)

In-dataframe statistical tests

df.group_by("market_id").agg(
    pds.ttest_ind("var1", "var2", equal_var=False).alias("t-test"),
    pds.chi2("category_1", "category_2").alias("chi2-test"),
    pds.f_test("var1", group = "category_1").alias("f-test")
)

shape: (3, 4)
┌───────────┬──────────────────────┬──────────────────────┬─────────────────────┐
│ market_id │ t-test               │ chi2-test            │ f-test              │
│ ---       │ ---                  │ ---                  │ ---                 │
│ i64       │ struct[2]            │ struct[2]            │ struct[2]           │
╞═══════════╪══════════════════════╪══════════════════════╪═════════════════════╡
│ 0         │ {2.072749,0.038272}  │ {33.487634,0.588673} │ {0.312367,0.869842} │
│ 1         │ {0.469946,0.638424}  │ {42.672477,0.206119} │ {2.148937,0.072536} │
│ 2         │ {-1.175325,0.239949} │ {28.55723,0.806758}  │ {0.506678,0.730849} │
└───────────┴──────────────────────┴──────────────────────┴─────────────────────┘
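
Each struct holds the test statistic and the p-value. They are easy to sanity-check against SciPy on a single segment (a hypothetical cross-check; SciPy is used only for the comparison):

import polars as pl
from scipy import stats

# Recompute Welch's t-test for one market and compare with the struct above.
sub = df.filter(pl.col("market_id") == 0)
res = stats.ttest_ind(sub["var1"].to_numpy(), sub["var2"].to_numpy(), equal_var=False)
print(res.statistic, res.pvalue)  # should match the t-test struct for market_id 0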

Multiple Convolutions at once!

# Multiple Convolutions at once
# Modes: `same`, `left` (left-aligned same), `right` (right-aligned same), `valid` or `full`
# Method: `fft`, `direct`
# Currently slower than SciPy but provides parallelism because of Polars
df.select(
    pds.convolve("f", [-1, 0, 0, 0, 1], mode = "full", method = "fft"), # column f with the kernel given here
    pds.convolve("a", [-1, 0, 0, 0, 1], mode = "full", method = "direct"),
    pds.convolve("b", [-1, 0, 0, 0, 1], mode = "full", method = "direct"),
).head()
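
For reference, "full"-mode convolution of a series f with a kernel k computes

$$(f * k)[n] = \sum_{m} f[m]\, k[n-m],$$

so the kernel [-1, 0, 0, 0, 1] above produces f[n-4] - f[n], i.e. a sign-flipped lag-4 difference of each column.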

And more!

Getting Started

import polars_ds as pds

To make full use of the Diagnosis module, do

pip install "polars_ds[plot]"

How Fast is it?

Feel free to take a look at our benchmark notebook!

Generally speaking, the more expressions you evaluate simultaneously, the greater the speedup of Polars + PDS over Pandas + (SciPy / scikit-learn / NumPy), and the more CPU cores your machine has, the bigger the difference will be in favor of Polars + PDS.
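
As a flavor of that comparison (a minimal sketch, not the benchmark notebook itself; scikit-learn appears only as the baseline):

import timeit

import polars as pl
import polars_ds as pds
from sklearn.metrics import log_loss, roc_auc_score

# Hypothetical data: 1M rows of binary labels and random scores.
df = pds.random_data(size=1_000_000, n_cols=0).select(
    pds.random_int(0, 2).alias("actual"),
    pds.random(0.0, 1.0).alias("predicted"),
)

# PDS: both metrics evaluated inside one (multi-threaded) Polars query.
t_pds = timeit.timeit(
    lambda: df.select(
        pds.query_roc_auc("actual", "predicted"),
        pds.query_log_loss("actual", "predicted"),
    ),
    number=10,
)

# Baseline: NumPy arrays through scikit-learn, one metric at a time.
y = df["actual"].to_numpy()
p = df["predicted"].to_numpy()
t_skl = timeit.timeit(
    lambda: (roc_auc_score(y, p), log_loss(y, p)),
    number=10,
)

print(f"pds: {t_pds:.3f}s, sklearn: {t_skl:.3f}s")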

Why does speed matter?

If your code already executes in under 1s, then maybe it doesn't. But as your data grows, the difference between a 5s run and a 1s run compounds across the many iterations of a project. Execution speed becomes an even bigger issue if you are building reports on demand, or if you have to pay extra for additional compute.

HELP WANTED!

  1. Documentation writing, Doc Review, and Benchmark preparation

Road Map

  1. K-means, K-medoids clustering as expressions and also standalone modules.
  2. Other improvement items. See issues.

Disclaimer

Currently in Beta. Feel free to submit feature requests in the issues section of the repo. This library will only depend on Python Polars (for most of its core) and will try to be as stable as possible for polars>=1 (it currently supports polars>=0.20.16, but that support will be dropped soon). Exceptions will be made when a Polars update forces changes in the plugins.

This package is not tested with Polars streaming mode and is not designed to work with data so big that it has to be streamed.

Credits

  1. The Rust Snowball stemmer is taken from Tsoding's Seroost project (MIT).
  2. Some statistics functions are taken from Statrs (MIT) and internalized.
  3. Linear algebra routines are powered partly by faer.
  4. String similarity metrics are soooo fast because of RapidFuzz.

Other related Projects

  1. Take a look at our friendly neighbor functime
