Skip to main content

No project description provided

Project description

kplot

kplot is a Python plotting toolkit for exploratory data analysis. It provides reusable helpers for styling figures, scatter and embedding plots, categorical summaries, heatmaps, hierarchical clustering, and related ranking workflows.

Installation

pip install python-kplot

Quick start

The examples below follow the notebooks under nbs/ in order. Each function example lives in its own cell and starts with a short comment derived from the function docstring.

01 utils

from kplot.utils import set_sns, save_svg, save_pdf, save_show, get_color_dict, get_plt_color, get_hue_big, add_stats
import seaborn as sns
from matplotlib import pyplot as plt

# Set up the objects used by the examples below.
df = sns.load_dataset('tips')
df.shape
(244, 7)
# Set seaborn defaults for notebook display and saved figures.
set_sns(dpi=50)
# Save the current matplotlib figure as SVG with editable text.
plt.figure()
plt.plot([0, 1], [0, 1])
# save_svg(Path('nbs') / '_tmp_utils.svg')

# Save the current matplotlib figure as PDF with TrueType fonts.
plt.figure()
plt.plot([0, 1], [1, 0])
# save_pdf(Path('nbs') / '_tmp_utils.pdf')

# Show the current figure or save it, then close open figures.
plt.figure()
plt.plot([0, 1], [0.5, 0.5])
# save_show(path=Path('nbs') / '_tmp_utils_show.png')

# Assign colors to labels while tolerating duplicate category names.
get_color_dict(['A', 'B', 'C'], palette='Set2')
{'A': (0.4, 0.7607843137254902, 0.6470588235294118),
 'B': (0.9882352941176471, 0.5529411764705883, 0.3843137254901961),
 'C': (0.5529411764705883, 0.6274509803921569, 0.796078431372549)}
# Return colors in plotting order for a dict, list, or named palette.
get_plt_color('Set2', ['a', 'b'])

# Filter a hue column down to categories that meet a count threshold.
# get_hue_big(df, 'day', cnt_thr=40).tolist()
# If `value` is a str, compare between groups (x=group, y=value). If `value` is a list/tuple, compare among values within each group (x=group, hue='variable').
fig, ax = plt.subplots(figsize=(5, 4))
sns.boxplot(data=df, x='sex', y='total_bill', ax=ax)
add_stats(ax, df, value='total_bill', group='sex')

02 scatter

from kplot.scatter import reduce_feature, plot_2d, plot_cluster, plot_rel
import seaborn as sns

# Set up the objects used by the examples below.
df = sns.load_dataset('penguins').dropna().reset_index(drop=True)
df2 = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
print(df.shape)
print(df2.shape)
(333, 7)
(333, 4)
# Reduce a feature matrix to a lower-dimensional embedding dataframe.
reduce_feature(df2, method='pca', n=2)
PCA1 PCA2
0 -457.325073 -13.351587
1 -407.252205 -9.179113
2 -957.044676 8.160444
3 -757.115802 1.867653
4 -557.177302 -3.389158
... ... ...
328 718.068699 2.338199
329 643.090909 4.280699
330 1543.098355 -2.232010
331 992.994900 -4.605154
332 1193.002584 -5.417312

333 rows × 2 columns

# Plot the first two columns of an embedding dataframe.
df2 = reduce_feature(df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']], method='pca', n=2)
df2['species'] = df['species'].values
plot_2d(df2, hue='species', legend=True)

# Reduce features and immediately plot the first two embedding dimensions.
plot_cluster(df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'species']], method='pca', hue='species', legend=True)

# Plot a pairwise relationship with an optional correlation annotation.
df2 = df[['bill_length_mm', 'flipper_length_mm', 'species']].head(12).copy()
df2.index = [f'pt{i}' for i in range(len(df2))]
plot_rel(df2, x='bill_length_mm', y='flipper_length_mm', hue='species', index_list=['pt0', 'pt11'])

03 bar

from kplot.bar import plot_hist, plot_count, plot_bar, plot_group_bar, plot_stacked, plot_violin, plot_box, plot_pie, plot_cnt, calculate_pct, plot_composition
import seaborn as sns

# Set up the objects used by the examples below.
df = sns.load_dataset('tips').dropna()
df.shape
(244, 7)
# Plot a histogram with a KDE overlay and polygon bins.
plot_hist(df, 'total_bill')

# Plot horizontal counts from a value-count series.
plot_count(df['day'].value_counts())

# Plot a bar chart from an unstacked dataframe.
plot_bar(df, value='total_bill', group='day')

# Plot grouped bars after melting multiple value columns.
plot_group_bar(df, value_cols=['total_bill', 'tip'], group='day')

# Plot stacked counts for a categorical column.
plot_stacked(df, group='day', hue='sex')

# Plot violin plots with optional strip dots.
df2 = df[['time', 'total_bill']].rename(columns={'time': 'variable', 'total_bill': 'value'})
plot_violin(df2)

# Plot a box plot ordered by the group median.
plot_box(df, value='total_bill', group='day')

# Plot a pie chart from a value-count series.
plot_pie(df['day'].value_counts())

# Plot vertical counts with labels above the bars.
plot_cnt(df['day'].value_counts())

# Calculate within-bin percentages for a stacked composition chart.
df2 = sns.load_dataset('titanic').dropna(subset=['class', 'sex']).reset_index(drop=True)
calculate_pct(df2, 'class', 'sex')
sex female male
class
First 43.518519 56.481481
Second 41.304348 58.695652
Third 29.327902 70.672098
# Plot stacked percentages for a bin-by-category composition.
plot_composition(df2, 'class', 'sex')

04 heatmap

from kplot.heatmap import get_similarity, plot_corr, plot_confusion_matrix
import seaborn as sns

# Set up the objects used by the examples below.
df = sns.load_dataset('titanic').dropna(subset=['age', 'fare', 'class', 'sex', 'survived']).reset_index(drop=True)
df2 = df[['age', 'fare', 'sibsp', 'parch']].head(8).copy()
df2.index = [f'row_{i}' for i in range(len(df2))]
print(df.shape)
print(df2.shape)
(714, 15)
(8, 4)
# Calculate both distance and similarity matrices for a dataframe.
get_similarity(df2)[0]
row_0 row_1 row_2 row_3 row_4 row_5 row_6 row_7
row_0 0.000000 66.001996 4.177993 47.657345 13.062925 54.911521 24.415786 6.714166
row_1 66.001996 0.000000 64.492435 18.429118 63.312323 25.182682 61.821302 61.188418
row_2 4.177993 64.492435 0.000000 46.073643 9.000868 52.100901 27.548548 3.910651
row_3 47.657345 18.429118 46.073643 0.000000 45.061097 19.066500 46.039121 42.780883
row_4 13.062925 63.312323 9.000868 45.061097 0.000000 47.754949 35.618122 8.803791
row_5 54.911521 25.182682 52.100901 19.066500 47.754949 0.000000 60.513388 48.906725
row_6 24.415786 61.821302 27.548548 46.039121 35.618122 60.513388 0.000000 27.089433
row_7 6.714166 61.188418 3.910651 42.780883 8.803791 48.906725 27.089433 0.000000
# Plot a square matrix with an optional triangular mask.
plot_corr(df[['age', 'fare', 'sibsp', 'parch']].corr(numeric_only=True))

# Plot a confusion matrix from target and prediction arrays.
plot_confusion_matrix(df['survived'], df['adult_male'], class_names=['False', 'True'], normalize=True)
Normalized confusion matrix

05 hierarchical

from kplot.hierarchical import get_1d_distance, get_1d_distance_parallel, get_Z, plot_dendrogram, get_hcluster
import pandas as pd,numpy as np,seaborn as sns
from scipy.spatial.distance import euclidean

# Set up the objects used by the examples below.
df0=sns.load_dataset("iris")
df = df0.drop(columns="species")

def my_distance(u, v):
    """Return the sum of absolute element-wise differences between `u` and `v`."""
    abs_diff = np.abs(u - v)
    return np.sum(abs_diff)

A = np.array([[0, 0], [1, 1], [2, 2]])

df0.head()
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
# Compute pairwise distances as a 1D array (like scipy's pdist) for a dataframe with column names.
get_1d_distance(pd.DataFrame(A),func_flat=my_distance)
100%|██████████| 3/3 [00:00<00:00, 3381.59it/s]

array([2, 4, 2])
# Compute 1D distances in parallel for each row in a dataframe, given a distance function.
# get_1d_distance_parallel(df, func_flat=my_distance)
# Get the linkage matrix Z from a dataframe (here, the iris feature columns).
Z = get_Z(df,func_flat=euclidean,parallel=False)
100%|██████████| 150/150 [00:00<00:00, 532.10it/s]
# Plot a dendrogram from the linkage matrix Z.
plot_dendrogram(Z,dense=10,labels=df.index,thr=0.5)

# Get flat cluster assignments from hierarchical clustering linkage matrix `Z`.
get_hcluster(df,labels=df0['species'].tolist(),thr=5,dense=10)
0      1
1      1
2      1
3      1
4      1
      ..
145    2
146    4
147    2
148    2
149    4
Length: 150, dtype: int32

06 ranking

from kplot.ranking import plot_rank, get_AUCDF
import seaborn as sns

# Set up the objects used by the examples below.
df = sns.load_dataset('tips')
df.shape
(244, 7)
# Plot a ranked scatter and annotate the highest and lowest entries.
sort_df=df.sort_values('total_bill').copy()
sort_df['id'] = sort_df.index.astype(str)
plot_rank(sort_df, x='id', y='total_bill', n_hi=10, n_lo=10)

# Compute the normalized area under an empirical CDF over rank values.
get_AUCDF(df, 'total_bill', plot=True)

0.6519265042202643

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

python_kplot-0.0.3.tar.gz (28.8 kB view details)

Uploaded Source

Built Distribution

If you're not sure about the file name format, learn more about wheel file names.

python_kplot-0.0.3-py3-none-any.whl (30.4 kB view details)

Uploaded Python 3

File details

Details for the file python_kplot-0.0.3.tar.gz.

File metadata

  • Download URL: python_kplot-0.0.3.tar.gz
  • Upload date:
  • Size: 28.8 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.2.0 CPython/3.12.12

File hashes

Hashes for python_kplot-0.0.3.tar.gz
Algorithm Hash digest
SHA256 7d460c58e4dff7c303d311128c96cc4bf2094e9173454eebca058bcd6aac5199
MD5 dfdef1f8cf2bf9de87a6d47cadceb589
BLAKE2b-256 b4426f64df5444fcda7f78b77404d8e9ff499162fefa761fab9945c10f2a1b41

See more details on using hashes here.

File details

Details for the file python_kplot-0.0.3-py3-none-any.whl.

File metadata

  • Download URL: python_kplot-0.0.3-py3-none-any.whl
  • Upload date:
  • Size: 30.4 kB
  • Tags: Python 3
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.2.0 CPython/3.12.12

File hashes

Hashes for python_kplot-0.0.3-py3-none-any.whl
Algorithm Hash digest
SHA256 074cecf82bc4fa731957c04d76d1f98c5de938c1d799582aa7ae96f6e3d3e4e4
MD5 4497ba66de007d2b9648178a08e3ae38
BLAKE2b-256 751886a66f0def78980975d229346e53458a6dee06f9e18f6a29d4b1ce84ec0c

See more details on using hashes here.

Supported by

AWS Cloud computing and Security Sponsor Datadog Monitoring Depot Continuous Integration Fastly CDN Google Download Analytics Pingdom Monitoring Sentry Error logging StatusPage Status page