No project description provided
Project description
kplot
kplot is a Python plotting toolkit for exploratory data analysis. It provides reusable helpers for styling figures, scatter and embedding plots, categorical summaries, heatmaps, hierarchical clustering, and related ranking workflows.
Installation
pip install python-kplot
Quick start
The examples below follow the notebooks under nbs/ in order. Each
function example lives in its own cell and starts with a short comment
derived from the function docstring.
01 utils
from kplot.utils import set_sns, save_svg, save_pdf, save_show, get_color_dict, get_plt_color, get_hue_big, add_stats
import seaborn as sns
from matplotlib import pyplot as plt
# Set up the objects used by the examples below.
df = sns.load_dataset('tips')
df.shape
(244, 7)
# Set seaborn defaults for notebook display and saved figures.
set_sns(dpi=50)
# Save the current matplotlib figure as SVG with editable text.
plt.figure()
plt.plot([0, 1], [0, 1])
# save_svg(Path('nbs') / '_tmp_utils.svg')
# Save the current matplotlib figure as PDF with TrueType fonts.
plt.figure()
plt.plot([0, 1], [1, 0])
# save_pdf(Path('nbs') / '_tmp_utils.pdf')
# Show the current figure or save it, then close open figures.
plt.figure()
plt.plot([0, 1], [0.5, 0.5])
# save_show(path=Path('nbs') / '_tmp_utils_show.png')
# Assign colors to labels while tolerating duplicate category names.
get_color_dict(['A', 'B', 'C'], palette='Set2')
{'A': (0.4, 0.7607843137254902, 0.6470588235294118),
'B': (0.9882352941176471, 0.5529411764705883, 0.3843137254901961),
'C': (0.5529411764705883, 0.6274509803921569, 0.796078431372549)}
# Return colors in plotting order for a dict, list, or named palette.
get_plt_color('Set2', ['a', 'b'])
# Filter a hue column down to categories that meet a count threshold.
# get_hue_big(df, 'day', cnt_thr=40).tolist()
# If `value` is a str: compare between groups (x=group, y=value). If `value` is a list/tuple: compare among values within each group (x=group, hue='variable').
fig, ax = plt.subplots(figsize=(5, 4))
sns.boxplot(data=df, x='sex', y='total_bill', ax=ax)
add_stats(ax, df, value='total_bill', group='sex')
02 scatter
from kplot.scatter import reduce_feature, plot_2d, plot_cluster, plot_rel
import seaborn as sns
# Set up the objects used by the examples below.
df = sns.load_dataset('penguins').dropna().reset_index(drop=True)
df2 = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
print(df.shape)
print(df2.shape)
(333, 7)
(333, 4)
# Reduce a feature matrix to a lower-dimensional embedding dataframe.
reduce_feature(df2, method='pca', n=2)
| | PCA1 | PCA2 |
|---|---|---|
| 0 | -457.325073 | -13.351587 |
| 1 | -407.252205 | -9.179113 |
| 2 | -957.044676 | 8.160444 |
| 3 | -757.115802 | 1.867653 |
| 4 | -557.177302 | -3.389158 |
| ... | ... | ... |
| 328 | 718.068699 | 2.338199 |
| 329 | 643.090909 | 4.280699 |
| 330 | 1543.098355 | -2.232010 |
| 331 | 992.994900 | -4.605154 |
| 332 | 1193.002584 | -5.417312 |
333 rows × 2 columns
# Plot the first two columns of an embedding dataframe.
df2 = reduce_feature(df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']], method='pca', n=2)
df2['species'] = df['species'].values
plot_2d(df2, hue='species', legend=True)
# Reduce features and immediately plot the first two embedding dimensions.
plot_cluster(df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'species']], method='pca', hue='species', legend=True)
# Plot a pairwise relationship with an optional correlation annotation.
df2 = df[['bill_length_mm', 'flipper_length_mm', 'species']].head(12).copy()
df2.index = [f'pt{i}' for i in range(len(df2))]
plot_rel(df2, x='bill_length_mm', y='flipper_length_mm', hue='species', index_list=['pt0', 'pt11'])
03 bar
from kplot.bar import plot_hist, plot_count, plot_bar, plot_group_bar, plot_stacked, plot_violin, plot_box, plot_pie, plot_cnt, calculate_pct, plot_composition
import seaborn as sns
# Set up the objects used by the examples below.
df = sns.load_dataset('tips').dropna()
df.shape
(244, 7)
# Plot a histogram with a KDE overlay and polygon bins.
plot_hist(df, 'total_bill')
# Plot horizontal counts from a value-count series.
plot_count(df['day'].value_counts())
# Plot a bar chart from an unstacked dataframe.
plot_bar(df, value='total_bill', group='day')
# Plot grouped bars after melting multiple value columns.
plot_group_bar(df, value_cols=['total_bill', 'tip'], group='day')
# Plot stacked counts for a categorical column.
plot_stacked(df, group='day', hue='sex')
# Plot violin plots with optional strip dots.
df2 = df[['time', 'total_bill']].rename(columns={'time': 'variable', 'total_bill': 'value'})
plot_violin(df2)
# Plot a box plot ordered by the group median.
plot_box(df, value='total_bill', group='day')
# Plot a pie chart from a value-count series.
plot_pie(df['day'].value_counts())
# Plot vertical counts with labels above the bars.
plot_cnt(df['day'].value_counts())
# Calculate within-bin percentages for a stacked composition chart.
df2 = sns.load_dataset('titanic').dropna(subset=['class', 'sex']).reset_index(drop=True)
calculate_pct(df2, 'class', 'sex')
| sex | female | male |
|---|---|---|
| class | | |
| First | 43.518519 | 56.481481 |
| Second | 41.304348 | 58.695652 |
| Third | 29.327902 | 70.672098 |
# Plot stacked percentages for a bin-by-category composition.
plot_composition(df2, 'class', 'sex')
04 heatmap
from kplot.heatmap import get_similarity, plot_corr, plot_confusion_matrix
import seaborn as sns
# Set up the objects used by the examples below.
df = sns.load_dataset('titanic').dropna(subset=['age', 'fare', 'class', 'sex', 'survived']).reset_index(drop=True)
df2 = df[['age', 'fare', 'sibsp', 'parch']].head(8).copy()
df2.index = [f'row_{i}' for i in range(len(df2))]
print(df.shape)
print(df2.shape)
(714, 15)
(8, 4)
# Calculate both distance and similarity matrices for a dataframe.
get_similarity(df2)[0]
| | row_0 | row_1 | row_2 | row_3 | row_4 | row_5 | row_6 | row_7 |
|---|---|---|---|---|---|---|---|---|
| row_0 | 0.000000 | 66.001996 | 4.177993 | 47.657345 | 13.062925 | 54.911521 | 24.415786 | 6.714166 |
| row_1 | 66.001996 | 0.000000 | 64.492435 | 18.429118 | 63.312323 | 25.182682 | 61.821302 | 61.188418 |
| row_2 | 4.177993 | 64.492435 | 0.000000 | 46.073643 | 9.000868 | 52.100901 | 27.548548 | 3.910651 |
| row_3 | 47.657345 | 18.429118 | 46.073643 | 0.000000 | 45.061097 | 19.066500 | 46.039121 | 42.780883 |
| row_4 | 13.062925 | 63.312323 | 9.000868 | 45.061097 | 0.000000 | 47.754949 | 35.618122 | 8.803791 |
| row_5 | 54.911521 | 25.182682 | 52.100901 | 19.066500 | 47.754949 | 0.000000 | 60.513388 | 48.906725 |
| row_6 | 24.415786 | 61.821302 | 27.548548 | 46.039121 | 35.618122 | 60.513388 | 0.000000 | 27.089433 |
| row_7 | 6.714166 | 61.188418 | 3.910651 | 42.780883 | 8.803791 | 48.906725 | 27.089433 | 0.000000 |
# Plot a square matrix with an optional triangular mask.
plot_corr(df[['age', 'fare', 'sibsp', 'parch']].corr(numeric_only=True))
# Plot a confusion matrix from target and prediction arrays.
plot_confusion_matrix(df['survived'], df['adult_male'], class_names=['False', 'True'], normalize=True)
Normalized confusion matrix
05 hierarchical
from kplot.hierarchical import get_1d_distance, get_1d_distance_parallel, get_Z, plot_dendrogram, get_hcluster
import pandas as pd,numpy as np,seaborn as sns
from scipy.spatial.distance import euclidean
# Set up the objects used by the examples below.
df0=sns.load_dataset("iris")
df = df0.drop(columns="species")
def my_distance(u, v):
    """Return the sum of absolute element-wise differences between `u` and `v`.

    This is the L1 (Manhattan) distance; it is used below as the custom
    `func_flat` distance function in the pairwise-distance examples.
    """
    return np.sum(np.abs(u - v))
A = np.array([[0, 0], [1, 1], [2, 2]])
df0.head()
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# Compute pairwise 1D distances (like scipy's pdist) for a dataframe with column names.
# Returns the distances as a 1D array.
get_1d_distance(pd.DataFrame(A),func_flat=my_distance)
100%|██████████| 3/3 [00:00<00:00, 3381.59it/s]
array([2, 4, 2])
# Parallel compute 1D distance for each row in a dataframe given a distance function
# get_1d_distance_parallel(df, func_flat=my_distance)
# Get the linkage matrix Z from a dataframe (e.g. a dataframe of PSSMs).
Z = get_Z(df,func_flat=euclidean,parallel=False)
100%|██████████| 150/150 [00:00<00:00, 532.10it/s]
# Plot a dendrogram from the linkage matrix Z.
plot_dendrogram(Z,dense=10,labels=df.index,thr=0.5)
# Get flat cluster assignments from hierarchical clustering linkage matrix `Z`.
get_hcluster(df,labels=df0['species'].tolist(),thr=5,dense=10)
0 1
1 1
2 1
3 1
4 1
..
145 2
146 4
147 2
148 2
149 4
Length: 150, dtype: int32
06 ranking
from kplot.ranking import plot_rank, get_AUCDF
import seaborn as sns
# Set up the objects used by the examples below.
df = sns.load_dataset('tips')
df.shape
(244, 7)
# Plot a ranked scatter and annotate the highest and lowest entries.
sort_df=df.sort_values('total_bill').copy()
sort_df['id'] = sort_df.index.astype(str)
plot_rank(sort_df, x='id', y='total_bill', n_hi=10, n_lo=10)
# Compute the normalized area under an empirical CDF over rank values.
get_AUCDF(df, 'total_bill', plot=True)
0.6519265042202643
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Filter files by name, interpreter, ABI, and platform.
If you're not sure about the file name format, learn more about wheel file names.
Copy a direct link to the current filters
File details
Details for the file python_kplot-0.0.3.tar.gz.
File metadata
- Download URL: python_kplot-0.0.3.tar.gz
- Upload date:
- Size: 28.8 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/6.2.0 CPython/3.12.12
File hashes
| Algorithm | Hash digest | |
|---|---|---|
| SHA256 | 7d460c58e4dff7c303d311128c96cc4bf2094e9173454eebca058bcd6aac5199 | |
| MD5 | dfdef1f8cf2bf9de87a6d47cadceb589 | |
| BLAKE2b-256 | b4426f64df5444fcda7f78b77404d8e9ff499162fefa761fab9945c10f2a1b41 | |
File details
Details for the file python_kplot-0.0.3-py3-none-any.whl.
File metadata
- Download URL: python_kplot-0.0.3-py3-none-any.whl
- Upload date:
- Size: 30.4 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/6.2.0 CPython/3.12.12
File hashes
| Algorithm | Hash digest | |
|---|---|---|
| SHA256 | 074cecf82bc4fa731957c04d76d1f98c5de938c1d799582aa7ae96f6e3d3e4e4 | |
| MD5 | 4497ba66de007d2b9648178a08e3ae38 | |
| BLAKE2b-256 | 751886a66f0def78980975d229346e53458a6dee06f9e18f6a29d4b1ce84ec0c | |