The missing PySpark utils
Project description
PySpark Utils
The missing PySpark utils.
Usage
To install:
pip install pyspark-utils
helper
import pyspark_utils.helper as spark_helper
# A SparkContext named as "TestPipeline".
spark_context = spark_helper.get_context('TestPipeline')
op
import pyspark_utils.op as spark_op
# RDD<key, value> -> RDD<new_key, value>
pair_rdd.map(spark_op.do_key(lambda key: new_key))
# RDD<key, value> -> RDD<result>
pair_rdd.map(spark_op.do_tuple(lambda key, value: result))
# RDD<key, value> -> RDD<value, key>
pair_rdd.map(spark_op.swap_kv())
# RDD<key, value> -> RDD<key, value> if func(key)
pair_rdd.filter(spark_op.filter_key(lambda key: true_or_false))
# RDD<key, value> -> RDD<key, value> if func(value)
pair_rdd.filter(spark_op.filter_value(lambda value: true_or_false))
# RDD<iterable> -> RDD<tuple_or_list> with transformed values.
rdd.map(spark_op.do_elems(lambda elem: new_elem))
# RDD<path> -> RDD<path> if path matches any given fnmatch-style patterns
rdd.filter(spark_op.filter_path(['*.txt', '*.csv', 'path/a.???']))
# RDD<element> -> RDD<element, element>
rdd.keyBy(spark_op.identity)
# RDD<key, value> -> RDD<key, value> with keys in key_rdd
subset_pair_rdd = spark_op.filter_keys(pair_rdd, key_rdd)
# RDD<key, value> -> RDD<key, value> with keys in whitelist and not in blacklist
subset_pair_rdd = spark_op.filter_keys(pair_rdd, whitelist_key_rdd, blacklist_key_rdd)
# RDD<key, value> -> RDD<key, value> with keys not in key_rdd
subset_pair_rdd = spark_op.substract_keys(pair_rdd, key_rdd)
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
pyspark_utils-1.3.0.tar.gz
(2.5 kB, view hashes)
Built Distributions
Close
Hashes for pyspark_utils-1.3.0-py3-none-any.whl
Algorithm | Hash digest
---|---
SHA256 | 431bea202b2b1c8cc4ab24a5fd884321023393eef7dd5d5c6c76b3a9839ff540 |
|
MD5 | bd5eaa433d19eb7995e387cfd0e0efa0 |
|
BLAKE2b-256 | e0bbb7b6f75e5c5b011e02d97673f0e1556117ce52106c7f270db15ff9bb1a6f |
Close
Hashes for pyspark_utils-1.3.0-py2-none-any.whl
Algorithm | Hash digest
---|---
SHA256 | b12b68f3b893c34bb5407da6863fd5995a2359271af03ca2898c3f64e20bed34 |
|
MD5 | 2777e0ec4378fbe1defc1aab298aeb29 |
|
BLAKE2b-256 | 5a096b2079743a585e4aeea183a19a0119999dad18919e31d378b6928908d9d8 |