The missing PySpark utils
Project description
PySpark Utils
The missing PySpark utils.
Usage
To install:
pip install pyspark-utils
helper
import pyspark_utils.helper as spark_helper
# Create (or get) a SparkContext named "TestPipeline".
spark_context = spark_helper.get_context('TestPipeline')
op
import pyspark_utils.op as spark_op
# RDD<key, value> -> RDD<new_key, value>
pair_rdd.map(spark_op.do_key(lambda key: new_key))
# RDD<key, value> -> RDD<result>
pair_rdd.map(spark_op.do_tuple(lambda key, value: result))
# RDD<key, value> -> RDD<value, key>
pair_rdd.map(spark_op.swap_kv())
# RDD<key, value> -> RDD<key, value> if func(key)
pair_rdd.filter(spark_op.filter_key(lambda key: true_or_false))
# RDD<key, value> -> RDD<key, value> if func(value)
pair_rdd.filter(spark_op.filter_value(lambda value: true_or_false))
# RDD<iterable> -> RDD<tuple_or_list> with transformed values.
rdd.map(spark_op.do_elems(lambda elem: new_elem))
# RDD<path> -> RDD<path> if path matches any of the given fnmatch-style patterns
rdd.filter(spark_op.filter_path(['*.txt', '*.csv', 'path/a.???']))
# RDD<element> -> RDD<element, element>
rdd.keyBy(spark_op.identity)
# RDD<key, value> -> RDD<key, value> with keys in key_rdd
spark_op.filter_keys(pair_rdd, key_rdd)
# RDD<key, value> -> RDD<key, value> with keys in whitelist and not in blacklist
spark_op.filter_keys(pair_rdd, whitelist_key_rdd, blacklist_key_rdd)
# RDD<key, value> -> RDD<key, value> with keys not in key_rdd
spark_op.substract_keys(pair_rdd, key_rdd)
# RDD<element> -> RDD<element> where element is not None
rdd.filter(spark_op.not_none)
# RDD<key> -> RDD<key, value>
rdd.map(spark_op.value_by(lambda key: value))
# Print an RDD nicely with name, count and first element.
spark_op.log_rdd(rdd, "MyRDD", glog.info)
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
pyspark_utils-1.4.0.tar.gz
(2.8 kB, view hashes)
Built Distribution
Close
Hashes for pyspark_utils-1.4.0-py3-none-any.whl
Algorithm | Hash digest |
---|---|
SHA256 | 82ae9e8033d1e09fe3c1ed1592d303836f733dadbe7f47c72f9c31e2fcb60f50 |
MD5 | 7ef46b7fc17a0180a8a08b36cb1aa013 |
BLAKE2b-256 | 3ba5ceedf182939209f1c51ccf924dfda353ddba19f0e69e06589c5ecde762c2 |