Pseudonymization extensions for Dapla
Project description
Dapla Toolbelt Pseudo
Pseudonymize, repseudonymize and depseudonymize data on Dapla.
Features
Pseudonymize
from dapla_pseudo import Pseudonymize
import polars as pl
file_path="data/personer.csv"
dtypes = {"fnr": pl.Utf8, "fornavn": pl.Utf8, "etternavn": pl.Utf8, "kjonn": pl.Categorical, "fodselsdato": pl.Utf8}
df = pl.read_csv(file_path, dtypes=dtypes) # Create DataFrame from file
# Example: Single field default encryption (DAEAD)
result_df = (
Pseudonymize.from_polars(df) # Specify what dataframe to use
.on_fields("fornavn") # Select the field to pseudonymize
.with_default_encryption() # Select the pseudonymization algorithm to apply
.run() # Apply pseudonymization to the selected field
.to_polars() # Get the result as a polars dataframe
)
# Example: Multiple fields default encryption (DAEAD)
result_df = (
Pseudonymize.from_polars(df) # Specify what dataframe to use
.on_fields("fornavn", "etternavn") # Select multiple fields to pseudonymize
.with_default_encryption() # Select the pseudonymization algorithm to apply
.run() # Apply pseudonymization to the selected fields
.to_polars() # Get the result as a polars dataframe
)
# Example: Single field sid mapping and pseudonymization (FPE)
result_df = (
Pseudonymize.from_polars(df) # Specify what dataframe to use
.on_fields("fnr") # Select the field to pseudonymize
.with_stable_id() # Map the selected field to stable id
.run() # Apply pseudonymization to the selected fields
.to_polars() # Get the result as a polars dataframe
)
The default encryption algorithm is DAEAD (Deterministic Authenticated Encryption with Associated Data). However, if the
field is a valid Norwegian personal identification number (fnr, dnr), the recommended way to pseudonymize it is to call
`with_stable_id()`, which converts the identification number to a stable ID (SID) prior to pseudonymization.
In that case, the pseudonymization algorithm is FPE (Format Preserving Encryption).
Validate SID mapping
from dapla_pseudo import Validator
import polars as pl
file_path="data/personer.csv"
dtypes = {"fnr": pl.Utf8, "fornavn": pl.Utf8, "etternavn": pl.Utf8, "kjonn": pl.Categorical, "fodselsdato": pl.Utf8}
df = pl.read_csv(file_path, dtypes=dtypes)
result = (
Validator.from_polars(df) # Specify what dataframe to use
.on_field("fnr") # Select the field to validate
.validate_map_to_stable_id() # Validate that all the field values can be mapped to a SID
)
# The resulting dataframe contains the field values that didn't have a corresponding SID
result.to_polars()
A `sid_snapshot_date` can also be specified to validate that the field values can be mapped to a SID at a specific date:
from dapla_pseudo import Validator
from dapla_pseudo.utils import convert_to_date
import polars as pl
file_path="data/personer.csv"
dtypes = {"fnr": pl.Utf8, "fornavn": pl.Utf8, "etternavn": pl.Utf8, "kjonn": pl.Categorical, "fodselsdato": pl.Utf8}
df = pl.read_csv(file_path, dtypes=dtypes)
result = (
Validator.from_polars(df)
.on_field("fnr")
.validate_map_to_stable_id(
sid_snapshot_date=convert_to_date("2023-08-29")
)
)
# Show metadata about the validation (e.g. which version of the SID catalog was used)
result.metadata
# Show the field values that didn't have a corresponding SID
result.to_polars()
Advanced usage
Pseudonymize
Read from file systems
from dapla_pseudo import Pseudonymize
from dapla import AuthClient
file_path="data/personer.csv"
options = {
"dtypes": {"fnr": pl.Utf8, "fornavn": pl.Utf8, "etternavn": pl.Utf8, "kjonn": pl.Categorical, "fodselsdato": pl.Utf8}
}
# Example: Read DataFrame from file
result_df = (
Pseudonymize.from_file(file_path) # Read the data from file
.on_fields("fornavn", "etternavn") # Select multiple fields to pseudonymize
.with_default_encryption() # Select the pseudonymization algorithm to apply
.run() # Apply pseudonymization to the selected fields
.to_polars(**options) # Get the result as a polars dataframe
)
# Example: Read dataframe from GCS bucket
options = {
"dtypes": {"fnr": pl.Utf8, "fornavn": pl.Utf8, "etternavn": pl.Utf8, "kjonn": pl.Categorical, "fodselsdato": pl.Utf8}
}
gcs_file_path = "gs://ssb-staging-dapla-felles-data-delt/felles/pseudo-examples/andeby_personer.csv"
result_df = (
Pseudonymize.from_file(gcs_file_path) # Read DataFrame from GCS
.on_fields("fornavn", "etternavn") # Select multiple fields to pseudonymize
.with_default_encryption() # Select the pseudonymization algorithm to apply
.run() # Apply pseudonymization to the selected fields
.to_polars(**options) # Get the result as a polars dataframe
)
Pseudonymize using custom keys/keysets
from dapla_pseudo import Pseudonymize, pseudonymize
# Pseudonymize fields in a local file using the default key:
df = (
Pseudonymize.from_polars(df) # Specify what dataframe to use
.on_fields("fornavn") # Select the field to pseudonymize
.with_default_encryption() # Select the pseudonymization algorithm to apply
.run() # Apply pseudonymization to the selected field
.to_polars() # Get the result as a polars dataframe
)
# Pseudonymize fields in a local file, explicitly denoting the key to use:
df = (
Pseudonymize.from_polars(df) # Specify what dataframe to use
.on_fields("fornavn") # Select the field to pseudonymize
.with_default_encryption(custom_key="ssb-common-key-2") # Select the pseudonymization algorithm to apply
.run() # Apply pseudonymization to the selected field
.to_polars() # Get the result as a polars dataframe
)
pseudonymize(file_path="./data/personer.json", fields=["fnr", "fornavn"], key="ssb-common-key-1")
# Pseudonymize a local file using a custom keyset:
import json
custom_keyset = PseudoKeyset(
encrypted_keyset="CiQAp91NBhLdknX3j9jF6vwhdyURaqcT9/M/iczV7fLn...8XYFKwxiwMtCzDT6QGzCCCM=",
keyset_info={
"primaryKeyId": 1234567890,
"keyInfo": [
{
"typeUrl": "type.googleapis.com/google.crypto.tink.AesSivKey",
"status": "ENABLED",
"keyId": 1234567890,
"outputPrefixType": "TINK",
}
],
},
kek_uri="gcp-kms://projects/some-project-id/locations/europe-north1/keyRings/some-keyring/cryptoKeys/some-kek-1",
)
df = (
Pseudonymize.from_polars(df)
.on_fields("fornavn")
.with_default_encryption(custom_key="1234567890") # Note that the custom key has to be the same as "primaryKeyId" in the custom keyset
.run(custom_keyset=custom_keyset)
.to_polars()
)
Repseudonymize
## TODO
Depseudonymize
## TODO
Note that depseudonymization requires elevated access privileges.
Requirements
- Python >= 3.10
- Dependencies can be found in `pyproject.toml`
Installation
You can install Dapla Toolbelt Pseudo via pip from PyPI:
pip install dapla-toolbelt-pseudo
Usage
Please see the Reference Guide for details.
Contributing
Contributions are very welcome. To learn more, see the Contributor Guide.
License
Distributed under the terms of the MIT license, Dapla Toolbelt Pseudo is free and open source software.
Issues
If you encounter any problems, please file an issue along with a detailed description.
Credits
This project was generated from Statistics Norway's SSB PyPI Template.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Hashes for dapla_toolbelt_pseudo-1.0.3.tar.gz
Algorithm | Hash digest
---|---
SHA256 | 3bf04994997cf4536c929167d62f0ffce87638b90d843cb6721b33b27c5c48e9
MD5 | 4543424e360888cc825c8f361dd1f4c1
BLAKE2b-256 | 4449da606cf04a411425510569150239e3b4f1ae7df7204b61dc30da00da72ba
Hashes for dapla_toolbelt_pseudo-1.0.3-py3-none-any.whl
Algorithm | Hash digest
---|---
SHA256 | 3dcbb963ea975656c8c34a67c2139c185290e02a8fc607d6aa4d76b6688b00ff
MD5 | 716ad4d2c8a59340add092669c2a9886
BLAKE2b-256 | 9abbc1f0ed11bb6c6c413d0f05a9e5c55e7742ed81d32338d8cef51643b9a5e2