
rapidnlp-datasets


Data pipelines for both TensorFlow and PyTorch!

If you want to load public datasets, other dedicated dataset libraries may serve you better.

If you want to load local, personal datasets with minimal boilerplate, use rapidnlp-datasets!

Installation

pip install -U rapidnlp-datasets
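
rapidnlp-datasets converts your data to tf.data.Dataset and torch.utils.data.Dataset objects, so whichever backend you plan to use (TensorFlow, PyTorch, or both) needs to be available in the environment; it may not be pulled in automatically. A minimal sanity check, assuming both backends are installed:

import tensorflow as tf
import torch

# print the backend versions to confirm the environment is ready for the examples below
print("TensorFlow:", tf.__version__)
print("PyTorch:", torch.__version__)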

Usage

Here are a few examples that show how to use this library.
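
Each quickstart below reads training examples from a JSONL file, one JSON object per line. As a rough sketch of what such a file might look like for sequence classification (the field names "sequence" and "label" are assumptions here; check them against the sample files under testdata/ in the repository):

import json
import os

# write a tiny illustrative JSONL file; the field names are assumptions,
# adjust them to whatever the dataset loader actually expects
examples = [
    {"sequence": "a quick positive example", "label": 1},
    {"sequence": "a quick negative example", "label": 0},
]
os.makedirs("testdata", exist_ok=True)
with open("testdata/sequence_classification.jsonl", "w", encoding="utf-8") as f:
    for example in examples:
        f.write(json.dumps(example, ensure_ascii=False) + "\n")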

Sequence classification quickstart

import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForSequenceClassification
from rapidnlp_datasets.tf import TFDatasetForSequenceClassifiation

# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForSequenceClassification(tokenizer)
dataset.add_jsonl_files(input_files=["testdata/sequence_classification.jsonl"])

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset(batch_size=32)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# save tfrecord
dataset.save_tfrecord("testdata/sequence_classification.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForSequenceClassifiation.from_tfrecord_files("testdata/sequence_classification.tfrecord")
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
    pt_dataset, num_workers=2, shuffle=True, batch_size=32, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch: \n{}".format(idx, batch))

Question answering quickstart

import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForQuestionAnswering
from rapidnlp_datasets.tf import TFDatasetForQuestionAnswering

# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForQuestionAnswering(tokenizer)
dataset.add_jsonl_files(input_files="testdata/qa.jsonl")

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset()
for idx, batch in enumerate(iter(tf_dataset)):
    print("NO.{} batch: \n{}".format(idx, batch))

# save to tfrecord
dataset.save_tfrecord("testdata/qa.tfrecord")

# build dataset from tfrecord files
tf_dataset = TFDatasetForQuestionAnswering.from_tfrecord_files(
    "testdata/qa.tfrecord", 
    batch_size=32, 
    padding="batch"
)
for idx, batch in enumerate(iter(tf_dataset)):
    print()
    print("No.{} batch: \n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
    pt_dataset,
    batch_size=32,
    collate_fn=pt_dataset.batch_padding_collator,
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch: \n{}".format(idx, batch))

Token classification quickstart

import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForTokenClassification
from rapidnlp_datasets.tf import TFDatasetForTokenClassification

# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForTokenClassification(tokenizer)
# illustrative label-to-id mapping; replace with the label set of your own data
label_to_id = {"O": 0, "B-ENT": 1, "I-ENT": 2}
dataset.add_jsonl_files("testdata/token_classification.jsonl", label2id=label_to_id)

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset()
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch:\n{}".format(idx, batch))

# save dataset to tfrecord
dataset.save_tfrecord("testdata/token_classification.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForTokenClassification.from_tfrecord_files(
    input_files="testdata/token_classification.tfrecord",
    batch_size=4,
)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch:\n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
    pt_dataset, num_workers=1, batch_size=4, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch:\n{}".format(idx, batch))

Masked language model quickstart

import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForMaskedLanguageModel
from rapidnlp_datasets.tf import TFDatasetForMaksedLanguageModel

# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForMaskedLanguageModel(tokenizer)
dataset.add_jsonl_files(input_files=["testdata/mlm.jsonl"])
dataset.add_text_files(input_files=["/path/to/text/files"])

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset(batch_size=4)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch:\n{}".format(idx, batch))

# save dataset as tfrecord
dataset.save_tfrecord("testdata/mlm.tfrecord")
# load tf.data.Dataset from tfrecord files
tf_dataset = TFDatasetForMaksedLanguageModel.from_tfrecord_files(input_files="testdata/mlm.tfrecord", batch_size=4)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch:\n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
# build dataloader
dataloader = torch.utils.data.DataLoader(
    pt_dataset, batch_size=4, num_workers=1, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch:\n{}".format(idx, batch))

SimCSE quickstart

import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForSimCSE
from rapidnlp_datasets.tf import TFDatasetForSimCSE

# build dataset
dataset = DatasetForSimCSE(
    tokenizer=BertWordPieceTokenizer.from_file("testdata/vocab.txt"),
    with_positive_sequence=False,
    with_negative_sequence=False,
)
dataset.add_jsonl_files("testdata/simcse.jsonl")

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset()
for idx, batch in enumerate(iter(tf_dataset)):
    print()
    print("No.{} batch: \n{}".format(idx, batch))

# save to tfrecord
dataset.save_tfrecord("testdata/simcse.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForSimCSE.from_tfrecord_files(
    "testdata/simcse.tfrecord",
    with_positive_sequence=False,
    with_negative_sequence=False,
)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
    pt_dataset, num_workers=2, shuffle=True, batch_size=32, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch: \n{}".format(idx, batch))
