rapidnlp-datasets
Data pipelines for both TensorFlow and PyTorch!
If you want to load public datasets, try 🤗 huggingface/datasets or tensorflow/datasets.
If you want to load local, personal datasets with minimal boilerplate, use rapidnlp-datasets!
Installation
pip install -U rapidnlp-datasets
Usage
Here are a few examples showing how to use this library.
- QuickStart: Sequence Classification Task
- QuickStart: Question Answering Task
- QuickStart: Token Classification Task
- QuickStart: Masked Language Model Task
- QuickStart: SimCSE (Sentence Embedding)
QuickStart: Sequence Classification Task
import torch
from tokenizers import BertWordPieceTokenizer

from rapidnlp_datasets import DatasetForSequenceClassification
from rapidnlp_datasets.tf import TFDatasetForSequenceClassifiation

# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForSequenceClassification(tokenizer)
dataset.add_jsonl_files(input_files=["testdata/sequence_classification.jsonl"])

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset(batch_size=32)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# save tfrecord
dataset.save_tfrecord("testdata/sequence_classification.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForSequenceClassifiation.from_tfrecord_files("testdata/sequence_classification.tfrecord")
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
    pt_dataset, num_workers=2, shuffle=True, batch_size=32, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch: \n{}".format(idx, batch))
QuickStart: Question Answering Task
import torch
from tokenizers import BertWordPieceTokenizer

from rapidnlp_datasets import DatasetForQuestionAnswering
from rapidnlp_datasets.tf import TFDatasetForQuestionAnswering

# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForQuestionAnswering(tokenizer)
dataset.add_jsonl_files(input_files="testdata/qa.jsonl")

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset()
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# save to tfrecord
dataset.save_tfrecord("testdata/qa.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForQuestionAnswering.from_tfrecord_files(
    "testdata/qa.tfrecord",
    batch_size=32,
    padding="batch",  # pad each batch dynamically (to its longest sequence)
)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
    pt_dataset,
    batch_size=32,
    collate_fn=pt_dataset.batch_padding_collator,
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch: \n{}".format(idx, batch))
QuickStart: Token Classification Task
import torch
from tokenizers import BertWordPieceTokenizer

from rapidnlp_datasets import DatasetForTokenClassification
from rapidnlp_datasets.tf import TFDatasetForTokenClassification

# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForTokenClassification(tokenizer)
# map string labels to integer ids (the BIO tags here are an illustrative example)
_label_to_id = {"O": 0, "B-PER": 1, "I-PER": 2}
dataset.add_jsonl_files("testdata/token_classification.jsonl", label2id=_label_to_id)

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset()
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch:\n{}".format(idx, batch))

# save dataset to tfrecord
dataset.save_tfrecord("testdata/token_classification.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForTokenClassification.from_tfrecord_files(
    input_files="testdata/token_classification.tfrecord",
    batch_size=4,
)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch:\n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
    pt_dataset, num_workers=1, batch_size=4, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch:\n{}".format(idx, batch))
QuickStart: Masked Language Model Task
import torch
from tokenizers import BertWordPieceTokenizer

from rapidnlp_datasets import DatasetForMaskedLanguageModel
from rapidnlp_datasets.tf import TFDatasetForMaksedLanguageModel

# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForMaskedLanguageModel(tokenizer)
dataset.add_jsonl_files(input_files=["testdata/mlm.jsonl"])
dataset.add_text_files(input_files=["/path/to/text/files"])

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset(batch_size=4)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch:\n{}".format(idx, batch))

# save dataset as tfrecord
dataset.save_tfrecord("testdata/mlm.tfrecord")
# load tf.data.Dataset from tfrecord files
tf_dataset = TFDatasetForMaksedLanguageModel.from_tfrecord_files(input_files="testdata/mlm.tfrecord", batch_size=4)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch:\n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
# build dataloader
dataloader = torch.utils.data.DataLoader(
    pt_dataset, batch_size=4, num_workers=1, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch:\n{}".format(idx, batch))
QuickStart: SimCSE (Sentence Embedding)
import torch
from tokenizers import BertWordPieceTokenizer

from rapidnlp_datasets import DatasetForSimCSE
from rapidnlp_datasets.tf import TFDatasetForSimCSE

# build dataset
dataset = DatasetForSimCSE(
    tokenizer=BertWordPieceTokenizer.from_file("testdata/vocab.txt"),
    with_positive_sequence=False,
    with_negative_sequence=False,
)
dataset.add_jsonl_files("testdata/simcse.jsonl")

# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset()
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# save to tfrecord
dataset.save_tfrecord("testdata/simcse.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForSimCSE.from_tfrecord_files(
    "testdata/simcse.tfrecord",
    with_positive_sequence=False,
    with_negative_sequence=False,
)
for idx, batch in enumerate(iter(tf_dataset)):
    print("No.{} batch: \n{}".format(idx, batch))

# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
    pt_dataset, num_workers=2, shuffle=True, batch_size=32, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
    print("No.{} batch: \n{}".format(idx, batch))