Data pipelines for TensorFlow and PyTorch.
Project description
rapidnlp-datasets
Data pipelines for both TensorFlow and PyTorch !
If you want to load public datasets, try a public-dataset library such as tensorflow-datasets or Hugging Face datasets.
If you want to load a local, personal dataset with minimal boilerplate, use rapidnlp-datasets!
installation
pip install -U rapidnlp-datasets
Usage
Here are few examples to show you how to use this library.
- QuickStart: Sequence Classification Task
- QuickStart: Question Answering Task
- QuickStart: Token Classification Task
- QuickStart: Masked Language Model Task
- QuickStart: SimCSE(Sentence Embedding)
sequence-classification-quickstart
import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForSequenceClassification
from rapidnlp_datasets.tf import TFDatasetForSequenceClassifiation
# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForSequenceClassification(tokenizer)
dataset.add_jsonl_files(input_files=["testdata/sequence_classification.jsonl"])
# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset(batch_size=32)
for idx, batch in enumerate(iter(tf_dataset)):
print("No.{} batch: \n{}".format(idx, batch))
# save tfrecord
dataset.save_tfrecord("testdata/sequence_classification.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForSequenceClassifiation.from_tfrecord_files("testdata/sequence_classification.tfrecord")
for idx, batch in enumerate(iter(tf_dataset)):
print("No.{} batch: \n{}".format(idx, batch))
# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
pt_dataset, num_workers=2, shuffle=True, batch_size=32, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
print("No.{} batch: \n{}".format(idx, batch))
question-answering-quickstart
import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForQuestionAnswering
from rapidnlp_datasets.tf import TFDatasetForQuestionAnswering
# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForQuestionAnswering(tokenizer)
dataset.add_jsonl_files(input_files="testdata/qa.jsonl")
# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset()
for idx, batch in enumerate(iter(tf_dataset)):
print("No.{} batch: \n{}".format(idx, batch))
# save to tfrecord
dataset.save_tfrecord("testdata/qa.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForQuestionAnswering.from_tfrecord_files(
"testdata/qa.tfrecord",
batch_size=32,
padding="batch"
)
for idx, batch in enumerate(iter(tf_dataset)):
print()
print("No.{} batch: \n{}".format(idx, batch))
# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
pt_dataset,
batch_size=32,
collate_fn=pt_dataset.batch_padding_collator,
)
for idx, batch in enumerate(dataloader):
print("No.{} batch: \n{}".format(idx, batch))
token-classification-quickstart
import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForTokenClassification
from rapidnlp_datasets.tf import TFDatasetForTokenClassification
# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForTokenClassification(tokenizer)
dataset.add_jsonl_files("testdata/token_classification.jsonl", label2id=_label_to_id)  # _label_to_id: your dict mapping label strings to integer ids
# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset()
for idx, batch in enumerate(iter(tf_dataset)):
print("No.{} batch:\n{}".format(idx, batch))
# save dataset to tfrecord
dataset.save_tfrecord("testdata/token_classification.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForTokenClassification.from_tfrecord_files(
input_files="testdata/token_classification.tfrecord",
batch_size=4,
)
for idx, batch in enumerate(iter(tf_dataset)):
print("No.{} batch:\n{}".format(idx, batch))
# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
pt_dataset, num_workers=1, batch_size=4, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
print("No.{} batch:\n{}".format(idx, batch))
masked-language-models-quickstart
import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForMaskedLanguageModel
from rapidnlp_datasets.tf import TFDatasetForMaksedLanguageModel
# build dataset
tokenizer = BertWordPieceTokenizer.from_file("testdata/vocab.txt")
dataset = DatasetForMaskedLanguageModel(tokenizer)
dataset.add_jsonl_files(input_files=["testdata/mlm.jsonl"])
dataset.add_text_files(input_files=["/path/to/text/files"])
# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset(batch_size=4)
for idx, batch in enumerate(iter(tf_dataset)):
print("No.{} batch:\n{}".format(idx, batch))
# save dataset as tfrecord
dataset.save_tfrecord("testdata/mlm.tfrecord")
# load tf.data.Dataset from tfrecord files
tf_dataset = TFDatasetForMaksedLanguageModel.from_tfrecord_files(input_files="testdata/mlm.tfrecord", batch_size=4)
for idx, batch in enumerate(iter(tf_dataset)):
print("No.{} batch:\n{}".format(idx, batch))
# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
# build dataloader
dataloader = torch.utils.data.DataLoader(
pt_dataset, batch_size=4, num_workers=1, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
print("No.{} batch:\n{}".format(idx, batch))
simcse-quickstart
import torch
from tokenizers import BertWordPieceTokenizer
from rapidnlp_datasets import DatasetForSimCSE
from rapidnlp_datasets.tf import TFDatasetForSimCSE
# build dataset
dataset = DatasetForSimCSE(
tokenizer=BertWordPieceTokenizer.from_file("testdata/vocab.txt"),
with_positive_sequence=False,
with_negative_sequence=False,
)
dataset.add_jsonl_files("testdata/simcse.jsonl")
# convert to tf.data.Dataset
tf_dataset = dataset.to_tf_dataset()
for idx, batch in enumerate(iter(tf_dataset)):
print()
print("No.{} batch: \n{}".format(idx, batch))
# save to tfrecord
dataset.save_tfrecord("testdata/simcse.tfrecord")
# build dataset from tfrecord files
tf_dataset = TFDatasetForSimCSE.from_tfrecord_files(
"testdata/simcse.tfrecord",
with_positive_sequence=False,
with_negative_sequence=False,
)
for idx, batch in enumerate(iter(tf_dataset)):
print("No.{} batch: \n{}".format(idx, batch))
# convert to torch.utils.data.Dataset
pt_dataset = dataset.to_pt_dataset()
dataloader = torch.utils.data.DataLoader(
pt_dataset, num_workers=2, shuffle=True, batch_size=32, collate_fn=pt_dataset.batch_padding_collator
)
for idx, batch in enumerate(dataloader):
print("No.{} batch: \n{}".format(idx, batch))
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Filter files by name, interpreter, ABI, and platform.
If you're not sure about the file name format, learn more about wheel file names.
Copy a direct link to the current filters
File details
Details for the file rapidnlp-datasets-0.2.0.tar.gz.
File metadata
- Download URL: rapidnlp-datasets-0.2.0.tar.gz
- Upload date:
- Size: 27.3 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/3.7.1 importlib_metadata/4.10.1 pkginfo/1.8.2 requests/2.27.1 requests-toolbelt/0.9.1 tqdm/4.62.3 CPython/3.10.2
File hashes
| Algorithm | Hash digest | |
|---|---|---|
| SHA256 |
4ae0e042a8f508281af2ff0c450a5fb16658d753c7d7b3486585cfcf5dcf2054
|
|
| MD5 |
3598118ae2650c281bc98bcdc5e7fd05
|
|
| BLAKE2b-256 |
51ca359c799445c9d6e67b8e64f213242bc44588a0d189882ac6711f9aa76503
|
File details
Details for the file rapidnlp_datasets-0.2.0-py3-none-any.whl.
File metadata
- Download URL: rapidnlp_datasets-0.2.0-py3-none-any.whl
- Upload date:
- Size: 44.9 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/3.7.1 importlib_metadata/4.10.1 pkginfo/1.8.2 requests/2.27.1 requests-toolbelt/0.9.1 tqdm/4.62.3 CPython/3.10.2
File hashes
| Algorithm | Hash digest | |
|---|---|---|
| SHA256 |
95cf8fcb5b2cda5945f1eea50b9a9eb976c5ff21b5f81e244c5b6846d95b8ac5
|
|
| MD5 |
bef4b7b0858dd8ca02b0842f7ee4c38e
|
|
| BLAKE2b-256 |
2d1cbd87d573448173f6e7566d74baaa51a69e472b1288ff685795b404ca15a9
|