An efficient NLP tool.

xdnlp

A highly efficient and easy-to-use natural language processing library.

Install

pip install xdnlp

or

git clone https://github.com/mikuh/xdnlp
pip install ./xdnlp/

APIs

Some features are not yet documented here; documentation will be added in follow-up updates.

Text

Normalize

Character normalization, e.g. mapping traditional characters to simplified Chinese and variant numeral forms to ASCII digits.

from xdnlp import Text
text = Text()
text.normalize("降龍⑴⑧掌")  # 降龍⑴⑧掌 -> 降龙18掌 

Keyword Extract

Extract keywords from a sentence.

from xdnlp import Text
text = Text()
text.add_keywords_from_list(["c++", 'python', 'java', 'javascript'])
text.extract_keywords("小明学习了python c++ javascript", longest_only=True)
# return  ["python", 'c++', 'javascript']

# batch mode
text.batch_extract_keywords(["小明学会了c++", "python和c++有什么区别", "Javascript和java是同一个东西吗"])
# return [['c++'], ['python', 'c++'], ['java']]
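# note: matching appears to be case-sensitive: the capitalized 'Javascript' above is not matched, only the lowercase 'java'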

Keyword Replace

Replace keywords in a sentence.

from xdnlp import Text
text = Text()
text.add_keywords_replace_map_from_dict({
    "java": "golang",
    "javascript": "node"
})
text.replace_keywords("小明学习了python c++ javascript")
# return 小明学习了python c++ node
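# words without an entry in the replacement map ('python', 'c++') are left unchanged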

# batch mode
text.batch_replace_keywords(["小明学会了c++", "python和c++有什么区别", "javascript和java是同一个东西吗"])
# return ["小明学会了c++", "python和c++有什么区别", "node和golang是同一个东西吗"]

Text Clean

Remove extraneous characters from a sentence.

from xdnlp import Text
text = Text()
text.clean("aaaaaaAAAAA9123123我是    中国人-=[]:<>aaa", max_repeat=2)
# return aa9123123我是 中国人 aa

# batch mode
text.batch_clean(["aaaaaaAAAAA9123123我是    中国人-=[]:<>aaa", "666666"], max_repeat=2) \
# return ["aa9123123我是 中国人 aa", '66']

Text Encode

Encode a sentence into a dictionary of statistical features (length, character-class ratios, a pinyin transcription, and so on).

from xdnlp import Text
text = Text()
text.encode("wo操你妈、フちqlフq、")
# return {'contact': 1, 'unknown': 1, 'specify': 0, 'length': 13, 'low_frequency': 0, 'zh_scale': 0.6153846153846154, 'en_num_scale': 0.0, 'zh_piece_scale': 0.6666666666666666, 'not_zh_piece_scale': 0, 'pinyin': 'wocaonima、フちqlフq、'}

Text Batch Cut Words

Batch cut (tokenize) words from an iterable of texts.

from xdnlp import Text
import jieba

text = Text()
text_list = ["百战大佬 要不要来6线帮打打", "对呀,觉得后面的时间才是自己的",
             "亡者酋长头饰图纸很贵哟", "嗯,不懂,快凉了,哈哈,刚看到10月就抢了"] * 1000000
out = text.batch_cut(text_list, jieba, n_jobs=20, batch_size=1000)
# return [['百战', '大佬', ' ', '要', '不要', '来', '6', '线帮', '打打'], ['对', '呀', ',', '觉得', '后面', '的', '时间', '才', '是', '自己', '的'],...]
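
batch_cut fans the work out over n_jobs worker processes in chunks of batch_size texts; any tokenizer exposing a jieba-style cut method should presumably work. Serially, the call above is roughly equivalent to this sketch:

out = [list(jieba.cut(t)) for t in text_list]  # single-process equivalent (assumed)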

Word Discover

Discover new vocabulary from massive text.

from xdnlp import WordDiscover
wd = WordDiscover()
wd.word_discover(["path/to/the.txt"], save_ngram=True)

Classify

TextCNN

import os
import tensorflow as tf
from xdnlp.classify import TextCNN
from xdnlp.classify.utils import load_data_from_directory, get_vectorize_layer
max_features = 50000
max_len = 100
batch_size = 64
epochs = 20
data_dir = "path/to/your/data/dir"

train_ds, val_ds, test_ds, class_names = load_data_from_directory(data_dir, batch_size=batch_size)
vectorize_layer = get_vectorize_layer(max_features, max_len, train_ds)

model_config = dict(input_shape=(max_len,),
                    class_names=class_names,
                    model_dir="models",
                    vectorize_layer=vectorize_layer,
                    embedding_size=128,
                    hidden_size=256,
                    filter_sizes=[3, 4, 5],
                    num_filters=256,
                    dropout=0.2,
                    is_train=True)

model = TextCNN(**model_config)
model.train(train_ds, val_ds, epochs)

# predict
model_save_path = "your model save path"

# load from ckpt
config = TextCNN.get_model_config(model_save_path)
vectorize_layer = get_vectorize_layer(config["max_features"], config["max_len"], vocabulary=config["vocabulary"])
model_config = dict(input_shape=(config["max_len"],),
                    vectorize_layer=vectorize_layer,
                    class_names=config["class_names"],
                    embedding_size=config["embedding_size"],
                    hidden_size=config["hidden_size"],
                    filter_sizes=config["filter_sizes"],
                    num_filters=config["num_filters"],
                    dropout=config["dropout"],
                    is_train=False)
    
model = TextCNN(**model_config)

# load from pb
model = tf.keras.models.load_model(os.path.join(model_save_path, "my_model"))
res = model(tf.constant(["这 什么 垃圾 游戏"]))
print(config["class_names"][tf.argmax(res[0]).numpy()])

TextRNN

from xdnlp.classify import TextRNN
from xdnlp.classify.utils import load_data_from_directory, get_vectorize_layer
import tensorflow.keras as keras
max_features = 50000
max_len = 100
batch_size = 64
data_dir = "path/to/your/data/dir"
model_dir = "dir/for/save/model"
embedding_size = 128
rnn_hidden_size = 256
fc_hidden_size = 128
num_layers = 2
dropout = 0.2
epochs = 2

train_ds, val_ds, test_ds, class_names = load_data_from_directory(data_dir, batch_size)
vectorize_layer = get_vectorize_layer(max_features, max_len, train_ds)

model = TextRNN(vectorize_layer=vectorize_layer,
                class_names=class_names,
                model_dir=model_dir,
                embedding_size=embedding_size,
                rnn_hidden_size=rnn_hidden_size,
                fc_hidden_size=fc_hidden_size,
                num_layers=num_layers,
                dropout=dropout,
                is_train=True)

model.train(train_ds, val_ds, epochs)
model.evaluate(test_ds)

# load from ckpt
model_config_path = "path/to/model_config"
checkpoint_path = "path/to/checkpoint/for/loading"
batch_size = 64
model_config = TextRNN.get_model_config(model_config_path)

vectorize_layer = get_vectorize_layer(model_config["max_features"],
                                      model_config["max_len"],
                                      vocabulary=model_config["vocabulary"])


model = TextRNN(vectorize_layer=vectorize_layer,
                class_names=model_config["class_names"],
                embedding_size=model_config["embedding_size"],
                rnn_hidden_size=model_config["rnn_hidden_size"],
                fc_hidden_size=model_config["fc_hidden_size"],
                num_layers=model_config["num_layers"],
                dropout=model_config["dropout"],
                is_train=False)

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.load_weights(checkpoint_path)

# load from pb
model_save_path = "path/to/model/for/loading"
model = keras.models.load_model(model_save_path)
model.evaluate(test_ds)
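
Single-sentence inference then presumably follows the same pattern as TextCNN above (a sketch; the input is whitespace-segmented, as in the TextCNN example):

import tensorflow as tf
res = model(tf.constant(["这 什么 垃圾 游戏"]))
print(model_config["class_names"][tf.argmax(res[0]).numpy()])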

BERT or ALBERT Classify

from xdnlp.classify import BertClassify

handle_encoder = ""  # bert or albert pre train encoder,set local savedmodel dir or tfhub model url
handle_preprocess = ""  # bert  preprocess,set local savedmodel dir or tfhub model url
model = BertClassify(handle_encoder,
                     handle_preprocess,
                    categories=2)
# set train and test data dir
train_ds, val_ds, test_ds = model.load_data("../../bert/aclImdb/train", "../../bert/aclImdb/test")
model.preview_train_data(train_ds)
model.preview_classify()
model.train(train_ds, val_ds)
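
For example, the two handles can point at a matching encoder/preprocess pair on TF Hub (these particular URLs are illustrative, not required by xdnlp):

handle_encoder = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"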

