This is a package to generate topics for a text corpus.
Project description
TopicGPT package
Create a virtual environment
conda create -n your_env_name
conda activate your_env_name
conda install pip
How to install this package?
pip install wm_topicgpt
How to use this package?
Set up global parameters
# ----------------------
# If using a Jupyter notebook, you should include these two lines.
import nest_asyncio
nest_asyncio.apply()
# ----------------------
from topicgpt import config
# For NameFilter
config.azure_key = ""
config.azure_endpoint = ""
# For GPT3.5 or GPT4 or Ada-002
config.consumer_id = ""
config.private_key_path = ""
config.mso_llm_env = ""
Option 1: Easy to use
Step 1: Set parameters for the approach
params = {
'preprocessing': {'words_range': (2, 500)},
'name_filter': {},
'embedding': {'model': 'bge', 'batch_size': 500, 'device': 'mps'},
'clustering':{
'model': 'hdbscan',
'hdbscan': {'reduced_dim': 5, 'n_neighbors': 10, 'min_cluster_percent': 0.01, 'sampler': 'similarity'},
'kmeans': {'reduced_dim': 5, 'n_neighbors': 10, 'n_clusters_list': [50, 15, 5], 'sampler': 'similarity'},
'topic': {'model_name': "gpt-35-turbo-16k", 'temperature': 0.5, 'batch_size': 5, 'topk': 10},
},
'keywords': {'ngram_range': (1, 2), 'topk': 10},
'save_file': {'data_file': '../save/data.csv', 'topic_file': '../save/topic.csv', 'plot_file': '../save/tree.txt'},
}
Step 2: Using the pipeline method
from topicgpt.pipeline import topic_modeling
topic_modeling("Your dataset name", "The text column name", params=params)
Option 2: Flexible to use
Step 1: Load your dataset (Only provide dataset name and column name)
Load your data; it must be in 'pandas.DataFrame' format.
import uuid
import pandas as pd
data_df = pd.read_csv("Your dataset name.csv")
# data transform
data_df = data_df.rename(columns={'The text column name': 'input'})
data_df = data_df.dropna(subset=["input"])
data_df['input'] = data_df['input'].astype(str)
data_df['uuid'] = [str(uuid.uuid4()) for _ in range(len(data_df))]
Step 2: Set parameters for the approach
params = {
'preprocessing': {'words_range': (2, 500)},
'name_filter': {},
'embedding': {'model': 'bge', 'batch_size': 500, 'device': 'mps'},
'clustering':{
'model': 'hdbscan',
'hdbscan': {'reduced_dim': 5, 'n_neighbors': 10, 'min_cluster_percent': 0.01, 'sampler': 'similarity'},
'kmeans': {'reduced_dim': 5, 'n_neighbors': 10, 'n_clusters_list': [50, 15, 5], 'sampler': 'similarity'},
'topic': {'model_name': "gpt-35-turbo-16k", 'temperature': 0.5, 'batch_size': 5, 'topk': 10},
},
'keywords': {'ngram_range': (1, 2), 'topk': 10},
'save_file': {'data_file': '../save/data.csv', 'topic_file': '../save/topic.csv', 'plot_file': '../save/tree.txt'},
}
Step 3: Preprocessing the data
from topicgpt.preprocessing import MinMaxLengthFilter, NameFilter
filter = MinMaxLengthFilter(words_range=params['preprocessing']['words_range'])
data_df = filter.transform(data_df)
# replace person names in the text with placeholders (via NameFilter)
filter = NameFilter()
data_df = filter.transform(data_df)
Step 4: Embedding the text
from topicgpt.walmart_llm import AdaEmbedModel, BGEEmbedModel
if params['embedding']['model'] == 'ada':
llm = AdaEmbedModel(batch_size=params['embedding']['batch_size'])
elif params['embedding']['model'] == 'bge':
llm = BGEEmbedModel(batch_size=params['embedding']['batch_size'], device=params['embedding']['device'])
else:
raise ValueError(f"Don't support {params['embedding']} model")
embeddings = llm.embed_documents(data_df['input'].tolist())
data_df['embeddings'] = embeddings
data_df = data_df.dropna(subset=["embeddings"])
Step 5: Clustering
from topicgpt.clustering import HDBSCANClustering
if params['clustering']['model'] == 'hdbscan':
model = HDBSCANClustering(reduced_dim=params['clustering']['hdbscan']['reduced_dim'],
n_neighbors=params['clustering']['hdbscan']['n_neighbors'],
min_cluster_percent=params['clustering']['hdbscan']['min_cluster_percent'],
sampler_mode=params['clustering']['hdbscan']['sampler'])
clusters_list = model.transform(data_df)
# generate topic and description
from topicgpt.generation import TopicGenerator
generator = TopicGenerator(model_name=params['clustering']['topic']['model_name'],
temperature=params['clustering']['topic']['temperature'],
batch_size=params['clustering']['topic']['batch_size'],
topk=params['clustering']['topic']['topk'])
generator.transform(clusters_list)
elif params['clustering']['model'] == 'kmeans':
from topicgpt.clustering import KmeansClustering
model = KmeansClustering(n_neighbors=params['clustering']['kmeans']['n_neighbors'],
reduced_dim=params['clustering']['kmeans']['reduced_dim'],
n_clusters_list=params['clustering']['kmeans']['n_clusters_list'],
sampler_mode=params['clustering']['kmeans']['sampler'],
model_name=params['clustering']['topic']['model_name'],
temperature=params['clustering']['topic']['temperature'],
batch_size=params['clustering']['topic']['batch_size'],
topk=params['clustering']['topic']['topk'])
clusters_list = model.transform(data_df)
Step 6: Generating Keywords
from topicgpt.generation import KeywordsExtractor
extractor = KeywordsExtractor(ngram_range=params['keywords']['ngram_range'], topk=params['keywords']['topk'])
extractor.transform(clusters_list)
Step 7: Save results
from topicgpt.persistence import transform_data_format, transform_topic_format, plot_topic_taxonomy_tree
saved_data = transform_data_format(clusters_list, dataset_name)
saved_data.to_csv(params['save_file']['data_file'], index=False)
topic_data = transform_topic_format(clusters_list, dataset_name)
topic_data.to_csv(params['save_file']['topic_file'], index=False)
plot_topic_taxonomy_tree(clusters_list, params['save_file']['plot_file'])
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
wm_topicgpt-0.1.5.tar.gz
(19.4 kB
view hashes)
Built Distribution
Close
Hashes for wm_topicgpt-0.1.5-py3-none-any.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 7ab388d16e54e00dba108eeba689be50e89497aab468094a59054c3027b4d933 |
|
MD5 | 2385a5614a6953b3bc7d03cb49ef4653 |
|
BLAKE2b-256 | 7f8dadbc8e86217fdecbe3101019b6f64398cb76af676e208f8afee6e0fee4e2 |