
A framework for load testing LLM APIs

Project description

LLMBench

A tool for evaluating the performance of LLM APIs.

Installation

git clone ...
cd LLMBench
pip install -e .

Basic Usage

Example

llmbench_tokenmark
--model test                                # model client
--num_concurrent_requests 10
--dataset test                              # dataset
--timeout 60                                # duration of the performance test (seconds)
--max_request_sample_num -1                 # maximum number of requests; -1: unlimited (bounded by dataset size and test duration); 100: use 100 cases from the dataset
--results-dir /aaa/bbb/ccc/output_dir       # path for the test results
--extra_params '{}'                         # extra parameters passed through to the client
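
For example, extra_params can carry the path to an external client, the same key the offline tests below use. The path shown here is a placeholder; as the docstrings below note, a real environment sets the client path via .env instead:

llmbench_tokenmark --model test --num_concurrent_requests 10 --dataset test --timeout 60 --extra_params '{"client_path": "/path/to/my_client.py"}'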

Overview

This project tests the performance (token throughput) of LLM APIs.

Client

import json
import os
import time
import requests
from llmbench.common.constants import METRICS
from llmbench.inference.clients.client_abc import Client


class OpenAIChatCompletionsClient(Client):
    """Client for OpenAI Chat Completions API."""
    Model_Name = "openai"

    def __init__(self, **kwargs):
        # Extra client parameters are accepted but not used by this client.
        pass

    def make_request(self, request_config):
        prompt = request_config.prompt
        prompt, prompt_len = prompt

        message = [
            {"role": "system", "content": ""},
            {"role": "user", "content": prompt},
        ]
        model = request_config.model
        body = {
            "model": model,
            "messages": message,
            "stream": True,
        }
        sampling_params = request_config.sampling_params
        body.update(sampling_params or {})
        time_to_next_token = []
        tokens_received = 0
        ttft = 0
        error_response_code = -1
        generated_text = ""
        error_msg = ""
        output_throughput = 0
        total_request_time = 0

        metrics = {METRICS.ERROR_CODE: None, METRICS.ERROR_MSG: ""}

        start_time = time.monotonic()
        most_recent_received_token_time = time.monotonic()
        address = os.environ.get("OPENAI_API_BASE")
        if not address:
            raise ValueError("the environment variable OPENAI_API_BASE must be set.")
        key = os.environ.get("OPENAI_API_KEY")
        if not key:
            raise ValueError("the environment variable OPENAI_API_KEY must be set.")
        headers = {"Authorization": f"Bearer {key}"}
        if not address.endswith("/"):
            address = address + "/"
        address += "chat/completions"
        try:
            with requests.post(
                    address,
                    json=body,
                    stream=True,
                    timeout=180,
                    headers=headers,
            ) as response:
                if response.status_code != 200:
                    error_msg = response.text
                    error_response_code = response.status_code
                    response.raise_for_status()
                for chunk in response.iter_lines(chunk_size=None):
                    chunk = chunk.strip()

                    if not chunk:
                        continue
                    # Each SSE line is prefixed with "data: "; strip it off.
                    stem = "data: "
                    chunk = chunk[len(stem):]
                    if chunk == b"[DONE]":
                        continue
                    # Count each streamed chunk as one output token (an approximation).
                    tokens_received += 1
                    data = json.loads(chunk)

                    if "error" in data:
                        error_msg = data["error"]["message"]
                        error_response_code = data["error"]["code"]
                        raise RuntimeError(data["error"]["message"])

                    delta = data["choices"][0]["delta"]
                    if delta.get("content", None):
                        if not ttft:
                            ttft = time.monotonic() - start_time
                            time_to_next_token.append(ttft)
                        else:
                            time_to_next_token.append(
                                time.monotonic() - most_recent_received_token_time
                            )
                        most_recent_received_token_time = time.monotonic()
                        generated_text += delta["content"]

            total_request_time = time.monotonic() - start_time
            output_throughput = tokens_received / total_request_time

        except Exception as e:
            metrics[METRICS.ERROR_MSG] = error_msg
            metrics[METRICS.ERROR_CODE] = error_response_code
            print(f"Warning or error: {e}")
            print(error_response_code)

        metrics[METRICS.INTER_TOKEN_LAT] = sum(time_to_next_token)  # should match metrics[METRICS.E2E_LAT]; kept here for now
        metrics[METRICS.TTFT] = ttft
        metrics[METRICS.E2E_LAT] = total_request_time
        metrics[METRICS.REQ_OUTPUT_THROUGHPUT] = output_throughput
        metrics[METRICS.NUM_TOTAL_TOKENS] = tokens_received + prompt_len
        metrics[METRICS.NUM_OUTPUT_TOKENS] = tokens_received
        metrics[METRICS.NUM_INPUT_TOKENS] = prompt_len

        return metrics, generated_text, request_config
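
A minimal sketch of driving the client directly. SimpleNamespace stands in for the framework's request_config object, whose fields (.prompt, .model, .sampling_params) are inferred from the code above; the model name, prompt, and token count are illustrative, and OPENAI_API_KEY is assumed to already be set in the environment:

import os
from types import SimpleNamespace

prompt_text = "Explain token throughput in one sentence."
request_config = SimpleNamespace(
    prompt=(prompt_text, 8),             # (prompt, prompt_len); 8 is a rough token count
    model="gpt-3.5-turbo",               # hypothetical model name
    sampling_params={"max_tokens": 64},
)

os.environ.setdefault("OPENAI_API_BASE", "https://api.openai.com/v1")
client = OpenAIChatCompletionsClient()
metrics, generated_text, _ = client.make_request(request_config)
print(metrics)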

Function for getting token length (in the utils.py script; call it if needed)

import functools
import tiktoken


@functools.lru_cache(maxsize=128)
def get_token_length():
    """
    Get the token length of the tokenizer.
    :return:
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    return lambda text: len(tokenizer.encode(text))
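
For example:

count_tokens = get_token_length()
print(count_tokens("Hello, world!"))  # number of cl100k_base tokens (4 for this string)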

Offline Testing

# Driven by the pytest framework: test/test_bench.py
from pathlib import Path
import os
from llmbench.inference.token_benchmark import TokenBenchmark

current_path = os.path.dirname(__file__)
eval_client_path = str(Path(current_path, "../eval_client"))
llmbench_client_path = str(Path(current_path, "../llmbench_client"))


class TestBenchmark:

    def test_self_client(self):
        """
        llmbench's built-in client.
        """
        TokenBenchmark("eduction", 5, 60, "test").run_token_benchmark()

    def test_evals_client(self):
        """
        The evals client, as model/engine.
        """
        TokenBenchmark("test", 5, 60, "test/all").run_token_benchmark()

    def test_evals_client_by_path(self):
        """
        evals: model is the path to a client file.
        """
        TokenBenchmark("test", 5, 60,
                       r"D:\code\evals_code\evals\src\evals\clients\client_cmri_test.py").run_token_benchmark()

    def test_outer_client_by_path_for_llmbench(self):
        """
        External llmbench client; client_path must be specified. extra_params is only used for testing; in a real environment, set the client path via .env.
        """
        TokenBenchmark("test", 5, 60, "outer_llmbench",
                       extra_params={"client_path": llmbench_client_path}).run_token_benchmark()

    def test_outer_client_by_path_for_evals(self):
        """
        External evals client; client_path must be specified. extra_params is only used for testing; in a real environment, set the client path via .env.
        """
        TokenBenchmark("test", 5, 60, "outer_eval",
                       extra_params={"client_path": eval_client_path}).run_token_benchmark()

    def test_client_for_multi_engine(self):
        """
        evals supports the model-name/engine form by default; this example shows llmbench supports model name plus engine as well. Apart from evals' built-in clients, all other clients just set the model name via Model_Name.
        """
        TokenBenchmark("test", 1, 60, "openai_multi/gpt3.5",
                       extra_params={"client_path": llmbench_client_path}).run_token_benchmark()

Dataset

# The llmbench/data directory holds the datasets (jsonl).
# Since this tool mainly measures performance, expected outputs hardly matter, so entries in the dataset may omit them.
# For example:
{"input": [{"role": "system", "content": "What does the picture show?", "file_path": "files/flower.png"}], "ideal": "a flower"}  # "ideal" is optional; include it only if the client needs it
