TraceAI HuggingFace Instrumentation

OpenTelemetry instrumentation for the HuggingFace Inference API: text generation, chat completion, and embeddings.

Installation

pip install traceai-huggingface

Features

  • Automatic tracing of HuggingFace Inference API calls
  • Support for text generation, chat completion, and feature extraction (embeddings)
  • Streaming response support
  • Token usage tracking
  • Both sync and async client support
  • Full OpenTelemetry semantic conventions compliance

Usage

Basic Setup

from huggingface_hub import InferenceClient
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

from traceai_huggingface import HuggingFaceInstrumentor

# Set up tracing
provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)

# Instrument HuggingFace
HuggingFaceInstrumentor().instrument(tracer_provider=provider)

# Use HuggingFace
client = InferenceClient(token="your-hf-token")
response = client.text_generation("Hello, how are you?", model="meta-llama/Llama-2-7b-chat-hf")
print(response)
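
In production you will usually send spans to a collector rather than the console. A minimal sketch using the standard OTLP HTTP exporter (assumes the opentelemetry-exporter-otlp package is installed; the endpoint URL below is illustrative):

from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

from traceai_huggingface import HuggingFaceInstrumentor

# Batch spans and ship them to an OTLP-compatible backend
provider = TracerProvider()
provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4318/v1/traces"))
)

HuggingFaceInstrumentor().instrument(tracer_provider=provider)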

Text Generation

from huggingface_hub import InferenceClient

client = InferenceClient()

# Simple text generation
response = client.text_generation(
    "The capital of France is",
    model="meta-llama/Llama-2-7b-chat-hf",
    max_new_tokens=100,
    temperature=0.7,
)
print(response)

# With more parameters
response = client.text_generation(
    "Write a poem about Python programming",
    model="meta-llama/Llama-2-7b-chat-hf",
    max_new_tokens=200,
    temperature=0.9,
    top_p=0.95,
    repetition_penalty=1.1,
    do_sample=True,
)
print(response)

Chat Completion

from huggingface_hub import InferenceClient

client = InferenceClient()

# Chat completion with messages
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is machine learning?"}
]

response = client.chat_completion(
    messages=messages,
    model="meta-llama/Llama-2-7b-chat-hf",
    max_tokens=500,
    temperature=0.7,
)
print(response.choices[0].message.content)

# Multi-turn conversation
messages = [
    {"role": "user", "content": "My name is Alice"},
    {"role": "assistant", "content": "Hello Alice! Nice to meet you."},
    {"role": "user", "content": "What's my name?"}
]

response = client.chat_completion(
    messages=messages,
    model="meta-llama/Llama-2-7b-chat-hf",
)
print(response.choices[0].message.content)

Streaming

from huggingface_hub import InferenceClient

client = InferenceClient()

# Streaming text generation
for chunk in client.text_generation(
    "Tell me a story about a brave knight",
    model="meta-llama/Llama-2-7b-chat-hf",
    max_new_tokens=500,
    stream=True,
):
    print(chunk, end="", flush=True)
print()

# Streaming chat completion
messages = [{"role": "user", "content": "Count to 10"}]
for chunk in client.chat_completion(
    messages=messages,
    model="meta-llama/Llama-2-7b-chat-hf",
    stream=True,
):
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()

Feature Extraction (Embeddings)

from huggingface_hub import InferenceClient

client = InferenceClient()

# Single text embedding
embedding = client.feature_extraction(
    "Hello world",
    model="sentence-transformers/all-MiniLM-L6-v2",
)
print(f"Embedding dimensions: {len(embedding[0])}")

# Multiple texts
texts = ["Hello world", "Machine learning is great", "Python is awesome"]
embeddings = [
    client.feature_extraction(text, model="sentence-transformers/all-MiniLM-L6-v2")
    for text in texts
]
print(f"Generated {len(embeddings)} embeddings")

Async Client

import asyncio
from huggingface_hub import AsyncInferenceClient

async def main():
    client = AsyncInferenceClient()

    # Async text generation
    response = await client.text_generation(
        "The future of AI is",
        model="meta-llama/Llama-2-7b-chat-hf",
        max_new_tokens=100,
    )
    print(response)

    # Async chat completion
    messages = [{"role": "user", "content": "Hello!"}]
    response = await client.chat_completion(
        messages=messages,
        model="meta-llama/Llama-2-7b-chat-hf",
    )
    print(response.choices[0].message.content)

    # Async embeddings
    embedding = await client.feature_extraction(
        "Hello world",
        model="sentence-transformers/all-MiniLM-L6-v2",
    )
    print(f"Embedding dimensions: {len(embedding[0])}")

asyncio.run(main())

Tool Use

from huggingface_hub import InferenceClient

client = InferenceClient()

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

messages = [{"role": "user", "content": "What's the weather in Paris?"}]

response = client.chat_completion(
    messages=messages,
    model="meta-llama/Llama-2-7b-chat-hf",
    tools=tools,
    tool_choice="auto",
)

if response.choices[0].message.tool_calls:
    for tool_call in response.choices[0].message.tool_calls:
        print(f"Tool: {tool_call.function.name}")
        print(f"Arguments: {tool_call.function.arguments}")

Configuration Options

TraceConfig

from fi_instrumentation import TraceConfig
from traceai_huggingface import HuggingFaceInstrumentor

config = TraceConfig(
    hide_inputs=False,
    hide_outputs=False,
)

HuggingFaceInstrumentor().instrument(
    tracer_provider=provider,
    config=config
)
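
If prompts or completions contain sensitive data, the same options can be flipped to keep that content out of span attributes; a minimal sketch:

# Redact prompt and completion content from exported spans
config = TraceConfig(
    hide_inputs=True,
    hide_outputs=True,
)

HuggingFaceInstrumentor().instrument(
    tracer_provider=provider,
    config=config
)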

Captured Attributes

Text Generation Attributes

| Attribute | Description |
| --- | --- |
| fi.span.kind | "LLM" |
| llm.system | "huggingface" |
| llm.provider | "huggingface" |
| llm.model | Model name |
| llm.token_count.completion | Output token count |
| huggingface.finish_reason | Response finish reason |

Chat Completion Attributes

| Attribute | Description |
| --- | --- |
| fi.span.kind | "LLM" |
| llm.system | "huggingface" |
| llm.model | Model name |
| llm.token_count.prompt | Input token count |
| llm.token_count.completion | Output token count |
| llm.token_count.total | Total token count |
| huggingface.tools_count | Number of tools provided |
| huggingface.finish_reason | Response finish reason |

Feature Extraction Attributes

| Attribute | Description |
| --- | --- |
| fi.span.kind | "EMBEDDING" |
| embedding.model | Embedding model name |
| huggingface.texts_count | Number of texts embedded |
| huggingface.embedding_dimensions | Vector dimensions |
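
One way to verify these attributes during development is to capture finished spans in memory and print them; a minimal sketch using the stock OpenTelemetry SDK:

from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

from traceai_huggingface import HuggingFaceInstrumentor

exporter = InMemorySpanExporter()
provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(exporter))
HuggingFaceInstrumentor().instrument(tracer_provider=provider)

# ... make InferenceClient calls here ...

for span in exporter.get_finished_spans():
    print(span.name, dict(span.attributes))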

Supported Models

The HuggingFace Inference API supports thousands of models. Here are some popular ones:

| Category | Example Models |
| --- | --- |
| Text Generation | meta-llama/Llama-2-7b-chat-hf, mistralai/Mistral-7B-Instruct-v0.1 |
| Chat | meta-llama/Llama-2-70b-chat-hf, HuggingFaceH4/zephyr-7b-beta |
| Embeddings | sentence-transformers/all-MiniLM-L6-v2, BAAI/bge-large-en-v1.5 |

Real-World Use Cases

Semantic Search with Embeddings

from huggingface_hub import InferenceClient
import numpy as np

client = InferenceClient()

# Index documents
documents = [
    "Python is a programming language",
    "Machine learning uses algorithms",
    "Paris is the capital of France",
]

doc_embeddings = [
    client.feature_extraction(doc, model="sentence-transformers/all-MiniLM-L6-v2")
    for doc in documents
]

# Search
query = "What programming languages are there?"
query_embedding = client.feature_extraction(
    query, model="sentence-transformers/all-MiniLM-L6-v2"
)

# Compute cosine similarity
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

similarities = [
    cosine_similarity(query_embedding[0], doc_emb[0])
    for doc_emb in doc_embeddings
]

# Get most similar
best_idx = np.argmax(similarities)
print(f"Most similar document: {documents[best_idx]}")

RAG Pipeline

from huggingface_hub import InferenceClient

client = InferenceClient()

# Retrieve relevant documents (simplified)
documents = [
    "The Eiffel Tower is 330 meters tall.",
    "Paris has a population of 2.1 million.",
]

# Generate answer with context
context = "\n".join(documents)
messages = [
    {"role": "system", "content": f"Answer based on this context:\n{context}"},
    {"role": "user", "content": "How tall is the Eiffel Tower?"}
]

response = client.chat_completion(
    messages=messages,
    model="meta-llama/Llama-2-7b-chat-hf",
    max_tokens=200,
)

print(response.choices[0].message.content)

Batch Processing

import asyncio
from huggingface_hub import AsyncInferenceClient

async def process_batch(texts, model):
    client = AsyncInferenceClient()

    tasks = [
        client.text_generation(text, model=model, max_new_tokens=100)
        for text in texts
    ]

    return await asyncio.gather(*tasks)

texts = [
    "Summarize: AI is transforming industries...",
    "Summarize: Climate change affects...",
    "Summarize: The economy is growing...",
]

results = asyncio.run(process_batch(texts, "meta-llama/Llama-2-7b-chat-hf"))
for text, result in zip(texts, results):
    print(f"Input: {text[:50]}...")
    print(f"Output: {result}\n")

Environment Variables

| Variable | Description |
| --- | --- |
| HF_TOKEN | HuggingFace API token |
| HUGGINGFACE_HUB_TOKEN | Alternative token variable |
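
With either variable set, the client can be constructed without passing a token explicitly; huggingface_hub picks up HF_TOKEN from the environment:

import os
from huggingface_hub import InferenceClient

os.environ["HF_TOKEN"] = "hf_..."  # placeholder -- usually set in your shell instead

client = InferenceClient()  # token resolved from the environment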

License

Apache-2.0
