    Human feedback quickstart (Python)

    This notebook shows you how to collect end-user feedback, add developer annotations, create expert review sessions, and use that feedback to evaluate your GenAI app's quality.

    Install required packages

    %pip install --upgrade "mlflow[databricks]>=3.1.0" openai "databricks-connect>=16.1"
    dbutils.library.restartPython()

    Step 1. Create and trace a simple app

    The first step is to create a simple GenAI app using an LLM with MLflow tracing.

    import mlflow
    from openai import OpenAI
    
    # Enable automatic tracing for all OpenAI API calls
    mlflow.openai.autolog()
    
    # Connect to a Databricks LLM via OpenAI using the same credentials as MLflow
    # Alternatively, you can use your own OpenAI credentials here
    mlflow_creds = mlflow.utils.databricks_utils.get_databricks_host_creds()
    client = OpenAI(
        api_key=mlflow_creds.token,
        base_url=f"{mlflow_creds.host}/serving-endpoints"
    )
    
    # Create a RAG app with tracing
    @mlflow.trace
    def my_chatbot(user_question: str) -> str:
        # Retrieve relevant context
        context = retrieve_context(user_question)
    
        # Generate response using LLM with retrieved context
        response = client.chat.completions.create(
            model="databricks-claude-3-7-sonnet",  # If using OpenAI directly, use "gpt-4o" or "gpt-3.5-turbo"
            messages=[
                {"role": "system", "content": "You are a helpful assistant. Use the provided context to answer questions."},
                {"role": "user", "content": f"Context: {context}\n\nQuestion: {user_question}"}
            ],
            temperature=0.7,
            max_tokens=150
        )
        return response.choices[0].message.content
    
    @mlflow.trace(span_type="RETRIEVER")
    def retrieve_context(query: str) -> str:
        # Simulated retrieval - in production, this would search a vector database
        if "mlflow" in query.lower():
            return "MLflow is an open-source platform for managing the end-to-end machine learning lifecycle. It provides tools for experiment tracking, model packaging, and deployment."
        return "General information about machine learning and data science."
    
    # Run the app to generate a trace
    response = my_chatbot("What is MLflow?")
    print(f"Response: {response}")
    
    # Get the trace ID for the next step
    trace_id = mlflow.get_last_active_trace_id()
    print(f"Trace ID: {trace_id}")
    Response: MLflow is an open-source platform designed to manage the complete machine learning lifecycle. It provides tools and functionality for tracking experiments, packaging machine learning models, and deploying them. As a comprehensive ML operations platform, MLflow helps data scientists and engineers organize and streamline their machine learning workflows from development through to production deployment.
    Trace ID: tr-3ee2f0919f7f90fda605dfc0469f0452
    Trace(trace_id=tr-3ee2f0919f7f90fda605dfc0469f0452)

    Step 2. Collect end-user feedback

    When users interact with your app, they can provide feedback through UI elements like thumbs up/down buttons. This quickstart simulates an end user giving negative feedback by using the SDK directly.

    from mlflow.entities.assessment import AssessmentSource, AssessmentSourceType
    
    # Simulate end-user feedback from your app
    # In production, this would be triggered when a user clicks thumbs down in your UI
    mlflow.log_feedback(
        trace_id=trace_id,
        name="user_feedback",
        value=False,  # False for thumbs down - user is unsatisfied
        rationale="Missing details about MLflow's key features like Projects and Model Registry",
        source=AssessmentSource(
            source_type=AssessmentSourceType.HUMAN,
            source_id="enduser_123",  # In production, this is the actual user ID
        ),
    )
    
    print("âś… End-user feedback recorded!")
    
    # In a real app, you would:
    # 1. Return the trace_id with your response to the frontend
    # 2. When user clicks thumbs up/down, call your backend API
    # 3. Your backend would then call mlflow.log_feedback() with the trace_id
    ✅ End-user feedback recorded!
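
    The numbered comments in the cell above describe the production pattern. The sketch below shows one way to wire it up, assuming a Flask backend; the route name, payload fields, and user ID handling are illustrative placeholders, not part of this quickstart.

    from flask import Flask, request, jsonify
    
    import mlflow
    from mlflow.entities.assessment import AssessmentSource, AssessmentSourceType
    
    app = Flask(__name__)
    
    @app.route("/feedback", methods=["POST"])
    def log_user_feedback():
        # The frontend returns the trace_id it received with the response,
        # along with the user's rating and an optional comment
        payload = request.get_json()
        mlflow.log_feedback(
            trace_id=payload["trace_id"],
            name="user_feedback",
            value=bool(payload["thumbs_up"]),
            rationale=payload.get("comment"),
            source=AssessmentSource(
                source_type=AssessmentSourceType.HUMAN,
                source_id=payload["user_id"],
            ),
        )
        return jsonify({"status": "ok"})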

    Step 3. View feedback in the UI

    Launch the MLflow UI to see your traces with feedback:

    1. In the left sidebar, click Experiments.
    2. In the Experiments table, click on your experiment.
    3. Click the Traces tab.
    4. You'll see your trace listed in the table; in the Assessments column, user_feedback shows false.

    Click on an individual trace to see detailed feedback in the Assessments panel.
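
    If you prefer to check feedback from code rather than the UI, you can fetch the trace with the SDK. This is a minimal sketch assuming MLflow 3.x, where assessments are attached to the trace object returned by mlflow.get_trace.

    # Fetch the trace from Step 1 and print any attached assessments,
    # including the user_feedback logged in Step 2
    trace = mlflow.get_trace(trace_id)
    for assessment in trace.info.assessments:
        print(f"{assessment.name}: {assessment.value} ({assessment.rationale})")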

    Step 4. Send trace for expert review

    The negative end-user feedback from Step 2 signals a potential quality issue, but only domain experts can confirm if there's really a problem and provide the correct answer. The next cell shows code to create a labeling session so you can get authoritative expert feedback.

    You can also do this using the UI. On the Experiment page, click the Labeling tab, and then at left, use the Sessions and Schemas tabs to add a new label schema and create a new session.

    from mlflow.genai.label_schemas import create_label_schema, InputCategorical, InputText
    from mlflow.genai.labeling import create_labeling_session
    
    # Define what feedback to collect
    accuracy_schema = create_label_schema(
        name="response_accuracy",
        type="feedback",
        title="Is the response factually accurate?",
        input=InputCategorical(options=["Accurate", "Partially Accurate", "Inaccurate"]),
        overwrite=True
    )
    
    ideal_response_schema = create_label_schema(
        name="expected_response",
        type="expectation",
        title="What would be the ideal response?",
        input=InputText(),
        overwrite=True
    )
    
    # Create a labeling session
    labeling_session = create_labeling_session(
        name="quickstart_review",
        label_schemas=[accuracy_schema.name, ideal_response_schema.name],
    )
    
    # Add your trace to the session
    # Get the most recent trace from the current experiment
    traces = mlflow.search_traces(
        max_results=1  # Gets the most recent trace
    )
    labeling_session.add_traces(traces)
    
    # Share with reviewers
    print(f"âś… Trace sent for review!")
    print(f"Share this link with reviewers: {labeling_session.url}")
    WARNING:root:Label schema with name 'expected_response' already exists and will be overwritten. This impacts any labeling sessions using this schema.
    ✅ Trace sent for review!
    Share this link with reviewers: https://db-sme-demo-docs.cloud.databricks.com/ml/review-v2/3efe23e142184867afd79d76c7f86ebd/tasks/labeling/e2fea2e9-6961-4e9c-8597-39332aa83318
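
    Anyone with access to the link and the workspace can label the session. If you want to route the review to specific experts, the sketch below assumes create_labeling_session accepts an assigned_users list; the email address is a placeholder.

    # Optional: assign specific reviewers when creating the session
    assigned_session = create_labeling_session(
        name="quickstart_review_assigned",
        assigned_users=["expert@example.com"],  # placeholder reviewer
        label_schemas=[accuracy_schema.name, ideal_response_schema.name],
    )
    assigned_session.add_traces(traces)
    print(f"Share this link with assigned reviewers: {assigned_session.url}")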

    Step 5. Use feedback to evaluate your app

    The output of the previous cell includes a link you can share with domain experts so they can provide the correct answer. After they submit feedback through the link, continue with this step, which uses their expected_response to evaluate your app with MLflow's Correctness scorer.

    from mlflow.genai.scorers import Correctness
    
    # Get traces from the labeling session
    labeled_traces = mlflow.search_traces(
        run_id=labeling_session.mlflow_run_id,  # Labeling Sessions are MLflow Runs
    )
    
    # Evaluate your app against expert expectations
    eval_results = mlflow.genai.evaluate(
        data=labeled_traces,
        predict_fn=my_chatbot,  # The app you created in Step 1
        scorers=[Correctness()]  # Compares outputs to expected_response
    )
    2025/06/25 17:56:52 INFO mlflow.genai.utils.data_validation: Testing model prediction with the first sample in the dataset.
    2025/06/25 17:56:59 INFO mlflow.models.evaluation.utils.trace: Auto tracing is temporarily enabled during the model evaluation for computing some metrics and debugging. To disable tracing, call `mlflow.autolog(disable=True)`.
    Trace(trace_id=tr-ab7366a126d525cd9b646f0bc0c80de8)
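
    To inspect the results beyond the UI, you can pull the traces produced during evaluation. This sketch assumes the returned result object exposes a run_id attribute, as in recent MLflow 3.x releases.

    # Fetch the traces logged during evaluation; each carries the
    # Correctness assessment computed against expected_response
    eval_traces = mlflow.search_traces(run_id=eval_results.run_id)
    print(f"Evaluated {len(eval_traces)} trace(s)")
    print(eval_traces.columns.tolist())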