%md # Datasets and Labeling Sessions
%md ## Introduction

This notebook describes how you can:
- Create an evaluation set, backed by a Unity Catalog Delta table
- Leverage subject matter experts (SMEs) to build an evaluation dataset
- Leverage SMEs to label traces generated by a version of an Agent to understand quality
- Give your SMEs a pre-production version of your Agent so they can chat with the bot and give feedback
%pip install -U -qqq databricks-agents>=0.17.0 databricks-sdk[openai]
%restart_python
%md ## Please provide

- The destination UC table name for the evaluation dataset
- An experiment name to host the labeling sessions
- A list of SME emails who can write assessments
user_email = spark.sql("SELECT current_user() as username").collect()[0].username

# PLEASE CHANGE
CATALOG = "mosaic_catalog"
SCHEMA = "lilac_schema"
TABLE = "my_evals"
ASSIGNED_USERS = ["sme@company.com"]

EXPERIMENT_NAME = f"/Users/{user_email}/review_app_notebook"
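%md Optionally, if the destination schema might not exist yet, create it before building the evaluation dataset. A minimal sketch, assuming you have privileges to create schemas in the chosen catalog:

# Optional: ensure the destination schema exists before creating the evaluation dataset table.
# Assumes you have CREATE SCHEMA privileges on the catalog.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{CATALOG}`.`{SCHEMA}`")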
%md # Create a dataset and collect assessments

We bootstrap the evaluation dataset using synthetic data generation. For more details on synthetic evals, see [Synthesize evaluation sets](https://docs.databricks.com/aws/en/generative-ai/agent-evaluation/synthesize-evaluation-set).
import mlflow
import pandas as pd
from databricks.agents.evals import generate_evals_df

docs = pd.DataFrame.from_records(
    [
        {
            "content": """
    Apache Spark is a unified analytics engine for large-scale data processing. It provides high-level APIs in Java,
    Scala, Python and R, and an optimized engine that supports general execution graphs. It also supports a rich set
    of higher-level tools including Spark SQL for SQL and structured data processing, pandas API on Spark for pandas
    workloads, MLlib for machine learning, GraphX for graph processing, and Structured Streaming for incremental
    computation and stream processing.
    """,
            "doc_uri": "https://spark.apache.org/docs/3.5.2/",
        },
        {
            "content": """
    Spark's primary abstraction is a distributed collection of items called a Dataset. Datasets can be created from
    Hadoop InputFormats (such as HDFS files) or by transforming other Datasets. Due to Python's dynamic nature, we
    don't need the Dataset to be strongly-typed in Python. As a result, all Datasets in Python are Dataset[Row], and
    we call it DataFrame to be consistent with the data frame concept in Pandas and R.
    """,
            "doc_uri": "https://spark.apache.org/docs/3.5.2/quick-start.html",
        },
    ]
)

agent_description = """
The Agent is a RAG chatbot that answers questions about using Spark on Databricks. The Agent has access to a corpus of
Databricks documents, and its task is to answer the user's questions by retrieving the relevant docs from the corpus and
synthesizing a helpful, accurate response. The corpus covers a lot of info, but the Agent is specifically designed to
interact with Databricks users who have questions about Spark. So questions outside of this scope are considered
irrelevant.
"""

question_guidelines = """
# User personas
- A developer who is new to the Databricks platform
- An experienced, highly technical Data Scientist or Data Engineer

# Example questions
- what API lets me parallelize operations over rows of a delta table?
- Which cluster settings will give me the best performance when using Spark?

# Additional Guidelines
- Questions should be succinct, and human-like
"""

evals = generate_evals_df(
    docs,
    # The total number of evals to generate. The method attempts to generate evals that have full coverage over the
    # documents provided. If this number is less than the number of documents, some documents will not have any
    # evaluations generated. See "How num_evals is used" in the documentation for more details.
    num_evals=3,
    # A set of guidelines that help guide the synthetic generation. These are free-form strings that will be used to
    # prompt the generation.
    agent_description=agent_description,
    question_guidelines=question_guidelines,
)

display(evals)
%md This cell adds the evals above to the evaluation dataset. The evaluation dataset is backed by a Delta table in Unity Catalog.
import mlflow
from databricks.agents import review_app
from databricks.agents import datasets
from databricks.sdk.errors import NotFound
from IPython.display import Markdown

uc_table_name = f"{CATALOG}.{SCHEMA}.{TABLE}"

mlflow.set_experiment(EXPERIMENT_NAME)

try:
    datasets.delete_dataset(uc_table_name)
except NotFound:
    pass

dataset = datasets.create_dataset(uc_table_name)

# Add synthetic evals to the dataset.
dataset.insert(evals)

display(Markdown(f"Explore dataset in UC: [{uc_table_name}](/explore/data/{CATALOG}/{SCHEMA}/{TABLE}?activeTab=sample)"))
display(spark.read.table(uc_table_name))
%md ## Register an agent with the Review App

The following cell adds an agent to the review app for the SME to use in "chat" mode or labeling. The agent is registered with a name, which is used when the labeling session is created.
# The review app is tied to the experiment_id set above.
my_app = review_app.get_review_app()

# Add the Llama 3.3 70B model endpoint for labeling. You should replace this with your own model serving endpoint.
my_app = my_app.add_agent(
    agent_name="llama-70b",
    model_serving_endpoint="databricks-meta-llama-3-3-70b-instruct",
    overwrite=True,
)
%md ## Create a labeling session from the eval dataset

The following cell creates a labeling session for the SME to review the dataset we created above. We configure the labeling session with a set of label schemas, which are the questions the SME is asked. Here we ask the SME:
- "Please provide a list of facts that you expect to see in a correct response" and collect a set of "expected_facts"
# Optionally remove all previous labeling sessions so this is the only session we ask the SME to complete.
# for session in my_app.get_labeling_sessions():
#     my_app.delete_labeling_session(session)

my_session = my_app.create_labeling_session(
    name="expected_facts",
    assigned_users=ASSIGNED_USERS,
    agent="llama-70b",
    # Built-in labeling schemas: EXPECTED_FACTS, GUIDELINES, EXPECTED_RESPONSE
    label_schemas=[review_app.label_schemas.EXPECTED_FACTS],
)

my_session.add_dataset(uc_table_name)

# Share with the SME.
print("Review App URL:", my_app.url)
print("Labeling session URL:", my_session.url)
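%md The session above uses the built-in `EXPECTED_FACTS` schema. If you want to phrase the question to the SME differently, you can define a custom _expectation_ schema with `create_label_schema`, analogous to the custom _feedback_ schema used later in this notebook. The sketch below assumes an `InputTextList` input type is available in `review_app.label_schemas` (alongside `InputCategorical`); check the input types shipped with your `databricks-agents` version.

# Sketch only: a custom expectation-type label schema that collects free-form facts from the SME.
# `InputTextList` is an assumption here; use whichever input type your databricks-agents version provides.
custom_facts_schema = my_app.create_label_schema(
    name="custom_expected_facts",
    # Expectation-type labels can later be synced back to the dataset with sync_expectations.
    type="expectation",
    title="What facts must a correct response contain?",
    input=review_app.label_schemas.InputTextList(),
    instruction="List each fact separately.",
    overwrite=True,
)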
# To see the progress/results of the labeling session. Each row is a trace (an execution of the agent) with
# associated assessments presented in the "assessments" column and under `trace.info.assessments`.
mlflow.search_traces(run_id=my_session.mlflow_run_id)
%md # Sync _expectations_ back to the evaluation dataset

After the SME is done labeling, you can sync the _expectations_ back to the dataset.
my_session.sync_expectations(to_dataset=uc_table_name)
display(spark.read.table(uc_table_name))
%md Now we can run evaluations using the updated dataset.
global_guidelines = {
    "professional": ["The response must be professional."],
}

def my_agent(request):
    return "I'm not feeling great. Don't bother me!"

print(dataset.to_df())

eval_results = mlflow.evaluate(
    model=my_agent,
    data=dataset.to_df(),
    model_type="databricks-agent",
    evaluator_config={"databricks-agent": {"global_guidelines": global_guidelines}},
)

display(eval_results.tables["eval_results"])
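%md The `my_agent` function above is a deliberately unhelpful placeholder so the guideline judge has something to flag. To evaluate a real agent, point the callable at a model serving endpoint instead. A sketch, reusing the `databricks-meta-llama-3-3-70b-instruct` endpoint from this notebook; the payload handling is an assumption and may need adjusting to your agent's request schema:

from mlflow.deployments import get_deploy_client

# Sketch: forward each evaluation request to a model serving endpoint.
# Assumes the dataset requests are chat-style dicts with a "messages" key; adjust to your agent's schema.
def endpoint_agent(request):
    client = get_deploy_client("databricks")
    return client.predict(
        endpoint="databricks-meta-llama-3-3-70b-instruct",
        inputs={"messages": request["messages"]},
    )

# Usage: swap it in for my_agent.
# eval_results = mlflow.evaluate(model=endpoint_agent, data=dataset.to_df(), model_type="databricks-agent")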
%md # Label traces from an MLflow run

If you already have traces logged to a run, you can add them to a labeling session for your SME to provide assessments. Below we log example traces to an MLflow run.
import mlflow
from mlflow.deployments import get_deploy_client


@mlflow.trace(span_type="AGENT")
def llama3_agent(messages):
    SYSTEM_PROMPT = """
    You are a chatbot that answers questions about Databricks.
    For requests unrelated to Databricks, reject the request.
    """
    return get_deploy_client("databricks").predict(
        endpoint="databricks-meta-llama-3-3-70b-instruct",
        inputs={"messages": [{"role": "system", "content": SYSTEM_PROMPT}, *messages]},
    )


# Log example traces to be labeled.
with mlflow.start_run(run_name="llama3") as run:
    run_id = run.info.run_id
    llama3_agent([{"content": "What is databricks?", "role": "user"}])
    llama3_agent([{"content": "How do I set up a SQL Warehouse?", "role": "user"}])
%md ## Add the traces to a labeling session

Below we select the traces from the run above and add them to a labeling session with a custom _feedback_ label that asks our SME to label the formality of the response.
# The review app is tied to the current MLflow experiment.
my_app = review_app.get_review_app()

# Search for the traces logged in the run above.
traces = mlflow.search_traces(run_id=run_id)

formality_label_schema = my_app.create_label_schema(
    name="formal",
    # Type can be "expectation" or "feedback".
    type="feedback",
    title="Is the response formal?",
    input=review_app.label_schemas.InputCategorical(options=["Yes", "No"]),
    instruction="Please provide a rationale below.",
    enable_comment=True,
    overwrite=True,
)

my_session = my_app.create_labeling_session(
    name="my_session",
    assigned_users=ASSIGNED_USERS,
    label_schemas=["formal"],
)

# NOTE: This will copy the traces into this labeling session so that labels do not modify the original traces.
my_session.add_traces(traces)
print(my_session.url)
%md After the SME is done labeling, we can see the results via `search_traces`, like earlier in the notebook.
for row in mlflow.search_traces(run_id=my_session.mlflow_run_id).to_dict(orient="records"):
    print(f'{row["request_id"]}: {row["assessments"]}\n')
%md # Label traces from an inference table

If you already have traces in an inference table (request logs), you can add them to a labeling session for your SME to provide assessments.
# CHANGE TO YOUR PAYLOAD REQUEST LOGS TABLE
PAYLOAD_REQUEST_LOGS_TABLE = "catalog.schema.my_serving_endpoint_payload_request_logs"

traces = spark.table(PAYLOAD_REQUEST_LOGS_TABLE).select("trace").limit(3).toPandas()

my_session = my_app.create_labeling_session(
    name="my_session",
    assigned_users=ASSIGNED_USERS,
    label_schemas=["formal"],
)

# NOTE: This will copy the traces into this labeling session so that labels do not modify the original traces.
my_session.add_traces(traces)
print(my_session.url)
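%md The cell above simply takes the first three logged traces. In practice you may want to narrow the request logs down before calling `add_traces`, for example to a recent time window. A sketch, assuming the payload request logs table has a `timestamp_ms` column; check your table's schema and adjust the filter:

# Sketch: select only recent traces from the request logs before adding them to a labeling session.
# The `timestamp_ms` column name is an assumption; inspect your payload request logs table to confirm it.
recent_traces = (
    spark.table(PAYLOAD_REQUEST_LOGS_TABLE)
    .where("timestamp_ms > unix_millis(current_timestamp()) - 7 * 24 * 60 * 60 * 1000")  # last 7 days
    .select("trace")
    .limit(10)
    .toPandas()
)
# my_session.add_traces(recent_traces)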
%md After the SME is done labeling, we can see the results via `search_traces`, like earlier in the notebook.
for row in mlflow.search_traces(run_id=my_session.mlflow_run_id).to_dict(orient="records"):
    print(f'{row["request_id"]}: {row["assessments"]}\n')