Custom Evaluator
To define a custom evaluator, write a class that subclasses relai.critico.evaluate.Evaluator and override compute_evaluator_result with your custom evaluation logic, as in the example below.
from collections.abc import Callable
from typing import override

from relai import AgentLog, EvaluatorLog
from relai.critico.evaluate import Evaluator


class CustomSentimentEvaluator(Evaluator):
    """
    A custom evaluator for sentiment analysis tasks.

    This evaluator compares the agent's predicted sentiment against the ground truth
    and scores it on prediction accuracy, with optional partial credit for neutral predictions.
    """

    def __init__(
        self,
        transform: Callable | None = None,
        correct_score: float = 1.0,
        incorrect_score: float = 0.0,
        partial_credit: bool = True,
    ):
        """
        Initialize the custom sentiment evaluator.

        Args:
            transform: Optional function to transform agent outputs
            correct_score: Score to assign for correct predictions
            incorrect_score: Score to assign for incorrect predictions
            partial_credit: Whether to give partial credit for neutral predictions
        """
        super().__init__(
            name="custom-sentiment-evaluator",
            # Specify required fields from the benchmark and agent response
            required_fields=["text", "predicted_sentiment", "true_sentiment"],
            transform=transform,
            # Store configuration as hyperparameters
            correct_score=correct_score,
            incorrect_score=incorrect_score,
            partial_credit=partial_credit,
        )

    @override
    async def compute_evaluator_result(self, agent_log: AgentLog) -> EvaluatorLog:
        """
        Evaluate the agent's sentiment prediction against ground truth.

        Args:
            agent_log (AgentLog): The response from the AI agent, containing the original sample
                and agent outputs.

        Returns:
            EvaluatorLog: Evaluator log with score and feedback
        """
        # Extract required fields from different sources
        text = agent_log.simulation_tape.agent_inputs["text"]
        predicted_sentiment = agent_log.agent_outputs["predicted_sentiment"]
        true_sentiment = agent_log.simulation_tape.extras["true_sentiment"]

        # Evaluate prediction accuracy
        if predicted_sentiment.lower() == true_sentiment.lower():
            score = self.hyperparameters["correct_score"]
            feedback = f"Correct! Predicted '{predicted_sentiment}' matches true sentiment '{true_sentiment}'"
        elif (
            self.hyperparameters["partial_credit"]
            and predicted_sentiment.lower() == "neutral"
            and true_sentiment.lower() in ["positive", "negative"]
        ):
            # Give partial credit for neutral predictions on polar sentiments
            score = self.hyperparameters["correct_score"] * 0.5
            feedback = f"Partial credit: Predicted neutral for {true_sentiment} sentiment"
        else:
            score = self.hyperparameters["incorrect_score"]
            feedback = f"Incorrect: Predicted '{predicted_sentiment}' but true sentiment is '{true_sentiment}'"

        # Add text length as additional context
        text_length = len(text.split())
        feedback += f" (Text length: {text_length} words)"

        return EvaluatorLog(
            evaluator_id=self.uid,
            name=self.name,
            outputs={"score": score, "feedback": feedback},
        )