For strings, checks if expected_output is a substring of output.
For lists/tuples, checks if expected_output is in output.
For dicts, checks if all key-value pairs in expected_output are in output.
Note: case_sensitive only applies when both the value and output are strings.
Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class Contains(Evaluator[object, object, object]):
    """Check if the output contains the expected output.

    For strings, checks if expected_output is a substring of output.
    For lists/tuples, checks if expected_output is in output.
    For dicts, checks if all key-value pairs in expected_output are in output.

    Note: case_sensitive only applies when both the value and output are strings.
    """

    value: Any
    case_sensitive: bool = True
    as_strings: bool = False

    def evaluate(
        self,
        ctx: EvaluatorContext[object, object, object],
    ) -> EvaluationReason:
        # Convert objects to strings if requested
        failure_reason: str | None = None
        as_strings = self.as_strings or (isinstance(self.value, str) and isinstance(ctx.output, str))
        if as_strings:
            output_str = str(ctx.output)
            expected_str = str(self.value)
            if not self.case_sensitive:
                output_str = output_str.lower()
                expected_str = expected_str.lower()

            failure_reason: str | None = None
            if expected_str not in output_str:
                output_trunc = _truncated_repr(output_str, max_length=100)
                expected_trunc = _truncated_repr(expected_str, max_length=100)
                failure_reason = f'Output string {output_trunc} does not contain expected string {expected_trunc}'
            return EvaluationReason(value=failure_reason is None, reason=failure_reason)

        try:
            # Handle different collection types
            if isinstance(ctx.output, dict):
                if isinstance(self.value, dict):
                    # Cast to Any to avoid type checking issues
                    output_dict = cast(dict[Any, Any], ctx.output)  # pyright: ignore[reportUnknownMemberType]
                    expected_dict = cast(dict[Any, Any], self.value)  # pyright: ignore[reportUnknownMemberType]
                    for k in expected_dict:
                        if k not in output_dict:
                            k_trunc = _truncated_repr(k, max_length=30)
                            failure_reason = f'Output dictionary does not contain expected key {k_trunc}'
                            break
                        elif output_dict[k] != expected_dict[k]:
                            k_trunc = _truncated_repr(k, max_length=30)
                            output_v_trunc = _truncated_repr(output_dict[k], max_length=100)
                            expected_v_trunc = _truncated_repr(expected_dict[k], max_length=100)
                            failure_reason = f'Output dictionary has different value for key {k_trunc}: {output_v_trunc} != {expected_v_trunc}'
                            break
                else:
                    if self.value not in ctx.output:  # pyright: ignore[reportUnknownMemberType]
                        output_trunc = _truncated_repr(ctx.output, max_length=200)  # pyright: ignore[reportUnknownMemberType]
                        failure_reason = f'Output {output_trunc} does not contain provided value as a key'
            elif self.value not in ctx.output:  # pyright: ignore[reportOperatorIssue]  # will be handled by except block
                output_trunc = _truncated_repr(ctx.output, max_length=200)
                failure_reason = f'Output {output_trunc} does not contain provided value'
        except (TypeError, ValueError) as e:
            failure_reason = f'Containment check failed: {e}'

        return EvaluationReason(value=failure_reason is None, reason=failure_reason)
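For illustration, here is a minimal sketch of attaching Contains to a dataset, assuming the Case/Dataset API from pydantic_evals; the task function and case values are made up:

```python
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Contains

# Case-insensitive substring check against whatever the task returns.
dataset = Dataset(
    cases=[Case(name='greeting', inputs='World')],
    evaluators=[Contains(value='hello', case_sensitive=False)],
)


async def greet(name: str) -> str:
    return f'Hello, {name}!'


report = dataset.evaluate_sync(greet)
report.print()
```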
Check if the output exactly equals the provided value.
Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class Equals(Evaluator[object, object, object]):
    """Check if the output exactly equals the provided value."""

    value: Any

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> bool:
        return ctx.output == self.value
Check if the output exactly equals the expected output.
Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class EqualsExpected(Evaluator[object, object, object]):
    """Check if the output exactly equals the expected output."""

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> bool | dict[str, bool]:
        if ctx.expected_output is None:
            return {}  # Only compare if expected output is provided
        return ctx.output == ctx.expected_output
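As a quick construction-only sketch of the difference between the two (the values are illustrative):

```python
from pydantic_evals.evaluators import Equals, EqualsExpected

evaluators = [
    Equals(value=42),  # compares ctx.output == 42 on every case
    EqualsExpected(),  # compares ctx.output == ctx.expected_output; skipped (returns {}) when no expected_output is set
]
```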
Check if the span tree contains a span that matches the specified query.
Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class HasMatchingSpan(Evaluator[object, object, object]):
    """Check if the span tree contains a span that matches the specified query."""

    query: SpanQuery

    def evaluate(
        self,
        ctx: EvaluatorContext[object, object, object],
    ) -> bool:
        return ctx.span_tree.any(self.query)
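A construction-only sketch; the SpanQuery key used here (`name_contains`) is an assumption about the query schema in pydantic_evals.otel, so check that module for the exact fields:

```python
from pydantic_evals.evaluators import HasMatchingSpan

# Passes when some recorded span has 'retrieval' in its name (query key is assumed, see above).
span_check = HasMatchingSpan(query={'name_contains': 'retrieval'})
```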
Check if the output is an instance of a type with the given name.
Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class IsInstance(Evaluator[object, object, object]):
    """Check if the output is an instance of a type with the given name."""

    type_name: str

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluationReason:
        output = ctx.output
        for cls in type(output).__mro__:
            if cls.__name__ == self.type_name or cls.__qualname__ == self.type_name:
                return EvaluationReason(value=True)

        reason = f'output is of type {type(output).__name__}'
        if type(output).__qualname__ != type(output).__name__:
            reason += f' (qualname: {type(output).__qualname__})'
        return EvaluationReason(value=False, reason=reason)
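Because the match walks the output type's MRO by name, a base class name also matches outputs of subclasses. A construction-only sketch:

```python
from pydantic_evals.evaluators import IsInstance

evaluators = [
    IsInstance(type_name='str'),      # matches str and any subclass of str
    IsInstance(type_name='MyModel'),  # hypothetical class name; compared against __name__ and __qualname__
]
```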
Judge whether the output of a language model meets the criteria of a provided rubric.
If you do not specify a model, it uses the default model for judging. This starts as 'openai:gpt-4o', but can be
overridden by calling set_default_judge_model.
Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class LLMJudge(Evaluator[object, object, object]):
    """Judge whether the output of a language model meets the criteria of a provided rubric.

    If you do not specify a model, it uses the default model for judging. This starts as 'openai:gpt-4o', but can be
    overridden by calling [`set_default_judge_model`][pydantic_evals.evaluators.llm_as_a_judge.set_default_judge_model].
    """

    rubric: str
    model: models.Model | models.KnownModelName | None = None
    include_input: bool = False

    async def evaluate(
        self,
        ctx: EvaluatorContext[object, object, object],
    ) -> EvaluationReason:
        if self.include_input:
            from .llm_as_a_judge import judge_input_output

            grading_output = await judge_input_output(ctx.inputs, ctx.output, self.rubric, self.model)
        else:
            from .llm_as_a_judge import judge_output

            grading_output = await judge_output(ctx.output, self.rubric, self.model)
        return EvaluationReason(value=grading_output.pass_, reason=grading_output.reason)

    def build_serialization_arguments(self):
        result = super().build_serialization_arguments()
        # always serialize the model as a string when present; use its name if it's a KnownModelName
        if (model := result.get('model')) and isinstance(model, models.Model):
            result['model'] = f'{model.system}:{model.model_name}'
        # Note: this may lead to confusion if you try to serialize-then-deserialize with a custom model.
        # I expect that is rare enough to be worth not solving yet, but common enough that we probably will want to
        # solve it eventually. I'm imagining some kind of model registry, but don't want to work out the details yet.
        return result
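A construction-only sketch; the rubric and model name are illustrative, and running the judge requires credentials for the chosen model provider:

```python
from pydantic_evals.evaluators import LLMJudge

judge = LLMJudge(
    rubric='The response answers the question politely and does not invent facts.',
    model='openai:gpt-4o',  # optional; falls back to the default judge model when omitted
    include_input=True,     # also show the task inputs to the judging model
)
```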
Check if the execution time is under the specified maximum.
Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class MaxDuration(Evaluator[object, object, object]):
    """Check if the execution time is under the specified maximum."""

    seconds: float | timedelta

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> bool:
        duration = timedelta(seconds=ctx.duration)
        seconds = self.seconds
        if not isinstance(seconds, timedelta):
            seconds = timedelta(seconds=seconds)
        return duration <= seconds
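`seconds` accepts either a plain number of seconds or a timedelta, as in this sketch:

```python
from datetime import timedelta

from pydantic_evals.evaluators import MaxDuration

evaluators = [
    MaxDuration(seconds=0.5),                           # half a second, as a float
    MaxDuration(seconds=timedelta(milliseconds=500)),   # equivalent, as a timedelta
]
```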
The output of this evaluator is the result of evaluating the provided Python expression.
WARNING: this evaluator runs arbitrary Python code, so you should NEVER use it with untrusted inputs.
Source code in pydantic_evals/pydantic_evals/evaluators/common.py
@dataclass
class Python(Evaluator[object, object, object]):
    """The output of this evaluator is the result of evaluating the provided Python expression.

    ***WARNING***: this evaluator runs arbitrary Python code, so you should ***NEVER*** use it with untrusted inputs.
    """

    expression: str

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
        # Evaluate the condition, exposing access to the evaluator context as `ctx`.
        return eval(self.expression, {'ctx': ctx})
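A sketch with a trusted, hand-written expression; the evaluator context is exposed to the expression as `ctx`:

```python
from pydantic_evals.evaluators import Python

# Only use expressions you wrote yourself; the string is passed straight to eval().
length_check = Python(expression='len(str(ctx.output)) < 280')
```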
An instance of this class is the sole input to all Evaluators. It contains all the information
needed to evaluate the task execution, including inputs, outputs, metadata, and telemetry data.
Evaluators use this context to access the task inputs, actual output, expected output, and other
information when evaluating the result of the task execution.
Example:
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class ExactMatch(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        # Use the context to access task inputs, outputs, and expected outputs
        return ctx.output == ctx.expected_output
Source code in pydantic_evals/pydantic_evals/evaluators/context.py
@dataclass
class EvaluatorContext(Generic[InputsT, OutputT, MetadataT]):
    """Context for evaluating a task execution.

    An instance of this class is the sole input to all Evaluators. It contains all the information
    needed to evaluate the task execution, including inputs, outputs, metadata, and telemetry data.

    Evaluators use this context to access the task inputs, actual output, expected output, and other
    information when evaluating the result of the task execution.

    Example:
    ```python
    from dataclasses import dataclass

    from pydantic_evals.evaluators import Evaluator, EvaluatorContext


    @dataclass
    class ExactMatch(Evaluator):
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            # Use the context to access task inputs, outputs, and expected outputs
            return ctx.output == ctx.expected_output
    ```
    """

    name: str | None
    """The name of the case."""
    inputs: InputsT
    """The inputs provided to the task for this case."""
    metadata: MetadataT | None
    """Metadata associated with the case, if provided. May be None if no metadata was specified."""
    expected_output: OutputT | None
    """The expected output for the case, if provided. May be None if no expected output was specified."""

    output: OutputT
    """The actual output produced by the task for this case."""
    duration: float
    """The duration of the task run for this case."""
    _span_tree: SpanTree | SpanTreeRecordingError = field(repr=False)
    """The span tree for the task run for this case.

    This will be `None` if `logfire.configure` has not been called.
    """

    attributes: dict[str, Any]
    """Attributes associated with the task run for this case.

    These can be set by calling `pydantic_evals.dataset.set_eval_attribute` in any code executed during the evaluation task."""

    metrics: dict[str, int | float]
    """Metrics associated with the task run for this case.

    These can be set by calling `pydantic_evals.dataset.increment_eval_metric` in any code executed during the evaluation task."""

    @property
    def span_tree(self) -> SpanTree:
        """Get the `SpanTree` for this task execution.

        The span tree is a graph where each node corresponds to an OpenTelemetry span recorded during the task
        execution, including timing information and any custom spans created during execution.

        Returns:
            The span tree for the task execution.

        Raises:
            SpanTreeRecordingError: If spans were not captured during execution of the task, e.g. due to not having
                the necessary dependencies installed.
        """
        if isinstance(self._span_tree, SpanTreeRecordingError):
            # In this case, there was a reason we couldn't record the SpanTree. We raise that now
            raise self._span_tree
        return self._span_tree
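As an illustration of the metrics field, here is a sketch of a custom evaluator that reads a metric the task is assumed to have recorded via pydantic_evals.dataset.increment_eval_metric (the metric name `llm_calls` is hypothetical):

```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class FewLLMCalls(Evaluator):
    max_calls: int = 3

    def evaluate(self, ctx: EvaluatorContext) -> bool:
        # 'llm_calls' is a hypothetical metric the task would increment during execution.
        return ctx.metrics.get('llm_calls', 0) <= self.max_calls
```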
The span tree is a graph where each node corresponds to an OpenTelemetry span recorded during the task
execution, including timing information and any custom spans created during execution.
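For example, a custom evaluator can query the span tree directly; `any` is the method used by HasMatchingSpan above, while the query key used here is an assumption about the SpanQuery schema:

```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class CalledRetriever(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        # Query key is assumed; accessing span_tree raises SpanTreeRecordingError if no spans were captured.
        return ctx.span_tree.any({'name_contains': 'retriever'})
```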
Source code in pydantic_evals/pydantic_evals/evaluators/evaluator.py
@dataclass
class EvaluationReason:
    """The result of running an evaluator with an optional explanation.

    Contains a scalar value and an optional "reason" explaining the value.

    Args:
        value: The scalar result of the evaluation (boolean, integer, float, or string).
        reason: An optional explanation of the evaluation result.
    """

    value: EvaluationScalar
    reason: str | None = None
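A sketch of returning an EvaluationReason from a custom evaluator so that failures carry an explanation:

```python
from dataclasses import dataclass

from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext


@dataclass
class NonEmptyOutput(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
        if ctx.output:
            return EvaluationReason(value=True)
        return EvaluationReason(value=False, reason='output was empty')
```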
@dataclass
class EvaluationResult(Generic[EvaluationScalarT]):
    """The details of an individual evaluation result.

    Contains the name, value, reason, and source evaluator for a single evaluation.

    Args:
        name: The name of the evaluation.
        value: The scalar result of the evaluation.
        reason: An optional explanation of the evaluation result.
        source: The evaluator that produced this result.
    """

    name: str
    value: EvaluationScalarT
    reason: str | None
    source: Evaluator

    def downcast(self, *value_types: type[T]) -> EvaluationResult[T] | None:
        """Attempt to downcast this result to a more specific type.

        Args:
            *value_types: The types to check the value against.

        Returns:
            A downcast version of this result if the value is an instance of one of the given types,
            otherwise None.
        """
        # Check if value matches any of the target types, handling bool as a special case
        for value_type in value_types:
            if isinstance(self.value, value_type):
                # Only match bool with explicit bool type
                if isinstance(self.value, bool) and value_type is not bool:
                    continue
                return cast(EvaluationResult[T], self)
        return None
Source code in pydantic_evals/pydantic_evals/evaluators/evaluator.py
def downcast(self, *value_types: type[T]) -> EvaluationResult[T] | None:
    """Attempt to downcast this result to a more specific type.

    Args:
        *value_types: The types to check the value against.

    Returns:
        A downcast version of this result if the value is an instance of one of the given types,
        otherwise None.
    """
    # Check if value matches any of the target types, handling bool as a special case
    for value_type in value_types:
        if isinstance(self.value, value_type):
            # Only match bool with explicit bool type
            if isinstance(self.value, bool) and value_type is not bool:
                continue
            return cast(EvaluationResult[T], self)
    return None
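A usage sketch, assuming `result` is an EvaluationResult obtained elsewhere (e.g. pulled from an evaluation report):

```python
# `result` is a hypothetical EvaluationResult obtained elsewhere.
assertion = result.downcast(bool)
if assertion is not None:
    print('passed' if assertion.value else f'failed: {assertion.reason}')

score = result.downcast(int, float)  # numeric results only; bool values are excluded unless bool is requested
```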
@dataclass
class Evaluator(Generic[InputsT, OutputT, MetadataT], metaclass=_StrictABCMeta):
    """Base class for all evaluators.

    Evaluators can assess the performance of a task in a variety of ways, as a function of the EvaluatorContext.

    Subclasses must implement the `evaluate` method. Note it can be defined with either `def` or `async def`.

    Example:
    ```python
    from dataclasses import dataclass

    from pydantic_evals.evaluators import Evaluator, EvaluatorContext


    @dataclass
    class ExactMatch(Evaluator):
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            return ctx.output == ctx.expected_output
    ```
    """

    __pydantic_config__ = ConfigDict(arbitrary_types_allowed=True)

    @classmethod
    def name(cls) -> str:
        """Return the 'name' of this Evaluator to use during serialization.

        Returns:
            The name of the Evaluator, which is typically the class name.
        """
        # Note: if we wanted to prefer snake_case, we could use:
        # from pydantic.alias_generators import to_snake
        # return to_snake(cls.__name__)
        return cls.__name__

    @abstractmethod
    def evaluate(
        self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]
    ) -> EvaluatorOutput | Awaitable[EvaluatorOutput]:  # pragma: no cover
        """Evaluate the task output in the given context.

        This is the main evaluation method that subclasses must implement. It can be either synchronous
        or asynchronous, returning either an EvaluatorOutput directly or an Awaitable[EvaluatorOutput].

        Args:
            ctx: The context containing the inputs, outputs, and metadata for evaluation.

        Returns:
            The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
            of evaluation names to either of those. Can be returned either synchronously or as an
            awaitable for asynchronous evaluation.
        """
        raise NotImplementedError('You must implement `evaluate`.')

    def evaluate_sync(self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]) -> EvaluatorOutput:
        """Run the evaluator synchronously, handling both sync and async implementations.

        This method ensures synchronous execution by running any async evaluate implementation
        to completion using run_until_complete.

        Args:
            ctx: The context containing the inputs, outputs, and metadata for evaluation.

        Returns:
            The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
            of evaluation names to either of those.
        """
        output = self.evaluate(ctx)
        if inspect.iscoroutine(output):  # pragma: no cover
            return get_event_loop().run_until_complete(output)
        else:
            return cast(EvaluatorOutput, output)

    async def evaluate_async(self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]) -> EvaluatorOutput:
        """Run the evaluator asynchronously, handling both sync and async implementations.

        This method ensures asynchronous execution by properly awaiting any async evaluate
        implementation. For synchronous implementations, it returns the result directly.

        Args:
            ctx: The context containing the inputs, outputs, and metadata for evaluation.

        Returns:
            The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
            of evaluation names to either of those.
        """
        # Note: If self.evaluate is synchronous, but you need to prevent this from blocking, override this method with:
        # return await anyio.to_thread.run_sync(self.evaluate, ctx)
        output = self.evaluate(ctx)
        if inspect.iscoroutine(output):
            return await output
        else:
            return cast(EvaluatorOutput, output)

    @model_serializer(mode='plain')
    def serialize(self, info: SerializationInfo) -> Any:
        """Serialize this Evaluator to a JSON-serializable form.

        Returns:
            A JSON-serializable representation of this evaluator as an EvaluatorSpec.
        """
        raw_arguments = self.build_serialization_arguments()

        arguments: None | tuple[Any,] | dict[str, Any]
        if len(raw_arguments) == 0:
            arguments = None
        elif len(raw_arguments) == 1:
            arguments = (next(iter(raw_arguments.values())),)
        else:
            arguments = raw_arguments

        return to_jsonable_python(
            EvaluatorSpec(name=self.name(), arguments=arguments),
            context=info.context,
            serialize_unknown=True,
        )

    def build_serialization_arguments(self) -> dict[str, Any]:
        """Build the arguments for serialization.

        Evaluators are serialized for inclusion as the "source" in an `EvaluationResult`.
        If you want to modify how the evaluator is serialized for that or other purposes, you can override this method.

        Returns:
            A dictionary of arguments to be used during serialization.
        """
        raw_arguments: dict[str, Any] = {}
        for field in fields(self):
            value = getattr(self, field.name)
            # always exclude defaults:
            if field.default is not MISSING:
                if value == field.default:
                    continue
            if field.default_factory is not MISSING:
                if value == field.default_factory():
                    continue
            raw_arguments[field.name] = value
        return raw_arguments
The name of the Evaluator, which is typically the class name.
Source code in pydantic_evals/pydantic_evals/evaluators/evaluator.py
@classmethod
def name(cls) -> str:
    """Return the 'name' of this Evaluator to use during serialization.

    Returns:
        The name of the Evaluator, which is typically the class name.
    """
    # Note: if we wanted to prefer snake_case, we could use:
    # from pydantic.alias_generators import to_snake
    # return to_snake(cls.__name__)
    return cls.__name__
This is the main evaluation method that subclasses must implement. It can be either synchronous
or asynchronous, returning either an EvaluatorOutput directly or an Awaitable[EvaluatorOutput].
@abstractmethod
def evaluate(
    self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]
) -> EvaluatorOutput | Awaitable[EvaluatorOutput]:  # pragma: no cover
    """Evaluate the task output in the given context.

    This is the main evaluation method that subclasses must implement. It can be either synchronous
    or asynchronous, returning either an EvaluatorOutput directly or an Awaitable[EvaluatorOutput].

    Args:
        ctx: The context containing the inputs, outputs, and metadata for evaluation.

    Returns:
        The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
        of evaluation names to either of those. Can be returned either synchronously or as an
        awaitable for asynchronous evaluation.
    """
    raise NotImplementedError('You must implement `evaluate`.')
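For example, a single evaluator may return a mapping of named results, mixing plain scalars and EvaluationReasons, as in this sketch:

```python
from dataclasses import dataclass

from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext


@dataclass
class LengthChecks(Evaluator):
    max_chars: int = 280

    def evaluate(self, ctx: EvaluatorContext) -> dict[str, bool | EvaluationReason]:
        text = str(ctx.output)
        return {
            'non_empty': bool(text),
            'within_limit': EvaluationReason(
                value=len(text) <= self.max_chars,
                reason=f'{len(text)} characters (limit {self.max_chars})',
            ),
        }
```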
def evaluate_sync(self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]) -> EvaluatorOutput:
    """Run the evaluator synchronously, handling both sync and async implementations.

    This method ensures synchronous execution by running any async evaluate implementation
    to completion using run_until_complete.

    Args:
        ctx: The context containing the inputs, outputs, and metadata for evaluation.

    Returns:
        The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
        of evaluation names to either of those.
    """
    output = self.evaluate(ctx)
    if inspect.iscoroutine(output):  # pragma: no cover
        return get_event_loop().run_until_complete(output)
    else:
        return cast(EvaluatorOutput, output)
Run the evaluator asynchronously, handling both sync and async implementations.
This method ensures asynchronous execution by properly awaiting any async evaluate
implementation. For synchronous implementations, it returns the result directly.
async def evaluate_async(self, ctx: EvaluatorContext[InputsT, OutputT, MetadataT]) -> EvaluatorOutput:
    """Run the evaluator asynchronously, handling both sync and async implementations.

    This method ensures asynchronous execution by properly awaiting any async evaluate
    implementation. For synchronous implementations, it returns the result directly.

    Args:
        ctx: The context containing the inputs, outputs, and metadata for evaluation.

    Returns:
        The evaluation result, which can be a scalar value, an EvaluationReason, or a mapping
        of evaluation names to either of those.
    """
    # Note: If self.evaluate is synchronous, but you need to prevent this from blocking, override this method with:
    # return await anyio.to_thread.run_sync(self.evaluate, ctx)
    output = self.evaluate(ctx)
    if inspect.iscoroutine(output):
        return await output
    else:
        return cast(EvaluatorOutput, output)
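Following the note in the source, a blocking synchronous `evaluate` can be pushed onto a worker thread by overriding `evaluate_async`; `expensive_check` here is a hypothetical helper:

```python
from dataclasses import dataclass

import anyio.to_thread

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


def expensive_check(output: object) -> bool:
    ...  # hypothetical blocking check


@dataclass
class SlowCheck(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        return expensive_check(ctx.output)

    async def evaluate_async(self, ctx: EvaluatorContext) -> bool:
        # Run the blocking sync implementation on a worker thread instead of the event loop.
        return await anyio.to_thread.run_sync(self.evaluate, ctx)
```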
@model_serializer(mode='plain')
def serialize(self, info: SerializationInfo) -> Any:
    """Serialize this Evaluator to a JSON-serializable form.

    Returns:
        A JSON-serializable representation of this evaluator as an EvaluatorSpec.
    """
    raw_arguments = self.build_serialization_arguments()

    arguments: None | tuple[Any,] | dict[str, Any]
    if len(raw_arguments) == 0:
        arguments = None
    elif len(raw_arguments) == 1:
        arguments = (next(iter(raw_arguments.values())),)
    else:
        arguments = raw_arguments

    return to_jsonable_python(
        EvaluatorSpec(name=self.name(), arguments=arguments),
        context=info.context,
        serialize_unknown=True,
    )
Evaluators are serialized for inclusion as the "source" in an EvaluationResult.
If you want to modify how the evaluator is serialized for that or other purposes, you can override this method.
def build_serialization_arguments(self) -> dict[str, Any]:
    """Build the arguments for serialization.

    Evaluators are serialized for inclusion as the "source" in an `EvaluationResult`.
    If you want to modify how the evaluator is serialized for that or other purposes, you can override this method.

    Returns:
        A dictionary of arguments to be used during serialization.
    """
    raw_arguments: dict[str, Any] = {}
    for field in fields(self):
        value = getattr(self, field.name)
        # always exclude defaults:
        if field.default is not MISSING:
            if value == field.default:
                continue
        if field.default_factory is not MISSING:
            if value == field.default_factory():
                continue
        raw_arguments[field.name] = value
    return raw_arguments
Source code in pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
class GradingOutput(BaseModel, populate_by_name=True):
    """The output of a grading operation."""

    reason: str
    pass_: bool = Field(validation_alias='pass', serialization_alias='pass')
    score: float
If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
but this can be changed using the set_default_judge_model function.
Source code in pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
async def judge_output(
    output: Any, rubric: str, model: models.Model | models.KnownModelName | None = None
) -> GradingOutput:
    """Judge the output of a model based on a rubric.

    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
    but this can be changed using the `set_default_judge_model` function.
    """
    user_prompt = f'<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
    return (await _judge_output_agent.run(user_prompt, model=model or _default_model)).output
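A sketch of calling `judge_output` directly from async code; the output and rubric strings are illustrative, and a configured model provider is required:

```python
from pydantic_evals.evaluators.llm_as_a_judge import judge_output


async def main() -> None:
    grading = await judge_output(
        output='Paris is the capital of France.',
        rubric='The answer names the correct capital city.',
    )
    print(grading.pass_, grading.score, grading.reason)
```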
Judge the output of a model based on the inputs and a rubric.
If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
but this can be changed using the set_default_judge_model function.
Source code in pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
async def judge_input_output(
    inputs: Any, output: Any, rubric: str, model: models.Model | models.KnownModelName | None = None
) -> GradingOutput:
    """Judge the output of a model based on the inputs and a rubric.

    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
    but this can be changed using the `set_default_judge_model` function.
    """
    user_prompt = f'<Input>\n{_stringify(inputs)}\n</Input>\n<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
    return (await _judge_input_output_agent.run(user_prompt, model=model or _default_model)).output
This model is used if None is passed to the model argument of judge_output and judge_input_output.
Source code in pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
    """Set the default model used for judging.

    This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
    """
    global _default_model
    _default_model = model
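A sketch of switching the default judge model; the model name here is illustrative:

```python
from pydantic_evals.evaluators.llm_as_a_judge import set_default_judge_model

# LLMJudge instances and judge_* calls that don't specify a model will now use this one.
set_default_judge_model('openai:gpt-4o-mini')
```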