From 1ca5a8cb93eeec5badcdf2d1b0f1cb99cf4c3ff1 Mon Sep 17 00:00:00 2001
From: GangGreenTemperTatum <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Thu, 11 Dec 2025 11:54:33 -0500
Subject: [PATCH 1/2] chore: enable native fallback_model parameter to llm_judge() scorer

---
 dreadnode/scorers/judge.py | 45 ++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/dreadnode/scorers/judge.py b/dreadnode/scorers/judge.py
index df5bab28..d2205b06 100644
--- a/dreadnode/scorers/judge.py
+++ b/dreadnode/scorers/judge.py
@@ -42,6 +42,7 @@ def llm_judge(
     input: t.Any | None = None,
     expected_output: t.Any | None = None,
     model_params: rg.GenerateParams | AnyDict | None = None,
+    fallback_model: str | rg.Generator | None = None,
     passing: t.Callable[[float], bool] | None = None,
     min_score: float | None = None,
     max_score: float | None = None,
@@ -56,12 +57,30 @@ def llm_judge(
         input: The input which produced the output for context, if applicable.
         expected_output: The expected output to compare against, if applicable.
         model_params: Optional parameters for the model.
+        fallback_model: Optional fallback model to use if the primary model fails.
         passing: Optional callback to determine if the score is passing based on the score value - overrides any model-specified value.
         min_score: Optional minimum score for the judgement - if provided, the score will be clamped to this value.
         max_score: Optional maximum score for the judgement - if provided, the score will be clamped to this value.
         name: The name of the scorer.
     """
 
+    def _get_generator(
+        model_input: str | rg.Generator, params: rg.GenerateParams | AnyDict | None
+    ) -> rg.Generator:
+        """Helper to create a generator from model string or return existing generator."""
+        if isinstance(model_input, str):
+            return rg.get_generator(
+                model_input,
+                params=params
+                if isinstance(params, rg.GenerateParams)
+                else rg.GenerateParams.model_validate(params)
+                if params
+                else None,
+            )
+        if isinstance(model_input, rg.Generator):
+            return model_input
+        raise TypeError("Model must be a string identifier or a Generator instance.")
+
     async def evaluate(
         data: t.Any,
         *,
@@ -72,24 +91,10 @@ async def evaluate(
         input: t.Any | None = input,
         expected_output: t.Any | None = expected_output,
         model_params: rg.GenerateParams | AnyDict | None = model_params,
+        fallback_model: str | rg.Generator | None = fallback_model,
         min_score: float | None = min_score,
         max_score: float | None = max_score,
     ) -> list[Metric]:
-        generator: rg.Generator
-        if isinstance(model, str):
-            generator = rg.get_generator(
-                model,
-                params=model_params
-                if isinstance(model_params, rg.GenerateParams)
-                else rg.GenerateParams.model_validate(model_params)
-                if model_params
-                else None,
-            )
-        elif isinstance(model, rg.Generator):
-            generator = model
-        else:
-            raise TypeError("Model must be a string identifier or a Generator instance.")
-
         input_data = JudgeInput(
             input=str(input) if input is not None else None,
             expected_output=str(expected_output) if expected_output is not None else None,
@@ -97,7 +102,15 @@ async def evaluate(
             rubric=rubric,
         )
 
-        judgement = await judge.bind(generator)(input_data)
+        # Try primary model, fallback if needed
+        try:
+            generator = _get_generator(model, model_params)
+            judgement = await judge.bind(generator)(input_data)
+        except Exception:
+            if fallback_model is None:
+                raise
+            generator = _get_generator(fallback_model, model_params)
+            judgement = await judge.bind(generator)(input_data)
 
         if min_score is not None:
             judgement.score = max(min_score, judgement.score)
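
Reviewer note (not part of the patch): the minimal sketch below mirrors the control flow this first patch introduces in evaluate() -- try the primary generator, re-raise when no fallback is configured, and only then build and retry with the fallback. It is a self-contained illustration rather than dreadnode or rigging code; PrimaryUnavailable, judge_with(), and the model names are hypothetical stand-ins for judge.bind(generator)(input_data) and rg.get_generator().

import asyncio


class PrimaryUnavailable(Exception):
    """Stand-in for whatever error the primary generator raises."""


async def judge_with(generator_name: str, data: str) -> float:
    # Stand-in for `await judge.bind(generator)(input_data)`; the real call
    # returns a structured judgement, not a bare float.
    if generator_name == "primary":
        raise PrimaryUnavailable("simulated provider outage")
    return 0.75


async def evaluate(data: str, fallback_model: str | None = None) -> float:
    # Same shape as the patch: try the primary model first...
    try:
        return await judge_with("primary", data)
    except Exception:
        # ...and only fall back when a fallback model is configured,
        # otherwise surface the original failure.
        if fallback_model is None:
            raise
        return await judge_with(fallback_model, data)


if __name__ == "__main__":
    print(asyncio.run(evaluate("some model output", fallback_model="fallback")))  # 0.75
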
From 25d63ea82bbd2c12448eff763829b6590eb8f985 Mon Sep 17 00:00:00 2001
From: GangGreenTemperTatum <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Wed, 14 Jan 2026 14:14:47 -0500
Subject: [PATCH 2/2] chore: pr feedback addressed

---
 dreadnode/scorers/judge.py | 76 +++++++++++++++++++++++---------------
 1 file changed, 46 insertions(+), 30 deletions(-)

diff --git a/dreadnode/scorers/judge.py b/dreadnode/scorers/judge.py
index 16c994a7..e5824c16 100644
--- a/dreadnode/scorers/judge.py
+++ b/dreadnode/scorers/judge.py
@@ -1,6 +1,7 @@
 import typing as t
 
 import rigging as rg
+from loguru import logger
 
 from dreadnode.common_types import AnyDict
 from dreadnode.meta import Config
@@ -66,7 +67,7 @@ def llm_judge(
         system_prompt: Optional custom system prompt for the judge. If None, uses default.
     """
 
-    async def evaluate(  # noqa: PLR0912
+    async def evaluate(
         data: t.Any,
         *,
         model: str | rg.Generator = Config(  # noqa: B008
@@ -81,21 +82,26 @@ async def evaluate(  # noqa: PLR0912
         max_score: float | None = max_score,
         system_prompt: str | None = system_prompt,
     ) -> list[Metric]:
-        generator: rg.Generator
-        if isinstance(model, str):
-            generator = rg.get_generator(
-                model,
-                params=model_params
-                if isinstance(model_params, rg.GenerateParams)
-                else rg.GenerateParams.model_validate(model_params)
-                if model_params
-                else None,
-            )
-        elif isinstance(model, rg.Generator):
-            generator = model
-        else:
+        def _create_generator(
+            model: str | rg.Generator,
+            params: rg.GenerateParams | AnyDict | None,
+        ) -> rg.Generator:
+            """Create a Generator from a model identifier or return the Generator instance."""
+            if isinstance(model, str):
+                return rg.get_generator(
+                    model,
+                    params=params
+                    if isinstance(params, rg.GenerateParams)
+                    else rg.GenerateParams.model_validate(params)
+                    if params
+                    else None,
+                )
+            if isinstance(model, rg.Generator):
+                return model
             raise TypeError("Model must be a string identifier or a Generator instance.")
 
+        generator = _create_generator(model, model_params)
+
         input_data = JudgeInput(
             input=str(input) if input is not None else None,
             expected_output=str(expected_output) if expected_output is not None else None,
@@ -103,31 +109,32 @@ async def evaluate(  # noqa: PLR0912
             rubric=rubric,
         )
 
+        # Track fallback usage for observability
+        used_fallback = False
+        primary_error: str | None = None
+
         # Try primary model, fallback if needed
         try:
             pipeline = generator.chat([])
             if system_prompt:
                 pipeline.chat.inject_system_content(system_prompt)
             judgement = await judge.bind(pipeline)(input_data)
-        except Exception:
+        except Exception as e:
             if fallback_model is None:
                 raise
+            # Log primary model failure and fallback usage
+            used_fallback = True
+            primary_error = f"{type(e).__name__}: {e}"
+            primary_model_name = model if isinstance(model, str) else type(model).__name__
+            fallback_model_name = (
+                fallback_model if isinstance(fallback_model, str) else type(fallback_model).__name__
+            )
+            logger.warning(
+                f"Primary model '{primary_model_name}' failed with {primary_error}. "
+                f"Using fallback model '{fallback_model_name}'."
+            )
             # Use fallback model
-            if isinstance(fallback_model, str):
-                generator = rg.get_generator(
-                    fallback_model,
-                    params=model_params
-                    if isinstance(model_params, rg.GenerateParams)
-                    else rg.GenerateParams.model_validate(model_params)
-                    if model_params
-                    else None,
-                )
-            elif isinstance(fallback_model, rg.Generator):
-                generator = fallback_model
-            else:
-                raise TypeError(
-                    "Fallback model must be a string identifier or a Generator instance."
-                ) from None
+            generator = _create_generator(fallback_model, model_params)
             pipeline = generator.chat([])
             if system_prompt:
                 pipeline.chat.inject_system_content(system_prompt)
@@ -145,6 +152,15 @@ async def evaluate(  # noqa: PLR0912
                 value=judgement.score,
                 attributes={
                     "reason": judgement.reason,
+                    "used_fallback": used_fallback,
+                    "fallback_model": (
+                        str(fallback_model)
+                        if isinstance(fallback_model, str)
+                        else type(fallback_model).__name__
+                    )
+                    if used_fallback
+                    else None,
+                    "primary_error": primary_error,
                 },
             )
         pass_metric = Metric(value=float(judgement.passing))
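
Reviewer note (not part of the patch): the helper below is an illustrative consumer of the observability attributes this second patch adds to the score Metric (used_fallback, fallback_model, primary_error, alongside the existing reason). It assumes only what the hunk above shows -- that a Metric exposes .value and an .attributes mapping; summarize_judgement() itself is hypothetical and not part of the dreadnode API.

import typing as t


def summarize_judgement(metrics: t.Sequence[t.Any]) -> None:
    """Print each score metric, flagging judgements that ran on the fallback model."""
    for metric in metrics:
        attrs = getattr(metric, "attributes", None) or {}
        line = f"score={metric.value:.2f} reason={attrs.get('reason')!r}"
        if attrs.get("used_fallback"):
            line += (
                f" (fallback={attrs.get('fallback_model')},"
                f" primary error: {attrs.get('primary_error')})"
            )
        print(line)
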