Skip to content

Code

app.agents.agent_factories

Agent factory functions for creating PydanticAI agents.

This module provides factory functions for creating different types of agents with appropriate models, tools, and configurations. It separates agent creation logic from model creation and orchestration.

Classes

AgentFactory

Factory class for creating different types of agents.

Source code in src/app/agents/agent_factories.py
class AgentFactory:
    """Factory class for creating different types of agents.

    Models are created lazily on first use and cached. The cache records
    which optional models were requested so that a later request for an
    additional model (e.g. the researcher) triggers re-creation of the
    model set instead of failing against a stale, smaller ModelDict.
    """

    def __init__(self, endpoint_config: EndpointConfig | None = None):
        """Initialize agent factory with model configuration.

        Args:
            endpoint_config: Provider/endpoint configuration used to create
                models. When None, get_models() returns an all-None ModelDict.
        """
        self.endpoint_config = endpoint_config
        self._models: ModelDict | None = None
        # Flags the cached ModelDict was built with (researcher, analyst,
        # synthesiser); used to detect when a caller needs a model that the
        # cached set does not contain.
        self._cached_flags: tuple[bool, bool, bool] | None = None

    def get_models(
        self,
        include_researcher: bool = False,
        include_analyst: bool = False,
        include_synthesiser: bool = False,
    ) -> ModelDict:
        """Get or create models for agents.

        Rebuilds the cached models when a previously-unrequested optional
        model is now needed. This fixes the stale-cache failure where e.g.
        create_manager_agent() running first cached a ModelDict without a
        researcher model, making a later create_researcher_agent() fail.

        Returns:
            ModelDict with the requested models, or an all-None ModelDict
            when no endpoint configuration is available.
        """
        requested = (include_researcher, include_analyst, include_synthesiser)
        needs_rebuild = self._models is None or (
            self._cached_flags is not None
            and any(req and not had for req, had in zip(requested, self._cached_flags))
        )
        if needs_rebuild and self.endpoint_config:
            # Merge with previously-requested flags so models created for
            # earlier callers remain available after the rebuild.
            merged = (
                requested
                if self._cached_flags is None
                else tuple(a or b for a, b in zip(requested, self._cached_flags))
            )
            self._models = create_agent_models(
                self.endpoint_config,
                include_researcher=merged[0],
                include_analyst=merged[1],
                include_synthesiser=merged[2],
            )
            self._cached_flags = (merged[0], merged[1], merged[2])
        return self._models or ModelDict.model_construct(
            model_manager=None,
            model_researcher=None,
            model_analyst=None,
            model_synthesiser=None,
        )

    def create_manager_agent(self, system_prompt: str | None = None) -> Agent:
        """Create a manager agent with delegation capabilities.

        Args:
            system_prompt: Optional custom prompt; a default manager prompt
                is used when None.

        Raises:
            ValueError: If no manager model is available.
        """
        models = self.get_models()
        if not models.model_manager:
            raise ValueError("Manager model not available")

        agent = Agent(
            model=models.model_manager,
            system_prompt=system_prompt
            or "You are a manager agent responsible for coordinating tasks.",
        )

        logger.info("Created manager agent")
        return agent

    def create_researcher_agent(self, system_prompt: str | None = None) -> Agent:
        """Create a researcher agent for information gathering.

        Args:
            system_prompt: Optional custom prompt; a default researcher
                prompt is used when None.

        Raises:
            ValueError: If no researcher model is available.
        """
        models = self.get_models(include_researcher=True)
        if not models.model_researcher:
            raise ValueError("Researcher model not available")

        agent = Agent(
            model=models.model_researcher,
            system_prompt=system_prompt
            or "You are a researcher agent specialized in information gathering.",
        )

        logger.info("Created researcher agent")
        return agent

    def create_analyst_agent(self, system_prompt: str | None = None) -> Agent:
        """Create an analyst agent for data analysis.

        Args:
            system_prompt: Optional custom prompt; a default analyst prompt
                is used when None.

        Raises:
            ValueError: If no analyst model is available.
        """
        models = self.get_models(include_analyst=True)
        if not models.model_analyst:
            raise ValueError("Analyst model not available")

        agent = Agent(
            model=models.model_analyst,
            system_prompt=system_prompt or "You are an analyst agent specialized in data analysis.",
        )

        logger.info("Created analyst agent")
        return agent

    def create_synthesiser_agent(self, system_prompt: str | None = None) -> Agent:
        """Create a synthesiser agent for combining results.

        Args:
            system_prompt: Optional custom prompt; a default synthesiser
                prompt is used when None.

        Raises:
            ValueError: If no synthesiser model is available.
        """
        models = self.get_models(include_synthesiser=True)
        if not models.model_synthesiser:
            raise ValueError("Synthesiser model not available")

        agent = Agent(
            model=models.model_synthesiser,
            system_prompt=system_prompt
            or "You are a synthesiser agent specialized in combining information.",
        )

        logger.info("Created synthesiser agent")
        return agent
Functions
__init__(endpoint_config=None)

Initialize agent factory with model configuration.

Source code in src/app/agents/agent_factories.py
def __init__(self, endpoint_config: EndpointConfig | None = None):
    """Initialize agent factory with model configuration."""
    self.endpoint_config = endpoint_config
    # Lazily-created model cache; populated on the first get_models() call.
    self._models: ModelDict | None = None
create_analyst_agent(system_prompt=None)

Create an analyst agent for data analysis.

Source code in src/app/agents/agent_factories.py
def create_analyst_agent(self, system_prompt: str | None = None) -> Agent:
    """Create an analyst agent for data analysis.

    Args:
        system_prompt: Optional custom prompt; a default analyst prompt is
            used when None.

    Raises:
        ValueError: If no analyst model is available.
    """
    models = self.get_models(include_analyst=True)
    if not models.model_analyst:
        raise ValueError("Analyst model not available")

    agent = Agent(
        model=models.model_analyst,
        system_prompt=system_prompt or "You are an analyst agent specialized in data analysis.",
    )

    logger.info("Created analyst agent")
    return agent
create_manager_agent(system_prompt=None)

Create a manager agent with delegation capabilities.

Source code in src/app/agents/agent_factories.py
def create_manager_agent(self, system_prompt: str | None = None) -> Agent:
    """Create a manager agent with delegation capabilities.

    Args:
        system_prompt: Optional custom prompt; a default manager prompt is
            used when None.

    Raises:
        ValueError: If no manager model is available.
    """
    models = self.get_models()
    if not models.model_manager:
        raise ValueError("Manager model not available")

    agent = Agent(
        model=models.model_manager,
        system_prompt=system_prompt
        or "You are a manager agent responsible for coordinating tasks.",
    )

    logger.info("Created manager agent")
    return agent
create_researcher_agent(system_prompt=None)

Create a researcher agent for information gathering.

Source code in src/app/agents/agent_factories.py
def create_researcher_agent(self, system_prompt: str | None = None) -> Agent:
    """Create a researcher agent for information gathering.

    Args:
        system_prompt: Optional custom prompt; a default researcher prompt
            is used when None.

    Raises:
        ValueError: If no researcher model is available.
    """
    models = self.get_models(include_researcher=True)
    if not models.model_researcher:
        raise ValueError("Researcher model not available")

    agent = Agent(
        model=models.model_researcher,
        system_prompt=system_prompt
        or "You are a researcher agent specialized in information gathering.",
    )

    logger.info("Created researcher agent")
    return agent
create_synthesiser_agent(system_prompt=None)

Create a synthesiser agent for combining results.

Source code in src/app/agents/agent_factories.py
def create_synthesiser_agent(self, system_prompt: str | None = None) -> Agent:
    """Create a synthesiser agent for combining results.

    Args:
        system_prompt: Optional custom prompt; a default synthesiser prompt
            is used when None.

    Raises:
        ValueError: If no synthesiser model is available.
    """
    models = self.get_models(include_synthesiser=True)
    if not models.model_synthesiser:
        raise ValueError("Synthesiser model not available")

    agent = Agent(
        model=models.model_synthesiser,
        system_prompt=system_prompt
        or "You are a synthesiser agent specialized in combining information.",
    )

    logger.info("Created synthesiser agent")
    return agent
get_models(include_researcher=False, include_analyst=False, include_synthesiser=False)

Get or create models for agents.

Source code in src/app/agents/agent_factories.py
def get_models(
    self,
    include_researcher: bool = False,
    include_analyst: bool = False,
    include_synthesiser: bool = False,
) -> ModelDict:
    """Get or create models for agents.

    NOTE(review): the include_* flags only take effect on the call that
    first populates the cache; later calls return the cached ModelDict
    unchanged even if it lacks a newly requested model — confirm this is
    intended for callers that request additional models later.
    """
    if self._models is None and self.endpoint_config:
        self._models = create_agent_models(
            self.endpoint_config,
            include_researcher=include_researcher,
            include_analyst=include_analyst,
            include_synthesiser=include_synthesiser,
        )
    # Fall back to an all-None ModelDict when no endpoint config was given.
    return self._models or ModelDict.model_construct(
        model_manager=None,
        model_researcher=None,
        model_analyst=None,
        model_synthesiser=None,
    )

Functions

create_evaluation_agent(provider, model_name, assessment_type, api_key=None, system_prompt=None, prompts=None)

Create an agent specifically for evaluation tasks.

Parameters:

Name Type Description Default
provider str

LLM provider (e.g., “openai”, “github”)

required
model_name str

Model name (e.g., “gpt-4o-mini”)

required
assessment_type str

Type of assessment (e.g., “technical_accuracy”)

required
api_key str | None

API key (optional)

None
system_prompt str | None

Custom system prompt (optional)

None
prompts dict[str, str] | None

Prompt configuration dictionary (optional)

None

Returns:

Type Description
Agent

Agent configured for evaluation tasks

Source code in src/app/agents/agent_factories.py
def create_evaluation_agent(
    provider: str,
    model_name: str,
    assessment_type: str,
    api_key: str | None = None,
    system_prompt: str | None = None,
    prompts: dict[str, str] | None = None,
) -> Agent:
    """
    Create an agent specifically for evaluation tasks.

    Args:
        provider: LLM provider (e.g., "openai", "github")
        model_name: Model name (e.g., "gpt-4o-mini")
        assessment_type: Type of assessment (e.g., "technical_accuracy")
        api_key: API key (optional)
        system_prompt: Custom system prompt (optional)
        prompts: Prompt configuration dictionary (optional)

    Returns:
        Agent configured for evaluation tasks
    """
    model = create_simple_model(provider, model_name, api_key)

    # Try to get system prompt from prompts config first.
    # All known assessment types map to the same key pattern
    # "system_prompt_evaluator_<type>"; unknown types fall back to the
    # general evaluator key. (Replaces a dict whose values were identical.)
    known_types = {"technical_accuracy", "constructiveness", "planning_rationality"}
    if system_prompt is None and prompts:
        prompt_key = (
            f"system_prompt_evaluator_{assessment_type}"
            if assessment_type in known_types
            else "system_prompt_evaluator_general"
        )
        system_prompt = prompts.get(prompt_key)

    # Fallback to default prompts if not found in config
    if system_prompt is None:
        default_prompts = {
            "technical_accuracy": (
                "You are an expert at evaluating technical accuracy of reviews. "
                "Focus on factual correctness and methodology understanding."
            ),
            "constructiveness": (
                "You are an expert at evaluating constructiveness of academic reviews. "
                "Focus on actionable feedback and balanced critique."
            ),
            "planning_rationality": (
                "You are an expert at evaluating planning quality of agent executions. "
                "Focus on logical flow and decision quality."
            ),
            "general": (
                "You are an expert evaluator providing structured assessments "
                "of text quality and content."
            ),
        }
        system_prompt = default_prompts.get(assessment_type, default_prompts["general"])

    agent = Agent(
        model=model,
        system_prompt=system_prompt,
    )

    logger.info(f"Created evaluation agent for {assessment_type} using {provider}/{model_name}")
    return agent

create_simple_agent(model, system_prompt)

Create a simple agent with provided model and prompt.

Parameters:

Name Type Description Default
model Model

PydanticAI model instance

required
system_prompt str

System prompt for the agent

required

Returns:

Type Description
Agent

Configured Agent instance

Source code in src/app/agents/agent_factories.py
def create_simple_agent(model: Model, system_prompt: str) -> Agent:
    """
    Build a basic agent from an existing model and system prompt.

    Args:
        model: PydanticAI model instance to drive the agent
        system_prompt: System prompt assigned to the agent

    Returns:
        The newly constructed Agent instance
    """
    simple_agent = Agent(
        model=model,
        system_prompt=system_prompt,
    )
    logger.info("Created simple agent")
    return simple_agent

app.agents.agent_system

Agent system utilities for orchestrating multi-agent workflows.

This module provides functions and helpers to create, configure, and run agent systems using Pydantic AI. It supports delegation of tasks to research, analysis, and synthesis agents, and manages agent configuration, environment setup, and execution. Args: provider (str): The name of the provider. provider_config (ProviderConfig): Configuration settings for the provider. api_key (str): API key for authentication with the provider. prompts (dict[str, str]): Configuration for prompts. include_researcher (bool): Flag to include the researcher agent. include_analyst (bool): Flag to include the analyst agent. include_synthesiser (bool): Flag to include the synthesiser agent. query (str | list[dict[str, str]]): The query or messages for the agent. chat_config (ChatConfig): The configuration object for agents and providers. usage_limits (UsageLimits): Usage limits for agent execution.

Functions:

Name Description
get_manager

Initializes and returns a manager agent with the specified configuration.

run_manager

Asynchronously runs the manager agent with the given query and provider.

setup_agent_env

Sets up the environment for an agent by configuring provider settings, prompts, API key, and usage limits.

Classes

Functions

get_manager(provider, provider_config, api_key, prompts, include_researcher=False, include_analyst=False, include_synthesiser=False, enable_review_tools=False)

Initializes and returns an Agent manager with the specified configuration. Args: provider (str): The name of the provider. provider_config (ProviderConfig): Configuration settings for the provider. api_key (str): API key for authentication with the provider. prompts (PromptsConfig): Configuration for prompts. include_researcher (bool, optional): Flag to include the researcher model. Defaults to False. include_analyst (bool, optional): Flag to include the analyst model. Defaults to False. include_synthesiser (bool, optional): Flag to include the synthesiser model. Defaults to False. Returns: Agent: The initialized Agent manager.

Source code in src/app/agents/agent_system.py
def get_manager(
    provider: str,
    provider_config: ProviderConfig,
    api_key: str | None,
    prompts: dict[str, str],
    include_researcher: bool = False,
    include_analyst: bool = False,
    include_synthesiser: bool = False,
    enable_review_tools: bool = False,
) -> Agent[None, BaseModel]:
    """
    Initializes and returns an Agent manager with the specified configuration.
    Args:
        provider (str): The name of the provider.
        provider_config (ProviderConfig): Configuration settings for the provider.
        api_key (str): API key for authentication with the provider.
        prompts (PromptsConfig): Configuration for prompts.
        include_researcher (bool, optional): Flag to include the researcher model.
            Defaults to False.
        include_analyst (bool, optional): Flag to include the analyst model.
            Defaults to False.
        include_synthesiser (bool, optional): Flag to include the synthesiser model.
            Defaults to False.
        enable_review_tools (bool, optional): Flag to enable review tools on the
            manager. Defaults to False.
    Returns:
        Agent: The initialized Agent manager.
    """

    model_config = EndpointConfig.model_validate(
        {
            "provider": provider,
            "prompts": prompts,
            "api_key": api_key,
            "provider_config": provider_config,
        }
    )
    models = create_agent_models(
        model_config, include_researcher, include_analyst, include_synthesiser
    )
    # Cap tool/content length; 15000 is the fallback when the provider
    # config does not define max_content_length.
    max_content_length = provider_config.max_content_length or 15000
    manager = _create_manager(prompts, models, provider, enable_review_tools, max_content_length)

    return manager

initialize_logfire_instrumentation_from_settings(settings=None)

Initialize Logfire instrumentation from JudgeSettings.

Uses logfire.instrument_pydantic_ai() for automatic tracing. No manual decorators needed - all PydanticAI agents auto-instrumented.

Parameters:

Name Type Description Default
settings JudgeSettings | None

JudgeSettings instance. If None, uses default JudgeSettings().

None
Source code in src/app/agents/agent_system.py
def initialize_logfire_instrumentation_from_settings(
    settings: JudgeSettings | None = None,
) -> None:
    """Initialize Logfire instrumentation from JudgeSettings.

    Uses logfire.instrument_pydantic_ai() for automatic tracing.
    No manual decorators needed - all PydanticAI agents auto-instrumented.

    Args:
        settings: JudgeSettings instance. If None, uses default JudgeSettings().
    """
    try:
        # Fall back to default settings when the caller supplied none.
        effective_settings = JudgeSettings() if settings is None else settings
        logfire_config = LogfireConfig.from_settings(effective_settings)
        initialize_logfire_instrumentation(logfire_config)
        logger.info(f"Logfire instrumentation initialized: enabled={logfire_config.enabled}")
    except Exception as e:
        # Instrumentation is best-effort: never let tracing setup crash the app.
        logger.warning(f"Failed to initialize Logfire instrumentation: {e}")

resilient_tool_wrapper(tool)

Wrap a PydanticAI Tool so HTTP and network errors return error strings.

Search tools are supplementary — when they fail, the agent should receive a descriptive error message and continue generating output from paper content and model knowledge. This prevents a search outage from crashing the run.

Catches
  • httpx.HTTPStatusError (403 Forbidden, 429 Too Many Requests, etc.)
  • httpx.HTTPError (broader httpx network errors)
  • Exception (any other network or library failure)

Parameters:

Name Type Description Default
tool Tool[Any]

The original PydanticAI Tool to wrap.

required

Returns:

Type Description
Tool[Any]

A new Tool with the same name and description, but with a resilient

Tool[Any]

function that catches search errors and returns a descriptive string.

Source code in src/app/agents/agent_system.py
def resilient_tool_wrapper(tool: Tool[Any]) -> Tool[Any]:
    """Wrap a PydanticAI Tool so HTTP and network errors return error strings.

    Search tools are supplementary — when they fail, the agent should receive a
    descriptive error message and continue generating output from paper content
    and model knowledge. This prevents a search outage from crashing the run.

    Supports both async and sync tool functions: the previous implementation
    unconditionally awaited the wrapped function, so a sync tool raised
    TypeError and was misreported as a tool outage.

    Catches:
        - httpx.HTTPStatusError (403 Forbidden, 429 Too Many Requests, etc.)
        - httpx.HTTPError (broader httpx network errors)
        - Exception (any other network or library failure)

    Args:
        tool: The original PydanticAI Tool to wrap.

    Returns:
        A new Tool with the same name and description, but with a resilient
        function that catches search errors and returns a descriptive string.
    """
    import inspect  # Local import: only needed for the awaitable check below.

    original_fn: Callable[..., Any] = tool.function

    async def _resilient(*args: Any, **kwargs: Any) -> Any:
        try:
            result = original_fn(*args, **kwargs)
            # Awaiting a plain (sync) return value would raise TypeError and
            # be swallowed below; only await genuine awaitables.
            if inspect.isawaitable(result):
                result = await result
            return result
        except httpx.HTTPStatusError as exc:
            status = exc.response.status_code
            url = str(exc.request.url) if exc.request else "unknown"
            logger.warning(f"Search tool '{tool.name}' HTTP {status} error for URL {url}: {exc}")
            return (
                f"Search tool '{tool.name}' is currently unavailable "
                f"(HTTP {status}). Proceed using paper content and model knowledge."
            )
        except httpx.HTTPError as exc:
            logger.warning(f"Search tool '{tool.name}' network error: {exc}")
            return (
                f"Search tool '{tool.name}' is currently unavailable "
                f"(network error). Proceed using paper content and model knowledge."
            )
        except Exception as exc:
            logger.warning(f"Search tool '{tool.name}' failed: {type(exc).__name__}: {exc}")
            return (
                f"Search tool '{tool.name}' is currently unavailable "
                f"({type(exc).__name__}). Proceed using paper content and model knowledge."
            )

    return Tool(
        _resilient,
        name=tool.name,
        description=tool.description,
    )

run_manager(manager, query, provider, usage_limits, execution_id=None) async

Asynchronously run the manager with the given query and provider.

Auto-instrumented by logfire.instrument_pydantic_ai() - no manual decorators needed.

Parameters:

Name Type Description Default
manager Agent[None, BaseModel]

The system agent responsible for running the query.

required
query UserPromptType

The query to be processed by the manager.

required
provider str

The provider to be used for the query.

required
usage_limits UsageLimits | None

The usage limits to be applied during the query execution.

required
execution_id str | None

Optional pre-generated execution ID. When provided, used as-is; otherwise a new exec_{hex12} ID is generated.

None

Returns:

Type Description
tuple[str, Any]

Tuple of (execution_id, manager_output) for trace retrieval and evaluation.

Source code in src/app/agents/agent_system.py
async def run_manager(
    manager: Agent[None, BaseModel],
    query: UserPromptType,
    provider: str,
    usage_limits: UsageLimits | None,
    execution_id: str | None = None,
) -> tuple[str, Any]:
    """Asynchronously run the manager with the given query and provider.

    Auto-instrumented by logfire.instrument_pydantic_ai() - no manual decorators needed.

    Args:
        manager: The system agent responsible for running the query.
        query: The query to be processed by the manager.
        provider: The provider to be used for the query.
        usage_limits: The usage limits to be applied during the query execution.
        execution_id: Optional pre-generated execution ID. When provided, used
            as-is; otherwise a new ``exec_{hex12}`` ID is generated.

    Returns:
        Tuple of (execution_id, manager_output) for trace retrieval and evaluation.

    Raises:
        SystemExit: On rate-limit (HTTP 429) or usage-limit exhaustion.
    """
    # Initialize trace collection
    trace_collector = get_trace_collector()
    if execution_id is None:
        execution_id = f"exec_{uuid.uuid4().hex[:12]}"
    trace_collector.start_execution(execution_id)

    # Resolve a displayable model name: manager.model may be a string, a
    # model object with .model_name, or absent entirely.
    model_obj = getattr(manager, "model", None)
    model_name = (
        model_obj
        if isinstance(model_obj, str)
        else (getattr(model_obj, "model_name", "unknown") if model_obj else "unknown")
    )
    logger.info(f"Researching with {provider}({model_name}) and Topic: {query} ...")

    try:
        logger.info("Waiting for model response ...")
        # Narrow query type for PydanticAI Agent.run() compatibility
        # (isinstance with a `str | None` union requires Python 3.10+;
        # non-str, non-None queries are stringified).
        user_prompt: str | None = query if isinstance(query, str | None) else str(query)
        result = await manager.run(user_prompt=user_prompt, usage_limits=usage_limits)
        logger.info(f"Result: {result}")
        logger.info(f"Usage statistics: {result.usage()}")

        # Finalize trace collection
        trace_collector.end_execution()
        logger.info(f"Trace collection completed for execution: {execution_id}")

        return execution_id, result.output

    except ModelHTTPError as e:
        trace_collector.end_execution()
        if e.status_code == 429:
            detail = _extract_rate_limit_detail(e)
            logger.error(f"Rate limit exceeded for {provider}({model_name}): {detail}")
            raise SystemExit(1) from e
        # NOTE(review): _handle_model_http_error is presumably expected to
        # raise; otherwise this branch falls through and the function
        # implicitly returns None, violating the declared tuple return —
        # verify against its definition.
        _handle_model_http_error(e, provider, model_name)

    except UsageLimitExceeded as e:
        trace_collector.end_execution()
        logger.error(f"Token limit reached for {provider}({model_name}): {e}")
        raise SystemExit(1) from e

    except Exception as e:
        # Ensure the trace is closed before propagating unexpected errors.
        trace_collector.end_execution()
        logger.error(f"Error in run_manager: {e}")
        raise

setup_agent_env(provider, query, chat_config, chat_env_config, token_limit=None)

Sets up the environment for an agent by configuring provider settings, prompts, API key, and usage limits.

Parameters:

Name Type Description Default
provider str

The name of the provider.

required
query UserPromptType

The messages or queries to be sent to the agent.

required
chat_config ChatConfig | BaseModel

The configuration object containing provider and prompt settings.

required
chat_env_config AppEnv

The application environment configuration containing API keys.

required
token_limit int | None

Optional token limit override (CLI/GUI param). Priority: CLI/GUI > env var > config. Valid range: 1000-1000000.

None

Returns:

Name Type Description
EndpointConfig EndpointConfig

The configuration object for the agent.

Source code in src/app/agents/agent_system.py
def setup_agent_env(
    provider: str,
    query: UserPromptType,
    chat_config: ChatConfig | BaseModel,
    chat_env_config: AppEnv,
    token_limit: int | None = None,
) -> EndpointConfig:
    """
    Build the agent EndpointConfig from provider settings, prompts, API key,
    and usage limits.

    Args:
        provider (str): The name of the provider.
        query (UserPromptType): The messages or queries to be sent to the agent.
        chat_config (ChatConfig | BaseModel): The configuration object containing
            provider and prompt settings; must be a ChatConfig instance.
        chat_env_config (AppEnv): The application environment configuration
            containing API keys.
        token_limit (int | None): Optional token limit override (CLI/GUI param).
            Priority: CLI/GUI > env var > config. Valid range: 1000-1000000.

    Returns:
        EndpointConfig: The configuration object for the agent.

    Raises:
        TypeError: If chat_config is not a ChatConfig instance.
        ValueError: If a required API key is not set for the provider.
    """

    if not isinstance(chat_config, ChatConfig):
        raise TypeError("'chat_config' of invalid type: ChatConfig expected")

    provider_config = get_provider_config(provider, chat_config.providers)
    is_api_key, api_key_msg = get_api_key(provider, chat_env_config)

    # Ollama runs locally and needs no API key; every other provider does.
    if provider.lower() != "ollama" and not is_api_key:
        msg = f"API key for provider '{provider}' is not set."
        logger.error(msg)
        raise ValueError(msg)

    # Resolve the token limit with priority CLI/GUI > env var > config,
    # then validate it and convert it into UsageLimits.
    effective_limit = _determine_effective_token_limit(
        token_limit, chat_env_config, provider_config
    )
    _validate_token_limit(effective_limit)

    return EndpointConfig.model_validate(
        {
            "provider": provider,
            "query": query,
            "api_key": api_key_msg,
            "prompts": chat_config.prompts,
            "provider_config": provider_config,
            "usage_limits": _create_usage_limits(effective_limit),
        }
    )

app.agents.logfire_instrumentation

Logfire tracing instrumentation for PydanticAI agents.

Uses Logfire’s native PydanticAI auto-instrumentation via logfire.instrument_pydantic_ai(). No manual decorators or wrappers needed.

Classes

LogfireInstrumentationManager

Manages Logfire tracing instrumentation for PydanticAI agents.

Uses logfire.instrument_pydantic_ai() for automatic instrumentation of all PydanticAI agent execution. No manual decorators required.

Source code in src/app/agents/logfire_instrumentation.py
class LogfireInstrumentationManager:
    """Manages Logfire tracing instrumentation for PydanticAI agents.

    Uses logfire.instrument_pydantic_ai() for automatic instrumentation
    of all PydanticAI agent execution. No manual decorators required.
    """

    def __init__(self, config: LogfireConfig):
        # Note: construction immediately attempts initialization; failures
        # are handled internally by disabling config.enabled.
        self.config = config
        self._initialize_logfire()

    def _initialize_logfire(self) -> None:
        """Initialize Logfire with Phoenix OTLP endpoint.

        Checks OTLP endpoint connectivity before initialization to prevent
        noisy stack traces when endpoint is unreachable. Logs single warning
        and disables tracing gracefully.
        """
        if not self.config.enabled:
            logger.info("Logfire tracing disabled")
            return

        if not _logfire_available:
            logger.warning("Logfire library not available, tracing disabled")
            self.config.enabled = False
            return

        try:
            # _configure_phoenix_endpoint raises ConnectionError when the
            # Phoenix endpoint is unreachable; that is deliberate control
            # flow caught by this except, which disables tracing.
            self._configure_phoenix_endpoint()
            self._configure_logfire()
            logfire.instrument_pydantic_ai()  # type: ignore
            self._log_initialization_info()
        except Exception as e:
            logger.error(f"Failed to initialize Logfire: {e}")
            self.config.enabled = False

    def _configure_phoenix_endpoint(self) -> None:
        """Configure Phoenix OTLP endpoint environment variables.

        Checks endpoint connectivity before configuration to prevent
        ConnectionRefusedError stack traces during span export.

        Raises:
            ConnectionError: If the Phoenix traces endpoint is unreachable
                (intentionally caught by _initialize_logfire).
        """
        if self.config.send_to_cloud:
            return

        import os

        # Set Phoenix OTLP endpoint via environment variable
        # Reason: Per OTEL spec, SDK auto-appends signal-specific paths
        # (/v1/traces, /v1/metrics) to base endpoint. Set base URL only.
        # Phoenix doesn't support /v1/metrics, so disable metrics export explicitly.
        phoenix_base_url = self.config.phoenix_endpoint
        os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = phoenix_base_url
        os.environ["OTEL_METRICS_EXPORTER"] = "none"

        # Check endpoint connectivity before configuring exporters
        phoenix_traces_endpoint = f"{phoenix_base_url}/v1/traces"
        try:
            # Cheap HEAD probe; a short timeout keeps startup fast when the
            # endpoint is down.
            requests.head(phoenix_traces_endpoint, timeout=2.0)
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
        ):
            logger.warning(
                f"Logfire tracing unavailable: {phoenix_traces_endpoint} unreachable "
                f"(spans and metrics export disabled)"
            )
            self.config.enabled = False
            raise ConnectionError("Phoenix endpoint unreachable")

    def _configure_logfire(self) -> None:
        """Configure Logfire with scrubbing patterns.

        Reason: When send_to_cloud=False, pass token=None to prevent SDK from
        making API handshake calls to logfire-us.pydantic.dev. When True,
        omit token parameter to let SDK read from LOGFIRE_TOKEN env var.
        """
        from app.utils.log_scrubbing import get_logfire_scrubbing_patterns

        scrubbing_patterns = get_logfire_scrubbing_patterns()

        if self.config.send_to_cloud:
            logfire.configure(  # type: ignore
                service_name=self.config.service_name,
                send_to_logfire=True,
                scrubbing=logfire.ScrubbingOptions(extra_patterns=scrubbing_patterns),  # type: ignore
            )
        else:
            logfire.configure(  # type: ignore
                service_name=self.config.service_name,
                send_to_logfire=False,
                token=None,  # Disable cloud API calls
                scrubbing=logfire.ScrubbingOptions(extra_patterns=scrubbing_patterns),  # type: ignore
            )

    def _log_initialization_info(self) -> None:
        """Log tracing initialization info with endpoint details."""
        import os

        if self.config.send_to_cloud:
            logger.info("Logfire tracing initialized: Logfire cloud")
        else:
            # Report the env vars actually set by _configure_phoenix_endpoint.
            base_url = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "not set")
            metrics_exp = os.environ.get("OTEL_METRICS_EXPORTER", "default")
            logger.info(
                f"Phoenix tracing initialized: endpoint={base_url}, metrics_exporter={metrics_exp}"
            )

Functions

get_instrumentation_manager()

Get current instrumentation manager.

Returns:

Type Description
LogfireInstrumentationManager | None

Current LogfireInstrumentationManager instance or None if not initialized.

Source code in src/app/agents/logfire_instrumentation.py
def get_instrumentation_manager() -> LogfireInstrumentationManager | None:
    """Get current instrumentation manager.

    Returns:
        Current LogfireInstrumentationManager instance, or None if
        initialize_logfire_instrumentation() has not been called yet.
    """
    return _instrumentation_manager

initialize_logfire_instrumentation(config)

Initialize Logfire instrumentation.

Parameters:

Name Type Description Default
config LogfireConfig

LogfireConfig instance with tracing settings.

required
Source code in src/app/agents/logfire_instrumentation.py
def initialize_logfire_instrumentation(config: LogfireConfig) -> None:
    """Create and install the module-level instrumentation manager.

    Args:
        config: LogfireConfig instance with tracing settings.
    """
    # Reason: stored as a module global so get_instrumentation_manager()
    # hands the same instance to every caller.
    global _instrumentation_manager
    _instrumentation_manager = LogfireInstrumentationManager(config)

app.app

Main entry point for the Agents-eval application.

This module initializes the agentic system, loads configuration files, handles user input, and orchestrates the multi-agent workflow using asynchronous execution. It integrates logging, tracing, and authentication, and supports both CLI and programmatic execution.

Evaluation orchestration is delegated to app.judge.evaluation_runner.

Classes

Functions

main(chat_provider=CHAT_DEFAULT_PROVIDER, query='', include_researcher=False, include_analyst=False, include_synthesiser=False, chat_config_file=None, enable_review_tools=False, paper_id=None, skip_eval=False, download_peerread_full_only=False, download_peerread_samples_only=False, peerread_max_papers_per_sample_download=5, cc_solo_dir=None, cc_teams_dir=None, cc_teams_tasks_dir=None, token_limit=None, judge_settings=None, engine='mas', cc_result=None, cc_teams=False, cc_model=None) async

Main entry point for the application.

Returns:

Type Description
dict[str, Any] | None

Dictionary with 'composite_result' (CompositeResult) and 'graph' (nx.DiGraph)

dict[str, Any] | None

if evaluation runs successfully, None otherwise (CLI mode or download-only).

Source code in src/app/app.py
async def main(
    chat_provider: str = CHAT_DEFAULT_PROVIDER,
    query: str = "",
    include_researcher: bool = False,
    include_analyst: bool = False,
    include_synthesiser: bool = False,
    chat_config_file: str | Path | None = None,
    enable_review_tools: bool = False,
    paper_id: str | None = None,
    skip_eval: bool = False,
    download_peerread_full_only: bool = False,
    download_peerread_samples_only: bool = False,
    peerread_max_papers_per_sample_download: int | None = 5,
    cc_solo_dir: str | None = None,
    cc_teams_dir: str | None = None,
    cc_teams_tasks_dir: str | None = None,
    token_limit: int | None = None,
    judge_settings: JudgeSettings | None = None,
    engine: str = "mas",
    cc_result: Any | None = None,
    cc_teams: bool = False,
    cc_model: str | None = None,
) -> dict[str, Any] | None:
    """Main entry point for the application.

    Args:
        chat_provider: LLM provider forwarded to the MAS engine path.
        query: User query forwarded to the MAS engine path.
        include_researcher: Toggle for the researcher agent (MAS path).
        include_analyst: Toggle for the analyst agent (MAS path).
        include_synthesiser: Toggle for the synthesiser agent (MAS path).
        chat_config_file: Chat config path; resolved from CHAT_CONFIG_FILE
            when None.
        enable_review_tools: Forwarded to the MAS engine path.
        paper_id: Paper under evaluation; "unknown" is used in the RunContext
            when None.
        skip_eval: Forwarded to both engine paths to skip evaluation.
        download_peerread_full_only: Download-only mode (full dataset).
        download_peerread_samples_only: Download-only mode (samples).
        peerread_max_papers_per_sample_download: Max papers per sample download.
        cc_solo_dir: CC solo artifact directory, forwarded to both paths.
        cc_teams_dir: CC teams artifact directory, forwarded to both paths.
        cc_teams_tasks_dir: CC teams tasks directory, forwarded to both paths.
        token_limit: Token limit forwarded to the MAS engine path.
        judge_settings: Judge settings forwarded to both engine paths.
        engine: Engine selector; "cc" together with a non-None cc_result
            takes the Claude Code path, otherwise the MAS path runs.
        cc_result: Pre-computed Claude Code result enabling the CC path.
        cc_teams: Whether CC Agent Teams mode is in use.
        cc_model: CC model identifier forwarded to the CC path.

    Returns:
        Dictionary with 'composite_result' (CompositeResult) and 'graph' (nx.DiGraph)
        if evaluation runs successfully, None otherwise (CLI mode or download-only).

    Raises:
        Exception: Re-raised wrapper around any failure inside the main span,
            chained to the original error.
    """
    logger.info(f"Starting app '{PROJECT_NAME}' v{__version__} (engine={engine})")

    # Download-only modes short-circuit before any engine or RunContext work.
    if _handle_download_mode(
        download_peerread_full_only,
        download_peerread_samples_only,
        peerread_max_papers_per_sample_download,
    ):
        return None

    try:
        if chat_config_file is None:
            chat_config_file = resolve_config_path(CHAT_CONFIG_FILE)
        logger.info(f"Chat config file: {chat_config_file}")

        with span("main()"):
            # Generate execution_id up-front so RunContext is active before engine runs
            execution_id = f"exec_{_uuid.uuid4().hex[:12]}"
            run_ctx = RunContext.create(
                engine_type=_resolve_engine_type(engine, cc_teams),
                paper_id=paper_id or "unknown",
                execution_id=execution_id,
            )
            set_active_run_context(run_ctx)

            # S10-F1: CC engine branch — skip MAS, use CC result directly
            if engine == "cc" and cc_result is not None:
                composite_result, graph, execution_id = await _run_cc_engine_path(
                    cc_result,
                    skip_eval,
                    paper_id,
                    cc_solo_dir,
                    cc_teams_dir,
                    cc_teams_tasks_dir,
                    chat_provider,
                    judge_settings,
                    cc_teams=cc_teams,
                    run_dir=run_ctx.run_dir,
                    cc_model=cc_model,
                )
            else:
                # Default path: run the MAS pipeline with the configured agents.
                composite_result, graph, execution_id = await _run_mas_engine_path(
                    chat_config_file,
                    chat_provider,
                    query,
                    paper_id,
                    enable_review_tools,
                    include_researcher,
                    include_analyst,
                    include_synthesiser,
                    token_limit,
                    skip_eval,
                    cc_solo_dir,
                    cc_teams_dir,
                    cc_teams_tasks_dir,
                    judge_settings,
                    execution_id=execution_id,
                    run_dir=run_ctx.run_dir,
                )

            # Persist the execution graph alongside the run artifacts.
            persist_graph(graph, run_ctx.run_dir)

            logger.info(f"Exiting app '{PROJECT_NAME}'")
            return _prepare_result_dict(
                composite_result, graph, execution_id, run_context=get_active_run_context()
            )

    except Exception as e:
        msg = generic_exception(f"Aborting app '{PROJECT_NAME}' with: {e}")
        logger.exception(msg)
        raise Exception(msg) from e
    finally:
        # Reason: always clear the active RunContext so later runs start clean.
        set_active_run_context(None)

app.benchmark.sweep_analysis

Statistical analysis for MAS composition sweep results.

This module provides functions to calculate statistics (mean, stddev, min, max) across multiple sweep runs and generate summary reports in machine-readable (JSON) and human-readable (Markdown) formats.

Classes

CompositionStats

Bases: BaseModel

Statistical summary for a single agent composition.

Aggregates metrics across all repetitions for one composition.

Source code in src/app/benchmark/sweep_analysis.py
class CompositionStats(BaseModel):
    """Statistical summary for a single agent composition.

    Aggregates metrics across all repetitions for one composition.
    """

    # The composition these statistics describe.
    composition: AgentComposition
    # Mean/stddev of the composite (overall) score across repetitions.
    overall_score_mean: float
    overall_score_stddev: float
    # Mean/stddev of the tier 1 score across repetitions.
    tier1_score_mean: float
    tier1_score_stddev: float
    # Mean/stddev of the tier 2 score; 0.0 when no tier 2 scores were present
    # (tier2_score is optional on CompositeResult).
    tier2_score_mean: float
    tier2_score_stddev: float
    # Mean/stddev of the tier 3 score across repetitions.
    tier3_score_mean: float
    tier3_score_stddev: float
    # NOTE(review): confidence is computed from composite_score as a proxy
    # (see SweepAnalyzer.analyze) — not an independent metric.
    confidence_mean: float
    confidence_stddev: float
    # Number of repetitions aggregated for this composition.
    num_samples: int

SweepAnalyzer

Analyzer for sweep results.

Groups results by composition and calculates per-composition statistics.

Source code in src/app/benchmark/sweep_analysis.py
class SweepAnalyzer:
    """Analyzer for sweep results.

    Groups results by composition and calculates per-composition statistics.
    """

    def __init__(self, results: list[tuple[AgentComposition, CompositeResult]]):
        """Initialize analyzer with sweep results.

        Args:
            results: List of (composition, result) tuples from sweep run.
        """
        self.results = results

    def analyze(self) -> list[CompositionStats]:
        """Analyze sweep results and calculate per-composition statistics.

        Groups results by composition and calculates mean/stddev for all metrics.

        Returns:
            list[CompositionStats]: Statistics for each unique composition.

        Example:
            >>> analyzer = SweepAnalyzer(results)
            >>> stats = analyzer.analyze()
            >>> len(stats)  # Number of unique compositions
            8
        """
        # Bucket results by composition name, remembering the composition
        # object for each bucket so it can be attached to the stats later.
        buckets: dict[str, list[CompositeResult]] = {}
        comp_by_name: dict[str, AgentComposition] = {}
        for comp, res in self.results:
            name = comp.get_name()
            comp_by_name.setdefault(name, comp)
            buckets.setdefault(name, []).append(res)

        summaries: list[CompositionStats] = []
        for name, bucket in buckets.items():
            composite_scores = [item.composite_score for item in bucket]
            t1 = calculate_statistics([item.tier1_score for item in bucket])
            # Reason: tier2_score is optional, filter out None values
            present_t2 = [item.tier2_score for item in bucket if item.tier2_score is not None]
            if present_t2:
                t2 = calculate_statistics(present_t2)
            else:
                t2 = {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}
            t3 = calculate_statistics([item.tier3_score for item in bucket])
            overall = calculate_statistics(composite_scores)
            # Reason: Use composite_score as proxy for confidence (not exposed in CompositeResult)
            conf = calculate_statistics(composite_scores)

            summaries.append(
                CompositionStats(
                    composition=comp_by_name[name],
                    overall_score_mean=overall["mean"],
                    overall_score_stddev=overall["stddev"],
                    tier1_score_mean=t1["mean"],
                    tier1_score_stddev=t1["stddev"],
                    tier2_score_mean=t2["mean"],
                    tier2_score_stddev=t2["stddev"],
                    tier3_score_mean=t3["mean"],
                    tier3_score_stddev=t3["stddev"],
                    confidence_mean=conf["mean"],
                    confidence_stddev=conf["stddev"],
                    num_samples=len(bucket),
                )
            )

        return summaries
Functions
__init__(results)

Initialize analyzer with sweep results.

Parameters:

Name Type Description Default
results list[tuple[AgentComposition, CompositeResult]]

List of (composition, result) tuples from sweep run.

required
Source code in src/app/benchmark/sweep_analysis.py
def __init__(self, results: list[tuple[AgentComposition, CompositeResult]]):
    """Initialize analyzer with sweep results.

    Args:
        results: List of (composition, result) tuples from sweep run.
    """
    # Reason: stored as-is; grouping and statistics happen lazily in analyze().
    self.results = results
analyze()

Analyze sweep results and calculate per-composition statistics.

Groups results by composition and calculates mean/stddev for all metrics.

Returns:

Type Description
list[CompositionStats]

list[CompositionStats]: Statistics for each unique composition.

Example

`analyzer = SweepAnalyzer(results)` followed by `stats = analyzer.analyze()`; `len(stats)` equals the number of unique compositions (8 for a full sweep).

Source code in src/app/benchmark/sweep_analysis.py
def analyze(self) -> list[CompositionStats]:
    """Analyze sweep results and calculate per-composition statistics.

    Groups results by composition and calculates mean/stddev for all metrics.

    Returns:
        list[CompositionStats]: Statistics for each unique composition.

    Example:
        >>> analyzer = SweepAnalyzer(results)
        >>> stats = analyzer.analyze()
        >>> len(stats)  # Number of unique compositions
        8
    """
    # Group results by composition
    # composition_map keeps the AgentComposition object behind each name key
    # so it can be re-attached to the aggregated stats below.
    grouped: dict[str, list[CompositeResult]] = {}
    composition_map: dict[str, AgentComposition] = {}

    for composition, result in self.results:
        key = composition.get_name()
        if key not in grouped:
            grouped[key] = []
            composition_map[key] = composition
        grouped[key].append(result)

    # Calculate statistics for each composition
    stats_list = []
    # NOTE: the loop variable `results` below is the per-composition bucket,
    # not self.results.
    for key, results in grouped.items():
        overall_scores = [r.composite_score for r in results]
        tier1_scores = [r.tier1_score for r in results]
        # Reason: tier2_score is optional, filter out None values
        tier2_scores = [r.tier2_score for r in results if r.tier2_score is not None]
        tier3_scores = [r.tier3_score for r in results]
        # Reason: Use composite_score as proxy for confidence (not exposed in CompositeResult)
        confidences = [r.composite_score for r in results]

        overall_stats = calculate_statistics(overall_scores)
        tier1_stats = calculate_statistics(tier1_scores)
        # Reason: calculate_statistics raises on empty input, so fall back to
        # all-zero stats when no tier2 scores were recorded.
        tier2_stats = (
            calculate_statistics(tier2_scores)
            if tier2_scores
            else {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}
        )
        tier3_stats = calculate_statistics(tier3_scores)
        confidence_stats = calculate_statistics(confidences)

        stats_list.append(
            CompositionStats(
                composition=composition_map[key],
                overall_score_mean=overall_stats["mean"],
                overall_score_stddev=overall_stats["stddev"],
                tier1_score_mean=tier1_stats["mean"],
                tier1_score_stddev=tier1_stats["stddev"],
                tier2_score_mean=tier2_stats["mean"],
                tier2_score_stddev=tier2_stats["stddev"],
                tier3_score_mean=tier3_stats["mean"],
                tier3_score_stddev=tier3_stats["stddev"],
                confidence_mean=confidence_stats["mean"],
                confidence_stddev=confidence_stats["stddev"],
                num_samples=len(results),
            )
        )

    return stats_list

Functions

calculate_statistics(scores)

Calculate mean, stddev, min, max for a list of scores.

Parameters:

Name Type Description Default
scores list[float]

List of numerical scores to analyze.

required

Returns:

Type Description
dict[str, float]

dict[str, float]: Dictionary with keys 'mean', 'stddev', 'min', 'max'.

Raises:

Type Description
ValueError

If scores list is empty.

Example

calculate_statistics([0.75, 0.80, 0.70])

Source code in src/app/benchmark/sweep_analysis.py
def calculate_statistics(scores: list[float]) -> dict[str, float]:
    """Calculate mean, stddev, min, max for a list of scores.

    Args:
        scores: List of numerical scores to analyze.

    Returns:
        dict[str, float]: Dictionary with keys 'mean', 'stddev', 'min', 'max'.

    Raises:
        ValueError: If scores list is empty.

    Example:
        >>> calculate_statistics([0.75, 0.80, 0.70])
        {'mean': 0.75, 'stddev': 0.05, 'min': 0.70, 'max': 0.80}
    """
    if not scores:
        raise ValueError("Cannot calculate statistics for empty scores list")

    # Reason: sample stddev needs at least two data points; report 0.0 otherwise.
    spread = statistics.stdev(scores) if len(scores) > 1 else 0.0
    return {
        "mean": statistics.mean(scores),
        "stddev": spread,
        "min": min(scores),
        "max": max(scores),
    }

generate_markdown_summary(stats)

Generate human-readable Markdown summary table.

Parameters:

Name Type Description Default
stats list[CompositionStats]

List of composition statistics to summarize.

required

Returns:

Name Type Description
str str

Markdown-formatted table with mean ± stddev for all metrics.

Example

`markdown = generate_markdown_summary(stats)`; both `"| Composition" in markdown` and `"Overall Score" in markdown` evaluate to `True`.

Source code in src/app/benchmark/sweep_analysis.py
def generate_markdown_summary(stats: list[CompositionStats]) -> str:
    """Generate human-readable Markdown summary table.

    Args:
        stats: List of composition statistics to summarize.

    Returns:
        str: Markdown-formatted table with mean ± stddev for all metrics.

    Example:
        >>> markdown = generate_markdown_summary(stats)
        >>> "| Composition" in markdown
        True
        >>> "Overall Score" in markdown
        True
    """
    table = [
        "# MAS Composition Sweep Results",
        "",
        "| Composition | Overall Score | Tier 1 | Tier 2 | Tier 3 | Confidence | Samples |",
        "|-------------|---------------|---------|---------|---------|------------|---------|",
    ]

    for entry in stats:
        # Reason: every metric column is rendered as "mean ± stddev" to 3 decimals.
        cells = [
            entry.composition.get_name(),
            f"{entry.overall_score_mean:.3f} ± {entry.overall_score_stddev:.3f}",
            f"{entry.tier1_score_mean:.3f} ± {entry.tier1_score_stddev:.3f}",
            f"{entry.tier2_score_mean:.3f} ± {entry.tier2_score_stddev:.3f}",
            f"{entry.tier3_score_mean:.3f} ± {entry.tier3_score_stddev:.3f}",
            f"{entry.confidence_mean:.3f} ± {entry.confidence_stddev:.3f}",
            f"n={entry.num_samples}",
        ]
        table.append("| " + " | ".join(cells) + " |")

    return "\n".join(table)

app.benchmark.sweep_config

Configuration models for MAS composition sweep.

This module defines Pydantic models for sweep configuration including agent composition definitions and convenience functions for generating standard composition sets.

Classes

AgentComposition

Bases: BaseModel

Configuration for a specific agent composition.

Defines which agents are included in a multi-agent system composition. Each toggle determines whether the corresponding agent is instantiated.

Source code in src/app/benchmark/sweep_config.py
class AgentComposition(BaseModel):
    """Configuration for a specific agent composition.

    Defines which agents are included in a multi-agent system composition.
    Each toggle determines whether the corresponding agent is instantiated.
    """

    include_researcher: bool = False
    include_analyst: bool = False
    include_synthesiser: bool = False

    def get_name(self) -> str:
        """Generate a readable name for this composition.

        Returns:
            str: A human-readable name describing the active agents.

        Example:
            >>> comp = AgentComposition(include_researcher=True, include_analyst=False)
            >>> comp.get_name()
            'researcher'
        """
        # Reason: fixed toggle/label pairs keep the naming order stable.
        toggles = (
            (self.include_researcher, "researcher"),
            (self.include_analyst, "analyst"),
            (self.include_synthesiser, "synthesiser"),
        )
        labels = [label for enabled, label in toggles if enabled]
        # An empty join is falsy, so "manager-only" is the fallback name.
        return "+".join(labels) or "manager-only"
Functions
get_name()

Generate a readable name for this composition.

Returns:

Name Type Description
str str

A human-readable name describing the active agents.

Example

`comp = AgentComposition(include_researcher=True, include_analyst=False)`; `comp.get_name()` returns `'researcher'`.

Source code in src/app/benchmark/sweep_config.py
def get_name(self) -> str:
    """Generate a readable name for this composition.

    Returns:
        str: A human-readable name describing the active agents.

    Example:
        >>> comp = AgentComposition(include_researcher=True, include_analyst=False)
        >>> comp.get_name()
        'researcher'
    """
    flags = (
        ("researcher", self.include_researcher),
        ("analyst", self.include_analyst),
        ("synthesiser", self.include_synthesiser),
    )
    enabled = [name for name, on in flags if on]
    # Reason: with no helper agents enabled the manager runs alone.
    return "+".join(enabled) if enabled else "manager-only"

SweepConfig

Bases: BaseModel

Configuration for a composition sweep run.

Defines the sweep parameters including which compositions to test, how many repetitions per composition, which papers to evaluate, and which execution engine to use (MAS pipeline or Claude Code headless).

Source code in src/app/benchmark/sweep_config.py
class SweepConfig(BaseModel):
    """Configuration for a composition sweep run.

    Defines the sweep parameters including which compositions to test,
    how many repetitions per composition, which papers to evaluate,
    and which execution engine to use (MAS pipeline or Claude Code headless).
    """

    # Required sweep dimensions.
    compositions: list[AgentComposition] = Field(
        ..., description="List of agent compositions to test", min_length=1
    )
    repetitions: int = Field(..., description="Number of repetitions per composition", ge=1)
    paper_ids: list[str] = Field(..., description="List of paper IDs to evaluate", min_length=1)
    output_dir: Path = Field(..., description="Directory for sweep results")

    # Optional execution settings (providers, engine, judge overrides).
    chat_provider: str = Field(
        default=CHAT_DEFAULT_PROVIDER, description="LLM provider to use for evaluations"
    )

    engine: str = Field(
        default="mas",
        description="Execution engine: 'mas' for MAS pipeline, 'cc' for Claude Code headless",
    )

    judge_provider: str = Field(
        default="auto",
        description="LLM provider for Tier 2 judge (default: 'auto' inherits chat_provider)",
    )

    judge_model: str | None = Field(
        default=None,
        description="LLM model for Tier 2 judge (default: None uses JudgeSettings default)",
    )

    cc_teams: bool = Field(
        default=False,
        description="Use Claude Code Agent Teams mode (requires engine='cc')",
    )

    cc_artifact_dirs: list[Path] | None = Field(
        default=None,
        description="Pre-collected CC artifact directories (skips re-running CC)",
    )

    retry_delay_seconds: float = Field(
        default=5.0,
        description="Initial delay in seconds between rate-limit retries (exponential backoff)",
    )

    # NOTE(review): the validators below duplicate the Field constraints above
    # (min_length=1, ge=1); presumably kept for explicit error messages — confirm.
    @field_validator("compositions")
    @classmethod
    def validate_compositions_not_empty(cls, v: list[AgentComposition]) -> list[AgentComposition]:
        """Validate that compositions list is not empty.

        Args:
            v: The compositions list to validate.

        Returns:
            The validated compositions list.

        Raises:
            ValueError: If compositions list is empty.
        """
        if not v:
            raise ValueError("Compositions list cannot be empty")
        return v

    @field_validator("repetitions")
    @classmethod
    def validate_repetitions_positive(cls, v: int) -> int:
        """Validate that repetitions is positive.

        Args:
            v: The repetitions value to validate.

        Returns:
            The validated repetitions value.

        Raises:
            ValueError: If repetitions is zero or negative.
        """
        if v <= 0:
            raise ValueError("Repetitions must be positive")
        return v

    @field_validator("paper_ids")
    @classmethod
    def validate_paper_ids_not_empty(cls, v: list[str]) -> list[str]:
        """Validate that paper_ids list is not empty.

        Args:
            v: The paper_ids list to validate.

        Returns:
            The validated paper_ids list.

        Raises:
            ValueError: If paper_ids list is empty.
        """
        if not v:
            raise ValueError("Paper IDs list cannot be empty")
        return v
Functions
validate_compositions_not_empty(v) classmethod

Validate that compositions list is not empty.

Parameters:

Name Type Description Default
v list[AgentComposition]

The compositions list to validate.

required

Returns:

Type Description
list[AgentComposition]

The validated compositions list.

Raises:

Type Description
ValueError

If compositions list is empty.

Source code in src/app/benchmark/sweep_config.py
@field_validator("compositions")
@classmethod
def validate_compositions_not_empty(cls, v: list[AgentComposition]) -> list[AgentComposition]:
    """Validate that compositions list is not empty.

    Args:
        v: The compositions list to validate.

    Returns:
        The validated compositions list.

    Raises:
        ValueError: If compositions list is empty.
    """
    # Success path first: any non-empty list passes through unchanged.
    if v:
        return v
    raise ValueError("Compositions list cannot be empty")
validate_paper_ids_not_empty(v) classmethod

Validate that paper_ids list is not empty.

Parameters:

Name Type Description Default
v list[str]

The paper_ids list to validate.

required

Returns:

Type Description
list[str]

The validated paper_ids list.

Raises:

Type Description
ValueError

If paper_ids list is empty.

Source code in src/app/benchmark/sweep_config.py
@field_validator("paper_ids")
@classmethod
def validate_paper_ids_not_empty(cls, v: list[str]) -> list[str]:
    """Validate that paper_ids list is not empty.

    Args:
        v: The paper_ids list to validate.

    Returns:
        The validated paper_ids list.

    Raises:
        ValueError: If paper_ids list is empty.
    """
    # Success path first: any non-empty list passes through unchanged.
    if v:
        return v
    raise ValueError("Paper IDs list cannot be empty")
validate_repetitions_positive(v) classmethod

Validate that repetitions is positive.

Parameters:

Name Type Description Default
v int

The repetitions value to validate.

required

Returns:

Type Description
int

The validated repetitions value.

Raises:

Type Description
ValueError

If repetitions is zero or negative.

Source code in src/app/benchmark/sweep_config.py
@field_validator("repetitions")
@classmethod
def validate_repetitions_positive(cls, v: int) -> int:
    """Validate that repetitions is positive.

    Args:
        v: The repetitions value to validate.

    Returns:
        The validated repetitions value.

    Raises:
        ValueError: If repetitions is zero or negative.
    """
    # Success path first: any value of at least one passes through unchanged.
    if v >= 1:
        return v
    raise ValueError("Repetitions must be positive")

Functions

generate_all_compositions()

Generate all 2^3 = 8 possible agent compositions.

This convenience function generates the full Cartesian product of all agent toggle combinations.

Returns:

Type Description
list[AgentComposition]

list[AgentComposition]: List of 8 unique agent compositions.

Example

`compositions = generate_all_compositions()` yields `len(compositions) == 8`, and `any(c.include_researcher and c.include_analyst for c in compositions)` is `True`.

Source code in src/app/benchmark/sweep_config.py
def generate_all_compositions() -> list[AgentComposition]:
    """Generate all 2^3 = 8 possible agent compositions.

    This convenience function generates the full Cartesian product of all
    agent toggle combinations.

    Returns:
        list[AgentComposition]: List of 8 unique agent compositions.

    Example:
        >>> compositions = generate_all_compositions()
        >>> len(compositions)
        8
        >>> any(c.include_researcher and c.include_analyst for c in compositions)
        True
    """
    from itertools import product

    # Reason: product([False, True], repeat=3) replaces three nested loops and
    # yields toggles in the same False-first order as the original nesting.
    return [
        AgentComposition(
            include_researcher=researcher,
            include_analyst=analyst,
            include_synthesiser=synthesiser,
        )
        for researcher, analyst, synthesiser in product([False, True], repeat=3)
    ]

app.benchmark.sweep_runner

Sweep runner for MAS composition benchmarking.

This module orchestrates multiple evaluation runs across different agent compositions and optionally invokes Claude Code in headless mode for baseline comparison.

Classes

SweepRunner

Runner for composition sweep experiments.

Executes the MAS evaluation pipeline across multiple compositions with repetitions for statistical significance.

Source code in src/app/benchmark/sweep_runner.py
class SweepRunner:
    """Runner for composition sweep experiments.

    Executes the MAS evaluation pipeline across multiple compositions with
    repetitions for statistical significance.
    """

    def __init__(self, config: SweepConfig):
        """Initialize sweep runner with configuration.

        Args:
            config: Sweep configuration defining compositions, repetitions, papers.
        """
        self.config = config
        # Accumulates one (composition, result) pair per successful evaluation.
        self.results: list[tuple[AgentComposition, CompositeResult]] = []

    def _build_judge_settings(self) -> JudgeSettings | None:
        """Build JudgeSettings from sweep config if judge args are configured.

        Returns:
            JudgeSettings with configured provider/model, or None to use defaults.
        """
        # Reason: only construct explicit settings when the user overrode the
        # provider or model; otherwise None lets downstream defaults apply
        if self.config.judge_provider != "auto" or self.config.judge_model is not None:
            kwargs: dict[str, Any] = {"tier2_provider": self.config.judge_provider}
            if self.config.judge_model is not None:
                kwargs["tier2_model"] = self.config.judge_model
            return JudgeSettings(**kwargs)
        return None

    async def _handle_rate_limit(self, error: ModelHTTPError, label: str, attempt: int) -> bool:
        """Handle a 429 rate-limit error, sleeping before retry if retries remain.

        Args:
            error: The ModelHTTPError with status_code 429.
            label: Descriptive label for logging (composition/paper context).
            attempt: Current attempt index (0-based).

        Returns:
            True if the caller should retry, False if max retries are exhausted.
        """
        if attempt < _MAX_RETRIES:
            # Reason: exponential backoff — delay doubles with each attempt
            delay = self.config.retry_delay_seconds * (2**attempt)
            logger.warning(
                f"Rate limit hit for {label} "
                f"(attempt {attempt + 1}/{_MAX_RETRIES + 1}). "
                f"Retrying in {delay:.1f}s..."
            )
            await asyncio.sleep(delay)
            return True
        logger.error(f"Rate limit exhausted for {label}: {error}")
        return False

    async def _call_main(
        self, composition: AgentComposition, paper_id: str, judge_settings: JudgeSettings | None
    ) -> CompositeResult | None:
        """Call main() and extract CompositeResult from the result dict.

        Args:
            composition: Agent composition to test.
            paper_id: Paper ID to evaluate.
            judge_settings: Optional judge settings.

        Returns:
            CompositeResult if found, None if result format unexpected.
        """
        result = await main(
            chat_provider=self.config.chat_provider,
            query=f"Evaluate paper {paper_id}",
            paper_id=paper_id,
            include_researcher=composition.include_researcher,
            include_analyst=composition.include_analyst,
            include_synthesiser=composition.include_synthesiser,
            enable_review_tools=True,
            skip_eval=False,
            judge_settings=judge_settings,
        )
        # Reason: main() returns dict with 'composite_result' key
        if isinstance(result, dict) and "composite_result" in result:
            composite = result["composite_result"]
            if isinstance(composite, CompositeResult):
                return composite
            return None
        logger.warning(f"Evaluation returned unexpected format: {type(result).__name__}")
        return None

    async def _run_single_evaluation(
        self, composition: AgentComposition, paper_id: str, repetition: int
    ) -> CompositeResult | None:
        """Run a single evaluation with specified composition, retrying on rate limits.

        Retries up to _MAX_RETRIES times on HTTP 429 errors with exponential backoff
        starting at retry_delay_seconds. After max retries, logs error and returns None.

        Args:
            composition: Agent composition to test.
            paper_id: Paper ID to evaluate (string, supports arxiv IDs like '1105.1072').
            repetition: Repetition number (for logging).

        Returns:
            CompositeResult if successful, None if evaluation failed.
        """
        logger.info(
            f"Running composition={composition.get_name()}, "
            f"paper={paper_id}, repetition={repetition}"
        )
        judge_settings = self._build_judge_settings()
        label = f"composition={composition.get_name()}, paper={paper_id}"

        for attempt in range(_MAX_RETRIES + 1):
            try:
                return await self._call_main(composition, paper_id, judge_settings)
            except ModelHTTPError as e:
                # Reason: only 429s are retryable; any other HTTP error fails fast
                if e.status_code != 429 or not await self._handle_rate_limit(e, label, attempt):
                    return None
            except SystemExit as e:
                # Reason: run_manager raises SystemExit(1) on UsageLimitExceeded;
                # catch it so one evaluation's token limit doesn't abort the sweep
                logger.error(f"Evaluation aborted for {label}: {e}")
                return None
            except Exception as e:
                logger.error(f"Evaluation failed for {label}: {e}", exc_info=True)
                return None

        return None

    async def _invoke_cc_comparison(self, paper_id: str) -> CCResult | None:
        """Invoke Claude Code in headless mode for baseline comparison.

        Delegates to cc_engine.run_cc_solo or run_cc_teams depending on
        sweep configuration. No inline subprocess logic.

        Args:
            paper_id: Paper ID to evaluate (string, supports arxiv IDs).

        Returns:
            CCResult if successful, None otherwise.

        Raises:
            RuntimeError: If claude CLI not found, subprocess fails, or times out.
        """
        prompt = f"Review paper {paper_id} from the PeerRead dataset"

        # NOTE(review): run_cc_teams/run_cc_solo are called synchronously inside
        # this coroutine — presumably they block the event loop for up to the
        # 600s timeout; confirm whether they should run in a thread executor
        if self.config.cc_teams:
            result = run_cc_teams(prompt, timeout=600)
        else:
            result = run_cc_solo(prompt, timeout=600)

        logger.info(f"CC comparison completed: execution_id={result.execution_id}")
        return result

    async def _validate_prerequisites(self) -> None:
        """Validate sweep prerequisites.

        Raises:
            RuntimeError: If engine=cc but the claude CLI is not available.
        """
        if self.config.engine == "cc" and not check_cc_available():
            raise RuntimeError(
                "engine=cc requires claude CLI. Install Claude Code or use --engine=mas."
            )

    async def _run_mas_evaluations(self) -> None:
        """Run MAS evaluations for all compositions, papers, and repetitions.

        Writes partial results.json after each successful evaluation for crash resilience.
        """
        for composition in self.config.compositions:
            for paper_id in self.config.paper_ids:
                for repetition in range(self.config.repetitions):
                    result = await self._run_single_evaluation(composition, paper_id, repetition)
                    if result:
                        self.results.append((composition, result))
                        await self._save_results_json()

    async def _run_cc_baselines(self) -> None:
        """Run CC comparison evaluations if engine=cc.

        Wires CC results through CCTraceAdapter for evaluation pipeline integration.
        Adapts CCResult artifacts into GraphTraceData for three-tier evaluation.
        """
        if self.config.engine != "cc":
            return

        for paper_id in self.config.paper_ids:
            cc_result = await self._invoke_cc_comparison(paper_id)
            if cc_result is None:
                continue

            logger.info(f"CC comparison completed for paper {paper_id}: {cc_result.execution_id}")

            # Wire through CCTraceAdapter when session directory is available
            if cc_result.session_dir and Path(cc_result.session_dir).exists():
                try:
                    adapter = CCTraceAdapter(Path(cc_result.session_dir))
                    trace_data = adapter.parse()
                    logger.info(
                        f"CC trace parsed: execution_id={trace_data.execution_id}, paper={paper_id}"
                    )
                except Exception as e:
                    # Reason: trace parsing is best-effort; a bad session dir
                    # must not abort the remaining baselines
                    logger.warning(f"CC trace parsing failed for paper {paper_id}: {e}")

    async def run(self) -> list[tuple[AgentComposition, CompositeResult]]:
        """Execute the full sweep across all compositions and repetitions.

        Partial results are always saved via finally block, even if an
        evaluation crashes mid-sweep (e.g. token limit exceeded).

        Returns:
            list[tuple[AgentComposition, CompositeResult]]: All evaluation results.

        Raises:
            RuntimeError: If engine=cc but claude CLI not found.
        """
        await self._validate_prerequisites()
        self.config.output_dir.mkdir(parents=True, exist_ok=True)
        try:
            await self._run_mas_evaluations()
            await self._run_cc_baselines()
        finally:
            # Reason: guarantees partial results/summary are persisted even
            # when an evaluation raises mid-sweep
            await self._save_results()
        return self.results

    async def _save_results_json(self) -> None:
        """Save sweep results to results.json only (incremental write).

        Used for crash-resilient incremental persistence after each evaluation.
        The file is written atomically (temp file + rename) so a crash during
        the write can never leave a truncated or corrupted results.json behind.
        """
        import json

        results_file = self.config.output_dir / "results.json"
        json_data = [
            {
                "composition": {
                    "include_researcher": comp.include_researcher,
                    "include_analyst": comp.include_analyst,
                    "include_synthesiser": comp.include_synthesiser,
                },
                "result": result.model_dump(),
            }
            for comp, result in self.results
        ]

        # Reason: dump to a sibling temp file, then atomically rename over the
        # target; otherwise a crash mid-dump corrupts the crash-resilience file
        # this method exists to provide
        tmp_file = results_file.with_suffix(".json.tmp")
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=2)
        tmp_file.replace(results_file)

        # NOTE(review): imported locally in the original — presumably to avoid
        # an import cycle; confirm before hoisting to module level
        from app.utils.artifact_registry import get_artifact_registry

        get_artifact_registry().register("Sweep results", results_file)

        logger.info(f"Saved raw results to {results_file}")

    async def _save_results(self) -> None:
        """Save sweep results to both results.json and summary.md."""
        if not self.results:
            logger.warning("No successful evaluations — skipping results write")
            return

        await self._save_results_json()

        # Generate and save statistical summary
        analyzer = SweepAnalyzer(self.results)
        stats = analyzer.analyze()
        markdown = generate_markdown_summary(stats)

        summary_file = self.config.output_dir / "summary.md"
        # Reason: explicit encoding — the summary contains non-ASCII (em dashes)
        # and the platform default encoding is not guaranteed to be UTF-8
        with open(summary_file, "w", encoding="utf-8") as f:
            f.write(markdown)

        from app.utils.artifact_registry import get_artifact_registry

        get_artifact_registry().register("Sweep summary", summary_file)

        logger.info(f"Saved summary to {summary_file}")
Functions
__init__(config)

Initialize sweep runner with configuration.

Parameters:

Name Type Description Default
config SweepConfig

Sweep configuration defining compositions, repetitions, papers.

required
Source code in src/app/benchmark/sweep_runner.py
def __init__(self, config: SweepConfig):
    """Initialize sweep runner with configuration.

    Args:
        config: Sweep configuration defining compositions, repetitions, papers.
    """
    # Results start empty and are populated as the sweep executes.
    self.results: list[tuple[AgentComposition, CompositeResult]] = []
    self.config = config
run() async

Execute the full sweep across all compositions and repetitions.

Partial results are always saved via finally block, even if an evaluation crashes mid-sweep (e.g. token limit exceeded).

Returns:

Type Description
list[tuple[AgentComposition, CompositeResult]]

list[tuple[AgentComposition, CompositeResult]]: All evaluation results.

Raises:

Type Description
RuntimeError

If engine=cc but claude CLI not found.

Source code in src/app/benchmark/sweep_runner.py
async def run(self) -> list[tuple[AgentComposition, CompositeResult]]:
    """Execute the full sweep across all compositions and repetitions.

    Partial results are always saved via finally block, even if an
    evaluation crashes mid-sweep (e.g. token limit exceeded).

    Returns:
        list[tuple[AgentComposition, CompositeResult]]: All evaluation results.

    Raises:
        RuntimeError: If engine=cc but claude CLI not found.
    """
    await self._validate_prerequisites()
    # Reason: create the output directory up front so incremental result
    # writes during the sweep cannot fail on a missing path
    self.config.output_dir.mkdir(parents=True, exist_ok=True)
    try:
        await self._run_mas_evaluations()
        await self._run_cc_baselines()
    finally:
        # Reason: persist whatever results exist even if a run raised
        await self._save_results()
    return self.results

Functions

run_sweep(config) async

Convenience function to run a sweep with given configuration.

Parameters:

Name Type Description Default
config SweepConfig

Sweep configuration.

required

Returns:

Type Description
list[tuple[AgentComposition, CompositeResult]]

list[tuple[AgentComposition, CompositeResult]]: All evaluation results.

Source code in src/app/benchmark/sweep_runner.py
async def run_sweep(config: SweepConfig) -> list[tuple[AgentComposition, CompositeResult]]:
    """Run a full composition sweep for the given configuration.

    Thin convenience wrapper around SweepRunner.

    Args:
        config: Sweep configuration.

    Returns:
        list[tuple[AgentComposition, CompositeResult]]: All evaluation results.
    """
    return await SweepRunner(config).run()

app.common.error_messages

Error message utilities for the Agents-eval application.

This module provides concise helper functions for generating standardized error messages related to configuration loading and validation.

Functions

api_connection_error(error)

Generate an error message for API connection error.

Parameters:

Name Type Description Default
error str

The error message or exception string

required

Returns:

Type Description
str

Formatted error message string

Source code in src/app/common/error_messages.py
def api_connection_error(error: str) -> str:
    """
    Build the standardized message for an API connection error.

    Args:
        error: The error message or exception string

    Returns:
        Formatted error message string
    """
    return "API connection error: {}".format(error)

failed_to_load_config(error)

Generate an error message for configuration loading failure.

Parameters:

Name Type Description Default
error str

The error message or exception string

required

Returns:

Type Description
str

Formatted error message string

Source code in src/app/common/error_messages.py
def failed_to_load_config(error: str) -> str:
    """
    Build the standardized message for a configuration loading failure.

    Args:
        error: The error message or exception string

    Returns:
        Formatted error message string
    """
    return "Failed to load config: {}".format(error)

file_not_found(file_path)

Generate an error message for a missing configuration file.

Parameters:

Name Type Description Default
file_path str | Path

Path to the missing file

required

Returns:

Type Description
str

Formatted error message string

Source code in src/app/common/error_messages.py
def file_not_found(file_path: str | Path) -> str:
    """
    Generate an error message for a missing configuration file.

    Args:
        file_path: Path to the missing file

    Returns:
        Formatted error message string
    """
    return f"File not found: {file_path}"

generic_exception(error)

Generate a generic error message.

Parameters:

Name Type Description Default
error str

The error message or exception string

required

Returns:

Type Description
str

Formatted error message string

Source code in src/app/common/error_messages.py
def generic_exception(error: str) -> str:
    """
    Build a generic standardized error message.

    Args:
        error: The error message or exception string

    Returns:
        Formatted error message string
    """
    return "Exception: {}".format(error)

get_key_error(error)

Generate a key error message.

Parameters:

Name Type Description Default
error str

The key error message

required

Returns:

Type Description
str

Formatted error message string

Source code in src/app/common/error_messages.py
def get_key_error(error: str) -> str:
    """
    Build the standardized message for a key error.

    Args:
        error: The key error message

    Returns:
        Formatted error message string
    """
    return "Key Error: {}".format(error)

invalid_data_model_format(error)

Generate an error message for invalid pydantic data model format.

Parameters:

Name Type Description Default
error str

The validation error message

required

Returns:

Type Description
str

Formatted error message string

Source code in src/app/common/error_messages.py
def invalid_data_model_format(error: str) -> str:
    """
    Build the standardized message for an invalid pydantic data model format.

    Args:
        error: The validation error message

    Returns:
        Formatted error message string
    """
    return "Invalid pydantic data model format: {}".format(error)

invalid_json(error)

Generate an error message for invalid JSON in a configuration file.

Parameters:

Name Type Description Default
error str

The JSON parsing error message

required

Returns:

Type Description
str

Formatted error message string

Source code in src/app/common/error_messages.py
def invalid_json(error: str) -> str:
    """
    Build the standardized message for invalid JSON in a configuration file.

    Args:
        error: The JSON parsing error message

    Returns:
        Formatted error message string
    """
    return "Invalid JSON: {}".format(error)

invalid_type(expected_type, actual_type)

Generate an error message for invalid Type.

Parameters:

Name Type Description Default
expected_type str

The expected type as a string

required
actual_type str

The actual type received as a string

required

Returns:

Type Description
str

Formatted error message string

Source code in src/app/common/error_messages.py
def invalid_type(expected_type: str, actual_type: str) -> str:
    """
    Build the standardized message for a type mismatch.

    Args:
        expected_type: The expected type as a string
        actual_type: The actual type received as a string

    Returns:
        Formatted error message string
    """
    template = "Type Error: Expected {}, got {} instead."
    return template.format(expected_type, actual_type)

app.common.log

Logging configuration for the Agents-eval application.

Sets up the logger with custom settings including file rotation, retention, and compression. Logs are written to a file with automatic rotation.

Functions

app.common.models

Common data models for the Agents-eval application.

This module provides shared Pydantic base models and common data structures used across the application.

Classes

CommonBaseModel

Bases: BaseModel

Common base model with shared configuration for all Pydantic models.

Provides consistent configuration across all data models in the application including validation behavior and serialization settings.

Source code in src/app/common/models.py
class CommonBaseModel(BaseModel):
    """
    Common base model with shared configuration for all Pydantic models.

    Provides consistent configuration across all data models in the application
    including validation behavior and serialization settings.
    """

    model_config = ConfigDict(
        # Re-validate field values whenever they are assigned after construction
        validate_assignment=True,
        # Reject field types that Pydantic cannot validate (no opaque objects)
        arbitrary_types_allowed=False,
        # Serialize enum fields as their underlying values, not enum instances
        use_enum_values=True,
    )

app.config.app_env

Application environment settings loaded from environment variables or .env file.

This module uses Pydantic’s BaseSettings to manage API keys and configuration for various inference endpoints, tools, and logging/monitoring services.

Classes

AppEnv

Bases: BaseSettings

Application environment settings loaded from environment variables or .env file.

This class uses Pydantic’s BaseSettings to manage API keys and configuration for various inference endpoints, tools, and logging/monitoring services. Environment variables are loaded from a .env file by default.

Source code in src/app/config/app_env.py
class AppEnv(BaseSettings):
    """
    Application environment settings loaded from environment variables or .env file.

    This class uses Pydantic's BaseSettings to manage API keys and configuration
    for various inference endpoints, tools, and logging/monitoring services.
    Environment variables are loaded from a .env file by default.
    """

    # Inference endpoints
    # Each key defaults to "" so a missing credential never fails settings
    # load; presumably consumers treat an empty string as "not configured".
    ANTHROPIC_API_KEY: str = ""
    CEREBRAS_API_KEY: str = ""
    COHERE_API_KEY: str = ""
    DEEPSEEK_API_KEY: str = ""
    FIREWORKS_API_KEY: str = ""
    GEMINI_API_KEY: str = ""
    GITHUB_API_KEY: str = ""
    GROK_API_KEY: str = ""
    GROQ_API_KEY: str = ""
    HUGGINGFACE_API_KEY: str = ""
    MISTRAL_API_KEY: str = ""
    NEBIUS_API_KEY: str = ""
    OPENAI_API_KEY: str = ""
    OPENROUTER_API_KEY: str = ""
    PERPLEXITY_API_KEY: str = ""
    RESTACK_API_KEY: str = ""
    SAMBANOVA_API_KEY: str = ""
    TOGETHER_API_KEY: str = ""

    # Tools
    TAVILY_API_KEY: str = ""

    # Logging/Monitoring/Tracing
    AGENTOPS_API_KEY: str = ""
    LOGFIRE_API_KEY: str = ""
    WANDB_API_KEY: str = ""

    # Agent Configuration
    # None means no explicit token limit is configured via the environment.
    AGENT_TOKEN_LIMIT: int | None = None

    # Reason: extra="ignore" lets unrelated variables share the same .env file
    # without raising validation errors
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")

app.config.common_settings

Common settings module using pydantic-settings.

This module implements configuration following 12-Factor #3 (Config) principles:

- Defaults in code (version-controlled)
- Environment variable overrides via EVAL_ prefix
- .env file support for local development

Classes

CommonSettings

Bases: BaseSettings

Common settings for the Agents-eval application.

Configuration follows 12-Factor #3 principles with typed defaults in code and environment variable overrides using the EVAL_ prefix.

Attributes:

Name Type Description
log_level str

Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)

enable_logfire bool

Enable Logfire tracing integration

max_content_length int

Maximum content length for paper content (characters)

Source code in src/app/config/common_settings.py
class CommonSettings(BaseSettings):
    """
    Common settings for the Agents-eval application.

    Configuration follows 12-Factor #3 principles with typed defaults in code
    and environment variable overrides using the EVAL_ prefix.

    Attributes:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        enable_logfire: Enable Logfire tracing integration
        max_content_length: Maximum content length for paper content (characters)
    """

    # Overridable via EVAL_LOG_LEVEL (env_prefix below).
    log_level: str = "INFO"
    # Logfire tracing is opt-in: disabled unless overridden via environment.
    enable_logfire: bool = False
    # Character cap applied to paper content.
    max_content_length: int = 15000

    # Reason: EVAL_ prefix namespaces these overrides; extra="ignore" lets
    # unrelated variables share the same .env file
    model_config = SettingsConfigDict(
        env_prefix="EVAL_", env_file=".env", env_file_encoding="utf-8", extra="ignore"
    )

app.config.config_app

Configuration constants for the application.

app.config.judge_settings

Judge settings module using pydantic-settings.

This module implements evaluation configuration following 12-Factor #3 (Config) principles:

- Defaults in code (version-controlled)
- Environment variable overrides via JUDGE_ prefix
- .env file support for local development

Classes

JudgeSettings

Bases: BaseSettings

Judge settings for the evaluation pipeline.

Configuration follows 12-Factor #3 principles with typed defaults in code and environment variable overrides using the JUDGE_ prefix. Uses pydantic-settings for typed, environment-driven configuration.

Attributes:

Name Type Description
tiers_enabled list[int]

List of enabled evaluation tiers (1=Traditional, 2=LLM, 3=Graph)

tier1_max_seconds float

Tier 1 timeout (Traditional Metrics)

tier2_max_seconds float

Tier 2 timeout (LLM-as-Judge)

tier3_max_seconds float

Tier 3 timeout (Graph Analysis)

total_max_seconds float

Total pipeline timeout

tier1_similarity_metrics list[str]

Similarity metrics for Tier 1

tier1_confidence_threshold float

Confidence threshold for Tier 1

tier1_bertscore_model str

BERTScore model name

tier1_tfidf_max_features int

Max features for TF-IDF

tier2_provider str

LLM provider for Tier 2 evaluation

tier2_model str

LLM model for Tier 2 evaluation

tier2_fallback_provider str

Fallback LLM provider

tier2_fallback_model str

Fallback LLM model

tier2_max_retries int

Max retry attempts for LLM calls

tier2_timeout_seconds float

Request timeout for LLM calls

tier2_cost_budget_usd float

Cost budget for LLM evaluation

tier2_paper_excerpt_length int

Paper excerpt length for LLM context

tier3_min_nodes int

Minimum nodes for graph analysis

tier3_centrality_measures list[str]

Centrality measures for graph analysis

tier3_max_nodes int

Maximum nodes for graph analysis

tier3_max_edges int

Maximum edges for graph analysis

tier3_operation_timeout float

Operation timeout for graph operations

fallback_strategy str

Fallback strategy when tiers fail

composite_accept_threshold float

Score threshold for “accept” recommendation

composite_weak_accept_threshold float

Score threshold for “weak_accept”

composite_weak_reject_threshold float

Score threshold for “weak_reject”

trace_collection bool

Enable trace collection

trace_storage_path str

Directory for trace file storage

logfire_enabled bool

Enable Logfire tracing

logfire_send_to_cloud bool

Send traces to Logfire cloud (requires LOGFIRE_TOKEN)

phoenix_endpoint str

Phoenix local trace viewer endpoint

logfire_service_name str

Service name for tracing

performance_logging bool

Enable performance logging

Source code in src/app/config/judge_settings.py
class JudgeSettings(BaseSettings):
    """
    Judge settings for the evaluation pipeline.

    Configuration follows 12-Factor #3 principles with typed defaults in code
    and environment variable overrides using the JUDGE_ prefix.
    Uses pydantic-settings for typed, environment-driven configuration.

    Attributes:
        tiers_enabled: List of enabled evaluation tiers (1=Traditional, 2=LLM, 3=Graph)
        tier1_max_seconds: Tier 1 timeout (Traditional Metrics)
        tier2_max_seconds: Tier 2 timeout (LLM-as-Judge)
        tier3_max_seconds: Tier 3 timeout (Graph Analysis)
        total_max_seconds: Total pipeline timeout
        tier1_similarity_metrics: Similarity metrics for Tier 1
        tier1_confidence_threshold: Confidence threshold for Tier 1
        tier1_bertscore_model: BERTScore model name
        tier1_tfidf_max_features: Max features for TF-IDF
        tier2_provider: LLM provider for Tier 2 evaluation
        tier2_model: LLM model for Tier 2 evaluation
        tier2_fallback_provider: Fallback LLM provider
        tier2_fallback_model: Fallback LLM model
        tier2_max_retries: Max retry attempts for LLM calls
        tier2_timeout_seconds: Request timeout for LLM calls
        tier2_cost_budget_usd: Cost budget for LLM evaluation
        tier2_paper_excerpt_length: Paper excerpt length for LLM context
        tier3_min_nodes: Minimum nodes for graph analysis
        tier3_centrality_measures: Centrality measures for graph analysis
        tier3_max_nodes: Maximum nodes for graph analysis
        tier3_max_edges: Maximum edges for graph analysis
        tier3_operation_timeout: Operation timeout for graph operations
        fallback_strategy: Fallback strategy when tiers fail
        composite_accept_threshold: Score threshold for "accept" recommendation
        composite_weak_accept_threshold: Score threshold for "weak_accept"
        composite_weak_reject_threshold: Score threshold for "weak_reject"
        trace_collection: Enable trace collection
        trace_storage_path: Directory for trace file storage
        logfire_enabled: Enable Logfire tracing
        logfire_send_to_cloud: Send traces to Logfire cloud (requires LOGFIRE_TOKEN)
        phoenix_endpoint: Phoenix local trace viewer endpoint
        logfire_service_name: Service name for tracing
        performance_logging: Enable performance logging
    """

    # Tiers configuration — all three tiers run by default.
    tiers_enabled: list[int] = Field(default=[1, 2, 3])

    # Performance targets (with validation)
    # NOTE(review): per-tier timeouts sum to 26s > total_max_seconds=25s by
    # default — presumably intentional slack; confirm
    tier1_max_seconds: float = Field(default=1.0, gt=0, le=300)
    tier2_max_seconds: float = Field(default=10.0, gt=0, le=300)
    tier3_max_seconds: float = Field(default=15.0, gt=0, le=300)
    total_max_seconds: float = Field(default=25.0, gt=0, le=300)

    # Tier 1: Traditional Metrics
    tier1_similarity_metrics: list[str] = Field(default=["cosine", "jaccard", "semantic"])
    tier1_confidence_threshold: float = Field(default=0.8)
    tier1_bertscore_model: str = Field(default="distilbert-base-uncased")
    tier1_tfidf_max_features: int = Field(default=5000)

    # Tier 2: LLM-as-Judge
    tier2_provider: str = Field(
        default="auto",
        description="LLM provider for judge. 'auto' inherits the chat provider and model.",
    )
    tier2_model: str = Field(
        default="gpt-4o-mini",
        description="LLM model for judge. Overridden by chat model when tier2_provider=auto.",
    )
    tier2_fallback_provider: str = Field(default="github")
    tier2_fallback_model: str = Field(default="gpt-4o-mini")
    tier2_max_retries: int = Field(default=2)
    tier2_timeout_seconds: float = Field(default=30.0, gt=0, le=300)
    tier2_cost_budget_usd: float = Field(default=0.05)
    tier2_paper_excerpt_length: int = Field(default=2000)

    # Tier 3: Graph Analysis
    tier3_min_nodes: int = Field(default=2, gt=0)
    tier3_centrality_measures: list[str] = Field(default=["betweenness", "closeness", "degree"])
    tier3_max_nodes: int = Field(default=1000, gt=0)
    tier3_max_edges: int = Field(default=5000, gt=0)
    tier3_operation_timeout: float = Field(default=10.0, gt=0, le=300)

    # Composite scoring
    # NOTE(review): thresholds look intended to satisfy
    # accept > weak_accept > weak_reject, but that ordering is not enforced
    # here — confirm whether a model validator should check it
    fallback_strategy: str = Field(default="tier1_only")
    composite_accept_threshold: float = Field(default=0.8, ge=0, le=1)
    composite_weak_accept_threshold: float = Field(default=0.6, ge=0, le=1)
    composite_weak_reject_threshold: float = Field(default=0.4, ge=0, le=1)

    # Observability
    trace_collection: bool = Field(default=True)
    # Default comes from the project-level RUNS_PATH constant.
    trace_storage_path: str = Field(default=RUNS_PATH)
    logfire_enabled: bool = Field(default=True)
    logfire_send_to_cloud: bool = Field(default=False)
    phoenix_endpoint: str = Field(default="http://localhost:6006")
    logfire_service_name: str = Field(default="peerread-evaluation")
    performance_logging: bool = Field(default=True)

    # Reason: JUDGE_ prefix namespaces these overrides; extra="ignore" lets
    # unrelated variables share the same .env file
    model_config = SettingsConfigDict(
        env_prefix="JUDGE_", env_file=".env", env_file_encoding="utf-8", extra="ignore"
    )

    def get_enabled_tiers(self) -> set[int]:
        """
        Get enabled tiers as a set.

        Returns:
            Set of enabled tier numbers for backward compatibility
        """
        return set(self.tiers_enabled)

    def is_tier_enabled(self, tier: int) -> bool:
        """
        Check if a specific tier is enabled.

        Args:
            tier: Tier number to check (1, 2, or 3)

        Returns:
            True if tier is enabled
        """
        return tier in self.tiers_enabled

    def get_performance_targets(self) -> dict[str, float]:
        """
        Get performance targets as dictionary.

        Returns:
            Dictionary of performance targets for backward compatibility
        """
        return {
            "tier1_max_seconds": self.tier1_max_seconds,
            "tier2_max_seconds": self.tier2_max_seconds,
            "tier3_max_seconds": self.tier3_max_seconds,
            "total_max_seconds": self.total_max_seconds,
        }
Functions
get_enabled_tiers()

Get enabled tiers as a set.

Returns:

Type Description
set[int]

Set of enabled tier numbers for backward compatibility

Source code in src/app/config/judge_settings.py
def get_enabled_tiers(self) -> set[int]:
    """
    Get enabled tiers as a set.

    Returns:
        Set of enabled tier numbers for backward compatibility
    """
    # Build a fresh set so callers cannot mutate the underlying config list.
    return {tier_number for tier_number in self.tiers_enabled}
get_performance_targets()

Get performance targets as dictionary.

Returns:

Type Description
dict[str, float]

Dictionary of performance targets for backward compatibility

Source code in src/app/config/judge_settings.py
def get_performance_targets(self) -> dict[str, float]:
    """
    Get performance targets as dictionary.

    Returns:
        Dictionary of performance targets for backward compatibility
    """
    # Dict keys intentionally mirror the attribute names.
    target_names = (
        "tier1_max_seconds",
        "tier2_max_seconds",
        "tier3_max_seconds",
        "total_max_seconds",
    )
    return {name: getattr(self, name) for name in target_names}
is_tier_enabled(tier)

Check if a specific tier is enabled.

Parameters:

Name Type Description Default
tier int

Tier number to check (1, 2, or 3)

required

Returns:

Type Description
bool

True if tier is enabled

Source code in src/app/config/judge_settings.py
def is_tier_enabled(self, tier: int) -> bool:
    """
    Check if a specific tier is enabled.

    Args:
        tier: Tier number to check (1, 2, or 3)

    Returns:
        True if tier is enabled
    """
    for candidate in self.tiers_enabled:
        if candidate == tier:
            return True
    return False

app.config.logfire_config

Logfire + Phoenix tracing configuration model.

Classes

LogfireConfig

Bases: BaseModel

Configuration for Logfire + Phoenix tracing integration.

Constructed from JudgeSettings via from_settings(). All values are controlled by JUDGE_LOGFIRE_* and JUDGE_PHOENIX_* env vars through pydantic-settings.

Source code in src/app/config/logfire_config.py
class LogfireConfig(BaseModel):
    """Configuration for Logfire + Phoenix tracing integration.

    Constructed from JudgeSettings via from_settings(). All values
    are controlled by JUDGE_LOGFIRE_* and JUDGE_PHOENIX_* env vars
    through pydantic-settings.
    """

    enabled: bool = True  # Whether tracing is enabled at all
    send_to_cloud: bool = False  # Keep traces local unless explicitly opted in
    phoenix_endpoint: str = "http://localhost:6006"  # Local Phoenix collector URL
    service_name: str = "peerread-evaluation"  # Service name reported on traces

    @classmethod
    def from_settings(cls, settings: JudgeSettings) -> LogfireConfig:
        """Create LogfireConfig from JudgeSettings.

        Args:
            settings: JudgeSettings instance with logfire fields.

        Returns:
            LogfireConfig populated from pydantic-settings.
        """
        # JudgeSettings carries these values under logfire_*/phoenix_* names;
        # map them onto LogfireConfig's shorter field names.
        return cls(
            enabled=settings.logfire_enabled,
            send_to_cloud=settings.logfire_send_to_cloud,
            phoenix_endpoint=settings.phoenix_endpoint,
            service_name=settings.logfire_service_name,
        )
Functions
from_settings(settings) classmethod

Create LogfireConfig from JudgeSettings.

Parameters:

Name Type Description Default
settings JudgeSettings

JudgeSettings instance with logfire fields.

required

Returns:

Type Description
LogfireConfig

LogfireConfig populated from pydantic-settings.

Source code in src/app/config/logfire_config.py
@classmethod
def from_settings(cls, settings: JudgeSettings) -> LogfireConfig:
    """Create LogfireConfig from JudgeSettings.

    Args:
        settings: JudgeSettings instance with logfire fields.

    Returns:
        LogfireConfig populated from pydantic-settings.
    """
    # JudgeSettings carries these values under logfire_*/phoenix_* names;
    # map them onto LogfireConfig's shorter field names.
    return cls(
        enabled=settings.logfire_enabled,
        send_to_cloud=settings.logfire_send_to_cloud,
        phoenix_endpoint=settings.phoenix_endpoint,
        service_name=settings.logfire_service_name,
    )

app.config.peerread_config

PeerRead dataset configuration model.

Classes

PeerReadConfig

Bases: BaseModel

Configuration for PeerRead dataset management.

Source code in src/app/config/peerread_config.py
class PeerReadConfig(BaseModel):
    """Configuration for PeerRead dataset management.

    Covers download endpoints (GitHub web, GitHub API, and raw content),
    local cache location, venue/split selection, retry and timeout
    behaviour, and the weights used when combining similarity metrics.
    """

    base_url: str = Field(
        default="https://github.com/allenai/PeerRead/tree/master/data",
        description="Base URL for PeerRead dataset",
    )
    github_api_base_url: str = Field(
        default="https://api.github.com/repos/allenai/PeerRead/contents/data",
        description="Base URL for GitHub API to list PeerRead dataset contents",
    )
    raw_github_base_url: str = Field(
        default="https://raw.githubusercontent.com/allenai/PeerRead/master/data",
        description="Base URL for raw GitHub content of PeerRead dataset",
    )
    cache_directory: str = Field(
        default=DATASETS_PEERREAD_PATH,
        description="Local directory for caching downloaded data",
    )
    venues: list[str] = Field(
        default=["acl_2017", "conll_2016", "iclr_2017"],
        description="Available conference venues",
    )
    splits: list[str] = Field(default=["train", "test", "dev"], description="Available data splits")
    max_papers_per_query: int = Field(default=100, description="Maximum papers to return per query")
    download_timeout: int = Field(
        default=30, description="Timeout for download requests in seconds"
    )
    max_retries: int = Field(
        default=5, description="Maximum number of retry attempts for downloads"
    )
    retry_delay_seconds: int = Field(
        default=5, description="Delay in seconds between retry attempts"
    )
    # Weights presumably combine cosine and Jaccard similarity into one score
    # (they sum to 1.0) -- confirm against the similarity evaluator.
    similarity_metrics: dict[str, float] = Field(
        default={"cosine_weight": 0.6, "jaccard_weight": 0.4},
        description="Weights for similarity metrics",
    )

app.data_models.app_models

Data models for agent system configuration and results.

This module defines Pydantic models for representing research and analysis results, summaries, provider and agent configurations, and model dictionaries used throughout the application. These models ensure type safety and validation for data exchanged between agents and system components.

Classes

AgentConfig

Bases: BaseModel

Configuration for an agent

Source code in src/app/data_models/app_models.py
class AgentConfig(BaseModel):
    """Configuration for an agent.

    Bundles the model instance, the expected structured output type, the
    system prompt, optional tools, and the retry budget used to build an
    agent.
    """

    model: Model  # (1) Instance expected
    output_type: type[BaseModel]  # (2) Class expected
    system_prompt: str
    tools: list[Tool[Any]] = []  # (3) List of Tool instances validated at creation
    retries: int = 3  # Retry budget for the agent

    # Avoid pydantic.errors.PydanticSchemaGenerationError:
    # Unable to generate pydantic-core schema for <class 'openai.AsyncOpenAI'>.
    # Avoid Pydantic errors related to non-Pydantic types
    model_config = ConfigDict(
        arbitrary_types_allowed=True
    )  # (4) Suppress Error non-Pydantic types caused by <class 'openai.AsyncOpenAI'>

    # Pydantic v2 recommends stacking @classmethod beneath @field_validator;
    # this also makes the `cls` first argument explicit (no noqa needed).
    @field_validator("tools", mode="before")
    @classmethod
    def validate_tools(cls, v: list[Any]) -> list[Tool[Any]]:
        """Validate that all tools are instances of Tool.

        Args:
            v: Raw value supplied for the ``tools`` field.

        Returns:
            Validated list of Tool instances (empty list for falsy input).

        Raises:
            ValueError: If any element is not a Tool instance.
        """
        if not v:
            return []
        if not all(isinstance(t, Tool) for t in v):
            raise ValueError("All tools must be Tool instances")
        return v
Functions
validate_tools(v)

Validate that all tools are instances of Tool.

Source code in src/app/data_models/app_models.py
# Pydantic v2 recommends stacking @classmethod beneath @field_validator;
# this also makes the `cls` first argument explicit (no noqa needed).
@field_validator("tools", mode="before")
@classmethod
def validate_tools(cls, v: list[Any]) -> list[Tool[Any]]:
    """Validate that all tools are instances of Tool.

    Args:
        v: Raw value supplied for the ``tools`` field.

    Returns:
        Validated list of Tool instances (empty list for falsy input).

    Raises:
        ValueError: If any element is not a Tool instance.
    """
    if not v:
        return []
    if not all(isinstance(t, Tool) for t in v):
        raise ValueError("All tools must be Tool instances")
    return v

AnalysisResult

Bases: BaseModel

Analysis results from the analysis agent.

Source code in src/app/data_models/app_models.py
class AnalysisResult(BaseModel):
    """Analysis results from the analysis agent."""

    insights: list[str]  # Observations produced by the analysis agent
    recommendations: list[str]  # Suggested follow-up actions
    approval: bool  # Overall approve/reject verdict from the analyst

ChatConfig

Bases: BaseModel

Configuration settings for agents and model providers

Source code in src/app/data_models/app_models.py
class ChatConfig(BaseModel):
    """Configuration settings for agents and model providers"""

    providers: dict[str, ProviderConfig]  # Provider name -> provider configuration
    inference: dict[str, str | int]  # Inference settings (string or integer values)
    prompts: dict[str, str]  # Named prompt templates

EndpointConfig

Bases: BaseModel

Configuration for an agent

Source code in src/app/data_models/app_models.py
class EndpointConfig(BaseModel):
    """Configuration for an agent"""

    provider: str  # Provider name
    query: UserPromptType = None  # Optional initial user prompt
    api_key: str | None  # API key; None for providers without one
    prompts: dict[str, str]  # Named prompt templates
    provider_config: ProviderConfig  # Model name and base URL for the provider
    usage_limits: UsageLimits | None = None  # Optional usage limits

ModelDict

Bases: BaseModel

Dictionary of models used to create agent systems

Source code in src/app/data_models/app_models.py
class ModelDict(BaseModel):
    """Dictionary of models used to create agent systems.

    Only the manager model is required; researcher, analyst, and
    synthesiser models are optional.
    """

    model_manager: Model  # Required coordinating model
    model_researcher: Model | None  # None when researcher agent is not used
    model_analyst: Model | None  # None when analyst agent is not used
    model_synthesiser: Model | None  # None when synthesiser agent is not used
    # Model instances are not Pydantic types; allow them without schema generation.
    model_config = ConfigDict(arbitrary_types_allowed=True)

ProviderConfig

Bases: BaseModel

Configuration for a model provider

Source code in src/app/data_models/app_models.py
class ProviderConfig(BaseModel):
    """Configuration for a model provider"""

    model_name: str  # Model identifier used with this provider
    base_url: HttpUrl  # API endpoint, validated as a URL
    usage_limits: int | None = None  # Optional usage cap; None means no cap configured
    max_content_length: int | None = 15000  # Optional content-length cap

ProviderMetadata

Bases: BaseModel

Metadata for an LLM provider.

This model defines the core configuration for each supported provider, serving as a single source of truth for provider settings.

Source code in src/app/data_models/app_models.py
class ProviderMetadata(BaseModel):
    """Metadata for an LLM provider.

    This model defines the core configuration for each supported provider,
    serving as a single source of truth for provider settings.
    """

    name: str  # Provider identifier
    env_key: str | None  # None for providers without API keys (e.g., Ollama)
    model_prefix: str  # Prefix for model names (empty string if not needed)
    default_base_url: str | None = None  # Default API endpoint for OpenAI-compatible providers
    default_model: str | None = None  # Default model ID for the provider

ResearchResult

Bases: BaseModel

Research results from the research agent with flexible structure.

Source code in src/app/data_models/app_models.py
class ResearchResult(BaseModel):
    """Research results from the research agent with flexible structure.

    Each field accepts either a flat form or a structured mapping so the
    model tolerates varied LLM output shapes.
    """

    topic: str | dict[str, str]  # Topic as text or a structured mapping
    findings: list[str] | dict[str, str | list[str]]  # Findings in list or mapping form
    sources: list[str | HttpUrl] | dict[str, str | HttpUrl | list[str | HttpUrl]]  # Source refs

ResearchResultSimple

Bases: BaseModel

Simplified research results for Gemini compatibility.

Source code in src/app/data_models/app_models.py
class ResearchResultSimple(BaseModel):
    """Simplified research results for Gemini compatibility.

    Flat counterpart of ResearchResult: plain strings and lists only,
    no union/mapping field shapes.
    """

    topic: str  # Research topic
    findings: list[str]  # Findings as plain strings
    sources: list[str]  # Source references as plain strings

ResearchSummary

Bases: BaseModel

Expected model response of research on a topic

Source code in src/app/data_models/app_models.py
class ResearchSummary(BaseModel):
    """Expected model response of research on a topic"""

    topic: str  # Topic that was researched
    key_points: list[str]  # Main findings
    key_points_explanation: list[str]  # Explanation for each key point
    conclusion: str  # Overall conclusion
    sources: list[str]  # Source references

app.data_models.evaluation_models

Data models for three-tiered evaluation system.

This module provides Pydantic models for the comprehensive evaluation framework that assesses multi-agent systems on PeerRead scientific paper review generation.

Classes

AgentMetrics

Bases: BaseModel

Simple agent-level metrics for evaluation enhancement.

Source code in src/app/data_models/evaluation_models.py
class AgentMetrics(BaseModel):
    """Simple agent-level metrics for evaluation enhancement."""

    tool_selection_score: float = 0.7  # Default neutral score
    plan_coherence_score: float = 0.7  # Default neutral score
    coordination_score: float = 0.7  # Default neutral score

    def get_agent_composite_score(self) -> float:
        """Calculate simple weighted composite score for agent metrics."""
        # Fixed weights: tool selection 35%, plan coherence 35%, coordination 30%.
        weighted_terms = [
            self.tool_selection_score * 0.35,
            self.plan_coherence_score * 0.35,
            self.coordination_score * 0.30,
        ]
        return weighted_terms[0] + weighted_terms[1] + weighted_terms[2]
Functions
get_agent_composite_score()

Calculate simple weighted composite score for agent metrics.

Source code in src/app/data_models/evaluation_models.py
def get_agent_composite_score(self) -> float:
    """Calculate simple weighted composite score for agent metrics."""
    # Fixed weights: 35% tool selection, 35% plan coherence, 30% coordination.
    parts = (
        self.tool_selection_score * 0.35,
        self.plan_coherence_score * 0.35,
        self.coordination_score * 0.30,
    )
    return parts[0] + parts[1] + parts[2]

BaselineComparison

Bases: BaseModel

Pairwise comparison of two CompositeResult instances.

Captures metric-level and tier-level deltas between two evaluation results, with human-readable summary for interpretation.

Source code in src/app/data_models/evaluation_models.py
class BaselineComparison(BaseModel):
    """Pairwise comparison of two CompositeResult instances.

    Captures metric-level and tier-level deltas between two evaluation results,
    with human-readable summary for interpretation. All deltas are oriented
    as ``result_a - result_b``.
    """

    label_a: str = Field(description="Label for first result (e.g., 'PydanticAI')")
    label_b: str = Field(description="Label for second result (e.g., 'Claude Code solo')")

    result_a: CompositeResult = Field(description="First CompositeResult instance")
    result_b: CompositeResult = Field(description="Second CompositeResult instance")

    metric_deltas: dict[str, float] = Field(
        description="Per-metric deltas (result_a - result_b) for 6 composite metrics"
    )

    tier_deltas: dict[str, float | None] = Field(
        description="Tier-level score differences (Tier 1, Tier 2, Tier 3). None if tier missing."
    )

    summary: str = Field(
        description=(
            "Human-readable comparison summary "
            "(e.g., 'PydanticAI scored +0.12 higher on technical_accuracy vs Claude Code solo')"
        )
    )

CompositeEvaluationResult

Bases: BaseModel

Complete three-tier evaluation result.

Aggregates all evaluation tiers into a single comprehensive assessment with composite scoring and recommendation generation.

Source code in src/app/data_models/evaluation_models.py
class CompositeEvaluationResult(BaseModel):
    """Complete three-tier evaluation result.

    Aggregates all evaluation tiers into a single comprehensive assessment
    with composite scoring and recommendation generation. Tier 2/3 results
    and durations are None when those tiers were not run.
    """

    paper_id: str = Field(description="Evaluated paper identifier")
    agent_review: str = Field(description="Generated review text")

    # Per-tier results; only Tier 1 is mandatory.
    tier1_results: Tier1Result
    tier2_results: Tier2Result | None = None
    tier3_results: Tier3Result | None = None

    composite_score: float = Field(ge=0.0, le=1.0, description="Final weighted score")
    recommendation: str = Field(description="accept/weak_accept/weak_reject/reject")
    confidence: float = Field(ge=0.0, le=1.0, description="Confidence in evaluation")

    # Performance metrics
    tier1_duration: float = Field(description="Tier 1 execution time")
    tier2_duration: float | None = None  # None when Tier 2 skipped
    tier3_duration: float | None = None  # None when Tier 3 skipped
    total_duration: float = Field(description="Total evaluation time")

    # Metadata
    timestamp: str = Field(description="ISO 8601 evaluation timestamp")
    config_version: str = Field(description="Configuration version used")

CompositeResult

Bases: BaseModel

Result of composite scoring across all three evaluation tiers.

Integrates Traditional Metrics, LLM-as-Judge, and Graph Analysis into unified scoring system with recommendation mapping.

Source code in src/app/data_models/evaluation_models.py
class CompositeResult(BaseModel):
    """Result of composite scoring across all three evaluation tiers.

    Integrates Traditional Metrics, LLM-as-Judge, and Graph Analysis
    into unified scoring system with recommendation mapping. All tier
    scores are normalized to [0, 1]; tier2_score is None when the
    LLM-as-Judge tier was skipped.
    """

    composite_score: float = Field(
        ge=0.0, le=1.0, description="Weighted composite score across all tiers"
    )
    recommendation: str = Field(
        description="Recommendation category: accept, weak_accept, weak_reject, reject"
    )
    recommendation_weight: float = Field(
        ge=-1.0, le=1.0, description="Numerical weight for recommendation (-1.0 to 1.0)"
    )

    # Individual metric contributions
    metric_scores: dict[str, float] = Field(
        description="Individual metric values used in composite calculation"
    )

    # Tier-level scores
    tier1_score: float = Field(ge=0.0, le=1.0, description="Traditional metrics overall score")
    tier2_score: float | None = Field(
        default=None, ge=0.0, le=1.0, description="LLM-as-Judge overall score (None if skipped)"
    )
    tier3_score: float = Field(ge=0.0, le=1.0, description="Graph analysis overall score")

    # Evaluation metadata
    evaluation_complete: bool = Field(description="Whether all required tiers completed")
    single_agent_mode: bool = Field(
        default=False,
        description="Whether single-agent mode weight redistribution was applied",
    )
    timestamp: str = Field(description="ISO 8601 evaluation timestamp", default="")
    config_version: str = Field(description="Configuration version used", default="1.0.0")
    weights_used: dict[str, float] | None = Field(
        description="Tier weights used in composite calculation", default=None
    )
    tiers_enabled: list[int] | None = Field(
        description="List of enabled tier numbers", default=None
    )

    agent_assessment_scores: dict[str, float] | None = Field(
        description="Optional agent-level assessment scores", default=None
    )

    # S10-F1: track source engine for downstream display and comparability labeling
    engine_type: str = Field(
        default="mas",
        description="Source engine: 'mas', 'cc_solo', or 'cc_teams'",
    )

ConstructivenessAssessment

Bases: BaseModel

LLM assessment of constructiveness.

Source code in src/app/data_models/evaluation_models.py
class ConstructivenessAssessment(BaseModel):
    """LLM assessment of constructiveness.

    All sub-scores use a 1-5 scale.
    """

    actionable_feedback: float = Field(ge=1.0, le=5.0, description="Actionable feedback score")
    balanced_critique: float = Field(ge=1.0, le=5.0, description="Balanced critique score")
    improvement_guidance: float = Field(ge=1.0, le=5.0, description="Improvement guidance score")
    explanation: str = Field(description="Explanation of the assessment")

EvaluationResults

Bases: BaseModel

Container for all three evaluation tier results.

Source code in src/app/data_models/evaluation_models.py
class EvaluationResults(BaseModel):
    """Container for all three evaluation tier results."""

    tier1: Tier1Result | None = None
    tier2: Tier2Result | None = None
    tier3: Tier3Result | None = None

    def is_complete(self) -> bool:
        """Check if all required tiers have results."""
        # Result models are always truthy, so this is effectively a None check.
        return bool(self.tier1) and bool(self.tier2) and bool(self.tier3)
Functions
is_complete()

Check if all required tiers have results.

Source code in src/app/data_models/evaluation_models.py
def is_complete(self) -> bool:
    """Check if all required tiers have results."""
    # Equivalent to all([...]) over the three tier slots.
    return bool(self.tier1 and self.tier2 and self.tier3)

GraphTraceData

Bases: BaseModel

Trace data structure for graph-based analysis.

Captures execution traces from agent interactions, tool usage, and coordination patterns for NetworkX graph construction.

Source code in src/app/data_models/evaluation_models.py
class GraphTraceData(BaseModel):
    """Trace data structure for graph-based analysis.

    Captures execution traces from agent interactions, tool usage,
    and coordination patterns for NetworkX graph construction.
    """

    execution_id: str = Field(description="Unique execution identifier")
    agent_interactions: list[dict[str, Any]] = Field(
        description="Agent-to-agent communications", default_factory=list
    )
    tool_calls: list[dict[str, Any]] = Field(
        description="Tool usage sequence", default_factory=list
    )
    timing_data: dict[str, Any] = Field(description="Execution timestamps", default_factory=dict)
    coordination_events: list[dict[str, Any]] = Field(
        description="Manager delegation patterns", default_factory=list
    )

    @classmethod
    def from_trace_dict(
        cls, trace: dict[str, Any] | None, fallback_id: str = "minimal"
    ) -> "GraphTraceData":
        """Create GraphTraceData from an execution trace dict, with safe defaults.

        Args:
            trace: Raw execution trace dict, or None for a minimal empty instance.
            fallback_id: Execution ID to use when trace is None.

        Returns:
            GraphTraceData populated from dict or with empty defaults.
        """
        # Note: an empty dict is falsy, so it is treated like None below.
        if trace:
            return cls(
                execution_id=trace.get("execution_id", fallback_id),
                agent_interactions=trace.get("agent_interactions", []),
                tool_calls=trace.get("tool_calls", []),
                timing_data=trace.get("timing_data", {}),
                coordination_events=trace.get("coordination_events", []),
            )
        return cls(execution_id=fallback_id)
Functions
from_trace_dict(trace, fallback_id='minimal') classmethod

Create GraphTraceData from an execution trace dict, with safe defaults.

Parameters:

Name Type Description Default
trace dict[str, Any] | None

Raw execution trace dict, or None for a minimal empty instance.

required
fallback_id str

Execution ID to use when trace is None.

'minimal'

Returns:

Type Description
GraphTraceData

GraphTraceData populated from dict or with empty defaults.

Source code in src/app/data_models/evaluation_models.py
@classmethod
def from_trace_dict(
    cls, trace: dict[str, Any] | None, fallback_id: str = "minimal"
) -> "GraphTraceData":
    """Create GraphTraceData from an execution trace dict, with safe defaults.

    Args:
        trace: Raw execution trace dict, or None for a minimal empty instance.
        fallback_id: Execution ID to use when trace is None.

    Returns:
        GraphTraceData populated from dict or with empty defaults.
    """
    if not trace:
        # Covers both None and an empty dict: build a minimal instance.
        return cls(execution_id=fallback_id)
    kwargs: dict[str, Any] = {"execution_id": trace.get("execution_id", fallback_id)}
    for field_name, empty_default in (
        ("agent_interactions", []),
        ("tool_calls", []),
        ("timing_data", {}),
        ("coordination_events", []),
    ):
        kwargs[field_name] = trace.get(field_name, empty_default)
    return cls(**kwargs)

PeerReadEvalResult

Bases: BaseModel

Result of evaluating agent review against PeerRead ground truth.

Source code in src/app/data_models/evaluation_models.py
class PeerReadEvalResult(BaseModel):
    """Result of evaluating agent review against PeerRead ground truth."""

    paper_id: str = Field(description="Paper being evaluated")
    agent_review: str = Field(description="Review generated by agent")
    ground_truth_reviews: list[PeerReadReview] = Field(
        description="Original peer reviews from dataset"
    )
    similarity_scores: dict[str, float] = Field(
        description="Similarity metrics (semantic, cosine, jaccard)"
    )
    overall_similarity: float = Field(description="Weighted overall similarity score (0-1)")
    recommendation_match: bool = Field(
        description="Whether agent recommendation matches ground truth"
    )

PlanningRationalityAssessment

Bases: BaseModel

LLM assessment of planning rationality.

Source code in src/app/data_models/evaluation_models.py
class PlanningRationalityAssessment(BaseModel):
    """LLM assessment of planning rationality.

    All sub-scores use a 1-5 scale.
    """

    logical_flow: float = Field(ge=1.0, le=5.0, description="Logical flow score")
    decision_quality: float = Field(ge=1.0, le=5.0, description="Decision quality score")
    resource_efficiency: float = Field(ge=1.0, le=5.0, description="Resource efficiency score")
    explanation: str = Field(description="Explanation of the assessment")

TechnicalAccuracyAssessment

Bases: BaseModel

LLM assessment of technical accuracy.

Source code in src/app/data_models/evaluation_models.py
class TechnicalAccuracyAssessment(BaseModel):
    """LLM assessment of technical accuracy.

    All sub-scores use a 1-5 scale.
    """

    factual_correctness: float = Field(ge=1.0, le=5.0, description="Factual correctness score")
    methodology_understanding: float = Field(
        ge=1.0, le=5.0, description="Methodology understanding score"
    )
    domain_knowledge: float = Field(ge=1.0, le=5.0, description="Domain knowledge score")
    explanation: str = Field(description="Explanation of the assessment")

Tier1Result

Bases: BaseModel

Traditional metrics evaluation result.

Contains text similarity metrics, execution performance, and task success indicators using lightweight computational approaches.

Source code in src/app/data_models/evaluation_models.py
class Tier1Result(BaseModel):
    """Traditional metrics evaluation result.

    Contains text similarity metrics, execution performance, and task success
    indicators using lightweight computational approaches. All scores are on
    a [0, 1] scale; execution_time is raw seconds.
    """

    cosine_score: float = Field(ge=0.0, le=1.0, description="TF-IDF cosine similarity")
    jaccard_score: float = Field(ge=0.0, le=1.0, description="Word-level Jaccard similarity")
    semantic_score: float = Field(
        ge=0.0,
        le=1.0,
        description="Levenshtein-based sequence similarity (BERTScore disabled)",
    )
    execution_time: float = Field(ge=0.0, description="Raw execution time in seconds")
    time_score: float = Field(ge=0.0, le=1.0, description="Normalized time score")
    task_success: float = Field(
        description="Continuous task success score (0.0 to 1.0, proportional below threshold)"
    )
    overall_score: float = Field(ge=0.0, le=1.0, description="Weighted traditional metrics score")

Tier2Result

Bases: BaseModel

LLM-as-Judge evaluation result.

Contains quality assessments from large language model evaluation including technical accuracy, constructiveness, and planning rationality.

Source code in src/app/data_models/evaluation_models.py
class Tier2Result(BaseModel):
    """LLM-as-Judge evaluation result.

    Contains quality assessments from large language model evaluation including
    technical accuracy, constructiveness, and planning rationality. All scores
    are normalized to [0, 1].
    """

    technical_accuracy: float = Field(ge=0.0, le=1.0, description="Technical accuracy score")
    constructiveness: float = Field(ge=0.0, le=1.0, description="Constructiveness score")
    planning_rationality: float = Field(ge=0.0, le=1.0, description="Planning quality score")
    overall_score: float = Field(ge=0.0, le=1.0, description="Weighted LLM judge score")
    model_used: str = Field(description="LLM model used for evaluation")
    api_cost: float | None = Field(
        default=None, description="Estimated API cost in USD; None when cost is unavailable"
    )
    fallback_used: bool = Field(default=False, description="Whether fallback was used")

Tier3Result

Bases: BaseModel

Graph-based analysis result.

Contains metrics derived from analyzing agent coordination patterns, tool usage efficiency using NetworkX.

Source code in src/app/data_models/evaluation_models.py
class Tier3Result(BaseModel):
    """Graph-based analysis result.

    Contains metrics derived from analyzing agent coordination patterns,
    tool usage efficiency using NetworkX. All scores are normalized to
    [0, 1]; graph_complexity is a raw node count.
    """

    path_convergence: float = Field(ge=0.0, le=1.0, description="Tool usage efficiency")
    tool_selection_accuracy: float = Field(ge=0.0, le=1.0, description="Tool choice accuracy")
    coordination_centrality: float = Field(ge=0.0, le=1.0, description="Coordination quality")
    task_distribution_balance: float = Field(ge=0.0, le=1.0, description="Load balancing")
    overall_score: float = Field(ge=0.0, le=1.0, description="Weighted graph analysis score")
    graph_complexity: int = Field(description="Number of nodes in interaction graph")

app.data_models.peerread_models

PeerRead dataset data models.

This module defines Pydantic models for representing PeerRead scientific paper review data structures. These models ensure type safety and validation for papers, reviews, and evaluation results used in the multi-agent system evaluation.

The models are based on the actual PeerRead dataset structure validated from: https://raw.githubusercontent.com/allenai/PeerRead/master/data/acl_2017/train/reviews/104.json

This module also includes structured data models for LLM-generated reviews, ensuring consistency and validation against the PeerRead format.

Classes

DownloadResult

Bases: BaseModel

Result of dataset download operation.

Source code in src/app/data_models/peerread_models.py
class DownloadResult(BaseModel):
    """Result of dataset download operation."""

    success: bool = Field(description="Whether download was successful")
    cache_path: str = Field(description="Path to cached data")
    papers_downloaded: int = Field(default=0, description="Number of papers downloaded")
    error_message: str | None = Field(default=None, description="Error message if download failed")

GeneratedReview

Bases: BaseModel

Structured data model for LLM-generated reviews.

This model enforces the PeerRead review format and ensures all required fields are present with proper validation.

Source code in src/app/data_models/peerread_models.py
class GeneratedReview(BaseModel):
    """
    Structured data model for LLM-generated reviews.

    This model enforces the PeerRead review format and ensures
    all required fields are present with proper validation.
    All numeric ratings use a 1-5 integer scale.
    """

    impact: _ScoreInt = Field(
        ..., ge=1, le=5, description="Impact rating (1=minimal, 5=high impact)"
    )

    substance: _ScoreInt = Field(
        ..., ge=1, le=5, description="Substance/depth rating (1=shallow, 5=substantial)"
    )

    appropriateness: _ScoreInt = Field(
        ...,
        ge=1,
        le=5,
        description="Venue appropriateness rating (1=inappropriate, 5=appropriate)",
    )

    meaningful_comparison: _ScoreInt = Field(
        ...,
        ge=1,
        le=5,
        description="Related work comparison rating (1=poor, 5=excellent)",
    )

    presentation_format: _PresentationFormatLiteral = Field(
        ..., description="Recommended presentation format"
    )

    comments: str = Field(
        ...,
        min_length=100,
        description="Detailed review comments covering contributions, strengths, "
        "weaknesses, technical soundness, clarity, and suggestions",
    )

    soundness_correctness: _ScoreInt = Field(
        ...,
        ge=1,
        le=5,
        description="Technical soundness rating (1=many errors, 5=very sound)",
    )

    originality: _ScoreInt = Field(
        ...,
        ge=1,
        le=5,
        description="Originality rating (1=not original, 5=highly original)",
    )

    recommendation: _ScoreInt = Field(
        ...,
        ge=1,
        le=5,
        description=(
            "Overall recommendation (1=strong reject, 2=reject, 3=borderline, "
            "4=accept, 5=strong accept)"
        ),
    )

    clarity: _ScoreInt = Field(
        ...,
        ge=1,
        le=5,
        description="Presentation clarity rating (1=very unclear, 5=very clear)",
    )

    reviewer_confidence: _ScoreInt = Field(
        ...,
        ge=1,
        le=5,
        description="Reviewer confidence rating (1=low confidence, 5=high confidence)",
    )

    @field_validator("comments")
    def validate_comments_structure(cls, v: str) -> str:  # noqa: N805
        """Ensure comments contain key review sections.

        Intentionally lenient: the value is always returned unchanged and
        no error is raised for missing section keywords.
        """
        required_sections = [
            "contributions",
            "strengths",
            "weaknesses",
            "technical",
            "clarity",
        ]

        v_lower = v.lower()
        missing_sections = [section for section in required_sections if section not in v_lower]

        if missing_sections:
            # NOTE(review): despite the original intent to "warn", nothing is
            # emitted here -- missing keywords are tolerated because the LLM
            # may phrase sections differently. `missing_sections` is computed
            # but currently unused.
            pass

        return v

    def to_peerread_format(self) -> dict[str, str | None]:
        """Convert to PeerRead dataset format for compatibility."""
        # Numeric ratings become strings to match the dataset's JSON schema;
        # is_meta_review is always None for generated reviews.
        return {
            "IMPACT": str(self.impact),
            "SUBSTANCE": str(self.substance),
            "APPROPRIATENESS": str(self.appropriateness),
            "MEANINGFUL_COMPARISON": str(self.meaningful_comparison),
            "PRESENTATION_FORMAT": self.presentation_format,
            "comments": self.comments,
            "SOUNDNESS_CORRECTNESS": str(self.soundness_correctness),
            "ORIGINALITY": str(self.originality),
            "RECOMMENDATION": str(self.recommendation),
            "CLARITY": str(self.clarity),
            "REVIEWER_CONFIDENCE": str(self.reviewer_confidence),
            "is_meta_review": None,
        }
Functions
to_peerread_format()

Convert to PeerRead dataset format for compatibility.

Source code in src/app/data_models/peerread_models.py
def to_peerread_format(self) -> dict[str, str | None]:
    """Convert to PeerRead dataset format for compatibility.

    Returns:
        Dict keyed by PeerRead's uppercase field names, with numeric
        scores rendered as strings and is_meta_review always None.
    """
    formatted: dict[str, str | None] = {}
    # Leading numeric scores, stringified in PeerRead's key order.
    for key, score in (
        ("IMPACT", self.impact),
        ("SUBSTANCE", self.substance),
        ("APPROPRIATENESS", self.appropriateness),
        ("MEANINGFUL_COMPARISON", self.meaningful_comparison),
    ):
        formatted[key] = str(score)
    # Free-text fields pass through unchanged.
    formatted["PRESENTATION_FORMAT"] = self.presentation_format
    formatted["comments"] = self.comments
    # Remaining numeric scores, stringified.
    for key, score in (
        ("SOUNDNESS_CORRECTNESS", self.soundness_correctness),
        ("ORIGINALITY", self.originality),
        ("RECOMMENDATION", self.recommendation),
        ("CLARITY", self.clarity),
        ("REVIEWER_CONFIDENCE", self.reviewer_confidence),
    ):
        formatted[key] = str(score)
    formatted["is_meta_review"] = None
    return formatted
validate_comments_structure(v)

Ensure comments contain key review sections.

Source code in src/app/data_models/peerread_models.py
@field_validator("comments")
def validate_comments_structure(cls, v: str) -> str:  # noqa: N805
    """Ensure comments mention the key review sections.

    Checks for the expected section keywords and emits a warning
    (without failing validation) when any are absent, since the LLM
    may legitimately use different wording.

    Args:
        v: Raw review comments text.

    Returns:
        The unchanged comments text.
    """
    required_sections = [
        "contributions",
        "strengths",
        "weaknesses",
        "technical",
        "clarity",
    ]

    v_lower = v.lower()
    missing_sections = [section for section in required_sections if section not in v_lower]

    if missing_sections:
        # Warn but don't fail - LLM might use different wording.
        # Reason: previously this branch was a no-op `pass`, so the
        # computed missing_sections was dead code. Local import keeps
        # module-level dependencies unchanged.
        import warnings

        warnings.warn(
            f"Review comments missing expected sections: {missing_sections}",
            stacklevel=2,
        )

    return v

PeerReadPaper

Bases: BaseModel

Scientific paper from PeerRead dataset.

Source code in src/app/data_models/peerread_models.py
class PeerReadPaper(BaseModel):
    """Scientific paper from PeerRead dataset."""

    # Identifier used to locate this paper's files in the dataset cache.
    paper_id: str = Field(description="Unique paper identifier")
    title: str = Field(description="Paper title")
    abstract: str = Field(description="Paper abstract")
    # All peer reviews attached to this paper in the dataset.
    reviews: list[PeerReadReview] = Field(description="Peer reviews for this paper")
    # Optional revision history; empty list when the dataset provides none.
    review_histories: list[str] = Field(
        default_factory=list, description="Paper revision histories"
    )

PeerReadReview

Bases: BaseModel

Individual peer review from PeerRead dataset.

Note: Some PeerRead papers (e.g., 304-308, 330) lack optional fields. Defaults to “UNKNOWN” for missing review criteria fields.

Accepts both PeerRead uppercase keys (IMPACT) and model lowercase keys (impact) via populate_by_name with aliases. Numeric score fields are coerced to str to handle raw PeerRead JSON integer values.

Source code in src/app/data_models/peerread_models.py
class PeerReadReview(BaseModel):
    """Individual peer review from PeerRead dataset.

    Note: Some PeerRead papers (e.g., 304-308, 330) lack optional fields.
    Defaults to "UNKNOWN" for missing review criteria fields.

    Accepts both PeerRead uppercase keys (IMPACT) and model lowercase keys
    (impact) via populate_by_name with aliases. Numeric score fields are
    coerced to str to handle raw PeerRead JSON integer values.
    """

    # Allow population by field name in addition to the uppercase aliases.
    model_config = ConfigDict(populate_by_name=True)

    # Score fields default to the "UNKNOWN" sentinel when absent from the
    # raw JSON; is_compliant() below keys off exactly this default.
    impact: _ScoreStr = Field(
        default="UNKNOWN", validation_alias="IMPACT", description="Impact score (1-5)"
    )
    substance: _ScoreStr = Field(
        default="UNKNOWN", validation_alias="SUBSTANCE", description="Substance score (1-5)"
    )
    appropriateness: _ScoreStr = Field(
        default="UNKNOWN",
        validation_alias="APPROPRIATENESS",
        description="Appropriateness score (1-5)",
    )
    meaningful_comparison: _ScoreStr = Field(
        default="UNKNOWN",
        validation_alias="MEANINGFUL_COMPARISON",
        description="Meaningful comparison score (1-5)",
    )
    # Non-score fields use ordinary defaults and are never treated as
    # sentinels by is_compliant().
    presentation_format: str = Field(
        default="Poster",
        validation_alias="PRESENTATION_FORMAT",
        description="Presentation format (Poster/Oral)",
    )
    comments: str = Field(default="", description="Detailed review comments")
    soundness_correctness: _ScoreStr = Field(
        default="UNKNOWN",
        validation_alias="SOUNDNESS_CORRECTNESS",
        description="Soundness/correctness score (1-5)",
    )
    originality: _ScoreStr = Field(
        default="UNKNOWN", validation_alias="ORIGINALITY", description="Originality score (1-5)"
    )
    recommendation: _ScoreStr = Field(
        default="UNKNOWN",
        validation_alias="RECOMMENDATION",
        description="Overall recommendation score (1-5)",
    )
    clarity: _ScoreStr = Field(
        default="UNKNOWN", validation_alias="CLARITY", description="Clarity score (1-5)"
    )
    reviewer_confidence: _ScoreStr = Field(
        default="UNKNOWN",
        validation_alias="REVIEWER_CONFIDENCE",
        description="Reviewer confidence score (1-5)",
    )
    # None when the source JSON does not say whether this is a meta review.
    is_meta_review: bool | None = Field(default=None, description="Whether this is a meta review")

    def is_compliant(self) -> bool:
        """Check if all score fields are populated (not UNKNOWN).

        A review is compliant when every field that defaults to UNKNOWN
        has been populated with an actual value from the raw JSON.

        Returns:
            True if all score fields have non-UNKNOWN values.
        """
        # Reason: Derive dynamically from model_fields to stay in sync with field definitions.
        return all(
            getattr(self, name) != "UNKNOWN"
            for name, info in PeerReadReview.model_fields.items()
            if info.default == "UNKNOWN"
        )
Functions
is_compliant()

Check if all score fields are populated (not UNKNOWN).

A review is compliant when every field that defaults to UNKNOWN has been populated with an actual value from the raw JSON.

Returns:

Type Description
bool

True if all score fields have non-UNKNOWN values.

Source code in src/app/data_models/peerread_models.py
def is_compliant(self) -> bool:
    """Check whether every UNKNOWN-defaulted score field holds a real value.

    A review counts as compliant once each field whose declared default is
    the sentinel string "UNKNOWN" has been populated from the raw JSON.

    Returns:
        True if no score field is still set to "UNKNOWN".
    """
    # Reason: Walk model_fields so newly added score fields are covered
    # automatically without touching this method.
    for field_name, field_info in PeerReadReview.model_fields.items():
        if field_info.default != "UNKNOWN":
            continue
        if getattr(self, field_name) == "UNKNOWN":
            return False
    return True

ReviewGenerationResult

Bases: BaseModel

Complete result from the review generation process.

Contains the structured review along with metadata.

Source code in src/app/data_models/peerread_models.py
class ReviewGenerationResult(BaseModel):
    """
    Complete result from the review generation process.

    Contains the structured review along with metadata.
    """

    # Paper being reviewed; matches PeerRead's paper identifier.
    paper_id: str = Field(..., description=("The unique paper identifier provided by PeerRead"))
    # Fix: corrected typo "povided" -> "provided" in the schema description.
    review: GeneratedReview = Field(..., description="The structured review provided by LLM")
    # ISO-8601 timestamp recording when the review was generated.
    timestamp: str = Field(..., description="Generation timestamp in ISO format")
    model_info: str = Field(
        ...,
        description=("Information about the generating model: your model name, version, etc."),
    )

app.data_models.report_models

Data models for evaluation report generation.

This module provides Pydantic models for structured report output including suggestion severity levels and individual suggestion records.

Classes

Suggestion

Bases: BaseModel

A single actionable suggestion derived from evaluation results.

Each suggestion is grounded in a specific metric and tier, with a severity level indicating urgency. The action field provides concrete guidance.

Example

s = Suggestion(metric="cosine_score", tier=1, severity=SuggestionSeverity.CRITICAL, message="Tier 1 cosine score very low (0.08) — vocabulary overlap minimal.", action="Incorporate domain-specific terminology from the paper abstract.")

Source code in src/app/data_models/report_models.py
class Suggestion(BaseModel):
    """A single actionable suggestion derived from evaluation results.

    Each suggestion is grounded in a specific metric and tier, with a severity
    level indicating urgency. The action field provides concrete guidance.

    Example:
        >>> s = Suggestion(
        ...     metric="cosine_score",
        ...     tier=1,
        ...     severity=SuggestionSeverity.CRITICAL,
        ...     message="Tier 1 cosine score very low (0.08) — vocabulary overlap minimal.",
        ...     action="Incorporate domain-specific terminology from the paper abstract.",
        ... )
    """

    # Name of the evaluation metric that produced this suggestion.
    metric: str = Field(
        description="Metric name that triggered this suggestion (e.g., 'cosine_score')"
    )
    # Constrained to the three evaluation tiers; values outside 1-3 are rejected.
    tier: int = Field(
        ge=1, le=3, description="Evaluation tier (1=Traditional, 2=LLM Judge, 3=Graph)"
    )
    severity: SuggestionSeverity = Field(description="Severity level: critical, warning, or info")
    message: str = Field(
        description="Human-readable description of the issue referencing the metric and score"
    )
    action: str = Field(description="Concrete, actionable recommendation to address the issue")

SuggestionSeverity

Bases: StrEnum

Severity level for evaluation suggestions.

Attributes:

Name Type Description
CRITICAL

Score below critical threshold (< 0.2); immediate action required.

WARNING

Score below average (< 0.5); improvement recommended.

INFO

Improvement opportunity; score acceptable but can be enhanced.

Source code in src/app/data_models/report_models.py
class SuggestionSeverity(StrEnum):
    """Severity level for evaluation suggestions.

    Attributes:
        CRITICAL: Score below critical threshold (< 0.2); immediate action required.
        WARNING: Score below average (< 0.5); improvement recommended.
        INFO: Improvement opportunity; score acceptable but can be enhanced.
    """

    # StrEnum members compare equal to their string values, so these can be
    # used directly where a plain severity string is expected.
    CRITICAL = "critical"
    WARNING = "warning"
    INFO = "info"

app.data_utils.datasets_peerread

PeerRead dataset core utilities for download and loading.

This module provides pure dataset functionality for downloading, caching, and loading the PeerRead scientific paper review dataset. It contains no evaluation logic - only data access and management.

Classes

DataTypeSpec dataclass

Specification for a PeerRead data type.

Attributes:

Name Type Description
extension str

File extension including leading dot(s), e.g. ‘.json’.

is_json bool

True if the file content is JSON, False for binary (PDF).

Source code in src/app/data_utils/datasets_peerread.py
@dataclass(frozen=True)
class DataTypeSpec:
    """Specification for a PeerRead data type.

    Attributes:
        extension: File extension including leading dot(s), e.g. '.json'.
        is_json: True if the file content is JSON, False for binary (PDF).
    """

    # Frozen so specs can safely serve as shared module-level constants.
    extension: str
    is_json: bool

PeerReadDownloader

Downloads PeerRead dataset files with caching and validation.

Handles direct download from GitHub repository with progress tracking, error recovery, and integrity verification.

Source code in src/app/data_utils/datasets_peerread.py
class PeerReadDownloader:
    """Downloads PeerRead dataset files with caching and validation.

    Handles direct download from GitHub repository with progress tracking,
    error recovery, and integrity verification.
    """

    def __init__(self, config: PeerReadConfig):
        """Initialize downloader with configuration.

        Args:
            config: PeerRead dataset configuration.
        """
        self.config = config
        # Resolve cache directory relative to project root
        self.cache_dir = resolve_project_path(config.cache_directory)
        headers: dict[str, str] = {}
        app_env = AppEnv()
        if app_env.GITHUB_API_KEY:
            logger.info("Using GitHub API key for authenticated requests")
            headers["Authorization"] = f"token {app_env.GITHUB_API_KEY}"
        self.client = Client(headers=headers)

    def _construct_url(
        self,
        venue: str,
        split: str,
        data_type: str,
        paper_id: str,
    ) -> str:
        """Construct download URL for specific file.

        Args:
            venue: Conference venue (e.g., 'acl_2017').
            split: Data split ('train', 'test', 'dev').
            data_type: Type of data ('reviews', 'parsed_pdfs', 'pdfs').
            paper_id: Unique paper identifier.

        Returns:
            Complete download URL.

        Raises:
            ValueError: If venue or split is invalid.
        """
        if venue not in self.config.venues:
            raise ValueError(f"Invalid venue: {venue}. Valid venues: {self.config.venues}")

        if split not in self.config.splits:
            raise ValueError(f"Invalid split: {split}. Valid splits: {self.config.splits}")

        if data_type not in DATA_TYPE_SPECS:
            raise ValueError(
                f"Invalid data_type: {data_type}. Valid types: {sorted(DATA_TYPE_SPECS)}"
            )

        spec = DATA_TYPE_SPECS[data_type]
        filename = f"{paper_id}{spec.extension}"
        # Fix: interpolate the constructed filename. Previously the URL ended
        # with a literal placeholder and `filename` was unused, so every
        # download request targeted a nonexistent path.
        return f"{self.config.raw_github_base_url}/{venue}/{split}/{data_type}/{filename}"

    def _extract_paper_id_from_filename(
        self,
        filename: str,
        data_type: str,
    ) -> str | None:
        """Extract paper ID from filename based on data type.

        Args:
            filename: Name of the file.
            data_type: Type of data ('reviews', 'parsed_pdfs', 'pdfs').

        Returns:
            Paper ID without extension, or None if filename doesn't match.
        """
        spec = DATA_TYPE_SPECS.get(data_type)
        if spec is None or not filename.endswith(spec.extension):
            return None
        return filename[: -len(spec.extension)]

    def _discover_available_files(
        self,
        venue: str,
        split: str,
        data_type: str,
    ) -> list[str]:
        """Discover available files in a GitHub repository directory.

        Args:
            venue: Conference venue (e.g., 'acl_2017').
            split: Data split ('train', 'test', 'dev').
            data_type: Type of data ('reviews', 'parsed_pdfs', 'pdfs').

        Returns:
            List of paper IDs (without extensions) available in the directory.
        """
        api_url = f"{self.config.github_api_base_url}/{venue}/{split}/{data_type}"

        try:
            # Validate URL for SSRF protection (CVE-2026-25580 mitigation)
            validated_url = validate_url(api_url)
            logger.info(f"Discovering {data_type} files in {venue}/{split} via GitHub API")
            response = self.client.get(validated_url, timeout=self.config.download_timeout)
            response.raise_for_status()

            files_data = response.json()

            paper_ids: list[str] = []
            for file_info in files_data:
                # Skip directories and other non-file entries.
                if file_info.get("type") != "file":
                    continue

                filename = file_info.get("name", "")
                paper_id = self._extract_paper_id_from_filename(filename, data_type)
                if paper_id:
                    paper_ids.append(paper_id)

            logger.info(f"Found {len(paper_ids)} {data_type} files in {venue}/{split}")
            return sorted(paper_ids)

        except (RequestError, HTTPStatusError) as e:
            logger.error(f"Failed to discover {data_type} files for {venue}/{split}: {e}")
            return []
        except (KeyError, ValueError) as e:
            logger.error(
                f"Failed to parse GitHub API response for {venue}/{split}/{data_type}: {e}"
            )
            return []

    def _handle_download_error(
        self,
        error: Exception,
        data_type: str,
        paper_id: str,
    ) -> bool:
        """Handle download errors and determine if retry should continue.

        Args:
            error: The exception that occurred.
            data_type: Type of data being downloaded.
            paper_id: Paper identifier.

        Returns:
            True if retry should continue, False otherwise.
        """
        # HTTP 429 (rate limit) is the only retryable failure; back off first.
        if isinstance(error, HTTPStatusError) and error.response.status_code == 429:
            logger.warning(
                f"Rate limit hit for {data_type}/{paper_id}. "
                f"Retrying in {self.config.retry_delay_seconds} seconds..."
            )
            sleep(self.config.retry_delay_seconds)
            return True

        logger.error(f"Failed to download {data_type}/{paper_id}: {error}")
        return False

    def download_file(
        self,
        venue: str,
        split: str,
        data_type: str,
        paper_id: str,
    ) -> bytes | dict[str, Any] | None:
        """Download a single file.

        Args:
            venue: Conference venue.
            split: Data split.
            data_type: Type of data ('reviews', 'parsed_pdfs', 'pdfs').
            paper_id: Paper identifier.

        Returns:
            File content (JSON dict for .json files, bytes for PDFs),
            or None if download fails.

        Raises:
            ValueError: If venue/split is invalid.
        """
        url = self._construct_url(venue, split, data_type, paper_id)

        for attempt in range(self.config.max_retries):
            try:
                # Validate URL for SSRF protection (CVE-2026-25580 mitigation)
                validated_url = validate_url(url)
                logger.info(
                    f"Downloading {data_type}/{paper_id} from {validated_url} "
                    f"(Attempt {attempt + 1}/{self.config.max_retries})"
                )

                response = self.client.get(validated_url, timeout=self.config.download_timeout)
                response.raise_for_status()

                if DATA_TYPE_SPECS[data_type].is_json:
                    return response.json()
                return response.content

            except (HTTPStatusError, RequestError, JSONDecodeError) as e:
                should_retry = self._handle_download_error(e, data_type, paper_id)
                if not should_retry:
                    return None

        logger.error(
            f"Failed to download {data_type}/{paper_id} after {self.config.max_retries} attempts."
        )
        return None

    def _get_cache_filename(self, data_type: str, paper_id: str) -> str:
        """Get cache filename for given data type and paper ID.

        Args:
            data_type: Type of data ('reviews', 'parsed_pdfs', 'pdfs').
            paper_id: Paper identifier.

        Returns:
            Cache filename.

        Raises:
            ValueError: If data_type is not a known PeerRead data type.
        """
        if data_type not in DATA_TYPE_SPECS:
            raise ValueError(
                f"Invalid data_type: {data_type}. Valid types: {sorted(DATA_TYPE_SPECS)}"
            )
        return f"{paper_id}{DATA_TYPE_SPECS[data_type].extension}"

    def _save_file_data(
        self,
        file_data: bytes | dict[str, Any],
        cache_file: Path,
        data_type: str,
    ) -> None:
        """Save downloaded file data to cache.

        Args:
            file_data: Downloaded file content.
            cache_file: Path to cache file.
            data_type: Type of data being saved.
        """
        spec = DATA_TYPE_SPECS.get(data_type)
        if spec is not None and spec.is_json:
            with open(cache_file, "w", encoding="utf-8") as f:
                dump(file_data, f, indent=2)
        elif isinstance(file_data, bytes):
            with open(cache_file, "wb") as f:
                f.write(file_data)

    def _download_single_data_type(
        self,
        venue: str,
        split: str,
        data_type: str,
        paper_id: str,
        base_cache_path: Path,
        errors: list[str],
    ) -> bool:
        """Download a single data type for a paper.

        Args:
            venue: Conference venue.
            split: Data split.
            data_type: Type of data to download.
            paper_id: Paper identifier.
            base_cache_path: Base cache directory path.
            errors: List to append errors to.

        Returns:
            True if file was downloaded or already cached, False otherwise.
        """
        data_type_path = base_cache_path / data_type
        data_type_path.mkdir(parents=True, exist_ok=True)

        cache_filename = self._get_cache_filename(data_type, paper_id)
        if not cache_filename:
            return False

        cache_file = data_type_path / cache_filename

        # Cached files are never re-downloaded.
        if cache_file.exists():
            logger.debug(f"{data_type}/{paper_id} already cached")
            return True

        file_data = self.download_file(venue, split, data_type, paper_id)
        if file_data is None:
            errors.append(f"Failed to download {data_type}/{paper_id}")
            return False

        self._save_file_data(file_data, cache_file, data_type)
        logger.info(f"Cached {data_type}/{paper_id}")
        return True

    def _download_paper_all_types(
        self,
        venue: str,
        split: str,
        paper_id: str,
        base_cache_path: Path,
        errors: list[str],
    ) -> bool:
        """Download all data types for a single paper.

        Args:
            venue: Conference venue.
            split: Data split.
            paper_id: Paper identifier.
            base_cache_path: Base cache directory path.
            errors: List to append errors to.

        Returns:
            True if at least one file was downloaded successfully.
        """
        data_types = ["reviews", "parsed_pdfs", "pdfs"]
        paper_downloaded = False

        for data_type in data_types:
            success = self._download_single_data_type(
                venue, split, data_type, paper_id, base_cache_path, errors
            )
            if success and not paper_downloaded:
                paper_downloaded = True

        return paper_downloaded

    def download_venue_split(
        self,
        venue: str,
        split: str,
        max_papers: int | None = None,
    ) -> DownloadResult:
        """Download all files for a venue/split combination across all data types.

        Args:
            venue: Conference venue.
            split: Data split.
            max_papers: Maximum number of papers to download.

        Returns:
            DownloadResult with download statistics.
        """
        base_cache_path = self.cache_dir / venue / split
        available_paper_ids = self._discover_available_files(venue, split, "reviews")

        if not available_paper_ids:
            error_msg = f"No review files discovered for {venue}/{split}"
            logger.error(error_msg)
            return DownloadResult(
                success=False,
                cache_path=str(base_cache_path),
                papers_downloaded=0,
                error_message=error_msg,
            )

        max_papers = max_papers or self.config.max_papers_per_query
        paper_ids_to_download = available_paper_ids[:max_papers]
        logger.info(
            f"Will download {len(paper_ids_to_download)} of "
            f"{len(available_paper_ids)} available papers across all data types"
        )

        downloaded = 0
        errors: list[str] = []

        for paper_id in paper_ids_to_download:
            if self._download_paper_all_types(venue, split, paper_id, base_cache_path, errors):
                downloaded += 1

        success = downloaded > 0
        # Cap the reported errors so the message stays readable.
        error_message = None if success else "; ".join(errors[:5])

        return DownloadResult(
            success=success,
            cache_path=str(base_cache_path),
            papers_downloaded=downloaded,
            error_message=error_message,
        )
Functions
__init__(config)

Initialize downloader with configuration.

Parameters:

Name Type Description Default
config PeerReadConfig

PeerRead dataset configuration.

required
Source code in src/app/data_utils/datasets_peerread.py
def __init__(self, config: PeerReadConfig):
    """Set up the downloader from a PeerRead configuration.

    Args:
        config: PeerRead dataset configuration.
    """
    self.config = config
    # Cache lives under the project root regardless of current working dir.
    self.cache_dir = resolve_project_path(config.cache_directory)
    request_headers: dict[str, str] = {}
    env = AppEnv()
    if env.GITHUB_API_KEY:
        logger.info("Using GitHub API key for authenticated requests")
        request_headers["Authorization"] = f"token {env.GITHUB_API_KEY}"
    self.client = Client(headers=request_headers)
download_file(venue, split, data_type, paper_id)

Download a single file.

Parameters:

Name Type Description Default
venue str

Conference venue.

required
split str

Data split.

required
data_type str

Type of data (‘reviews’, ‘parsed_pdfs’, ‘pdfs’).

required
paper_id str

Paper identifier.

required

Returns:

Type Description
bytes | dict[str, Any] | None

File content (JSON dict for .json files, bytes for PDFs),

bytes | dict[str, Any] | None

or None if download fails.

Raises:

Type Description
ValueError

If venue/split is invalid.

Source code in src/app/data_utils/datasets_peerread.py
def download_file(
    self,
    venue: str,
    split: str,
    data_type: str,
    paper_id: str,
) -> bytes | dict[str, Any] | None:
    """Download a single file, retrying on transient failures.

    Args:
        venue: Conference venue.
        split: Data split.
        data_type: Type of data ('reviews', 'parsed_pdfs', 'pdfs').
        paper_id: Paper identifier.

    Returns:
        File content (JSON dict for .json files, bytes for PDFs),
        or None if download fails.

    Raises:
        ValueError: If venue/split is invalid.
    """
    target_url = self._construct_url(venue, split, data_type, paper_id)

    for attempt_index in range(self.config.max_retries):
        try:
            # Validate URL for SSRF protection (CVE-2026-25580 mitigation)
            safe_url = validate_url(target_url)
            logger.info(
                f"Downloading {data_type}/{paper_id} from {safe_url} "
                f"(Attempt {attempt_index + 1}/{self.config.max_retries})"
            )

            response = self.client.get(safe_url, timeout=self.config.download_timeout)
            response.raise_for_status()

            # JSON decoding stays inside the try so JSONDecodeError goes
            # through the same retry handling as transport errors.
            if DATA_TYPE_SPECS[data_type].is_json:
                return response.json()
            return response.content

        except (HTTPStatusError, RequestError, JSONDecodeError) as exc:
            if not self._handle_download_error(exc, data_type, paper_id):
                return None

    logger.error(
        f"Failed to download {data_type}/{paper_id} after {self.config.max_retries} attempts."
    )
    return None
download_venue_split(venue, split, max_papers=None)

Download all files for a venue/split combination across all data types.

Parameters:

Name Type Description Default
venue str

Conference venue.

required
split str

Data split.

required
max_papers int | None

Maximum number of papers to download.

None

Returns:

Type Description
DownloadResult

DownloadResult with download statistics.

Source code in src/app/data_utils/datasets_peerread.py
def download_venue_split(
    self,
    venue: str,
    split: str,
    max_papers: int | None = None,
) -> DownloadResult:
    """Download all files for a venue/split combination across all data types.

    Args:
        venue: Conference venue.
        split: Data split.
        max_papers: Maximum number of papers to download.

    Returns:
        DownloadResult with download statistics.
    """
    base_cache_path = self.cache_dir / venue / split
    # Review files drive discovery; every other data type follows per paper.
    available = self._discover_available_files(venue, split, "reviews")

    if not available:
        message = f"No review files discovered for {venue}/{split}"
        logger.error(message)
        return DownloadResult(
            success=False,
            cache_path=str(base_cache_path),
            papers_downloaded=0,
            error_message=message,
        )

    limit = max_papers or self.config.max_papers_per_query
    selected = available[:limit]
    logger.info(
        f"Will download {len(selected)} of "
        f"{len(available)} available papers across all data types"
    )

    errors: list[str] = []
    downloaded = sum(
        1
        for paper_id in selected
        if self._download_paper_all_types(venue, split, paper_id, base_cache_path, errors)
    )

    succeeded = downloaded > 0
    # Cap the reported errors so the message stays readable.
    return DownloadResult(
        success=succeeded,
        cache_path=str(base_cache_path),
        papers_downloaded=downloaded,
        error_message=None if succeeded else "; ".join(errors[:5]),
    )

PeerReadLoader

Loads and queries PeerRead dataset with structured access.

Source code in src/app/data_utils/datasets_peerread.py
class PeerReadLoader:
    """Loads and queries PeerRead dataset with structured access."""

    def __init__(self, config: PeerReadConfig | None = None):
        """Initialize loader with configuration.

        Args:
            config: PeerRead dataset configuration. Loads from file if None.
        """
        self.config = config or load_peerread_config()
        # Resolve cache directory relative to project root
        self.cache_dir = resolve_project_path(self.config.cache_directory)

    def _extract_text_from_parsed_data(self, parsed_data: dict[str, Any]) -> str:
        """Extract text content from parsed PDF data.

        Args:
            parsed_data: Parsed PDF JSON data.

        Returns:
            Concatenated text from all sections.
        """
        full_text: list[str] = []
        # Sections live under metadata.sections in the parsed-PDF JSON;
        # missing keys fall back to an empty list so nothing raises here.
        sections = parsed_data.get("metadata", {}).get("sections", [])
        for section in sections:
            if "text" in section:
                full_text.append(section["text"])
        return "\n".join(full_text).strip()

    def _load_parsed_file(self, parsed_file: Path) -> str | None:
        """Load and parse a single parsed PDF file.

        Args:
            parsed_file: Path to parsed PDF file.

        Returns:
            Extracted text content, or None if loading fails.
        """
        try:
            with open(parsed_file, encoding="utf-8") as f:
                parsed_data = load(f)
            return self._extract_text_from_parsed_data(parsed_data)
        except Exception as e:
            # Best-effort load: a broken file is logged and skipped, not fatal.
            logger.warning(f"Failed to load/parse {parsed_file}: {e}")
            return None

    def _find_parsed_pdf_in_split(
        self,
        venue: str,
        split: str,
        paper_id: str,
    ) -> str | None:
        """Find and load parsed PDF content in a specific venue/split.

        Args:
            venue: Conference venue.
            split: Data split.
            paper_id: Paper identifier.

        Returns:
            Extracted text content, or None if not found.
        """
        parsed_pdfs_path = self.cache_dir / venue / split / "parsed_pdfs"
        if not parsed_pdfs_path.exists():
            return None

        # reverse=True puts the lexicographically-latest match first —
        # presumably the newest revision; TODO confirm the file naming scheme.
        parsed_files = sorted(parsed_pdfs_path.glob(f"{paper_id}.pdf.json"), reverse=True)
        if not parsed_files:
            return None

        return self._load_parsed_file(parsed_files[0])

    def load_parsed_pdf_content(self, paper_id: str) -> str | None:
        """Load the text content from the parsed PDF for a given paper ID.

        Assumes parsed PDF files are JSON and contain a 'sections' key with 'text'
        within. Defaults to the latest revision if multiple exist (by filename).

        Args:
            paper_id: Unique identifier for the paper.

        Returns:
            str: The extracted text content, or None if not found/parsed.
        """
        # First match across all configured venue/split combinations wins.
        for venue in self.config.venues:
            for split in self.config.splits:
                content = self._find_parsed_pdf_in_split(venue, split, paper_id)
                if content:
                    return content
        return None

    def get_raw_pdf_path(self, paper_id: str) -> str | None:
        """Get the absolute path to the raw PDF file for a given paper ID.

        Args:
            paper_id: Unique identifier for the paper.

        Returns:
            str: The absolute path to the PDF file, or None if not found.
        """
        for venue in self.config.venues:
            for split in self.config.splits:
                pdf_path = self.cache_dir / venue / split / "pdfs" / f"{paper_id}.pdf"
                if pdf_path.exists():
                    return str(pdf_path)
        return None

    def _create_review_from_dict(self, review_data: dict[str, Any]) -> PeerReadReview:
        """Create PeerReadReview from dictionary with optional field handling.

        Args:
            review_data: Review dictionary from PeerRead dataset.

        Returns:
            Validated PeerReadReview model.
        """
        return PeerReadReview.model_validate(review_data)

    def _validate_papers(
        self,
        papers_data: list[dict[str, Any]],
    ) -> list[PeerReadPaper]:
        """Validate and convert paper data to Pydantic models.

        Args:
            papers_data: List of paper dictionaries.

        Returns:
            List of validated PeerReadPaper models.
        """
        validated_papers: list[PeerReadPaper] = []
        skipped_ids: list[str] = []

        for paper_data in papers_data:
            try:
                # Convert from PeerRead format to our model format
                paper_id = str(paper_data.get("id", "unknown"))
                reviews: list[PeerReadReview] = [
                    self._create_review_from_dict(r) for r in paper_data.get("reviews", [])
                ]

                paper = PeerReadPaper(
                    paper_id=paper_id,
                    title=paper_data["title"],
                    abstract=paper_data["abstract"],
                    reviews=reviews,
                    review_histories=[
                        " ".join(map(str, h)) for h in paper_data.get("histories", [])
                    ],
                )

                # Exclude papers where any review is missing required score fields
                # Papers with zero reviews pass this filter and are kept.
                if reviews and not all(r.is_compliant() for r in reviews):
                    skipped_ids.append(paper_id)
                    continue

                validated_papers.append(paper)

            except Exception as e:
                # Validation failures skip the single paper, not the batch.
                logger.warning(f"Failed to validate paper {paper_data.get('id', 'unknown')}: {e}")
                continue

        if skipped_ids:
            logger.info(
                f"Skipping {len(skipped_ids)} non-compliant papers "
                f"(missing required score fields): {', '.join(skipped_ids)}"
            )

        return validated_papers

    def load_papers(
        self,
        venue: str = "acl_2017",
        split: str = "train",
    ) -> list[PeerReadPaper]:
        """Load papers from cached data or download if needed.

        Args:
            venue: Conference venue.
            split: Data split.

        Returns:
            List of validated PeerReadPaper models.

        Raises:
            FileNotFoundError: If cache directory doesn't exist and download fails.
        """
        cache_path = self.cache_dir / venue / split

        if not cache_path.exists():
            error_msg = (
                f"PeerRead dataset not found for {venue}/{split}. "
                f"Please download the dataset first using: "
                f"'python src/app/main.py --download-peerread-only' or "
                f"'make app_cli ARGS=\"--download-peerread-only\"'"
            )
            logger.error(error_msg)
            raise FileNotFoundError(error_msg)

        # Load all cached papers from reviews directory
        reviews_path = cache_path / "reviews"

        if not reviews_path.exists():
            error_msg = (
                f"PeerRead reviews not found for {venue}/{split}. "
                f"Please download the dataset first using: "
                f"'python src/app/main.py --download-peerread-only' or "
                f"'make app_cli ARGS=\"--download-peerread-only\"'"
            )
            logger.error(error_msg)
            raise FileNotFoundError(error_msg)

        papers_data: list[dict[str, Any]] = []
        for json_file in reviews_path.glob("*.json"):
            try:
                with open(json_file, encoding="utf-8") as f:
                    papers_data.append(load(f))
            except Exception as e:
                # Unreadable files are logged and skipped rather than aborting.
                logger.warning(f"Failed to load {json_file}: {e}")
                continue

        return self._validate_papers(papers_data)

    def _load_paper_from_path(self, cache_path: Path, paper_id: str) -> PeerReadPaper | None:
        """Load and validate a paper from a specific cache path.

        Args:
            cache_path: Path to the cached paper JSON file.
            paper_id: Paper identifier for logging.

        Returns:
            Validated PeerReadPaper, or None if loading fails.
        """
        try:
            with open(cache_path, encoding="utf-8") as f:
                data: dict[str, Any] = load(f)
            papers = self._validate_papers([data])
            # _validate_papers may reject the paper (non-compliant reviews),
            # in which case the list comes back empty.
            return papers[0] if papers else None
        except Exception as e:
            logger.warning(f"Failed to load paper {paper_id}: {e}")
            return None

    def get_paper_by_id(self, paper_id: str) -> PeerReadPaper | None:
        """Get a specific paper by ID.

        Args:
            paper_id: Paper identifier.

        Returns:
            PeerReadPaper if found, None otherwise.
        """
        for venue in self.config.venues:
            for split in self.config.splits:
                cache_path = self.cache_dir / venue / split / "reviews" / f"{paper_id}.json"
                if not cache_path.exists():
                    continue

                paper = self._load_paper_from_path(cache_path, paper_id)
                if paper:
                    return paper

        return None

    def query_papers(
        self,
        venue: str | None = None,
        min_reviews: int = 1,
        limit: int | None = None,
    ) -> list[PeerReadPaper]:
        """Query papers with filters.

        Args:
            venue: Filter by venue (None for all venues).
            min_reviews: Minimum number of reviews required.
            limit: Maximum number of papers to return.

        Returns:
            List of filtered PeerReadPaper models.
        """
        all_papers: list[PeerReadPaper] = []
        venues_to_search = [venue] if venue else self.config.venues

        for search_venue in venues_to_search:
            for split in self.config.splits:
                try:
                    papers = self.load_papers(search_venue, split)
                    all_papers.extend(papers)
                except Exception as e:
                    logger.warning(f"Failed to load {search_venue}/{split}: {e}")
                    continue

        # Apply filters
        filtered_papers = [paper for paper in all_papers if len(paper.reviews) >= min_reviews]

        # Apply limit
        # NOTE(review): limit=0 is falsy and therefore ignored here — confirm
        # whether an explicit limit of 0 should instead return no papers.
        if limit:
            filtered_papers = filtered_papers[:limit]

        return filtered_papers
Functions
__init__(config=None)

Initialize loader with configuration.

Parameters:

Name Type Description Default
config PeerReadConfig | None

PeerRead dataset configuration. Loads from file if None.

None
Source code in src/app/data_utils/datasets_peerread.py
def __init__(self, config: PeerReadConfig | None = None):
    """Set up the loader, reading configuration from disk when none is given.

    Args:
        config: PeerRead dataset configuration. Loads from file if None.
    """
    self.config = config or load_peerread_config()
    # The configured cache directory is relative to the project root,
    # so resolve it to an absolute location once, up front.
    self.cache_dir = resolve_project_path(self.config.cache_directory)
get_paper_by_id(paper_id)

Get a specific paper by ID.

Parameters:

Name Type Description Default
paper_id str

Paper identifier.

required

Returns:

Type Description
PeerReadPaper | None

PeerReadPaper if found, None otherwise.

Source code in src/app/data_utils/datasets_peerread.py
def get_paper_by_id(self, paper_id: str) -> PeerReadPaper | None:
    """Look up a single paper across every cached venue/split.

    Args:
        paper_id: Paper identifier.

    Returns:
        PeerReadPaper if found, None otherwise.
    """
    # Walk every (venue, split) pair; the first successfully loaded
    # paper wins.
    candidates = (
        (venue, split)
        for venue in self.config.venues
        for split in self.config.splits
    )
    for venue, split in candidates:
        review_file = self.cache_dir / venue / split / "reviews" / f"{paper_id}.json"
        if not review_file.exists():
            continue
        loaded = self._load_paper_from_path(review_file, paper_id)
        if loaded:
            return loaded
    return None
get_raw_pdf_path(paper_id)

Get the absolute path to the raw PDF file for a given paper ID.

Parameters:

Name Type Description Default
paper_id str

Unique identifier for the paper.

required

Returns:

Name Type Description
str str | None

The absolute path to the PDF file, or None if not found.

Source code in src/app/data_utils/datasets_peerread.py
def get_raw_pdf_path(self, paper_id: str) -> str | None:
    """Get the absolute path to the raw PDF file for a given paper ID.

    Args:
        paper_id: Unique identifier for the paper.

    Returns:
        str: The absolute path to the PDF file, or None if not found.
    """
    for venue in self.config.venues:
        for split in self.config.splits:
            pdf_path = self.cache_dir / venue / split / "pdfs" / f"{paper_id}.pdf"
            if pdf_path.exists():
                return str(pdf_path)
    return None
load_papers(venue='acl_2017', split='train')

Load papers from cached data or download if needed.

Parameters:

Name Type Description Default
venue str

Conference venue.

'acl_2017'
split str

Data split.

'train'

Returns:

Type Description
list[PeerReadPaper]

List of validated PeerReadPaper models.

Raises:

Type Description
FileNotFoundError

If cache directory doesn’t exist and download fails.

Source code in src/app/data_utils/datasets_peerread.py
def load_papers(
    self,
    venue: str = "acl_2017",
    split: str = "train",
) -> list[PeerReadPaper]:
    """Load cached papers for a venue/split and validate each one.

    Args:
        venue: Conference venue.
        split: Data split.

    Returns:
        List of validated PeerReadPaper models.

    Raises:
        FileNotFoundError: If cache directory doesn't exist and download fails.
    """
    split_dir = self.cache_dir / venue / split
    reviews_dir = split_dir / "reviews"

    # Both the split directory and its "reviews" subdirectory must exist;
    # otherwise point the user at the download command and bail out.
    if not split_dir.exists() or not reviews_dir.exists():
        missing = "dataset" if not split_dir.exists() else "reviews"
        error_msg = (
            f"PeerRead {missing} not found for {venue}/{split}. "
            f"Please download the dataset first using: "
            f"'python src/app/main.py --download-peerread-only' or "
            f"'make app_cli ARGS=\"--download-peerread-only\"'"
        )
        logger.error(error_msg)
        raise FileNotFoundError(error_msg)

    collected: list[dict[str, Any]] = []
    for review_file in reviews_dir.glob("*.json"):
        try:
            with open(review_file, encoding="utf-8") as f:
                collected.append(load(f))
        except Exception as e:
            # A single unreadable file is logged and skipped.
            logger.warning(f"Failed to load {review_file}: {e}")

    return self._validate_papers(collected)
load_parsed_pdf_content(paper_id)

Load the text content from the parsed PDF for a given paper ID.

Assumes parsed PDF files are JSON and contain a ‘sections’ key with ‘text’ within. Defaults to the latest revision if multiple exist (by filename).

Parameters:

Name Type Description Default
paper_id str

Unique identifier for the paper.

required

Returns:

Name Type Description
str str | None

The extracted text content, or None if not found/parsed.

Source code in src/app/data_utils/datasets_peerread.py
def load_parsed_pdf_content(self, paper_id: str) -> str | None:
    """Load the text content from the parsed PDF for a given paper ID.

    Assumes parsed PDF files are JSON and contain a 'sections' key with 'text'
    within. Defaults to the latest revision if multiple exist (by filename).

    Args:
        paper_id: Unique identifier for the paper.

    Returns:
        str: The extracted text content, or None if not found/parsed.
    """
    for venue in self.config.venues:
        for split in self.config.splits:
            content = self._find_parsed_pdf_in_split(venue, split, paper_id)
            if content:
                return content
    return None
query_papers(venue=None, min_reviews=1, limit=None)

Query papers with filters.

Parameters:

Name Type Description Default
venue str | None

Filter by venue (None for all venues).

None
min_reviews int

Minimum number of reviews required.

1
limit int | None

Maximum number of papers to return.

None

Returns:

Type Description
list[PeerReadPaper]

List of filtered PeerReadPaper models.

Source code in src/app/data_utils/datasets_peerread.py
def query_papers(
    self,
    venue: str | None = None,
    min_reviews: int = 1,
    limit: int | None = None,
) -> list[PeerReadPaper]:
    """Query papers with filters.

    Args:
        venue: Filter by venue (None for all venues).
        min_reviews: Minimum number of reviews required.
        limit: Maximum number of papers to return. An explicit 0 returns
            an empty list; None disables the limit.

    Returns:
        List of filtered PeerReadPaper models.
    """
    all_papers: list[PeerReadPaper] = []
    venues_to_search = [venue] if venue else self.config.venues

    for search_venue in venues_to_search:
        for split in self.config.splits:
            try:
                papers = self.load_papers(search_venue, split)
                all_papers.extend(papers)
            except Exception as e:
                # Missing/corrupt splits are skipped; query remains best-effort.
                logger.warning(f"Failed to load {search_venue}/{split}: {e}")
                continue

    # Keep only papers that meet the review-count threshold.
    filtered_papers = [paper for paper in all_papers if len(paper.reviews) >= min_reviews]

    # Fix: compare against None so an explicit limit of 0 returns an empty
    # list instead of being silently treated as "no limit".
    if limit is not None:
        filtered_papers = filtered_papers[:limit]

    return filtered_papers

Functions

download_peerread_dataset(peerread_max_papers_per_sample_download=None)

Download PeerRead dataset and verify the download.

This function handles the setup phase separately from MAS execution, following Separation of Concerns principle. It downloads the dataset to the configured path and verifies the download was successful.

Parameters:

Name Type Description Default
peerread_max_papers_per_sample_download int | None

The maximum number of papers to download. If None, downloads all papers it can find.

None

Raises:

Type Description
Exception

If download or verification fails.

Source code in src/app/data_utils/datasets_peerread.py
def download_peerread_dataset(
    peerread_max_papers_per_sample_download: int | None = None,
) -> None:
    """
    Download PeerRead dataset and verify the download.

    This function handles the setup phase separately from MAS execution,
    following Separation of Concerns principle. It downloads the dataset
    to the configured path and verifies the download was successful.

    Args:
        peerread_max_papers_per_sample_download: The maximum number of papers to
            download. If None, downloads all papers it can find.

    Raises:
        RuntimeError: If download or verification fails. (RuntimeError is an
            Exception subclass, so existing ``except Exception`` callers are
            unaffected.)
    """
    logger.info("Starting PeerRead dataset download (setup mode)")

    try:
        config = load_peerread_config()
        logger.info(
            f"Loaded PeerRead config: {len(config.venues)} venues, {len(config.splits)} splits"
        )

        downloader = PeerReadDownloader(config)
        logger.info(f"Download target directory: {downloader.cache_dir}")

        # Explicit caller value wins; otherwise fall back to the configured cap.
        max_papers = (
            peerread_max_papers_per_sample_download
            if peerread_max_papers_per_sample_download is not None
            else config.max_papers_per_query
        )

        total_downloaded, failed_downloads = _perform_downloads(downloader, config, max_papers)

        # Re-load what was just written to confirm the files are usable.
        loader = PeerReadLoader(config)
        verification_count = _verify_downloads(loader, config, failed_downloads)

        logger.info("=== Download Summary ===")
        logger.info(f"Total papers downloaded: {total_downloaded}")
        logger.info(f"Total papers verified: {verification_count}")
        logger.info(f"Download directory: {downloader.cache_dir}")

        _validate_download_results(total_downloaded, verification_count, failed_downloads)

        logger.info("✓ PeerRead dataset download and verification completed successfully")

    except Exception as e:
        error_msg = f"PeerRead dataset download failed: {e}"
        logger.error(error_msg)
        # Fix: raise a concrete RuntimeError instead of the bare Exception base
        # class, while preserving the original cause chain.
        raise RuntimeError(error_msg) from e

load_peerread_config()

Load PeerRead dataset configuration from config file.

Returns:

Name Type Description
PeerReadConfig PeerReadConfig

Validated configuration object.

Raises:

Type Description
FileNotFoundError

If config file doesn’t exist.

ValidationError

If config data is invalid.

Source code in src/app/data_utils/datasets_peerread.py
def load_peerread_config() -> PeerReadConfig:
    """Read and validate the PeerRead dataset configuration file.

    Returns:
        PeerReadConfig: Validated configuration object.

    Raises:
        FileNotFoundError: If config file doesn't exist.
        ValidationError: If config data is invalid.
    """
    # Resolve the configuration file to an absolute path first.
    config_path = resolve_config_path(DATASETS_CONFIG_FILE)
    try:
        # Read the raw JSON, then validate the "peerread" section into a model.
        with open(config_path, encoding="utf-8") as f:
            raw = load(f)
        return PeerReadConfig.model_validate(raw["peerread"])
    except Exception as e:
        logger.error(f"Failed to load PeerRead config: {e}")
        raise

app.data_utils.review_persistence

Review persistence interface for MAS and evaluation system integration.

Classes

ReviewPersistence

Handles saving and loading of MAS-generated reviews.

Source code in src/app/data_utils/review_persistence.py
class ReviewPersistence:
    """Handles saving and loading of MAS-generated reviews."""

    def __init__(self, reviews_dir: str = _DEFAULT_REVIEWS_DIR):
        """Initialize with reviews directory path.

        Args:
            reviews_dir: Directory to store review files
        """
        # Resolve reviews directory relative to project root
        self.reviews_dir = resolve_project_path(reviews_dir)
        # Create the directory (and parents) eagerly so writes never fail
        # on a missing path.
        self.reviews_dir.mkdir(parents=True, exist_ok=True)

    def save_review(
        self,
        paper_id: str,
        review: PeerReadReview,
        timestamp: str | None = None,
        run_dir: Path | None = None,
        structured_review: dict[str, object] | None = None,
        model_info: str | None = None,
    ) -> str:
        """Save a review to the run directory or legacy reviews directory.

        Args:
            paper_id: Unique identifier for the paper
            review: The generated review object
            timestamp: Optional timestamp, defaults to current UTC time
            run_dir: Optional per-run directory; writes review.json there if provided.
            structured_review: Optional GeneratedReview dict with validated scores.
            model_info: Optional model identifier string.

        Returns:
            str: Path to the saved review file
        """
        if run_dir is not None:
            filepath = run_dir / "review.json"
        else:
            if timestamp is None:
                timestamp = datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ")
            filename = f"{paper_id}_{timestamp}.json"
            filepath = self.reviews_dir / filename

        # In the run_dir branch, timestamp may still be None here; the payload
        # falls back to the current UTC time in that case.
        review_data: dict[str, object] = {
            "paper_id": paper_id,
            "timestamp": timestamp or datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ"),
            "review": review.model_dump(),
        }
        if structured_review is not None:
            review_data["structured_review"] = structured_review
        if model_info is not None:
            review_data["model_info"] = model_info

        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(review_data, f, indent=2, ensure_ascii=False)

        # Local import — presumably deferred to avoid a circular dependency;
        # TODO confirm.
        from app.utils.artifact_registry import get_artifact_registry

        get_artifact_registry().register("Review", filepath)

        return str(filepath)

    def load_review(self, filepath: str) -> tuple[str, PeerReadReview]:
        """Load a review from file.

        Args:
            filepath: Path to the review file

        Returns:
            tuple: (paper_id, PeerReadReview object)
        """
        with open(filepath, encoding="utf-8") as f:
            review_data = json.load(f)

        paper_id = review_data["paper_id"]
        # Re-validate the serialized review back into the Pydantic model.
        review = PeerReadReview.model_validate(review_data["review"])

        return paper_id, review

    def list_reviews(self, paper_id: str | None = None) -> list[str]:
        """List available review files.

        Args:
            paper_id: Optional filter by paper ID

        Returns:
            list: Paths to matching review files
        """
        pattern = f"{paper_id}_*.json" if paper_id else "*.json"
        return [str(p) for p in self.reviews_dir.glob(pattern)]

    def get_latest_review(self, paper_id: str) -> str | None:
        """Get the most recent review file for a paper.

        Args:
            paper_id: Paper identifier

        Returns:
            str: Path to latest review file, or None if not found
        """
        reviews = self.list_reviews(paper_id)
        if not reviews:
            return None

        # Sort by timestamp in filename (newest first)
        reviews.sort(reverse=True)
        return reviews[0]
Functions
__init__(reviews_dir=_DEFAULT_REVIEWS_DIR)

Initialize with reviews directory path.

Parameters:

Name Type Description Default
reviews_dir str

Directory to store review files

_DEFAULT_REVIEWS_DIR
Source code in src/app/data_utils/review_persistence.py
def __init__(self, reviews_dir: str = _DEFAULT_REVIEWS_DIR):
    """Initialize with reviews directory path.

    Args:
        reviews_dir: Directory to store review files
    """
    # Resolve reviews directory relative to project root
    self.reviews_dir = resolve_project_path(reviews_dir)
    self.reviews_dir.mkdir(parents=True, exist_ok=True)
get_latest_review(paper_id)

Get the most recent review file for a paper.

Parameters:

Name Type Description Default
paper_id str

Paper identifier

required

Returns:

Name Type Description
str str | None

Path to latest review file, or None if not found

Source code in src/app/data_utils/review_persistence.py
def get_latest_review(self, paper_id: str) -> str | None:
    """Get the most recent review file for a paper.

    Args:
        paper_id: Paper identifier

    Returns:
        str: Path to latest review file, or None if not found
    """
    reviews = self.list_reviews(paper_id)
    if not reviews:
        return None

    # Sort by timestamp in filename (newest first)
    reviews.sort(reverse=True)
    return reviews[0]
list_reviews(paper_id=None)

List available review files.

Parameters:

Name Type Description Default
paper_id str | None

Optional filter by paper ID

None

Returns:

Name Type Description
list list[str]

Paths to matching review files

Source code in src/app/data_utils/review_persistence.py
def list_reviews(self, paper_id: str | None = None) -> list[str]:
    """List available review files.

    Args:
        paper_id: Optional filter by paper ID

    Returns:
        list: Paths to matching review files
    """
    pattern = f"{paper_id}_*.json" if paper_id else "*.json"
    return [str(p) for p in self.reviews_dir.glob(pattern)]
load_review(filepath)

Load a review from file.

Parameters:

Name Type Description Default
filepath str

Path to the review file

required

Returns:

Name Type Description
tuple tuple[str, PeerReadReview]

(paper_id, PeerReadReview object)

Source code in src/app/data_utils/review_persistence.py
def load_review(self, filepath: str) -> tuple[str, PeerReadReview]:
    """Read a persisted review file back into memory.

    Args:
        filepath: Path to the review file

    Returns:
        tuple: (paper_id, PeerReadReview object)
    """
    with open(filepath, encoding="utf-8") as f:
        payload = json.load(f)

    # Rebuild the validated review model from its serialized form.
    return payload["paper_id"], PeerReadReview.model_validate(payload["review"])
save_review(paper_id, review, timestamp=None, run_dir=None, structured_review=None, model_info=None)

Save a review to the run directory or legacy reviews directory.

Parameters:

Name Type Description Default
paper_id str

Unique identifier for the paper

required
review PeerReadReview

The generated review object

required
timestamp str | None

Optional timestamp, defaults to current UTC time

None
run_dir Path | None

Optional per-run directory; writes review.json there if provided.

None
structured_review dict[str, object] | None

Optional GeneratedReview dict with validated scores.

None
model_info str | None

Optional model identifier string.

None

Returns:

Name Type Description
str str

Path to the saved review file

Source code in src/app/data_utils/review_persistence.py
def save_review(
    self,
    paper_id: str,
    review: PeerReadReview,
    timestamp: str | None = None,
    run_dir: Path | None = None,
    structured_review: dict[str, object] | None = None,
    model_info: str | None = None,
) -> str:
    """Persist a review into a run directory or the default reviews folder.

    Args:
        paper_id: Unique identifier for the paper
        review: The generated review object
        timestamp: Optional timestamp, defaults to current UTC time
        run_dir: Optional per-run directory; writes review.json there if provided.
        structured_review: Optional GeneratedReview dict with validated scores.
        model_info: Optional model identifier string.

    Returns:
        str: Path to the saved review file
    """
    # A run directory gets a fixed filename; the legacy layout embeds the
    # paper ID and timestamp in the name instead.
    if run_dir is None:
        if timestamp is None:
            timestamp = datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ")
        filepath = self.reviews_dir / f"{paper_id}_{timestamp}.json"
    else:
        filepath = run_dir / "review.json"

    payload: dict[str, object] = {
        "paper_id": paper_id,
        "timestamp": timestamp or datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ"),
        "review": review.model_dump(),
    }
    for key, value in (
        ("structured_review", structured_review),
        ("model_info", model_info),
    ):
        if value is not None:
            payload[key] = value

    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)

    # Deferred import; the saved file is registered as a "Review" artifact.
    from app.utils.artifact_registry import get_artifact_registry

    get_artifact_registry().register("Review", filepath)

    return str(filepath)

Functions

app.engines.cc_engine

Consolidated Claude Code (CC) engine for solo and teams execution.

Replaces duplicated subprocess logic scattered across run_cli.py, sweep_runner.py, and shell scripts with a single, well-tested Python module.

Critical constraint (from AGENT_LEARNINGS.md): CC teams artifacts are ephemeral in claude -p print mode. This module uses --output-format stream-json with Popen to parse team events from the live stream instead of filesystem artifacts.

Classes

CCResult

Bases: BaseModel

Result of a Claude Code execution (solo or teams mode).

Attributes:

Name Type Description
execution_id str

Session or team identifier extracted from stream.

output_data dict[str, Any]

Parsed JSON output (solo) or aggregated result data (teams).

session_dir str | None

Solo session directory path (from JSON output), if present.

team_artifacts list[dict[str, Any]]

Team-related events parsed from stream-json (teams mode).

Source code in src/app/engines/cc_engine.py
class CCResult(BaseModel):
    """Result of a Claude Code execution (solo or teams mode).

    Attributes:
        execution_id: Session or team identifier extracted from stream.
        output_data: Parsed JSON output (solo) or aggregated result data (teams).
        session_dir: Solo session directory path (from JSON output), if present.
        team_artifacts: Team-related events parsed from stream-json (teams mode).
    """

    # "unknown" is the sentinel used when no init/session event was seen in the stream.
    execution_id: str = Field(default="unknown", description="Session or team execution ID")
    output_data: dict[str, Any] = Field(
        default_factory=dict, description="Parsed output from CC process"
    )
    # Only populated by solo mode (run_cc_solo reads it from CC's JSON stdout);
    # teams-mode results leave it at the default None.
    session_dir: str | None = Field(
        default=None, description="Solo session directory (if provided by CC)"
    )
    # Empty in solo mode; teams mode appends system events with team subtypes.
    team_artifacts: list[dict[str, Any]] = Field(
        default_factory=list, description="Team events parsed from stream-json output"
    )

Functions

build_cc_query(query, paper_id=None, cc_teams=False)

Build a non-empty query for CC engine execution.

When no explicit query is provided but a paper_id is available, generates a default review prompt using DEFAULT_REVIEW_PROMPT_TEMPLATE. In teams mode, prepends a team instruction to increase likelihood of CC spawning teammates.

Parameters:

Name Type Description Default
query str

User-provided query string (may be empty).

required
paper_id str | None

Optional PeerRead paper ID for auto-generating a prompt.

None
cc_teams bool

Whether CC teams mode is enabled.

False

Returns:

Type Description
str

Non-empty query string for CC subprocess.

Raises:

Type Description
ValueError

When both query and paper_id are empty/None.

Example

build_cc_query("", paper_id="1105.1072") "Generate a structured peer review for paper '1105.1072'." build_cc_query("", paper_id="1105.1072", cc_teams=True) "Use a team of agents. Generate a structured peer review for paper '1105.1072'."

Source code in src/app/engines/cc_engine.py
def build_cc_query(query: str, paper_id: str | None = None, cc_teams: bool = False) -> str:
    """Build a non-empty query for CC engine execution.

    An explicit query always wins. Otherwise a default review prompt is
    generated from paper_id via DEFAULT_REVIEW_PROMPT_TEMPLATE; in teams mode
    a team instruction is prepended to increase the likelihood of CC spawning
    teammates.

    Args:
        query: User-provided query string (may be empty).
        paper_id: Optional PeerRead paper ID for auto-generating a prompt.
        cc_teams: Whether CC teams mode is enabled.

    Returns:
        Non-empty query string for CC subprocess.

    Raises:
        ValueError: When both query and paper_id are empty/None.

    Example:
        >>> build_cc_query("", paper_id="1105.1072")
        "Generate a structured peer review for paper '1105.1072'."
        >>> build_cc_query("", paper_id="1105.1072", cc_teams=True)
        "Use a team of agents. Generate a structured peer review for paper '1105.1072'."
    """
    if query:
        return query
    if not paper_id:
        raise ValueError(
            "Either query or paper_id must be provided. Use --query or --paper-id to specify input."
        )
    prompt = DEFAULT_REVIEW_PROMPT_TEMPLATE.format(paper_id=paper_id)
    return f"Use a team of agents. {prompt}" if cc_teams else prompt

cc_result_to_graph_trace(cc_result)

Build GraphTraceData from a CCResult for graph-based analysis.

Solo mode: returns minimal GraphTraceData with empty lists (the composite scorer detects single_agent_mode and redistributes weights).

Teams mode: maps task_started events to agent_interactions and task_completed events to coordination_events.

Parameters:

Name Type Description Default
cc_result CCResult

CCResult from solo or teams execution.

required

Returns:

Type Description
GraphTraceData

GraphTraceData populated from CC artifacts.

Example

result = CCResult(execution_id=”solo-1”, output_data={}) trace = cc_result_to_graph_trace(result) trace.execution_id ‘solo-1’

Source code in src/app/engines/cc_engine.py
def cc_result_to_graph_trace(cc_result: CCResult) -> GraphTraceData:
    """Convert a CCResult into GraphTraceData for graph-based analysis.

    Solo mode yields a minimal GraphTraceData with empty event lists (the
    composite scorer detects single_agent_mode and redistributes weights).
    Teams mode maps ``task_started`` events to agent_interactions and
    ``task_completed`` events to coordination_events.

    Args:
        cc_result: CCResult from solo or teams execution.

    Returns:
        GraphTraceData populated from CC artifacts.

    Example:
        >>> result = CCResult(execution_id="solo-1", output_data={})
        >>> trace = cc_result_to_graph_trace(result)
        >>> trace.execution_id
        'solo-1'
    """
    from app.data_models.evaluation_models import GraphTraceData

    interactions: list[dict[str, Any]] = []
    coordination: list[dict[str, Any]] = []

    # Route each team event by its subtype; unknown subtypes are ignored.
    for event in cc_result.team_artifacts:
        kind = event.get("subtype", "")
        if kind == "task_started":
            interactions.append(_normalize_task_started(event))
        elif kind == "task_completed":
            coordination.append(event)

    return GraphTraceData(
        execution_id=cc_result.execution_id,
        agent_interactions=interactions,
        coordination_events=coordination,
    )

check_cc_available()

Check whether the Claude Code CLI is installed and on PATH.

Returns:

Type Description
bool

True if ‘claude’ binary is found on PATH, False otherwise.

Example

if not check_cc_available(): ... raise RuntimeError("claude CLI required for --engine=cc")

Source code in src/app/engines/cc_engine.py
def check_cc_available() -> bool:
    """Check whether the Claude Code CLI is installed and on PATH.

    Returns:
        True if 'claude' binary is found on PATH, False otherwise.

    Example:
        >>> if not check_cc_available():
        ...     raise RuntimeError("claude CLI required for --engine=cc")
    """
    # shutil.which returns the resolved executable path (truthy) or None.
    return bool(shutil.which("claude"))

extract_cc_review_text(cc_result)

Extract review text from a CC execution result.

Parameters:

Name Type Description Default
cc_result CCResult

CCResult from solo or teams execution.

required

Returns:

Type Description
str

Review text string, or empty string if not present.

Example

result = CCResult(execution_id=”x”, output_data={“result”: “Good paper.”}) extract_cc_review_text(result) ‘Good paper.’

Source code in src/app/engines/cc_engine.py
def extract_cc_review_text(cc_result: CCResult) -> str:
    """Extract review text from a CC execution result.

    Args:
        cc_result: CCResult from solo or teams execution.

    Returns:
        Review text string, or empty string if not present.

    Example:
        >>> result = CCResult(execution_id="x", output_data={"result": "Good paper."})
        >>> extract_cc_review_text(result)
        'Good paper.'
    """
    # Coerce via str() so a non-string "result" payload still yields text.
    review = cc_result.output_data.get("result", "")
    return str(review)

parse_stream_json(stream)

Parse a JSONL stream from CC --output-format stream-json into CCResult.

Extracts: - type=system, subtype=init → session_id becomes execution_id - type=result → duration_ms, total_cost_usd, num_turns → output_data - type=system, subtype in _TEAM_SUBTYPES → appended to team_artifacts

Skips blank lines and malformed JSON without raising.

Parameters:

Name Type Description Default
stream Iterator[str]

Iterator of raw JSONL lines (strings) from CC stdout.

required

Returns:

Type Description
CCResult

CCResult populated from parsed events.

Example

lines = [‘{“type”: “result”, “num_turns”: 3}’] result = parse_stream_json(iter(lines)) result.output_data[“num_turns”] 3

Source code in src/app/engines/cc_engine.py
def parse_stream_json(stream: Iterator[str]) -> CCResult:
    """Parse a JSONL stream from CC ``--output-format stream-json`` into CCResult.

    Extracts:
    - ``type=system, subtype=init`` → ``session_id`` becomes ``execution_id``
    - ``type=result`` → ``duration_ms``, ``total_cost_usd``, ``num_turns`` → ``output_data``
    - ``type=system, subtype in _TEAM_SUBTYPES`` → appended to ``team_artifacts``

    Blank lines and malformed JSON are skipped without raising.

    Args:
        stream: Iterator of raw JSONL lines (strings) from CC stdout.

    Returns:
        CCResult populated from parsed events.

    Example:
        >>> lines = ['{"type": "result", "num_turns": 3}']
        >>> result = parse_stream_json(iter(lines))
        >>> result.output_data["num_turns"]
        3
    """
    # Accumulator mutated in place by _apply_event for each parsed event.
    accumulated: dict[str, Any] = {
        "execution_id": "unknown",
        "output_data": {},
        "team_artifacts": [],
    }

    for line in stream:
        event = _parse_jsonl_line(line)
        if event is None:
            continue
        _apply_event(event, accumulated)

    return CCResult(
        execution_id=accumulated["execution_id"],
        output_data=accumulated["output_data"],
        team_artifacts=accumulated["team_artifacts"],
    )

run_cc_solo(query, timeout=600, run_context=None)

Run Claude Code in solo (headless print) mode.

Uses blocking subprocess.run with --output-format json. The full JSON response is returned as a single object after the process exits.

Parameters:

Name Type Description Default
query str

Prompt string passed to claude -p.

required
timeout int

Maximum seconds to wait for the process. Defaults to 600.

600
run_context RunContext | None

Optional RunContext for per-run output directory.

None

Returns:

Type Description
CCResult

CCResult with output_data from parsed JSON stdout and session_dir if present.

Raises:

Type Description
ValueError

If query fails sanitization (empty, dash-prefixed, over-length) or if stdout cannot be parsed as JSON.

RuntimeError

If the subprocess exits with non-zero code or times out.

Example

result = run_cc_solo(“Summarise this paper”, timeout=300) print(result.execution_id)

Source code in src/app/engines/cc_engine.py
def run_cc_solo(query: str, timeout: int = 600, run_context: RunContext | None = None) -> CCResult:
    """Run Claude Code in solo (headless print) mode.

    Uses blocking ``subprocess.run`` with ``--output-format json``. The full JSON
    response is returned as a single object after the process exits.

    Args:
        query: Prompt string passed to ``claude -p``.
        timeout: Maximum seconds to wait for the process. Defaults to 600.
        run_context: Optional RunContext for per-run output directory.

    Returns:
        CCResult with output_data from parsed JSON stdout and session_dir if present.

    Raises:
        ValueError: If query fails sanitization (empty, dash-prefixed, over-length)
            or if stdout cannot be parsed as JSON.
        RuntimeError: If the subprocess exits with non-zero code or times out.

    Example:
        >>> result = run_cc_solo("Summarise this paper", timeout=300)
        >>> print(result.execution_id)
    """
    query = _sanitize_cc_query(query)
    cmd = ["claude", "-p", query, "--output-format", "json"]
    logger.info(f"CC solo: running query (timeout={timeout}s)")

    try:
        # Reason: query is sanitized by _sanitize_cc_query (empty, dash-prefix, length);
        # shell=False (list args) prevents shell interpretation — no injection risk.
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as e:
        raise RuntimeError(f"CC timed out after {e.timeout}s") from e

    if proc.returncode != 0:
        raise RuntimeError(f"CC failed: {proc.stderr}")

    try:
        data: dict[str, Any] = json.loads(proc.stdout)
    except json.JSONDecodeError as e:
        raise ValueError(f"CC output not valid JSON: {e}") from e

    # Prefer an explicit execution_id, fall back to session_id, else "unknown".
    execution_id = data.get("execution_id", data.get("session_id", "unknown"))
    session_dir: str | None = data.get("session_dir")

    if run_context is not None:
        _persist_solo_stream(proc.stdout, run_context.stream_path)
    else:
        # Reason: fallback when no RunContext — mirror per-run directory structure
        # NOTE(review): naive local time here, whereas review persistence elsewhere
        # uses datetime.now(UTC) — confirm the mix is intentional.
        ts = datetime.now().strftime("%Y%m%dT%H%M%S")
        fallback_dir = Path(CC_RUNS_PATH) / f"{ts}_cc_solo_{execution_id[:8]}"
        fallback_dir.mkdir(parents=True, exist_ok=True)
        _persist_solo_stream(proc.stdout, fallback_dir / "stream.json")

    logger.info(f"CC solo completed: execution_id={execution_id}")
    return CCResult(
        execution_id=execution_id,
        output_data=data,
        session_dir=session_dir,
    )

run_cc_teams(query, timeout=600, run_context=None)

Run Claude Code in teams (agent orchestration) mode.

Uses subprocess.Popen with --output-format stream-json and the CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 environment variable. Team events (TeamCreate, Task) are parsed from the live JSONL stream, since teams artifacts are ephemeral in print mode and not available on the filesystem after the process exits.

Parameters:

Name Type Description Default
query str

Prompt string passed to claude -p.

required
timeout int

Maximum seconds to allow the process to run. Defaults to 600.

600
run_context RunContext | None

Optional RunContext for per-run output directory.

None

Returns:

Type Description
CCResult

CCResult with team_artifacts populated from stream events.

Raises:

Type Description
ValueError

If query is empty, whitespace-only, or exceeds max length.

RuntimeError

If the subprocess exits with non-zero code or times out.

Example

result = run_cc_teams(“Review paper 1234 using a team”, timeout=600) print(len(result.team_artifacts))

Source code in src/app/engines/cc_engine.py
def run_cc_teams(query: str, timeout: int = 600, run_context: RunContext | None = None) -> CCResult:
    """Run Claude Code in teams (agent orchestration) mode.

    Uses ``subprocess.Popen`` with ``--output-format stream-json`` and the
    ``CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1`` environment variable. Team events
    (``TeamCreate``, ``Task``) are parsed from the live JSONL stream, since teams
    artifacts are ephemeral in print mode and not available on the filesystem after
    the process exits.

    Args:
        query: Prompt string passed to ``claude -p``.
        timeout: Maximum seconds to allow the process to run. Defaults to 600.
        run_context: Optional RunContext for per-run output directory.

    Returns:
        CCResult with team_artifacts populated from stream events.

    Raises:
        ValueError: If query is empty, whitespace-only, or exceeds max length.
        RuntimeError: If the subprocess exits with non-zero code or times out.

    Example:
        >>> result = run_cc_teams("Review paper 1234 using a team", timeout=600)
        >>> print(len(result.team_artifacts))
    """
    query = _sanitize_cc_query(query)
    # S8-F3: teams env var required for CC agent orchestration
    env = {**os.environ, "CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1"}
    cmd = ["claude", "-p", query, "--output-format", "stream-json", "--verbose"]
    logger.info(f"CC teams: running query (timeout={timeout}s)")

    if run_context is not None:
        stream_path = run_context.stream_path
        stream_path.parent.mkdir(parents=True, exist_ok=True)
    else:
        # Fallback when no RunContext: create a timestamped per-run directory.
        # NOTE(review): naive local time, unlike the UTC timestamps used for
        # review persistence — confirm intended.
        ts = datetime.now().strftime("%Y%m%dT%H%M%S")
        fallback_dir = Path(CC_RUNS_PATH) / f"{ts}_cc_teams_unknown"
        fallback_dir.mkdir(parents=True, exist_ok=True)
        stream_path = fallback_dir / "stream.jsonl"

    popen_start = time.time()
    try:
        # Reason: query is sanitized by _sanitize_cc_query (empty, dash-prefix, length);
        # shell=False (list args) prevents shell interpretation — no injection risk.
        with subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env,
            # S10-F1: new session so killpg can reach teammate child processes
            start_new_session=True,
        ) as proc:
            try:
                # _tee_stream presumably duplicates each stdout line into
                # stream_path while parse_stream_json consumes it — confirm in helper.
                # `proc.stdout or []` only guards the Optional type; with PIPE it is set.
                tee_stream = _tee_stream(iter(proc.stdout or []), stream_path)
                result = parse_stream_json(tee_stream)
            except subprocess.TimeoutExpired as e:
                # NOTE(review): iterating proc.stdout does not raise TimeoutExpired,
                # so this handler looks unreachable — timeout enforcement actually
                # happens in _wait_with_timeout below. Confirm before relying on it.
                # S10-F1: kill entire process group, not just the lead process
                os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
                proc.kill()
                raise RuntimeError(f"CC timed out after {e.timeout}s") from e

            # Spend whatever budget remains after the stream drained, but wait
            # at least 1 second for the process to exit.
            remaining = max(1, timeout - int(time.time() - popen_start))
            _wait_with_timeout(proc, remaining, timeout)

    except subprocess.TimeoutExpired as e:
        raise RuntimeError(f"CC timed out after {e.timeout}s") from e

    get_artifact_registry().register("CC teams stream", stream_path)

    logger.info(f"CC teams completed: execution_id={result.execution_id}")
    return result

app.judge.baseline_comparison

Baseline comparison engine for CompositeResult diffing.

Provides pairwise comparison of CompositeResult instances across three systems: - PydanticAI MAS (multi-agent system) - Claude Code solo (Claude Code without orchestration) - Claude Code teams (Claude Code with Agent Teams orchestration)

Reuses existing CompositeResult model and CompositeScorer.extract_metric_values().

Classes

Functions

compare(result_a, result_b, label_a, label_b)

Compare two CompositeResult instances and return pairwise diff.

Parameters:

Name Type Description Default
result_a CompositeResult

First CompositeResult instance

required
result_b CompositeResult

Second CompositeResult instance

required
label_a str

Label for first result (e.g., “PydanticAI”)

required
label_b str

Label for second result (e.g., “Claude Code solo”)

required

Returns:

Type Description
BaselineComparison

BaselineComparison with metric deltas, tier deltas, and summary

Note

All deltas are calculated as (result_a - result_b). Positive delta means result_a scored higher.

Source code in src/app/judge/baseline_comparison.py
def compare(
    result_a: CompositeResult,
    result_b: CompositeResult,
    label_a: str,
    label_b: str,
) -> BaselineComparison:
    """Compare two CompositeResult instances and return pairwise diff.

    Args:
        result_a: First CompositeResult instance
        result_b: Second CompositeResult instance
        label_a: Label for first result (e.g., "PydanticAI")
        label_b: Label for second result (e.g., "Claude Code solo")

    Returns:
        BaselineComparison with metric deltas, tier deltas, and summary

    Note:
        All deltas are calculated as (result_a - result_b).
        Positive delta means result_a scored higher.
    """
    # Per-metric deltas over result_a's metrics; metrics absent in result_b count as 0.0
    metric_deltas: dict[str, float] = {}
    for metric in result_a.metric_scores.keys():
        score_a = result_a.metric_scores[metric]
        score_b = result_b.metric_scores.get(metric, 0.0)
        metric_deltas[metric] = score_a - score_b

    # Tier-level deltas; tier2 is None when either side lacks a tier2 score
    tier_deltas: dict[str, float | None] = {
        "tier1": result_a.tier1_score - result_b.tier1_score,
        "tier2": (
            None
            if result_a.tier2_score is None or result_b.tier2_score is None
            else result_a.tier2_score - result_b.tier2_score
        ),
        "tier3": result_a.tier3_score - result_b.tier3_score,
    }

    # Generate human-readable summary
    if not metric_deltas:
        summary = f"{label_a} and {label_b} have no shared metrics to compare"
    else:
        # Calculate average delta across all metrics
        avg_delta: float = sum(metric_deltas.values()) / len(metric_deltas)

        # Metric with the largest absolute delta — its sign may differ from avg_delta's
        max_metric: tuple[str, float] = max(metric_deltas.items(), key=lambda x: abs(x[1]))
        max_metric_name: str = max_metric[0]
        max_metric_delta: str = f"{max_metric[1]:+.2f}"

        # Reason: use the ':+.2f' sign-aware format spec for the largest delta.
        # The previous hard-coded '+' prefix rendered '+-0.50' whenever the
        # largest-magnitude metric moved opposite to the average.
        if avg_delta > 0:
            summary = (
                f"{label_a} scored {avg_delta:+.2f} higher on average vs {label_b} "
                f"(largest diff: {max_metric_name} {max_metric_delta})"
            )
        elif avg_delta < 0:
            summary = (
                f"{label_a} scored {avg_delta:.2f} lower on average vs {label_b} "
                f"(largest diff: {max_metric_name} {max_metric_delta})"
            )
        else:
            summary = f"{label_a} and {label_b} scored identically on average"

    return BaselineComparison(
        label_a=label_a,
        label_b=label_b,
        result_a=result_a,
        result_b=result_b,
        metric_deltas=metric_deltas,
        tier_deltas=tier_deltas,
        summary=summary,
    )

compare_all(pydantic_result, cc_solo_result, cc_teams_result)

Generate all three pairwise comparisons across the three systems.

Parameters:

Name Type Description Default
pydantic_result CompositeResult | None

PydanticAI MAS evaluation result (or None)

required
cc_solo_result CompositeResult | None

Claude Code solo evaluation result (or None)

required
cc_teams_result CompositeResult | None

Claude Code teams evaluation result (or None)

required

Returns:

Type Description
list[BaselineComparison]

List of BaselineComparison instances for all valid pairwise comparisons.

list[BaselineComparison]

Empty list if fewer than 2 results provided.

Note

Skips comparisons involving None results. Order: (PydanticAI vs Claude Code solo, PydanticAI vs Claude Code teams, Claude Code solo vs Claude Code teams)

Source code in src/app/judge/baseline_comparison.py
def compare_all(
    pydantic_result: CompositeResult | None,
    cc_solo_result: CompositeResult | None,
    cc_teams_result: CompositeResult | None,
) -> list[BaselineComparison]:
    """Generate all three pairwise comparisons across the three systems.

    Args:
        pydantic_result: PydanticAI MAS evaluation result (or None)
        cc_solo_result: Claude Code solo evaluation result (or None)
        cc_teams_result: Claude Code teams evaluation result (or None)

    Returns:
        List of BaselineComparison instances for all valid pairwise comparisons.
        Empty list if fewer than 2 results provided.

    Note:
        Skips comparisons involving None results.
        Order: (PydanticAI vs Claude Code solo, PydanticAI vs Claude Code teams,
                Claude Code solo vs Claude Code teams)
    """
    # Fixed pairing order; pairs with a missing side are dropped.
    candidate_pairs = (
        (pydantic_result, cc_solo_result, "PydanticAI", "CC-solo"),
        (pydantic_result, cc_teams_result, "PydanticAI", "CC-teams"),
        (cc_solo_result, cc_teams_result, "CC-solo", "CC-teams"),
    )
    return [
        compare(first, second, name_a, name_b)
        for first, second, name_a, name_b in candidate_pairs
        if first is not None and second is not None
    ]

app.judge.cc_trace_adapter

Claude Code trace adapter for evaluation pipeline integration.

Parses Claude Code artifacts (solo and teams mode) into GraphTraceData format for three-tier evaluation pipeline, enabling side-by-side comparison with PydanticAI MAS runs.

Classes

CCTraceAdapter

Adapter for parsing Claude Code execution artifacts into GraphTraceData.

Supports two modes: - Teams mode: Parses CC Agent Teams artifacts (config.json, inboxes/, tasks/) - Solo mode: Parses single CC session exports (metadata.json, tool_calls.jsonl)

Auto-detects mode from directory structure.

Attributes:

Name Type Description
artifacts_dir

Path to CC artifacts directory

mode Literal['teams', 'solo']

Detected mode (‘teams’ or ‘solo’)

Source code in src/app/judge/cc_trace_adapter.py
class CCTraceAdapter:
    """
    Adapter for parsing Claude Code execution artifacts into GraphTraceData.

    Supports two modes:
    - Teams mode: Parses CC Agent Teams artifacts (config.json, inboxes/, tasks/)
    - Solo mode: Parses single CC session exports (metadata.json, tool_calls.jsonl)

    Auto-detects mode from directory structure.

    Attributes:
        artifacts_dir: Path to CC artifacts directory
        mode: Detected mode ('teams' or 'solo')
    """

    def __init__(self, artifacts_dir: Path, *, tasks_dir: Path | None = None):
        """Initialize adapter with artifacts directory.

        Args:
            artifacts_dir: Path to directory containing CC artifacts (teams mode)
                          or session exports (solo mode)
            tasks_dir: Optional explicit path to tasks directory. If None, will
                      auto-discover for teams mode by checking sibling and child layouts.

        Raises:
            ValueError: If directory does not exist
        """
        if not artifacts_dir.exists():
            raise ValueError(f"Artifacts directory does not exist: {artifacts_dir}")

        self.artifacts_dir = artifacts_dir
        # Reason: mode must be assigned before _resolve_tasks_dir runs — that
        # helper returns None immediately unless self.mode == "teams".
        self.mode: Literal["teams", "solo"] = self._detect_mode()
        self.tasks_dir = self._resolve_tasks_dir(tasks_dir)

        logger.debug(
            f"CCTraceAdapter initialized: mode={self.mode}, teams_path={artifacts_dir}, "
            f"tasks_path={self.tasks_dir}"
        )

    def _detect_mode(self) -> Literal["teams", "solo"]:
        """Auto-detect mode from directory structure.

        Teams mode: config.json exists with 'members' array
        Solo mode: Otherwise (or if config.json doesn't have members array)

        Returns:
            Detected mode string
        """
        config_path = self.artifacts_dir / "config.json"

        if config_path.exists():
            try:
                config = json.loads(config_path.read_text())
                if "members" in config and isinstance(config["members"], list):
                    return "teams"
                # Valid JSON but no members array - treat as incomplete teams config
                # which will fail during parse with clear error message
                if "team_name" in config or "members" in config:
                    return "teams"
            except json.JSONDecodeError:
                # Malformed JSON in config.json likely indicates attempted teams mode
                # Let parse() handle the error with a clear message
                return "teams"

        return "solo"

    def _resolve_tasks_dir(self, explicit_tasks_dir: Path | None) -> Path | None:
        """Resolve tasks directory path for teams mode.

        Supports two directory layouts:
        1. Sibling layout (real CC): ~/.claude/teams/{name}/ + ~/.claude/tasks/{name}/
        2. Child layout (legacy): teams/{name}/tasks/

        Args:
            explicit_tasks_dir: Explicitly provided tasks directory path

        Returns:
            Resolved tasks directory path, or None if not in teams mode or not found
        """
        # Solo mode doesn't use separate tasks directory
        if self.mode != "teams":
            return None

        # If explicitly provided, use it
        if explicit_tasks_dir is not None:
            if explicit_tasks_dir.exists():
                return explicit_tasks_dir
            logger.warning(f"Explicit tasks_dir does not exist: {explicit_tasks_dir}")
            return None

        # Auto-discovery: try sibling layout first (real CC structure)
        # ~/.claude/teams/{team-name}/ -> ~/.claude/tasks/{team-name}/
        team_name = self.artifacts_dir.name
        sibling_tasks = self.artifacts_dir.parent.parent / "tasks" / team_name

        if sibling_tasks.exists():
            logger.debug(f"Found tasks dir via sibling layout: {sibling_tasks}")
            return sibling_tasks

        # Fallback: child layout (backward compatibility)
        child_tasks = self.artifacts_dir / "tasks"

        if child_tasks.exists():
            logger.debug(f"Found tasks dir via child layout: {child_tasks}")
            return child_tasks

        # No tasks directory found (not an error - tasks are optional)
        logger.debug("No tasks directory found (neither sibling nor child layout)")
        return None

    def parse(self) -> GraphTraceData:
        """Parse CC artifacts into GraphTraceData format.

        Dispatches to the teams or solo parser based on the detected mode.

        Returns:
            GraphTraceData instance ready for Tier 3 evaluation

        Raises:
            ValueError: If artifacts are missing or malformed
        """
        handler = self._parse_teams_mode if self.mode == "teams" else self._parse_solo_mode
        return handler()

    def _parse_teams_mode(self) -> GraphTraceData:
        """Parse CC Agent Teams artifacts into GraphTraceData.

        Reads:
        - config.json: team name -> execution_id, members
        - inboxes/*.json: agent messages -> agent_interactions
        - tasks/*.json: task completions -> tool_calls (proxy)

        Returns:
            GraphTraceData with teams mode data

        Raises:
            ValueError: If required artifacts are missing or malformed
        """
        config_path = self.artifacts_dir / "config.json"
        if not config_path.exists():
            raise ValueError("No CC artifacts found: config.json missing in teams mode")

        try:
            config = json.loads(config_path.read_text())
            execution_id = config.get("team_name", "unknown-team")
        except Exception as e:
            raise ValueError(f"Failed to parse config.json: {e}") from e

        # Messages become interactions; completed tasks become proxy tool calls
        interactions = self._parse_agent_messages()
        proxy_tool_calls = self._parse_team_tasks()

        # Derive overall timing from all collected timestamps
        timing = self._derive_timing_data(interactions, proxy_tool_calls)

        return GraphTraceData(
            execution_id=execution_id,
            agent_interactions=interactions,
            tool_calls=proxy_tool_calls,
            timing_data=timing,
            coordination_events=self._extract_coordination_events(),
        )

    def _parse_solo_mode(self) -> GraphTraceData:
        """Parse CC solo session artifacts into GraphTraceData.

        Reads:
        - metadata.json: session_id -> execution_id, start_time, end_time
        - tool_calls.jsonl: tool usage events

        Returns:
            GraphTraceData with solo mode data (empty interactions/coordination)

        Raises:
            ValueError: If required artifacts are missing
        """
        metadata_path = self.artifacts_dir / "metadata.json"
        if not metadata_path.exists():
            raise ValueError("No CC artifacts found: metadata.json missing")

        try:
            metadata = json.loads(metadata_path.read_text())
            execution_id = metadata.get("session_id", "unknown-session")
        except Exception as e:
            raise ValueError(f"Failed to parse metadata.json: {e}") from e

        recorded_calls = self._parse_solo_tool_calls()

        # Solo runs have a single agent: no interactions or coordination events
        return GraphTraceData(
            execution_id=execution_id,
            agent_interactions=[],
            tool_calls=recorded_calls,
            timing_data={
                "start_time": metadata.get("start_time", 0.0),
                "end_time": metadata.get("end_time", 0.0),
            },
            coordination_events=[],
        )

    def _parse_agent_messages(self) -> list[dict[str, Any]]:
        """Parse agent-to-agent messages from inboxes/ directory.

        Returns:
            List of agent interaction dictionaries
        """
        inbox_root = self.artifacts_dir / "inboxes"

        if not inbox_root.exists():
            return []

        parsed: list[dict[str, Any]] = []

        # Sorted glob gives a deterministic message order; unreadable files
        # are logged and skipped rather than aborting the whole parse.
        for candidate in sorted(inbox_root.glob("*.json")):
            try:
                parsed.append(json.loads(candidate.read_text()))
            except Exception as e:
                logger.warning(f"Failed to parse message {candidate}: {e}")

        return parsed

    def _parse_team_tasks(self) -> list[dict[str, Any]]:
        """Parse task completions as proxy tool calls.

        Task completions represent coordination work in teams mode.

        Returns:
            List of tool call dictionaries (derived from tasks)
        """
        # Use resolved tasks directory instead of assuming child layout
        if self.tasks_dir is None or not self.tasks_dir.exists():
            return []

        proxies: list[dict[str, Any]] = []

        for path in sorted(self.tasks_dir.glob("*.json")):
            try:
                task = json.loads(path.read_text())

                # Only completed tasks count as coordination work.
                if task.get("status") != "completed":
                    continue

                finished = task.get("completed_at", 0.0)
                # Map each completed task onto a synthetic tool-call record.
                proxies.append(
                    {
                        "tool_name": f"task_{task.get('id', 'unknown')}",
                        "agent_id": task.get("owner", "unknown"),
                        "timestamp": finished,
                        "duration": finished - task.get("created_at", 0.0),
                        "success": True,
                        "context": task.get("title", ""),
                    }
                )
            except Exception as e:
                # Bad task files (unreadable JSON, non-numeric timestamps)
                # are logged and skipped, preserving best-effort parsing.
                logger.warning(f"Failed to parse task {path}: {e}")

        return proxies

    def _parse_solo_tool_calls(self) -> list[dict[str, Any]]:
        """Parse tool calls from solo session logs.

        Reads tool_calls.jsonl file with one JSON object per line.

        Returns:
            List of tool call dictionaries
        """
        jsonl_path = self.artifacts_dir / "tool_calls.jsonl"

        if not jsonl_path.exists():
            return []

        parsed: list[dict[str, Any]] = []

        try:
            # One JSON object per non-blank line. A malformed line aborts the
            # scan but keeps whatever was parsed before it (best-effort).
            for raw in jsonl_path.read_text().splitlines():
                if raw.strip():
                    parsed.append(json.loads(raw))
        except Exception as e:
            logger.warning(f"Failed to parse tool_calls.jsonl: {e}")

        return parsed

    def _derive_timing_data(
        self,
        agent_interactions: list[dict[str, Any]],
        tool_calls: list[dict[str, Any]],
    ) -> dict[str, float]:
        """Derive timing data from first/last timestamps across all events.

        Args:
            agent_interactions: List of agent message events
            tool_calls: List of tool call events

        Returns:
            Dictionary with start_time and end_time
        """
        all_timestamps: list[float] = []

        for interaction in agent_interactions:
            if "timestamp" in interaction:
                all_timestamps.append(interaction["timestamp"])

        for tool_call in tool_calls:
            if "timestamp" in tool_call:
                all_timestamps.append(tool_call["timestamp"])

        if not all_timestamps:
            return {"start_time": 0.0, "end_time": 0.0}

        return {"start_time": min(all_timestamps), "end_time": max(all_timestamps)}

    def _extract_coordination_events(self) -> list[dict[str, Any]]:
        """Extract coordination events from teams mode inboxes/*.json messages.

        In teams mode, agent-to-agent messages in inboxes/ represent coordination
        events (task assignments, status updates, completions).

        Returns:
            List of coordination event dictionaries parsed from inbox messages.
            Empty list if no inboxes/ directory or not in teams mode.
        """
        inbox_root = self.artifacts_dir / "inboxes"

        if not inbox_root.exists():
            return []

        collected: list[dict[str, Any]] = []

        # Deterministic ordering via sorted glob; per-file failures are
        # logged and skipped so one bad message cannot sink the batch.
        for path in sorted(inbox_root.glob("*.json")):
            try:
                collected.append(json.loads(path.read_text()))
            except Exception as e:
                logger.warning(f"Failed to parse inbox message {path}: {e}")

        return collected
Functions
__init__(artifacts_dir, *, tasks_dir=None)

Initialize adapter with artifacts directory.

Parameters:

Parameters:

- `artifacts_dir` (`Path`, required): Path to the directory containing CC artifacts (teams mode) or session exports (solo mode).
- `tasks_dir` (`Path | None`, default `None`): Optional explicit path to the tasks directory. If `None`, it is auto-discovered for teams mode by checking sibling and child layouts.

Raises:

Type Description
ValueError

If directory does not exist

Source code in src/app/judge/cc_trace_adapter.py
def __init__(self, artifacts_dir: Path, *, tasks_dir: Path | None = None):
    """Initialize adapter with artifacts directory.

    Args:
        artifacts_dir: Path to directory containing CC artifacts (teams mode)
                      or session exports (solo mode)
        tasks_dir: Optional explicit path to tasks directory. If None, will
                  auto-discover for teams mode by checking sibling and child layouts.

    Raises:
        ValueError: If directory does not exist
    """
    # Fail fast on a missing directory; all later parsing assumes it exists.
    if not artifacts_dir.exists():
        raise ValueError(f"Artifacts directory does not exist: {artifacts_dir}")

    self.artifacts_dir = artifacts_dir
    # Mode detection must run before tasks-dir resolution: discovery of the
    # tasks directory depends on whether this is a teams or solo layout.
    self.mode: Literal["teams", "solo"] = self._detect_mode()
    self.tasks_dir = self._resolve_tasks_dir(tasks_dir)

    logger.debug(
        f"CCTraceAdapter initialized: mode={self.mode}, teams_path={artifacts_dir}, "
        f"tasks_path={self.tasks_dir}"
    )
parse()

Parse CC artifacts into GraphTraceData format.

Returns:

Type Description
GraphTraceData

GraphTraceData instance ready for Tier 3 evaluation

Raises:

Type Description
ValueError

If artifacts are missing or malformed

Source code in src/app/judge/cc_trace_adapter.py
def parse(self) -> GraphTraceData:
    """Parse CC artifacts into GraphTraceData format.

    Returns:
        GraphTraceData instance ready for Tier 3 evaluation

    Raises:
        ValueError: If artifacts are missing or malformed
    """
    # Dispatch on the execution mode detected at construction time.
    handler = self._parse_teams_mode if self.mode == "teams" else self._parse_solo_mode
    return handler()

app.judge.composite_scorer

Composite scoring system for three-tiered evaluation framework.

Integrates Traditional Metrics (Tier 1), LLM-as-Judge (Tier 2), and Graph Analysis (Tier 3) into unified scoring system with recommendation mapping.

Classes

CompositeScorer

Composite scoring system that integrates all three evaluation tiers.

Implements the six-metric equal-weight formula:

- time_taken (0.167)
- task_success (0.167)
- coordination_quality (0.167)
- tool_efficiency (0.167)
- planning_rationality (0.167)
- output_similarity (0.167)

Maps scores to recommendation categories with thresholds.

Source code in src/app/judge/composite_scorer.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
class CompositeScorer:
    """
    Composite scoring system that integrates all three evaluation tiers.

    Implements the six-metric equal-weight formula:
    - time_taken (0.167)
    - task_success (0.167)
    - coordination_quality (0.167)
    - tool_efficiency (0.167)
    - planning_rationality (0.167)
    - output_similarity (0.167)

    Maps scores to recommendation categories with thresholds.
    """

    def __init__(
        self,
        settings: "JudgeSettings | None" = None,
    ):
        """Initialize composite scorer with configuration.

        Args:
            settings: JudgeSettings instance. If None, uses default JudgeSettings().
        """
        # Import here to avoid circular dependency
        if settings is None:
            from app.config.judge_settings import JudgeSettings

            settings = JudgeSettings()

        # Use JudgeSettings
        self.settings = settings

        # Equal-weight scoring across six composite metrics
        # NOTE(review): 6 x 0.167 = 1.002, slightly above 1.0. Composite
        # scores are clamped to [0, 1] before use, but get_scoring_summary()
        # will report total_weight = 1.002 — confirm this rounding is intended.
        self.weights = {
            "time_taken": 0.167,
            "task_success": 0.167,
            "coordination_quality": 0.167,
            "tool_efficiency": 0.167,
            "planning_rationality": 0.167,
            "output_similarity": 0.167,
        }
        # Thresholds come from settings; "reject" is the implicit floor.
        self.thresholds = {
            "accept": settings.composite_accept_threshold,
            "weak_accept": settings.composite_weak_accept_threshold,
            "weak_reject": settings.composite_weak_reject_threshold,
            "reject": 0.0,
        }
        # Signed weights used when aggregating recommendations elsewhere.
        self.recommendation_weights = {
            "accept": 1.0,
            "weak_accept": 0.7,
            "weak_reject": -0.7,
            "reject": -1.0,
        }

        logger.info(f"CompositeScorer initialized with JudgeSettings ({len(self.weights)} metrics)")

    def extract_metric_values(self, results: EvaluationResults) -> dict[str, float]:
        """Extract the six composite metrics from tier results.

        Args:
            results: Container with tier1, tier2, tier3 evaluation results

        Returns:
            Dictionary with normalized metric values (0.0 to 1.0)

        Raises:
            ValueError: If required tier results are missing
        """
        if not results.is_complete():
            missing_tiers = []
            if not results.tier1:
                missing_tiers.append("tier1")
            if not results.tier2:
                missing_tiers.append("tier2")
            if not results.tier3:
                missing_tiers.append("tier3")
            raise ValueError(f"Missing required tier results: {missing_tiers}")

        # Extract metrics following the sprint document specification
        # At this point, we know all tiers are non-None due to is_complete() check
        assert results.tier1 is not None, "tier1 should not be None after check"
        assert results.tier2 is not None, "tier2 should not be None after check"
        assert results.tier3 is not None, "tier3 should not be None after check"

        # Reason: Task 4.1 requires these exact 6 metrics with specific source mappings
        # Each metric maps to specific fields from evaluation tier results
        metrics = {
            # From Tier 1: Traditional metrics + execution performance
            # Reason: time_score is already normalized [0,1] where higher = better (faster)
            "time_taken": results.tier1.time_score,
            "task_success": results.tier1.task_success,  # binary completion flag
            "output_similarity": results.tier1.overall_score,  # weighted similarity
            # From Tier 2: LLM-as-Judge quality assessment - use specific metric
            "planning_rationality": results.tier2.planning_rationality,
            # From Tier 3: Graph-based coordination analysis
            "coordination_quality": results.tier3.coordination_centrality,  # centrality
            "tool_efficiency": results.tier3.tool_selection_accuracy,  # tool accuracy
        }

        # Validate all metrics are in valid range
        for metric_name, value in metrics.items():
            if not (0.0 <= value <= 1.0):
                logger.warning(f"Metric {metric_name} = {value:.3f} outside valid range [0.0, 1.0]")
                # Clamp to valid range
                metrics[metric_name] = max(0.0, min(1.0, value))

        logger.debug(f"Extracted metrics: {[(k, f'{v:.3f}') for k, v in metrics.items()]}")
        return metrics

    def calculate_composite_score(self, results: EvaluationResults) -> float:
        """Calculate weighted composite score from all evaluation tiers.

        Args:
            results: Container with tier1, tier2, tier3 evaluation results

        Returns:
            Composite score (0.0 to 1.0)

        Raises:
            ValueError: If required tier results are missing
        """
        metrics = self.extract_metric_values(results)

        # Apply weighted formula from configuration
        composite_score = sum(metrics[metric] * weight for metric, weight in self.weights.items())

        # Ensure score is in valid range
        composite_score = max(0.0, min(1.0, composite_score))

        logger.info(f"Composite score calculated: {composite_score:.3f}")
        contributions = [(m, f"{metrics[m] * self.weights[m]:.3f}") for m in self.weights.keys()]
        logger.debug(f"Metric contributions: {contributions}")

        return composite_score

    def map_to_recommendation(self, composite_score: float) -> str:
        """Map composite score to recommendation category.

        Args:
            composite_score: Composite score (0.0 to 1.0)

        Returns:
            Recommendation category: "accept", "weak_accept", "weak_reject", or "reject"
        """
        # Apply threshold mapping (descending order)
        # The .get() defaults (0.8/0.6/0.4) only apply if a threshold key is
        # missing from self.thresholds, which __init__ always populates.
        if composite_score >= self.thresholds.get("accept", 0.8):
            return "accept"
        elif composite_score >= self.thresholds.get("weak_accept", 0.6):
            return "weak_accept"
        elif composite_score >= self.thresholds.get("weak_reject", 0.4):
            return "weak_reject"
        else:
            return "reject"

    def get_recommendation_weight(self, recommendation: str) -> float:
        """Get numerical weight for recommendation category.

        Args:
            recommendation: Recommendation category

        Returns:
            Numerical weight (-1.0 to 1.0)
        """
        # Unknown categories fall back to a neutral 0.0 weight.
        return self.recommendation_weights.get(recommendation, 0.0)

    def _score_and_recommend(
        self, metrics: dict[str, float], weights: dict[str, float]
    ) -> tuple[float, str, float]:
        """Calculate clamped composite score and map to recommendation.

        Args:
            metrics: Metric name to value mapping.
            weights: Metric name to weight mapping (must share keys with metrics).

        Returns:
            Tuple of (composite_score, recommendation, recommendation_weight).
        """
        composite_score = sum(metrics[m] * w for m, w in weights.items())
        composite_score = max(0.0, min(1.0, composite_score))
        recommendation = self.map_to_recommendation(composite_score)
        recommendation_weight = self.get_recommendation_weight(recommendation)
        return composite_score, recommendation, recommendation_weight

    def _detect_single_agent_mode(self, trace_data: GraphTraceData) -> bool:
        """Detect if execution was single-agent (no multi-agent delegation).

        Single-agent mode is detected when:
        - coordination_events is empty (no delegation), OR
        - 0 or 1 unique agent IDs in tool_calls

        Args:
            trace_data: Graph trace data from agent execution

        Returns:
            True if single-agent mode, False if multi-agent coordination occurred
        """
        # Check coordination events first (most reliable signal)
        if trace_data.coordination_events:
            return False

        # Check unique agent IDs in tool_calls
        agent_ids = {call.get("agent_id") for call in trace_data.tool_calls if "agent_id" in call}
        unique_agent_count = len(agent_ids)

        # 0 or 1 unique agent = single-agent mode
        return unique_agent_count <= 1

    def evaluate_composite(self, results: EvaluationResults) -> CompositeResult:
        """Complete composite evaluation with score and recommendation.

        Args:
            results: Container with tier1, tier2, tier3 evaluation results

        Returns:
            CompositeResult with score, recommendation, and detailed metrics

        Raises:
            ValueError: If required tier results are missing
        """
        try:
            # Calculate composite score
            composite_score = self.calculate_composite_score(results)

            # Map to recommendation
            recommendation = self.map_to_recommendation(composite_score)
            recommendation_weight = self.get_recommendation_weight(recommendation)

            # Extract individual metrics for detailed analysis
            metrics = self.extract_metric_values(results)

            # Create result object
            # We know tiers are non-None since calculate_composite_score succeeded
            assert results.tier1 is not None
            assert results.tier2 is not None
            assert results.tier3 is not None

            # Get enabled tiers for metadata
            enabled_tiers = self.settings.get_enabled_tiers()

            # Reason: Store composite metric weights for transparency
            # These show how each metric contributes to final score
            composite_weights = self.weights.copy()

            result = CompositeResult(
                composite_score=composite_score,
                recommendation=recommendation,
                recommendation_weight=recommendation_weight,
                metric_scores=metrics,
                tier1_score=results.tier1.overall_score,
                tier2_score=results.tier2.overall_score,
                tier3_score=results.tier3.overall_score,
                evaluation_complete=results.is_complete(),
                weights_used=composite_weights,
                tiers_enabled=sorted(enabled_tiers),
            )

            logger.info(f"Composite evaluation complete: {composite_score:.3f} → {recommendation}")
            return result

        except Exception as e:
            logger.error(f"Composite evaluation failed: {e}")
            raise

    def get_scoring_summary(self) -> dict[str, Any]:
        """Get summary of scoring configuration for validation.

        Returns:
            Dictionary with configuration summary
        """
        # NOTE(review): total_weight reflects the raw sum of configured
        # weights (1.002 with the default six 0.167 weights).
        return {
            "metrics_count": len(self.weights),
            "total_weight": sum(self.weights.values()),
            "weights": self.weights.copy(),
            "thresholds": self.thresholds.copy(),
            "recommendation_weights": self.recommendation_weights.copy(),
        }

    def _calculate_tool_score(self, tools_used: list[str]) -> float:
        """Calculate tool selection score based on usage count."""
        # Heuristic: no tools = 0.3; 1-5 tools = 0.8; beyond 5, decay by 0.1
        # per extra tool with a floor of 0.4.
        tool_count = len(tools_used)
        if tool_count == 0:
            return 0.3
        if tool_count > 5:
            return max(0.4, 0.8 - (tool_count - 5) * 0.1)
        return 0.8

    def _calculate_coherence_score(
        self, error_occurred: bool, output_length: int, execution_time: float
    ) -> float:
        """Calculate plan coherence score based on execution quality."""
        # Start from a neutral baseline and apply rule-based adjustments.
        score = 0.7
        if error_occurred:
            score -= 0.4
        if output_length > 100:
            score += 0.1
        elif output_length < 20:
            score -= 0.2
        if execution_time > 30.0:
            score -= 0.2
        return max(0.0, min(1.0, score))

    def _calculate_coordination_score(self, delegation_count: int, output_length: int) -> float:
        """Calculate coordination score based on delegation and output quality."""
        # Moderate delegation (1-3) is rewarded; excessive delegation is
        # penalized linearly beyond 3.
        score = 0.7
        if delegation_count > 0:
            if delegation_count <= 3:
                score += 0.2
            else:
                score -= (delegation_count - 3) * 0.1
        if output_length > 50:
            score += 0.1
        return max(0.0, min(1.0, score))

    def assess_agent_performance(
        self,
        execution_time: float,
        tools_used: list[str],
        delegation_count: int = 0,
        error_occurred: bool = False,
        output_length: int = 0,
    ) -> AgentMetrics:
        """Assess agent performance with simple rule-based metrics.

        Args:
            execution_time: Time taken for agent execution in seconds
            tools_used: List of tools used during execution
            delegation_count: Number of delegations made (for manager agents)
            error_occurred: Whether an error occurred during execution
            output_length: Length of output result in characters

        Returns:
            AgentMetrics with evaluated scores
        """
        tool_score = self._calculate_tool_score(tools_used)
        coherence_score = self._calculate_coherence_score(
            error_occurred, output_length, execution_time
        )
        coordination_score = self._calculate_coordination_score(delegation_count, output_length)

        agent_metrics = AgentMetrics(
            tool_selection_score=tool_score,
            plan_coherence_score=coherence_score,
            coordination_score=coordination_score,
        )

        logger.debug(
            f"Agent assessment: tool={tool_score:.3f}, coherence={coherence_score:.3f}, "
            f"coordination={coordination_score:.3f}"
        )
        return agent_metrics

    def _determine_excluded_metrics(
        self, single_agent_mode: bool, tier2_available: bool
    ) -> list[str]:
        """Determine which metrics to exclude based on execution mode.

        Args:
            single_agent_mode: Whether single-agent mode detected
            tier2_available: Whether Tier 2 results are available

        Returns:
            List of metric names to exclude from composite scoring
        """
        excluded_metrics: list[str] = []
        if single_agent_mode:
            excluded_metrics.append("coordination_quality")
            logger.info(
                "Single-agent mode detected - redistributing coordination_quality weight "
                "to remaining metrics"
            )

        if not tier2_available:
            excluded_metrics.append("planning_rationality")
            logger.warning(
                "Tier 2 (LLM-as-Judge) skipped - redistributing planning_rationality weight"
            )

        return excluded_metrics

    def _extract_tier1_metrics(
        self, tier1: Tier1Result, remaining_metrics: dict[str, float]
    ) -> dict[str, float]:
        """Extract Tier 1 metrics if they are not excluded."""
        metrics: dict[str, float] = {}
        if "time_taken" in remaining_metrics:
            metrics["time_taken"] = tier1.time_score
        if "task_success" in remaining_metrics:
            metrics["task_success"] = tier1.task_success
        if "output_similarity" in remaining_metrics:
            metrics["output_similarity"] = tier1.overall_score
        return metrics

    def _extract_tier3_metrics(
        self, tier3: Tier3Result, remaining_metrics: dict[str, float]
    ) -> dict[str, float]:
        """Extract Tier 3 metrics if they are not excluded."""
        metrics: dict[str, float] = {}
        if "coordination_quality" in remaining_metrics:
            metrics["coordination_quality"] = tier3.coordination_centrality
        if "tool_efficiency" in remaining_metrics:
            metrics["tool_efficiency"] = tier3.tool_selection_accuracy
        return metrics

    def _extract_metrics_with_exclusions(
        self, results: EvaluationResults, remaining_metrics: dict[str, float]
    ) -> dict[str, float]:
        """Extract metric values from tier results, excluding specified metrics.

        Args:
            results: Container with tier results
            remaining_metrics: Dictionary of metrics to include (not excluded)

        Returns:
            Dictionary mapping metric names to values
        """
        metrics: dict[str, float] = {}

        # Extract Tier 1 metrics
        if results.tier1:
            metrics.update(self._extract_tier1_metrics(results.tier1, remaining_metrics))

        # Extract Tier 2 metrics
        if results.tier2 and "planning_rationality" in remaining_metrics:
            metrics["planning_rationality"] = results.tier2.planning_rationality

        # Extract Tier 3 metrics
        if results.tier3:
            metrics.update(self._extract_tier3_metrics(results.tier3, remaining_metrics))

        return metrics

    def evaluate_composite_with_trace(
        self, results: EvaluationResults, trace_data: GraphTraceData
    ) -> CompositeResult:
        """Evaluate composite score with single-agent mode detection and weight redistribution.

        Detects single-agent runs from trace data and redistributes coordination_quality
        weight to remaining metrics. Also handles Tier 2 skip for compound redistribution.

        Args:
            results: Container with tier1, tier2, tier3 evaluation results
            trace_data: Graph trace data for single-agent detection

        Returns:
            CompositeResult with adjusted weights for single-agent mode
        """
        # Detect single-agent mode from trace data
        single_agent_mode = self._detect_single_agent_mode(trace_data)

        # Determine which metrics to exclude
        excluded_metrics = self._determine_excluded_metrics(
            single_agent_mode, tier2_available=results.tier2 is not None
        )

        # If no exclusions, use standard evaluation
        if not excluded_metrics:
            result = self.evaluate_composite(results)
            # Annotate the standard result with the detected mode so callers
            # always see the single-agent flag regardless of code path.
            result.single_agent_mode = single_agent_mode
            return result

        # Build adjusted weights by redistributing to remaining metrics
        # (equal split across whatever metrics survive the exclusion).
        remaining_metrics = {k: v for k, v in self.weights.items() if k not in excluded_metrics}
        weight_per_remaining = (1.0 / len(remaining_metrics)) if remaining_metrics else 0.0
        adjusted_weights = {metric: weight_per_remaining for metric in remaining_metrics}

        # Extract metrics (only those not excluded)
        metrics = self._extract_metrics_with_exclusions(results, remaining_metrics)

        # Validate all required metrics are present
        missing_metrics = set(remaining_metrics.keys()) - set(metrics.keys())
        if missing_metrics:
            raise ValueError(f"Missing required metrics after exclusion: {missing_metrics}")

        score, rec, rec_weight = self._score_and_recommend(metrics, adjusted_weights)

        logger.info(
            f"Composite score with redistributed weights: {score:.3f} "
            f"(excluded: {excluded_metrics})"
        )

        return CompositeResult(
            composite_score=score,
            recommendation=rec,
            recommendation_weight=rec_weight,
            metric_scores=metrics,
            tier1_score=results.tier1.overall_score if results.tier1 else 0.0,
            tier2_score=results.tier2.overall_score if results.tier2 else None,
            tier3_score=results.tier3.overall_score if results.tier3 else 0.0,
            evaluation_complete=results.is_complete(),
            single_agent_mode=single_agent_mode,
            weights_used=adjusted_weights,
            tiers_enabled=sorted(self.settings.get_enabled_tiers()),
        )

    def evaluate_composite_with_optional_tier2(self, results: EvaluationResults) -> CompositeResult:
        """Evaluate composite score with optional Tier 2 (handles missing Tier 2).

        When Tier 2 is None, redistributes weights to Tier 1 and Tier 3.

        Args:
            results: Container with tier1, tier3, and optional tier2 results

        Returns:
            CompositeResult with adjusted weights when Tier 2 is missing
        """
        if results.tier2 is None:
            logger.warning(
                "Tier 2 (LLM-as-Judge) skipped - no valid provider available. "
                "Redistributing weights to Tier 1 + Tier 3."
            )
            # Redistribute Tier 2 metrics (planning_rationality: 0.167) to other metrics
            # Split evenly across remaining 5 metrics
            adjusted_weights = {
                "time_taken": 0.2,  # 0.167 + 0.033
                "task_success": 0.2,  # 0.167 + 0.033
                "coordination_quality": 0.2,  # 0.167 + 0.033
                "tool_efficiency": 0.2,  # 0.167 + 0.033
                "output_similarity": 0.2,  # 0.167 + 0.033
            }

            # Extract metrics from Tier 1 and Tier 3 only
            if not results.tier1 or not results.tier3:
                raise ValueError("Tier 1 and Tier 3 are required when Tier 2 is missing")

            metrics = {
                "time_taken": results.tier1.time_score,
                "task_success": results.tier1.task_success,
                "output_similarity": results.tier1.overall_score,
                "coordination_quality": results.tier3.coordination_centrality,
                "tool_efficiency": results.tier3.tool_selection_accuracy,
            }

            score, rec, rec_weight = self._score_and_recommend(metrics, adjusted_weights)

            return CompositeResult(
                composite_score=score,
                recommendation=rec,
                recommendation_weight=rec_weight,
                metric_scores=metrics,
                tier1_score=results.tier1.overall_score,
                tier2_score=None,  # Tier 2 skipped
                tier3_score=results.tier3.overall_score,
                evaluation_complete=False,  # Not complete without Tier 2
                weights_used=adjusted_weights,
                tiers_enabled=sorted(self.settings.get_enabled_tiers()),
            )
        else:
            # All tiers available, use standard evaluation
            return self.evaluate_composite(results)
Functions
__init__(settings=None)

Initialize composite scorer with configuration.

Parameters:

Parameters:

- `settings` (`JudgeSettings | None`, default `None`): JudgeSettings instance. If `None`, a default `JudgeSettings()` is used.
Source code in src/app/judge/composite_scorer.py
def __init__(
    self,
    settings: "JudgeSettings | None" = None,
):
    """Initialize composite scorer with configuration.

    Args:
        settings: JudgeSettings instance. If None, uses default JudgeSettings().
    """
    if settings is None:
        # Deferred import: avoids a circular dependency at module load time
        from app.config.judge_settings import JudgeSettings

        settings = JudgeSettings()

    self.settings = settings

    # Six composite metrics, each given an equal 0.167 share
    metric_names = (
        "time_taken",
        "task_success",
        "coordination_quality",
        "tool_efficiency",
        "planning_rationality",
        "output_similarity",
    )
    self.weights = {name: 0.167 for name in metric_names}

    # Decision cut-offs come from settings; "reject" is the implicit floor
    self.thresholds = {
        "accept": settings.composite_accept_threshold,
        "weak_accept": settings.composite_weak_accept_threshold,
        "weak_reject": settings.composite_weak_reject_threshold,
        "reject": 0.0,
    }

    # Numeric weight attached to each recommendation category
    self.recommendation_weights = {
        "accept": 1.0,
        "weak_accept": 0.7,
        "weak_reject": -0.7,
        "reject": -1.0,
    }

    logger.info(f"CompositeScorer initialized with JudgeSettings ({len(self.weights)} metrics)")
assess_agent_performance(execution_time, tools_used, delegation_count=0, error_occurred=False, output_length=0)

Assess agent performance with simple rule-based metrics.

Parameters:

Name Type Description Default
execution_time float

Time taken for agent execution in seconds

required
tools_used list[str]

List of tools used during execution

required
delegation_count int

Number of delegations made (for manager agents)

0
error_occurred bool

Whether an error occurred during execution

False
output_length int

Length of output result in characters

0

Returns:

Type Description
AgentMetrics

AgentMetrics with evaluated scores

Source code in src/app/judge/composite_scorer.py
def assess_agent_performance(
    self,
    execution_time: float,
    tools_used: list[str],
    delegation_count: int = 0,
    error_occurred: bool = False,
    output_length: int = 0,
) -> AgentMetrics:
    """Assess agent performance with simple rule-based metrics.

    Args:
        execution_time: Time taken for agent execution in seconds
        tools_used: List of tools used during execution
        delegation_count: Number of delegations made (for manager agents)
        error_occurred: Whether an error occurred during execution
        output_length: Length of output result in characters

    Returns:
        AgentMetrics with evaluated scores
    """
    # Each sub-score is computed by its own rule-based helper
    tool = self._calculate_tool_score(tools_used)
    coherence = self._calculate_coherence_score(error_occurred, output_length, execution_time)
    coordination = self._calculate_coordination_score(delegation_count, output_length)

    assessed = AgentMetrics(
        tool_selection_score=tool,
        plan_coherence_score=coherence,
        coordination_score=coordination,
    )

    logger.debug(
        f"Agent assessment: tool={tool:.3f}, coherence={coherence:.3f}, "
        f"coordination={coordination:.3f}"
    )
    return assessed
calculate_composite_score(results)

Calculate weighted composite score from all evaluation tiers.

Parameters:

Name Type Description Default
results EvaluationResults

Container with tier1, tier2, tier3 evaluation results

required

Returns:

Type Description
float

Composite score (0.0 to 1.0)

Raises:

Type Description
ValueError

If required tier results are missing

Source code in src/app/judge/composite_scorer.py
def calculate_composite_score(self, results: EvaluationResults) -> float:
    """Calculate weighted composite score from all evaluation tiers.

    Args:
        results: Container with tier1, tier2, tier3 evaluation results

    Returns:
        Composite score (0.0 to 1.0)

    Raises:
        ValueError: If required tier results are missing
    """
    metric_values = self.extract_metric_values(results)

    # Accumulate the weighted sum defined by the configured metric weights
    weighted_total = 0.0
    for name, weight in self.weights.items():
        weighted_total += metric_values[name] * weight

    # Clamp into the valid [0, 1] range
    weighted_total = min(1.0, max(0.0, weighted_total))

    logger.info(f"Composite score calculated: {weighted_total:.3f}")
    contributions = [(m, f"{metric_values[m] * self.weights[m]:.3f}") for m in self.weights]
    logger.debug(f"Metric contributions: {contributions}")

    return weighted_total
evaluate_composite(results)

Complete composite evaluation with score and recommendation.

Parameters:

Name Type Description Default
results EvaluationResults

Container with tier1, tier2, tier3 evaluation results

required

Returns:

Type Description
CompositeResult

CompositeResult with score, recommendation, and detailed metrics

Raises:

Type Description
ValueError

If required tier results are missing

Source code in src/app/judge/composite_scorer.py
def evaluate_composite(self, results: EvaluationResults) -> CompositeResult:
    """Complete composite evaluation with score and recommendation.

    Args:
        results: Container with tier1, tier2, tier3 evaluation results

    Returns:
        CompositeResult with score, recommendation, and detailed metrics

    Raises:
        ValueError: If required tier results are missing
    """
    try:
        score = self.calculate_composite_score(results)

        # Translate score into a recommendation and its numeric weight
        verdict = self.map_to_recommendation(score)
        verdict_weight = self.get_recommendation_weight(verdict)

        # Per-metric values for detailed analysis
        metric_values = self.extract_metric_values(results)

        # calculate_composite_score succeeding implies every tier is present
        assert results.tier1 is not None
        assert results.tier2 is not None
        assert results.tier3 is not None

        outcome = CompositeResult(
            composite_score=score,
            recommendation=verdict,
            recommendation_weight=verdict_weight,
            metric_scores=metric_values,
            tier1_score=results.tier1.overall_score,
            tier2_score=results.tier2.overall_score,
            tier3_score=results.tier3.overall_score,
            evaluation_complete=results.is_complete(),
            # Copy the weights so the result records how the score was composed
            weights_used=self.weights.copy(),
            tiers_enabled=sorted(self.settings.get_enabled_tiers()),
        )

        logger.info(f"Composite evaluation complete: {score:.3f} → {verdict}")
        return outcome

    except Exception as e:
        logger.error(f"Composite evaluation failed: {e}")
        raise
evaluate_composite_with_optional_tier2(results)

Evaluate composite score with optional Tier 2 (handles missing Tier 2).

When Tier 2 is None, redistributes weights to Tier 1 and Tier 3.

Parameters:

Name Type Description Default
results EvaluationResults

Container with tier1, tier3, and optional tier2 results

required

Returns:

Type Description
CompositeResult

CompositeResult with adjusted weights when Tier 2 is missing

Source code in src/app/judge/composite_scorer.py
def evaluate_composite_with_optional_tier2(self, results: EvaluationResults) -> CompositeResult:
    """Evaluate composite score with optional Tier 2 (handles missing Tier 2).

    When Tier 2 is None, redistributes weights to Tier 1 and Tier 3.

    Args:
        results: Container with tier1, tier3, and optional tier2 results

    Returns:
        CompositeResult with adjusted weights when Tier 2 is missing
    """
    if results.tier2 is not None:
        # All tiers present — standard evaluation path
        return self.evaluate_composite(results)

    logger.warning(
        "Tier 2 (LLM-as-Judge) skipped - no valid provider available. "
        "Redistributing weights to Tier 1 + Tier 3."
    )

    if not results.tier1 or not results.tier3:
        raise ValueError("Tier 1 and Tier 3 are required when Tier 2 is missing")

    # planning_rationality's 0.167 share is split evenly (0.033 each)
    # across the five remaining metrics, giving 0.2 apiece
    redistributed = dict.fromkeys(
        (
            "time_taken",
            "task_success",
            "coordination_quality",
            "tool_efficiency",
            "output_similarity",
        ),
        0.2,
    )

    tier1, tier3 = results.tier1, results.tier3
    observed = {
        "time_taken": tier1.time_score,
        "task_success": tier1.task_success,
        "output_similarity": tier1.overall_score,
        "coordination_quality": tier3.coordination_centrality,
        "tool_efficiency": tier3.tool_selection_accuracy,
    }

    score, rec, rec_weight = self._score_and_recommend(observed, redistributed)

    return CompositeResult(
        composite_score=score,
        recommendation=rec,
        recommendation_weight=rec_weight,
        metric_scores=observed,
        tier1_score=tier1.overall_score,
        tier2_score=None,  # Tier 2 was skipped
        tier3_score=tier3.overall_score,
        evaluation_complete=False,  # cannot be complete without Tier 2
        weights_used=redistributed,
        tiers_enabled=sorted(self.settings.get_enabled_tiers()),
    )
evaluate_composite_with_trace(results, trace_data)

Evaluate composite score with single-agent mode detection and weight redistribution.

Detects single-agent runs from trace data and redistributes coordination_quality weight to remaining metrics. Also handles Tier 2 skip for compound redistribution.

Parameters:

Name Type Description Default
results EvaluationResults

Container with tier1, tier2, tier3 evaluation results

required
trace_data GraphTraceData

Graph trace data for single-agent detection

required

Returns:

Type Description
CompositeResult

CompositeResult with adjusted weights for single-agent mode

Source code in src/app/judge/composite_scorer.py
def evaluate_composite_with_trace(
    self, results: EvaluationResults, trace_data: GraphTraceData
) -> CompositeResult:
    """Evaluate composite score with single-agent mode detection and weight redistribution.

    Detects single-agent runs from trace data and redistributes coordination_quality
    weight to remaining metrics. Also handles Tier 2 skip for compound redistribution.

    Args:
        results: Container with tier1, tier2, tier3 evaluation results
        trace_data: Graph trace data for single-agent detection

    Returns:
        CompositeResult with adjusted weights for single-agent mode
    """
    is_single_agent = self._detect_single_agent_mode(trace_data)

    # Work out which metrics must be dropped for this run
    excluded = self._determine_excluded_metrics(
        is_single_agent, tier2_available=results.tier2 is not None
    )

    if not excluded:
        # Nothing to redistribute — standard scoring, annotated with the mode flag
        standard = self.evaluate_composite(results)
        standard.single_agent_mode = is_single_agent
        return standard

    # Spread the total weight uniformly over the metrics that remain
    kept = {name: w for name, w in self.weights.items() if name not in excluded}
    share = (1.0 / len(kept)) if kept else 0.0
    new_weights = {name: share for name in kept}

    metric_values = self._extract_metrics_with_exclusions(results, kept)

    # Every kept metric must have a value, or scoring would be silently skewed
    missing = set(kept.keys()) - set(metric_values.keys())
    if missing:
        raise ValueError(f"Missing required metrics after exclusion: {missing}")

    score, rec, rec_weight = self._score_and_recommend(metric_values, new_weights)

    logger.info(
        f"Composite score with redistributed weights: {score:.3f} "
        f"(excluded: {excluded})"
    )

    return CompositeResult(
        composite_score=score,
        recommendation=rec,
        recommendation_weight=rec_weight,
        metric_scores=metric_values,
        tier1_score=results.tier1.overall_score if results.tier1 else 0.0,
        tier2_score=results.tier2.overall_score if results.tier2 else None,
        tier3_score=results.tier3.overall_score if results.tier3 else 0.0,
        evaluation_complete=results.is_complete(),
        single_agent_mode=is_single_agent,
        weights_used=new_weights,
        tiers_enabled=sorted(self.settings.get_enabled_tiers()),
    )
extract_metric_values(results)

Extract the six composite metrics from tier results.

Parameters:

Name Type Description Default
results EvaluationResults

Container with tier1, tier2, tier3 evaluation results

required

Returns:

Type Description
dict[str, float]

Dictionary with normalized metric values (0.0 to 1.0)

Raises:

Type Description
ValueError

If required tier results are missing

Source code in src/app/judge/composite_scorer.py
def extract_metric_values(self, results: EvaluationResults) -> dict[str, float]:
    """Extract the six composite metrics from tier results.

    Args:
        results: Container with tier1, tier2, tier3 evaluation results

    Returns:
        Dictionary with normalized metric values (0.0 to 1.0)

    Raises:
        ValueError: If required tier results are missing
    """
    if not results.is_complete():
        missing_tiers = [
            name
            for name, tier in (
                ("tier1", results.tier1),
                ("tier2", results.tier2),
                ("tier3", results.tier3),
            )
            if not tier
        ]
        raise ValueError(f"Missing required tier results: {missing_tiers}")

    # is_complete() guarantees all tiers are present; asserts narrow the types
    assert results.tier1 is not None, "tier1 should not be None after check"
    assert results.tier2 is not None, "tier2 should not be None after check"
    assert results.tier3 is not None, "tier3 should not be None after check"

    # Reason: Task 4.1 requires these exact 6 metrics with specific source mappings
    # Each metric maps to specific fields from evaluation tier results
    metrics = {
        # From Tier 1: Traditional metrics + execution performance
        # Reason: time_score is already normalized [0,1] where higher = better (faster)
        "time_taken": results.tier1.time_score,
        "task_success": results.tier1.task_success,  # binary completion flag
        "output_similarity": results.tier1.overall_score,  # weighted similarity
        # From Tier 2: LLM-as-Judge quality assessment - use specific metric
        "planning_rationality": results.tier2.planning_rationality,
        # From Tier 3: Graph-based coordination analysis
        "coordination_quality": results.tier3.coordination_centrality,  # centrality
        "tool_efficiency": results.tier3.tool_selection_accuracy,  # tool accuracy
    }

    # Out-of-range values are logged and clamped rather than rejected
    for name in metrics:
        value = metrics[name]
        if 0.0 <= value <= 1.0:
            continue
        logger.warning(f"Metric {name} = {value:.3f} outside valid range [0.0, 1.0]")
        metrics[name] = max(0.0, min(1.0, value))

    logger.debug(f"Extracted metrics: {[(k, f'{v:.3f}') for k, v in metrics.items()]}")
    return metrics
get_recommendation_weight(recommendation)

Get numerical weight for recommendation category.

Parameters:

Name Type Description Default
recommendation str

Recommendation category

required

Returns:

Type Description
float

Numerical weight (-1.0 to 1.0)

Source code in src/app/judge/composite_scorer.py
def get_recommendation_weight(self, recommendation: str) -> float:
    """Translate a recommendation category into its numerical weight.

    Args:
        recommendation: Recommendation category

    Returns:
        Numerical weight (-1.0 to 1.0); 0.0 for unknown categories
    """
    # Unknown categories fall back to a neutral 0.0
    weight = self.recommendation_weights.get(recommendation)
    return 0.0 if weight is None else weight
get_scoring_summary()

Get summary of scoring configuration for validation.

Returns:

Type Description
dict[str, Any]

Dictionary with configuration summary

Source code in src/app/judge/composite_scorer.py
def get_scoring_summary(self) -> dict[str, Any]:
    """Summarize the scoring configuration for validation and debugging.

    Returns:
        Dictionary with configuration summary
    """
    summary: dict[str, Any] = {
        "metrics_count": len(self.weights),
        "total_weight": sum(self.weights.values()),
    }
    # Copies prevent callers from mutating the scorer's internal state
    summary["weights"] = dict(self.weights)
    summary["thresholds"] = dict(self.thresholds)
    summary["recommendation_weights"] = dict(self.recommendation_weights)
    return summary
map_to_recommendation(composite_score)

Map composite score to recommendation category.

Parameters:

Name Type Description Default
composite_score float

Composite score (0.0 to 1.0)

required

Returns:

Type Description
str

Recommendation category: "accept", "weak_accept", "weak_reject", or "reject"

Source code in src/app/judge/composite_scorer.py
def map_to_recommendation(self, composite_score: float) -> str:
    """Map composite score to recommendation category.

    Args:
        composite_score: Composite score (0.0 to 1.0)

    Returns:
        Recommendation category: "accept", "weak_accept", "weak_reject", or "reject"
    """
    # Walk categories from strongest to weakest; the first threshold met wins.
    # Defaults mirror the original hard-coded fallbacks when a key is absent.
    for category, default_cutoff in (
        ("accept", 0.8),
        ("weak_accept", 0.6),
        ("weak_reject", 0.4),
    ):
        if composite_score >= self.thresholds.get(category, default_cutoff):
            return category
    return "reject"

app.judge.evaluation_pipeline

Streamlined three-tier evaluation pipeline orchestrator.

Coordinates Traditional Metrics (Tier 1), LLM-as-Judge (Tier 2), and Graph Analysis (Tier 3) into unified evaluation workflow with graceful degradation. Uses modular components for configuration and monitoring.

Classes

EvaluationPipeline

Streamlined evaluation pipeline orchestrator for three-tier assessment.

Coordinates execution of Traditional Metrics → LLM-as-Judge → Graph Analysis with configurable tier enabling and graceful degradation. Uses modular components for configuration management and performance monitoring.

Source code in src/app/judge/evaluation_pipeline.py
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
class EvaluationPipeline:
    """
    Streamlined evaluation pipeline orchestrator for three-tier assessment.

    Coordinates execution of Traditional Metrics → LLM-as-Judge → Graph Analysis
    with configurable tier enabling and graceful degradation. Uses modular
    components for configuration management and performance monitoring.
    """

    def __init__(
        self,
        settings: JudgeSettings | None = None,
        chat_provider: str | None = None,
        chat_model: str | None = None,
    ):
        """Initialize evaluation pipeline with configuration.

        Args:
            settings: JudgeSettings instance. If None, uses default JudgeSettings().
            chat_provider: Active chat provider from agent system. Passed to LLMJudgeEngine
                          for tier2_provider=auto mode.
            chat_model: Active chat model from agent system. Forwarded to LLMJudgeEngine
                       for model inheritance in auto mode.

        Raises:
            ValueError: If configuration is invalid
        """
        # Fall back to default settings when the caller supplies none
        self.settings = JudgeSettings() if settings is None else settings
        self.chat_provider = chat_provider
        self.chat_model = chat_model
        self.performance_monitor = PerformanceMonitor(self.settings.get_performance_targets())

        # All tier engines share the same settings object
        self.traditional_engine = TraditionalMetricsEngine()
        self.llm_engine = LLMJudgeEngine(
            self.settings, chat_provider=chat_provider, chat_model=chat_model
        )
        self.graph_engine = GraphAnalysisEngine(self.settings)
        self.composite_scorer = CompositeScorer(settings=self.settings)

        logger.info(
            f"EvaluationPipeline initialized with JudgeSettings: "
            f"tiers={sorted(self.settings.get_enabled_tiers())}, "
            f"fallback_strategy={self.settings.fallback_strategy}, chat_provider={chat_provider}"
        )

    @property
    def enabled_tiers(self) -> set[int]:
        """Set of tier numbers currently enabled (backward-compatibility accessor).

        Returns:
            Set of enabled tier numbers
        """
        # Delegates straight to the settings object
        tiers = self.settings.get_enabled_tiers()
        return tiers

    @property
    def performance_targets(self) -> dict[str, float]:
        """Per-tier performance targets (backward-compatibility accessor).

        Returns:
            Dictionary of performance targets
        """
        # Delegates straight to the settings object
        targets = self.settings.get_performance_targets()
        return targets

    @property
    def fallback_strategy(self) -> str:
        """Configured fallback strategy name (backward-compatibility accessor).

        Returns:
            Fallback strategy name
        """
        # Delegates straight to the settings object
        strategy = self.settings.fallback_strategy
        return strategy

    @property
    def config_path(self) -> Path | None:
        """Configuration file path (backward-compatibility accessor).

        Returns:
            Always None — the pipeline is configured purely via settings objects,
            not a file path.
        """
        return None

    @property
    def execution_stats(self) -> dict[str, Any]:
        """Execution statistics (backward-compatibility accessor).

        Returns:
            Dictionary with execution statistics from the performance monitor
        """
        # Delegates to the performance monitor component
        stats = self.performance_monitor.get_execution_stats()
        return stats

    def _is_tier_enabled(self, tier: int) -> bool:
        """Internal helper: ask settings whether a tier is enabled.

        Args:
            tier: Tier number to check

        Returns:
            True if tier is enabled
        """
        enabled = self.settings.is_tier_enabled(tier)
        return enabled

    def _skip_tier1(self, reason: str) -> tuple[None, float]:
        """Log a Tier 1 skip, record it in monitoring, and return the skip result.

        Args:
            reason: Human-readable reason for skipping, included in log.

        Returns:
            Tuple of (None, 0.0) indicating tier was skipped.
        """
        logger.info(f"Tier 1 skipped: {reason}")
        # Recording a zero-duration execution keeps monitoring totals consistent
        self.performance_monitor.record_tier_execution(1, 0.0)
        return (None, 0.0)

    async def _execute_tier1(
        self, paper: str, review: str, reference_reviews: list[str] | None = None
    ) -> tuple[Tier1Result | None, float]:
        """Execute Traditional Metrics evaluation (Tier 1).

        Args:
            paper: Paper content text
            review: Generated review text
            reference_reviews: Optional list of ground truth reviews for similarity

        Returns:
            Tuple of (Tier1Result or None, execution_time)
        """
        if not self._is_tier_enabled(1):
            logger.debug("Tier 1 disabled, skipping traditional metrics")
            return None, 0.0

        # Reason: Empty review cannot produce meaningful similarity scores —
        # empty-vs-empty returns 1.0 (false perfect), non-empty-vs-empty returns 0.0 (noise).
        if not review.strip():
            return self._skip_tier1("review text is empty")

        # Reason: No usable references means T1 compares against [""] fallback,
        # producing all-zero similarities regardless of review quality — no signal.
        usable_refs = [r for r in (reference_reviews or []) if r.strip()]
        if not usable_refs:
            return self._skip_tier1("no usable reference reviews available")

        timeout = self.performance_targets.get("tier1_max_seconds", 1.0)
        start_time = time.time()

        try:
            logger.info("Executing Tier 1: Traditional Metrics")
            start_evaluation = time.time()

            # Reason: asyncio.to_thread keeps the CPU-bound metrics work off the
            # event loop; wait_for schedules the awaitable itself, so the previous
            # explicit asyncio.create_task wrapper was redundant and is removed.
            result = await asyncio.wait_for(
                asyncio.to_thread(
                    self.traditional_engine.evaluate_traditional_metrics,
                    review,  # agent_output
                    usable_refs,  # reference_texts
                    start_evaluation,  # start_time
                    time.time(),  # end_time (will be updated in method)
                    self.settings,  # settings
                ),
                timeout=timeout,
            )

            execution_time = time.time() - start_time
            self.performance_monitor.record_tier_execution(1, execution_time)
            logger.info(f"Tier 1 completed in {execution_time:.2f}s")
            return result, execution_time

        except TimeoutError:
            execution_time = time.time() - start_time
            error_msg = f"Tier 1 timeout after {timeout}s (traditional metrics evaluation)"
            logger.error(f"{error_msg}. Consider increasing tier1_max_seconds in config.")
            self.performance_monitor.record_tier_failure(1, "timeout", execution_time, error_msg)
            return None, execution_time
        except Exception as e:
            execution_time = time.time() - start_time
            error_msg = f"Tier 1 failed with {type(e).__name__}: {e}"
            logger.error(f"{error_msg}. Paper length: {len(paper)}, Review length: {len(review)}")
            self.performance_monitor.record_tier_failure(1, "error", execution_time, str(e))
            return None, execution_time

    async def _execute_tier2(
        self, paper: str, review: str, execution_trace: dict[str, Any] | None = None
    ) -> tuple[Tier2Result | None, float]:
        """Execute LLM-as-Judge evaluation (Tier 2).

        Args:
            paper: Paper content text
            review: Generated review text
            execution_trace: Optional execution trace data

        Returns:
            Tuple of (Tier2Result or None, execution_time)
        """
        if not self._is_tier_enabled(2):
            logger.debug("Tier 2 disabled, skipping LLM judge")
            return None, 0.0

        # Check if Tier 2 providers are available (STORY-001)
        if not self.llm_engine.tier2_available:
            logger.warning("Tier 2 skipped: no valid LLM providers available")
            return None, 0.0

        timeout = self.performance_targets.get("tier2_max_seconds", 10.0)
        start_time = time.time()

        try:
            logger.info("Executing Tier 2: LLM-as-Judge")
            result = await asyncio.wait_for(
                self.llm_engine.evaluate_comprehensive(paper, review, execution_trace or {}),
                timeout=timeout,
            )

            execution_time = time.time() - start_time
            self.performance_monitor.record_tier_execution(2, execution_time)
            logger.info(f"Tier 2 completed in {execution_time:.2f}s")
            return result, execution_time

        except TimeoutError:
            execution_time = time.time() - start_time
            error_msg = f"Tier 2 timeout after {timeout}s (LLM-as-Judge evaluation)"
            logger.error(
                f"{error_msg}. Consider increasing tier2_max_seconds or check "
                "LLM service availability."
            )
            self.performance_monitor.record_tier_failure(2, "timeout", execution_time, error_msg)
            return None, execution_time
        except Exception as e:
            execution_time = time.time() - start_time
            error_type = type(e).__name__
            error_msg = f"Tier 2 failed with {error_type}: {e}"
            logger.error(f"{error_msg}. Paper length: {len(paper)}, Review length: {len(review)}")
            # Reason: lowercase the message once instead of recomputing it per check
            error_text = str(e).lower()
            # Add specific guidance based on error type
            if "rate limit" in error_text:
                logger.error("Rate limit exceeded - consider adjusting request frequency")
            elif "authentication" in error_text:
                logger.error("Authentication failed - check API keys and configuration")
            elif "connection" in error_text:
                logger.error(
                    "Connection failed - check network connectivity and service availability"
                )
            self.performance_monitor.record_tier_failure(2, "error", execution_time, str(e))
            return None, execution_time

    def _create_trace_data(self, execution_trace: dict[str, Any] | None) -> GraphTraceData:
        """Build a GraphTraceData object from a raw execution-trace dict.

        Args:
            execution_trace: Raw trace dict from the agent run, or None.

        Returns:
            GraphTraceData built via from_trace_dict with a fixed fallback id.
        """
        fallback = "pipeline_exec"
        return GraphTraceData.from_trace_dict(execution_trace, fallback_id=fallback)

    def _should_apply_fallback(self, results: EvaluationResults) -> bool:
        """Check if fallback strategy should be applied.

        Args:
            results: Partial evaluation results

        Returns:
            True if fallback should be applied, False otherwise
        """
        # Don't apply fallback if Tier 2 is missing due to provider unavailability
        tier2_provider_unavailable = results.tier2 is None and not self.llm_engine.tier2_available
        return not tier2_provider_unavailable

    def _generate_composite_score(
        self, results: EvaluationResults, trace_data: GraphTraceData | None = None
    ) -> CompositeResult:
        """Generate composite score using appropriate scorer method.

        Args:
            results: Evaluation results from all tiers
            trace_data: Optional trace data for single-agent detection and weight
                redistribution. When provided with complete results, enables
                evaluate_composite_with_trace routing.

        Returns:
            CompositeResult with appropriate weight handling

        Raises:
            ValueError: If insufficient tier results for scoring
        """
        if trace_data is not None and results.is_complete():
            return self.composite_scorer.evaluate_composite_with_trace(results, trace_data)
        elif results.tier1 is None:
            return self._composite_without_tier1(results)
        elif results.tier2 is None:
            return self._composite_without_tier2(results)
        elif results.is_complete():
            # All tiers available, no trace data
            return self.composite_scorer.evaluate_composite(results)
        else:
            raise ValueError("Cannot generate composite score: insufficient tier results")

    def _composite_without_tier1(self, results: EvaluationResults) -> CompositeResult:
        """Handle composite scoring when Tier 1 was skipped (empty review or no references).

        Routes to T2+T3 when available, T2-only (capped) when T3 missing,
        or returns degraded 0.0 result when all tiers are unavailable.

        Args:
            results: Evaluation results (tier1 is None).

        Returns:
            CompositeResult with T2+T3 weight redistribution or degraded scoring.
        """
        if results.tier2 and results.tier3:
            # T2+T3 both present: redistribute weights 50/50 and average.
            score = (results.tier2.overall_score + results.tier3.overall_score) / 2
            recommendation = self.composite_scorer.map_to_recommendation(score)
            return CompositeResult(
                composite_score=score,
                recommendation=recommendation,
                recommendation_weight=self.composite_scorer.get_recommendation_weight(
                    recommendation
                ),
                metric_scores={
                    "planning_rationality": results.tier2.planning_rationality,
                    "coordination_quality": results.tier3.coordination_centrality,
                    "tool_efficiency": results.tier3.tool_selection_accuracy,
                },
                tier1_score=0.0,
                tier2_score=results.tier2.overall_score,
                tier3_score=results.tier3.overall_score,
                evaluation_complete=False,
                weights_used={"tier1": 0.0, "tier2": 0.5, "tier3": 0.5},
            )
        if results.tier2:
            # T2 only: cap at the weak-reject threshold so an incomplete
            # evaluation cannot yield a misleadingly high score.
            penalized = min(
                results.tier2.overall_score, self.settings.composite_weak_reject_threshold
            )
            return CompositeResult(
                composite_score=penalized,
                recommendation="weak_reject",
                recommendation_weight=self.composite_scorer.get_recommendation_weight(
                    "weak_reject"
                ),
                metric_scores={"planning_rationality": results.tier2.planning_rationality},
                tier1_score=0.0,
                tier2_score=results.tier2.overall_score,
                tier3_score=0.0,
                evaluation_complete=False,
                weights_used={"tier1": 0.0, "tier2": 1.0, "tier3": 0.0},
            )
        # All tiers skipped — return empty evaluation with score 0.0
        logger.warning(
            "All tiers skipped — no evaluation data available. "
            "Check that review text and reference reviews are non-empty."
        )
        return CompositeResult(
            composite_score=0.0,
            recommendation="reject",
            recommendation_weight=self.composite_scorer.get_recommendation_weight("reject"),
            metric_scores={},
            tier1_score=0.0,
            # NOTE(review): tier2_score is None here while tier1/tier3 use 0.0 —
            # presumably None marks "never ran"; confirm consumers accept both.
            tier2_score=None,
            tier3_score=0.0,
            evaluation_complete=False,
            weights_used={"tier1": 0.0, "tier2": 0.0, "tier3": 0.0},
        )

    def _composite_without_tier2(self, results: EvaluationResults) -> CompositeResult:
        """Handle composite scoring when Tier 2 was skipped.

        Args:
            results: Evaluation results (tier2 is None)

        Returns:
            CompositeResult with weight redistribution or degraded scoring

        Raises:
            ValueError: If neither Tier 1 nor Tier 3 results available
        """
        if results.tier1 and results.tier3:
            return self.composite_scorer.evaluate_composite_with_optional_tier2(results)
        if results.tier1:
            # Reason: Tier 1 only — cap at weak_reject threshold to prevent
            # misleading high scores from incomplete evaluations.
            penalized_score = min(
                results.tier1.overall_score, self.settings.composite_weak_reject_threshold
            )
            logger.warning(
                "Composite score degraded: only Tier 1 available "
                "(Tier 2 skipped, Tier 3 unavailable). "
                f"Score capped at {self.settings.composite_weak_reject_threshold} "
                f"(was {results.tier1.overall_score:.3f})."
            )
            return CompositeResult(
                composite_score=penalized_score,
                recommendation="weak_reject",
                recommendation_weight=-0.25,
                metric_scores={
                    "cosine_score": results.tier1.cosine_score,
                    "jaccard_score": results.tier1.jaccard_score,
                    "semantic_score": results.tier1.semantic_score,
                },
                tier1_score=results.tier1.overall_score,
                tier2_score=None,
                tier3_score=0.0,
                evaluation_complete=False,
                weights_used={"tier1": 1.0, "tier2": 0.0, "tier3": 0.0},
            )
        raise ValueError(
            "Cannot generate composite score: Tier 1 and Tier 3 required when Tier 2 is skipped"
        )

    def _handle_tier3_error(
        self, e: Exception, execution_trace: dict[str, Any] | None, start_time: float
    ) -> tuple[None, float]:
        """Log a Tier 3 failure with targeted guidance and record it.

        Args:
            e: The exception raised during graph analysis.
            execution_trace: Raw trace dict, used only to report its size.
            start_time: time.time() value captured at tier start.

        Returns:
            (None, elapsed_seconds) so the caller treats the tier as skipped.
        """
        elapsed = time.time() - start_time
        trace_chars = len(str(execution_trace)) if execution_trace else 0
        message = f"Tier 3 failed with {type(e).__name__}: {e}"
        logger.error(f"{message}. Trace data size: {trace_chars} chars")

        # Offer targeted remediation hints for the common failure modes.
        lowered = str(e).lower()
        if "memory" in lowered:
            logger.error("Memory error - consider reducing trace data complexity")
        elif "networkx" in lowered:
            logger.error("Graph construction error - check trace data format")

        self.performance_monitor.record_tier_failure(3, "error", elapsed, str(e))
        return None, elapsed

    async def _execute_tier3(
        self, execution_trace: dict[str, Any] | None = None
    ) -> tuple[Tier3Result | None, float]:
        """Execute Graph Analysis evaluation (Tier 3).

        Args:
            execution_trace: Optional execution trace data for graph construction

        Returns:
            Tuple of (Tier3Result or None, execution_time). None is returned
            when the tier is disabled, the trace has no graph-relevant events,
            or the evaluation times out or errors.
        """
        if not self._is_tier_enabled(3):
            logger.debug("Tier 3 disabled, skipping graph analysis")
            return None, 0.0

        performance_targets = self.performance_targets
        timeout = performance_targets.get("tier3_max_seconds", 15.0)
        start_time = time.time()

        try:
            trace_data = self._create_trace_data(execution_trace)

            if not trace_data.tool_calls and not trace_data.agent_interactions:
                logger.info(
                    "Tier 3 skipped: trace data has no tool_calls or agent_interactions "
                    "(expected for CC solo mode — single-agent stream has no delegation events)"
                )
                self.performance_monitor.record_tier_execution(3, 0.0)
                return None, 0.0

            logger.info("Executing Tier 3: Graph Analysis")

            # Run the CPU-bound graph analysis off the event loop.
            # Fix: asyncio.wait_for already wraps its awaitable in a task, so
            # the previous asyncio.create_task(...) indirection was redundant.
            result = await asyncio.wait_for(
                asyncio.to_thread(self.graph_engine.evaluate_graph_metrics, trace_data),
                timeout=timeout,
            )

            execution_time = time.time() - start_time
            self.performance_monitor.record_tier_execution(3, execution_time)
            logger.info(f"Tier 3 completed in {execution_time:.2f}s")
            return result, execution_time

        except TimeoutError:
            execution_time = time.time() - start_time
            error_msg = f"Tier 3 timeout after {timeout}s (Graph analysis evaluation)"
            logger.error(
                f"{error_msg}. Consider increasing tier3_max_seconds or simplifying trace data."
            )
            self.performance_monitor.record_tier_failure(3, "timeout", execution_time, error_msg)
            return None, execution_time
        except Exception as e:
            return self._handle_tier3_error(e, execution_trace, start_time)

    def _apply_fallback_strategy(self, results: EvaluationResults) -> EvaluationResults:
        """Fill in neutral results for missing tiers per the fallback strategy.

        Only the "tier1_only" strategy is implemented: when Tier 1 succeeded,
        any missing Tier 2/3 result is replaced with a neutral 0.5 placeholder
        so composite scoring can still proceed.

        Args:
            results: Partial evaluation results.

        Returns:
            The same EvaluationResults, mutated with fallback results as needed.
        """
        strategy = self.fallback_strategy

        # Guard: fallback only applies for tier1_only with a Tier 1 result.
        if strategy != "tier1_only" or not results.tier1:
            if not results.tier1:
                logger.warning(
                    f"Cannot apply fallback strategy '{strategy}' - Tier 1 results unavailable"
                )
            return results

        logger.info(
            "Applying tier1_only fallback strategy - creating fallback "
            "results for missing tiers"
        )

        applied = False

        if not results.tier2:
            logger.debug("Creating fallback Tier 2 result")
            results.tier2 = Tier2Result(
                technical_accuracy=0.5,
                constructiveness=0.5,
                planning_rationality=0.5,
                overall_score=0.5,
                model_used="fallback",
                api_cost=0.0,
                fallback_used=True,
            )
            applied = True

        if not results.tier3:
            logger.debug("Creating fallback Tier 3 result")
            results.tier3 = Tier3Result(
                path_convergence=0.5,
                tool_selection_accuracy=0.5,
                coordination_centrality=0.5,
                task_distribution_balance=0.5,
                overall_score=0.5,
                graph_complexity=1,
            )
            applied = True

        if applied:
            self.performance_monitor.record_fallback_usage(True)
            logger.info(f"Fallback strategy '{strategy}' applied successfully.")

        return results

    def _log_metric_comparison(
        self, results: EvaluationResults, composite_result: CompositeResult
    ) -> None:
        """Log a side-by-side summary of text (Tier 1) and graph (Tier 3) metrics.

        Args:
            results: EvaluationResults carrying per-tier detail.
            composite_result: Final composite scoring information.
        """
        sep = "=" * 60
        logger.info(sep)
        logger.info("Evaluation Metrics Comparison Summary")
        logger.info(sep)

        # Overall tier scores side by side.
        logger.info(f"Tier 1 (Text Metrics) overall score: {composite_result.tier1_score:.3f}")
        logger.info(f"Tier 3 (Graph Analysis) overall score: {composite_result.tier3_score:.3f}")
        logger.info("")

        # Individual text metrics from Tier 1, when it ran.
        if results.tier1:
            logger.info("Text Metrics (Tier 1):")
            for label, value in (
                ("cosine_score", results.tier1.cosine_score),
                ("jaccard_score", results.tier1.jaccard_score),
                ("semantic_score", results.tier1.semantic_score),
            ):
                logger.info(f"  {label}: {value:.3f}")
            logger.info("")

        # Individual graph metrics from Tier 3, when it ran.
        if results.tier3:
            logger.info("Graph Metrics (Tier 3):")
            for label, value in (
                ("path_convergence", results.tier3.path_convergence),
                ("tool_selection_accuracy", results.tier3.tool_selection_accuracy),
                ("coordination_centrality", results.tier3.coordination_centrality),
                ("task_distribution_balance", results.tier3.task_distribution_balance),
            ):
                logger.info(f"  {label}: {value:.3f}")
            logger.info("")

        # Composite score plus the weights that produced it.
        logger.info("Composite Score Summary:")
        logger.info(f"  Final composite score: {composite_result.composite_score:.3f}")
        logger.info(f"  Recommendation: {composite_result.recommendation}")

        if hasattr(composite_result, "weights_used") and composite_result.weights_used:
            logger.info("  Metric weights used:")
            for metric, weight in composite_result.weights_used.items():
                logger.info(f"    {metric}: {weight:.3f}")

        logger.info(sep)

    async def evaluate_comprehensive(
        self,
        paper: str,
        review: str,
        execution_trace: GraphTraceData | dict[str, Any] | None = None,
        reference_reviews: list[str] | None = None,
    ) -> CompositeResult:
        """Execute comprehensive three-tier evaluation pipeline.

        Tiers run sequentially; a tier that fails or is skipped yields a None
        result (handled inside the tier methods) rather than aborting the
        pipeline, and composite scoring adapts to whichever tiers succeeded.

        Args:
            paper: Paper content text for evaluation
            review: Generated review text to assess
            execution_trace: Optional execution trace (GraphTraceData or dict) for graph analysis
            reference_reviews: Optional list of ground truth reviews for similarity

        Returns:
            CompositeResult with scores from all applicable tiers

        Raises:
            ValueError: If critical evaluation components fail
        """
        # Retain GraphTraceData for composite scoring, convert to dict for tier execution
        trace_obj: GraphTraceData | None = None
        trace_dict: dict[str, Any] | None = None
        if execution_trace is not None:
            if isinstance(execution_trace, GraphTraceData):
                trace_obj = execution_trace
                trace_dict = execution_trace.model_dump()
            else:
                trace_dict = execution_trace

        # Execute comprehensive evaluation pipeline
        pipeline_start = time.time()
        logger.info("Starting comprehensive three-tier evaluation pipeline")

        # Reset execution stats for new evaluation
        self.performance_monitor.reset_stats()

        try:
            # Execute all enabled tiers; per-tier timings are discarded here
            # because the performance monitor records them inside each method
            tier1_result, _ = await self._execute_tier1(paper, review, reference_reviews)
            tier2_result, _ = await self._execute_tier2(paper, review, trace_dict)
            tier3_result, _ = await self._execute_tier3(trace_dict)

            # Execution times are already tracked by performance_monitor in tier methods

            # Assemble results
            results = EvaluationResults(
                tier1=tier1_result,
                tier2=tier2_result,
                tier3=tier3_result,
            )

            # Apply fallback strategy if needed (but NOT for Tier 2 when provider unavailable)
            if not results.is_complete() and self._should_apply_fallback(results):
                results = self._apply_fallback_strategy(results)

            # Generate composite score with appropriate weight handling.
            # trace_obj is only non-None when the caller passed GraphTraceData.
            composite_result = self._generate_composite_score(results, trace_data=trace_obj)

            # Finalize performance monitoring
            total_time = time.time() - pipeline_start
            self.performance_monitor.finalize_execution(total_time)

            # Get execution statistics and performance summary
            execution_stats = self.performance_monitor.get_execution_stats()
            performance_summary = self.performance_monitor.get_performance_summary()

            logger.info(
                f"Pipeline completed in {total_time:.2f}s, "
                f"tiers executed: {execution_stats['tiers_executed']}, "
                f"composite score: {composite_result.composite_score:.3f}, "
                f"performance: {performance_summary}"
            )

            # Log metric comparison summary
            self._log_metric_comparison(results, composite_result)

            return composite_result

        except Exception as e:
            total_time = time.time() - pipeline_start
            error_type = type(e).__name__
            logger.error(
                f"Pipeline evaluation failed after {total_time:.2f}s with {error_type}: {e}"
            )

            # Record pipeline-level failure for monitoring
            # Note: Using tier 0 for pipeline-level failures
            self.performance_monitor.record_tier_failure(0, "critical_error", total_time, str(e))
            self.performance_monitor.finalize_execution(total_time)

            # Re-raise so callers observe the original failure after recording
            raise

    def get_execution_stats(self) -> dict[str, Any]:
        """Return detailed execution statistics from the last pipeline run.

        Returns:
            Dictionary with timing and execution details, including the
            performance analysis produced by the monitor.
        """
        monitor = self.performance_monitor
        return monitor.get_execution_stats()

    def get_pipeline_summary(self) -> dict[str, Any]:
        """Return a snapshot of the pipeline's configuration.

        Returns:
            Dictionary with pipeline configuration details.
        """
        settings = self.settings
        summary: dict[str, Any] = {
            "config_path": None,
            "enabled_tiers": sorted(settings.get_enabled_tiers()),
            "fallback_strategy": settings.fallback_strategy,
            "performance_targets": settings.get_performance_targets(),
        }
        # The settings-based pipeline always carries per-tier configuration.
        summary.update(has_tier1_config=True, has_tier2_config=True, has_tier3_config=True)
        return summary
Attributes
config_path property

Get configuration path (backward compatibility property).

Returns:

Type Description
Path | None

Always None (settings-based configuration only)

enabled_tiers property

Get enabled tiers (backward compatibility property).

Returns:

Type Description
set[int]

Set of enabled tier numbers

execution_stats property

Get execution statistics (backward compatibility property).

Returns:

Type Description
dict[str, Any]

Dictionary with execution statistics

fallback_strategy property

Get fallback strategy (backward compatibility property).

Returns:

Type Description
str

Fallback strategy name

performance_targets property

Get performance targets (backward compatibility property).

Returns:

Type Description
dict[str, float]

Dictionary of performance targets

Functions
__init__(settings=None, chat_provider=None, chat_model=None)

Initialize evaluation pipeline with configuration.

Parameters:

Name Type Description Default
settings JudgeSettings | None

JudgeSettings instance. If None, uses default JudgeSettings().

None
chat_provider str | None

Active chat provider from agent system. Passed to LLMJudgeEngine for tier2_provider=auto mode.

None
chat_model str | None

Active chat model from agent system. Forwarded to LLMJudgeEngine for model inheritance in auto mode.

None

Raises:

Type Description
ValueError

If configuration is invalid

Source code in src/app/judge/evaluation_pipeline.py
def __init__(
    self,
    settings: JudgeSettings | None = None,
    chat_provider: str | None = None,
    chat_model: str | None = None,
):
    """Initialize evaluation pipeline with configuration.

    Args:
        settings: JudgeSettings instance. If None, uses default JudgeSettings().
        chat_provider: Active chat provider from agent system. Passed to
            LLMJudgeEngine for tier2_provider=auto mode.
        chat_model: Active chat model from agent system. Forwarded to
            LLMJudgeEngine for model inheritance in auto mode.

    Raises:
        ValueError: If configuration is invalid
    """
    # Fall back to default settings when none are supplied.
    if settings is None:
        settings = JudgeSettings()

    self.settings = settings
    self.chat_provider = chat_provider
    self.chat_model = chat_model

    # Monitoring and the four engines all share the same settings instance.
    self.performance_monitor = PerformanceMonitor(settings.get_performance_targets())
    self.traditional_engine = TraditionalMetricsEngine()
    self.llm_engine = LLMJudgeEngine(
        settings, chat_provider=chat_provider, chat_model=chat_model
    )
    self.graph_engine = GraphAnalysisEngine(settings)
    self.composite_scorer = CompositeScorer(settings=settings)

    enabled_tiers = sorted(settings.get_enabled_tiers())
    logger.info(
        f"EvaluationPipeline initialized with JudgeSettings: tiers={enabled_tiers}, "
        f"fallback_strategy={settings.fallback_strategy}, chat_provider={chat_provider}"
    )
evaluate_comprehensive(paper, review, execution_trace=None, reference_reviews=None) async

Execute comprehensive three-tier evaluation pipeline.

Parameters:

Name Type Description Default
paper str

Paper content text for evaluation

required
review str

Generated review text to assess

required
execution_trace GraphTraceData | dict[str, Any] | None

Optional execution trace (GraphTraceData or dict) for graph analysis

None
reference_reviews list[str] | None

Optional list of ground truth reviews for similarity

None

Returns:

Type Description
CompositeResult

CompositeResult with scores from all applicable tiers

Raises:

Type Description
ValueError

If critical evaluation components fail

Source code in src/app/judge/evaluation_pipeline.py
async def evaluate_comprehensive(
    self,
    paper: str,
    review: str,
    execution_trace: GraphTraceData | dict[str, Any] | None = None,
    reference_reviews: list[str] | None = None,
) -> CompositeResult:
    """Execute comprehensive three-tier evaluation pipeline.

    Args:
        paper: Paper content text for evaluation
        review: Generated review text to assess
        execution_trace: Optional execution trace (GraphTraceData or dict) for graph analysis
        reference_reviews: Optional list of ground truth reviews for similarity

    Returns:
        CompositeResult with scores from all applicable tiers

    Raises:
        ValueError: If critical evaluation components fail
    """
    # Retain GraphTraceData for composite scoring, convert to dict for tier execution
    trace_obj: GraphTraceData | None = None
    trace_dict: dict[str, Any] | None = None
    if execution_trace is not None:
        if isinstance(execution_trace, GraphTraceData):
            trace_obj = execution_trace
            trace_dict = execution_trace.model_dump()
        else:
            trace_dict = execution_trace

    # Execute comprehensive evaluation pipeline
    pipeline_start = time.time()
    logger.info("Starting comprehensive three-tier evaluation pipeline")

    # Reset execution stats for new evaluation
    self.performance_monitor.reset_stats()

    try:
        # Execute all enabled tiers
        tier1_result, _ = await self._execute_tier1(paper, review, reference_reviews)
        tier2_result, _ = await self._execute_tier2(paper, review, trace_dict)
        tier3_result, _ = await self._execute_tier3(trace_dict)

        # Execution times are already tracked by performance_monitor in tier methods

        # Assemble results
        results = EvaluationResults(
            tier1=tier1_result,
            tier2=tier2_result,
            tier3=tier3_result,
        )

        # Apply fallback strategy if needed (but NOT for Tier 2 when provider unavailable)
        if not results.is_complete() and self._should_apply_fallback(results):
            results = self._apply_fallback_strategy(results)

        # Generate composite score with appropriate weight handling
        composite_result = self._generate_composite_score(results, trace_data=trace_obj)

        # Finalize performance monitoring
        total_time = time.time() - pipeline_start
        self.performance_monitor.finalize_execution(total_time)

        # Get execution statistics and performance summary
        execution_stats = self.performance_monitor.get_execution_stats()
        performance_summary = self.performance_monitor.get_performance_summary()

        logger.info(
            f"Pipeline completed in {total_time:.2f}s, "
            f"tiers executed: {execution_stats['tiers_executed']}, "
            f"composite score: {composite_result.composite_score:.3f}, "
            f"performance: {performance_summary}"
        )

        # Log metric comparison summary
        self._log_metric_comparison(results, composite_result)

        return composite_result

    except Exception as e:
        total_time = time.time() - pipeline_start
        error_type = type(e).__name__
        logger.error(
            f"Pipeline evaluation failed after {total_time:.2f}s with {error_type}: {e}"
        )

        # Record pipeline-level failure for monitoring
        # Note: Using tier 0 for pipeline-level failures
        self.performance_monitor.record_tier_failure(0, "critical_error", total_time, str(e))
        self.performance_monitor.finalize_execution(total_time)

        raise
get_execution_stats()

Get detailed execution statistics from last pipeline run.

Returns:

Type Description
dict[str, Any]

Dictionary with timing and execution details including performance analysis

Source code in src/app/judge/evaluation_pipeline.py
def get_execution_stats(self) -> dict[str, Any]:
    """Get detailed execution statistics from last pipeline run.

    Returns:
        Dictionary with timing and execution details including performance analysis
    """
    return self.performance_monitor.get_execution_stats()
get_pipeline_summary()

Get pipeline configuration summary.

Returns:

Type Description
dict[str, Any]

Dictionary with pipeline configuration details

Source code in src/app/judge/evaluation_pipeline.py
def get_pipeline_summary(self) -> dict[str, Any]:
    """Get pipeline configuration summary.

    Returns:
        Dictionary with pipeline configuration details
    """
    return {
        "config_path": None,
        "enabled_tiers": sorted(self.settings.get_enabled_tiers()),
        "fallback_strategy": self.settings.fallback_strategy,
        "performance_targets": self.settings.get_performance_targets(),
        "has_tier1_config": True,
        "has_tier2_config": True,
        "has_tier3_config": True,
    }

app.judge.evaluation_runner

Evaluation orchestration extracted from the main entry point.

Handles post-execution evaluation pipeline, baseline comparisons, and interaction graph construction from trace data.

Classes

Functions

build_graph_from_trace(execution_id)

Build interaction graph from execution trace data.

Parameters:

Name Type Description Default
execution_id str | None

Execution ID for trace retrieval.

required

Returns:

Type Description
DiGraph[str] | None

NetworkX DiGraph if trace data available, None otherwise.

Source code in src/app/judge/evaluation_runner.py
def build_graph_from_trace(execution_id: str | None) -> nx.DiGraph[str] | None:
    """Build interaction graph from execution trace data.

    Args:
        execution_id: Execution ID for trace retrieval.

    Returns:
        NetworkX DiGraph when trace data is available, otherwise None.
    """
    if not execution_id:
        return None

    # NOTE(review): imported here rather than at module top — likely to avoid
    # an import cycle with trace_processors; confirm before hoisting.
    from app.judge.trace_processors import get_trace_collector

    execution_trace = get_trace_collector().load_trace(execution_id)
    if not execution_trace:
        return None

    graph = build_interaction_graph(execution_trace)
    logger.info(
        f"Built interaction graph: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges"
    )
    return graph

run_baseline_comparisons(pipeline, pydantic_result, cc_solo_dir, cc_teams_dir, cc_teams_tasks_dir) async

Run baseline comparisons against Claude Code solo and teams if directories provided.

Parameters:

Name Type Description Default
pipeline EvaluationPipeline

Evaluation pipeline instance.

required
pydantic_result CompositeResult | None

PydanticAI evaluation result.

required
cc_solo_dir str | None

Path to Claude Code solo artifacts directory.

required
cc_teams_dir str | None

Path to Claude Code teams artifacts directory.

required
cc_teams_tasks_dir str | None

Path to Claude Code teams tasks directory (optional, auto-discovered if not specified).

required
Source code in src/app/judge/evaluation_runner.py
async def _evaluate_cc_baseline(
    pipeline: EvaluationPipeline,
    label: str,
    artifacts_dir: str,
    tasks_dir: str | None = None,
) -> CompositeResult | None:
    """Evaluate one Claude Code baseline run from its artifacts directory.

    Args:
        pipeline: Evaluation pipeline instance.
        label: Baseline label used in log messages ("solo" or "teams").
        artifacts_dir: Path to the Claude Code artifacts directory.
        tasks_dir: Optional tasks directory (teams only; the adapter
            auto-discovers it when None).

    Returns:
        CompositeResult for the baseline, or None if evaluation failed.
    """
    try:
        logger.info(f"Evaluating Claude Code {label} baseline from {artifacts_dir}")
        # tasks_dir=None lets CCTraceAdapter auto-discover the tasks directory
        # (matches the behavior of omitting the keyword entirely).
        adapter = CCTraceAdapter(
            Path(artifacts_dir), tasks_dir=Path(tasks_dir) if tasks_dir else None
        )
        trace = adapter.parse()
        result = await pipeline.evaluate_comprehensive(
            paper="",
            review="",
            execution_trace=trace,
            reference_reviews=None,
        )
        logger.info(f"Claude Code {label} baseline score: {result.composite_score:.2f}")
        return result
    except Exception as e:
        # Baselines are best-effort: a failed baseline must not abort the
        # main evaluation, so log and fall through to None.
        logger.warning(f"Failed to evaluate Claude Code {label} baseline: {e}")
        return None


async def run_baseline_comparisons(
    pipeline: EvaluationPipeline,
    pydantic_result: CompositeResult | None,
    cc_solo_dir: str | None,
    cc_teams_dir: str | None,
    cc_teams_tasks_dir: str | None,
) -> None:
    """Run baseline comparisons against Claude Code solo and teams if directories provided.

    Args:
        pipeline: Evaluation pipeline instance.
        pydantic_result: PydanticAI evaluation result.
        cc_solo_dir: Path to Claude Code solo artifacts directory.
        cc_teams_dir: Path to Claude Code teams artifacts directory.
        cc_teams_tasks_dir: Path to Claude Code teams tasks directory (optional,
                           auto-discovered if not specified).
    """
    if not cc_solo_dir and not cc_teams_dir:
        return

    logger.info("Running baseline comparisons...")

    # Evaluate each baseline whose directory was provided (shared helper
    # replaces two previously duplicated try/except branches).
    cc_solo_result: CompositeResult | None = None
    if cc_solo_dir:
        cc_solo_result = await _evaluate_cc_baseline(pipeline, "solo", cc_solo_dir)

    cc_teams_result: CompositeResult | None = None
    if cc_teams_dir:
        cc_teams_result = await _evaluate_cc_baseline(
            pipeline, "teams", cc_teams_dir, tasks_dir=cc_teams_tasks_dir
        )

    # Generate and log comparisons
    comparisons = compare_all(pydantic_result, cc_solo_result, cc_teams_result)
    for comparison in comparisons:
        logger.info(f"Baseline comparison: {comparison.summary}")

run_evaluation_if_enabled(skip_eval, paper_id, execution_id, cc_solo_dir=None, cc_teams_dir=None, cc_teams_tasks_dir=None, chat_provider=None, chat_model=None, judge_settings=None, manager_output=None, review_text=None, run_dir=None, execution_trace=None, engine_type='mas') async

Run evaluation pipeline after manager completes if enabled.

Parameters:

Name Type Description Default
skip_eval bool

Whether to skip evaluation via CLI flag.

required
paper_id str | None

Paper ID for PeerRead review (indicates ground truth availability).

required
execution_id str | None

Execution ID for trace retrieval.

required
cc_solo_dir str | None

Path to Claude Code solo artifacts directory for baseline comparison.

None
cc_teams_dir str | None

Path to Claude Code teams artifacts directory for baseline comparison.

None
cc_teams_tasks_dir str | None

Path to Claude Code teams tasks directory (optional, auto-discovered if not specified).

None
chat_provider str | None

Active chat provider from agent system.

None
chat_model str | None

Active chat model from agent system. Forwarded to LLMJudgeEngine for model inheritance when tier2_provider=auto.

None
judge_settings JudgeSettings | None

Optional JudgeSettings override from GUI or programmatic calls.

None
manager_output Any

Manager result output containing ReviewGenerationResult (optional).

None
review_text str | None

Pre-extracted review text (e.g. from CC engine). When provided, overrides text extraction from manager_output.

None
run_dir Path | None

Optional per-run output directory. When provided, evaluation results are persisted to evaluation.json in this directory.

None
execution_trace Any

Optional pre-built GraphTraceData (e.g. from CC engine). When provided, skips SQLite trace lookup. When None, falls back to trace_collector.load_trace() (existing MAS behavior).

None
engine_type str

Source engine identifier ('mas', 'cc_solo', or 'cc_teams'). Set on CompositeResult before persisting to evaluation.json.

'mas'

Returns:

Type Description
CompositeResult | None

CompositeResult from PydanticAI evaluation or None if skipped.

Source code in src/app/judge/evaluation_runner.py
async def run_evaluation_if_enabled(
    skip_eval: bool,
    paper_id: str | None,
    execution_id: str | None,
    cc_solo_dir: str | None = None,
    cc_teams_dir: str | None = None,
    cc_teams_tasks_dir: str | None = None,
    chat_provider: str | None = None,
    chat_model: str | None = None,
    judge_settings: JudgeSettings | None = None,
    manager_output: Any = None,
    review_text: str | None = None,
    run_dir: Path | None = None,
    execution_trace: Any = None,
    engine_type: str = "mas",
) -> CompositeResult | None:
    """Run evaluation pipeline after manager completes if enabled.

    Args:
        skip_eval: Whether to skip evaluation via CLI flag.
        paper_id: Paper ID for PeerRead review (indicates ground truth availability).
        execution_id: Execution ID for trace retrieval.
        cc_solo_dir: Path to Claude Code solo artifacts directory for baseline comparison.
        cc_teams_dir: Path to Claude Code teams artifacts directory for baseline comparison.
        cc_teams_tasks_dir: Path to Claude Code teams tasks directory (optional,
                           auto-discovered if not specified).
        chat_provider: Active chat provider from agent system.
        chat_model: Active chat model from agent system. Forwarded to LLMJudgeEngine
            for model inheritance when tier2_provider=auto.
        judge_settings: Optional JudgeSettings override from GUI or programmatic calls.
        manager_output: Manager result output containing ReviewGenerationResult (optional).
        review_text: Pre-extracted review text (e.g. from CC engine). When provided,
            overrides text extraction from manager_output.
        run_dir: Optional per-run output directory. When provided, evaluation results
            are persisted to evaluation.json in this directory.
        execution_trace: Optional pre-built GraphTraceData (e.g. from CC engine).
            When provided, skips SQLite trace lookup. When None, falls back to
            trace_collector.load_trace() (existing MAS behavior).
        engine_type: Source engine identifier ('mas', 'cc_solo', or 'cc_teams').
            Set on CompositeResult before persisting to evaluation.json.

    Returns:
        CompositeResult from PydanticAI evaluation or None if skipped.
    """
    if skip_eval:
        logger.info("Evaluation skipped via --skip-eval flag")
        return None

    logger.info("Running evaluation pipeline...")
    pipeline = EvaluationPipeline(
        settings=judge_settings, chat_provider=chat_provider, chat_model=chat_model
    )

    # Fixed misleading log: evaluation is NOT skipped when paper_id is
    # missing — it proceeds below with reference_reviews=None (the pipeline
    # accepts None, as the baseline-comparison calls demonstrate).
    if not paper_id:
        logger.info("No ground-truth reviews available; evaluating without reference reviews")

    execution_trace = _resolve_execution_trace(execution_trace, execution_id)

    # Extract paper and review content from manager_output (or use override)
    paper_content, extracted_review = _extract_paper_and_review_content(manager_output)

    # CC paper content fallback: when manager_output is None (CC path) but paper_id
    # is available, load paper content directly from PeerRead cache
    if not paper_content and paper_id:
        paper_content = _load_paper_content(paper_id)

    # S10-F1: CC engine passes review_text directly, overriding extraction
    if review_text is None:
        review_text = extracted_review

    # S10-F1: load reference reviews from PeerRead for all modes (fixes hardcoded None)
    reference_reviews = _load_reference_reviews(paper_id)

    pydantic_result = await pipeline.evaluate_comprehensive(
        paper=paper_content,
        review=review_text,
        execution_trace=execution_trace,
        reference_reviews=reference_reviews,
    )

    # Set engine_type before persisting so evaluation.json has the correct value
    if pydantic_result is not None:  # type: ignore[reportUnnecessaryComparison]
        pydantic_result.engine_type = engine_type

    # Persist evaluation results to run directory
    if run_dir is not None:
        eval_path = run_dir / "evaluation.json"
        eval_path.write_text(json.dumps(pydantic_result.model_dump(), indent=2), encoding="utf-8")
        get_artifact_registry().register("Evaluation", eval_path)
        logger.info(f"Evaluation results written to {eval_path}")

    # Run baseline comparisons if Claude Code directories provided
    await run_baseline_comparisons(
        pipeline, pydantic_result, cc_solo_dir, cc_teams_dir, cc_teams_tasks_dir
    )

    return pydantic_result

app.judge.graph_analysis

Graph-based analysis engine for Tier 3 evaluation.

Provides NetworkX-based analysis of agent coordination patterns, tool usage efficiency, and communication overhead with streamlined implementation focusing on essential multi-agent interaction metrics.

Note: This module contains type: ignore comments for NetworkX operations due to incomplete type hints in the NetworkX library itself.

Classes

GraphAnalysisEngine

NetworkX-based graph analysis engine for agent coordination evaluation.

Implements essential graph-based complexity metrics for multi-agent systems with focus on tool usage patterns, communication efficiency, and coordination quality using lightweight NetworkX operations.

Source code in src/app/judge/graph_analysis.py
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
class GraphAnalysisEngine:
    """NetworkX-based graph analysis engine for agent coordination evaluation.

    Implements essential graph-based complexity metrics for multi-agent systems
    with focus on tool usage patterns, communication efficiency, and coordination
    quality using lightweight NetworkX operations.
    """

    def __init__(self, settings: JudgeSettings) -> None:
        """Initialize graph analysis engine with settings.

        Args:
            settings: JudgeSettings instance with tier3 configuration.

        Raises:
            ValueError: If configuration is invalid
        """
        self.settings = settings

        self.min_nodes_for_analysis = settings.tier3_min_nodes
        self.centrality_measures = list(settings.tier3_centrality_measures)

        # Weights for composite scoring
        self.weights = {
            "path_convergence": 0.3,
            "tool_accuracy": 0.25,
            "coordination_quality": 0.25,
            "task_balance": 0.2,
        }

        # Resource limits for production safety
        self.max_nodes = settings.tier3_max_nodes
        self.max_edges = settings.tier3_max_edges
        self.operation_timeout = settings.tier3_operation_timeout

    def _validate_trace_data(self, trace_data: GraphTraceData) -> None:
        """Validate GraphTraceData structure and content before analysis.

        Args:
            trace_data: Execution trace data to validate

        Raises:
            ValueError: If trace data is invalid or incomplete
        """
        if not trace_data.execution_id:
            raise ValueError("execution_id is required in trace data")

        self._validate_agent_interactions(trace_data.agent_interactions)
        self._validate_tool_calls(trace_data.tool_calls)
        self._check_data_size_limits(trace_data)

    def _validate_agent_interactions(self, interactions: list[dict[str, Any]]) -> None:
        """Validate agent interactions structure.

        Raises:
            ValueError: If any interaction lacks a non-empty 'from'/'to' field.
        """
        for i, interaction in enumerate(interactions):
            if "from" not in interaction or "to" not in interaction:
                raise ValueError(f"Agent interaction {i} missing 'from' or 'to' field")
            if not interaction["from"] or not interaction["to"]:
                raise ValueError(f"Agent interaction {i} has empty 'from' or 'to' field")

    def _validate_tool_calls(self, tool_calls: list[dict[str, Any]]) -> None:
        """Validate tool calls structure.

        Raises:
            ValueError: If any tool call lacks a non-empty 'agent_id' field.
        """
        for i, call in enumerate(tool_calls):
            if "agent_id" not in call:
                raise ValueError(f"Tool call {i} missing 'agent_id' field")
            if not call["agent_id"]:
                raise ValueError(f"Tool call {i} has empty 'agent_id' field")

    def _check_data_size_limits(self, trace_data: GraphTraceData) -> None:
        """Check trace data against size limits.

        Only warns (does not raise) so oversized traces are still analyzed.
        """
        total_interactions = len(trace_data.agent_interactions)
        total_calls = len(trace_data.tool_calls)
        total_events = total_interactions + total_calls

        if total_events > self.max_nodes:
            logger.warning(f"Trace has {total_events} events, exceeding max_nodes={self.max_nodes}")

        # Each tool call may contribute an agent->tool edge plus a node, hence *2.
        estimated_edges = total_interactions + (total_calls * 2)
        if estimated_edges > self.max_edges:
            logger.warning(
                f"Trace may generate ~{estimated_edges} edges, exceeding max_edges={self.max_edges}"
            )

    def _with_timeout(self, func: Any, *args: Any, **kwargs: Any) -> Any:
        """Execute function with thread-safe timeout protection.

        Uses ThreadPoolExecutor to enable timeout in both main and non-main threads
        (e.g., Streamlit GUI context). Replaces signal-based timeout which only
        works in the main thread.

        Args:
            func: Function to execute
            *args: Function arguments
            **kwargs: Function keyword arguments

        Returns:
            Function result or raises TimeoutError

        Raises:
            TimeoutError: If operation exceeds timeout limit
            NetworkXError: If NetworkX operation fails
        """
        with ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(func, *args, **kwargs)
            try:
                result = future.result(timeout=self.operation_timeout)
                return result
            except FuturesTimeoutError as e:
                logger.error(f"Graph operation timed out after {self.operation_timeout}s")
                # Chain the futures timeout so the original cause is preserved
                # in tracebacks (was previously raised without a cause).
                raise TimeoutError(
                    f"Graph operation exceeded {self.operation_timeout}s timeout"
                ) from e
            except (
                nx.NetworkXError,
                nx.NetworkXPointlessConcept,
                nx.NetworkXAlgorithmError,
            ) as e:
                logger.warning(f"NetworkX operation failed: {e}")
                raise

    def _accumulate_tool_outcomes(
        self, tool_calls: list[dict[str, Any]]
    ) -> tuple[dict[str, list[bool]], dict[tuple[str, str], list[bool]]]:
        """Accumulate tool and edge outcomes from tool calls.

        Args:
            tool_calls: List of tool call dictionaries

        Returns:
            Tuple of (tool_outcomes, edge_outcomes) dictionaries
        """
        tool_outcomes: dict[str, list[bool]] = {}
        edge_outcomes: dict[tuple[str, str], list[bool]] = {}

        for i, call in enumerate(tool_calls):
            tool_name = call.get("tool_name", f"tool_{i}")
            agent_id = call.get("agent_id", f"agent_{i}")
            success = call.get("success", False)

            # Accumulate outcomes instead of overwriting
            tool_outcomes.setdefault(tool_name, []).append(success)
            edge_outcomes.setdefault((agent_id, tool_name), []).append(success)

        return tool_outcomes, edge_outcomes

    def _build_tool_graph(
        self,
        tool_calls: list[dict[str, Any]],
        tool_outcomes: dict[str, list[bool]],
        edge_outcomes: dict[tuple[str, str], list[bool]],
    ) -> Any:
        """Build tool usage graph with accumulated metrics.

        Args:
            tool_calls: List of tool call dictionaries
            tool_outcomes: Accumulated tool outcomes
            edge_outcomes: Accumulated edge outcomes

        Returns:
            NetworkX directed graph
        """
        tool_graph = nx.DiGraph()

        # Add tool nodes with accumulated success rates
        for tool_name, outcomes in tool_outcomes.items():
            success_rate = sum(outcomes) / len(outcomes)
            tool_graph.add_node(tool_name, type="tool", success_rate=success_rate)

        # Add agent nodes. The index-based fallback matches
        # _accumulate_tool_outcomes; previously the whole call dict was
        # interpolated into the fallback node name (f"agent_{call}").
        for idx, call in enumerate(tool_calls):
            agent_id = call.get("agent_id", f"agent_{idx}")
            if not tool_graph.has_node(agent_id):
                tool_graph.add_node(agent_id, type="agent")

        # Add edges with accumulated weights (success=1.0, failure=0.5)
        for (agent_id, tool_name), outcomes in edge_outcomes.items():
            avg_weight = sum(1.0 if s else 0.5 for s in outcomes) / len(outcomes)
            tool_graph.add_edge(agent_id, tool_name, weight=avg_weight)

        return tool_graph

    def analyze_tool_usage_patterns(self, trace_data: GraphTraceData) -> dict[str, float]:
        """Analyze tool usage efficiency and selection patterns.

        Args:
            trace_data: Processed execution trace data

        Returns:
            Dictionary with tool analysis metrics
        """
        # Validate trace data first
        self._validate_trace_data(trace_data)

        if not trace_data.tool_calls:
            return {"path_convergence": 0.0, "tool_selection_accuracy": 0.0}

        try:
            # Accumulate outcomes and build graph
            tool_outcomes, edge_outcomes = self._accumulate_tool_outcomes(trace_data.tool_calls)
            tool_graph = self._build_tool_graph(trace_data.tool_calls, tool_outcomes, edge_outcomes)

            if len(tool_graph.nodes) < self.min_nodes_for_analysis:  # type: ignore[arg-type]
                # Too small to analyze meaningfully: return neutral scores
                return {"path_convergence": 0.5, "tool_selection_accuracy": 0.5}

            # Calculate path convergence using graph connectivity
            path_convergence = self._calculate_path_convergence(tool_graph)

            # Calculate tool selection accuracy from success rates
            tool_nodes = [n for n, d in tool_graph.nodes(data=True) if d.get("type") == "tool"]
            if tool_nodes:
                success_rates = [
                    tool_graph.nodes[tool].get("success_rate", 0.0) for tool in tool_nodes
                ]
                tool_accuracy = sum(success_rates) / len(success_rates)
            else:
                tool_accuracy = 0.0

            return {
                "path_convergence": path_convergence,
                "tool_selection_accuracy": tool_accuracy,
            }

        except Exception as e:
            logger.warning(f"Tool usage pattern analysis failed: {e}")
            return {"path_convergence": 0.0, "tool_selection_accuracy": 0.0}

    def analyze_agent_interactions(self, trace_data: GraphTraceData) -> dict[str, float]:
        """Analyze agent-to-agent communication and coordination patterns.

        Args:
            trace_data: Processed execution trace data

        Returns:
            Dictionary with interaction analysis metrics
        """
        self._validate_trace_data(trace_data)

        if not trace_data.agent_interactions:
            return {"communication_overhead": 1.0, "coordination_centrality": 0.0}

        try:
            interaction_graph = self._build_interaction_graph(trace_data.agent_interactions)

            if len(interaction_graph.nodes) < self.min_nodes_for_analysis:  # type: ignore[arg-type]
                return {"communication_overhead": 0.8, "coordination_centrality": 0.5}

            efficiency_ratio = self._calculate_communication_efficiency(interaction_graph)
            max_centrality = self._calculate_coordination_centrality(interaction_graph)

            # NOTE(review): the "communication_overhead" key actually carries an
            # efficiency ratio (higher = better) — key name kept for callers.
            return {
                "communication_overhead": efficiency_ratio,
                "coordination_centrality": max_centrality,
            }

        except Exception as e:
            logger.warning(f"Agent interaction analysis failed: {e}")
            return {"communication_overhead": 0.5, "coordination_centrality": 0.0}

    def _build_interaction_graph(self, interactions: list[dict[str, Any]]) -> Any:
        """Build NetworkX graph from agent interactions."""
        interaction_graph = nx.DiGraph()

        for interaction in interactions:
            from_agent = interaction.get("from", "unknown")
            to_agent = interaction.get("to", "unknown")
            interaction_type = interaction.get("type", "communication")

            # Delegation/coordination edges are weighted higher than chatter
            weight = 1.0 if interaction_type in ["delegation", "coordination"] else 0.5
            interaction_graph.add_edge(from_agent, to_agent, weight=weight)

        return interaction_graph

    def _calculate_communication_efficiency(self, graph: Any) -> float:
        """Calculate communication efficiency ratio (capped at 1.0)."""
        total_edges = len(graph.edges)  # type: ignore[arg-type]
        total_nodes = len(graph.nodes)  # type: ignore[arg-type]

        if total_nodes <= 1:
            return 1.0

        # n*log2(n) is used as the ideal communication volume baseline
        ideal_communications = total_nodes * math.log2(total_nodes)
        return min(1.0, ideal_communications / max(1, total_edges))

    def _calculate_coordination_centrality(self, graph: Any) -> float:
        """Calculate coordination centrality from betweenness."""
        if len(graph.nodes) <= 2:  # type: ignore[arg-type]
            return 0.5

        centrality_scores = nx.betweenness_centrality(graph)  # type: ignore[arg-type]
        return max(centrality_scores.values()) if centrality_scores else 0.0  # type: ignore[arg-type]

    def analyze_task_distribution(self, trace_data: GraphTraceData) -> float:
        """Analyze task distribution balance across agents.

        Args:
            trace_data: Processed execution trace data

        Returns:
            Task distribution balance score (0.0-1.0)
        """
        self._validate_trace_data(trace_data)

        try:
            agent_activities = self._count_agent_activities(trace_data)

            if not agent_activities:
                return 0.0

            activities = list(agent_activities.values())
            if len(activities) <= 1:
                # A single agent is trivially balanced
                return 1.0

            return self._calculate_balance_score(activities)

        except Exception as e:
            logger.warning(f"Task distribution analysis failed: {e}")
            return 0.0

    def _count_agent_activities(self, trace_data: GraphTraceData) -> dict[str, int]:
        """Count activities per agent from trace data.

        Tool calls count toward the calling agent; interactions count toward
        the initiating ('from') agent only.
        """
        agent_activities: dict[str, int] = {}

        for call in trace_data.tool_calls:
            agent_id = call.get("agent_id", "unknown")
            agent_activities[agent_id] = agent_activities.get(agent_id, 0) + 1

        for interaction in trace_data.agent_interactions:
            from_agent = interaction.get("from", "unknown")
            agent_activities[from_agent] = agent_activities.get(from_agent, 0) + 1

        return agent_activities

    def _calculate_balance_score(self, activities: list[int]) -> float:
        """Calculate balance score from activity counts.

        Uses 1 - coefficient of variation, clamped to [0.0, 1.0].
        """
        mean_activity = sum(activities) / len(activities)
        if mean_activity == 0:
            return 0.0

        variance = sum((x - mean_activity) ** 2 for x in activities) / len(activities)
        std_dev = math.sqrt(variance)
        cv = std_dev / mean_activity

        balance_score = max(0.0, 1.0 - cv)
        return min(1.0, balance_score)

    def _calculate_path_convergence(self, graph: Any) -> float:
        """Calculate path convergence efficiency in tool usage graph.

        Args:
            graph: NetworkX graph of tool usage patterns

        Returns:
            Path convergence score (0.0-1.0)
        """
        if len(graph.nodes) < 2:
            return 0.5

        try:
            undirected_graph = graph.to_undirected()
            if not nx.is_connected(undirected_graph):
                return 0.2  # Disconnected graph has poor convergence

            return self._calculate_connected_graph_convergence(graph, undirected_graph)
        except Exception as e:
            logger.debug(f"Path convergence calculation failed: {e}")
            return 0.0

    def _calculate_connected_graph_convergence(self, graph: Any, undirected_graph: Any) -> float:
        """Calculate convergence for connected graph."""
        try:
            avg_path_length = self._with_timeout(nx.average_shortest_path_length, undirected_graph)
            return self._normalize_path_length(len(graph.nodes), avg_path_length)
        except (TimeoutError, nx.NetworkXError):
            logger.warning("Path length calculation failed or timed out")
            return 0.3

    def _normalize_path_length(self, num_nodes: int, avg_path_length: float) -> float:
        """Normalize average path length to convergence score.

        Maps avg_path_length=1 (fully connected) to 1.0 and the maximum
        possible length (num_nodes - 1) to 0.0, clamped to [0.0, 1.0].
        """
        max_possible_length = num_nodes - 1
        denominator = max_possible_length - 1

        if denominator <= 0:
            return 1.0 if num_nodes == 2 else 0.5

        convergence = 1.0 - (avg_path_length - 1) / denominator
        return max(0.0, min(1.0, convergence))

    def evaluate_graph_metrics(self, trace_data: GraphTraceData) -> Tier3Result:
        """Complete graph-based analysis evaluation.

        Args:
            trace_data: Processed execution trace data

        Returns:
            Tier3Result with all graph analysis metrics
        """
        try:
            # Analyze different aspects of the execution graph
            tool_metrics = self.analyze_tool_usage_patterns(trace_data)
            interaction_metrics = self.analyze_agent_interactions(trace_data)
            task_balance = self.analyze_task_distribution(trace_data)

            # Extract individual metrics
            path_convergence = tool_metrics.get("path_convergence", 0.0)
            tool_accuracy = tool_metrics.get("tool_selection_accuracy", 0.0)
            coordination_quality = interaction_metrics.get("coordination_centrality", 0.0)

            # Calculate graph complexity (total unique nodes)
            unique_agents = set()
            for interaction in trace_data.agent_interactions:
                unique_agents.add(interaction.get("from", "unknown"))
                unique_agents.add(interaction.get("to", "unknown"))
            for call in trace_data.tool_calls:
                unique_agents.add(call.get("agent_id", "unknown"))
            graph_complexity = len(unique_agents)

            # Calculate weighted overall score
            overall_score = (
                path_convergence * self.weights.get("path_convergence", 0.3)
                + tool_accuracy * self.weights.get("tool_accuracy", 0.25)
                + coordination_quality * self.weights.get("coordination_quality", 0.25)
                + task_balance * self.weights.get("task_balance", 0.2)
            )

            return Tier3Result(
                path_convergence=path_convergence,
                tool_selection_accuracy=tool_accuracy,
                coordination_centrality=coordination_quality,
                task_distribution_balance=task_balance,
                overall_score=overall_score,
                graph_complexity=graph_complexity,
            )

        except Exception as e:
            logger.error(f"Graph metrics evaluation failed: {e}")
            # Return minimal baseline scores
            return Tier3Result(
                path_convergence=0.0,
                tool_selection_accuracy=0.0,
                coordination_centrality=0.0,
                task_distribution_balance=0.0,
                overall_score=0.0,
                graph_complexity=0,
            )

    def export_trace_to_networkx(self, trace_data: GraphTraceData) -> nx.DiGraph[str] | None:
        """Export trace data to NetworkX graph for Phoenix visualization.

        Args:
            trace_data: Execution trace data to convert

        Returns:
            NetworkX directed graph or None if export fails
        """
        try:
            graph = nx.DiGraph()
            agent_nodes = self._add_agent_interactions_to_graph(
                graph, trace_data.agent_interactions
            )
            self._add_tool_usage_to_graph(graph, trace_data.tool_calls)
            self._add_graph_metadata(graph, trace_data, agent_nodes)

            logger.debug(
                f"Exported NetworkX graph: {graph.number_of_nodes()} nodes, "
                f"{graph.number_of_edges()} edges"
            )
            return graph

        except Exception as e:
            logger.error(f"Failed to export trace to NetworkX: {e}")
            return None

    def _add_agent_interactions_to_graph(
        self, graph: Any, interactions: list[dict[str, Any]]
    ) -> set[str]:
        """Add agent nodes and interactions to graph.

        Returns:
            Set of agent node identifiers seen in the interactions.
        """
        agent_nodes: set[str] = set()

        for interaction in interactions:
            source = interaction.get("from", "unknown")
            target = interaction.get("to", "unknown")
            agent_nodes.add(source)
            agent_nodes.add(target)

            self._ensure_agent_node(graph, source)
            self._ensure_agent_node(graph, target)
            self._add_interaction_edge(graph, source, target)

        return agent_nodes

    def _ensure_agent_node(self, graph: Any, agent_id: str) -> None:
        """Ensure agent node exists in graph."""
        if not graph.has_node(agent_id):
            graph.add_node(agent_id, type="agent", interaction_count=0)

    def _add_interaction_edge(self, graph: Any, source: str, target: str) -> None:
        """Add or update interaction edge between agents."""
        if not graph.has_edge(source, target):
            graph.add_edge(source, target, interaction_count=0)

        graph.edges[source, target]["interaction_count"] += 1
        graph.nodes[source]["interaction_count"] += 1
        graph.nodes[target]["interaction_count"] += 1

    def _add_tool_usage_to_graph(self, graph: Any, tool_calls: list[dict[str, Any]]) -> None:
        """Add tool nodes and usage edges to graph."""
        for tool_call in tool_calls:
            agent_id = tool_call.get("agent_id", "unknown")
            tool_name = tool_call.get("tool_name", "unknown_tool")

            self._ensure_tool_node(graph, tool_name)
            self._add_tool_usage_edge(graph, agent_id, tool_name)

    def _ensure_tool_node(self, graph: Any, tool_name: str) -> None:
        """Ensure tool node exists in graph."""
        if not graph.has_node(tool_name):
            graph.add_node(tool_name, type="tool", usage_count=0)

    def _add_tool_usage_edge(self, graph: Any, agent_id: str, tool_name: str) -> None:
        """Add or update tool usage edge."""
        if not graph.has_edge(agent_id, tool_name):
            graph.add_edge(agent_id, tool_name, usage_count=0)

        graph.edges[agent_id, tool_name]["usage_count"] += 1
        graph.nodes[tool_name]["usage_count"] += 1

    def _add_graph_metadata(
        self, graph: Any, trace_data: GraphTraceData, agent_nodes: set[str]
    ) -> None:
        """Add metadata to graph for Phoenix visualization."""
        graph.graph.update(
            {
                "execution_id": trace_data.execution_id,
                "total_agents": len(agent_nodes),
                "total_interactions": len(trace_data.agent_interactions),
                "total_tool_calls": len(trace_data.tool_calls),
                "timing_data": trace_data.timing_data,
            }
        )
Functions
__init__(settings)

Initialize graph analysis engine with settings.

Parameters:

Name Type Description Default
settings JudgeSettings

JudgeSettings instance with tier3 configuration.

required

Raises:

Type Description
ValueError

If configuration is invalid

Source code in src/app/judge/graph_analysis.py
def __init__(self, settings: JudgeSettings) -> None:
    """Initialize graph analysis engine with settings.

    Args:
        settings: JudgeSettings instance with tier3 configuration.

    Raises:
        ValueError: If configuration is invalid
    """
    self.settings = settings

    # Hard resource caps so pathological traces cannot stall production runs.
    self.max_nodes = settings.tier3_max_nodes
    self.max_edges = settings.tier3_max_edges
    self.operation_timeout = settings.tier3_operation_timeout

    # Analysis thresholds and configured centrality measures (own copy of the list).
    self.min_nodes_for_analysis = settings.tier3_min_nodes
    self.centrality_measures = list(settings.tier3_centrality_measures)

    # Relative contribution of each metric to the composite score.
    self.weights = {
        "path_convergence": 0.3,
        "tool_accuracy": 0.25,
        "coordination_quality": 0.25,
        "task_balance": 0.2,
    }
analyze_agent_interactions(trace_data)

Analyze agent-to-agent communication and coordination patterns.

Parameters:

Name Type Description Default
trace_data GraphTraceData

Processed execution trace data

required

Returns:

Type Description
dict[str, float]

Dictionary with interaction analysis metrics

Source code in src/app/judge/graph_analysis.py
def analyze_agent_interactions(self, trace_data: GraphTraceData) -> dict[str, float]:
    """Measure agent-to-agent communication and coordination patterns.

    Args:
        trace_data: Processed execution trace data

    Returns:
        Dictionary with interaction analysis metrics
    """
    self._validate_trace_data(trace_data)

    # No recorded interactions: report perfect overhead and zero centrality.
    if not trace_data.agent_interactions:
        return {"communication_overhead": 1.0, "coordination_centrality": 0.0}

    try:
        graph = self._build_interaction_graph(trace_data.agent_interactions)

        # Too few participants for meaningful graph statistics -> neutral defaults.
        if len(graph.nodes) < self.min_nodes_for_analysis:  # type: ignore[arg-type]
            return {"communication_overhead": 0.8, "coordination_centrality": 0.5}

        return {
            "communication_overhead": self._calculate_communication_efficiency(graph),
            "coordination_centrality": self._calculate_coordination_centrality(graph),
        }

    except Exception as e:
        logger.warning(f"Agent interaction analysis failed: {e}")
        return {"communication_overhead": 0.5, "coordination_centrality": 0.0}
analyze_task_distribution(trace_data)

Analyze task distribution balance across agents.

Parameters:

Name Type Description Default
trace_data GraphTraceData

Processed execution trace data

required

Returns:

Type Description
float

Task distribution balance score (0.0-1.0)

Source code in src/app/judge/graph_analysis.py
def analyze_task_distribution(self, trace_data: GraphTraceData) -> float:
    """Score how evenly work was spread across agents.

    Args:
        trace_data: Processed execution trace data

    Returns:
        Task distribution balance score (0.0-1.0)
    """
    self._validate_trace_data(trace_data)

    try:
        activity_counts = self._count_agent_activities(trace_data)

        # Nothing recorded at all: there is no distribution to score.
        if not activity_counts:
            return 0.0

        counts = list(activity_counts.values())
        # A single active agent is trivially "balanced".
        if len(counts) <= 1:
            return 1.0

        return self._calculate_balance_score(counts)

    except Exception as e:
        logger.warning(f"Task distribution analysis failed: {e}")
        return 0.0
analyze_tool_usage_patterns(trace_data)

Analyze tool usage efficiency and selection patterns.

Parameters:

Name Type Description Default
trace_data GraphTraceData

Processed execution trace data

required

Returns:

Type Description
dict[str, float]

Dictionary with tool analysis metrics

Source code in src/app/judge/graph_analysis.py
def analyze_tool_usage_patterns(self, trace_data: GraphTraceData) -> dict[str, float]:
    """Evaluate tool usage efficiency and selection patterns.

    Args:
        trace_data: Processed execution trace data

    Returns:
        Dictionary with tool analysis metrics
    """
    self._validate_trace_data(trace_data)

    # Nothing to analyze when no tools were invoked at all.
    if not trace_data.tool_calls:
        return {"path_convergence": 0.0, "tool_selection_accuracy": 0.0}

    try:
        # Aggregate per-tool / per-edge outcomes, then materialize the graph.
        outcomes, edge_outcomes = self._accumulate_tool_outcomes(trace_data.tool_calls)
        graph = self._build_tool_graph(trace_data.tool_calls, outcomes, edge_outcomes)

        # Graph too small for meaningful statistics -> neutral midpoint scores.
        if len(graph.nodes) < self.min_nodes_for_analysis:  # type: ignore[arg-type]
            return {"path_convergence": 0.5, "tool_selection_accuracy": 0.5}

        convergence = self._calculate_path_convergence(graph)

        # Mean per-tool success rate serves as the selection-accuracy proxy.
        rates = [
            data.get("success_rate", 0.0)
            for _, data in graph.nodes(data=True)
            if data.get("type") == "tool"
        ]
        accuracy = sum(rates) / len(rates) if rates else 0.0

        return {
            "path_convergence": convergence,
            "tool_selection_accuracy": accuracy,
        }

    except Exception as e:
        logger.warning(f"Tool usage pattern analysis failed: {e}")
        return {"path_convergence": 0.0, "tool_selection_accuracy": 0.0}
evaluate_graph_metrics(trace_data)

Complete graph-based analysis evaluation.

Parameters:

Name Type Description Default
trace_data GraphTraceData

Processed execution trace data

required

Returns:

Type Description
Tier3Result

Tier3Result with all graph analysis metrics

Source code in src/app/judge/graph_analysis.py
def evaluate_graph_metrics(self, trace_data: GraphTraceData) -> Tier3Result:
    """Complete graph-based analysis evaluation.

    Args:
        trace_data: Processed execution trace data

    Returns:
        Tier3Result with all graph analysis metrics
    """
    try:
        # Run each analysis dimension over the same trace.
        tool_metrics = self.analyze_tool_usage_patterns(trace_data)
        interaction_metrics = self.analyze_agent_interactions(trace_data)
        task_balance = self.analyze_task_distribution(trace_data)

        path_convergence = tool_metrics.get("path_convergence", 0.0)
        tool_accuracy = tool_metrics.get("tool_selection_accuracy", 0.0)
        coordination_quality = interaction_metrics.get("coordination_centrality", 0.0)

        # Graph complexity = count of distinct agent identifiers seen anywhere
        # in the trace (interaction endpoints plus tool callers).
        participants: set[str] = set()
        for interaction in trace_data.agent_interactions:
            participants.add(interaction.get("from", "unknown"))
            participants.add(interaction.get("to", "unknown"))
        for call in trace_data.tool_calls:
            participants.add(call.get("agent_id", "unknown"))

        # Weighted blend of the four component scores (weights set in __init__).
        overall = (
            path_convergence * self.weights.get("path_convergence", 0.3)
            + tool_accuracy * self.weights.get("tool_accuracy", 0.25)
            + coordination_quality * self.weights.get("coordination_quality", 0.25)
            + task_balance * self.weights.get("task_balance", 0.2)
        )

        return Tier3Result(
            path_convergence=path_convergence,
            tool_selection_accuracy=tool_accuracy,
            coordination_centrality=coordination_quality,
            task_distribution_balance=task_balance,
            overall_score=overall,
            graph_complexity=len(participants),
        )

    except Exception as e:
        logger.error(f"Graph metrics evaluation failed: {e}")
        # Degrade gracefully: zeroed result instead of propagating the error.
        return Tier3Result(
            path_convergence=0.0,
            tool_selection_accuracy=0.0,
            coordination_centrality=0.0,
            task_distribution_balance=0.0,
            overall_score=0.0,
            graph_complexity=0,
        )
export_trace_to_networkx(trace_data)

Export trace data to NetworkX graph for Phoenix visualization.

Parameters:

Name Type Description Default
trace_data GraphTraceData

Execution trace data to convert

required

Returns:

Type Description
DiGraph[str] | None

NetworkX directed graph or None if export fails

Source code in src/app/judge/graph_analysis.py
def export_trace_to_networkx(self, trace_data: GraphTraceData) -> nx.DiGraph[str] | None:
    """Export trace data to NetworkX graph for Phoenix visualization.

    Args:
        trace_data: Execution trace data to convert

    Returns:
        NetworkX directed graph or None if export fails
    """
    try:
        graph = nx.DiGraph()
        # Agents first (returns the set of agent node ids), then tool usage,
        # then run-level metadata attached to the graph object.
        agents = self._add_agent_interactions_to_graph(graph, trace_data.agent_interactions)
        self._add_tool_usage_to_graph(graph, trace_data.tool_calls)
        self._add_graph_metadata(graph, trace_data, agents)

        logger.debug(
            f"Exported NetworkX graph: {graph.number_of_nodes()} nodes, "
            f"{graph.number_of_edges()} edges"
        )
        return graph

    except Exception as e:
        logger.error(f"Failed to export trace to NetworkX: {e}")
        # Export is best-effort: callers treat None as "no graph available".
        return None

Functions

evaluate_single_graph_analysis(trace_data, settings=None)

Convenience function for single graph analysis evaluation.

Parameters:

Name Type Description Default
trace_data GraphTraceData | None

Execution trace data for analysis

required
settings JudgeSettings | None

Optional JudgeSettings override. If None, uses defaults.

None

Returns:

Type Description
Tier3Result

Tier3Result with graph analysis metrics

Example

from app.judge.trace_processors import get_trace_collector collector = get_trace_collector() trace_data = collector.load_trace("execution_001") result = evaluate_single_graph_analysis(trace_data) print(f"Overall score: {result.overall_score:.3f}")

Source code in src/app/judge/graph_analysis.py
def evaluate_single_graph_analysis(
    trace_data: GraphTraceData | None, settings: JudgeSettings | None = None
) -> Tier3Result:
    """Convenience function for single graph analysis evaluation.

    Args:
        trace_data: Execution trace data for analysis
        settings: Optional JudgeSettings override. If None, uses defaults.

    Returns:
        Tier3Result with graph analysis metrics

    Example:
        >>> from app.judge.trace_processors import get_trace_collector
        >>> collector = get_trace_collector()
        >>> trace_data = collector.load_trace("execution_001")
        >>> result = evaluate_single_graph_analysis(trace_data)
        >>> print(f"Overall score: {result.overall_score:.3f}")
    """
    if settings is None:
        # Reason: imported lazily so defaults are only constructed when needed.
        from app.config.judge_settings import JudgeSettings

        settings = JudgeSettings()
    engine = GraphAnalysisEngine(settings)

    if trace_data is None:
        # Missing trace data: return an all-zero baseline instead of raising.
        return Tier3Result(
            path_convergence=0.0,
            tool_selection_accuracy=0.0,
            coordination_centrality=0.0,
            task_distribution_balance=0.0,
            overall_score=0.0,
            graph_complexity=0,
        )

    return engine.evaluate_graph_metrics(trace_data)

app.judge.graph_builder

Utility for building NetworkX graphs from GraphTraceData.

Converts execution trace data into interactive network visualizations showing agent-to-agent interactions and tool usage patterns.

Classes

Functions

build_interaction_graph(trace_data)

Build NetworkX directed graph from execution trace data.

Creates a visual representation of agent interactions and tool usage: - Agent nodes (blue circles in visualization) - Tool nodes (green squares in visualization) - Edges representing delegations and tool calls

Parameters:

Name Type Description Default
trace_data GraphTraceData

GraphTraceData containing agent interactions and tool calls

required

Returns:

Type Description
DiGraph[str]

NetworkX DiGraph with nodes and edges representing the execution flow

Source code in src/app/judge/graph_builder.py
def build_interaction_graph(trace_data: GraphTraceData) -> nx.DiGraph[str]:
    """Build NetworkX directed graph from execution trace data.

    Creates a visual representation of agent interactions and tool usage:
    - Agent nodes (blue circles in visualization)
    - Tool nodes (green squares in visualization)
    - Edges representing delegations and tool calls

    Args:
        trace_data: GraphTraceData containing agent interactions and tool calls

    Returns:
        NetworkX DiGraph with nodes and edges representing the execution flow
    """
    graph = nx.DiGraph()

    # Agent-to-agent interactions first.
    for interaction in trace_data.agent_interactions:
        # Support both key spellings produced by different trace sources.
        source = interaction.get("from", interaction.get("source_agent", "unknown"))
        target = interaction.get("to", interaction.get("target_agent", "unknown"))
        kind = interaction.get(
            "type", interaction.get("interaction_type", "communication")
        )

        for agent in (source, target):
            if agent not in graph:
                graph.add_node(agent, type="agent", label=agent.capitalize())

        graph.add_edge(source, target, interaction=kind)

    # Then tool usage edges from each calling agent to the tool node.
    for tool_call in trace_data.tool_calls:
        agent_id = tool_call.get("agent_id", "unknown")
        tool_name = tool_call.get("tool_name", "unknown_tool")

        if agent_id not in graph:
            graph.add_node(agent_id, type="agent", label=agent_id.capitalize())
        if tool_name not in graph:
            graph.add_node(tool_name, type="tool", label=tool_name.replace("_", " ").title())

        # Edge carries success so the renderer can style failed calls.
        graph.add_edge(
            agent_id,
            tool_name,
            interaction="tool_call",
            success=tool_call.get("success", False),
        )

    logger.debug(
        f"Built interaction graph: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges"
    )

    return graph

app.judge.graph_export

Export nx.DiGraph as JSON (node-link format) and PNG (static render).

Persists the agent interaction graph built after each run to the per-run output directory. Both functions register their output with the ArtifactRegistry for end-of-run summary display.

Functions

export_graph_json(graph, output_dir)

Serialize an nx.DiGraph to agent_graph.json using node-link format.

Parameters:

Name Type Description Default
graph DiGraph[str]

NetworkX directed graph to export.

required
output_dir Path

Directory to write the JSON file into.

required

Returns:

Type Description
Path

Path to the written agent_graph.json file.

Source code in src/app/judge/graph_export.py
def export_graph_json(graph: nx.DiGraph[str], output_dir: Path) -> Path:
    """Serialize an nx.DiGraph to agent_graph.json using node-link format.

    Args:
        graph: NetworkX directed graph to export.
        output_dir: Directory to write the JSON file into.

    Returns:
        Path to the written agent_graph.json file.
    """
    out_path = output_dir / "agent_graph.json"
    data = nx.node_link_data(graph)
    # Reason: ensure_ascii=False keeps non-ASCII node labels human-readable;
    # the file is written as UTF-8 so no information is lost by skipping \uXXXX escapes.
    out_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
    get_artifact_registry().register("Agent Graph (JSON)", out_path)
    logger.info(f"Agent graph JSON written to {out_path}")
    return out_path

export_graph_png(graph, output_dir)

Render an nx.DiGraph to agent_graph.png as a static image.

Agent nodes are drawn as circles (#4e79a7 blue), tool nodes as squares (#59a14f green). Layout uses spring_layout with a fixed seed for reproducibility.

Parameters:

Name Type Description Default
graph DiGraph[str]

NetworkX directed graph to render.

required
output_dir Path

Directory to write the PNG file into.

required

Returns:

Type Description
Path

Path to the written agent_graph.png file.

Source code in src/app/judge/graph_export.py
def export_graph_png(graph: nx.DiGraph[str], output_dir: Path) -> Path:
    """Render an nx.DiGraph to agent_graph.png as a static image.

    Agent nodes are drawn as circles (#4e79a7 blue), tool nodes as squares
    (#59a14f green). Layout uses spring_layout with a fixed seed for
    reproducibility.

    Args:
        graph: NetworkX directed graph to render.
        output_dir: Directory to write the PNG file into.

    Returns:
        Path to the written agent_graph.png file.
    """
    # Reason: must set before importing matplotlib to avoid writable-dir warning in containers
    os.environ.setdefault("MPLCONFIGDIR", str(Path.home() / ".config" / "matplotlib"))
    import matplotlib

    # Agg backend renders off-screen; no display server is required.
    # Must be selected before pyplot is imported.
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    out_path = output_dir / "agent_graph.png"

    fig, ax = plt.subplots(figsize=(10, 8))

    if graph.number_of_nodes() == 0:
        # Empty graph: emit a labelled placeholder image rather than failing.
        ax.set_title("Agent Interaction Graph (empty)")
        ax.text(0.5, 0.5, "No agents or tools", ha="center", va="center", fontsize=14)
        ax.set_axis_off()
    else:
        # Fixed seed keeps the layout stable across runs for comparable output.
        pos = nx.spring_layout(graph, seed=42)

        # Partition nodes by the "type" attribute assigned at graph-build time.
        agent_nodes = [n for n, d in graph.nodes(data=True) if d.get("type") == "agent"]
        tool_nodes = [n for n, d in graph.nodes(data=True) if d.get("type") == "tool"]

        # Draw agent nodes (circles)
        if agent_nodes:
            nx.draw_networkx_nodes(
                graph,
                pos,
                nodelist=agent_nodes,
                node_color="#4e79a7",
                node_shape="o",
                node_size=600,
                ax=ax,
            )

        # Draw tool nodes (squares)
        if tool_nodes:
            nx.draw_networkx_nodes(
                graph,
                pos,
                nodelist=tool_nodes,
                node_color="#59a14f",
                node_shape="s",
                node_size=400,
                ax=ax,
            )

        nx.draw_networkx_edges(graph, pos, ax=ax, arrows=True)

        # Prefer the human-friendly "label" attribute; fall back to the node id.
        labels = {n: d.get("label", n) for n, d in graph.nodes(data=True)}
        nx.draw_networkx_labels(graph, pos, labels=labels, font_size=8, ax=ax)

        ax.set_title("Agent Interaction Graph")

    fig.savefig(out_path, format="png", dpi=100, bbox_inches="tight")
    # Close explicitly so repeated exports don't accumulate open figures.
    plt.close(fig)

    get_artifact_registry().register("Agent Graph (PNG)", out_path)
    logger.info(f"Agent graph PNG written to {out_path}")
    return out_path

persist_graph(graph, output_dir)

Export graph as JSON and PNG if graph is available.

No-op when graph is None. Convenience wrapper used by app.main() to avoid adding branching complexity.

Parameters:

Name Type Description Default
graph DiGraph[str] | None

NetworkX directed graph, or None if unavailable.

required
output_dir Path

Per-run output directory.

required
Source code in src/app/judge/graph_export.py
def persist_graph(graph: nx.DiGraph[str] | None, output_dir: Path) -> None:
    """Export graph as JSON and PNG if graph is available.

    No-op when graph is None. Convenience wrapper used by app.main()
    to avoid adding branching complexity.

    Args:
        graph: NetworkX directed graph, or None if unavailable.
        output_dir: Per-run output directory.
    """
    if graph is None:
        logger.debug("No graph available, skipping export")
        return

    # JSON first, then the static PNG render.
    for exporter in (export_graph_json, export_graph_png):
        exporter(graph, output_dir)

app.judge.llm_evaluation_managers

LLM evaluation management and orchestration.

This module provides managers for orchestrating LLM-based evaluations, handling provider selection, fallback mechanisms, and cost optimization for evaluation tasks.

Classes

LLMJudgeEngine

Manager for LLM-based evaluation with provider flexibility and fallbacks.

Source code in src/app/judge/llm_evaluation_managers.py
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
class LLMJudgeEngine:
    """Manager for LLM-based evaluation with provider flexibility and fallbacks.

    Runs three LLM-judge assessments (technical accuracy, constructiveness,
    planning rationality) concurrently, combines them into a weighted overall
    score, and degrades gracefully: auth failures (401/unauthorized) yield a
    neutral 0.5 score, while other errors fall back to heuristic or
    traditional-metric scoring capped at 0.5 per STORY-001.
    """

    def __init__(
        self,
        settings: JudgeSettings,
        env_config: AppEnv | None = None,
        chat_provider: str | None = None,
        chat_model: str | None = None,
    ) -> None:
        """Initialize evaluation LLM manager with settings.

        Args:
            settings: JudgeSettings instance with tier2 configuration.
            env_config: Application environment configuration. If None, creates default AppEnv().
            chat_provider: Active chat provider from agent system. Used when tier2_provider='auto'.
            chat_model: Active chat model from agent system. Inherited when tier2_provider='auto'
                and provider resolves to chat_provider (not fallen back to another provider).
        """
        self.settings = settings
        self.fallback_engine = TraditionalMetricsEngine()

        # Get environment configuration
        if env_config is None:
            env_config = AppEnv()
        self.env_config = env_config

        # Resolve provider using auto mode if configured
        resolved_provider = settings.tier2_provider
        if resolved_provider == "auto" and chat_provider:
            resolved_provider = chat_provider
            if resolved_provider != "openai":
                logger.info(f"Judge provider: auto \u2192 {resolved_provider}")

        # Provider and model settings (before selection)
        self.provider = resolved_provider
        self.model = settings.tier2_model
        self.fallback_provider = settings.tier2_fallback_provider
        self.fallback_model = settings.tier2_fallback_model

        # Call select_available_provider to validate and fallback if needed
        self._api_key: str | None = None
        selected = self.select_available_provider(env_config)
        if selected:
            self.provider, self.model, self._api_key = selected
            self.model = self._resolve_model(chat_model, resolved_provider, settings.tier2_provider)
            logger.info(f"Judge model resolved: {self.provider}/{self.model}")
            self.tier2_available = True
        else:
            # No providers available - mark Tier 2 as unavailable
            self.tier2_available = False
            logger.warning("Tier 2 evaluation will be skipped (no valid providers)")

        # Performance settings
        self.timeout = settings.tier2_timeout_seconds
        self.max_retries = settings.tier2_max_retries
        self.paper_excerpt_length = settings.tier2_paper_excerpt_length
        self.cost_budget = settings.tier2_cost_budget_usd

        # Evaluation weights (must sum to 1.0; used by _calculate_overall_score)
        self.weights = {
            "technical_accuracy": 0.4,
            "constructiveness": 0.3,
            "planning_rationality": 0.3,
        }

        # Track auth failures for fallback_used flag
        self._auth_failure_count = 0

    def _resolve_model(
        self, chat_model: str | None, resolved_provider: str, configured_provider: str
    ) -> str:
        """Resolve the correct model after provider selection.

        Args:
            chat_model: Explicit chat model from agent system, or None.
            resolved_provider: Provider after auto-resolution (before fallback).
            configured_provider: Original tier2_provider from settings.

        Returns:
            Model name to use for evaluation.
        """
        # Explicit chat_model wins when provider didn't fall back
        if chat_model is not None and self.provider == resolved_provider:
            return chat_model
        # Auto-resolved provider with no chat_model — use registry default
        if (
            chat_model is None
            and self.provider == resolved_provider
            and resolved_provider != configured_provider
        ):
            from app.data_models.app_models import PROVIDER_REGISTRY

            registry_entry = PROVIDER_REGISTRY.get(self.provider)
            if registry_entry and registry_entry.default_model:
                return registry_entry.default_model
        return self.model

    def _resolve_provider_key(self, provider: str, env_config: AppEnv) -> tuple[bool, str | None]:
        """Resolve API key for a provider.

        Args:
            provider: Provider name to resolve
            env_config: Application environment configuration

        Returns:
            Tuple of (is_valid, api_key). Key string on success, None on failure.
        """
        is_valid, key_or_message = get_api_key(provider, env_config)
        if not is_valid:
            # key_or_message carries the validation error text in this case
            logger.debug(f"API key validation failed for {provider}: {key_or_message}")
            return (False, None)
        return (True, key_or_message)

    def select_available_provider(self, env_config: AppEnv) -> tuple[str, str, str | None] | None:
        """Select available provider with fallback chain.

        Tries the primary provider first, then the configured fallback.

        Args:
            env_config: Application environment configuration

        Returns:
            Tuple of (provider, model, api_key) if available, None if no providers available.
        """
        # Try primary provider first
        is_valid, api_key = self._resolve_provider_key(self.provider, env_config)
        if is_valid:
            logger.info(f"Using primary provider: {self.provider}/{self.model}")
            return (self.provider, self.model, api_key)

        # Try fallback provider
        is_valid, api_key = self._resolve_provider_key(self.fallback_provider, env_config)
        if is_valid:
            logger.info(
                f"Primary provider unavailable, using fallback: "
                f"{self.fallback_provider}/{self.fallback_model}"
            )
            return (self.fallback_provider, self.fallback_model, api_key)

        # No providers available
        logger.warning(
            f"Neither primary ({self.provider}) nor fallback ({self.fallback_provider}) "
            f"providers have valid API keys. Tier 2 will be skipped."
        )
        return None

    async def create_judge_agent(self, assessment_type: str, use_fallback: bool = False) -> Agent:
        """
        Create an LLM judge agent for specific assessment type.

        Args:
            assessment_type: Type of assessment ("technical_accuracy", etc.)
            use_fallback: Whether to use fallback provider

        Returns:
            Configured Agent for evaluation
        """
        if use_fallback:
            provider = self.fallback_provider
            model = self.fallback_model
            logger.info(f"Using fallback provider: {provider}/{model}")
        else:
            provider = self.provider
            model = self.model

        return create_evaluation_agent(
            provider=provider,
            model_name=model,
            assessment_type=assessment_type,
            api_key=self._api_key,
        )

    @staticmethod
    def _is_auth_failure(exc: BaseException) -> bool:
        """Return True when *exc* looks like an HTTP 401 / unauthorized error.

        Classification is string-based because provider SDK exception types
        vary; the raw message is lowercased and scanned for the markers.
        """
        error_msg = str(exc).lower()
        return "401" in error_msg or "unauthorized" in error_msg

    def _fallback_score_for_error(self, exc: Exception, fallback) -> float:
        """Map an assessment exception to a fallback score (shared by all assess_* methods).

        Auth failures (401/unauthorized) return the neutral 0.5 score and
        increment the auth-failure counter (surfaced later as fallback_used);
        every other error defers to *fallback*, a zero-argument callable
        producing the heuristic score.

        Args:
            exc: Exception raised by the LLM assessment.
            fallback: Zero-argument callable computing the non-auth fallback score.

        Returns:
            Neutral 0.5 for auth failures, otherwise ``fallback()``.
        """
        if self._is_auth_failure(exc):
            # Auth failures get neutral score (0.5) - provider unavailable
            logger.warning("Auth failure detected - using neutral fallback score")
            self._auth_failure_count += 1
            return 0.5
        return fallback()

    async def assess_technical_accuracy(self, paper: str, review: str) -> float:
        """Assess technical accuracy of review against paper.

        Returns:
            Score in [0, 1]; falls back to semantic similarity on non-auth errors.
        """
        try:
            # Truncate paper content for cost efficiency (slicing is a no-op
            # when the paper is already within the excerpt length)
            paper_excerpt = paper[: self.paper_excerpt_length]

            # Sanitize user-controlled content with XML delimiters
            sanitized_paper = sanitize_for_prompt(
                paper_excerpt, max_length=self.paper_excerpt_length, delimiter="paper_excerpt"
            )
            sanitized_review = sanitize_review_text(review)

            prompt = f"""Evaluate technical accuracy of this review (1-5 scale):

Paper Excerpt: {sanitized_paper}

Review: {sanitized_review}

Rate each aspect (1=poor, 5=excellent):
1. Factual Correctness: Are claims supported by the paper?
2. Methodology Understanding: Does reviewer grasp the approach?
3. Domain Knowledge: Appropriate technical terminology?

Provide scores and brief explanation."""

            agent = await self.create_judge_agent("technical_accuracy")
            result = await asyncio.wait_for(
                agent.run(prompt, output_type=TechnicalAccuracyAssessment),
                timeout=self.timeout,
            )

            # Calculate weighted score and normalize to 0-1
            weighted_score = (
                result.output.factual_correctness * 0.5
                + result.output.methodology_understanding * 0.3
                + result.output.domain_knowledge * 0.2
            ) / 5.0

            return min(1.0, max(0.0, weighted_score))

        except Exception as e:
            logger.warning(f"Technical accuracy assessment failed: {e}")
            # Auth failures (401) get neutral 0.5 per STORY-001; timeouts and
            # other errors use the semantic similarity fallback
            return self._fallback_score_for_error(
                e, lambda: self.fallback_engine.compute_semantic_similarity(paper, review)
            )

    async def assess_constructiveness(self, review: str) -> float:
        """Assess constructiveness and helpfulness of review.

        Returns:
            Score in [0, 1]; falls back to a phrase heuristic on non-auth errors.
        """
        try:
            # Sanitize user-controlled content with XML delimiters
            sanitized_review = sanitize_review_text(review)

            prompt = f"""Evaluate constructiveness of this review (1-5 scale):

Review: {sanitized_review}

Rate each aspect (1=poor, 5=excellent):
1. Actionable Feedback: Specific, implementable suggestions?
2. Balanced Critique: Both strengths and weaknesses noted?
3. Improvement Guidance: Clear direction for authors?

Provide scores and brief explanation."""

            agent = await self.create_judge_agent("constructiveness")
            result = await asyncio.wait_for(
                agent.run(prompt, output_type=ConstructivenessAssessment),
                timeout=self.timeout,
            )

            # Equal weighting for constructiveness aspects
            average_score = (
                result.output.actionable_feedback
                + result.output.balanced_critique
                + result.output.improvement_guidance
            ) / 15.0  # Normalize to 0-1

            return min(1.0, max(0.0, average_score))

        except Exception as e:
            logger.warning(f"Constructiveness assessment failed: {e}")
            # Auth failures (401) get neutral 0.5; other errors use heuristic fallback
            return self._fallback_score_for_error(
                e, lambda: self._fallback_constructiveness_check(review)
            )

    async def assess_planning_rationality(self, execution_trace: dict[str, Any]) -> float:
        """Assess quality of agent planning and decision-making.

        Returns:
            Score in [0, 1]; falls back to an activity heuristic on non-auth errors.
        """
        try:
            # Extract planning summary from trace
            planning_summary = self._extract_planning_decisions(execution_trace)

            prompt = f"""Evaluate planning rationality of this execution (1-5 scale):

Execution Summary: {planning_summary}

Rate each aspect (1=poor, 5=excellent):
1. Logical Flow: Coherent step progression?
2. Decision Quality: Appropriate choices made?
3. Resource Efficiency: Optimal tool/agent usage?

Provide scores and brief explanation."""

            agent = await self.create_judge_agent("planning_rationality")
            result = await asyncio.wait_for(
                agent.run(prompt, output_type=PlanningRationalityAssessment),
                timeout=self.timeout,
            )

            # Weight decision quality most heavily
            weighted_score = (
                result.output.logical_flow * 0.3
                + result.output.decision_quality * 0.5
                + result.output.resource_efficiency * 0.2
            ) / 5.0

            return min(1.0, max(0.0, weighted_score))

        except Exception as e:
            logger.warning(f"Planning rationality assessment failed: {e}")
            # Auth failures (401) get neutral 0.5; other errors use heuristic fallback
            return self._fallback_score_for_error(
                e, lambda: self._fallback_planning_check(execution_trace)
            )

    def _handle_assessment_failures(
        self,
        technical_score: float | BaseException,
        constructiveness_score: float | BaseException,
        planning_score: float | BaseException,
        paper: str,
        review: str,
        execution_trace: dict[str, Any],
    ) -> tuple[float, float, float, bool]:
        """Handle individual assessment failures with fallbacks.

        Called after ``asyncio.gather(return_exceptions=True)`` so each score may
        be either the float result or an exception instance. Replaces any exception
        with its corresponding fallback value and sets ``fallback_used``.

        Args:
            technical_score: Technical accuracy score or exception from gather.
            constructiveness_score: Constructiveness score or exception from gather.
            planning_score: Planning rationality score or exception from gather.
            paper: Original paper text used for semantic similarity fallback.
            review: Review text used for constructiveness fallback.
            execution_trace: Execution trace dict used for planning fallback.

        Returns:
            Tuple of (technical_float, constructiveness_float, planning_float,
            fallback_used) where fallback_used is True if any score was replaced.
        """
        fallback_used = False

        if isinstance(technical_score, BaseException):
            logger.warning(f"Technical assessment failed: {technical_score}")
            technical_score = float(self.fallback_engine.compute_semantic_similarity(paper, review))
            fallback_used = True

        if isinstance(constructiveness_score, BaseException):
            logger.warning(f"Constructiveness assessment failed: {constructiveness_score}")
            constructiveness_score = float(self._fallback_constructiveness_check(review))
            fallback_used = True

        if isinstance(planning_score, BaseException):
            logger.warning(f"Planning assessment failed: {planning_score}")
            planning_score = float(self._fallback_planning_check(execution_trace))
            fallback_used = True

        return (
            float(technical_score),
            float(constructiveness_score),
            float(planning_score),
            fallback_used,
        )

    def _calculate_overall_score(
        self, technical_score: float, constructiveness_score: float, planning_score: float
    ) -> float:
        """Calculate weighted overall score from assessment scores."""
        return (
            technical_score * self.weights.get("technical_accuracy", 0.4)
            + constructiveness_score * self.weights.get("constructiveness", 0.3)
            + planning_score * self.weights.get("planning_rationality", 0.3)
        )

    async def evaluate_comprehensive(
        self, paper: str, review: str, execution_trace: dict[str, Any]
    ) -> Tier2Result:
        """Run comprehensive LLM-based evaluation.

        Runs all three assessments concurrently, applies per-assessment
        fallbacks, and combines them via the configured weights. On a total
        failure, returns a fully heuristic :class:`Tier2Result`.
        """
        try:
            # Reset auth failure counter for this evaluation
            self._auth_failure_count = 0

            # Run assessments concurrently for efficiency
            technical_task = self.assess_technical_accuracy(paper, review)
            constructiveness_task = self.assess_constructiveness(review)
            planning_task = self.assess_planning_rationality(execution_trace)

            (
                technical_score,
                constructiveness_score,
                planning_score,
            ) = await asyncio.gather(
                technical_task,
                constructiveness_task,
                planning_task,
                return_exceptions=True,
            )

            # Handle individual assessment failures
            (
                technical_score_float,
                constructiveness_score_float,
                planning_score_float,
                fallback_used,
            ) = self._handle_assessment_failures(
                technical_score,
                constructiveness_score,
                planning_score,
                paper,
                review,
                execution_trace,
            )

            # Check if any auth failures occurred (tracked in assess_* methods)
            if self._auth_failure_count > 0:
                fallback_used = True

            # Estimate API cost (approximate: ~4 chars/token plus prompt overhead)
            total_tokens = len(paper) / 4 + len(review) / 4 + 500
            api_cost = (total_tokens / 1000) * 0.0001

            # Calculate overall score
            overall_score = self._calculate_overall_score(
                technical_score_float, constructiveness_score_float, planning_score_float
            )

            return Tier2Result(
                technical_accuracy=technical_score_float,
                constructiveness=constructiveness_score_float,
                planning_rationality=planning_score_float,
                overall_score=overall_score,
                model_used=f"{self.provider}/{self.model}",
                api_cost=api_cost,
                fallback_used=fallback_used,
            )

        except Exception as e:
            logger.error(f"Complete LLM judge evaluation failed: {e}")
            return self._complete_fallback(paper, review, execution_trace)

    def _extract_planning_decisions(self, execution_trace: dict[str, Any]) -> str:
        """Extract key planning decisions from execution trace.

        Args:
            execution_trace: Dictionary with ``agent_interactions`` and ``tool_calls`` keys.

        Returns:
            str: Summary string truncated to 500 chars, or stub on parse failure.
        """
        try:
            decisions = execution_trace.get("agent_interactions", [])
            tool_calls = execution_trace.get("tool_calls", [])

            summary = f"Agents involved: {len(decisions)} interactions, "
            summary += f"Tools used: {len(tool_calls)} calls"

            # Extract key decision points
            if decisions:
                decision_types = [d.get("type", "unknown") for d in decisions[:5]]
                summary += f", Decision types: {', '.join(set(decision_types))}"

            return summary[:500]  # Limit length for API efficiency

        except (AttributeError, KeyError, TypeError) as e:
            logger.debug(f"_extract_planning_decisions failed: {e}", exc_info=True)
            return "Limited trace data available"

    def _fallback_constructiveness_check(self, review: str) -> float:
        """Simple fallback for constructiveness assessment.

        Scores by the fraction of constructive phrases present in the review.

        Returns:
            Fallback score capped at 0.5 (neutral) per STORY-001 acceptance criteria
        """
        constructive_phrases = [
            "suggest",
            "recommend",
            "could improve",
            "might consider",
            "strength",
            "weakness",
            "clear",
            "unclear",
            "future work",
            "however",
            "although",
            "while",
            "despite",
            "potential",
        ]

        review_lower = review.lower()
        matches = sum(1 for phrase in constructive_phrases if phrase in review_lower)

        # Cap fallback scores at 0.5 (neutral) per STORY-001
        raw_score = matches / len(constructive_phrases)
        return min(0.5, raw_score)

    def _fallback_planning_check(self, execution_trace: dict[str, Any]) -> float:
        """Simple fallback for planning rationality.

        Heuristic on total trace activity: too little activity scores low,
        a moderate amount scores neutral, excessive activity decays.

        Returns:
            Fallback score capped at 0.5 (neutral) per STORY-001 acceptance criteria
        """
        try:
            interactions = len(execution_trace.get("agent_interactions", []))
            tool_calls = len(execution_trace.get("tool_calls", []))

            # Simple heuristic: moderate activity indicates good planning
            total_activity = interactions + tool_calls

            if total_activity <= 2:
                activity_score = total_activity / 4.0  # Cap at 0.5 for 2 activities
            elif total_activity <= 10:
                activity_score = 0.5  # Optimal range capped at neutral
            else:
                activity_score = max(0.0, 0.5 - (total_activity - 10) * 0.05)

            # Cap fallback scores at 0.5 (neutral) per STORY-001
            return min(0.5, max(0.0, activity_score))

        except Exception:
            return 0.5  # Neutral score when trace unavailable

    def _complete_fallback(
        self, paper: str, review: str, execution_trace: dict[str, Any]
    ) -> Tier2Result:
        """Complete fallback when all LLM assessments fail."""
        # Use traditional metrics as fallback
        semantic_score = self.fallback_engine.compute_semantic_similarity(paper, review)
        constructiveness_score = self._fallback_constructiveness_check(review)
        planning_score = self._fallback_planning_check(execution_trace)

        overall_score = (semantic_score + constructiveness_score + planning_score) / 3.0

        return Tier2Result(
            technical_accuracy=semantic_score,
            constructiveness=constructiveness_score,
            planning_rationality=planning_score,
            overall_score=overall_score,
            model_used="fallback_traditional",
            api_cost=0.0,
            fallback_used=True,
        )
Functions
__init__(settings, env_config=None, chat_provider=None, chat_model=None)

Initialize evaluation LLM manager with settings.

Parameters:

Name Type Description Default
settings JudgeSettings

JudgeSettings instance with tier2 configuration.

required
env_config AppEnv | None

Application environment configuration. If None, creates default AppEnv().

None
chat_provider str | None

Active chat provider from agent system. Used when tier2_provider='auto'.

None
chat_model str | None

Active chat model from agent system. Inherited when tier2_provider='auto' and provider resolves to chat_provider (not fallen back to another provider).

None
Source code in src/app/judge/llm_evaluation_managers.py
def __init__(
    self,
    settings: JudgeSettings,
    env_config: AppEnv | None = None,
    chat_provider: str | None = None,
    chat_model: str | None = None,
) -> None:
    """Initialize evaluation LLM manager with settings.

    Args:
        settings: JudgeSettings instance with tier2 configuration.
        env_config: Application environment configuration. If None, creates default AppEnv().
        chat_provider: Active chat provider from agent system. Used when tier2_provider='auto'.
        chat_model: Active chat model from agent system. Inherited when tier2_provider='auto'
            and provider resolves to chat_provider (not fallen back to another provider).
    """
    self.settings = settings
    self.fallback_engine = TraditionalMetricsEngine()

    # Get environment configuration
    if env_config is None:
        env_config = AppEnv()
    self.env_config = env_config

    # Resolve provider using auto mode if configured
    resolved_provider = settings.tier2_provider
    if resolved_provider == "auto" and chat_provider:
        resolved_provider = chat_provider
        if resolved_provider != "openai":
            logger.info(f"Judge provider: auto \u2192 {resolved_provider}")

    # Provider and model settings (before selection)
    self.provider = resolved_provider
    self.model = settings.tier2_model
    self.fallback_provider = settings.tier2_fallback_provider
    self.fallback_model = settings.tier2_fallback_model

    # Call select_available_provider to validate and fallback if needed
    self._api_key: str | None = None
    selected = self.select_available_provider(env_config)
    if selected:
        self.provider, self.model, self._api_key = selected
        self.model = self._resolve_model(chat_model, resolved_provider, settings.tier2_provider)
        logger.info(f"Judge model resolved: {self.provider}/{self.model}")
        self.tier2_available = True
    else:
        # No providers available - mark Tier 2 as unavailable
        self.tier2_available = False
        logger.warning("Tier 2 evaluation will be skipped (no valid providers)")

    # Performance settings
    self.timeout = settings.tier2_timeout_seconds
    self.max_retries = settings.tier2_max_retries
    self.paper_excerpt_length = settings.tier2_paper_excerpt_length
    self.cost_budget = settings.tier2_cost_budget_usd

    # Evaluation weights
    self.weights = {
        "technical_accuracy": 0.4,
        "constructiveness": 0.3,
        "planning_rationality": 0.3,
    }

    # Track auth failures for fallback_used flag
    self._auth_failure_count = 0
assess_constructiveness(review) async

Assess constructiveness and helpfulness of review.

Source code in src/app/judge/llm_evaluation_managers.py
    async def assess_constructiveness(self, review: str) -> float:
        """Assess constructiveness and helpfulness of review."""
        try:
            # Sanitize user-controlled content with XML delimiters
            sanitized_review = sanitize_review_text(review)

            prompt = f"""Evaluate constructiveness of this review (1-5 scale):

Review: {sanitized_review}

Rate each aspect (1=poor, 5=excellent):
1. Actionable Feedback: Specific, implementable suggestions?
2. Balanced Critique: Both strengths and weaknesses noted?
3. Improvement Guidance: Clear direction for authors?

Provide scores and brief explanation."""

            agent = await self.create_judge_agent("constructiveness")
            result = await asyncio.wait_for(
                agent.run(prompt, output_type=ConstructivenessAssessment),
                timeout=self.timeout,
            )

            # Equal weighting for constructiveness aspects
            average_score = (
                result.output.actionable_feedback
                + result.output.balanced_critique
                + result.output.improvement_guidance
            ) / 15.0  # Normalize to 0-1

            return min(1.0, max(0.0, average_score))

        except Exception as e:
            logger.warning(f"Constructiveness assessment failed: {e}")
            # Distinguish auth failures (401) from other errors
            error_msg = str(e).lower()
            is_auth_failure = "401" in error_msg or "unauthorized" in error_msg

            if is_auth_failure:
                # Auth failures get neutral score (0.5) - provider unavailable
                logger.warning("Auth failure detected - using neutral fallback score")
                self._auth_failure_count += 1
                return 0.5
            else:
                # Other errors use heuristic fallback
                return self._fallback_constructiveness_check(review)
assess_planning_rationality(execution_trace) async

Assess quality of agent planning and decision-making.

Source code in src/app/judge/llm_evaluation_managers.py
    async def assess_planning_rationality(self, execution_trace: dict[str, Any]) -> float:
        """Assess quality of agent planning and decision-making."""
        try:
            # Extract planning summary from trace
            planning_summary = self._extract_planning_decisions(execution_trace)

            prompt = f"""Evaluate planning rationality of this execution (1-5 scale):

Execution Summary: {planning_summary}

Rate each aspect (1=poor, 5=excellent):
1. Logical Flow: Coherent step progression?
2. Decision Quality: Appropriate choices made?
3. Resource Efficiency: Optimal tool/agent usage?

Provide scores and brief explanation."""

            agent = await self.create_judge_agent("planning_rationality")
            result = await asyncio.wait_for(
                agent.run(prompt, output_type=PlanningRationalityAssessment),
                timeout=self.timeout,
            )

            # Weight decision quality most heavily
            weighted_score = (
                result.output.logical_flow * 0.3
                + result.output.decision_quality * 0.5
                + result.output.resource_efficiency * 0.2
            ) / 5.0

            return min(1.0, max(0.0, weighted_score))

        except Exception as e:
            logger.warning(f"Planning rationality assessment failed: {e}")
            # Distinguish auth failures (401) from other errors
            error_msg = str(e).lower()
            is_auth_failure = "401" in error_msg or "unauthorized" in error_msg

            if is_auth_failure:
                # Auth failures get neutral score (0.5) - provider unavailable
                logger.warning("Auth failure detected - using neutral fallback score")
                self._auth_failure_count += 1
                return 0.5
            else:
                # Other errors use heuristic fallback
                return self._fallback_planning_check(execution_trace)
assess_technical_accuracy(paper, review) async

Assess technical accuracy of review against paper.

Source code in src/app/judge/llm_evaluation_managers.py
    async def assess_technical_accuracy(self, paper: str, review: str) -> float:
        """Assess technical accuracy of review against paper."""
        try:
            # Truncate paper content for cost efficiency
            paper_excerpt = (
                paper[: self.paper_excerpt_length]
                if len(paper) > self.paper_excerpt_length
                else paper
            )

            # Sanitize user-controlled content with XML delimiters
            sanitized_paper = sanitize_for_prompt(
                paper_excerpt, max_length=self.paper_excerpt_length, delimiter="paper_excerpt"
            )
            sanitized_review = sanitize_review_text(review)

            prompt = f"""Evaluate technical accuracy of this review (1-5 scale):

Paper Excerpt: {sanitized_paper}

Review: {sanitized_review}

Rate each aspect (1=poor, 5=excellent):
1. Factual Correctness: Are claims supported by the paper?
2. Methodology Understanding: Does reviewer grasp the approach?
3. Domain Knowledge: Appropriate technical terminology?

Provide scores and brief explanation."""

            agent = await self.create_judge_agent("technical_accuracy")
            result = await asyncio.wait_for(
                agent.run(prompt, output_type=TechnicalAccuracyAssessment),
                timeout=self.timeout,
            )

            # Calculate weighted score and normalize to 0-1
            weighted_score = (
                result.output.factual_correctness * 0.5
                + result.output.methodology_understanding * 0.3
                + result.output.domain_knowledge * 0.2
            ) / 5.0

            return min(1.0, max(0.0, weighted_score))

        except Exception as e:
            logger.warning(f"Technical accuracy assessment failed: {e}")
            # Distinguish auth failures (401) from timeouts per STORY-001
            error_msg = str(e).lower()
            is_auth_failure = "401" in error_msg or "unauthorized" in error_msg

            if is_auth_failure:
                # Auth failures get neutral score (0.5) - provider unavailable
                logger.warning("Auth failure detected - using neutral fallback score")
                self._auth_failure_count += 1
                return 0.5
            else:
                # Timeouts and other errors use semantic similarity fallback
                return self.fallback_engine.compute_semantic_similarity(paper, review)
create_judge_agent(assessment_type, use_fallback=False) async

Create an LLM judge agent for specific assessment type.

Parameters:

Name Type Description Default
assessment_type str

Type of assessment ("technical_accuracy", etc.)

required
use_fallback bool

Whether to use fallback provider

False

Returns:

Type Description
Agent

Configured Agent for evaluation

Source code in src/app/judge/llm_evaluation_managers.py
async def create_judge_agent(self, assessment_type: str, use_fallback: bool = False) -> Agent:
    """Build an LLM judge agent for a specific assessment type.

    Args:
        assessment_type: Type of assessment ("technical_accuracy", etc.)
        use_fallback: Whether to use fallback provider

    Returns:
        Configured Agent for evaluation
    """
    # Pick the provider/model pair up front; only the fallback path is logged.
    if use_fallback:
        provider, model = self.fallback_provider, self.fallback_model
        logger.info(f"Using fallback provider: {provider}/{model}")
    else:
        provider, model = self.provider, self.model

    return create_evaluation_agent(
        provider=provider,
        model_name=model,
        assessment_type=assessment_type,
        api_key=self._api_key,
    )
evaluate_comprehensive(paper, review, execution_trace) async

Run comprehensive LLM-based evaluation.

Source code in src/app/judge/llm_evaluation_managers.py
async def evaluate_comprehensive(
    self, paper: str, review: str, execution_trace: dict[str, Any]
) -> Tier2Result:
    """Run comprehensive LLM-based evaluation.

    Runs the three assessments (technical accuracy, constructiveness,
    planning rationality) concurrently, converts per-assessment failures
    into fallback scores, and aggregates everything into a Tier2Result.

    Args:
        paper: Text of the paper under review.
        review: Review text being evaluated.
        execution_trace: Agent execution trace used by the planning
            rationality assessment.

    Returns:
        Tier2Result with per-dimension scores, overall score, an
        approximate API cost estimate, and a fallback_used flag.
    """
    try:
        # Reset auth failure counter for this evaluation; the assess_*
        # methods increment it when they hit 401/unauthorized errors.
        self._auth_failure_count = 0

        # Run assessments concurrently for efficiency
        technical_task = self.assess_technical_accuracy(paper, review)
        constructiveness_task = self.assess_constructiveness(review)
        planning_task = self.assess_planning_rationality(execution_trace)

        # return_exceptions=True makes failed tasks come back as exception
        # objects instead of raising here; they are resolved just below.
        (
            technical_score,
            constructiveness_score,
            planning_score,
        ) = await asyncio.gather(
            technical_task,
            constructiveness_task,
            planning_task,
            return_exceptions=True,
        )

        # Handle individual assessment failures
        (
            technical_score_float,
            constructiveness_score_float,
            planning_score_float,
            fallback_used,
        ) = self._handle_assessment_failures(
            technical_score,
            constructiveness_score,
            planning_score,
            paper,
            review,
            execution_trace,
        )

        # Check if any auth failures occurred (tracked in assess_* methods)
        if self._auth_failure_count > 0:
            fallback_used = True

        # Estimate API cost (approximate): rough chars/4 token heuristic
        # plus a fixed 500-token overhead.
        total_tokens = len(paper) / 4 + len(review) / 4 + 500
        api_cost = (total_tokens / 1000) * 0.0001

        # Calculate overall score
        overall_score = self._calculate_overall_score(
            technical_score_float, constructiveness_score_float, planning_score_float
        )

        return Tier2Result(
            technical_accuracy=technical_score_float,
            constructiveness=constructiveness_score_float,
            planning_rationality=planning_score_float,
            overall_score=overall_score,
            model_used=f"{self.provider}/{self.model}",
            api_cost=api_cost,
            fallback_used=fallback_used,
        )

    except Exception as e:
        # Any unexpected error downgrades the whole tier to the non-LLM
        # fallback path rather than failing the pipeline.
        logger.error(f"Complete LLM judge evaluation failed: {e}")
        return self._complete_fallback(paper, review, execution_trace)
select_available_provider(env_config)

Select available provider with fallback chain.

Parameters:

Name Type Description Default
env_config AppEnv

Application environment configuration

required

Returns:

Type Description
tuple[str, str, str | None] | None

Tuple of (provider, model, api_key) if available, None if no providers available.

Source code in src/app/judge/llm_evaluation_managers.py
def select_available_provider(self, env_config: AppEnv) -> tuple[str, str, str | None] | None:
    """Select an available provider, walking the primary -> fallback chain.

    Args:
        env_config: Application environment configuration

    Returns:
        Tuple of (provider, model, api_key) if available, None if no providers available.
    """
    # Candidate chain: primary first, then fallback.
    for is_primary, provider, model in (
        (True, self.provider, self.model),
        (False, self.fallback_provider, self.fallback_model),
    ):
        is_valid, api_key = self._resolve_provider_key(provider, env_config)
        if not is_valid:
            continue
        if is_primary:
            logger.info(f"Using primary provider: {provider}/{model}")
        else:
            logger.info(
                f"Primary provider unavailable, using fallback: {provider}/{model}"
            )
        return (provider, model, api_key)

    # Neither candidate has a usable key; caller skips Tier 2.
    logger.warning(
        f"Neither primary ({self.provider}) nor fallback ({self.fallback_provider}) "
        f"providers have valid API keys. Tier 2 will be skipped."
    )
    return None

Functions

app.judge.performance_monitor

Performance monitoring and analytics for evaluation pipeline.

Handles execution statistics, bottleneck detection, performance warnings, and failure tracking for the three-tier evaluation system.

Classes

PerformanceMonitor

Performance monitoring and analytics for evaluation pipelines.

Tracks execution times, detects bottlenecks, records failures, and provides performance insights for optimization.

Source code in src/app/judge/performance_monitor.py
class PerformanceMonitor:
    """
    Performance monitoring and analytics for evaluation pipelines.

    Tracks execution times, detects bottlenecks, records failures,
    and provides performance insights for optimization.
    """

    def __init__(self, performance_targets: dict[str, float]):
        """Initialize performance monitor with targets.

        Args:
            performance_targets: Dictionary of performance targets (e.g., tier timeouts)
        """
        # Defensive copy: callers may mutate their dict afterwards.
        self.performance_targets = performance_targets.copy()
        self.execution_stats: dict[str, Any] = self._initialize_stats()

    def _initialize_stats(self) -> dict[str, Any]:
        """Initialize execution statistics structure.

        Returns:
            Dictionary with default statistics structure
        """
        return {
            "tier1_time": 0.0,
            "tier2_time": 0.0,
            "tier3_time": 0.0,
            "total_time": 0.0,
            "tiers_executed": [],
            "fallback_used": False,
            "tier_failures": [],
            "performance_warnings": [],
            "bottlenecks_detected": [],
        }

    def reset_stats(self) -> None:
        """Reset execution statistics for new evaluation."""
        self.execution_stats = self._initialize_stats()

    def record_tier_execution(self, tier: int, duration: float) -> None:
        """Record successful tier execution time.

        Args:
            tier: Tier number (1, 2, or 3)
            duration: Execution duration in seconds
        """
        # Fix: timing stats live under "tier{N}_time" (see _initialize_stats).
        # The previous "tier{N}" key left the real timing entries permanently
        # at 0.0, so _analyze_performance and get_execution_stats never saw
        # the recorded durations.
        tier_key = f"tier{tier}_time"
        self.execution_stats[tier_key] = duration

        if tier not in self.execution_stats["tiers_executed"]:
            self.execution_stats["tiers_executed"].append(tier)

        logger.debug(f"Recorded tier {tier} execution: {duration:.3f}s")

    def record_tier_failure(
        self, tier: int, failure_type: str, execution_time: float, error_msg: str
    ) -> None:
        """Record tier failure details for monitoring and analysis.

        Args:
            tier: Tier number that failed (0 for pipeline-level failures)
            failure_type: Type of failure (timeout, error)
            execution_time: Time spent before failure
            error_msg: Error message
        """
        failure_record = {
            "tier": tier,
            "failure_type": failure_type,
            "execution_time": execution_time,
            "error_msg": error_msg,
            # Timestamp lets downstream tooling correlate failures across runs.
            "timestamp": time.time(),
        }

        self.execution_stats["tier_failures"].append(failure_record)

        logger.debug(f"Recorded tier {tier} failure: {failure_type} after {execution_time:.2f}s")

    def record_fallback_usage(self, fallback_used: bool) -> None:
        """Record whether fallback strategy was used.

        Args:
            fallback_used: Whether fallback strategy was applied
        """
        self.execution_stats["fallback_used"] = fallback_used
        logger.debug(f"Fallback strategy used: {fallback_used}")

    def finalize_execution(self, total_time: float) -> None:
        """Finalize execution statistics and perform analysis.

        Args:
            total_time: Total pipeline execution time
        """
        self.execution_stats["total_time"] = total_time
        self._analyze_performance(total_time)

    def _detect_bottlenecks(
        self, tier_times: dict[str, float], total_time: float
    ) -> list[dict[str, Any]]:
        """Detect performance bottlenecks in tier execution.

        A tier counts as a bottleneck when it consumed more than 40% of
        the total pipeline time.
        """
        if total_time <= 0:
            # Guard: the percentage computation below would divide by zero.
            return []

        bottleneck_threshold = total_time * 0.4
        bottlenecks = []

        for tier, time_taken in tier_times.items():
            if time_taken > bottleneck_threshold and time_taken > 0:
                bottlenecks.append(
                    {
                        "tier": tier,
                        "time": time_taken,
                        "percentage": (time_taken / total_time) * 100,
                    }
                )

        if bottlenecks:
            for bottleneck in bottlenecks:
                logger.warning(
                    f"Performance bottleneck detected: {bottleneck['tier']} took "
                    f"{bottleneck['time']:.2f}s "
                    f"({bottleneck['percentage']:.1f}% of total time)"
                )

        return bottlenecks

    def _check_tier_targets(self, tier_times: dict[str, float]) -> None:
        """Check individual tier performance against configured targets."""
        for tier_num in range(1, 4):
            tier_key = f"tier{tier_num}"
            target_key = f"tier{tier_num}_max_seconds"

            if target_key in self.performance_targets and tier_times[tier_key] > 0:
                target_time = self.performance_targets[target_key]
                actual_time = tier_times[tier_key]

                if actual_time > target_time:
                    warning_msg = (
                        f"Tier {tier_num} exceeded target: {actual_time:.2f}s > {target_time}s"
                    )
                    self._record_performance_warning(
                        f"tier{tier_num}_time_exceeded", warning_msg, actual_time
                    )

    def _check_total_time_target(self, total_time: float) -> None:
        """Check total pipeline time against target."""
        if "total_max_seconds" in self.performance_targets:
            total_target = self.performance_targets["total_max_seconds"]
            if total_time > total_target:
                warning_msg = f"Pipeline exceeded time target: {total_time:.2f}s > {total_target}s"
                self._record_performance_warning("total_time_exceeded", warning_msg, total_time)
                logger.warning(warning_msg)

    def _analyze_performance(self, total_time: float) -> None:
        """Analyze pipeline performance and detect bottlenecks.

        Args:
            total_time: Total pipeline execution time
        """
        tier_times = {
            "tier1": self.execution_stats["tier1_time"],
            "tier2": self.execution_stats["tier2_time"],
            "tier3": self.execution_stats["tier3_time"],
        }

        bottlenecks = self._detect_bottlenecks(tier_times, total_time)
        if bottlenecks:
            self.execution_stats["bottlenecks_detected"] = bottlenecks

        self._check_tier_targets(tier_times)
        self._check_total_time_target(total_time)

    def _record_performance_warning(self, warning_type: str, message: str, value: float) -> None:
        """Record performance warning for monitoring.

        Args:
            warning_type: Type of warning
            message: Warning message
            value: Associated numeric value
        """
        warning_record = {
            "type": warning_type,
            "message": message,
            "value": value,
            "timestamp": time.time(),
        }

        self.execution_stats["performance_warnings"].append(warning_record)

    def get_execution_stats(self) -> dict[str, Any]:
        """Get detailed execution statistics from last pipeline run.

        Returns:
            Dictionary with timing and execution details including performance analysis
        """
        # NOTE: shallow copy — nested lists are shared with the live stats.
        stats = self.execution_stats.copy()

        # Add derived performance metrics
        if stats["total_time"] > 0:
            stats["tier_time_percentages"] = {
                "tier1": (stats["tier1_time"] / stats["total_time"]) * 100,
                "tier2": (stats["tier2_time"] / stats["total_time"]) * 100,
                "tier3": (stats["tier3_time"] / stats["total_time"]) * 100,
            }

        return stats

    def get_performance_summary(self) -> str:
        """Get concise performance summary.

        Returns:
            Performance summary string
        """
        bottlenecks = len(self.execution_stats.get("bottlenecks_detected", []))
        warnings = len(self.execution_stats.get("performance_warnings", []))
        failures = len(self.execution_stats.get("tier_failures", []))

        return f"bottlenecks={bottlenecks}, warnings={warnings}, failures={failures}"

    def has_performance_issues(self) -> bool:
        """Check if there are any performance issues detected.

        Returns:
            True if bottlenecks or warnings were detected
        """
        return (
            len(self.execution_stats.get("bottlenecks_detected", [])) > 0
            or len(self.execution_stats.get("performance_warnings", [])) > 0
        )

    def get_bottlenecks(self) -> list[dict[str, Any]]:
        """Get detected performance bottlenecks.

        Returns:
            List of bottleneck information dictionaries
        """
        return self.execution_stats.get("bottlenecks_detected", [])

    def get_warnings(self) -> list[dict[str, Any]]:
        """Get performance warnings.

        Returns:
            List of performance warning dictionaries
        """
        return self.execution_stats.get("performance_warnings", [])

    def get_failures(self) -> list[dict[str, Any]]:
        """Get tier failure records.

        Returns:
            List of tier failure dictionaries
        """
        return self.execution_stats.get("tier_failures", [])
Functions
__init__(performance_targets)

Initialize performance monitor with targets.

Parameters:

Name Type Description Default
performance_targets dict[str, float]

Dictionary of performance targets (e.g., tier timeouts)

required
Source code in src/app/judge/performance_monitor.py
def __init__(self, performance_targets: dict[str, float]):
    """Initialize performance monitor with targets.

    Args:
        performance_targets: Dictionary of performance targets (e.g., tier timeouts)
    """
    self.execution_stats: dict[str, Any] = self._initialize_stats()
    # Copy so external mutation of the caller's mapping cannot change targets.
    self.performance_targets = performance_targets.copy()
finalize_execution(total_time)

Finalize execution statistics and perform analysis.

Parameters:

Name Type Description Default
total_time float

Total pipeline execution time

required
Source code in src/app/judge/performance_monitor.py
def finalize_execution(self, total_time: float) -> None:
    """Record the total pipeline time and run post-run analysis.

    Args:
        total_time: Total pipeline execution time
    """
    stats = self.execution_stats
    stats["total_time"] = total_time
    # Analysis populates bottleneck and target-violation records.
    self._analyze_performance(total_time)
get_bottlenecks()

Get detected performance bottlenecks.

Returns:

Type Description
list[dict[str, Any]]

List of bottleneck information dictionaries

Source code in src/app/judge/performance_monitor.py
def get_bottlenecks(self) -> list[dict[str, Any]]:
    """Return bottleneck records detected during the last run.

    Returns:
        List of bottleneck information dictionaries
    """
    # Missing key means analysis found nothing (or never ran).
    stats = self.execution_stats
    return stats.get("bottlenecks_detected", [])
get_execution_stats()

Get detailed execution statistics from last pipeline run.

Returns:

Type Description
dict[str, Any]

Dictionary with timing and execution details including performance analysis

Source code in src/app/judge/performance_monitor.py
def get_execution_stats(self) -> dict[str, Any]:
    """Return timing and execution details from the last pipeline run.

    Returns:
        Dictionary with timing and execution details including performance analysis
    """
    stats = self.execution_stats.copy()

    total = stats["total_time"]
    # Per-tier percentages are only meaningful once a total was recorded.
    if total > 0:
        stats["tier_time_percentages"] = {
            tier: (stats[f"{tier}_time"] / total) * 100
            for tier in ("tier1", "tier2", "tier3")
        }

    return stats
get_failures()

Get tier failure records.

Returns:

Type Description
list[dict[str, Any]]

List of tier failure dictionaries

Source code in src/app/judge/performance_monitor.py
def get_failures(self) -> list[dict[str, Any]]:
    """Return tier failure records from the last run.

    Returns:
        List of tier failure dictionaries
    """
    # An absent key simply means no failures were recorded.
    stats = self.execution_stats
    return stats.get("tier_failures", [])
get_performance_summary()

Get concise performance summary.

Returns:

Type Description
str

Performance summary string

Source code in src/app/judge/performance_monitor.py
def get_performance_summary(self) -> str:
    """Summarize issue counts from the last run as a one-line string.

    Returns:
        Performance summary string
    """
    stats = self.execution_stats
    counts = {
        "bottlenecks": len(stats.get("bottlenecks_detected", [])),
        "warnings": len(stats.get("performance_warnings", [])),
        "failures": len(stats.get("tier_failures", [])),
    }
    # Insertion order of the dict fixes the label order in the output.
    return ", ".join(f"{label}={count}" for label, count in counts.items())
get_warnings()

Get performance warnings.

Returns:

Type Description
list[dict[str, Any]]

List of performance warning dictionaries

Source code in src/app/judge/performance_monitor.py
def get_warnings(self) -> list[dict[str, Any]]:
    """Return performance warnings recorded during the last run.

    Returns:
        List of performance warning dictionaries
    """
    # An absent key simply means no warnings were recorded.
    stats = self.execution_stats
    return stats.get("performance_warnings", [])
has_performance_issues()

Check if there are any performance issues detected.

Returns:

Type Description
bool

True if bottlenecks or warnings were detected

Source code in src/app/judge/performance_monitor.py
def has_performance_issues(self) -> bool:
    """Report whether the last run triggered bottlenecks or warnings.

    Returns:
        True if bottlenecks or warnings were detected
    """
    stats = self.execution_stats
    # Non-empty lists are truthy; missing keys default to falsy None.
    return bool(
        stats.get("bottlenecks_detected") or stats.get("performance_warnings")
    )
record_fallback_usage(fallback_used)

Record whether fallback strategy was used.

Parameters:

Name Type Description Default
fallback_used bool

Whether fallback strategy was applied

required
Source code in src/app/judge/performance_monitor.py
def record_fallback_usage(self, fallback_used: bool) -> None:
    """Note whether a fallback strategy was applied for this run.

    Args:
        fallback_used: Whether fallback strategy was applied
    """
    # Stored verbatim; consumers read this flag out of execution stats.
    stats = self.execution_stats
    stats["fallback_used"] = fallback_used
    logger.debug(f"Fallback strategy used: {fallback_used}")
record_tier_execution(tier, duration)

Record successful tier execution time.

Parameters:

Name Type Description Default
tier int

Tier number (1, 2, or 3)

required
duration float

Execution duration in seconds

required
Source code in src/app/judge/performance_monitor.py
def record_tier_execution(self, tier: int, duration: float) -> None:
    """Record successful tier execution time.

    Args:
        tier: Tier number (1, 2, or 3)
        duration: Execution duration in seconds
    """
    # Fix: timing stats are stored under "tier{N}_time" — the keys that
    # _initialize_stats creates and that analysis/reporting read. Writing
    # to "tier{N}" left the real timing entries permanently at 0.0.
    tier_key = f"tier{tier}_time"
    self.execution_stats[tier_key] = duration

    if tier not in self.execution_stats["tiers_executed"]:
        self.execution_stats["tiers_executed"].append(tier)

    logger.debug(f"Recorded tier {tier} execution: {duration:.3f}s")
record_tier_failure(tier, failure_type, execution_time, error_msg)

Record tier failure details for monitoring and analysis.

Parameters:

Name Type Description Default
tier int

Tier number that failed (0 for pipeline-level failures)

required
failure_type str

Type of failure (timeout, error)

required
execution_time float

Time spent before failure

required
error_msg str

Error message

required
Source code in src/app/judge/performance_monitor.py
def record_tier_failure(
    self, tier: int, failure_type: str, execution_time: float, error_msg: str
) -> None:
    """Append a failure record for later monitoring and analysis.

    Args:
        tier: Tier number that failed (0 for pipeline-level failures)
        failure_type: Type of failure (timeout, error)
        execution_time: Time spent before failure
        error_msg: Error message
    """
    # The timestamp lets downstream tooling correlate failures across runs.
    self.execution_stats["tier_failures"].append(
        {
            "tier": tier,
            "failure_type": failure_type,
            "execution_time": execution_time,
            "error_msg": error_msg,
            "timestamp": time.time(),
        }
    )

    logger.debug(f"Recorded tier {tier} failure: {failure_type} after {execution_time:.2f}s")
reset_stats()

Reset execution statistics for new evaluation.

Source code in src/app/judge/performance_monitor.py
def reset_stats(self) -> None:
    """Discard all recorded statistics ahead of a new evaluation."""
    # Rebuild from the template rather than clearing in place, so stale
    # nested lists from the previous run cannot leak into the new one.
    self.execution_stats = self._initialize_stats()

app.judge.plugins.base

Base classes for evaluator plugin system.

Defines the EvaluatorPlugin ABC and PluginRegistry for typed, tier-ordered plugin execution with Pydantic models at all boundaries.

Classes

EvaluatorPlugin

Bases: ABC

Abstract base class for evaluation plugins.

Each plugin implements a specific evaluation tier (1, 2, or 3) and provides typed input/output using Pydantic models.

Attributes:

Name Type Description
name str

Unique identifier for the plugin

tier int

Evaluation tier (1=Traditional, 2=LLM-Judge, 3=Graph)

Source code in src/app/judge/plugins/base.py
class EvaluatorPlugin(ABC):
    """Abstract base class for evaluation plugins.

    A concrete plugin implements exactly one evaluation tier (1, 2, or 3)
    and exchanges typed input/output via Pydantic models.

    Attributes:
        name: Unique identifier for the plugin
        tier: Evaluation tier (1=Traditional, 2=LLM-Judge, 3=Graph)
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Unique plugin identifier.

        Returns:
            Plugin name string
        """
        ...

    @property
    @abstractmethod
    def tier(self) -> int:
        """Evaluation tier number.

        Returns:
            Tier number (1, 2, or 3)
        """
        ...

    @abstractmethod
    def evaluate(self, input_data: BaseModel, context: dict[str, Any] | None = None) -> BaseModel:
        """Run this plugin's evaluation.

        Args:
            input_data: Typed input data (Pydantic model)
            context: Optional context from previous tier evaluations

        Returns:
            Evaluation result as Pydantic model (Tier1Result, Tier2Result, or Tier3Result)

        Raises:
            ValueError: If input validation fails
            RuntimeError: If evaluation execution fails
        """
        ...

    @abstractmethod
    def get_context_for_next_tier(self, result: BaseModel) -> dict[str, Any]:
        """Derive context to hand to the next tier.

        Args:
            result: Evaluation result from this tier

        Returns:
            Dictionary of context data for next tier
        """
        ...
Attributes
name abstractmethod property

Return unique plugin identifier.

Returns:

Type Description
str

Plugin name string

tier abstractmethod property

Return evaluation tier number.

Returns:

Type Description
int

Tier number (1, 2, or 3)

Functions
evaluate(input_data, context=None) abstractmethod

Execute plugin evaluation.

Parameters:

Name Type Description Default
input_data BaseModel

Typed input data (Pydantic model)

required
context dict[str, Any] | None

Optional context from previous tier evaluations

None

Returns:

Type Description
BaseModel

Evaluation result as Pydantic model (Tier1Result, Tier2Result, or Tier3Result)

Raises:

Type Description
ValueError

If input validation fails

RuntimeError

If evaluation execution fails

Source code in src/app/judge/plugins/base.py
@abstractmethod
def evaluate(self, input_data: BaseModel, context: dict[str, Any] | None = None) -> BaseModel:
    """Run this plugin's evaluation with optional prior-tier context.

    Args:
        input_data: Typed input data (Pydantic model)
        context: Optional context from previous tier evaluations

    Returns:
        Evaluation result as Pydantic model (Tier1Result, Tier2Result, or Tier3Result)

    Raises:
        ValueError: If input validation fails
        RuntimeError: If evaluation execution fails
    """
    ...
get_context_for_next_tier(result) abstractmethod

Extract context to pass to next tier.

Parameters:

Name Type Description Default
result BaseModel

Evaluation result from this tier

required

Returns:

Type Description
dict[str, Any]

Dictionary of context data for next tier

Source code in src/app/judge/plugins/base.py
@abstractmethod
def get_context_for_next_tier(self, result: BaseModel) -> dict[str, Any]:
    """Derive the context dictionary handed to the next tier.

    Args:
        result: Evaluation result from this tier

    Returns:
        Dictionary of context data for next tier
    """
    ...

PluginRegistry

Registry for managing and executing evaluation plugins.

Maintains plugins in tier order and orchestrates sequential execution with context passing between tiers.

Source code in src/app/judge/plugins/base.py
class PluginRegistry:
    """Registry that stores evaluation plugins and runs them in tier order.

    Plugins execute sequentially; each tier's exported context is merged
    into the context handed to the following tier.
    """

    def __init__(self) -> None:
        """Create a registry with no plugins."""
        self._plugins: dict[str, EvaluatorPlugin] = {}

    def register(self, plugin: EvaluatorPlugin) -> None:
        """Add a plugin to the registry, rejecting duplicate names.

        Args:
            plugin: Plugin instance to register

        Raises:
            ValueError: If plugin with same name already registered
        """
        if plugin.name in self._plugins:
            raise ValueError(f"Plugin '{plugin.name}' already registered")

        self._plugins[plugin.name] = plugin
        logger.debug(f"Registered plugin: {plugin.name} (Tier {plugin.tier})")

    def get_plugin(self, name: str) -> EvaluatorPlugin | None:
        """Look up a registered plugin by name.

        Args:
            name: Plugin name to retrieve

        Returns:
            Plugin instance if found, None otherwise
        """
        return self._plugins.get(name)

    def list_plugins(self) -> list[EvaluatorPlugin]:
        """Return every registered plugin, lowest tier first.

        Returns:
            List of plugins sorted by tier number
        """
        return sorted(self._plugins.values(), key=lambda plug: plug.tier)

    def execute_all(self, input_data: BaseModel) -> list[BaseModel]:
        """Run every plugin in tier order, threading context between tiers.

        Args:
            input_data: Input data for first plugin

        Returns:
            List of results from each plugin in tier order

        Raises:
            ValueError: If plugin evaluation fails
            RuntimeError: If plugin execution fails
        """
        outputs: list[BaseModel] = []
        accumulated: dict[str, Any] = {}

        for plugin in self.list_plugins():
            logger.debug(f"Executing plugin: {plugin.name} (Tier {plugin.tier})")

            # An empty context is passed as None, per evaluate()'s contract.
            outcome = plugin.evaluate(input_data, context=accumulated or None)
            outputs.append(outcome)

            # Merge this tier's exported context for the tiers that follow.
            accumulated.update(plugin.get_context_for_next_tier(outcome))

        return outputs
Functions
__init__()

Initialize empty plugin registry.

Source code in src/app/judge/plugins/base.py
def __init__(self) -> None:
    """Start with an empty name-to-plugin mapping."""
    # Keyed by plugin.name; register() enforces uniqueness.
    self._plugins: dict[str, EvaluatorPlugin] = {}
execute_all(input_data)

Execute all plugins in tier order with context passing.

Parameters:

Name Type Description Default
input_data BaseModel

Input data for first plugin

required

Returns:

Type Description
list[BaseModel]

List of results from each plugin in tier order

Raises:

Type Description
ValueError

If plugin evaluation fails

RuntimeError

If plugin execution fails

Source code in src/app/judge/plugins/base.py
def execute_all(self, input_data: BaseModel) -> list[BaseModel]:
    """Run every plugin in tier order, threading context between tiers.

    Args:
        input_data: Input data for first plugin

    Returns:
        List of results from each plugin in tier order

    Raises:
        ValueError: If plugin evaluation fails
        RuntimeError: If plugin execution fails
    """
    outputs: list[BaseModel] = []
    accumulated: dict[str, Any] = {}

    for plugin in self.list_plugins():
        logger.debug(f"Executing plugin: {plugin.name} (Tier {plugin.tier})")

        # An empty context is passed as None, per evaluate()'s contract.
        outcome = plugin.evaluate(input_data, context=accumulated or None)
        outputs.append(outcome)

        # Merge this tier's exported context for the tiers that follow.
        accumulated.update(plugin.get_context_for_next_tier(outcome))

    return outputs
get_plugin(name)

Retrieve plugin by name.

Parameters:

Name Type Description Default
name str

Plugin name to retrieve

required

Returns:

Type Description
EvaluatorPlugin | None

Plugin instance if found, None otherwise

Source code in src/app/judge/plugins/base.py
def get_plugin(self, name: str) -> EvaluatorPlugin | None:
    """Look up a registered plugin by name.

    Args:
        name: Plugin name to retrieve

    Returns:
        Plugin instance if found, None otherwise
    """
    # dict.get already yields None for unknown names.
    return self._plugins.get(name)
list_plugins()

List all registered plugins in tier order.

Returns:

Type Description
list[EvaluatorPlugin]

List of plugins sorted by tier number

Source code in src/app/judge/plugins/base.py
def list_plugins(self) -> list[EvaluatorPlugin]:
    """Return all registered plugins ordered by ascending tier.

    Returns:
        List of plugins sorted by tier number
    """
    # Tier order defines the pipeline's execution order.
    return sorted(self._plugins.values(), key=lambda plug: plug.tier)
register(plugin)

Register an evaluation plugin.

Parameters:

Name Type Description Default
plugin EvaluatorPlugin

Plugin instance to register

required

Raises:

Type Description
ValueError

If plugin with same name already registered

Source code in src/app/judge/plugins/base.py
def register(self, plugin: EvaluatorPlugin) -> None:
    """Add a plugin to the registry, rejecting duplicate names.

    Args:
        plugin: Plugin instance to register

    Raises:
        ValueError: If plugin with same name already registered
    """
    # Names are unique keys; a repeat registration is a programming error.
    if plugin.name in self._plugins:
        raise ValueError(f"Plugin '{plugin.name}' already registered")

    self._plugins[plugin.name] = plugin
    logger.debug(f"Registered plugin: {plugin.name} (Tier {plugin.tier})")

app.judge.plugins.graph_metrics

GraphEvaluatorPlugin wrapper for Tier 3 evaluation.

Wraps the existing GraphAnalysisEngine as an EvaluatorPlugin following the adapter pattern with configurable timeout.

Classes

GraphEvaluatorPlugin

Bases: EvaluatorPlugin

Adapter wrapping GraphAnalysisEngine as an EvaluatorPlugin.

Provides Tier 3 evaluation using graph-based analysis of agent coordination patterns with configurable timeout from JudgeSettings.

Attributes:

Name Type Description
timeout_seconds

Maximum execution time for this plugin

_engine

Underlying GraphAnalysisEngine instance

_settings

JudgeSettings instance for configuration

Source code in src/app/judge/plugins/graph_metrics.py
class GraphEvaluatorPlugin(EvaluatorPlugin):
    """Adapter wrapping GraphAnalysisEngine as an EvaluatorPlugin.

    Provides Tier 3 evaluation using graph-based analysis of agent
    coordination patterns with configurable timeout from JudgeSettings.

    Attributes:
        timeout_seconds: Maximum execution time for this plugin
        _engine: Underlying GraphAnalysisEngine instance
        _settings: JudgeSettings instance for configuration
    """

    def __init__(self, timeout_seconds: float | None = None):
        """Initialize plugin with optional timeout override.

        Args:
            timeout_seconds: Optional timeout override. If None, uses JudgeSettings default.
        """
        self._settings = JudgeSettings()
        # Reason: explicit None check so an intentional 0.0 override is honoured;
        # a plain `or` would silently fall back to the settings default.
        self.timeout_seconds = (
            timeout_seconds
            if timeout_seconds is not None
            else self._settings.tier3_max_seconds
        )
        self._engine = GraphAnalysisEngine(self._settings)

    @property
    def name(self) -> str:
        """Return unique plugin identifier.

        Returns:
            Plugin name string
        """
        return "graph_metrics"

    @property
    def tier(self) -> int:
        """Return evaluation tier number.

        Returns:
            Tier 3 (Graph Analysis)
        """
        return 3

    def evaluate(self, input_data: BaseModel, context: dict[str, Any] | None = None) -> BaseModel:
        """Execute Tier 3 graph-based evaluation.

        Args:
            input_data: Input containing trace_data (GraphTraceData)
            context: Optional context from previous tiers (Tier 1 and Tier 2)

        Returns:
            Tier3Result with graph analysis metrics

        Raises:
            ValueError: If input validation fails
            RuntimeError: If evaluation execution fails
        """
        # Extract trace_data from input_data
        # Reason: Pydantic BaseModel doesn't support attribute access without type checking
        trace_data = getattr(input_data, "trace_data", None)

        if trace_data is None:
            logger.warning("No trace_data provided for graph evaluation")
            # Degrade gracefully: report all-zero metrics instead of raising.
            return Tier3Result(
                path_convergence=0.0,
                tool_selection_accuracy=0.0,
                coordination_centrality=0.0,
                task_distribution_balance=0.0,
                overall_score=0.0,
                graph_complexity=0,
            )

        # Log context enrichment if previous tier data available
        if context:
            tier1_score = context.get("tier1_overall_score")
            tier2_score = context.get("tier2_overall_score")
            if tier1_score is not None and tier2_score is not None:
                logger.debug(
                    f"Previous tier context available: "
                    f"Tier1={tier1_score:.2f}, Tier2={tier2_score:.2f}"
                )

        # Delegate to existing engine
        result = self._engine.evaluate_graph_metrics(trace_data)

        return result

    def get_context_for_next_tier(self, result: BaseModel) -> dict[str, Any]:
        """Extract context from Tier 3 results for potential future tiers.

        Args:
            result: Tier3Result from this plugin's evaluation

        Returns:
            Dictionary containing tier3_overall_score and graph metrics
        """
        # Reason: Type narrowing for BaseModel to Tier3Result
        if not isinstance(result, Tier3Result):
            return {}

        return {
            "tier3_overall_score": result.overall_score,
            "tier3_graph_metrics": {
                "path_convergence": result.path_convergence,
                "tool_selection_accuracy": result.tool_selection_accuracy,
                "coordination_centrality": result.coordination_centrality,
                "task_distribution_balance": result.task_distribution_balance,
            },
            "tier3_graph_complexity": result.graph_complexity,
        }
Attributes
name property

Return unique plugin identifier.

Returns:

Type Description
str

Plugin name string

tier property

Return evaluation tier number.

Returns:

Type Description
int

Tier 3 (Graph Analysis)

Functions
__init__(timeout_seconds=None)

Initialize plugin with optional timeout override.

Parameters:

Name Type Description Default
timeout_seconds float | None

Optional timeout override. If None, uses JudgeSettings default.

None
Source code in src/app/judge/plugins/graph_metrics.py
def __init__(self, timeout_seconds: float | None = None):
    """Initialize plugin with optional timeout override.

    Args:
        timeout_seconds: Optional timeout override. If None, uses JudgeSettings default.
    """
    self._settings = JudgeSettings()
    # Reason: explicit None check so an intentional 0.0 override is honoured;
    # a plain `or` would silently fall back to the settings default.
    self.timeout_seconds = (
        timeout_seconds
        if timeout_seconds is not None
        else self._settings.tier3_max_seconds
    )
    self._engine = GraphAnalysisEngine(self._settings)
evaluate(input_data, context=None)

Execute Tier 3 graph-based evaluation.

Parameters:

Name Type Description Default
input_data BaseModel

Input containing trace_data (GraphTraceData)

required
context dict[str, Any] | None

Optional context from previous tiers (Tier 1 and Tier 2)

None

Returns:

Type Description
BaseModel

Tier3Result with graph analysis metrics

Raises:

Type Description
ValueError

If input validation fails

RuntimeError

If evaluation execution fails

Source code in src/app/judge/plugins/graph_metrics.py
def evaluate(self, input_data: BaseModel, context: dict[str, Any] | None = None) -> BaseModel:
    """Execute Tier 3 graph-based evaluation.

    Args:
        input_data: Input containing trace_data (GraphTraceData)
        context: Optional context from previous tiers (Tier 1 and Tier 2)

    Returns:
        Tier3Result with graph analysis metrics

    Raises:
        ValueError: If input validation fails
        RuntimeError: If evaluation execution fails
    """
    # Reason: input_data is only typed as BaseModel, so pull the field reflectively.
    graph_trace = getattr(input_data, "trace_data", None)

    if graph_trace is None:
        logger.warning("No trace_data provided for graph evaluation")
        # Degrade gracefully: report all-zero metrics for a missing trace.
        return Tier3Result(
            path_convergence=0.0,
            tool_selection_accuracy=0.0,
            coordination_centrality=0.0,
            task_distribution_balance=0.0,
            overall_score=0.0,
            graph_complexity=0,
        )

    # Surface which earlier-tier scores are available for enrichment.
    if context:
        prior_t1 = context.get("tier1_overall_score")
        prior_t2 = context.get("tier2_overall_score")
        if prior_t1 is not None and prior_t2 is not None:
            logger.debug(
                f"Previous tier context available: "
                f"Tier1={prior_t1:.2f}, Tier2={prior_t2:.2f}"
            )

    # Hand the real work to the wrapped analysis engine.
    return self._engine.evaluate_graph_metrics(graph_trace)
get_context_for_next_tier(result)

Extract context from Tier 3 results for potential future tiers.

Parameters:

Name Type Description Default
result BaseModel

Tier3Result from this plugin’s evaluation

required

Returns:

Type Description
dict[str, Any]

Dictionary containing tier3_overall_score and graph metrics

Source code in src/app/judge/plugins/graph_metrics.py
def get_context_for_next_tier(self, result: BaseModel) -> dict[str, Any]:
    """Extract context from Tier 3 results for potential future tiers.

    Args:
        result: Tier3Result from this plugin's evaluation

    Returns:
        Dictionary containing tier3_overall_score and graph metrics
    """
    # Reason: narrow the generic BaseModel before touching Tier3Result fields.
    if not isinstance(result, Tier3Result):
        return {}

    graph_metrics = {
        "path_convergence": result.path_convergence,
        "tool_selection_accuracy": result.tool_selection_accuracy,
        "coordination_centrality": result.coordination_centrality,
        "task_distribution_balance": result.task_distribution_balance,
    }
    return {
        "tier3_overall_score": result.overall_score,
        "tier3_graph_metrics": graph_metrics,
        "tier3_graph_complexity": result.graph_complexity,
    }

app.judge.plugins.llm_judge

LLMJudgePlugin wrapper for Tier 2 evaluation.

Wraps the existing LLMJudgeEngine as an EvaluatorPlugin following the adapter pattern with opt-in Tier 1 context enrichment.

Classes

LLMJudgePlugin

Bases: EvaluatorPlugin

Adapter wrapping LLMJudgeEngine as an EvaluatorPlugin.

Provides Tier 2 evaluation using LLM-as-Judge methodology with configurable timeout and optional Tier 1 context enrichment.

Attributes:

Name Type Description
timeout_seconds

Maximum execution time for this plugin

_engine

Underlying LLMJudgeEngine instance

_settings

JudgeSettings instance for configuration

Source code in src/app/judge/plugins/llm_judge.py
class LLMJudgePlugin(EvaluatorPlugin):
    """Adapter wrapping LLMJudgeEngine as an EvaluatorPlugin.

    Provides Tier 2 evaluation using LLM-as-Judge methodology
    with configurable timeout and optional Tier 1 context enrichment.

    Attributes:
        timeout_seconds: Maximum execution time for this plugin
        _engine: Underlying LLMJudgeEngine instance
        _settings: JudgeSettings instance for configuration
    """

    def __init__(self, timeout_seconds: float | None = None):
        """Initialize plugin with optional timeout override.

        Args:
            timeout_seconds: Optional timeout override. If None, uses JudgeSettings default.
        """
        self._settings = JudgeSettings()
        # Reason: explicit None check so an intentional 0.0 override is honoured;
        # a plain `or` would silently fall back to the settings default.
        self.timeout_seconds = (
            timeout_seconds
            if timeout_seconds is not None
            else self._settings.tier2_timeout_seconds
        )
        self._engine = LLMJudgeEngine(self._settings)

    @property
    def name(self) -> str:
        """Return unique plugin identifier.

        Returns:
            Plugin name string
        """
        return "llm_judge"

    @property
    def tier(self) -> int:
        """Return evaluation tier number.

        Returns:
            Tier 2 (LLM-as-Judge)
        """
        return 2

    def evaluate(self, input_data: BaseModel, context: dict[str, Any] | None = None) -> BaseModel:
        """Execute Tier 2 LLM-as-Judge evaluation.

        Args:
            input_data: Input containing paper, review, execution_trace
            context: Optional context from Tier 1 (for enrichment)

        Returns:
            Tier2Result with LLM quality assessments

        Raises:
            ValueError: If input validation fails
            RuntimeError: If evaluation execution fails
        """
        # Extract fields from input_data
        # Reason: Pydantic BaseModel doesn't support attribute access without type checking
        paper = getattr(input_data, "paper", "")
        review = getattr(input_data, "review", "")
        execution_trace = getattr(input_data, "execution_trace", {})

        # Log context enrichment if Tier 1 data available
        if context and "tier1_overall_score" in context:
            logger.debug(
                f"Tier 1 context available for enrichment: "
                f"score={context['tier1_overall_score']:.2f}"
            )

        coro = self._engine.evaluate_comprehensive(
            paper=paper, review=review, execution_trace=execution_trace
        )
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop running in this thread: safe to own one.
            result = asyncio.run(coro)
        else:
            # Reason: asyncio.run() raises RuntimeError when a loop is already
            # running in this thread, so run the coroutine on a dedicated
            # worker thread with its own event loop instead.
            from concurrent.futures import ThreadPoolExecutor

            with ThreadPoolExecutor(max_workers=1) as pool:
                result = pool.submit(asyncio.run, coro).result()

        return result

    def get_context_for_next_tier(self, result: BaseModel) -> dict[str, Any]:
        """Extract context from Tier 2 results for Tier 3.

        Args:
            result: Tier2Result from this plugin's evaluation

        Returns:
            Dictionary containing tier2_overall_score and quality metrics
        """
        # Reason: Type narrowing for BaseModel to Tier2Result
        if not isinstance(result, Tier2Result):
            return {}

        return {
            "tier2_overall_score": result.overall_score,
            "tier2_quality_metrics": {
                "technical_accuracy": result.technical_accuracy,
                "constructiveness": result.constructiveness,
                "planning_rationality": result.planning_rationality,
            },
            "tier2_model_used": result.model_used,
            "tier2_fallback_used": result.fallback_used,
        }
Attributes
name property

Return unique plugin identifier.

Returns:

Type Description
str

Plugin name string

tier property

Return evaluation tier number.

Returns:

Type Description
int

Tier 2 (LLM-as-Judge)

Functions
__init__(timeout_seconds=None)

Initialize plugin with optional timeout override.

Parameters:

Name Type Description Default
timeout_seconds float | None

Optional timeout override. If None, uses JudgeSettings default.

None
Source code in src/app/judge/plugins/llm_judge.py
def __init__(self, timeout_seconds: float | None = None):
    """Initialize plugin with optional timeout override.

    Args:
        timeout_seconds: Optional timeout override. If None, uses JudgeSettings default.
    """
    self._settings = JudgeSettings()
    # Reason: explicit None check so an intentional 0.0 override is honoured;
    # a plain `or` would silently fall back to the settings default.
    self.timeout_seconds = (
        timeout_seconds
        if timeout_seconds is not None
        else self._settings.tier2_timeout_seconds
    )
    self._engine = LLMJudgeEngine(self._settings)
evaluate(input_data, context=None)

Execute Tier 2 LLM-as-Judge evaluation.

Parameters:

Name Type Description Default
input_data BaseModel

Input containing paper, review, execution_trace

required
context dict[str, Any] | None

Optional context from Tier 1 (for enrichment)

None

Returns:

Type Description
BaseModel

Tier2Result with LLM quality assessments

Raises:

Type Description
ValueError

If input validation fails

RuntimeError

If evaluation execution fails

Source code in src/app/judge/plugins/llm_judge.py
def evaluate(self, input_data: BaseModel, context: dict[str, Any] | None = None) -> BaseModel:
    """Execute Tier 2 LLM-as-Judge evaluation.

    Args:
        input_data: Input containing paper, review, execution_trace
        context: Optional context from Tier 1 (for enrichment)

    Returns:
        Tier2Result with LLM quality assessments

    Raises:
        ValueError: If input validation fails
        RuntimeError: If evaluation execution fails
    """
    # Extract fields from input_data
    # Reason: Pydantic BaseModel doesn't support attribute access without type checking
    paper = getattr(input_data, "paper", "")
    review = getattr(input_data, "review", "")
    execution_trace = getattr(input_data, "execution_trace", {})

    # Log context enrichment if Tier 1 data available
    if context and "tier1_overall_score" in context:
        logger.debug(
            f"Tier 1 context available for enrichment: "
            f"score={context['tier1_overall_score']:.2f}"
        )

    coro = self._engine.evaluate_comprehensive(
        paper=paper, review=review, execution_trace=execution_trace
    )
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop running in this thread: safe to own one.
        result = asyncio.run(coro)
    else:
        # Reason: asyncio.run() raises RuntimeError when a loop is already
        # running in this thread, so run the coroutine on a dedicated
        # worker thread with its own event loop instead.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            result = pool.submit(asyncio.run, coro).result()

    return result
get_context_for_next_tier(result)

Extract context from Tier 2 results for Tier 3.

Parameters:

Name Type Description Default
result BaseModel

Tier2Result from this plugin’s evaluation

required

Returns:

Type Description
dict[str, Any]

Dictionary containing tier2_overall_score and quality metrics

Source code in src/app/judge/plugins/llm_judge.py
def get_context_for_next_tier(self, result: BaseModel) -> dict[str, Any]:
    """Extract context from Tier 2 results for Tier 3.

    Args:
        result: Tier2Result from this plugin's evaluation

    Returns:
        Dictionary containing tier2_overall_score and quality metrics
    """
    # Reason: narrow the generic BaseModel before touching Tier2Result fields.
    if not isinstance(result, Tier2Result):
        return {}

    quality_metrics = {
        "technical_accuracy": result.technical_accuracy,
        "constructiveness": result.constructiveness,
        "planning_rationality": result.planning_rationality,
    }
    return {
        "tier2_overall_score": result.overall_score,
        "tier2_quality_metrics": quality_metrics,
        "tier2_model_used": result.model_used,
        "tier2_fallback_used": result.fallback_used,
    }

app.judge.plugins.traditional

TraditionalMetricsPlugin wrapper for Tier 1 evaluation.

Wraps the existing TraditionalMetricsEngine as an EvaluatorPlugin following the adapter pattern with configurable timeout.

Classes

TraditionalMetricsPlugin

Bases: EvaluatorPlugin

Adapter wrapping TraditionalMetricsEngine as an EvaluatorPlugin.

Provides Tier 1 evaluation using lightweight text similarity metrics with configurable timeout from JudgeSettings.

Attributes:

Name Type Description
timeout_seconds

Maximum execution time for this plugin

_engine

Underlying TraditionalMetricsEngine instance

_settings

JudgeSettings instance for configuration

Source code in src/app/judge/plugins/traditional.py
class TraditionalMetricsPlugin(EvaluatorPlugin):
    """Adapter wrapping TraditionalMetricsEngine as an EvaluatorPlugin.

    Provides Tier 1 evaluation using lightweight text similarity metrics
    with configurable timeout from JudgeSettings.

    Attributes:
        timeout_seconds: Maximum execution time for this plugin
        _engine: Underlying TraditionalMetricsEngine instance
        _settings: JudgeSettings instance for configuration
    """

    def __init__(self, timeout_seconds: float | None = None):
        """Initialize plugin with optional timeout override.

        Args:
            timeout_seconds: Optional timeout override. If None, uses JudgeSettings default.
        """
        self._settings = JudgeSettings()
        # Reason: explicit None check so an intentional 0.0 override is honoured;
        # a plain `or` would silently fall back to the settings default.
        self.timeout_seconds = (
            timeout_seconds
            if timeout_seconds is not None
            else self._settings.tier1_max_seconds
        )
        self._engine = TraditionalMetricsEngine()

    @property
    def name(self) -> str:
        """Return unique plugin identifier.

        Returns:
            Plugin name string
        """
        return "traditional_metrics"

    @property
    def tier(self) -> int:
        """Return evaluation tier number.

        Returns:
            Tier 1 (Traditional Metrics)
        """
        return 1

    def evaluate(self, input_data: BaseModel, context: dict[str, Any] | None = None) -> BaseModel:
        """Execute Tier 1 traditional metrics evaluation.

        Args:
            input_data: Input containing agent_output, reference_texts, start_time, end_time
            context: Optional context from previous tiers (unused for Tier 1)

        Returns:
            Tier1Result with similarity metrics and execution timing

        Raises:
            ValueError: If input validation fails
            RuntimeError: If evaluation execution fails
        """
        # Extract fields from input_data
        # Reason: Pydantic BaseModel doesn't support attribute access without type checking
        agent_output = getattr(input_data, "agent_output", "")
        reference_texts = getattr(input_data, "reference_texts", [])
        start_time = getattr(input_data, "start_time", 0.0)
        end_time = getattr(input_data, "end_time", 0.0)

        # Delegate to existing engine
        result = self._engine.evaluate_traditional_metrics(
            agent_output=agent_output,
            reference_texts=reference_texts,
            start_time=start_time,
            end_time=end_time,
            settings=self._settings,
        )

        return result

    def get_context_for_next_tier(self, result: BaseModel) -> dict[str, Any]:
        """Extract context from Tier 1 results for Tier 2.

        Args:
            result: Tier1Result from this plugin's evaluation

        Returns:
            Dictionary containing tier1_overall_score and similarity metrics
        """
        # Reason: Type narrowing for BaseModel to Tier1Result
        if not isinstance(result, Tier1Result):
            return {}

        return {
            "tier1_overall_score": result.overall_score,
            "tier1_similarity_metrics": {
                "cosine": result.cosine_score,
                "jaccard": result.jaccard_score,
                "semantic": result.semantic_score,
            },
            "tier1_execution_time": result.execution_time,
            "tier1_task_success": result.task_success,
        }
Attributes
name property

Return unique plugin identifier.

Returns:

Type Description
str

Plugin name string

tier property

Return evaluation tier number.

Returns:

Type Description
int

Tier 1 (Traditional Metrics)

Functions
__init__(timeout_seconds=None)

Initialize plugin with optional timeout override.

Parameters:

Name Type Description Default
timeout_seconds float | None

Optional timeout override. If None, uses JudgeSettings default.

None
Source code in src/app/judge/plugins/traditional.py
def __init__(self, timeout_seconds: float | None = None):
    """Initialize plugin with optional timeout override.

    Args:
        timeout_seconds: Optional timeout override. If None, uses JudgeSettings default.
    """
    self._settings = JudgeSettings()
    # Reason: explicit None check so an intentional 0.0 override is honoured;
    # a plain `or` would silently fall back to the settings default.
    self.timeout_seconds = (
        timeout_seconds
        if timeout_seconds is not None
        else self._settings.tier1_max_seconds
    )
    self._engine = TraditionalMetricsEngine()
evaluate(input_data, context=None)

Execute Tier 1 traditional metrics evaluation.

Parameters:

Name Type Description Default
input_data BaseModel

Input containing agent_output, reference_texts, start_time, end_time

required
context dict[str, Any] | None

Optional context from previous tiers (unused for Tier 1)

None

Returns:

Type Description
BaseModel

Tier1Result with similarity metrics and execution timing

Raises:

Type Description
ValueError

If input validation fails

RuntimeError

If evaluation execution fails

Source code in src/app/judge/plugins/traditional.py
def evaluate(self, input_data: BaseModel, context: dict[str, Any] | None = None) -> BaseModel:
    """Execute Tier 1 traditional metrics evaluation.

    Args:
        input_data: Input containing agent_output, reference_texts, start_time, end_time
        context: Optional context from previous tiers (unused for Tier 1)

    Returns:
        Tier1Result with similarity metrics and execution timing

    Raises:
        ValueError: If input validation fails
        RuntimeError: If evaluation execution fails
    """
    # Reason: input_data is only typed as BaseModel, so read fields reflectively.
    output_text = getattr(input_data, "agent_output", "")
    references = getattr(input_data, "reference_texts", [])
    started = getattr(input_data, "start_time", 0.0)
    finished = getattr(input_data, "end_time", 0.0)

    # Hand the computation to the wrapped metrics engine.
    return self._engine.evaluate_traditional_metrics(
        agent_output=output_text,
        reference_texts=references,
        start_time=started,
        end_time=finished,
        settings=self._settings,
    )
get_context_for_next_tier(result)

Extract context from Tier 1 results for Tier 2.

Parameters:

Name Type Description Default
result BaseModel

Tier1Result from this plugin’s evaluation

required

Returns:

Type Description
dict[str, Any]

Dictionary containing tier1_overall_score and similarity metrics

Source code in src/app/judge/plugins/traditional.py
def get_context_for_next_tier(self, result: BaseModel) -> dict[str, Any]:
    """Extract context from Tier 1 results for Tier 2.

    Args:
        result: Tier1Result from this plugin's evaluation

    Returns:
        Dictionary containing tier1_overall_score and similarity metrics
    """
    # Reason: narrow the generic BaseModel before touching Tier1Result fields.
    if not isinstance(result, Tier1Result):
        return {}

    similarity = {
        "cosine": result.cosine_score,
        "jaccard": result.jaccard_score,
        "semantic": result.semantic_score,
    }
    return {
        "tier1_overall_score": result.overall_score,
        "tier1_similarity_metrics": similarity,
        "tier1_execution_time": result.execution_time,
        "tier1_task_success": result.task_success,
    }

app.judge.trace_processors

Trace processing infrastructure for local observability.

Provides JSON/JSONL trace storage and processing capabilities for graph-based analysis and agent coordination evaluation.

Classes

ProcessedTrace dataclass

Processed trace with extracted patterns.

Source code in src/app/judge/trace_processors.py
@dataclass
class ProcessedTrace:
    """Processed trace with extracted patterns.

    Plain data container holding the categorized events and aggregate
    metrics extracted from one traced execution.
    """

    execution_id: str  # unique identifier of the traced execution
    start_time: float  # execution start timestamp — presumably epoch seconds; TODO confirm
    end_time: float  # execution end timestamp, same time base as start_time
    agent_interactions: list[dict[str, Any]]  # agent-to-agent interaction events
    tool_calls: list[dict[str, Any]]  # tool invocation events
    coordination_events: list[dict[str, Any]]  # coordination/handoff events
    performance_metrics: dict[str, float]  # aggregate metric name -> value

TraceCollector

Collects and stores execution traces for analysis.

Provides local storage capabilities with JSON/JSONL format and SQLite database for structured queries.

Source code in src/app/judge/trace_processors.py
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
class TraceCollector:
    """Collects and stores execution traces for analysis.

    Provides local storage capabilities with JSON/JSONL format
    and SQLite database for structured queries.
    """

    def __init__(self, settings: JudgeSettings) -> None:
        """Initialize trace collector with settings.

        Args:
            settings: JudgeSettings instance with observability configuration.
        """
        self.settings = settings

        self.trace_enabled = settings.trace_collection
        self.storage_path = Path(settings.trace_storage_path)
        self.performance_logging = settings.performance_logging

        # Ensure storage directory exists
        self.storage_path.mkdir(parents=True, exist_ok=True)

        # Initialize SQLite database
        self.db_path = self.storage_path / TRACES_DB_FILE
        self._init_database()

        # Current execution state
        self.current_execution_id: str | None = None
        self.current_events: list[TraceEvent] = []

    def _init_database(self) -> None:
        """Initialize SQLite database schema for trace storage."""
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                conn.execute("""
                    CREATE TABLE IF NOT EXISTS trace_executions (
                        execution_id TEXT PRIMARY KEY,
                        start_time REAL,
                        end_time REAL,
                        agent_count INTEGER,
                        tool_count INTEGER,
                        total_duration REAL,
                        created_at TEXT
                    )
                """)

                conn.execute("""
                    CREATE TABLE IF NOT EXISTS trace_events (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        execution_id TEXT,
                        timestamp REAL,
                        event_type TEXT,
                        agent_id TEXT,
                        data TEXT,
                        FOREIGN KEY (execution_id)
                        REFERENCES trace_executions (execution_id)
                    )
                """)

                conn.commit()
                logger.debug("Trace database initialized successfully")
            finally:
                conn.close()

        except Exception as e:
            logger.error(f"Failed to initialize trace database: {e}")

    def start_execution(self, execution_id: str) -> None:
        """Start a new execution trace.

        Args:
            execution_id: Unique identifier for the execution
        """
        if not self.trace_enabled:
            return

        self.current_execution_id = execution_id
        self.current_events = []

        logger.debug(f"Started trace collection for execution: {execution_id}")

    def log_agent_interaction(
        self,
        from_agent: str,
        to_agent: str,
        interaction_type: str,
        data: dict[str, Any],
    ) -> None:
        """Log an agent-to-agent interaction.

        Args:
            from_agent: Source agent identifier
            to_agent: Target agent identifier
            interaction_type: Type of interaction (task_request, result_delivery, etc.)
            data: Additional interaction data
        """
        if not self.trace_enabled or not self.current_execution_id:
            return

        # Timestamps use perf_counter (monotonic), so they order events reliably
        # but are not wall-clock times.
        event = TraceEvent(
            timestamp=time.perf_counter(),
            event_type="agent_interaction",
            agent_id=from_agent,
            data={"from": from_agent, "to": to_agent, "type": interaction_type, **data},
            execution_id=self.current_execution_id,
        )

        self.current_events.append(event)

    def log_tool_call(
        self,
        agent_id: str,
        tool_name: str,
        success: bool,
        duration: float,
        context: str = "",
    ) -> None:
        """Log a tool usage event.

        Args:
            agent_id: Agent making the tool call
            tool_name: Name of the tool used
            success: Whether the tool call was successful
            duration: Tool execution duration in seconds
            context: Context or purpose of the tool call
        """
        if not self.trace_enabled or not self.current_execution_id:
            return

        event = TraceEvent(
            timestamp=time.perf_counter(),
            event_type="tool_call",
            agent_id=agent_id,
            data={
                "tool_name": tool_name,
                "success": success,
                "duration": duration,
                "context": context,
            },
            execution_id=self.current_execution_id,
        )

        self.current_events.append(event)

    def log_coordination_event(
        self,
        manager_agent: str,
        event_type: str,
        target_agents: list[str],
        data: dict[str, Any],
    ) -> None:
        """Log a coordination event (delegation, synchronization, etc.).

        Args:
            manager_agent: Managing agent identifier
            event_type: Type of coordination (delegation, sync, handoff)
            target_agents: List of agents involved
            data: Additional coordination data
        """
        if not self.trace_enabled or not self.current_execution_id:
            return

        event = TraceEvent(
            timestamp=time.perf_counter(),
            event_type="coordination",
            agent_id=manager_agent,
            data={
                "coordination_type": event_type,
                "target_agents": target_agents,
                **data,
            },
            execution_id=self.current_execution_id,
        )

        self.current_events.append(event)

    def end_execution(self) -> ProcessedTrace | None:
        """End the current execution and process traces.

        Returns:
            ProcessedTrace object with patterns, or None if no execution active
        """
        # Reason: check trace_enabled first so callers get an explicit warning,
        # then idempotent guard for double-call safety (silent return)
        if not self.trace_enabled:
            logger.warning("Trace storage skipped: tracing disabled")
            return None

        if not self.current_execution_id:
            return None

        if not self.current_events:
            logger.warning("Trace storage skipped: no events collected")
            return None

        try:
            processed_trace = self._process_events()
            self._store_trace(processed_trace)

            # Reset current execution state
            execution_id = self.current_execution_id
            self.current_execution_id = None
            self.current_events = []

            logger.debug(f"Completed trace processing for execution: {execution_id}")
            return processed_trace

        except Exception as e:
            # NOTE(review): on failure the execution state is deliberately kept,
            # so a retry of end_execution() can still process the events.
            logger.error(f"Failed to process trace: {e}")
            return None

    def _process_events(self) -> ProcessedTrace:
        """Process raw events into structured trace data.

        Returns:
            ProcessedTrace with organized data

        Raises:
            ValueError: If no events have been collected.
        """
        if not self.current_events:
            raise ValueError("No events to process")

        # Sort events by timestamp
        sorted_events = sorted(self.current_events, key=lambda e: e.timestamp)

        # Extract different event types
        agent_interactions: list[dict[str, Any]] = []
        tool_calls: list[dict[str, Any]] = []
        coordination_events: list[dict[str, Any]] = []

        for event in sorted_events:
            if event.event_type == "agent_interaction":
                agent_interactions.append(event.data)
            elif event.event_type == "tool_call":
                tool_calls.append(
                    {**event.data, "timestamp": event.timestamp, "agent_id": event.agent_id}
                )
            elif event.event_type == "coordination":
                coordination_events.append(event.data)

        # Calculate performance metrics
        start_time = sorted_events[0].timestamp
        end_time = sorted_events[-1].timestamp
        total_duration = end_time - start_time

        performance_metrics = {
            "total_duration": total_duration,
            "agent_interactions": len(agent_interactions),
            "tool_calls": len(tool_calls),
            "coordination_events": len(coordination_events),
            # max(1, ...) guards against division by zero when no tools were called
            "avg_tool_duration": sum(tc.get("duration", 0) for tc in tool_calls)
            / max(1, len(tool_calls)),
        }

        return ProcessedTrace(
            execution_id=self.current_execution_id or "",
            start_time=start_time,
            end_time=end_time,
            agent_interactions=agent_interactions,
            tool_calls=tool_calls,
            coordination_events=coordination_events,
            performance_metrics=performance_metrics,
        )

    def _store_trace(self, trace: ProcessedTrace) -> None:
        """Store processed trace to JSON file and SQLite database.

        Writes trace to the per-run directory when a RunContext is active,
        otherwise falls back to flat storage under trace_storage_path.

        Args:
            trace: ProcessedTrace to store
        """
        try:
            # Determine target path: per-run directory when active, else flat storage
            from app.utils.run_context import get_active_run_context

            run_ctx = get_active_run_context()
            if run_ctx is not None:
                json_file = run_ctx.trace_path
            else:
                timestamp_str = datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ")
                json_file = self.storage_path / f"trace_{trace.execution_id}_{timestamp_str}.json"

            # Explicit encoding keeps file output platform-independent
            with open(json_file, "w", encoding="utf-8") as f:
                json.dump(asdict(trace), f)

            # Store in SQLite database
            conn = sqlite3.connect(self.db_path)
            try:
                conn.execute(
                    """
                    INSERT OR REPLACE INTO trace_executions
                    (execution_id, start_time, end_time, agent_count,
                     tool_count, total_duration, created_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                """,
                    (
                        trace.execution_id,
                        trace.start_time,
                        trace.end_time,
                        # Count distinct source agents seen in interactions
                        len({ia.get("from", "") for ia in trace.agent_interactions}),
                        len(trace.tool_calls),
                        trace.performance_metrics["total_duration"],
                        datetime.now(UTC).isoformat(),
                    ),
                )

                # Store individual events
                for event in self.current_events:
                    conn.execute(
                        """
                        INSERT INTO trace_events
                        (execution_id, timestamp, event_type, agent_id, data)
                        VALUES (?, ?, ?, ?, ?)
                    """,
                        (
                            event.execution_id,
                            event.timestamp,
                            event.event_type,
                            event.agent_id,
                            json.dumps(event.data),
                        ),
                    )

                conn.commit()
            finally:
                conn.close()

            from app.utils.artifact_registry import get_artifact_registry

            get_artifact_registry().register("Trace", json_file)

            if self.performance_logging:
                logger.info(
                    f"Stored trace {trace.execution_id}: "
                    f"{trace.performance_metrics['total_duration']:.3f}s, "
                    f"{len(trace.agent_interactions)} interactions, "
                    f"{len(trace.tool_calls)} tool calls "
                    f"(storage: {self.storage_path})"
                )

        except Exception as e:
            logger.error(f"Failed to store trace: {e}")

    def _parse_trace_events(
        self, events: list[tuple[float, str, str, str]]
    ) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]]]:
        """Parse database events into categorized lists."""
        agent_interactions: list[dict[str, Any]] = []
        tool_calls: list[dict[str, Any]] = []
        coordination_events: list[dict[str, Any]] = []

        for timestamp, event_type, agent_id, data_json in events:
            data = json.loads(data_json)

            if event_type == "agent_interaction":
                agent_interactions.append({**data, "timestamp": timestamp})
            elif event_type == "tool_call":
                tool_calls.append({**data, "timestamp": timestamp, "agent_id": agent_id})
            elif event_type == "coordination":
                coordination_events.append({**data, "timestamp": timestamp})

        return agent_interactions, tool_calls, coordination_events

    def _build_timing_data(self, execution: tuple[Any, ...]) -> dict[str, Any]:
        """Build timing data from an execution row.

        Row layout matches the trace_executions column order:
        index 1 = start_time, 2 = end_time, 5 = total_duration.
        """
        return {
            "start_time": execution[1],
            "end_time": execution[2],
            "total_duration": execution[5],
        }

    def load_trace(self, execution_id: str) -> GraphTraceData | None:
        """Load a stored trace by execution ID.

        Args:
            execution_id: Execution identifier

        Returns:
            GraphTraceData object or None if not found
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                execution = conn.execute(
                    "SELECT * FROM trace_executions WHERE execution_id = ?",
                    (execution_id,),
                ).fetchone()

                if not execution:
                    return None

                events = conn.execute(
                    """
                    SELECT timestamp, event_type, agent_id, data
                    FROM trace_events
                    WHERE execution_id = ?
                    ORDER BY timestamp
                """,
                    (execution_id,),
                ).fetchall()

                agent_interactions, tool_calls, coordination_events = self._parse_trace_events(
                    events
                )

                timing_data = self._build_timing_data(execution) if events else {}

                return GraphTraceData(
                    execution_id=execution_id,
                    agent_interactions=agent_interactions,
                    tool_calls=tool_calls,
                    timing_data=timing_data,
                    coordination_events=coordination_events,
                )
            finally:
                conn.close()

        except Exception as e:
            logger.error(f"Failed to load trace {execution_id}: {e}")
            return None

    def list_executions(self, limit: int = 50) -> list[dict[str, Any]]:
        """List recent execution traces.

        Args:
            limit: Maximum number of executions to return

        Returns:
            List of execution metadata dictionaries
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                executions = conn.execute(
                    """
                    SELECT execution_id, start_time, end_time, agent_count,
                           tool_count, total_duration, created_at
                    FROM trace_executions
                    ORDER BY created_at DESC
                    LIMIT ?
                """,
                    (limit,),
                ).fetchall()

                return [
                    {
                        "execution_id": row[0],
                        "start_time": row[1],
                        "end_time": row[2],
                        "agent_count": row[3],
                        "tool_count": row[4],
                        "total_duration": row[5],
                        "created_at": row[6],
                    }
                    for row in executions
                ]
            finally:
                conn.close()

        except Exception as e:
            logger.error(f"Failed to list executions: {e}")
            return []
Functions
__init__(settings)

Initialize trace collector with settings.

Parameters:

Name Type Description Default
settings JudgeSettings

JudgeSettings instance with observability configuration.

required
Source code in src/app/judge/trace_processors.py
def __init__(self, settings: JudgeSettings) -> None:
    """Initialize trace collector with settings.

    Args:
        settings: JudgeSettings instance with observability configuration.
    """
    self.settings = settings

    self.trace_enabled = settings.trace_collection
    self.storage_path = Path(settings.trace_storage_path)
    self.performance_logging = settings.performance_logging

    # Ensure storage directory exists
    self.storage_path.mkdir(parents=True, exist_ok=True)

    # Initialize SQLite database
    self.db_path = self.storage_path / TRACES_DB_FILE
    self._init_database()

    # Current execution state
    self.current_execution_id: str | None = None
    self.current_events: list[TraceEvent] = []
end_execution()

End the current execution and process traces.

Returns:

Type Description
ProcessedTrace | None

ProcessedTrace object with patterns, or None if no execution active

Source code in src/app/judge/trace_processors.py
def end_execution(self) -> ProcessedTrace | None:
    """End the current execution and process traces.

    Returns:
        ProcessedTrace object with patterns, or None if no execution active
    """
    # Reason: check trace_enabled first so callers get an explicit warning,
    # then idempotent guard for double-call safety (silent return)
    if not self.trace_enabled:
        logger.warning("Trace storage skipped: tracing disabled")
        return None

    if not self.current_execution_id:
        return None

    if not self.current_events:
        logger.warning("Trace storage skipped: no events collected")
        return None

    try:
        processed_trace = self._process_events()
        self._store_trace(processed_trace)

        # Reset current execution state
        execution_id = self.current_execution_id
        self.current_execution_id = None
        self.current_events = []

        logger.debug(f"Completed trace processing for execution: {execution_id}")
        return processed_trace

    except Exception as e:
        logger.error(f"Failed to process trace: {e}")
        return None
list_executions(limit=50)

List recent execution traces.

Parameters:

Name Type Description Default
limit int

Maximum number of executions to return

50

Returns:

Type Description
list[dict[str, Any]]

List of execution metadata dictionaries

Source code in src/app/judge/trace_processors.py
def list_executions(self, limit: int = 50) -> list[dict[str, Any]]:
    """List recent execution traces.

    Args:
        limit: Maximum number of executions to return

    Returns:
        List of execution metadata dictionaries
    """
    try:
        conn = sqlite3.connect(self.db_path)
        try:
            executions = conn.execute(
                """
                SELECT execution_id, start_time, end_time, agent_count,
                       tool_count, total_duration, created_at
                FROM trace_executions
                ORDER BY created_at DESC
                LIMIT ?
            """,
                (limit,),
            ).fetchall()

            return [
                {
                    "execution_id": row[0],
                    "start_time": row[1],
                    "end_time": row[2],
                    "agent_count": row[3],
                    "tool_count": row[4],
                    "total_duration": row[5],
                    "created_at": row[6],
                }
                for row in executions
            ]
        finally:
            conn.close()

    except Exception as e:
        logger.error(f"Failed to list executions: {e}")
        return []
load_trace(execution_id)

Load a stored trace by execution ID.

Parameters:

Name Type Description Default
execution_id str

Execution identifier

required

Returns:

Type Description
GraphTraceData | None

GraphTraceData object or None if not found

Source code in src/app/judge/trace_processors.py
def load_trace(self, execution_id: str) -> GraphTraceData | None:
    """Load a stored trace by execution ID.

    Args:
        execution_id: Execution identifier

    Returns:
        GraphTraceData object or None if not found
    """
    try:
        conn = sqlite3.connect(self.db_path)
        try:
            execution = conn.execute(
                "SELECT * FROM trace_executions WHERE execution_id = ?",
                (execution_id,),
            ).fetchone()

            if not execution:
                return None

            events = conn.execute(
                """
                SELECT timestamp, event_type, agent_id, data
                FROM trace_events
                WHERE execution_id = ?
                ORDER BY timestamp
            """,
                (execution_id,),
            ).fetchall()

            agent_interactions, tool_calls, coordination_events = self._parse_trace_events(
                events
            )

            timing_data = self._build_timing_data(execution) if events else {}

            return GraphTraceData(
                execution_id=execution_id,
                agent_interactions=agent_interactions,
                tool_calls=tool_calls,
                timing_data=timing_data,
                coordination_events=coordination_events,
            )
        finally:
            conn.close()

    except Exception as e:
        logger.error(f"Failed to load trace {execution_id}: {e}")
        return None
log_agent_interaction(from_agent, to_agent, interaction_type, data)

Log an agent-to-agent interaction.

Parameters:

Name Type Description Default
from_agent str

Source agent identifier

required
to_agent str

Target agent identifier

required
interaction_type str

Type of interaction (task_request, result_delivery, etc.)

required
data dict[str, Any]

Additional interaction data

required
Source code in src/app/judge/trace_processors.py
def log_agent_interaction(
    self,
    from_agent: str,
    to_agent: str,
    interaction_type: str,
    data: dict[str, Any],
) -> None:
    """Log an agent-to-agent interaction.

    Args:
        from_agent: Source agent identifier
        to_agent: Target agent identifier
        interaction_type: Type of interaction (task_request, result_delivery, etc.)
        data: Additional interaction data
    """
    if not self.trace_enabled or not self.current_execution_id:
        return

    event = TraceEvent(
        timestamp=time.perf_counter(),
        event_type="agent_interaction",
        agent_id=from_agent,
        data={"from": from_agent, "to": to_agent, "type": interaction_type, **data},
        execution_id=self.current_execution_id,
    )

    self.current_events.append(event)
log_coordination_event(manager_agent, event_type, target_agents, data)

Log a coordination event (delegation, synchronization, etc.).

Parameters:

Name Type Description Default
manager_agent str

Managing agent identifier

required
event_type str

Type of coordination (delegation, sync, handoff)

required
target_agents list[str]

List of agents involved

required
data dict[str, Any]

Additional coordination data

required
Source code in src/app/judge/trace_processors.py
def log_coordination_event(
    self,
    manager_agent: str,
    event_type: str,
    target_agents: list[str],
    data: dict[str, Any],
) -> None:
    """Log a coordination event (delegation, synchronization, etc.).

    Args:
        manager_agent: Managing agent identifier
        event_type: Type of coordination (delegation, sync, handoff)
        target_agents: List of agents involved
        data: Additional coordination data
    """
    if not self.trace_enabled or not self.current_execution_id:
        return

    event = TraceEvent(
        timestamp=time.perf_counter(),
        event_type="coordination",
        agent_id=manager_agent,
        data={
            "coordination_type": event_type,
            "target_agents": target_agents,
            **data,
        },
        execution_id=self.current_execution_id,
    )

    self.current_events.append(event)
log_tool_call(agent_id, tool_name, success, duration, context='')

Log a tool usage event.

Parameters:

Name Type Description Default
agent_id str

Agent making the tool call

required
tool_name str

Name of the tool used

required
success bool

Whether the tool call was successful

required
duration float

Tool execution duration in seconds

required
context str

Context or purpose of the tool call

''
Source code in src/app/judge/trace_processors.py
def log_tool_call(
    self,
    agent_id: str,
    tool_name: str,
    success: bool,
    duration: float,
    context: str = "",
) -> None:
    """Log a tool usage event.

    Args:
        agent_id: Agent making the tool call
        tool_name: Name of the tool used
        success: Whether the tool call was successful
        duration: Tool execution duration in seconds
        context: Context or purpose of the tool call
    """
    if not self.trace_enabled or not self.current_execution_id:
        return

    event = TraceEvent(
        timestamp=time.perf_counter(),
        event_type="tool_call",
        agent_id=agent_id,
        data={
            "tool_name": tool_name,
            "success": success,
            "duration": duration,
            "context": context,
        },
        execution_id=self.current_execution_id,
    )

    self.current_events.append(event)
start_execution(execution_id)

Start a new execution trace.

Parameters:

Name Type Description Default
execution_id str

Unique identifier for the execution

required
Source code in src/app/judge/trace_processors.py
def start_execution(self, execution_id: str) -> None:
    """Start a new execution trace.

    Args:
        execution_id: Unique identifier for the execution
    """
    if not self.trace_enabled:
        return

    self.current_execution_id = execution_id
    self.current_events = []

    logger.debug(f"Started trace collection for execution: {execution_id}")

TraceEvent dataclass

Individual trace event container.

Source code in src/app/judge/trace_processors.py
@dataclass
class TraceEvent:
    """Individual trace event container."""

    # Monotonic timestamp from time.perf_counter() (set by TraceCollector) —
    # suitable for ordering/durations, not a wall-clock time.
    timestamp: float
    event_type: str  # 'agent_interaction', 'tool_call', 'coordination'
    agent_id: str  # agent that originated the event
    # Event-type-specific payload; serialized with json.dumps when persisted.
    data: dict[str, Any]
    # Execution this event belongs to (foreign key into trace_executions).
    execution_id: str

TraceProcessor

Processes stored traces for graph-based analysis.

Source code in src/app/judge/trace_processors.py
class TraceProcessor:
    """Processes stored traces for graph-based analysis."""

    def __init__(self, collector: TraceCollector):
        """Initialize with a trace collector.

        Args:
            collector: TraceCollector instance
        """
        self.collector = collector

    def process_for_graph_analysis(self, execution_id: str) -> dict[str, Any] | None:
        """Process trace data specifically for graph analysis.

        Args:
            execution_id: Execution to process

        Returns:
            Dictionary with graph-ready data structures, or None when the
            execution is not found in storage.
        """
        trace = self.collector.load_trace(execution_id)
        if not trace:
            return None

        # Flatten the loaded trace object into a plain dict for graph builders.
        return {
            field_name: getattr(trace, field_name)
            for field_name in (
                "agent_interactions",
                "tool_calls",
                "coordination_events",
                "timing_data",
                "execution_id",
            )
        }
Functions
__init__(collector)

Initialize with a trace collector.

Parameters:

Name Type Description Default
collector TraceCollector

TraceCollector instance

required
Source code in src/app/judge/trace_processors.py
def __init__(self, collector: TraceCollector):
    """Initialize with a trace collector.

    Args:
        collector: TraceCollector instance
    """
    self.collector = collector
process_for_graph_analysis(execution_id)

Process trace data specifically for graph analysis.

Parameters:

Name Type Description Default
execution_id str

Execution to process

required

Returns:

Type Description
dict[str, Any] | None

Dictionary with graph-ready data structures

Source code in src/app/judge/trace_processors.py
def process_for_graph_analysis(self, execution_id: str) -> dict[str, Any] | None:
    """Process trace data specifically for graph analysis.

    Args:
        execution_id: Execution to process

    Returns:
        Dictionary with graph-ready data structures
    """
    trace_data = self.collector.load_trace(execution_id)
    if not trace_data:
        return None

    return {
        "agent_interactions": trace_data.agent_interactions,
        "tool_calls": trace_data.tool_calls,
        "coordination_events": trace_data.coordination_events,
        "timing_data": trace_data.timing_data,
        "execution_id": trace_data.execution_id,
    }

Functions

get_trace_collector(settings=None)

Get or create the global trace collector instance.

Parameters:

Name Type Description Default
settings JudgeSettings | None

JudgeSettings instance. If None, uses defaults.

None

Returns:

Type Description
TraceCollector

TraceCollector instance

Source code in src/app/judge/trace_processors.py
def get_trace_collector(settings: JudgeSettings | None = None) -> TraceCollector:
    """Get or create the global trace collector instance.

    Args:
        settings: JudgeSettings instance. If None, uses defaults.
            Ignored when a global collector already exists.

    Returns:
        TraceCollector instance
    """
    global _global_collector

    # Fast path: reuse the existing singleton.
    if _global_collector is not None:
        return _global_collector

    if settings is None:
        from app.config.judge_settings import JudgeSettings

        settings = JudgeSettings()

    _global_collector = TraceCollector(settings)
    return _global_collector

trace_execution(execution_id)

Decorator for automatic execution tracing.

Parameters:

Name Type Description Default
execution_id str

Unique identifier for the execution

required
Usage

@trace_execution("paper_001_evaluation") def evaluate_paper(): # Execution will be automatically traced pass

Source code in src/app/judge/trace_processors.py
def trace_execution(execution_id: str) -> Any:
    """Decorator for automatic execution tracing.

    Starts a trace before the wrapped function runs and finalizes it
    whether the function returns normally or raises.

    Args:
        execution_id: Unique identifier for the execution

    Usage:
        @trace_execution("paper_001_evaluation")
        def evaluate_paper():
            # Execution will be automatically traced
            pass
    """
    from functools import wraps

    def decorator(func: Any) -> Any:
        # wraps preserves the wrapped function's name/docstring for introspection
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            collector = get_trace_collector()
            collector.start_execution(execution_id)

            # try/finally finalizes the trace on both success and error while
            # letting any exception propagate unchanged (no `raise e` rewrap).
            try:
                return func(*args, **kwargs)
            finally:
                collector.end_execution()

        return wrapper

    return decorator

app.judge.traditional_metrics

Traditional metrics implementation for Tier 1 evaluation.

Provides fast, lightweight text similarity and execution metrics using minimal dependencies with <1s performance target.

Classes

SimilarityScores dataclass

Container for similarity metric results.

Source code in src/app/judge/traditional_metrics.py
@dataclass
class SimilarityScores:
    """Container for similarity metric results.

    All fields hold normalized similarity scores in [0.0, 1.0].
    """

    cosine: float  # TF-IDF cosine similarity
    jaccard: float  # word-set Jaccard similarity
    semantic: float  # BERTScore-based (or fallback) semantic similarity
    levenshtein: float = 0.0  # Optional for backward compatibility

TraditionalMetricsEngine

Lightweight traditional metrics engine for fast evaluation.

Implements text similarity metrics using minimal computational resources with performance targets under 1 second for typical academic reviews.

Source code in src/app/judge/traditional_metrics.py
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
class TraditionalMetricsEngine:
    """Lightweight traditional metrics engine for fast evaluation.

    Implements text similarity metrics using minimal computational resources
    with performance targets under 1 second for typical academic reviews.
    """

    # Reason: Class-level cache so BERTScorer init failure (e.g. read-only FS)
    # is not retried on every new engine instance.
    _bertscore_instance = None
    _bertscore_init_failed = False

    def __init__(self):
        """Initialize metrics engine with cached components.

        Uses lazy loading for computationally expensive components
        to minimize startup time and memory usage.
        """
        # NOTE(review): this vectorizer is currently unused —
        # compute_cosine_similarity builds its own instance per call. Kept
        # for backward compatibility in case external code reads _vectorizer;
        # consider removing or actually reusing it in compute_cosine_similarity.
        self._vectorizer = TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            max_features=5000,  # Limit for performance
        )

    def _get_bertscore_model(self):
        """Lazy-load BERTScorer instance for semantic similarity.

        Returns:
            BERTScorer instance if available, None if bert-score not installed or init failed.
        """
        if TraditionalMetricsEngine._bertscore_instance is not None:
            return TraditionalMetricsEngine._bertscore_instance
        if TraditionalMetricsEngine._bertscore_init_failed or BERTScorer is None:
            return None
        try:
            TraditionalMetricsEngine._bertscore_instance = BERTScorer(
                model_type="distilbert-base-uncased", lang="en"
            )
            return TraditionalMetricsEngine._bertscore_instance
        except Exception as e:
            logger.warning(f"BERTScore initialization failed: {e}")
            TraditionalMetricsEngine._bertscore_init_failed = True
            return None

    def _compute_word_overlap_fallback(self, text1: str, text2: str) -> float:
        """Fallback to simple word overlap (Jaccard on word sets) when TF-IDF fails."""
        words1 = set(re.findall(r"\w+", text1.lower()))
        words2 = set(re.findall(r"\w+", text2.lower()))

        if not words1 or not words2:
            return 0.0

        intersection = len(words1 & words2)
        union = len(words1 | words2)

        return intersection / union if union > 0 else 0.0

    def compute_cosine_similarity(self, text1: str, text2: str) -> float:
        """Compute TF-IDF cosine similarity with enhanced error handling.

        Args:
            text1: Agent-generated review text
            text2: Reference review text

        Returns:
            Similarity score between 0.0 and 1.0

        Performance: ~50ms for typical review lengths
        """
        # Edge cases: two empty texts are identical; one empty text matches nothing.
        if not text1.strip() and not text2.strip():
            return 1.0
        if not text1.strip() or not text2.strip():
            return 0.0

        try:
            vectorizer = TfidfVectorizer(stop_words="english", lowercase=True, max_features=1000)
            texts = [text1, text2]
            tfidf_matrix = vectorizer.fit_transform(texts)
            dense_matrix = tfidf_matrix.toarray()  # type: ignore[union-attr]
            similarity_matrix = cosine_similarity(dense_matrix[0:1], dense_matrix[1:2])
            # Reason: clamp to [0, 1] and cast to a plain float — sklearn's
            # cosine_similarity can return 1.0000000000000002 due to
            # floating-point precision and yields numpy scalars, which
            # violated this method's documented [0.0, 1.0] float contract.
            return float(min(1.0, max(0.0, similarity_matrix[0][0])))

        except Exception as e:
            logger.warning(f"TF-IDF cosine similarity failed: {e}")
            try:
                return self._compute_word_overlap_fallback(text1, text2)
            except Exception:
                logger.warning("Cosine similarity calculation failed completely")
                return 0.0

    def _compute_jaccard_basic(self, text1: str, text2: str) -> float:
        """Basic word-based Jaccard implementation (whitespace tokenization)."""
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())

        if len(words1) == 0 and len(words2) == 0:
            return 1.0

        intersection = len(words1.intersection(words2))
        union = len(words1.union(words2))
        return intersection / union if union > 0 else 0.0

    def _compute_jaccard_regex_fallback(self, text1: str, text2: str) -> float:
        """Regex-based Jaccard fallback (``\\w+`` tokenization)."""
        words1 = set(re.findall(r"\w+", text1.lower()))
        words2 = set(re.findall(r"\w+", text2.lower()))

        if not words1 and not words2:
            return 1.0

        intersection = len(words1 & words2)
        union = len(words1 | words2)
        return intersection / union if union > 0 else 0.0

    def compute_jaccard_similarity(self, text1: str, text2: str, enhanced: bool = False) -> float:
        """Compute Jaccard similarity with optional textdistance enhancement.

        Args:
            text1: Agent-generated review text
            text2: Reference review text
            enhanced: Use textdistance library for robust calculation

        Returns:
            Similarity score between 0.0 and 1.0

        Performance: ~10ms for typical review lengths
        """
        if not text1.strip() and not text2.strip():
            return 1.0
        if not text1.strip() or not text2.strip():
            return 0.0

        if enhanced:
            try:
                return float(
                    textdistance.jaccard.normalized_similarity(text1.lower(), text2.lower())
                )
            except Exception as e:
                # Fall through to the basic implementation on any failure.
                logger.warning(f"Enhanced Jaccard similarity failed: {e}")

        try:
            return self._compute_jaccard_basic(text1, text2)
        except Exception as e:
            logger.warning(f"Jaccard similarity calculation failed: {e}")
            try:
                return self._compute_jaccard_regex_fallback(text1, text2)
            except Exception:
                return 0.0

    def _compute_char_overlap_fallback(self, text1: str, text2: str) -> float:
        """Fallback to simple character-set overlap when Levenshtein fails."""
        text1_clean = text1.lower().strip()
        text2_clean = text2.lower().strip()

        if text1_clean == text2_clean:
            return 1.0

        chars1 = set(text1_clean)
        chars2 = set(text2_clean)
        intersection = len(chars1 & chars2)
        union = len(chars1 | chars2)

        return intersection / union if union > 0 else 0.0

    def compute_levenshtein_similarity(self, text1: str, text2: str) -> float:
        """Compute Levenshtein (edit distance) similarity using textdistance.

        Args:
            text1: Agent-generated review text
            text2: Reference review text

        Returns:
            Normalized Levenshtein similarity score between 0.0 and 1.0

        Performance: ~20ms for typical review lengths
        """
        if not text1.strip() and not text2.strip():
            return 1.0
        if not text1.strip() or not text2.strip():
            return 0.0

        try:
            return float(
                textdistance.levenshtein.normalized_similarity(text1.lower(), text2.lower())
            )
        except Exception as e:
            logger.warning(f"Levenshtein similarity calculation failed: {e}")
            try:
                return self._compute_char_overlap_fallback(text1, text2)
            except Exception:
                return 0.0

    def compute_semantic_similarity(self, text1: str, text2: str) -> float:
        """Compute semantic similarity using BERTScore with Levenshtein fallback.

        Args:
            text1: Agent-generated review text
            text2: Reference review text

        Returns:
            Similarity score between 0.0 and 1.0

        Performance: ~200ms with BERTScore, ~20ms with Levenshtein fallback
        """
        if not text1.strip() and not text2.strip():
            return 1.0
        if not text1.strip() or not text2.strip():
            return 0.0

        scorer = self._get_bertscore_model()
        if scorer is not None:
            try:
                _, _, f1 = scorer.score([text1], [text2])
                return float(f1.mean().item())  # type: ignore[union-attr]
            except Exception as e:
                logger.warning(f"BERTScore computation failed, falling back to Levenshtein: {e}")

        return self.compute_levenshtein_similarity(text1, text2)

    def measure_execution_time(self, start_time: float, end_time: float) -> float:
        """Calculate execution time with normalization for scoring.

        Args:
            start_time: Start timestamp (from time.perf_counter())
            end_time: End timestamp (from time.perf_counter())

        Returns:
            Normalized time score for composite scoring (0.0-1.0)
        """
        duration = max(0.001, end_time - start_time)  # Minimum 1ms

        # Normalize using exponential decay: faster is better
        # Formula: exp(-duration) with max at 1.0 for very fast execution
        normalized_score = math.exp(-duration)
        return max(0.0, min(1.0, normalized_score))

    def assess_task_success(
        self, similarity_scores: SimilarityScores, threshold: float = 0.8
    ) -> float:
        """Assess task completion success with continuous proportional scoring.

        Returns a continuous score in [0.0, 1.0] rather than a binary result.
        When weighted similarity meets or exceeds the threshold, returns 1.0.
        When below, returns proportional credit (weighted_similarity / threshold).
        When threshold is 0.0, returns 0.0 to avoid division by zero.

        Args:
            similarity_scores: Container with semantic, cosine, jaccard scores
            threshold: Similarity value representing full credit (from config)

        Returns:
            Continuous float in [0.0, 1.0]; 1.0 when similarity >= threshold,
            weighted_similarity / threshold when below, 0.0 when threshold is 0.
        """
        try:
            # Weighted average of similarity metrics
            weights = {"semantic": 0.5, "cosine": 0.3, "jaccard": 0.2}

            overall_similarity = (
                similarity_scores.semantic * weights["semantic"]
                + similarity_scores.cosine * weights["cosine"]
                + similarity_scores.jaccard * weights["jaccard"]
            )

            return min(1.0, overall_similarity / threshold) if threshold > 0.0 else 0.0

        except Exception as e:
            logger.warning(f"Task success assessment failed: {e}")
            return 0.0

    def compute_all_similarities(
        self, agent_output: str, reference_text: str, enhanced: bool = False
    ) -> SimilarityScores:
        """Compute all similarity metrics for a single reference.

        Args:
            agent_output: Generated review text
            reference_text: Single ground truth review
            enhanced: Enable enhanced similarity features (textdistance)

        Returns:
            SimilarityScores container with all computed metrics
        """
        cosine_score = self.compute_cosine_similarity(agent_output, reference_text)
        jaccard_score = self.compute_jaccard_similarity(
            agent_output, reference_text, enhanced=enhanced
        )
        semantic_score = self.compute_semantic_similarity(agent_output, reference_text)

        # Add Levenshtein similarity when enhanced mode is enabled
        levenshtein_score = 0.0
        if enhanced:
            levenshtein_score = self.compute_levenshtein_similarity(agent_output, reference_text)

        return SimilarityScores(
            cosine=cosine_score,
            jaccard=jaccard_score,
            semantic=semantic_score,
            levenshtein=levenshtein_score,
        )

    def find_best_match(
        self, agent_output: str, reference_texts: list[str], enhanced: bool = False
    ) -> SimilarityScores:
        """Find best matching reference and return its similarity scores.

        Args:
            agent_output: Generated review text
            reference_texts: List of ground truth reviews
            enhanced: Enable enhanced similarity features

        Returns:
            Best similarity scores across all reference texts
        """
        if not reference_texts:
            return SimilarityScores(cosine=0.0, jaccard=0.0, semantic=0.0, levenshtein=0.0)

        all_scores = [
            self.compute_all_similarities(agent_output, ref, enhanced=enhanced)
            for ref in reference_texts
        ]

        # Take maximum score for each metric (best match approach); note the
        # per-metric maxima may come from different references.
        best_cosine = max(scores.cosine for scores in all_scores)
        best_jaccard = max(scores.jaccard for scores in all_scores)
        best_semantic = max(scores.semantic for scores in all_scores)
        best_levenshtein = max(scores.levenshtein for scores in all_scores) if enhanced else 0.0

        return SimilarityScores(
            cosine=best_cosine,
            jaccard=best_jaccard,
            semantic=best_semantic,
            levenshtein=best_levenshtein,
        )

    def evaluate_traditional_metrics(
        self,
        agent_output: str,
        reference_texts: list[str],
        start_time: float,
        end_time: float,
        settings: JudgeSettings | None = None,
    ) -> Tier1Result:
        """Complete traditional metrics evaluation.

        Args:
            agent_output: Generated review text
            reference_texts: List of ground truth reviews
            start_time: Execution start timestamp
            end_time: Execution end timestamp
            settings: JudgeSettings instance. If None, uses defaults.

        Returns:
            Tier1Result with all traditional metrics
        """
        # Find best similarity scores across all references
        best_scores = self.find_best_match(agent_output, reference_texts)

        # Reason: defense-in-depth clamp to [0, 1] (tests-review C1);
        # compute_cosine_similarity now clamps at the source as well.
        cosine_score = min(1.0, max(0.0, best_scores.cosine))
        semantic_score = min(1.0, max(0.0, best_scores.semantic))

        # Calculate execution metrics
        confidence_threshold = settings.tier1_confidence_threshold if settings else 0.8
        time_score = self.measure_execution_time(start_time, end_time)
        task_success = self.assess_task_success(best_scores, confidence_threshold)

        # Calculate weighted overall score
        overall_score = min(
            1.0,
            max(
                0.0,
                (
                    semantic_score * 0.4
                    + cosine_score * 0.3
                    + best_scores.jaccard * 0.2
                    + time_score * 0.1
                ),
            ),
        )

        return Tier1Result(
            cosine_score=cosine_score,
            jaccard_score=best_scores.jaccard,
            semantic_score=semantic_score,
            execution_time=end_time - start_time,
            time_score=time_score,
            task_success=task_success,
            overall_score=overall_score,
        )

    def evaluate_enhanced_similarity(
        self,
        agent_output: str,
        reference_texts: list[str],
        config_weights: dict[str, float] | None = None,
    ) -> float:
        """Enhanced multi-metric evaluation with config-driven weighting.

        This method provides enhanced similarity evaluation with:
        - Levenshtein similarity calculation
        - Config-driven weighting system
        - Enhanced error fallbacks
        - Multi-metric weighted combination

        Args:
            agent_output: Generated review text
            reference_texts: List of ground truth reviews
            config_weights: Optional weight configuration for metrics

        Returns:
            Weighted overall similarity score (0-1)
        """
        try:
            # Default balanced weights
            default_weights = {
                "cosine_weight": 0.4,
                "jaccard_weight": 0.4,
                "semantic_weight": 0.2,
            }

            weights = config_weights or default_weights

            # Find best matching scores with enhanced features enabled
            best_scores = self.find_best_match(agent_output, reference_texts, enhanced=True)

            # Calculate multiple similarity metrics
            cosine_sim = best_scores.cosine
            jaccard_sim = best_scores.jaccard
            levenshtein_sim = best_scores.levenshtein

            # Weighted combination using config weights
            cosine_weight = weights.get("cosine_weight", 0.4)
            jaccard_weight = weights.get("jaccard_weight", 0.4)
            semantic_weight = weights.get("semantic_weight", 0.2)

            # NOTE(review): "semantic_weight" is applied to the Levenshtein
            # score here (BERTScore is not computed in this path) — presumably
            # intentional to keep this method cheap; confirm against callers.
            weighted_score = (
                cosine_sim * cosine_weight
                + jaccard_sim * jaccard_weight
                + levenshtein_sim * semantic_weight
            )

            return min(1.0, max(0.0, weighted_score))

        except Exception as e:
            logger.warning(f"Enhanced similarity evaluation failed: {e}")
            # Fallback to basic Jaccard similarity
            try:
                basic_scores = self.find_best_match(agent_output, reference_texts)
                return basic_scores.jaccard
            except Exception:
                return 0.0
Functions
__init__()

Initialize metrics engine with cached components.

Uses lazy loading for computationally expensive components to minimize startup time and memory usage.

Source code in src/app/judge/traditional_metrics.py
def __init__(self):
    """Initialize metrics engine with cached components.

    Uses lazy loading for computationally expensive components
    to minimize startup time and memory usage.
    """
    self._vectorizer = TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),
        max_features=5000,  # Limit for performance
    )
assess_task_success(similarity_scores, threshold=0.8)

Assess task completion success with continuous proportional scoring.

Returns a continuous score in [0.0, 1.0] rather than a binary result. When weighted similarity meets or exceeds the threshold, returns 1.0. When below, returns proportional credit (weighted_similarity / threshold). When threshold is 0.0, returns 0.0 to avoid division by zero.

Parameters:

Name Type Description Default
similarity_scores SimilarityScores

Container with semantic, cosine, jaccard scores

required
threshold float

Similarity value representing full credit (from config)

0.8

Returns:

Type Description
float

Continuous float in [0.0, 1.0]; 1.0 when similarity >= threshold,

float

weighted_similarity / threshold when below, 0.0 when threshold is 0.

Source code in src/app/judge/traditional_metrics.py
def assess_task_success(
    self, similarity_scores: SimilarityScores, threshold: float = 0.8
) -> float:
    """Assess task completion success with continuous proportional scoring.

    Returns a continuous score in [0.0, 1.0] rather than a binary result.
    When weighted similarity meets or exceeds the threshold, returns 1.0.
    When below, returns proportional credit (weighted_similarity / threshold).
    When threshold is 0.0, returns 0.0 to avoid division by zero.

    Args:
        similarity_scores: Container with semantic, cosine, jaccard scores
        threshold: Similarity value representing full credit (from config)

    Returns:
        Continuous float in [0.0, 1.0]; 1.0 when similarity >= threshold,
        weighted_similarity / threshold when below, 0.0 when threshold is 0.
    """
    try:
        # Weighted average of similarity metrics
        weights = {"semantic": 0.5, "cosine": 0.3, "jaccard": 0.2}

        overall_similarity = (
            similarity_scores.semantic * weights["semantic"]
            + similarity_scores.cosine * weights["cosine"]
            + similarity_scores.jaccard * weights["jaccard"]
        )

        return min(1.0, overall_similarity / threshold) if threshold > 0.0 else 0.0

    except Exception as e:
        logger.warning(f"Task success assessment failed: {e}")
        return 0.0
compute_all_similarities(agent_output, reference_text, enhanced=False)

Compute all similarity metrics for a single reference.

Parameters:

Name Type Description Default
agent_output str

Generated review text

required
reference_text str

Single ground truth review

required
enhanced bool

Enable enhanced similarity features (textdistance)

False

Returns:

Type Description
SimilarityScores

SimilarityScores container with all computed metrics

Source code in src/app/judge/traditional_metrics.py
def compute_all_similarities(
    self, agent_output: str, reference_text: str, enhanced: bool = False
) -> SimilarityScores:
    """Compute all similarity metrics for a single reference.

    Args:
        agent_output: Generated review text
        reference_text: Single ground truth review
        enhanced: Enable enhanced similarity features (textdistance)

    Returns:
        SimilarityScores container with all computed metrics
    """
    cosine_score = self.compute_cosine_similarity(agent_output, reference_text)
    jaccard_score = self.compute_jaccard_similarity(
        agent_output, reference_text, enhanced=enhanced
    )
    semantic_score = self.compute_semantic_similarity(agent_output, reference_text)

    # Add Levenshtein similarity when enhanced mode is enabled
    levenshtein_score = 0.0
    if enhanced:
        levenshtein_score = self.compute_levenshtein_similarity(agent_output, reference_text)

    return SimilarityScores(
        cosine=cosine_score,
        jaccard=jaccard_score,
        semantic=semantic_score,
        levenshtein=levenshtein_score,
    )
compute_cosine_similarity(text1, text2)

Compute TF-IDF cosine similarity with enhanced error handling.

Parameters:

Name Type Description Default
text1 str

Agent-generated review text

required
text2 str

Reference review text

required

Returns:

Type Description
float

Similarity score between 0.0 and 1.0

Performance: ~50ms for typical review lengths

Source code in src/app/judge/traditional_metrics.py
def compute_cosine_similarity(self, text1: str, text2: str) -> float:
    """Compute TF-IDF cosine similarity with enhanced error handling.

    Args:
        text1: Agent-generated review text
        text2: Reference review text

    Returns:
        Similarity score between 0.0 and 1.0

    Performance: ~50ms for typical review lengths
    """
    if not text1.strip() and not text2.strip():
        return 1.0
    if not text1.strip() or not text2.strip():
        return 0.0

    try:
        vectorizer = TfidfVectorizer(stop_words="english", lowercase=True, max_features=1000)
        texts = [text1, text2]
        tfidf_matrix = vectorizer.fit_transform(texts)
        dense_matrix = tfidf_matrix.toarray()  # type: ignore[union-attr]
        similarity_matrix = cosine_similarity(dense_matrix[0:1], dense_matrix[1:2])
        score: float = similarity_matrix[0][0]  # type: ignore[assignment]
        return score

    except Exception as e:
        logger.warning(f"TF-IDF cosine similarity failed: {e}")
        try:
            return self._compute_word_overlap_fallback(text1, text2)
        except Exception:
            logger.warning("Cosine similarity calculation failed completely")
            return 0.0
compute_jaccard_similarity(text1, text2, enhanced=False)

Compute Jaccard similarity with optional textdistance enhancement.

Parameters:

Name Type Description Default
text1 str

Agent-generated review text

required
text2 str

Reference review text

required
enhanced bool

Use textdistance library for robust calculation

False

Returns:

Type Description
float

Similarity score between 0.0 and 1.0

Performance: ~10ms for typical review lengths

Source code in src/app/judge/traditional_metrics.py
def compute_jaccard_similarity(self, text1: str, text2: str, enhanced: bool = False) -> float:
    """Compute Jaccard similarity with optional textdistance enhancement.

    Args:
        text1: Agent-generated review text
        text2: Reference review text
        enhanced: Use textdistance library for robust calculation

    Returns:
        Similarity score between 0.0 and 1.0

    Performance: ~10ms for typical review lengths
    """
    if not text1.strip() and not text2.strip():
        return 1.0
    if not text1.strip() or not text2.strip():
        return 0.0

    if enhanced:
        try:
            return float(
                textdistance.jaccard.normalized_similarity(text1.lower(), text2.lower())
            )
        except Exception as e:
            logger.warning(f"Enhanced Jaccard similarity failed: {e}")

    try:
        return self._compute_jaccard_basic(text1, text2)
    except Exception as e:
        logger.warning(f"Jaccard similarity calculation failed: {e}")
        try:
            return self._compute_jaccard_regex_fallback(text1, text2)
        except Exception:
            return 0.0
compute_levenshtein_similarity(text1, text2)

Compute Levenshtein (edit distance) similarity using textdistance.

Parameters:

Name Type Description Default
text1 str

Agent-generated review text

required
text2 str

Reference review text

required

Returns:

Type Description
float

Normalized Levenshtein similarity score between 0.0 and 1.0

Performance: ~20ms for typical review lengths

Source code in src/app/judge/traditional_metrics.py
def compute_levenshtein_similarity(self, text1: str, text2: str) -> float:
    """Compute Levenshtein (edit distance) similarity using textdistance.

    Args:
        text1: Agent-generated review text
        text2: Reference review text

    Returns:
        Normalized Levenshtein similarity score between 0.0 and 1.0

    Performance: ~20ms for typical review lengths
    """
    if not text1.strip() and not text2.strip():
        return 1.0
    if not text1.strip() or not text2.strip():
        return 0.0

    try:
        return float(
            textdistance.levenshtein.normalized_similarity(text1.lower(), text2.lower())
        )
    except Exception as e:
        logger.warning(f"Levenshtein similarity calculation failed: {e}")
        try:
            return self._compute_char_overlap_fallback(text1, text2)
        except Exception:
            return 0.0
compute_semantic_similarity(text1, text2)

Compute semantic similarity using BERTScore with Levenshtein fallback.

Parameters:

Name Type Description Default
text1 str

Agent-generated review text

required
text2 str

Reference review text

required

Returns:

Type Description
float

Similarity score between 0.0 and 1.0

Performance: ~200ms with BERTScore, ~20ms with Levenshtein fallback

Source code in src/app/judge/traditional_metrics.py
def compute_semantic_similarity(self, text1: str, text2: str) -> float:
    """Compute semantic similarity using BERTScore with Levenshtein fallback.

    Args:
        text1: Agent-generated review text
        text2: Reference review text

    Returns:
        Similarity score between 0.0 and 1.0

    Performance: ~200ms with BERTScore, ~20ms with Levenshtein fallback
    """
    if not text1.strip() and not text2.strip():
        return 1.0
    if not text1.strip() or not text2.strip():
        return 0.0

    scorer = self._get_bertscore_model()
    if scorer is not None:
        try:
            _, _, f1 = scorer.score([text1], [text2])
            return float(f1.mean().item())  # type: ignore[union-attr]
        except Exception as e:
            logger.warning(f"BERTScore computation failed, falling back to Levenshtein: {e}")

    return self.compute_levenshtein_similarity(text1, text2)
evaluate_enhanced_similarity(agent_output, reference_texts, config_weights=None)

Enhanced multi-metric evaluation with config-driven weighting.

This method provides enhanced similarity evaluation with: - Levenshtein similarity calculation - Config-driven weighting system - Enhanced error fallbacks - Multi-metric weighted combination

Parameters:

Name Type Description Default
agent_output str

Generated review text

required
reference_texts list[str]

List of ground truth reviews

required
config_weights dict[str, float] | None

Optional weight configuration for metrics

None

Returns:

Type Description
float

Weighted overall similarity score (0-1)

Source code in src/app/judge/traditional_metrics.py
def evaluate_enhanced_similarity(
    self,
    agent_output: str,
    reference_texts: list[str],
    config_weights: dict[str, float] | None = None,
) -> float:
    """Enhanced multi-metric evaluation with config-driven weighting.

    This method provides enhanced similarity evaluation with:
    - Levenshtein similarity calculation
    - Config-driven weighting system
    - Enhanced error fallbacks
    - Multi-metric weighted combination

    Args:
        agent_output: Generated review text
        reference_texts: List of ground truth reviews
        config_weights: Optional weight configuration for metrics
            (keys: "cosine_weight", "jaccard_weight", "semantic_weight")

    Returns:
        Weighted overall similarity score (0-1)
    """
    try:
        # Default balanced weights
        default_weights = {
            "cosine_weight": 0.4,
            "jaccard_weight": 0.4,
            "semantic_weight": 0.2,
        }

        # NOTE: a caller-supplied dict replaces the defaults wholesale;
        # missing keys fall back via .get() below, not via merging.
        weights = config_weights or default_weights

        # Find best matching scores with enhanced features enabled
        best_scores = self.find_best_match(agent_output, reference_texts, enhanced=True)

        # Calculate multiple similarity metrics
        cosine_sim = best_scores.cosine
        jaccard_sim = best_scores.jaccard
        levenshtein_sim = best_scores.levenshtein

        # Weighted combination using config weights
        cosine_weight = weights.get("cosine_weight", 0.4)
        jaccard_weight = weights.get("jaccard_weight", 0.4)
        semantic_weight = weights.get("semantic_weight", 0.2)

        # NOTE(review): "semantic_weight" is applied to the Levenshtein score
        # below, not to a semantic/BERTScore metric — confirm the config key
        # naming is intentional before relying on it.
        # Calculate weighted average
        weighted_score = (
            cosine_sim * cosine_weight
            + jaccard_sim * jaccard_weight
            + levenshtein_sim * semantic_weight
        )

        # Clamp into [0, 1] in case supplied weights sum to more than 1.
        return min(1.0, max(0.0, weighted_score))

    except Exception as e:
        logger.warning(f"Enhanced similarity evaluation failed: {e}")
        # Fallback to basic Jaccard similarity
        try:
            basic_scores = self.find_best_match(agent_output, reference_texts)
            return basic_scores.jaccard
        except Exception:
            # Last-resort default when even the fallback match fails.
            return 0.0
evaluate_traditional_metrics(agent_output, reference_texts, start_time, end_time, settings=None)

Complete traditional metrics evaluation.

Parameters:

Name Type Description Default
agent_output str

Generated review text

required
reference_texts list[str]

List of ground truth reviews

required
start_time float

Execution start timestamp

required
end_time float

Execution end timestamp

required
settings JudgeSettings | None

JudgeSettings instance. If None, uses defaults.

None

Returns:

Type Description
Tier1Result

Tier1Result with all traditional metrics

Source code in src/app/judge/traditional_metrics.py
def evaluate_traditional_metrics(
    self,
    agent_output: str,
    reference_texts: list[str],
    start_time: float,
    end_time: float,
    settings: JudgeSettings | None = None,
) -> Tier1Result:
    """Run the full Tier-1 (traditional) metric suite for one output.

    Args:
        agent_output: Generated review text
        reference_texts: List of ground truth reviews
        start_time: Execution start timestamp
        end_time: Execution end timestamp
        settings: JudgeSettings instance. If None, uses defaults.

    Returns:
        Tier1Result with all traditional metrics
    """

    def _clamp(value: float) -> float:
        # Reason: TF-IDF + sklearn cosine_similarity can exceed 1.0 by a few
        # ULPs (e.g. 1.0000000000000002), so clamp into [0, 1]
        # (tests-review C1).
        return min(1.0, max(0.0, value))

    # Best per-metric similarity across all references.
    best_scores = self.find_best_match(agent_output, reference_texts)

    cosine_score = _clamp(best_scores.cosine)
    semantic_score = _clamp(best_scores.semantic)

    # Execution metrics: normalized timing plus success assessment.
    threshold = settings.tier1_confidence_threshold if settings else 0.8
    time_score = self.measure_execution_time(start_time, end_time)
    task_success = self.assess_task_success(best_scores, threshold)

    # Composite: semantic 0.4, cosine 0.3, jaccard 0.2, time 0.1.
    weighted = (
        semantic_score * 0.4
        + cosine_score * 0.3
        + best_scores.jaccard * 0.2
        + time_score * 0.1
    )
    overall_score = _clamp(weighted)

    return Tier1Result(
        cosine_score=cosine_score,
        jaccard_score=best_scores.jaccard,
        semantic_score=semantic_score,
        execution_time=end_time - start_time,
        time_score=time_score,
        task_success=task_success,
        overall_score=overall_score,
    )
find_best_match(agent_output, reference_texts, enhanced=False)

Find best matching reference and return its similarity scores.

Parameters:

Name Type Description Default
agent_output str

Generated review text

required
reference_texts list[str]

List of ground truth reviews

required
enhanced bool

Enable enhanced similarity features

False

Returns:

Type Description
SimilarityScores

Best similarity scores across all reference texts

Source code in src/app/judge/traditional_metrics.py
def find_best_match(
    self, agent_output: str, reference_texts: list[str], enhanced: bool = False
) -> SimilarityScores:
    """Return the per-metric maxima over all reference texts.

    Each metric is maximised independently, so the result may combine scores
    from different references (best-match-per-metric semantics).

    Args:
        agent_output: Generated review text
        reference_texts: List of ground truth reviews
        enhanced: Enable enhanced similarity features

    Returns:
        Best similarity scores across all reference texts
    """
    if not reference_texts:
        return SimilarityScores(cosine=0.0, jaccard=0.0, semantic=0.0, levenshtein=0.0)

    per_reference = [
        self.compute_all_similarities(agent_output, reference, enhanced=enhanced)
        for reference in reference_texts
    ]

    # Levenshtein is only computed in enhanced mode; report 0.0 otherwise.
    levenshtein_best = 0.0
    if enhanced:
        levenshtein_best = max(s.levenshtein for s in per_reference)

    return SimilarityScores(
        cosine=max(s.cosine for s in per_reference),
        jaccard=max(s.jaccard for s in per_reference),
        semantic=max(s.semantic for s in per_reference),
        levenshtein=levenshtein_best,
    )
measure_execution_time(start_time, end_time)

Calculate execution time with normalization for scoring.

Parameters:

Name Type Description Default
start_time float

Start timestamp (from time.perf_counter())

required
end_time float

End timestamp (from time.perf_counter())

required

Returns:

Type Description
float

Normalized time score for composite scoring (0.0-1.0)

Source code in src/app/judge/traditional_metrics.py
def measure_execution_time(self, start_time: float, end_time: float) -> float:
    """Convert an elapsed wall-clock interval into a 0.0-1.0 score.

    Args:
        start_time: Start timestamp (from time.perf_counter())
        end_time: End timestamp (from time.perf_counter())

    Returns:
        Normalized time score for composite scoring (0.0-1.0)
    """
    # Floor the duration at 1ms so exp() never sees zero or a negative span.
    elapsed = end_time - start_time
    if elapsed < 0.001:
        elapsed = 0.001

    # Exponential decay rewards fast runs: score approaches 1.0 as elapsed -> 0.
    score = math.exp(-elapsed)
    return max(0.0, min(1.0, score))

Functions

calculate_cosine_similarity(text1, text2)

Calculate cosine similarity between two texts.

Convenience wrapper for compute_cosine_similarity. Handles empty strings gracefully.

Parameters:

Name Type Description Default
text1 str

First text to compare

required
text2 str

Second text to compare

required

Returns:

Type Description
float

Cosine similarity score (0-1)

Source code in src/app/judge/traditional_metrics.py
def calculate_cosine_similarity(text1: str, text2: str) -> float:
    """Calculate cosine similarity between two texts.

    Thin convenience wrapper around
    TraditionalMetricsEngine.compute_cosine_similarity that short-circuits
    blank inputs to 0.0.

    Args:
        text1: First text to compare
        text2: Second text to compare

    Returns:
        Cosine similarity score (0-1)
    """
    # Blank input has no term vector, so similarity is defined as 0.0.
    if not (text1.strip() and text2.strip()):
        return 0.0

    return TraditionalMetricsEngine().compute_cosine_similarity(text1, text2)

calculate_jaccard_similarity(text1, text2)

Calculate Jaccard similarity between two texts.

Backward compatibility wrapper for compute_jaccard_similarity with enhanced features.

Parameters:

Name Type Description Default
text1 str

First text to compare

required
text2 str

Second text to compare

required

Returns:

Type Description
float

Enhanced Jaccard similarity score (0-1)

Source code in src/app/judge/traditional_metrics.py
def calculate_jaccard_similarity(text1: str, text2: str) -> float:
    """Calculate Jaccard similarity between two texts.

    Backward compatibility wrapper that delegates to
    compute_jaccard_similarity with enhanced features turned on.

    Args:
        text1: First text to compare
        text2: Second text to compare

    Returns:
        Enhanced Jaccard similarity score (0-1)
    """
    scorer = TraditionalMetricsEngine()
    return scorer.compute_jaccard_similarity(text1, text2, enhanced=True)

create_evaluation_result(paper_id, agent_review, ground_truth_reviews)

Create evaluation result comparing agent review to ground truth.

This function creates comprehensive evaluation results using enhanced similarity evaluation capabilities.

Parameters:

Name Type Description Default
paper_id str

Paper identifier.

required
agent_review str

Review generated by agent.

required
ground_truth_reviews list[PeerReadReview]

Original peer reviews.

required

Returns:

Type Description
PeerReadEvalResult

PeerReadEvalResult with similarity metrics.

Source code in src/app/judge/traditional_metrics.py
def create_evaluation_result(
    paper_id: str,
    agent_review: str,
    ground_truth_reviews: list[PeerReadReview],
) -> PeerReadEvalResult:
    """Create evaluation result comparing agent review to ground truth.

    This function creates comprehensive evaluation results
    using enhanced similarity evaluation capabilities.

    Args:
        paper_id: Paper identifier.
        agent_review: Review generated by agent.
        ground_truth_reviews: Original peer reviews.

    Returns:
        PeerReadEvalResult with similarity metrics.
    """
    # Extract reference texts for similarity calculation
    reference_texts = [review.comments for review in ground_truth_reviews]

    # Use enhanced similarity evaluation (equivalent to evaluate_review_similarity)
    overall_similarity = evaluate_single_enhanced(
        agent_output=agent_review,
        reference_texts=reference_texts,
        config_weights=None,  # Use default weights
    )

    # Calculate individual similarity metrics for detailed breakdown
    engine = TraditionalMetricsEngine()
    best_scores = engine.find_best_match(agent_review, reference_texts, enhanced=True)

    similarity_scores = {
        "cosine": best_scores.cosine,
        "jaccard": best_scores.jaccard,
        "semantic": best_scores.semantic,
    }

    gt_recommendations = [float(r.recommendation) for r in ground_truth_reviews]

    if len(gt_recommendations) == 0:
        # No ground truth to compare - default to False
        recommendation_match = False
    else:
        # Reason: Use numeric GT recommendation directly (threshold 3.0 = borderline accept).
        # Approximation: agent review text is used as a proxy because structured
        # GeneratedReview scores are not available in this call context.
        # NOTE(review): despite the intent to avoid word heuristics (Review F19),
        # the code below IS a keyword-count heuristic over the review text —
        # confirm this still matches the review's intent.
        avg_gt_recommendation = sum(gt_recommendations) / len(gt_recommendations)
        # Positive review words that indicate acceptance (excluding negatable "good")
        positive_indicators = [
            "accept",
            "strong contribution",
            "recommend",
            "excellent",
            "solid",
            "novel",
        ]
        negative_indicators = [
            "reject",
            "weak",
            "insufficient",
            "lacks novelty",
            "serious issues",
        ]
        agent_review_lower = agent_review.lower()
        positive_hits = sum(1 for p in positive_indicators if p in agent_review_lower)
        negative_hits = sum(1 for n in negative_indicators if n in agent_review_lower)
        # Ties (equal hit counts, including zero hits) count as negative sentiment.
        agent_is_positive = positive_hits > negative_hits
        # Match when predicted sentiment agrees with the GT accept/reject side.
        recommendation_match = (agent_is_positive and avg_gt_recommendation >= 3.0) or (
            not agent_is_positive and avg_gt_recommendation < 3.0
        )

    return PeerReadEvalResult(
        paper_id=paper_id,
        agent_review=agent_review,
        ground_truth_reviews=ground_truth_reviews,
        similarity_scores=similarity_scores,
        overall_similarity=overall_similarity,
        recommendation_match=recommendation_match,
    )

evaluate_review_similarity(agent_review, ground_truth)

Evaluate similarity between agent review and ground truth.

Backward compatibility wrapper for evaluate_enhanced_similarity.

Parameters:

Name Type Description Default
agent_review str

Review generated by agent

required
ground_truth str

Ground truth review text

required

Returns:

Type Description
float

Weighted similarity score (0-1)

Source code in src/app/judge/traditional_metrics.py
def evaluate_review_similarity(agent_review: str, ground_truth: str) -> float:
    """Evaluate similarity between agent review and ground truth.

    Backward compatibility wrapper for evaluate_enhanced_similarity.

    Args:
        agent_review: Review generated by agent
        ground_truth: Ground truth review text

    Returns:
        Weighted similarity score (0-1)
    """
    # Single reference wrapped in a list; None selects the default weights.
    return evaluate_single_enhanced(
        agent_output=agent_review,
        reference_texts=[ground_truth],
        config_weights=None,
    )

evaluate_single_enhanced(agent_output, reference_texts, config_weights=None)

Convenience function for enhanced similarity evaluation.

This function provides the PeerRead-style evaluation workflow with Levenshtein similarity, config-driven weights, and enhanced error handling.

Parameters:

Name Type Description Default
agent_output str

Generated review text

required
reference_texts list[str]

List of ground truth reviews

required
config_weights dict[str, float] | None

Optional weight configuration for similarity metrics

None

Returns:

Type Description
float

Weighted overall similarity score (0-1)

Example

>>> weights = {
...     "cosine_weight": 0.6,
...     "jaccard_weight": 0.4,
...     "semantic_weight": 0.0,
... }
>>> result = evaluate_single_enhanced(
...     agent_output="This paper demonstrates strong methodology...",
...     reference_texts=[
...         "The work shows solid approach...",
...         "Good technical quality...",
...     ],
...     config_weights=weights,
... )
>>> print(f"Enhanced similarity: {result:.3f}")

Source code in src/app/judge/traditional_metrics.py
def evaluate_single_enhanced(
    agent_output: str,
    reference_texts: list[str],
    config_weights: dict[str, float] | None = None,
) -> float:
    """Convenience function for enhanced similarity evaluation.

    Instantiates a throwaway TraditionalMetricsEngine and delegates to its
    evaluate_enhanced_similarity method: the PeerRead-style workflow with
    Levenshtein similarity, config-driven weights, and enhanced error
    handling.

    Args:
        agent_output: Generated review text
        reference_texts: List of ground truth reviews
        config_weights: Optional weight configuration for similarity metrics
            (keys: "cosine_weight", "jaccard_weight", "semantic_weight")

    Returns:
        Weighted overall similarity score (0-1)
    """
    return TraditionalMetricsEngine().evaluate_enhanced_similarity(
        agent_output=agent_output,
        reference_texts=reference_texts,
        config_weights=config_weights,
    )

evaluate_single_traditional(agent_output, reference_texts, settings=None)

Convenience function for single traditional evaluation.

Parameters:

Name Type Description Default
agent_output str

Generated review text

required
reference_texts list[str]

List of ground truth reviews

required
settings JudgeSettings | None

Optional JudgeSettings override. If None, uses defaults.

None

Returns:

Type Description
Tier1Result

Tier1Result with traditional metrics

Example

>>> result = evaluate_single_traditional(
...     agent_output="This paper presents...",
...     reference_texts=["The work demonstrates...", "Strong contribution..."],
... )
>>> print(f"Overall score: {result.overall_score:.3f}")

Source code in src/app/judge/traditional_metrics.py
def evaluate_single_traditional(
    agent_output: str,
    reference_texts: list[str],
    settings: JudgeSettings | None = None,
) -> Tier1Result:
    """Convenience function for single traditional evaluation.

    Args:
        agent_output: Generated review text
        reference_texts: List of ground truth reviews
        settings: Optional JudgeSettings override. If None, uses defaults.

    Returns:
        Tier1Result with traditional metrics

    Example:
        >>> result = evaluate_single_traditional(
        ...     agent_output="This paper presents...",
        ...     reference_texts=["The work demonstrates...", "Strong contribution..."],
        ... )
        >>> print(f"Overall score: {result.overall_score:.3f}")
    """
    if settings is None:
        # Deferred import — presumably to avoid an import cycle; confirm.
        from app.config.judge_settings import JudgeSettings

        settings = JudgeSettings()
    engine = TraditionalMetricsEngine()

    # NOTE(review): start/end are sampled back-to-back, so the measured
    # duration is effectively zero and time_score is ~maximal. This path
    # does not time the evaluation itself — confirm this is intentional.
    start_time = time.perf_counter()
    end_time = time.perf_counter()

    return engine.evaluate_traditional_metrics(
        agent_output=agent_output,
        reference_texts=reference_texts,
        start_time=start_time,
        end_time=end_time,
        settings=settings,
    )

app.llms.models

LLM model creation and abstraction.

This module provides pure model creation functionality without business logic. Handles model instantiation for different providers in a unified way.

Classes

Functions

create_agent_models(endpoint_config, include_researcher=False, include_analyst=False, include_synthesiser=False)

Create models for the system agents.

Parameters:

Name Type Description Default
endpoint_config EndpointConfig

Configuration for the model.

required
include_researcher bool

Whether to include the researcher model.

False
include_analyst bool

Whether to include the analyst model.

False
include_synthesiser bool

Whether to include the synthesiser model.

False

Returns:

Name Type Description
ModelDict ModelDict

A dictionary containing compatible models for the system agents.

Source code in src/app/llms/models.py
def create_agent_models(
    endpoint_config: EndpointConfig,
    include_researcher: bool = False,
    include_analyst: bool = False,
    include_synthesiser: bool = False,
) -> ModelDict:
    """
    Create models for the system agents.

    A single underlying model instance is built once and shared by every
    enabled role; disabled roles are set to None.

    Args:
        endpoint_config (EndpointConfig): Configuration for the model.
        include_researcher (bool): Whether to include the researcher model.
        include_analyst (bool): Whether to include the analyst model.
        include_synthesiser (bool): Whether to include the synthesiser model.

    Returns:
        ModelDict: A dictionary containing compatible models for the system agents.
    """

    shared_model = create_llm_model(endpoint_config)
    role_models = {
        "model_manager": shared_model,
        "model_researcher": shared_model if include_researcher else None,
        "model_analyst": shared_model if include_analyst else None,
        "model_synthesiser": shared_model if include_synthesiser else None,
    }
    return ModelDict.model_validate(role_models)

create_llm_model(endpoint_config)

Create a model that works with PydanticAI.

Parameters:

Name Type Description Default
endpoint_config EndpointConfig

Full endpoint configuration including provider, model, key, and URL.

required

Returns:

Type Description
Model

PydanticAI Model instance.

Source code in src/app/llms/models.py
def create_llm_model(endpoint_config: EndpointConfig) -> Model:
    """Create a model that works with PydanticAI.

    Args:
        endpoint_config: Full endpoint configuration including provider, model, key, and URL.

    Returns:
        PydanticAI Model instance.
    """
    provider_name = endpoint_config.provider.lower()
    provider_cfg = endpoint_config.provider_config
    model_name = provider_cfg.model_name

    # Log the fully-qualified (provider-prefixed) model name before creation.
    qualified_name = get_llm_model_name(provider_name, model_name)
    logger.info(f"Creating LLM model: {qualified_name}")

    return _create_model_for_provider(
        provider_name,
        model_name,
        endpoint_config.api_key,
        str(provider_cfg.base_url),
    )

create_simple_model(provider, model_name, api_key=None)

Create a simple model for basic usage like evaluation.

Routes to the correct provider backend using the same logic as create_llm_model. Looks up default_base_url from PROVIDER_REGISTRY when no EndpointConfig is available.

Parameters:

Name Type Description Default
provider str

Provider name (e.g., “openai”, “anthropic”, “cerebras”).

required
model_name str

Model name (e.g., “gpt-4o-mini”, “claude-sonnet-4-20250514”).

required
api_key str | None

API key (optional, will use environment if not provided).

None

Returns:

Type Description
Model

PydanticAI Model instance routed to the correct backend.

Source code in src/app/llms/models.py
def create_simple_model(provider: str, model_name: str, api_key: str | None = None) -> Model:
    """Create a simple model for basic usage like evaluation.

    Routes to the correct provider backend using the same logic as create_llm_model.
    Because no EndpointConfig is available on this path, the base URL is looked
    up from PROVIDER_REGISTRY's default_base_url.

    Args:
        provider: Provider name (e.g., "openai", "anthropic", "cerebras").
        model_name: Model name (e.g., "gpt-4o-mini", "claude-sonnet-4-20250514").
        api_key: API key (optional, will use environment if not provided).

    Returns:
        PydanticAI Model instance routed to the correct backend.
    """
    normalized = provider.lower()
    entry = PROVIDER_REGISTRY.get(normalized)
    # Unknown providers get base_url=None and are resolved downstream.
    default_url = entry.default_base_url if entry else None
    return _create_model_for_provider(normalized, model_name, api_key, default_url)

get_llm_model_name(provider, model_name)

Convert provider and model name to required format.

Parameters:

Name Type Description Default
provider str

Provider name (case-insensitive)

required
model_name str

Model name to format

required

Returns:

Type Description
str

Formatted model name with appropriate provider prefix

Source code in src/app/llms/models.py
def get_llm_model_name(provider: str, model_name: str) -> str:
    """Convert provider and model name to required format.

    Args:
        provider: Provider name (case-insensitive)
        model_name: Model name to format

    Returns:
        Formatted model name with appropriate provider prefix
    """
    provider_lower = provider.lower()

    # Resolve the prefix from the registry; unknown providers get a
    # synthesised "<provider>/" prefix and a warning.
    metadata = PROVIDER_REGISTRY.get(provider_lower)
    if metadata:
        prefix = metadata.model_prefix
    else:
        logger.warning(f"Provider '{provider}' not in registry, using default prefix")
        prefix = f"{provider_lower}/"

    # A slash suggests the name may already be fully qualified; keep it
    # unchanged only when it starts with a registered provider prefix.
    if "/" in model_name:
        for entry in PROVIDER_REGISTRY.values():
            if entry.model_prefix and model_name.startswith(entry.model_prefix):
                return model_name

    return f"{prefix}{model_name}"

app.llms.providers

LLM provider configuration and API key management.

This module provides pure provider abstraction without business logic. Handles API key retrieval, provider configurations, and environment setup.

Classes

Functions

get_api_key(provider, chat_env_config)

Retrieve API key from chat env config variable.

Parameters:

Name Type Description Default
provider str

Provider name (case-insensitive)

required
chat_env_config AppEnv

Application environment configuration

required

Returns:

Type Description
tuple[bool, str]

Tuple of (success: bool, message: str) where message is either the API key or error message

Source code in src/app/llms/providers.py
def get_api_key(
    provider: str,
    chat_env_config: AppEnv,
) -> tuple[bool, str]:
    """Retrieve API key from chat env config variable.

    Args:
        provider: Provider name (case-insensitive)
        chat_env_config: Application environment configuration

    Returns:
        Tuple of (success: bool, message: str) where message is either the API key or error message
    """
    normalized = provider.lower()

    # Unknown providers are rejected outright.
    metadata = PROVIDER_REGISTRY.get(normalized)
    if metadata is None:
        return (False, f"Provider '{provider}' is not supported.")

    # Keyless providers (e.g. Ollama) have env_key set to None.
    if metadata.env_key is None:
        return (False, f"{metadata.name.title()} does not require an API key.")

    key_content = getattr(chat_env_config, metadata.env_key, None)
    if key_content and key_content.strip():
        logger.info(f"Found API key for provider: '{provider}'")
        return (True, key_content)

    # Reason: Diagnose transient .env loading issues (CWD mismatch, unset env vars)
    if key_content is not None and not key_content.strip():
        logger.debug(f"Provider '{provider}' has empty API key for {metadata.env_key}")
    return (
        False,
        f"API key for provider '{provider}' not found in configuration.",
    )

get_provider_config(provider, providers)

Retrieve configuration settings for the specified provider.

Parameters:

Name Type Description Default
provider str

Provider name key used to look up the configuration.

required
providers dict[str, ProviderConfig]

Mapping of provider name to ProviderConfig instances.

required

Returns:

Type Description
ProviderConfig

ProviderConfig for the requested provider.

Raises:

Type Description
KeyError

If the provider is not found in the providers mapping.

Exception

On unexpected lookup failures.

Source code in src/app/llms/providers.py
def get_provider_config(provider: str, providers: dict[str, ProviderConfig]) -> ProviderConfig:
    """Retrieve configuration settings for the specified provider.

    Args:
        provider: Provider name key used to look up the configuration.
        providers: Mapping of provider name to ProviderConfig instances.

    Returns:
        ProviderConfig for the requested provider.

    Raises:
        KeyError: If the provider is not found in the providers mapping.
        Exception: On unexpected lookup failures.
    """
    try:
        return providers[provider]
    except KeyError as e:
        msg = get_key_error(str(e))
        logger.error(msg)
        # Chain the cause so tracebacks show the original failing lookup
        # (PEP 3134 / flake8-bugbear B904).
        raise KeyError(msg) from e
    except Exception as e:
        msg = generic_exception(str(e))
        logger.exception(msg)
        raise Exception(msg) from e

get_supported_providers()

Get list of supported LLM providers from the registry.

Source code in src/app/llms/providers.py
def get_supported_providers() -> list[str]:
    """Get list of supported LLM providers from the registry."""
    # Registry keys are the canonical provider identifiers.
    return [provider_name for provider_name in PROVIDER_REGISTRY]

setup_llm_environment(api_keys)

No-op: retained for backward compatibility only.

Previously wrote API keys to os.environ, exposing them to child processes, crash reporters, and debug dumps (Sprint 5 Finding 10, Review F1 HIGH). All call sites have been migrated — keys are now passed directly via provider constructors in models.py.

Parameters:

Name Type Description Default
api_keys dict[str, str]

Ignored. Dictionary mapping provider names to API keys.

required

.. deprecated:: Use provider constructor api_key parameter instead. This function is scheduled for removal.

Source code in src/app/llms/providers.py
def setup_llm_environment(api_keys: dict[str, str]) -> None:
    """Deprecated no-op kept so legacy call sites keep working.

    The original implementation copied API keys into ``os.environ``, which
    exposed them to child processes, crash reporters, and debug dumps
    (Sprint 5 Finding 10, Review F1 HIGH). Keys now travel directly through
    provider constructors in ``models.py``, so nothing remains to do here.

    Args:
        api_keys: Ignored. Dictionary mapping provider names to API keys.

    .. deprecated::
        Use provider constructor ``api_key`` parameter instead.
        This function is scheduled for removal.
    """
    logger.debug("setup_llm_environment: no-op (keys passed via constructor, not os.environ)")

app.reports.report_generator

Report generator for evaluation result summarization.

This module produces structured Markdown reports from CompositeResult objects. Reports include an executive summary, per-tier score breakdown, weakness identification, and actionable suggestions sourced from the SuggestionEngine.

Report structure
  1. Executive Summary — composite score, recommendation, timestamp
  2. Tier Score Breakdown — T1/T2/T3 scores with weights
  3. Weaknesses & Suggestions — severity-ordered list from SuggestionEngine
Example

from app.reports.report_generator import generate_report, save_report md = generate_report(composite_result) save_report(md, Path(“results/reports/latest.md”))

Classes

Functions

generate_report(result, suggestions=None)

Generate a Markdown report from a CompositeResult.

Parameters:

Name Type Description Default
result CompositeResult

Composite evaluation result to report on.

required
suggestions list[Suggestion] | None

Optional pre-computed suggestion list. When provided, skips the SuggestionEngine and uses these directly.

None

Returns:

Type Description
str

Markdown-formatted report string.

Source code in src/app/reports/report_generator.py
def generate_report(
    result: CompositeResult,
    suggestions: list[Suggestion] | None = None,
) -> str:
    """Generate a Markdown report from a CompositeResult.

    Args:
        result: Composite evaluation result to report on.
        suggestions: Optional pre-computed suggestion list.  When provided,
            skips the SuggestionEngine and uses these directly.

    Returns:
        Markdown-formatted report string.
    """
    # S8-F6.1: build suggestions if not provided by caller
    if suggestions is None:
        suggestions = SuggestionEngine(no_llm_suggestions=True).generate(result)

    # Three fixed sections, blank-line separated, trailing newline appended.
    body = "\n\n".join(
        [
            _render_executive_summary(result),
            _render_tier_breakdown(result),
            _render_weaknesses(suggestions),
        ]
    )
    return body + "\n"

save_report(markdown, output_path)

Write a Markdown report string to disk.

Parameters:

Name Type Description Default
markdown str

Report content as a Markdown string.

required
output_path Path

Destination file path. Parent directories are created automatically if they do not exist.

required
Source code in src/app/reports/report_generator.py
def save_report(markdown: str, output_path: Path) -> None:
    """Persist a Markdown report string and register it as a run artifact.

    Args:
        markdown: Report content as a Markdown string.
        output_path: Destination file path; missing parent directories are
            created automatically.
    """
    # Reason: deferred import — presumably avoids an import cycle; confirm
    from app.utils.artifact_registry import get_artifact_registry

    destination = output_path
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(markdown, encoding="utf-8")
    get_artifact_registry().register("Report", destination)

app.reports.suggestion_engine

Suggestion engine for generating actionable evaluation improvement suggestions.

This module analyses evaluation results across all three tiers and produces structured, actionable suggestions. It supports a rule-based mode (always available) and an optional LLM-assisted mode for richer recommendations.

Severity mapping
  • critical: score < CRITICAL_THRESHOLD (0.2)
  • warning: CRITICAL_THRESHOLD <= score < WARNING_THRESHOLD (0.5)
  • info: score >= WARNING_THRESHOLD but still worth noting
Example

engine = SuggestionEngine()
suggestions = engine.generate(composite_result)
for s in suggestions:
    print(s.severity, s.metric, s.message)

Classes

SuggestionEngine

Generates structured improvement suggestions from evaluation results.

Operates in two modes: - Rule-based (default): Fast, deterministic suggestions from score thresholds. - LLM-assisted (async): Richer suggestions using the judge provider LLM.

Parameters:

Name Type Description Default
no_llm_suggestions bool

When True, disables LLM path even if provider available.

False
Example

engine = SuggestionEngine()
suggestions = engine.generate(composite_result)
async_suggestions = await engine.generate_async(composite_result)

Source code in src/app/reports/suggestion_engine.py
class SuggestionEngine:
    """Produces structured improvement suggestions from evaluation results.

    Two operating modes are supported:
    - Rule-based (default): fast, deterministic suggestions derived from
      score thresholds.
    - LLM-assisted (async): richer suggestions via the judge provider LLM.

    Args:
        no_llm_suggestions: When True, disables the LLM path even if a
            provider is available.

    Example:
        >>> engine = SuggestionEngine()
        >>> suggestions = engine.generate(composite_result)
        >>> async_suggestions = await engine.generate_async(composite_result)
    """

    def __init__(self, no_llm_suggestions: bool = False) -> None:
        """Initialize the suggestion engine.

        Args:
            no_llm_suggestions: Disable LLM-assisted suggestions when True.
        """
        self.no_llm_suggestions = no_llm_suggestions

    def generate(self, result: CompositeResult) -> list[Suggestion]:
        """Generate rule-based suggestions from evaluation results.

        Inspects metric_scores and tier-level scores to produce actionable
        suggestions; a missing Tier 2 score is reported as an info item.

        Args:
            result: Composite evaluation result to analyse.

        Returns:
            List of Suggestion objects ordered by severity (critical first).
        """
        collected: list[Suggestion] = []

        # Metric-level rules: only fire for metrics actually present in the run
        for metric_name, (tier_no, template, action_text) in _ALL_RULES.items():
            metric_score = result.metric_scores.get(metric_name)
            if metric_score is None:
                continue
            collected.append(
                Suggestion(
                    metric=metric_name,
                    tier=tier_no,
                    severity=_classify_severity(metric_score),
                    message=template.format(score=metric_score),
                    action=action_text,
                )
            )

        # Fallback: without per-metric scores, derive suggestions from tier totals
        if not result.metric_scores:
            tier_scores = [
                ("tier1_score", 1, result.tier1_score),
                ("tier3_score", 3, result.tier3_score),
            ]
            if result.tier2_score is not None:
                tier_scores.append(("tier2_score", 2, result.tier2_score))

            for name, tier_no, tier_score in tier_scores:
                collected.append(
                    Suggestion(
                        metric=name,
                        tier=tier_no,
                        severity=_classify_severity(tier_score),
                        message=f"Tier {tier_no} overall score {tier_score:.2f} — improvement needed.",
                        action="Review individual metric scores to identify specific weaknesses.",
                    )
                )

        # Surface the absence of Tier 2 (LLM-as-Judge) as an informational note
        if result.tier2_score is None:
            collected.append(
                Suggestion(
                    metric="tier2_score",
                    tier=2,
                    severity=SuggestionSeverity.INFO,
                    message="Tier 2 LLM-as-Judge was not run — quality assessment incomplete.",
                    action="Configure a judge provider in Settings to enable Tier 2 scoring.",
                )
            )

        # Stable ordering: critical → warning → info (sorted() preserves
        # insertion order within each severity bucket)
        rank = {
            SuggestionSeverity.CRITICAL: 0,
            SuggestionSeverity.WARNING: 1,
            SuggestionSeverity.INFO: 2,
        }
        return sorted(collected, key=lambda s: rank[s.severity])

    async def generate_async(self, result: CompositeResult) -> list[Suggestion]:
        """Generate suggestions with optional LLM enhancement.

        Tries the LLM-assisted path first and falls back to the rule-based
        path on any failure or empty result.

        Args:
            result: Composite evaluation result to analyse.

        Returns:
            List of Suggestion objects, potentially enriched by LLM.
        """
        if not self.no_llm_suggestions:
            try:
                enriched = await self._generate_llm_suggestions(result)
            except Exception:
                logger.warning("LLM suggestion generation failed; falling back to rule-based.")
            else:
                if enriched:
                    return enriched

        return self.generate(result)

    async def _generate_llm_suggestions(self, _result: CompositeResult) -> list[Suggestion]:
        """Generate LLM-assisted suggestions using the judge provider.

        Args:
            _result: Composite evaluation result (reserved for LLM prompt construction).

        Returns:
            List of LLM-generated Suggestion objects.

        Raises:
            NotImplementedError: When LLM provider is not yet configured.
        """
        # Reason: LLM path is optional; raising triggers the fallback in generate_async
        raise NotImplementedError("LLM suggestion generation requires a configured judge provider.")
Functions
__init__(no_llm_suggestions=False)

Initialize the suggestion engine.

Parameters:

Name Type Description Default
no_llm_suggestions bool

Disable LLM-assisted suggestions when True.

False
Source code in src/app/reports/suggestion_engine.py
def __init__(self, no_llm_suggestions: bool = False) -> None:
    """Initialize the suggestion engine.

    Args:
        no_llm_suggestions: Disable LLM-assisted suggestions when True.
    """
    self.no_llm_suggestions = no_llm_suggestions
generate(result)

Generate rule-based suggestions from evaluation results.

Analyses metric_scores, tier-level scores, and tiers_enabled to produce actionable suggestions. Tier 2 absence is noted as an info suggestion.

Parameters:

Name Type Description Default
result CompositeResult

Composite evaluation result to analyse.

required

Returns:

Type Description
list[Suggestion]

List of Suggestion objects ordered by severity (critical first).

Source code in src/app/reports/suggestion_engine.py
def generate(self, result: CompositeResult) -> list[Suggestion]:
    """Generate rule-based suggestions from evaluation results.

    Analyses metric_scores, tier-level scores, and tiers_enabled to produce
    actionable suggestions. Tier 2 absence is noted as an info suggestion.

    Args:
        result: Composite evaluation result to analyse.

    Returns:
        List of Suggestion objects ordered by severity (critical first).
    """
    suggestions: list[Suggestion] = []

    # Process known metric-level rules
    for metric, (tier, msg_tmpl, action) in _ALL_RULES.items():
        score = result.metric_scores.get(metric)
        # Reason: skip rules for metrics that were not scored in this run
        if score is None:
            continue
        severity = _classify_severity(score)
        suggestions.append(
            Suggestion(
                metric=metric,
                tier=tier,
                severity=severity,
                message=msg_tmpl.format(score=score),
                action=action,
            )
        )

    # Tier-level fallback: produce suggestions from tier scores when metric_scores empty
    if not result.metric_scores:
        tier_entries = [
            ("tier1_score", 1, result.tier1_score),
            ("tier3_score", 3, result.tier3_score),
        ]
        if result.tier2_score is not None:
            tier_entries.append(("tier2_score", 2, result.tier2_score))

        for metric_name, tier, score in tier_entries:
            severity = _classify_severity(score)
            suggestions.append(
                Suggestion(
                    metric=metric_name,
                    tier=tier,
                    severity=severity,
                    message=f"Tier {tier} overall score {score:.2f} — improvement needed.",
                    action="Review individual metric scores to identify specific weaknesses.",
                )
            )

    # Tier 2 absence: inform the user LLM judging was not run
    if result.tier2_score is None:
        suggestions.append(
            Suggestion(
                metric="tier2_score",
                tier=2,
                severity=SuggestionSeverity.INFO,
                message="Tier 2 LLM-as-Judge was not run — quality assessment incomplete.",
                action="Configure a judge provider in Settings to enable Tier 2 scoring.",
            )
        )

    # Sort: critical → warning → info
    _order = {
        SuggestionSeverity.CRITICAL: 0,
        SuggestionSeverity.WARNING: 1,
        SuggestionSeverity.INFO: 2,
    }
    # list.sort is stable, so insertion order is kept within each severity
    suggestions.sort(key=lambda s: _order[s.severity])
    return suggestions
generate_async(result) async

Generate suggestions with optional LLM enhancement.

Attempts LLM-assisted suggestions first; falls back to rule-based on error.

Parameters:

Name Type Description Default
result CompositeResult

Composite evaluation result to analyse.

required

Returns:

Type Description
list[Suggestion]

List of Suggestion objects, potentially enriched by LLM.

Source code in src/app/reports/suggestion_engine.py
async def generate_async(self, result: CompositeResult) -> list[Suggestion]:
    """Generate suggestions with optional LLM enhancement.

    Attempts LLM-assisted suggestions first; falls back to rule-based on error.

    Args:
        result: Composite evaluation result to analyse.

    Returns:
        List of Suggestion objects, potentially enriched by LLM.
    """
    if self.no_llm_suggestions:
        return self.generate(result)

    # Reason: broad catch is deliberate — any LLM failure (provider missing,
    # network, parsing) degrades gracefully to the rule-based path below
    try:
        llm_suggestions = await self._generate_llm_suggestions(result)
        if llm_suggestions:
            return llm_suggestions
    except Exception:
        logger.warning("LLM suggestion generation failed; falling back to rule-based.")

    return self.generate(result)

app.tools.peerread_tools

PeerRead agent tools for multi-agent system integration.

This module provides agent tools that enable the manager agent to interact with the PeerRead dataset for paper retrieval, querying, and review evaluation.

Classes

Functions

add_peerread_review_tools_to_agent(agent, agent_id='manager', max_content_length=15000)

Add PeerRead review generation and persistence tools to an agent.

Parameters:

Name Type Description Default
agent Agent[None, BaseModel]

The agent to which review tools will be added.

required
agent_id str

The agent identifier for tracing (default: “manager”).

'manager'
max_content_length int

The maximum number of characters to include in the prompt.

15000
Source code in src/app/tools/peerread_tools.py
def add_peerread_review_tools_to_agent(
    agent: Agent[None, BaseModel],
    agent_id: str = "manager",
    max_content_length: int = 15000,
):
    """Add PeerRead review generation and persistence tools to an agent.

    Args:
        agent: The agent to which review tools will be added.
        agent_id: The agent identifier for tracing (default: "manager").
        max_content_length: The maximum number of characters to include in the prompt.
    """

    # Reason: tools are defined as closures so each captures agent_id and
    # max_content_length from this call without extra plumbing
    @agent.tool
    async def generate_paper_review_content_from_template(  # type: ignore[reportUnusedFunction]
        ctx: RunContext[None],
        paper_id: str,
        review_focus: str = "comprehensive",
        tone: str = "professional",
    ) -> str:
        """Create a review template for a specific paper.

        Only call this tool when the user explicitly asks to review a paper.
        Do NOT call for greetings, general questions, or non-paper queries.

        WARNING: This function does NOT generate actual reviews. It creates a
        structured template that would need to be filled in manually or by
        another AI system. This is a demonstration/template function only.

        Args:
            paper_id: Unique identifier for the paper being reviewed.
            review_focus: Type of review (comprehensive, technical, high-level).
            tone: Tone of the review (professional, constructive, critical).

        Returns:
            str: Review template with paper information and placeholder sections
                 that need to be manually completed.
        """

        async def _fn() -> str:
            config = load_peerread_config()
            loader = PeerReadLoader(config)
            paper = loader.get_paper_by_id(paper_id)
            if not paper:
                # Reason: ModelRetry prompts the model to retry with a valid paper_id
                raise ModelRetry(f"Paper {paper_id} not found in PeerRead dataset")
            paper_content = _load_paper_content_with_fallback(ctx, loader, paper_id, paper.abstract)
            review_template = _load_and_format_template(
                paper.title, paper.abstract, paper_content, tone, review_focus, max_content_length
            )
            logger.info(
                f"Created review template for paper {paper_id} "
                f"(intermediate step, requires agent completion)"
            )
            return review_template

        return await _traced_tool_call(
            agent_id=agent_id,
            tool_name="generate_paper_review_content_from_template",
            context=f"paper_id={paper_id},focus={review_focus}",
            fn=_fn,
            error_msg="Failed to create review template",
        )

    @agent.tool
    async def save_paper_review(  # type: ignore[reportUnusedFunction]
        ctx: RunContext[None],
        paper_id: str,
        review_text: str,
        recommendation: str = "",
        confidence: float = 0.0,
    ) -> str:
        """Save agent-generated review to persistent storage.

        Only call this tool after generating a review the user requested.
        Do NOT call for greetings, general questions, or non-paper queries.

        Args:
            paper_id: Unique identifier for the paper being reviewed.
            review_text: Review text generated by the agent.
            recommendation: Review recommendation (accept/reject/etc).
            confidence: Confidence score for the review (0.0-1.0).

        Returns:
            str: Path to the saved review file.
        """

        async def _fn() -> str:
            # Reason: deferred import — presumably avoids an import cycle; confirm
            from app.utils.run_context import get_active_run_context

            run_ctx = get_active_run_context()
            # Reason: "UNKNOWN" sentinels fill fields the agent left empty/zero
            review = PeerReadReview(
                comments=review_text,
                recommendation=recommendation if recommendation else "UNKNOWN",
                reviewer_confidence=str(confidence) if confidence > 0 else "UNKNOWN",
            )
            persistence = ReviewPersistence()
            filepath = persistence.save_review(
                paper_id, review, run_dir=run_ctx.run_dir if run_ctx else None
            )
            logger.info(f"Saved review for paper {paper_id} to {filepath}")
            return filepath

        return await _traced_tool_call(
            agent_id=agent_id,
            tool_name="save_paper_review",
            context=f"paper_id={paper_id}",
            fn=_fn,
            error_cls=ValueError,
            error_msg="Failed to save review",
        )

    @agent.tool
    async def save_structured_review(  # type: ignore[reportUnusedFunction]
        ctx: RunContext[None],
        paper_id: str,
        structured_review: GeneratedReview,
    ) -> str:
        """Save a structured review object to persistent storage.

        Only call this tool after generating a review the user requested.
        Do NOT call for greetings, general questions, or non-paper queries.

        Args:
            paper_id: Unique identifier for the paper being reviewed.
            structured_review: GeneratedReview object with validated fields.

        Returns:
            str: Path to the saved review file.
        """

        # Reason: derive model_info from actual model name instead of hardcoding
        agent_model = agent.model
        resolved_name = (
            agent_model
            if isinstance(agent_model, str)
            else (getattr(agent_model, "model_name", "unknown") if agent_model else "unknown")
        )
        model_info = f"{resolved_name} via PydanticAI"

        async def _fn() -> str:
            from datetime import UTC, datetime

            # Reason: deferred import — presumably avoids an import cycle; confirm
            from app.utils.run_context import get_active_run_context

            run_ctx = get_active_run_context()
            peerread_format = structured_review.to_peerread_format()
            review = PeerReadReview.model_validate(peerread_format)

            persistence = ReviewPersistence()
            filepath = persistence.save_review(
                paper_id,
                review,
                run_dir=run_ctx.run_dir if run_ctx else None,
                structured_review=structured_review.model_dump(),
                model_info=model_info,
            )

            timestamp = datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ")
            # NOTE(review): instance is constructed but never used or returned —
            # looks like validation-only; confirm this is intentional
            ReviewGenerationResult(
                paper_id=paper_id,
                review=structured_review,
                timestamp=timestamp,
                model_info=model_info,
            )

            logger.info(f"Saved review for paper {paper_id} to {filepath}")
            return filepath

        return await _traced_tool_call(
            agent_id=agent_id,
            tool_name="save_structured_review",
            context=f"paper_id={paper_id}",
            fn=_fn,
            error_cls=ValueError,
            error_msg="Failed to save structured review",
        )

add_peerread_review_tools_to_manager(manager_agent, max_content_length=15000)

Backward compatibility wrapper for add_peerread_review_tools_to_agent.

Deprecated: Use add_peerread_review_tools_to_agent instead.

Parameters:

Name Type Description Default
manager_agent Agent[None, BaseModel]

The manager agent to which review tools will be added.

required
max_content_length int

The maximum number of characters to include in the prompt.

15000
Source code in src/app/tools/peerread_tools.py
def add_peerread_review_tools_to_manager(
    manager_agent: Agent[None, BaseModel], max_content_length: int = 15000
):
    """Deprecated alias for add_peerread_review_tools_to_agent.

    Kept for backward compatibility; prefer calling
    add_peerread_review_tools_to_agent directly.

    Args:
        manager_agent: The manager agent to which review tools will be added.
        max_content_length: The maximum number of characters to include in the prompt.
    """
    return add_peerread_review_tools_to_agent(
        manager_agent,
        agent_id="manager",
        max_content_length=max_content_length,
    )

add_peerread_tools_to_agent(agent, agent_id='manager')

Add PeerRead dataset tools to an agent.

Parameters:

Name Type Description Default
agent Agent[None, BaseModel]

The agent to which PeerRead tools will be added.

required
agent_id str

The agent identifier for tracing (default: “manager”).

'manager'
Source code in src/app/tools/peerread_tools.py
def add_peerread_tools_to_agent(agent: Agent[None, BaseModel], agent_id: str = "manager"):
    """Add PeerRead dataset tools to an agent.

    Args:
        agent: The agent to which PeerRead tools will be added.
        agent_id: The agent identifier for tracing (default: "manager").
    """

    @agent.tool
    async def get_peerread_paper(ctx: RunContext[None], paper_id: str) -> PeerReadPaper:  # type: ignore[reportUnusedFunction]
        """Get a specific paper from the PeerRead dataset.

        Only call this tool when the user explicitly asks about a specific paper.
        Do NOT call for greetings, general questions, or non-paper queries.

        Args:
            paper_id: Unique identifier for the paper (e.g. '104', '305').

        Returns:
            PeerReadPaper with title, abstract, and reviews.
        """

        async def _fn() -> PeerReadPaper:
            config = load_peerread_config()
            loader = PeerReadLoader(config)
            paper = loader.get_paper_by_id(paper_id)
            if not paper:
                # Reason: ModelRetry prompts the model to retry with a valid paper_id
                raise ModelRetry(f"Paper {paper_id} not found in PeerRead dataset")
            logger.info(f"Retrieved paper {paper_id}: {paper.title[:50]}...")
            return paper

        return await _traced_tool_call(
            agent_id=agent_id,
            tool_name="get_peerread_paper",
            context=f"paper_id={paper_id}",
            fn=_fn,
            error_msg="Failed to retrieve paper",
        )

    @agent.tool
    async def query_peerread_papers(  # type: ignore[reportUnusedFunction]
        ctx: RunContext[None], venue: str = "", min_reviews: int = 1
    ) -> list[PeerReadPaper]:
        """Query papers from PeerRead dataset with filters.

        Only call this tool when the user explicitly asks to search or list papers.
        Do NOT call for greetings, general questions, or non-paper queries.

        Args:
            venue: Filter by conference venue (empty for all venues).
            min_reviews: Minimum number of reviews required per paper.

        Returns:
            List of PeerReadPaper objects matching the criteria.
        """

        async def _fn() -> list[PeerReadPaper]:
            config = load_peerread_config()
            loader = PeerReadLoader(config)
            # Reason: empty venue string means "all venues" (no filter applied)
            papers = loader.query_papers(
                venue=venue if venue else None,
                min_reviews=min_reviews,
                limit=config.max_papers_per_query,
            )
            logger.info(f"Found {len(papers)} papers matching criteria")
            return papers

        return await _traced_tool_call(
            agent_id=agent_id,
            tool_name="query_peerread_papers",
            context=f"venue={venue},min_reviews={min_reviews}",
            fn=_fn,
            error_msg="Failed to query papers",
        )

    @agent.tool
    async def get_paper_content(  # type: ignore[reportUnusedFunction]
        ctx: RunContext[None],
        paper_id: str,
    ) -> str:
        """Get the full text content of a paper from the local PeerRead dataset.

        Returns full paper text using a fallback chain: parsed JSON → raw PDF → abstract.
        Use this tool to read a paper's body text for analysis or review generation.

        Note: Requires `paper_id` (e.g. "1105.1072"), NOT a file path or URL.

        Args:
            paper_id: Unique identifier for the paper (e.g. "1105.1072").
                      Do NOT pass a URL or file path.

        Returns:
            str: Full paper text content from the local PeerRead dataset.
        """

        async def _fn() -> str:
            config = load_peerread_config()
            loader = PeerReadLoader(config)
            paper = loader.get_paper_by_id(paper_id)
            if not paper:
                # NOTE(review): raises ValueError here, while get_peerread_paper
                # uses ModelRetry for the same condition — confirm intentional
                raise ValueError(f"Paper {paper_id} not found in PeerRead dataset")
            content = _load_paper_content_with_fallback(ctx, loader, paper_id, paper.abstract)
            logger.info(f"Retrieved content for paper {paper_id}")
            return content

        return await _traced_tool_call(
            agent_id=agent_id,
            tool_name="get_paper_content",
            context=f"paper_id={paper_id}",
            fn=_fn,
            error_cls=ValueError,
            error_msg="Failed to retrieve paper content",
        )

read_paper_pdf(ctx, pdf_path)

Read text content from a PDF file using MarkItDown.

Note: MarkItDown extracts the entire PDF content as a single text block. Page-level extraction is not supported by the underlying library.

Parameters:

Name Type Description Default
ctx RunContext[None] | None

RunContext (unused but required for tool compatibility).

required
pdf_path str | Path

Path to the PDF file.

required

Returns:

Name Type Description
str str

Extracted text content from the entire PDF in Markdown format.

Raises:

Type Description
FileNotFoundError

If the PDF file doesn’t exist.

ValueError

If the file is not a PDF or conversion fails.

Source code in src/app/tools/peerread_tools.py
def read_paper_pdf(
    ctx: RunContext[None] | None,
    pdf_path: str | Path,
) -> str:
    """Read text content from a PDF file using MarkItDown.

    Note: MarkItDown extracts the entire PDF content as a single text block.
    Page-level extraction is not supported by the underlying library.

    Args:
        ctx: RunContext (unused but required for tool compatibility).
        pdf_path: Path to the PDF file.

    Returns:
        str: Extracted text content from the entire PDF in Markdown format.

    Raises:
        FileNotFoundError: If the PDF file doesn't exist.
        ValueError: If the file is not a PDF or conversion fails.
    """
    # Reason: LLMs hallucinate URLs for paper PDFs; reject them defensively instead of crashing
    if isinstance(pdf_path, str) and pdf_path.startswith(("http://", "https://")):
        return (
            f"Error: URLs are not supported. "
            f"Use paper_id with get_paper_content instead. Received: {pdf_path}"
        )

    # Reason: Path() accepts both str and Path inputs, so no isinstance branch needed
    pdf_file = Path(pdf_path)
    if not pdf_file.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_file}")
    if pdf_file.suffix.lower() != ".pdf":
        raise ValueError(f"Not a PDF file: {pdf_file}")

    try:
        md_converter = MarkItDown()
        result = md_converter.convert(pdf_file)
        logger.info(f"Extracted text from {pdf_file}")
        return result.text_content.strip()

    except Exception as e:
        logger.error(f"Error reading PDF with MarkItDown: {e}")
        # Reason: chain the original exception (PEP 3134) so the root cause
        # is preserved in tracebacks instead of being silently discarded
        raise ValueError(f"Failed to read PDF: {str(e)}") from e

app.utils.artifact_registry

Artifact registry for tracking output paths during CLI runs.

Provides a thread-safe singleton registry where components register file paths they write during execution. At run end, the registry produces a summary block listing all artifacts and their locations.

Example

from app.utils.artifact_registry import get_artifact_registry
registry = get_artifact_registry()
registry.register("Log file", Path("logs/run.log"))
print(registry.format_summary_block())

Classes

ArtifactRegistry

Thread-safe registry for tracking artifact output paths.

Components call register() during execution to record what files they wrote. At run end, format_summary_block() produces a human-readable summary for stdout and logging.

Source code in src/app/utils/artifact_registry.py
class ArtifactRegistry:
    """Thread-safe registry for tracking artifact output paths.

    Components call ``register()`` while a run executes to record the
    files they wrote; ``format_summary_block()`` then renders a
    human-readable summary for stdout and logging.
    """

    def __init__(self) -> None:
        """Initialize empty registry with thread lock."""
        self._lock = threading.Lock()
        self._entries: list[tuple[str, Path]] = []

    def register(self, label: str, path: Path) -> None:
        """Register an artifact path with a descriptive label.

        Args:
            label: Human-readable category (e.g., "Log file", "Report").
            path: Path to the artifact file or directory.
        """
        # Reason: only resolve relative paths — resolving an already-absolute
        # path would additionally rewrite symlinks, which callers may not expect
        absolute = path.resolve() if not path.is_absolute() else path
        with self._lock:
            self._entries.append((label, absolute))

    def summary(self) -> list[tuple[str, Path]]:
        """Return all registered artifacts as (label, absolute_path) tuples.

        Returns:
            List of (label, path) tuples in registration order.
        """
        # Return a copy so callers cannot mutate internal state
        with self._lock:
            return self._entries.copy()

    def reset(self) -> None:
        """Clear all registered artifacts."""
        with self._lock:
            del self._entries[:]

    def format_summary_block(self) -> str:
        """Format a human-readable summary block for stdout.

        Returns:
            Multi-line string with artifact listing, or a
            "No artifacts written" message when the registry is empty.
        """
        recorded = self.summary()
        if not recorded:
            return "No artifacts written"
        rows = [f"  {label}: {path}" for label, path in recorded]
        return "\n".join(["", "Artifacts written:", *rows])
Functions
__init__()

Initialize empty registry with thread lock.

Source code in src/app/utils/artifact_registry.py
def __init__(self) -> None:
    """Initialize empty registry with thread lock."""
    self._entries: list[tuple[str, Path]] = []
    self._lock = threading.Lock()
format_summary_block()

Format a human-readable summary block for stdout.

Returns:

Type Description
str

Multi-line string with artifact listing, or a

str

“No artifacts written” message if the registry is empty.

Source code in src/app/utils/artifact_registry.py
def format_summary_block(self) -> str:
    """Render the registered artifacts as a multi-line summary for stdout.

    Returns:
        Multi-line listing of artifacts, or "No artifacts written"
        when the registry is empty.
    """
    recorded = self.summary()
    if not recorded:
        return "No artifacts written"
    rows = [f"  {label}: {path}" for label, path in recorded]
    return "\n".join(["", "Artifacts written:", *rows])
register(label, path)

Register an artifact path with a descriptive label.

Parameters:

Name Type Description Default
label str

Human-readable category (e.g., “Log file”, “Report”).

required
path Path

Path to the artifact file or directory.

required
Source code in src/app/utils/artifact_registry.py
def register(self, label: str, path: Path) -> None:
    """Record an artifact under a descriptive label.

    Args:
        label: Human-readable category (e.g., "Log file", "Report").
        path: Path to the artifact file or directory.
    """
    # Reason: only resolve relative paths — resolving an absolute path
    # would additionally rewrite symlinks, which callers may not expect
    absolute = path.resolve() if not path.is_absolute() else path
    with self._lock:
        self._entries.append((label, absolute))
reset()

Clear all registered artifacts.

Source code in src/app/utils/artifact_registry.py
def reset(self) -> None:
    """Drop every recorded artifact entry."""
    with self._lock:
        # In-place slice deletion keeps the same list object alive
        del self._entries[:]
summary()

Return all registered artifacts as (label, absolute_path) tuples.

Returns:

Type Description
list[tuple[str, Path]]

List of (label, path) tuples in registration order.

Source code in src/app/utils/artifact_registry.py
def summary(self) -> list[tuple[str, Path]]:
    """Snapshot the registered artifacts in insertion order.

    Returns:
        A copy of the (label, absolute_path) tuples.
    """
    with self._lock:
        return self._entries.copy()

Functions

get_artifact_registry()

Get or create the global ArtifactRegistry singleton.

Returns:

Type Description
ArtifactRegistry

The global ArtifactRegistry instance.

Source code in src/app/utils/artifact_registry.py
def get_artifact_registry() -> ArtifactRegistry:
    """Get or create the global ArtifactRegistry singleton.

    Returns:
        The global ArtifactRegistry instance.
    """
    global _global_registry
    # Lazy singleton: the module-level lock makes first creation race-free,
    # so concurrent callers always receive the same instance.
    with _registry_lock:
        if _global_registry is None:
            _global_registry = ArtifactRegistry()
        return _global_registry

app.utils.error_messages

Error message utilities for the Agents-eval application.

This module provides concise helper functions for generating standardized error messages related to configuration loading and validation.

Functions

api_connection_error(error)

Generate an error message for an API connection error.

Source code in src/app/utils/error_messages.py
def api_connection_error(error: str) -> str:
    """Build a standardized message for an API connection failure."""
    return "API connection error: {}".format(error)

failed_to_load_config(error)

Generate an error message for a configuration loading failure.

Source code in src/app/utils/error_messages.py
def failed_to_load_config(error: str) -> str:
    """Build a standardized message for a configuration loading failure."""
    return "Failed to load config: {}".format(error)

file_not_found(file_path)

Generate an error message for a missing configuration file.

Source code in src/app/utils/error_messages.py
def file_not_found(file_path: str | Path) -> str:
    """
    Generate an error message for a missing configuration file.
    """
    return f"File not found: {file_path}"

generic_exception(error)

Generate a generic error message.

Source code in src/app/utils/error_messages.py
def generic_exception(error: str) -> str:
    """Build a standardized message for an unexpected exception."""
    return "Exception: {}".format(error)

get_key_error(error)

Generate an error message for a key lookup error.

Source code in src/app/utils/error_messages.py
def get_key_error(error: str) -> str:
    """Build a standardized message for a key lookup error."""
    return "Key Error: {}".format(error)

invalid_data_model_format(error)

Generate an error message for invalid pydantic data model format.

Source code in src/app/utils/error_messages.py
def invalid_data_model_format(error: str) -> str:
    """Build a standardized message for an invalid pydantic data model format."""
    return "Invalid pydantic data model format: {}".format(error)

invalid_json(error)

Generate an error message for invalid JSON in a configuration file.

Source code in src/app/utils/error_messages.py
def invalid_json(error: str) -> str:
    """Build a standardized message for invalid JSON in a configuration file."""
    return "Invalid JSON: {}".format(error)

invalid_type(expected_type, actual_type)

Generate an error message for invalid Type.

Source code in src/app/utils/error_messages.py
def invalid_type(expected_type: str, actual_type: str) -> str:
    """Build a standardized message for a type mismatch."""
    return "Type Error: Expected {}, got {} instead.".format(expected_type, actual_type)

app.utils.load_configs

Configuration loading utilities.

Provides a generic function for loading and validating JSON configuration files against Pydantic models, with error handling and logging support.

Classes

LogfireConfig

Bases: BaseModel

Configuration for Logfire + Phoenix tracing integration.

Constructed from JudgeSettings via from_settings(). All values are controlled by JUDGE_LOGFIRE_ and JUDGE_PHOENIX_ env vars through pydantic-settings.

Source code in src/app/config/logfire_config.py
class LogfireConfig(BaseModel):
    """Configuration for Logfire + Phoenix tracing integration.

    Constructed from JudgeSettings via from_settings(). All values
    are controlled by JUDGE_LOGFIRE_* and JUDGE_PHOENIX_* env vars
    through pydantic-settings.
    """

    enabled: bool = True
    send_to_cloud: bool = False
    phoenix_endpoint: str = "http://localhost:6006"
    service_name: str = "peerread-evaluation"

    @classmethod
    def from_settings(cls, settings: JudgeSettings) -> LogfireConfig:
        """Build a LogfireConfig from validated JudgeSettings.

        Args:
            settings: JudgeSettings instance with logfire fields.

        Returns:
            LogfireConfig populated from pydantic-settings.
        """
        # One-to-one mapping from the flat settings namespace onto model fields.
        values = {
            "enabled": settings.logfire_enabled,
            "send_to_cloud": settings.logfire_send_to_cloud,
            "phoenix_endpoint": settings.phoenix_endpoint,
            "service_name": settings.logfire_service_name,
        }
        return cls(**values)
Functions
from_settings(settings) classmethod

Create LogfireConfig from JudgeSettings.

Parameters:

Name Type Description Default
settings JudgeSettings

JudgeSettings instance with logfire fields.

required

Returns:

Type Description
LogfireConfig

LogfireConfig populated from pydantic-settings.

Source code in src/app/config/logfire_config.py
@classmethod
def from_settings(cls, settings: JudgeSettings) -> LogfireConfig:
    """Create LogfireConfig from JudgeSettings.

    Args:
        settings: JudgeSettings instance with logfire fields.

    Returns:
        LogfireConfig populated from pydantic-settings.
    """
    # Reason: explicit field-by-field mapping keeps the env-var indirection
    # (JUDGE_LOGFIRE_* / JUDGE_PHOENIX_*) visible in one place.
    return cls(
        enabled=settings.logfire_enabled,
        send_to_cloud=settings.logfire_send_to_cloud,
        phoenix_endpoint=settings.phoenix_endpoint,
        service_name=settings.logfire_service_name,
    )

Functions

load_config(config_path, data_model)

Generic configuration loader that validates against any Pydantic model.

Parameters:

Name Type Description Default
config_path str | Path

Path to the JSON configuration file

required
data_model type[T]

Pydantic model class for validation

required

Returns:

Type Description
T

Validated configuration instance

Source code in src/app/utils/load_configs.py
def load_config[T: BaseModel](config_path: str | Path, data_model: type[T]) -> T:
    """
    Generic configuration loader that validates against any Pydantic model.

    Args:
        config_path: Path to the JSON configuration file
        data_model: Pydantic model class for validation

    Returns:
        Validated configuration instance
    """

    try:
        with open(config_path, encoding="utf-8") as f:
            data = json.load(f)
        return data_model.model_validate(data)
    except FileNotFoundError as e:
        msg = file_not_found(config_path)
        logger.error(msg)
        raise FileNotFoundError(msg) from e
    except json.JSONDecodeError as e:
        msg = invalid_json(str(e))
        logger.error(msg)
        raise ValueError(msg) from e
    except ValidationError as e:
        msg = invalid_data_model_format(str(e))
        logger.error(msg)
        raise ValidationError(msg) from e
    except Exception as e:
        msg = failed_to_load_config(str(e))
        logger.exception(msg)
        raise Exception(msg) from e

app.utils.load_settings

Utility functions for loading application settings and configuration.

This module provides functions to load and validate application configuration from a JSON file. For environment variables, use AppEnv from app.data_models.app_models.

Classes

Functions

load_config(config_path)

Load and validate application configuration from a JSON file.

Parameters:

Name Type Description Default
config_path str

Path to the JSON configuration file.

required

Returns:

Name Type Description
ChatConfig ChatConfig

An instance of ChatConfig with validated configuration data.

Raises:

Type Description
FileNotFoundError

If the configuration file does not exist.

JSONDecodeError

If the file contains invalid JSON.

Exception

For any other unexpected errors during loading or validation.

Source code in src/app/utils/load_settings.py
def load_config(config_path: str | Path) -> ChatConfig:
    """
    Load and validate application configuration from a JSON file.

    Args:
        config_path (str | Path): Path to the JSON configuration file.

    Returns:
        ChatConfig: An instance of ChatConfig with validated configuration data.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        json.JSONDecodeError: If the file contains invalid JSON.
        Exception: For any other unexpected errors during loading or validation.
    """

    try:
        # Reason: explicit encoding keeps reads platform-independent and
        # consistent with the loader in app.utils.load_configs.
        with open(config_path, encoding="utf-8") as f:
            config_data = json.load(f)
    except FileNotFoundError as e:
        msg = file_not_found(config_path)
        logger.error(msg)
        raise FileNotFoundError(msg) from e
    except json.JSONDecodeError as e:
        msg = invalid_json(str(e))
        logger.error(msg)
        raise json.JSONDecodeError(msg, str(config_path), 0) from e
    except Exception as e:
        msg = failed_to_load_config(str(e))
        logger.exception(msg)
        raise Exception(msg) from e

    return ChatConfig.model_validate(config_data)

app.utils.log

Set up the logger with custom settings. Logs are written to a file with automatic rotation.

Functions

app.utils.log_scrubbing

Log scrubbing patterns and sensitive data filtering.

This module provides scrubbing patterns and filters to redact sensitive data from two independent output channels:

  1. Loguru (file/console logs): Uses scrub_log_record() filter with the full SENSITIVE_PATTERNS set, since Loguru has no built-in scrubbing.
  2. Logfire (OTLP trace export): Has built-in default patterns covering password, secret, credential, api_key, jwt, session, cookie, csrf, ssn, credit_card. We only supply extra patterns Logfire doesn’t cover.

Security features: - Pattern-based redaction for common secret types - Loguru filter function for file sink integration - Logfire extra patterns (additive, not duplicating built-in defaults) - Case-insensitive pattern matching

Functions

get_logfire_scrubbing_patterns()

Get extra scrubbing patterns for Logfire trace export.

Returns only patterns NOT already covered by Logfire’s built-in defaults. These are passed to logfire.ScrubbingOptions(extra_patterns=...).

Returns:

Type Description
list[str]

list[str]: List of regex pattern strings for Logfire extra scrubbing.

Example

import logfire patterns = get_logfire_scrubbing_patterns() logfire.configure(scrubbing=logfire.ScrubbingOptions(extra_patterns=patterns))

Source code in src/app/utils/log_scrubbing.py
def get_logfire_scrubbing_patterns() -> list[str]:
    """Return extra scrubbing patterns for Logfire trace export.

    Only patterns NOT already covered by Logfire's built-in defaults are
    included; pass them via ``logfire.ScrubbingOptions(extra_patterns=...)``.

    Returns:
        list[str]: Regex pattern strings for Logfire extra scrubbing.

    Example:
        >>> import logfire
        >>> patterns = get_logfire_scrubbing_patterns()
        >>> logfire.configure(scrubbing=logfire.ScrubbingOptions(extra_patterns=patterns))
    """
    # Return a fresh list so callers cannot mutate the module-level patterns.
    return [*LOGFIRE_EXTRA_PATTERNS]

scrub_log_record(record)

Scrub sensitive data from Loguru log record.

This function is intended to be used as a Loguru filter. It modifies the log record in-place by replacing sensitive patterns with [REDACTED]. Uses the full SENSITIVE_PATTERNS set since Loguru has no built-in scrubbing.

Parameters:

Name Type Description Default
record dict[str, Any]

Loguru log record dict with ‘message’ key.

required

Returns:

Name Type Description
bool bool

Always True to allow the (scrubbed) record to pass through.

Example

logger.add(“file.log”, filter=scrub_log_record)

Source code in src/app/utils/log_scrubbing.py
def scrub_log_record(record: dict[str, Any]) -> bool:
    """Scrub sensitive data from a Loguru log record.

    Intended as a Loguru filter: mutates the record in place, replacing any
    text matching SENSITIVE_PATTERNS with [REDACTED]. The full pattern set
    is applied because Loguru has no built-in scrubbing.

    Args:
        record: Loguru log record dict with a 'message' key.

    Returns:
        bool: Always True so the (scrubbed) record passes through.

    Example:
        >>> logger.add("file.log", filter=scrub_log_record)
    """
    text = record.get("message", "")
    for sensitive in SENSITIVE_PATTERNS:
        text = re.sub(sensitive, "[REDACTED]", text, flags=re.IGNORECASE)
    record["message"] = text
    return True

app.utils.login

This module provides utility functions for managing login state and initializing the environment for a given project. It includes functionality to load and save login state, perform a one-time login, and check if the user is logged in.

Classes

Functions

login(project_name, chat_env_config)

Logs in to the workspace and initializes the environment for the given project. Args: project_name (str): The name of the project to initialize. chat_env_config (AppEnv): The application environment configuration containing the API keys. Returns: None

Source code in src/app/utils/login.py
def login(project_name: str, chat_env_config: AppEnv):
    """
    Log in to external tracing/tracking workspaces for the given project.

    Configures Logfire when a LOGFIRE API key is available, and logs in to
    Weights & Biases / initializes Weave when a WANDB API key is available.

    Args:
        project_name (str): The name of the project to initialize.
        chat_env_config (AppEnv): The application environment configuration
            containing the API keys.
    Returns:
        None
    Raises:
        Exception: Wraps any unexpected failure during login/initialization.
    """

    try:
        logger.info(f"Logging in to the workspaces for project: {project_name}")
        # api_key_msg holds the actual key text when is_api_key is True.
        is_api_key, api_key_msg = get_api_key("LOGFIRE", chat_env_config)
        if is_api_key:
            logfire_conf(token=api_key_msg)
        is_api_key, api_key_msg = get_api_key("WANDB", chat_env_config)
        if is_api_key:
            try:
                os.environ.setdefault("WANDB_ERROR_REPORTING", "false")
                # Reason: Weave initializes sentry_sdk.Hub at import time.
                # Disable weave by default to prevent sentry telemetry.
                # Set WEAVE_DISABLED=false to enable weave tracing.
                os.environ.setdefault("WEAVE_DISABLED", "true")
                # Lazy imports: wandb/weave are an optional dependency group.
                from wandb import login as wandb_login  # type: ignore[reportMissingImports]
                from weave import init as weave_init  # type: ignore[reportMissingImports]

                wandb_login(key=api_key_msg)
                weave_init(project_name)
            except ImportError:
                logger.warning("wandb/weave not installed (optional: uv sync --group wandb)")
    except Exception as e:
        msg = generic_exception(str(e))
        logger.exception(e)
        raise Exception(msg) from e
    finally:
        # Best-effort hygiene: rebinds the local name only; it does not
        # wipe the key material from process memory.
        api_key_msg = ""

app.utils.paths

Centralized path resolution utilities for the application.

Functions

get_app_root()

Get the application root directory (src/app).

Returns:

Name Type Description
Path Path

Absolute path to the src/app directory.

Source code in src/app/utils/paths.py
def get_app_root() -> Path:
    """Get the application root directory (src/app).

    Returns:
        Path: Absolute path to the src/app directory.
    """
    # parents[1] == parent.parent: utils/ -> app/
    return Path(__file__).parents[1]

get_config_dir()

Get the application config directory (src/app/config).

Returns:

Name Type Description
Path Path

Absolute path to the src/app/config directory.

Source code in src/app/utils/paths.py
def get_config_dir() -> Path:
    """Get the application config directory (src/app/config).

    Returns:
        Path: Absolute path to the src/app/config directory.
    """
    return get_app_root().joinpath(CONFIGS_PATH)

get_project_root()

Get the project root directory.

Returns:

Name Type Description
Path Path

Absolute path to the project root directory.

Source code in src/app/utils/paths.py
def get_project_root() -> Path:
    """Get the project root directory.

    Returns:
        Path: Absolute path to the project root directory.
    """
    # Two levels above src/app: app/ -> src/ -> project root.
    return get_app_root().parents[1]

get_review_template_path()

Get the path to the review template file.

Returns:

Name Type Description
Path Path

Absolute path to the REVIEW_PROMPT_TEMPLATE file.

Source code in src/app/utils/paths.py
def get_review_template_path() -> Path:
    """Get the path to the review template file.

    Returns:
        Path: Absolute path to the REVIEW_PROMPT_TEMPLATE file.
    """
    return get_config_dir().joinpath(REVIEW_PROMPT_TEMPLATE)

resolve_app_path(relative_path)

Resolve a path relative to the application root.

Parameters:

Name Type Description Default
relative_path str

Path relative to src/app directory.

required

Returns:

Name Type Description
Path Path

Absolute path resolved from the application root.

Example

resolve_app_path(“datasets/peerread”) -> /full/path/to/src/app/datasets/peerread

Source code in src/app/utils/paths.py
def resolve_app_path(relative_path: str) -> Path:
    """Resolve a path relative to the application root.

    Args:
        relative_path: Path relative to the src/app directory.

    Returns:
        Path: Absolute path resolved from the application root.

    Example:
        resolve_app_path("datasets/peerread") -> /full/path/to/src/app/datasets/peerread
    """
    return get_app_root().joinpath(relative_path)

resolve_config_path(filename)

Resolve a config file path within the config directory.

Parameters:

Name Type Description Default
filename str

Name of the config file (e.g., “config_chat.json”).

required

Returns:

Name Type Description
Path Path

Absolute path to the config file.

Example

resolve_config_path(“config_chat.json”) -> /full/path/to/src/app/config/config_chat.json

Source code in src/app/utils/paths.py
def resolve_config_path(filename: str) -> Path:
    """Resolve a config file path within the config directory.

    Args:
        filename: Name of the config file (e.g., "config_chat.json").

    Returns:
        Path: Absolute path to the config file.

    Example:
        resolve_config_path("config_chat.json") ->
        /full/path/to/src/app/config/config_chat.json
    """
    return get_config_dir().joinpath(filename)

resolve_project_path(relative_path)

Resolve a path relative to the project root.

Parameters:

Name Type Description Default
relative_path str

Path relative to the project root directory.

required

Returns:

Name Type Description
Path Path

Absolute path resolved from the project root.

Source code in src/app/utils/paths.py
def resolve_project_path(relative_path: str) -> Path:
    """Resolve a path relative to the project root.

    Args:
        relative_path: Path relative to the project root directory.

    Returns:
        Path: Absolute path resolved from the project root.
    """
    return get_project_root().joinpath(relative_path)

app.utils.prompt_sanitization

Prompt input sanitization with length limits and XML delimiter wrapping.

This module provides functions to sanitize user-controlled content before interpolation into LLM prompts. It prevents prompt injection attacks by: 1. Truncating content to configurable length limits 2. Wrapping content in XML delimiters to separate data from instructions 3. Preserving content integrity (no escaping needed for LLM consumption)

Security features: - Length-limited inputs prevent token-based DoS - XML delimiters provide clear instruction/data separation - No format string interpolation vulnerabilities

Functions

sanitize_for_prompt(content, max_length, delimiter='content')

Sanitize content for inclusion in LLM prompts.

Parameters:

Name Type Description Default
content str

User-controlled content to sanitize.

required
max_length int

Maximum content length before truncation.

required
delimiter str

XML tag name for wrapping (default: “content”).

'content'

Returns:

Name Type Description
str str

Sanitized content wrapped in XML delimiters, truncated if needed.

Example

sanitize_for_prompt("user input", max_length=100) returns '&lt;content&gt;user input&lt;/content&gt;'

Source code in src/app/utils/prompt_sanitization.py
def sanitize_for_prompt(content: str, max_length: int, delimiter: str = "content") -> str:
    """Prepare user-controlled text for safe inclusion in an LLM prompt.

    The content is capped at ``max_length`` characters and wrapped in an
    XML-style tag so data is clearly separated from instructions. No
    character escaping is performed.

    Args:
        content: User-controlled content to sanitize.
        max_length: Maximum content length before truncation.
        delimiter: XML tag name for wrapping (default: "content").

    Returns:
        str: Sanitized content wrapped in XML delimiters, truncated if needed.

    Example:
        >>> sanitize_for_prompt("user input", max_length=100)
        '<content>user input</content>'
    """
    # Slicing covers both cases: content shorter than max_length is unchanged.
    clipped = content[:max_length]
    return f"<{delimiter}>{clipped}</{delimiter}>"

sanitize_paper_abstract(abstract)

Sanitize paper abstract with 5000 character limit.

Parameters:

Name Type Description Default
abstract str

Paper abstract from PeerRead dataset.

required

Returns:

Name Type Description
str str

Sanitized abstract wrapped in delimiters.

Source code in src/app/utils/prompt_sanitization.py
def sanitize_paper_abstract(abstract: str) -> str:
    """Sanitize a paper abstract, capped at 5000 characters.

    Args:
        abstract: Paper abstract from the PeerRead dataset.

    Returns:
        str: Sanitized abstract wrapped in <paper_abstract> delimiters.
    """
    return sanitize_for_prompt(abstract, 5000, "paper_abstract")

sanitize_paper_content(content, max_length=50000)

Sanitize paper body content with format string injection protection.

Unlike other sanitize functions, this also escapes curly braces to prevent Python str.format() injection when the content is interpolated into templates. Paper body content is adversary-controlled (raw PDF text) and may contain format string placeholders like {tone} or {0.class}.

Parameters:

Name Type Description Default
content str

Paper body content from PDF extraction.

required
max_length int

Maximum length of the escaped content before truncation (default: 50000). Applied after brace escaping, so the original content may be shorter than max_length when braces are present.

50000

Returns:

Name Type Description
str str

Content with braces escaped, wrapped in delimiters.

Source code in src/app/utils/prompt_sanitization.py
def sanitize_paper_content(content: str, max_length: int = 50000) -> str:
    """Sanitize paper body content with format string injection protection.

    Unlike the other sanitize helpers, curly braces are doubled so the text
    is inert under Python str.format() when later interpolated into
    templates; paper bodies come from raw PDF extraction and may contain
    placeholders such as {tone} or {0.__class__}.

    Args:
        content: Paper body content from PDF extraction.
        max_length: Maximum length of the escaped content before truncation
            (default: 50000). Applied after brace escaping, so the original
            content may be shorter than max_length when braces are present.

    Returns:
        str: Content with braces escaped, wrapped in <paper_content> delimiters.
    """
    # Braces are doubled first; the max_length cap then applies to the
    # escaped text.
    # NOTE(review): truncation can still split a doubled brace pair at the
    # cut point, leaving a lone trailing brace - confirm downstream
    # str.format() usage tolerates this.
    doubled = content.replace("{", "{{").replace("}", "}}")
    return sanitize_for_prompt(doubled, max_length=max_length, delimiter="paper_content")

sanitize_paper_title(title)

Sanitize paper title with 500 character limit.

Parameters:

Name Type Description Default
title str

Paper title from PeerRead dataset or user input.

required

Returns:

Name Type Description
str str

Sanitized title wrapped in delimiters.

Source code in src/app/utils/prompt_sanitization.py
def sanitize_paper_title(title: str) -> str:
    """Sanitize a paper title, capped at 500 characters.

    Args:
        title: Paper title from the PeerRead dataset or user input.

    Returns:
        str: Sanitized title wrapped in <paper_title> delimiters.
    """
    return sanitize_for_prompt(title, 500, "paper_title")

sanitize_review_text(review)

Sanitize review text with 50000 character limit.

Parameters:

Name Type Description Default
review str

Generated review text or user input.

required

Returns:

Name Type Description
str str

Sanitized review wrapped in delimiters.

Source code in src/app/utils/prompt_sanitization.py
def sanitize_review_text(review: str) -> str:
    """Sanitize review text, capped at 50000 characters.

    Args:
        review: Generated review text or user input.

    Returns:
        str: Sanitized review wrapped in <review_text> delimiters.
    """
    return sanitize_for_prompt(review, 50000, "review_text")

app.utils.run_context

Per-run output directory management for the application.

Provides RunContext dataclass that owns the per-run output directory structure. Each run creates a timestamped directory under output/runs/ and writes metadata.json.

Classes

RunContext dataclass

Per-run context owning the output directory for a single application run.

Created at the start of each main() invocation after the execution_id is known. Exposes path helpers for standard output files.

Attributes:

Name Type Description
engine_type str

Engine that produced this run (‘mas’, ‘cc_solo’, ‘cc_teams’).

paper_id str

PeerRead paper identifier.

execution_id str

Unique execution trace ID.

start_time datetime

Datetime when the run started.

run_dir Path

Path to the per-run output directory.

Source code in src/app/utils/run_context.py
@dataclass
class RunContext:
    """Per-run context owning the output directory for a single application run.

    Created at the start of each main() invocation after the execution_id
    is known. Exposes path helpers for standard output files.

    Attributes:
        engine_type: Engine that produced this run ('mas', 'cc_solo', 'cc_teams').
        paper_id: PeerRead paper identifier.
        execution_id: Unique execution trace ID.
        start_time: Datetime when the run started.
        run_dir: Path to the per-run output directory.
    """

    engine_type: str
    paper_id: str
    execution_id: str
    start_time: datetime
    run_dir: Path

    @classmethod
    def create(
        cls,
        engine_type: str,
        paper_id: str,
        execution_id: str,
        cli_args: dict[str, Any] | None = None,
    ) -> RunContext:
        """Create a RunContext and its output directory.

        Creates output/runs/{category}/{ts}_{engine}_{paper}_{exec_id_8}/
        and writes metadata.json. Category is ``mas`` or ``cc``.

        Args:
            engine_type: Engine identifier ('mas', 'cc_solo', 'cc_teams').
            paper_id: PeerRead paper identifier.
            execution_id: Unique execution trace ID.
            cli_args: Optional CLI arguments dict to persist in metadata.

        Returns:
            RunContext with run_dir created and metadata.json written.
        """
        # NOTE(review): naive local time; confirm UTC is not required for
        # run-directory timestamps.
        start_time = datetime.now()
        ts = start_time.strftime("%Y%m%d_%H%M%S")
        # Sanitize every externally-influenced component before building the path.
        safe_engine = _sanitize_path_component(engine_type)
        safe_paper = _sanitize_path_component(paper_id)
        safe_exec_id = _sanitize_path_component(execution_id[:8])
        dir_name = f"{ts}_{safe_engine}_{safe_paper}_{safe_exec_id}"
        # Both CC variants ('cc_solo', 'cc_teams') share the 'cc' category.
        category = "cc" if engine_type.startswith("cc") else "mas"

        run_dir = (
            OUTPUT_BASE / "runs" / category / dir_name
        ).resolve()  # CodeQL[py/path-injection]
        # Defense in depth: even after sanitization, refuse any resolved
        # path that escapes the output base directory.
        if not run_dir.is_relative_to(OUTPUT_BASE.resolve()):
            msg = f"Path traversal detected: {run_dir}"
            raise ValueError(msg)
        run_dir.mkdir(parents=True, exist_ok=True)  # CodeQL[py/path-injection]

        ctx = cls(
            engine_type=engine_type,
            paper_id=paper_id,
            execution_id=execution_id,
            start_time=start_time,
            run_dir=run_dir,
        )
        ctx._write_metadata(cli_args)
        return ctx

    def _write_metadata(self, cli_args: dict[str, Any] | None) -> None:
        """Write metadata.json to the run directory.

        Args:
            cli_args: Optional CLI arguments to include in metadata.
        """
        metadata: dict[str, Any] = {
            "engine_type": self.engine_type,
            "paper_id": self.paper_id,
            "execution_id": self.execution_id,
            "start_time": self.start_time.isoformat(),
            "cli_args": cli_args,
        }
        (self.run_dir / "metadata.json").write_text(
            json.dumps(metadata, indent=2), encoding="utf-8"
        )

    @property
    def stream_path(self) -> Path:
        """Path to the stream output file.

        Returns:
            stream.jsonl for CC engines, stream.json for MAS engine.
        """
        # Reason: CC engines use JSON Lines output; MAS writes a single JSON file.
        ext = "jsonl" if self.engine_type.startswith("cc") else "json"
        return self.run_dir / f"stream.{ext}"

    @property
    def trace_path(self) -> Path:
        """Path to the trace output file.

        Returns:
            trace.json in run_dir.
        """
        return self.run_dir / "trace.json"

    @property
    def review_path(self) -> Path:
        """Path to the review output file.

        Returns:
            review.json in run_dir.
        """
        return self.run_dir / "review.json"

    @property
    def report_path(self) -> Path:
        """Path to the report output file.

        Returns:
            report.md in run_dir.
        """
        return self.run_dir / "report.md"

    @property
    def evaluation_path(self) -> Path:
        """Path to the evaluation output file.

        Returns:
            evaluation.json in run_dir.
        """
        return self.run_dir / "evaluation.json"

    @property
    def graph_json_path(self) -> Path:
        """Path to the agent graph JSON export file.

        Returns:
            agent_graph.json in run_dir.
        """
        return self.run_dir / "agent_graph.json"

    @property
    def graph_png_path(self) -> Path:
        """Path to the agent graph PNG export file.

        Returns:
            agent_graph.png in run_dir.
        """
        return self.run_dir / "agent_graph.png"
Attributes
evaluation_path property

Path to the evaluation output file.

Returns:

Type Description
Path

evaluation.json in run_dir.

graph_json_path property

Path to the agent graph JSON export file.

Returns:

Type Description
Path

agent_graph.json in run_dir.

graph_png_path property

Path to the agent graph PNG export file.

Returns:

Type Description
Path

agent_graph.png in run_dir.

report_path property

Path to the report output file.

Returns:

Type Description
Path

report.md in run_dir.

review_path property

Path to the review output file.

Returns:

Type Description
Path

review.json in run_dir.

stream_path property

Path to the stream output file.

Returns:

Type Description
Path

stream.jsonl for CC engines, stream.json for MAS engine.

trace_path property

Path to the trace output file.

Returns:

Type Description
Path

trace.json in run_dir.

Functions
create(engine_type, paper_id, execution_id, cli_args=None) classmethod

Create a RunContext and its output directory.

Creates output/runs/{category}/{ts}_{engine}_{paper}_{exec_id_8}/ and writes metadata.json. Category is mas or cc.

Parameters:

Name Type Description Default
engine_type str

Engine identifier (‘mas’, ‘cc_solo’, ‘cc_teams’).

required
paper_id str

PeerRead paper identifier.

required
execution_id str

Unique execution trace ID.

required
cli_args dict[str, Any] | None

Optional CLI arguments dict to persist in metadata.

None

Returns:

Type Description
RunContext

RunContext with run_dir created and metadata.json written.

Source code in src/app/utils/run_context.py
@classmethod
def create(
    cls,
    engine_type: str,
    paper_id: str,
    execution_id: str,
    cli_args: dict[str, Any] | None = None,
) -> RunContext:
    """Create a RunContext and its output directory.

    Creates output/runs/{category}/{ts}_{engine}_{paper}_{exec_id_8}/
    and writes metadata.json. Category is ``mas`` or ``cc``.

    Args:
        engine_type: Engine identifier ('mas', 'cc_solo', 'cc_teams').
        paper_id: PeerRead paper identifier.
        execution_id: Unique execution trace ID.
        cli_args: Optional CLI arguments dict to persist in metadata.

    Returns:
        RunContext with run_dir created and metadata.json written.
    """
    # NOTE(review): naive local time; confirm UTC is not required for
    # run-directory timestamps.
    start_time = datetime.now()
    ts = start_time.strftime("%Y%m%d_%H%M%S")
    # Sanitize every externally-influenced component before building the path.
    safe_engine = _sanitize_path_component(engine_type)
    safe_paper = _sanitize_path_component(paper_id)
    safe_exec_id = _sanitize_path_component(execution_id[:8])
    dir_name = f"{ts}_{safe_engine}_{safe_paper}_{safe_exec_id}"
    # Both CC variants ('cc_solo', 'cc_teams') share the 'cc' category.
    category = "cc" if engine_type.startswith("cc") else "mas"

    run_dir = (
        OUTPUT_BASE / "runs" / category / dir_name
    ).resolve()  # CodeQL[py/path-injection]
    # Defense in depth: even after sanitization, refuse any resolved path
    # that escapes the output base directory.
    if not run_dir.is_relative_to(OUTPUT_BASE.resolve()):
        msg = f"Path traversal detected: {run_dir}"
        raise ValueError(msg)
    run_dir.mkdir(parents=True, exist_ok=True)  # CodeQL[py/path-injection]

    ctx = cls(
        engine_type=engine_type,
        paper_id=paper_id,
        execution_id=execution_id,
        start_time=start_time,
        run_dir=run_dir,
    )
    ctx._write_metadata(cli_args)
    return ctx

Functions

get_active_run_context()

Get the active per-run context, if any.

Returns:

Type Description
RunContext | None

The active RunContext, or None if no run is in progress.

Source code in src/app/utils/run_context.py
def get_active_run_context() -> RunContext | None:
    """Get the active per-run context, if any.

    Returns:
        The active RunContext, or None if no run is in progress.
    """
    # Reads the module-level singleton managed by set_active_run_context().
    return _active_run_context

set_active_run_context(ctx)

Set or clear the active per-run context.

Parameters:

Name Type Description Default
ctx RunContext | None

RunContext to activate, or None to clear.

required
Source code in src/app/utils/run_context.py
def set_active_run_context(ctx: RunContext | None) -> None:
    """Set or clear the active per-run context.

    Args:
        ctx: RunContext to activate, or None to clear.
    """
    # Rebind the module-level singleton read by get_active_run_context().
    global _active_run_context
    _active_run_context = ctx

app.utils.url_validation

URL validation and SSRF prevention utilities.

This module provides URL validation functionality to prevent SSRF (Server-Side Request Forgery) attacks by enforcing HTTPS-only and domain allowlisting for all external requests.

CVE Context: - CVE-2026-25580: PydanticAI SSRF vulnerability allowing information disclosure via malicious URLs in message history. This module mitigates the vulnerability by validating all URLs before HTTP requests.

Functions

validate_url(url)

Validate URL for SSRF protection.

Enforces HTTPS-only and domain allowlisting to prevent SSRF attacks.

Parameters:

Name Type Description Default
url str

URL to validate.

required

Returns:

Type Description
str

The validated URL if it passes all checks.

Raises:

Type Description
ValueError

If URL fails validation (non-HTTPS, blocked domain, malformed).

Examples:

>>> validate_url("https://raw.githubusercontent.com/data.json")
'https://raw.githubusercontent.com/data.json'
>>> validate_url("http://evil.com/secrets")
Traceback (most recent call last):
    ...
ValueError: Only HTTPS URLs allowed
>>> validate_url("https://169.254.169.254/metadata")
Traceback (most recent call last):
    ...
ValueError: URL domain not allowed: 169.254.169.254
Source code in src/app/utils/url_validation.py
def validate_url(url: str) -> str:
    """
    Validate URL for SSRF protection.

    Enforces HTTPS-only and domain allowlisting to prevent SSRF attacks.
    Host extraction uses ``urlparse(...).hostname``, which correctly strips
    credentials, the port, and IPv6 brackets, and lowercases the host —
    unlike manual ``netloc`` splitting, which mis-handles IPv6 literals.

    Args:
        url: URL to validate.

    Returns:
        The validated URL if it passes all checks.

    Raises:
        ValueError: If URL fails validation (non-HTTPS, blocked domain, malformed).

    Examples:
        >>> validate_url("https://raw.githubusercontent.com/data.json")
        'https://raw.githubusercontent.com/data.json'

        >>> validate_url("http://evil.com/secrets")
        Traceback (most recent call last):
            ...
        ValueError: Only HTTPS URLs allowed

        >>> validate_url("https://169.254.169.254/metadata")
        Traceback (most recent call last):
            ...
        ValueError: URL domain not allowed: 169.254.169.254
    """
    from urllib.parse import urlparse

    # Validate input is not empty or whitespace-only
    if not url or not url.strip():
        raise ValueError("URL cannot be empty or whitespace-only")

    # Parse URL
    try:
        parsed = urlparse(url)
    except Exception as e:
        raise ValueError(f"Malformed URL: {e}") from e

    # Enforce HTTPS-only
    if parsed.scheme != "https":
        raise ValueError("Only HTTPS URLs allowed")

    # `hostname` handles [user[:password]@]host[:port] and [IPv6] forms in
    # one step and lowercases the result, so allowlist matching is
    # case-insensitive as DNS names are.
    domain = parsed.hostname
    if not domain:
        raise ValueError("URL must contain a domain")

    # Check domain against allowlist
    if domain not in ALLOWED_DOMAINS:
        # Error message contains only domain, not full URL (prevents log injection)
        raise ValueError(f"URL domain not allowed: {domain}")

    return url

app.utils.utils

This module provides utility functions and context managers for handling configurations, error handling, and setting up agent environments.

Functions:

Name Description
load_config

Load and validate configuration from a JSON file (takes a str path, returns a Config).

print_research_result

Output a structured summary of the research topic (takes a result Dict and a Usage object).

error_handling_context

Context manager for handling errors during operations (takes an operation name str and an optional Console).

setup_agent_env

Set up the agent environment based on the provided configuration (takes a Config and an optional Console, returns an AgentConfig).

Classes

Functions

log_research_result(summary, usage)

Prints the research summary and usage details in a formatted manner.

Parameters:

Name Type Description Default
summary Dict

A dictionary containing the research summary with keys ‘topic’, ‘key_points’, ‘key_points_explanation’, and ‘conclusion’.

required
usage RunUsage

An object containing usage details to be printed.

required
Source code in src/app/utils/utils.py
def log_research_result(summary: ResearchSummary, usage: RunUsage) -> None:
    """
    Logs the research summary and usage details in a formatted manner.

    Args:
        summary (ResearchSummary): Structured research summary exposing
            'topic', 'key_points', 'key_points_explanation', and 'conclusion'.
        usage (RunUsage): An object containing usage details to be logged.
    """

    logger.info(f"\n=== Research Summary: {summary.topic} ===")
    logger.info("\nKey Points:")
    for i, point in enumerate(summary.key_points, 1):
        logger.info(f"{i}. {point}")
    logger.info("\nKey Points Explanation:")
    for i, point in enumerate(summary.key_points_explanation, 1):
        logger.info(f"{i}. {point}")
    logger.info(f"\nConclusion: {summary.conclusion}")
    # dict(summary) works for Pydantic models; logs the field names only.
    logger.info(f"\nResponse structure: {list(dict(summary).keys())}")
    logger.info(usage)

examples._helpers

Shared utilities for example scripts.

Functions

print_mas_result(output)

Print MAS example result summary to stdout.

Parameters:

Name Type Description Default
output dict[str, Any] | None

Result dict from app.main() with optional ‘composite_result’ key, or None if the run failed.

required
Source code in src/examples/_helpers.py
def print_mas_result(output: dict[str, Any] | None) -> None:
    """Print MAS example result summary to stdout.

    Args:
        output: Result dict from app.main() with optional 'composite_result' key,
                or None if the run failed.
    """
    if output is not None:
        composite = output.get("composite_result")
        if composite is not None:
            print(f"Composite score  : {composite.composite_score:.3f}")
            print(f"Recommendation   : {composite.recommendation}")
            print(f"Tiers enabled    : {composite.tiers_enabled}")
        else:
            print("Run completed — no composite result produced (eval may be skipped).")
    else:
        print("Run completed — no result returned (download-only or error).")

examples.basic_evaluation

Basic evaluation example using the three-tier EvaluationPipeline.

Purpose

Demonstrates the plugin-based evaluation system with realistic paper/review data. Shows how to construct a GraphTraceData trace, configure a pipeline, and interpret the resulting CompositeResult.

Prerequisites
  • API key for the Tier 2 LLM provider set in .env (e.g. OPENAI_API_KEY) or run with tiers_enabled=[1, 3] to skip LLM calls entirely.
  • No dataset download required: uses synthetic data.
Expected output

Composite score in [0.0, 1.0] and a recommendation string such as “accept”, “weak_accept”, “weak_reject”, or “reject”.

Usage

uv run python src/examples/basic_evaluation.py

Classes

Functions

run_example() async

Run a complete three-tier evaluation with synthetic data.

Tier 1 (Traditional Metrics) and Tier 3 (Graph Analysis) run locally. Tier 2 (LLM-as-Judge) requires an API key; set tiers_enabled=[1, 3] in JudgeSettings to skip it without an API key.

Returns:

Type Description
CompositeResult

CompositeResult with composite_score and recommendation.

Source code in src/examples/basic_evaluation.py
async def run_example() -> CompositeResult:
    """Run a complete three-tier evaluation with synthetic data.

    Tier 1 (Traditional Metrics) and Tier 3 (Graph Analysis) run locally.
    Tier 2 (LLM-as-Judge) requires an API key; set tiers_enabled=[1, 3]
    in JudgeSettings to skip it without an API key.

    Returns:
        CompositeResult with composite_score and recommendation.
    """
    # Tier 2 is disabled so the example needs no LLM API key.
    pipeline = EvaluationPipeline(settings=JudgeSettings(tiers_enabled=[1, 3]))

    paper = _make_synthetic_paper()
    trace = _make_synthetic_trace()

    # A plausible agent-generated review derived from the synthetic paper.
    review_body = (
        "This paper introduces an efficient attention mechanism for transformers. "
        "The empirical evaluation is solid with clear ablations. "
        "The memory reduction claims are well-supported. "
        "Recommended for acceptance pending minor revisions to the related work section."
    )
    agent_review = f"Review of: {paper.title}\n\n" + review_body

    references = [r.comments for r in paper.reviews if r.comments]
    result = await pipeline.evaluate_comprehensive(
        paper=paper.abstract,
        review=agent_review,
        execution_trace=trace,
        reference_reviews=references,
    )

    logger.info(
        f"Evaluation complete — score: {result.composite_score:.3f}, "
        f"recommendation: {result.recommendation}"
    )
    return result

examples.cc_solo

CC solo example: run Claude Code in headless solo mode.

Purpose

Demonstrates how to invoke the Claude Code CLI in solo (single-agent) headless mode using run_cc_solo(). Includes a check_cc_available() guard that prints a helpful message if the ‘claude’ CLI is not installed.

Prerequisites
  • Claude Code CLI installed and available on PATH (check with claude --version).
  • Authenticated Claude Code session (run claude interactively once to log in).
  • No LLM API keys required: CC uses its own authenticated session.
Expected output

A CCResult with execution_id and output_data from the CC JSON response. The review text extracted from the result is printed to stdout. If ‘claude’ is not on PATH, a helpful installation message is printed and the example exits without error.

Usage

uv run python src/examples/cc_solo.py

Classes

Functions

run_example() async

Run Claude Code in solo headless mode for paper review.

Checks CC availability first. If ‘claude’ CLI is missing, prints an installation hint and returns None. Otherwise builds a non-empty query using build_cc_query() and invokes run_cc_solo() with a timeout.

Returns:

Type Description
CCResult | None

CCResult with execution_id and output_data, or None if CC unavailable.

Source code in src/examples/cc_solo.py
async def run_example() -> CCResult | None:
    """Run Claude Code in solo headless mode for paper review.

    Checks CC availability first. If the 'claude' CLI is missing, prints an
    installation hint and returns None. Otherwise builds a non-empty query
    via build_cc_query() and invokes run_cc_solo() with a timeout.

    Returns:
        CCResult with execution_id and output_data, or None if CC unavailable.
    """
    # Bail out gracefully when the CLI is not installed.
    if not check_cc_available():
        install_hint = (
            "Claude Code CLI not found on PATH.\n"
            "Install it from https://claude.ai/code and authenticate with `claude`.\n"
            "Skipping CC solo example."
        )
        print(install_hint)
        return None

    solo_query = build_cc_query("", paper_id=_PAPER_ID, cc_teams=False)
    logger.info(f"CC solo: query={solo_query!r}")

    cc_result = run_cc_solo(solo_query, timeout=_TIMEOUT_SECONDS)

    logger.info(
        f"CC solo completed — execution_id={cc_result.execution_id}, "
        f"output_keys={list(cc_result.output_data.keys())}"
    )
    return cc_result

examples.cc_teams

CC teams example: run Claude Code in agent-teams orchestration mode.

Purpose

Demonstrates how to invoke Claude Code in teams mode using run_cc_teams(). Teams mode sets CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 so CC can spawn teammate agents for parallel task execution. Includes a check_cc_available() guard that prints a helpful message if ‘claude’ is not on PATH.

Prerequisites
  • Claude Code CLI installed and available on PATH (check with claude --version).
  • Authenticated Claude Code session (run claude interactively once to log in).
  • No LLM API keys required: CC uses its own authenticated session.
Expected output

A CCResult with team_artifacts populated from the JSONL stream events. The number of TeamCreate and Task events is printed to stdout. If ‘claude’ is not on PATH, a helpful installation message is printed and the example exits without error.

Usage

uv run python src/examples/cc_teams.py

Classes

Functions

run_example() async

Run Claude Code in agent-teams orchestration mode for paper review.

Checks CC availability first. If ‘claude’ CLI is missing, prints an installation hint and returns None. Otherwise builds a teams-mode query using build_cc_query(cc_teams=True) and invokes run_cc_teams() which sets the CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 environment variable.

Returns:

Type Description
CCResult | None

CCResult with team_artifacts from stream events, or None if CC unavailable.

Source code in src/examples/cc_teams.py
async def run_example() -> CCResult | None:
    """Run Claude Code in agent-teams orchestration mode for paper review.

    Checks CC availability first. If the 'claude' CLI is missing, prints an
    installation hint and returns None. Otherwise builds a teams-mode query
    via build_cc_query(cc_teams=True) and invokes run_cc_teams(), which sets
    the CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 environment variable.

    Returns:
        CCResult with team_artifacts from stream events, or None if CC unavailable.
    """
    # Bail out gracefully when the CLI is not installed.
    if not check_cc_available():
        print(
            "Claude Code CLI not found on PATH.\n"
            "Install it from https://claude.ai/code and authenticate with `claude`.\n"
            "Skipping CC teams example."
        )
        return None

    # cc_teams=True prepends "Use a team of agents." to encourage CC to spawn teammates
    teams_query = build_cc_query("", paper_id=_PAPER_ID, cc_teams=True)
    logger.info(f"CC teams: query={teams_query!r}")

    teams_result = run_cc_teams(teams_query, timeout=_TIMEOUT_SECONDS)

    # Tally task lifecycle events in a single pass over the stream artifacts.
    started = 0
    completed = 0
    for event in teams_result.team_artifacts:
        subtype = event.get("subtype")
        if subtype == "task_started":
            started += 1
        elif subtype == "task_completed":
            completed += 1

    logger.info(
        f"CC teams completed — execution_id={teams_result.execution_id}, "
        f"task_started={started}, task_completed={completed}"
    )
    return teams_result

examples.engine_comparison

Engine comparison example: MAS vs Claude Code evaluation.

Purpose

Demonstrates how to compare evaluation scores between: - Multi-LLM MAS (PydanticAI agents) - Single-LLM MAS (baseline) - Claude Code headless (optional, requires CC artifacts)

Uses CCTraceAdapter to load CC execution artifacts and feed them into the EvaluationPipeline for apples-to-apples comparison.

Prerequisites

For MAS evaluation: API key in .env (or use tiers_enabled=[1, 3]). For CC comparison: Collect CC artifacts first using the scripts: scripts/collect-cc-traces/collect-cc-solo.sh # solo mode scripts/collect-cc-traces/collect-cc-teams.sh # teams mode Artifacts are stored in ~/.claude/teams/ and ~/.claude/tasks/ during interactive sessions, or parsed from raw_stream.jsonl in headless mode.

Usage

uv run python src/examples/engine_comparison.py

Classes

Functions

evaluate_mas(trace, label) async

Run Tier 1 + Tier 3 evaluation for a given execution trace.

Parameters:

Name Type Description Default
trace GraphTraceData

GraphTraceData from MAS execution.

required
label str

Human-readable label for logging.

required

Returns:

Type Description
CompositeResult

CompositeResult with composite_score and recommendation.

Source code in src/examples/engine_comparison.py
async def evaluate_mas(trace: GraphTraceData, label: str) -> CompositeResult:
    """Run Tier 1 + Tier 3 evaluation for a given execution trace.

    Args:
        trace: GraphTraceData from MAS execution.
        label: Human-readable label for logging.

    Returns:
        CompositeResult with composite_score and recommendation.
    """
    # Tier 2 (LLM judge) is skipped for the example, so no API key is needed.
    pipeline = EvaluationPipeline(settings=JudgeSettings(tiers_enabled=[1, 3]))

    outcome = await pipeline.evaluate_comprehensive(
        paper=_PAPER_ABSTRACT,
        review=_AGENT_REVIEW,
        execution_trace=trace,
        reference_reviews=[_REFERENCE_REVIEW],
    )
    logger.info(f"{label}: score={outcome.composite_score:.3f}, rec={outcome.recommendation}")
    return outcome

load_cc_trace(artifacts_dir)

Load CC execution artifacts into GraphTraceData.

Parameters:

Name Type Description Default
artifacts_dir Path

Path to CC artifact directory (teams or solo mode). Teams mode: contains config.json with ‘members’ array. Solo mode: contains metadata.json + tool_calls.jsonl.

required

Returns:

Type Description
GraphTraceData | None

GraphTraceData parsed from artifacts, or None if directory missing.

Source code in src/examples/engine_comparison.py
def load_cc_trace(artifacts_dir: Path) -> GraphTraceData | None:
    """Load CC execution artifacts into GraphTraceData.

    Args:
        artifacts_dir: Path to CC artifact directory (teams or solo mode).
            Teams mode: contains config.json with 'members' array.
            Solo mode: contains metadata.json + tool_calls.jsonl.

    Returns:
        GraphTraceData parsed from artifacts, or None if directory missing.
    """
    # CC comparison is optional; a missing directory is not an error.
    if not artifacts_dir.exists():
        logger.warning(f"CC artifacts not found at {artifacts_dir}. Skipping CC comparison.")
        return None

    try:
        cc_adapter = CCTraceAdapter(artifacts_dir)
        cc_trace = cc_adapter.parse()
        logger.info(f"Loaded CC trace (mode={cc_adapter.mode}): {cc_trace.execution_id}")
        return cc_trace
    except ValueError as err:
        logger.error(f"Failed to parse CC artifacts: {err}")
        return None

run_example() async

Compare MAS multi-agent, MAS single-agent, and optionally CC evaluation scores.

Returns:

Type Description
dict[str, CompositeResult]

Dict mapping engine label to CompositeResult.

Source code in src/examples/engine_comparison.py
async def run_example() -> dict[str, CompositeResult]:
    """Compare MAS multi-agent, MAS single-agent, and optionally CC evaluation scores.

    Returns:
        Dict mapping engine label to CompositeResult.
    """
    scores: dict[str, CompositeResult] = {}

    # The evaluation target: multi-LLM MAS.
    scores["MAS-MultiLLM"] = await evaluate_mas(_MAS_TRACE, label="MAS-MultiLLM")

    # Baseline: single-LLM MAS.
    scores["MAS-SingleLLM"] = await evaluate_mas(_BASELINE_TRACE, label="MAS-SingleLLM")

    # Claude Code comparison is optional — it needs previously collected artifacts.
    cc_trace = load_cc_trace(Path.home() / ".claude" / "teams" / "evaluation-run")
    if cc_trace is not None:
        scores["ClaudeCode"] = await evaluate_mas(cc_trace, label="ClaudeCode")

    return scores

examples.judge_settings_customization

JudgeSettings customization example.

Purpose

Demonstrates how to configure the evaluation pipeline via JudgeSettings: - Environment variable overrides (JUDGE_ prefix) - Programmatic settings modification - Timeout adjustment, tier selection, provider configuration

Prerequisites

None — JudgeSettings is pure Python/Pydantic, no API keys required.

Environment variable override pattern

All settings can be overridden via JUDGE_ in .env or shell:

JUDGE_TIER2_PROVIDER=anthropic
JUDGE_TIER1_MAX_SECONDS=2.0
JUDGE_TIERS_ENABLED=[1,3]

Pydantic-settings reads these automatically when JudgeSettings() is created.

Usage

uv run python src/examples/judge_settings_customization.py

Classes

Functions

example_composite_thresholds()

Adjust composite score thresholds for stricter evaluation.

Returns:

Type Description
JudgeSettings

JudgeSettings with raised acceptance thresholds.

Source code in src/examples/judge_settings_customization.py
def example_composite_thresholds() -> JudgeSettings:
    """Adjust composite score thresholds for stricter evaluation.

    Returns:
        JudgeSettings with raised acceptance thresholds.
    """
    # Stricter bar: higher scores required for accept / weak_accept,
    # lower boundary for weak_reject.
    judge_settings = JudgeSettings(
        composite_accept_threshold=0.85,
        composite_weak_accept_threshold=0.65,
        composite_weak_reject_threshold=0.35,
        fallback_strategy="tier1_only",
    )
    logger.info(
        f"Thresholds — accept: {judge_settings.composite_accept_threshold}, "
        f"weak_accept: {judge_settings.composite_weak_accept_threshold}, "
        f"weak_reject: {judge_settings.composite_weak_reject_threshold}"
    )
    return judge_settings

example_provider_selection()

Switch the Tier 2 LLM judge to a specific provider.

Returns:

Type Description
JudgeSettings

JudgeSettings configured for Anthropic as Tier 2 provider.

Source code in src/examples/judge_settings_customization.py
def example_provider_selection() -> JudgeSettings:
    """Switch the Tier 2 LLM judge to a specific provider.

    Returns:
        JudgeSettings configured for Anthropic as Tier 2 provider.
    """
    # Primary provider Anthropic, with OpenAI as the fallback.
    judge_settings = JudgeSettings(
        tier2_provider="anthropic",
        tier2_model="claude-haiku-4-5",
        tier2_fallback_provider="openai",
    )
    logger.info(
        f"Tier 2 provider: {judge_settings.tier2_provider} / {judge_settings.tier2_model}, "
        f"fallback: {judge_settings.tier2_fallback_provider}"
    )
    return judge_settings

example_tier_selection()

Enable only Tier 1 and Tier 3 (no LLM calls, no API key needed).

Returns:

Type Description
JudgeSettings

JudgeSettings with Tier 2 disabled.

Source code in src/examples/judge_settings_customization.py
def example_tier_selection() -> JudgeSettings:
    """Enable only Tier 1 and Tier 3 (no LLM calls, no API key needed).

    Returns:
        JudgeSettings with Tier 2 disabled.

    Raises:
        AssertionError: If Tier 2 is unexpectedly still enabled.
    """
    settings = JudgeSettings(tiers_enabled=[1, 3])
    enabled = settings.get_enabled_tiers()
    logger.info(f"Enabled tiers: {sorted(enabled)}")
    # Explicit raise rather than `assert`: assert statements are stripped
    # under `python -O`, which would silently skip this sanity check.
    if settings.is_tier_enabled(2):
        raise AssertionError("Tier 2 should be disabled")
    return settings

example_timeout_adjustment()

Adjust tier timeouts for slower or faster environments.

Returns:

Type Description
JudgeSettings

JudgeSettings with increased timeouts suitable for larger models.

Source code in src/examples/judge_settings_customization.py
def example_timeout_adjustment() -> JudgeSettings:
    """Adjust tier timeouts for slower or faster environments.

    Returns:
        JudgeSettings with increased timeouts suitable for larger models.
    """
    # Generous per-tier budgets: BERTScore on long abstracts (T1), slow
    # LLM providers (T2), and larger graphs (T3), within a 60s total.
    judge_settings = JudgeSettings(
        tier1_max_seconds=2.0,
        tier2_max_seconds=30.0,
        tier3_max_seconds=20.0,
        total_max_seconds=60.0,
    )
    logger.info(
        f"Timeouts — T1: {judge_settings.tier1_max_seconds}s, "
        f"T2: {judge_settings.tier2_max_seconds}s, "
        f"T3: {judge_settings.tier3_max_seconds}s"
    )
    return judge_settings

examples.mas_multi_agent

MAS multi-agent example: full 4-agent delegation via app.main().

Purpose

Demonstrates the full MAS execution mode where the manager agent delegates tasks to all three sub-agents: researcher, analyst, and synthesiser. All include_* flags are True, enabling the complete multi-agent review workflow.

Prerequisites
  • API key for the default LLM provider set in .env (e.g. OPENAI_API_KEY)
  • PeerRead sample dataset downloaded (run make app_quickstart or make setup_dataset to fetch samples).
Expected output

A ReviewGenerationResult from the full 4-agent pipeline (manager + researcher + analyst + synthesiser) for paper ‘1105.1072’. The composite evaluation score and recommendation are printed to stdout.

Usage

uv run python src/examples/mas_multi_agent.py

Functions

run_example() async

Run the MAS pipeline in full multi-agent mode (4 agents).

Uses app.main() with all include_* flags set to True so that the manager delegates research, analysis, and synthesis to specialist sub-agents. The researcher agent is equipped with DuckDuckGo search and PeerRead tools.

Returns:

Type Description
dict[str, Any] | None

Dictionary with ‘composite_result’ and ‘graph’ keys, or None if the

dict[str, Any] | None

run fails (e.g. missing dataset, API key not set).

Source code in src/examples/mas_multi_agent.py
async def run_example() -> dict[str, Any] | None:
    """Run the MAS pipeline in full multi-agent mode (4 agents).

    Uses app.main() with all include_* flags set to True so that the manager
    delegates research, analysis, and synthesis to specialist sub-agents.
    The researcher agent is equipped with DuckDuckGo search and PeerRead tools.

    Returns:
        Dictionary with 'composite_result' and 'graph' keys, or None if the
        run fails (e.g. missing dataset, API key not set).
    """
    logger.info(f"Starting MAS multi-agent example for paper {_PAPER_ID}")

    outcome = await main(
        paper_id=_PAPER_ID,
        include_researcher=True,
        include_analyst=True,
        include_synthesiser=True,
        enable_review_tools=True,
        skip_eval=False,
    )

    # Only log the score when a composite result was actually produced.
    composite = None if outcome is None else outcome.get("composite_result")
    if composite is not None:
        logger.info(
            f"MAS multi-agent complete — score: {composite.composite_score:.3f}, "
            f"recommendation: {composite.recommendation}"
        )
    return outcome

examples.mas_single_agent

MAS single-agent example: manager-only mode via app.main().

Purpose

Demonstrates the minimal MAS execution mode where the manager agent handles the entire review workflow without delegating to sub-agents (researcher, analyst, synthesiser). All include_* flags are False.

Prerequisites
  • API key for the default LLM provider set in .env (e.g. OPENAI_API_KEY)
  • PeerRead sample dataset downloaded (run make app_quickstart or make setup_dataset to fetch samples).
Expected output

A ReviewGenerationResult or ResearchResult from the manager agent with a structured peer review for paper ‘1105.1072’. The result is printed to stdout after the evaluation pipeline completes.

Usage

uv run python src/examples/mas_single_agent.py

Functions

run_example() async

Run the MAS pipeline in manager-only (single-agent) mode.

Uses app.main() with all include_* flags set to False so that the manager agent processes the full review workflow without delegation to sub-agents. Tier 2 (LLM judge) is skipped to avoid requiring a second API key.

Returns:

Type Description
dict[str, Any] | None

Dictionary with ‘composite_result’ and ‘graph’ keys, or None if the

dict[str, Any] | None

run fails (e.g. missing dataset, API key not set).

Source code in src/examples/mas_single_agent.py
async def run_example() -> dict[str, Any] | None:
    """Run the MAS pipeline in manager-only (single-agent) mode.

    Uses app.main() with all include_* flags set to False so that the manager
    agent processes the full review workflow without delegation to sub-agents.
    Tier 2 (LLM judge) is skipped to avoid requiring a second API key.

    Returns:
        Dictionary with 'composite_result' and 'graph' keys, or None if the
        run fails (e.g. missing dataset, API key not set).
    """
    logger.info(f"Starting MAS single-agent example for paper {_PAPER_ID}")

    outcome = await main(
        paper_id=_PAPER_ID,
        include_researcher=False,
        include_analyst=False,
        include_synthesiser=False,
        enable_review_tools=True,
        skip_eval=False,
    )

    # Only log the score when a composite result was actually produced.
    composite = None if outcome is None else outcome.get("composite_result")
    if composite is not None:
        logger.info(
            f"MAS single-agent complete — score: {composite.composite_score:.3f}, "
            f"recommendation: {composite.recommendation}"
        )
    return outcome

examples.sweep_benchmark

Sweep benchmark example: SweepRunner with SweepConfig.

Purpose

Demonstrates how to configure and run a composition sweep using SweepRunner and SweepConfig. A sweep evaluates multiple agent compositions across one or more papers and repetitions for statistical comparison of results.

Prerequisites
  • API key for the default LLM provider set in .env (e.g. OPENAI_API_KEY)
  • PeerRead sample dataset downloaded (run make app_quickstart or make setup_dataset to fetch samples).
Expected output

SweepRunner executes each composition (manager-only, researcher-only, full 3-agent) on paper ‘1105.1072’ for 1 repetition and prints a summary table of composite scores per composition. Output is written to a temporary directory that is removed after the example completes.

Usage

uv run python src/examples/sweep_benchmark.py

Classes

Functions

run_example() async

Run the sweep benchmark with 3 compositions, 1 paper, 1 repetition.

Results are written to a temporary directory that is cleaned up after the example completes.

Returns:

Type Description
list[tuple[AgentComposition, CompositeResult]]

List of (AgentComposition, CompositeResult) tuples from the sweep.

Source code in src/examples/sweep_benchmark.py
async def run_example() -> list[tuple[AgentComposition, CompositeResult]]:
    """Execute a small demonstration sweep: 3 compositions, 1 paper, 1 repetition.

    All sweep output goes to a throwaway directory that is deleted once the
    run finishes; only the in-memory results survive.

    Returns:
        List of (AgentComposition, CompositeResult) tuples from the sweep.
    """
    with tempfile.TemporaryDirectory(prefix="sweep_example_") as scratch:
        sweep_cfg = _build_sweep_config(Path(scratch))
        sweep_runner = SweepRunner(sweep_cfg)

        logger.info(
            f"Starting sweep: {len(sweep_cfg.compositions)} compositions, "
            f"{sweep_cfg.repetitions} repetition(s), paper_ids={sweep_cfg.paper_ids}"
        )

        await sweep_runner.run()

        logger.info(f"Sweep complete — {len(sweep_runner.results)} result(s)")
        # Copy the list before the context manager wipes the scratch directory.
        collected = list(sweep_runner.results)

    return collected

gui.components.footer

Functions

render_footer(footer_caption)

Render the page footer.

Source code in src/gui/components/footer.py
4
5
6
7
def render_footer(footer_caption: str) -> None:
    """Render the page footer: a divider followed by the caption text.

    Args:
        footer_caption: Caption text displayed beneath the divider.
    """
    divider()
    caption(footer_caption)

gui.components.header

Functions

render_header(header_title)

Render the page header with title.

Source code in src/gui/components/header.py
4
5
6
7
def render_header(header_title: str) -> None:
    """Render the page header: the title followed by a divider.

    Args:
        header_title: Title text displayed above the divider.
    """
    title(header_title)
    divider()

gui.components.output

Output rendering component with type-aware dispatch.

Renders results using appropriate Streamlit widgets based on the result type: st.json() for dicts and Pydantic models, st.markdown() for strings, and st.write() as a fallback.

Functions

render_output(result=None, info_str=None, output_type=None)

Renders output using type-appropriate Streamlit widgets.

Parameters:

Name Type Description Default
result Any

The content to be displayed. Dispatches to st.json() for dicts/Pydantic models, st.markdown() for strings, st.write() for other types.

None
info_str str

Info message displayed when result is None/falsy.

None
output_type str

The type hint for the result content.

None
Source code in src/gui/components/output.py
def render_output(
    result: Any = None,
    info_str: str | None = None,
    output_type: str | None = None,
) -> None:
    """Display *result* with the Streamlit widget best suited to its type.

    Dispatch order: Pydantic models and dicts go to ``st.json()``, strings to
    ``st.markdown()``, anything else to ``st.write()`` via an empty container.
    A None/falsy result shows ``info_str`` as an informational message instead.

    Args:
        result: Content to render.
        info_str: Message shown when there is no result to display.
        output_type: Type hint for the result content (not read by this body).
    """
    if not result:
        st.info(info_str)
        return

    if isinstance(result, BaseModel):
        st.json(result.model_dump(), expanded=True)
    elif isinstance(result, dict):
        st.json(cast(dict[str, Any], result), expanded=True)
    elif isinstance(result, str):
        st.markdown(result)
    else:
        placeholder = st.empty()
        placeholder.write(result)

gui.components.prompts

Functions

render_prompt_editor(prompt_name, prompt_value, height=150)

Render a read-only prompt text area for display.

Parameters:

Name Type Description Default
prompt_name str

Snake_case prompt key used to generate the label.

required
prompt_value str

Current prompt text content.

required
height int

Text area height in pixels.

150

Returns:

Type Description
str | None

The displayed prompt value (always unchanged since field is read-only).

Source code in src/gui/components/prompts.py
def render_prompt_editor(prompt_name: str, prompt_value: str, height: int = 150) -> str | None:
    """Show a single prompt in a disabled (read-only) text area.

    Args:
        prompt_name: Snake_case prompt key; converted to a title-case label.
        prompt_value: Prompt text to display.
        height: Text area height in pixels.

    Returns:
        The displayed prompt value (unchanged, since the field is disabled).
    """
    label = prompt_name.replace("_", " ").title()
    return text_area(
        label,
        value=prompt_value,
        height=height,
        disabled=True,
        help="Read-only. Edit config_chat.json to modify prompts.",
    )

gui.components.sidebar

Functions

render_sidebar(sidebar_title, execution_state='idle')

Render sidebar with page navigation, Phoenix trace link, and execution indicator.

Parameters:

Name Type Description Default
sidebar_title str

Title to display in the sidebar.

required
execution_state str

Current execution state — ‘idle’, ‘running’, ‘completed’, or ‘error’. When ‘running’, an in-progress indicator is shown at the top of the sidebar.

'idle'

Returns:

Type Description
str

Selected page name from the radio button selection.

Source code in src/gui/components/sidebar.py
def render_sidebar(sidebar_title: str, execution_state: str = "idle") -> str:
    """Draw the sidebar: title, run indicator, page navigation, Phoenix link.

    Args:
        sidebar_title: Heading shown at the top of the sidebar.
        execution_state: One of 'idle', 'running', 'completed', or 'error'.
            While 'running', an in-progress notice is displayed first.

    Returns:
        The page name chosen via the navigation radio.
    """
    sidebar.title(sidebar_title)

    # S8-F3.3: surface in-progress state to users/assistive tech (WCAG 4.1.3)
    if execution_state == "running":
        sidebar.info("Execution in progress…")

    # S8-F8.1: WCAG 1.3.1, 2.4.6 — real label, visually hidden; the widget
    # key keeps the selection stable across Streamlit reruns (AC4)
    page_choice = sidebar.radio("Navigation", PAGES, label_visibility="hidden", key="sidebar_tab")

    # STORY-010: optional Phoenix trace viewer in a collapsed expander
    with sidebar.expander("Tracing (optional)", expanded=False):
        st.markdown(
            f"[Open Phoenix Traces (opens in new tab)]({PHOENIX_DEFAULT_ENDPOINT})",
            help="View detailed execution traces in Arize Phoenix",
        )
        st.caption("Phoenix must be running locally on port 6006")

    return page_choice

gui.config.config

GUI configuration constants and environment-aware URL resolution.

Functions

resolve_service_url(port)

Resolve a service URL for the given port based on the current environment.

Detection chain (first match wins):

1. PHOENIX_ENDPOINT env var — explicit user override
2. GitHub Codespaces — CODESPACE_NAME + GITHUB_CODESPACES_PORT_FORWARDING_DOMAIN
3. Gitpod — GITPOD_WORKSPACE_URL
4. Fallback — http://localhost:{port}

Parameters:

Name Type Description Default
port int

The port number the service listens on.

required

Returns:

Name Type Description
str str

A fully-qualified URL for the service appropriate to the environment.

Example

>>> url = resolve_service_url(6006)
>>> url.startswith("http")
True

Source code in src/gui/config/config.py
def resolve_service_url(port: int) -> str:
    """Resolve a service URL for the given port based on the current environment.

    Detection chain (first match wins):
    1. ``PHOENIX_ENDPOINT`` env var — explicit user override
    2. GitHub Codespaces — ``CODESPACE_NAME`` + ``GITHUB_CODESPACES_PORT_FORWARDING_DOMAIN``
    3. Gitpod — ``GITPOD_WORKSPACE_URL``
    4. Fallback — ``http://localhost:{port}``

    Args:
        port (int): The port number the service listens on.

    Returns:
        str: A fully-qualified URL for the service appropriate to the environment.

    Example:
        >>> url = resolve_service_url(6006)
        >>> url.startswith("http")
        True
    """
    # Highest priority: an explicit endpoint override wins unconditionally.
    override = os.environ.get("PHOENIX_ENDPOINT")
    if override:
        return override

    # GitHub Codespaces: both variables must be present to build the URL.
    cs_name = os.environ.get("CODESPACE_NAME")
    cs_domain = os.environ.get("GITHUB_CODESPACES_PORT_FORWARDING_DOMAIN")
    if cs_name and cs_domain:
        return f"https://{cs_name}-{port}.{cs_domain}/"

    # Gitpod prefixes the port onto the workspace host:
    # https://my-workspace.gitpod.io → https://6006-my-workspace.gitpod.io/
    workspace = os.environ.get("GITPOD_WORKSPACE_URL")
    if workspace:
        host_part = workspace.removeprefix("https://")
        return f"https://{port}-{host_part}/"

    # Plain local development.
    return f"http://localhost:{port}"

gui.config.styling

GUI theming utilities.

Provides helper functions that read the active Streamlit theme (light or dark) and return colors for custom elements such as the Pyvis agent graph.

Theme colors are defined in .streamlit/config.toml via the native [theme.dark] and [theme.light] sections. Users switch themes through Streamlit’s built-in Settings menu (hamburger icon → Settings → Theme).

The THEMES dict below mirrors those config values so that non-Streamlit components (Pyvis, custom HTML) can access the palette at runtime.

Functions

add_custom_styling(page_title)

Configure the Streamlit page layout.

Parameters:

Name Type Description Default
page_title str

Title shown in the browser tab.

required
Source code in src/gui/config/styling.py
def add_custom_styling(page_title: str) -> None:
    """Configure the Streamlit page (title, icon, layout, sidebar state).

    Args:
        page_title: Title shown in the browser tab.
    """
    # Fix: pass the title through directly — the previous f"{page_title}"
    # wrapper was a no-op for the declared str type.
    set_page_config(
        page_title=page_title,
        page_icon="🤖",
        layout="wide",
        initial_sidebar_state="expanded",
    )

get_active_theme()

Get the active theme dict based on Streamlit’s current mode.

Returns:

Type Description
dict[str, str]

dict[str, str]: Theme color mapping with keys like primaryColor, accentColor, etc.

Source code in src/gui/config/styling.py
def get_active_theme() -> dict[str, str]:
    """Return the color palette for whichever theme Streamlit is using.

    Returns:
        dict[str, str]: Theme color mapping with keys like ``primaryColor``,
            ``accentColor``, etc.
    """
    active_name = get_active_theme_name()
    return THEMES[active_name]

get_active_theme_name()

Get the name of the currently active theme.

Detects Streamlit’s active theme (light or dark) and returns the corresponding theme name from :data:THEMES.

Returns:

Name Type Description
str str

Theme name string ("nord_light" or "expanse_dark").

Source code in src/gui/config/styling.py
def get_active_theme_name() -> str:
    """Return the :data:`THEMES` key matching Streamlit's light/dark mode.

    Returns:
        str: Theme name string (``"nord_light"`` or ``"expanse_dark"``).
    """
    if _is_streamlit_light_mode():
        return _LIGHT_THEME
    return _DARK_THEME

get_graph_font_color()

Get the font color for Pyvis graph labels based on active theme.

Returns "#000000" for light themes (>= 4.5:1 contrast on light bg) and "#ECEFF4" for dark themes (>= 4.5:1 contrast on dark bg).

Returns:

Name Type Description
str str

Hex color string for graph label text.

Source code in src/gui/config/styling.py
def get_graph_font_color() -> str:
    """Pick a Pyvis label color that contrasts with the active theme.

    Black (``"#000000"``) on light themes, near-white (``"#ECEFF4"``) on
    dark themes; both meet >= 4.5:1 contrast on their background.

    Returns:
        str: Hex color string for graph label text.
    """
    return "#000000" if _is_streamlit_light_mode() else "#ECEFF4"

get_graph_node_colors()

Get node colors for agent graph from the active theme.

Alias for :func:get_theme_node_colors used by agent_graph.py.

Returns:

Type Description
tuple[str, str]

tuple[str, str]: (primaryColor, accentColor) from the active theme. primaryColor is used for agent nodes, accentColor for tool nodes.

Source code in src/gui/config/styling.py
def get_graph_node_colors() -> tuple[str, str]:
    """Node colors for the agent graph (alias kept for agent_graph.py).

    Delegates to :func:`get_theme_node_colors`.

    Returns:
        tuple[str, str]: ``(primaryColor, accentColor)`` from the active theme;
            agent nodes use the first color, tool nodes the second.
    """
    return get_theme_node_colors()

get_theme_bgcolor()

Get the background color from the active theme dict.

Reads backgroundColor from the active theme in :data:THEMES. Falls back to Streamlit’s theme.backgroundColor option, then to "#ffffff" as a last resort.

Returns:

Name Type Description
str str

Hex color string for the theme background.

Source code in src/gui/config/styling.py
def get_theme_bgcolor() -> str:
    """Return the hex background color for the active theme.

    Resolution order: the active theme's ``backgroundColor`` entry, then
    Streamlit's ``theme.backgroundColor`` option, then ``"#ffffff"``.

    Returns:
        str: Hex color string for the theme background.
    """
    theme_bg = get_active_theme().get("backgroundColor")
    if isinstance(theme_bg, str) and theme_bg.startswith("#"):
        return theme_bg

    # Reason: theme dict may lack backgroundColor — ask Streamlit next.
    option_bg = st.get_option("theme.backgroundColor")
    if isinstance(option_bg, str) and option_bg.startswith("#"):
        return option_bg

    # Last resort: plain white.
    return "#ffffff"

get_theme_node_colors()

Get node colors for agent graph from the active theme.

Returns:

Type Description
tuple[str, str]

tuple[str, str]: (primaryColor, accentColor) from the active theme. primaryColor is used for agent nodes, accentColor for tool nodes.

Source code in src/gui/config/styling.py
def get_theme_node_colors() -> tuple[str, str]:
    """Look up the agent/tool node colors in the active theme.

    Returns:
        tuple[str, str]: ``(primaryColor, accentColor)`` from the active theme;
            agent nodes use the first color, tool nodes the second.
    """
    palette = get_active_theme()
    return (palette["primaryColor"], palette["accentColor"])

is_light_theme(theme_name)

Check whether a theme name refers to a light theme.

Parameters:

Name Type Description Default
theme_name str

Name of the theme to check.

required

Returns:

Name Type Description
bool bool

True if the theme is a light theme, False otherwise.

Source code in src/gui/config/styling.py
def is_light_theme(theme_name: str) -> bool:
    """Tell whether *theme_name* refers to the light theme.

    Args:
        theme_name: Name of the theme to check.

    Returns:
        bool: True for the light theme, False otherwise.
    """
    is_light = theme_name == _LIGHT_THEME
    return is_light

gui.config.text

gui.pages.agent_graph

Streamlit page for Agent Graph visualization.

Renders NetworkX agent interaction graphs as interactive Pyvis visualizations. Displays agent-to-agent delegations and tool usage patterns with visual distinction between agent nodes and tool nodes.

Functions

render_agent_graph(graph=None, composite_result=None)

Render agent interaction graph as interactive Pyvis visualization.

Displays: - Agent nodes (distinguished visually from tool nodes) - Tool nodes - Interaction edges (delegations, tool calls) - Interactive pan/zoom/hover features

Parameters:

Name Type Description Default
graph DiGraph[str] | None

NetworkX DiGraph with agent and tool nodes, or None for empty state.

None
composite_result Any | None

Optional CompositeResult for mode-specific empty messages.

None
Source code in src/gui/pages/agent_graph.py
def render_agent_graph(
    graph: nx.DiGraph[str] | None = None,
    composite_result: Any | None = None,
) -> None:
    """Render agent interaction graph as interactive Pyvis visualization.

    Displays:
    - Agent nodes (distinguished visually from tool nodes)
    - Tool nodes
    - Interaction edges (delegations, tool calls)
    - Interactive pan/zoom/hover features
    - Graph statistics and an accessible text summary

    Args:
        graph: NetworkX DiGraph with agent and tool nodes, or None for empty state.
        composite_result: Optional CompositeResult for mode-specific empty messages.
    """
    st.header(AGENT_GRAPH_HEADER)

    if graph is None:
        st.info("No agent interaction data available. Run a query to see the graph here.")
        return

    if graph.number_of_nodes() == 0:
        # Mode-aware empty message: CC runs may legitimately produce no graph.
        engine_type = getattr(composite_result, "engine_type", "mas") if composite_result else "mas"
        st.info(_EMPTY_GRAPH_MESSAGES.get(engine_type, _EMPTY_GRAPH_DEFAULT))
        return

    st.subheader(AGENT_GRAPH_NETWORK_SUBHEADER)

    if Network is None:
        st.error("Pyvis library not installed. Install with: uv pip install pyvis")
        return

    # Create Pyvis network
    net = Network(
        height="600px",
        width="100%",
        directed=True,
        notebook=False,
        bgcolor=get_theme_bgcolor(),
        font_color=get_graph_font_color(),  # type: ignore[arg-type]
    )

    # Configure physics for better layout
    net.set_options(
        """
        {
            "physics": {
                "enabled": true,
                "barnesHut": {
                    "gravitationalConstant": -8000,
                    "centralGravity": 0.3,
                    "springLength": 95,
                    "springConstant": 0.04
                },
                "stabilization": {
                    "enabled": true,
                    "iterations": 200
                }
            },
            "nodes": {
                "font": {
                    "size": 14
                }
            },
            "edges": {
                "arrows": {
                    "to": {
                        "enabled": true,
                        "scaleFactor": 0.5
                    }
                },
                "smooth": {
                    "type": "continuous"
                }
            }
        }
        """
    )

    # Add nodes with visual distinction — colors from active theme
    agent_color, tool_color = get_graph_node_colors()
    for node in graph.nodes():
        node_data: dict[str, Any] = graph.nodes[node]  # type: ignore[assignment]
        node_type = node_data.get("type", "agent")
        label = node_data.get("label", str(node))

        if node_type == "agent":
            # Agent nodes: themed circles
            net.add_node(
                str(node),
                label=label,
                color=agent_color,
                shape="dot",
                size=25,
                title=f"Agent: {label}",
            )
        else:
            # Tool nodes: themed squares
            net.add_node(
                str(node),
                label=label,
                color=tool_color,
                shape="box",
                size=20,
                title=f"Tool: {label}",
            )

    # Add edges
    for source, target in graph.edges():
        edge_data: dict[str, Any] = graph.edges[source, target]  # type: ignore[assignment]
        interaction = edge_data.get("interaction", "interaction")
        net.add_edge(str(source), str(target), title=interaction)

    # Generate HTML via a named temp file (Pyvis writes to a path)
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".html", encoding="utf-8"
    ) as tmp_file:
        net.save_graph(tmp_file.name)
        tmp_path = Path(tmp_file.name)

    # Fix: guarantee temp-file cleanup even if reading fails — previously
    # the unlink only ran after rendering, leaking the file on exceptions.
    try:
        html_content = tmp_path.read_text(encoding="utf-8")
    finally:
        tmp_path.unlink(missing_ok=True)

    # AC-6: Insert <title> element into Pyvis HTML for screen readers
    html_content = html_content.replace("<head>", "<head><title>Agent Interaction Graph</title>", 1)

    # AC-7: Descriptive caption before the graph component
    st.caption(
        "Agent interaction graph showing agent and tool relationships. "
        "See statistics below for details."
    )
    # AC-8: scrolling=True to prevent keyboard trap
    components.html(html_content, height=620, scrolling=True)

    # Display graph statistics
    agent_nodes = sum(1 for n in graph.nodes() if graph.nodes[n].get("type") == "agent")
    tool_nodes = graph.number_of_nodes() - agent_nodes
    agent_names = [
        str(graph.nodes[n].get("label", n))
        for n in graph.nodes()
        if graph.nodes[n].get("type") == "agent"
    ]

    with st.expander("Graph Statistics"):
        st.text(f"Total Nodes: {graph.number_of_nodes()}")
        st.text(f"Total Edges: {graph.number_of_edges()}")
        st.text(f"Agent Nodes: {agent_nodes}")
        st.text(f"Tool Nodes: {tool_nodes}")

    # AC-1: Accessible text summary with node/edge counts and agent names
    st.markdown(
        f"**Graph summary:** {graph.number_of_nodes()} nodes, "
        f"{graph.number_of_edges()} edges. "
        f"Agents: {', '.join(agent_names)}."
    )

gui.pages.evaluation

Streamlit page for Evaluation Results visualization.

Displays three-tier evaluation results including traditional metrics (Tier 1), LLM-as-Judge scores (Tier 2), and graph analysis metrics (Tier 3). Provides comparative visualization of graph-based vs text-based metrics.

Classes

Functions

format_metric_label(metric_key)

Return a human-readable label for a metric key.

Falls back to title-casing the key when no explicit mapping exists.

Parameters:

Name Type Description Default
metric_key str

Snake-case metric name (e.g. “cosine_score”).

required

Returns:

Type Description
str

Human-readable label string (e.g. “Cosine Similarity”).

Source code in src/gui/pages/evaluation.py
def format_metric_label(metric_key: str) -> str:
    """Map a snake_case metric key to its display label.

    Unknown keys fall back to a title-cased version of the key itself.

    Args:
        metric_key: Snake-case metric name (e.g. "cosine_score").

    Returns:
        Human-readable label string (e.g. "Cosine Similarity").
    """
    fallback = metric_key.replace("_", " ").title()
    return METRIC_LABELS.get(metric_key, fallback)

render_baseline_comparison(comparisons)

Render baseline comparison section for Claude Code solo and teams.

Parameters:

Name Type Description Default
comparisons list[BaselineComparison] | None

List of BaselineComparison instances or None.

required
Source code in src/gui/pages/evaluation.py
def render_baseline_comparison(comparisons: list[BaselineComparison] | None) -> None:
    """Show baseline comparison results (Claude Code solo and teams).

    Args:
        comparisons: BaselineComparison instances; None or empty renders an
            informational placeholder instead.
    """
    if not comparisons:
        st.info(
            "No baseline comparisons available. "
            "Provide Claude Code artifact directories to compare."
        )
        return

    st.subheader("Baseline Comparisons")

    # A full set of exactly three comparisons also gets a combined table.
    if len(comparisons) == 3:
        _render_three_way_table(comparisons)

    # Then each comparison gets its own detailed section.
    for comparison in comparisons:
        _render_single_comparison(comparison)

render_evaluation(result=None)

Render evaluation results page with tier scores and metric comparisons.

Displays: - Overall composite score and recommendation - Individual tier scores (Tier 1, 2, 3) - Bar chart comparing graph metrics vs text metrics - Detailed metric breakdowns - Baseline comparisons (if available in session state)

Parameters:

Name Type Description Default
result CompositeResult | None

CompositeResult containing evaluation data, or None for empty state.

None
Source code in src/gui/pages/evaluation.py
def render_evaluation(result: CompositeResult | None = None) -> None:
    """Render the evaluation results page.

    Shows the overall composite score and recommendation, the per-tier
    scores (Tiers 1-3), a graph-vs-text metric comparison chart, baseline
    comparisons when present in session state, and a detailed breakdown.

    Args:
        result: CompositeResult with evaluation data; None renders the
            empty-state placeholder instead.
    """
    st.header(EVALUATION_HEADER)

    if result is None:
        _render_empty_state()
        return

    for section in (_render_overall_results, _render_tier_scores, _render_metrics_comparison):
        section(result)

    # Baseline comparisons only appear after a comparison run stored them.
    if "baseline_comparisons" in st.session_state:
        render_baseline_comparison(st.session_state["baseline_comparisons"])

    _render_evaluation_details(result)

gui.pages.home

gui.pages.prompts

Streamlit component for displaying agent system prompts.

This module provides a function to display prompt configurations for agent roles using a Streamlit-based UI. Loads prompts directly from ChatConfig without hardcoded fallbacks (DRY principle).

Classes

Functions

render_prompts(chat_config)

Render and edit the prompt configuration for agent roles in the Streamlit UI.

Loads prompts directly from ChatConfig.prompts without hardcoded fallbacks. Follows DRY principle with config_chat.json as single source of truth.

Source code in src/gui/pages/prompts.py
def render_prompts(chat_config: ChatConfig | BaseModel) -> None:
    """Display the agent role prompts from ChatConfig in the Streamlit UI.

    Prompts are loaded directly from ``ChatConfig.prompts`` without hardcoded
    fallbacks (DRY: config_chat.json is the single source of truth) and shown
    read-only via :func:`render_prompt_editor`.

    Args:
        chat_config: Validated ChatConfig instance; any other type is rejected
            with an on-page error message.
    """
    header(PROMPTS_HEADER)
    # S8-F8.1: prominent notice — prompts are read-only display
    info("Prompts are read-only. To modify prompts, edit config_chat.json directly.")

    if not isinstance(chat_config, ChatConfig):
        msg = invalid_type("ChatConfig", type(chat_config).__name__)
        logger.error(msg)
        error(msg)
        return None

    # Load prompts directly from ChatConfig - single source of truth
    prompts = chat_config.prompts

    if not prompts:
        info("No prompts configured. Add prompts to config_chat.json.")
        return None

    # Fix: removed the dead `updated_prompts` bookkeeping — the editor is
    # read-only and its return value was collected but never used or returned.
    for prompt_key, prompt_value in prompts.items():
        render_prompt_editor(prompt_key, prompt_value, height=200)

gui.pages.run_app

Streamlit interface for running the agentic system interactively.

This module defines the render_app function, which provides a Streamlit-based UI for users to select a provider, enter a query, and execute the main agent workflow. Results and errors are displayed in real time, supporting asynchronous execution.

Provider and sub-agent configuration are read from session state, allowing users to configure these settings on the Settings page before running queries.

Background execution support allows queries to continue running even when users navigate to other tabs, with results persisted in session state.

Input mode supports both free-form text queries and paper selection from downloaded PeerRead papers via a dropdown with abstract preview.

Classes

Functions

render_app(provider=None, chat_config_file=None) async

Render the main app interface for running agentic queries via Streamlit.

Displays input fields for provider and query, a button to trigger execution, and an area for output or error messages. Handles async invocation of the main agent workflow and logs any exceptions.

Provider and sub-agent configuration are read from session state (configured on the Settings page). Execution runs in background with results persisted to session state, allowing navigation across tabs without losing progress.

Engine selection (MAS or Claude Code) is per-run via a radio widget and stored in session state. When CC is selected, MAS-specific controls are disabled and CC availability is checked.

Source code in src/gui/pages/run_app.py
async def render_app(provider: str | None = None, chat_config_file: str | Path | None = None):
    """Render the main app interface for running agentic queries via Streamlit.

    Draws the engine selector, status panel, and query input; on submit,
    dispatches the query asynchronously and renders outputs, artifacts,
    debug logs, and the report section.

    Provider and sub-agent configuration come from session state (set on the
    Settings page). Execution runs in the background with results persisted
    to session state, so tab navigation does not lose progress.

    Engine selection (MAS or Claude Code) is per-run via a radio widget and
    stored in session state. When CC is selected, MAS-specific controls are
    disabled and CC availability is checked.

    Args:
        provider: Optional provider override passed to session config lookup.
        chat_config_file: Optional path to the chat configuration file.
    """
    header(RUN_APP_HEADER)
    _initialize_execution_state()

    # Probe for the Claude CLI once per session; reruns reuse the cached flag.
    st.session_state.setdefault("cc_available", shutil.which("claude") is not None)
    cc_available: bool = st.session_state.cc_available

    active_provider, with_researcher, with_analyst, with_synthesiser = (
        _get_session_config(provider)
    )
    token_limit: int | None = st.session_state.get("token_limit")

    engine, cc_teams = _render_engine_selector()
    _render_engine_status(
        engine,
        cc_available,
        active_provider,
        token_limit,
        with_researcher,
        with_analyst,
        with_synthesiser,
    )

    query, selected_paper_id = _render_query_input()

    if button(RUN_APP_BUTTON):
        has_input = bool(query or selected_paper_id)
        # The warning flag is the inverse of having any usable input.
        st.session_state.show_validation_warning = not has_input
        if has_input:
            await _handle_query_submission(
                query,
                selected_paper_id,
                active_provider,
                with_researcher,
                with_analyst,
                with_synthesiser,
                chat_config_file,
                token_limit,
                engine=engine,
                cc_teams=cc_teams,
            )

    if st.session_state.get("show_validation_warning"):
        warning(RUN_APP_QUERY_WARNING)

    subheader(OUTPUT_SUBHEADER)
    _display_execution_result(_get_execution_state())
    _render_artifact_summary_panel()
    _render_debug_log_panel()

    _render_report_section(st.session_state.get("execution_composite_result"))

gui.pages.settings

Streamlit settings UI for displaying and editing application settings.

This module provides a function to display and edit settings from pydantic-settings classes (CommonSettings and JudgeSettings). Settings are editable via the GUI and applied to the current session via st.session_state.

Also provides UI controls for chat provider selection and sub-agent configuration with session state persistence.

Classes

Functions

render_settings(common_settings, judge_settings)

Render application settings in the Streamlit UI.

Displays actual default values from CommonSettings and JudgeSettings pydantic-settings classes. Read-only display using Streamlit expanders to organize settings by category.

Also provides UI controls for chat provider selection and sub-agent configuration with session state persistence across page navigation.

Parameters:

Name Type Description Default
common_settings CommonSettings

CommonSettings instance with application-level configuration

required
judge_settings JudgeSettings

JudgeSettings instance with evaluation pipeline configuration

required
Source code in src/gui/pages/settings.py
def render_settings(common_settings: CommonSettings, judge_settings: JudgeSettings) -> None:
    """
    Render application settings in the Streamlit UI.

    Displays values from the CommonSettings and JudgeSettings
    pydantic-settings classes, organized by category in Streamlit expanders.
    The common settings and the judge tier/scoring/LLM/observability sections
    are editable; edits apply to the current session via st.session_state.

    Also provides UI controls for chat provider selection and sub-agent configuration
    with session state persistence across page navigation.

    Args:
        common_settings: CommonSettings instance with application-level configuration
        judge_settings: JudgeSettings instance with evaluation pipeline configuration
    """
    header(SETTINGS_HEADER)

    logger.info("Displaying actual settings from pydantic-settings classes")

    # Agent Configuration Section (chat provider + sub-agent toggles)
    _render_agent_configuration()

    # Common Settings Section (editable)
    _render_common_settings(common_settings)

    # Advanced Settings header before judge settings expanders
    st.subheader("Advanced Settings")

    # Judge Settings - Editable Sections
    _render_tier_configuration(judge_settings)
    _render_composite_scoring(judge_settings)
    _render_tier2_llm_judge(judge_settings)
    _render_observability_settings(judge_settings)

    # Reset to Defaults Button
    _render_reset_button()

gui.pages.trace_viewer

Streamlit page for browsing trace execution data.

Reads traces.db (SQLite) directly via the built-in sqlite3 module. Displays an executions overview table with drill-down to individual trace events for a selected execution.

Functions

render_trace_viewer()

Render the Trace Viewer page.

Displays: - Executions overview table from traces.db - Drill-down event table when an execution is selected

Source code in src/gui/pages/trace_viewer.py
def render_trace_viewer() -> None:
    """Render the Trace Viewer page.

    Shows an overview table of recorded executions from traces.db and,
    once an execution is selected, a drill-down table of its trace events.
    """
    st.header(TRACE_VIEWER_HEADER)

    trace_db = _get_db_path()
    if not trace_db.exists():
        st.info("No traces.db found. Run an evaluation first.")
        return

    rows = _query_executions(trace_db)
    if not rows:
        st.info("No executions recorded yet. Run an evaluation to populate traces.")
        return

    st.dataframe(rows, width="stretch")

    ids = [row["execution_id"] for row in rows]
    choice = st.selectbox("Select execution for details", ids)

    if choice:
        st.subheader(f"Events for {choice}")
        st.dataframe(_query_events(trace_db, str(choice)), width="stretch")

gui.utils.log_capture

Log capture utility for GUI debug panel.

This module provides a loguru sink that captures log entries from app.* modules during execution and stores them in memory for display in the Streamlit debug panel. Supports thread-safe incremental polling via get_new_logs_since() for real-time streaming.

Classes

LogCapture

Captures and formats log entries for the debug panel.

This class acts as a loguru sink that filters and stores log entries from app.* modules. It provides methods to retrieve, clear, and format logs for display in the Streamlit UI.

Thread safety: _buffer and _lock allow safe concurrent access from a worker thread (writes via add_log_entry) and the Streamlit render thread (reads via get_new_logs_since / get_logs).

Source code in src/gui/utils/log_capture.py
class LogCapture:
    """Loguru sink that buffers log entries from app.* modules.

    Filters incoming records so only those emitted by app.* modules are
    kept, stores them in an in-memory buffer, and offers helpers to read,
    clear, and render the buffer as HTML for the Streamlit debug panel.

    Thread safety: _buffer is only ever touched while holding _lock, so a
    worker thread (writing via add_log_entry) and the Streamlit render
    thread (reading via get_new_logs_since / get_logs) can run concurrently.
    """

    def __init__(self) -> None:
        """Create an empty buffer guarded by a lock."""
        self._buffer: list[dict[str, str]] = []
        self._lock = threading.Lock()
        self._handler_id: int | None = None

    def add_log_entry(self, timestamp: str, level: str, module: str, message: str) -> None:
        """Buffer one log entry, ignoring records from non-app.* modules.

        Args:
            timestamp: ISO format timestamp string
            level: Log level (INFO, WARNING, ERROR, etc.)
            module: Module name that generated the log
            message: Log message content
        """
        # Only app.* records are relevant to the debug panel.
        if not module.startswith("app."):
            return

        # Build the entry outside the lock; append under it.
        entry = {
            "timestamp": timestamp,
            "level": level,
            "module": module,
            "message": message,
        }
        with self._lock:
            self._buffer.append(entry)

    def get_new_logs_since(self, index: int) -> list[dict[str, str]]:
        """Return entries appended after the caller's last-seen index.

        Supports incremental polling: the caller remembers how many entries
        it has already rendered and receives only the tail on each poll.

        Args:
            index: Number of entries already seen (0 = return all entries)

        Returns:
            List of new log entry dictionaries since index
        """
        with self._lock:
            # Slicing copies, so callers never hold a live view of _buffer.
            return self._buffer[index:]

    def log_count(self) -> int:
        """Return the current number of buffered log entries.

        Returns:
            Number of entries in the buffer
        """
        with self._lock:
            return len(self._buffer)

    def get_logs(self) -> list[dict[str, str]]:
        """Retrieve a snapshot copy of all captured log entries.

        Returns:
            List of log entry dictionaries
        """
        with self._lock:
            return self._buffer.copy()

    def clear(self) -> None:
        """Drop all buffered entries."""
        with self._lock:
            del self._buffer[:]

    def format_html(self) -> str:
        """Render the current buffer as HTML with color-coded levels.

        Returns:
            HTML string with styled log entries
        """
        return self.format_logs_as_html(self.get_logs())

    @staticmethod
    def format_logs_as_html(logs: list[dict[str, str]]) -> str:
        """Render a list of log entries as an HTML fragment.

        Args:
            logs: List of log entry dictionaries

        Returns:
            HTML string with one styled <div> per entry, or a placeholder
            paragraph when no logs were captured
        """
        if not logs:
            return "<p>No logs captured.</p>"

        # INFO has no color override and falls back to the default grey.
        palette = {
            "WARNING": "#DAA520",  # Yellow (goldenrod)
            "ERROR": "#F44336",  # Red
            "DEBUG": "#2196F3",  # Blue
            "CRITICAL": "#9C27B0",  # Purple
        }
        # S8-F8.1: WCAG 1.4.1 — text badges prevent color-only log level identification
        badges = {
            "WARNING": "[WARN]",
            "ERROR": "[ERR]",
            "DEBUG": "[DBG]",
            "CRITICAL": "[CRIT]",
            "INFO": "[INFO]",
        }

        def _render(entry: dict[str, str]) -> str:
            # One <div> per entry: timestamp, badge, module, then message.
            lvl = entry["level"]
            return (
                f'<div style="margin-bottom: 8px;">'
                f'<span style="color: #666;">{entry["timestamp"]}</span> '
                # S8-F8.1: WCAG 1.4.1 — text badge + color (not color alone)
                f'<span style="color: {palette.get(lvl, "#666666")}; font-weight: bold;">'
                f'{badges.get(lvl, f"[{lvl}]")}</span> '
                # S8-F8.1: WCAG 1.4.3 — #696969 contrast ratio 5.9:1 (passes AA)
                f'<span style="color: #696969;">{entry["module"]}</span> '
                f'<span style="color: inherit;">{entry["message"]}</span>'
                f"</div>"
            )

        body = "".join(_render(e) for e in logs)
        return f'<section role="log" aria-label="Debug logs">{body}</section>'

    def _sink_handler(self, message: Any) -> None:
        """Loguru sink callback: translate a record into a buffered entry.

        Args:
            message: Loguru message record
        """
        rec = message.record
        self.add_log_entry(
            rec["time"].strftime("%Y-%m-%d %H:%M:%S"),
            rec["level"].name,
            rec.get("name", "unknown"),
            rec["message"],
        )

    def attach_to_logger(self) -> int:
        """Register this capture instance as a loguru sink.

        Returns:
            Handler ID for later removal
        """
        # format="{message}" keeps the raw message; metadata comes from the record.
        handler = logger.add(self._sink_handler, format="{message}")
        self._handler_id = handler
        return handler

    def detach_from_logger(self, handler_id: int) -> None:
        """Remove a previously attached loguru sink.

        Args:
            handler_id: Handler ID returned by attach_to_logger
        """
        logger.remove(handler_id)
        self._handler_id = None
Functions
__init__()

Initialize empty log buffer with thread lock.

Source code in src/gui/utils/log_capture.py
def __init__(self) -> None:
    """Initialize empty log buffer with thread lock."""
    self._buffer: list[dict[str, str]] = []  # captured entries, guarded by _lock
    self._lock = threading.Lock()  # serializes buffer access across threads
    self._handler_id: int | None = None  # loguru handler id once attached
add_log_entry(timestamp, level, module, message)

Add a log entry to the buffer if it’s from an app.* module.

Parameters:

Name Type Description Default
timestamp str

ISO format timestamp string

required
level str

Log level (INFO, WARNING, ERROR, etc.)

required
module str

Module name that generated the log

required
message str

Log message content

required
Source code in src/gui/utils/log_capture.py
def add_log_entry(self, timestamp: str, level: str, module: str, message: str) -> None:
    """Add a log entry to the buffer if it's from an app.* module.

    Args:
        timestamp: ISO format timestamp string
        level: Log level (INFO, WARNING, ERROR, etc.)
        module: Module name that generated the log
        message: Log message content
    """
    # Filter: only capture logs from app.* modules
    if not module.startswith("app."):
        return

    # Lock guards against concurrent reads from the render thread.
    with self._lock:
        self._buffer.append(
            {
                "timestamp": timestamp,
                "level": level,
                "module": module,
                "message": message,
            }
        )
attach_to_logger()

Attach this capture instance as a loguru sink.

Returns:

Type Description
int

Handler ID for later removal

Source code in src/gui/utils/log_capture.py
def attach_to_logger(self) -> int:
    """Attach this capture instance as a loguru sink.

    Returns:
        Handler ID for later removal
    """
    # format="{message}" keeps the raw message; metadata comes from the record.
    self._handler_id = logger.add(self._sink_handler, format="{message}")
    return self._handler_id
clear()

Clear the log buffer.

Source code in src/gui/utils/log_capture.py
def clear(self) -> None:
    """Clear the log buffer."""
    with self._lock:  # avoid racing a concurrent append
        self._buffer.clear()
detach_from_logger(handler_id)

Detach this capture instance from loguru.

Parameters:

Name Type Description Default
handler_id int

Handler ID returned by attach_to_logger

required
Source code in src/gui/utils/log_capture.py
def detach_from_logger(self, handler_id: int) -> None:
    """Detach this capture instance from loguru.

    Args:
        handler_id: Handler ID returned by attach_to_logger
    """
    logger.remove(handler_id)
    self._handler_id = None  # mark this instance as no longer attached
format_html()

Format log entries as HTML with color-coded levels.

Returns:

Type Description
str

HTML string with styled log entries

Source code in src/gui/utils/log_capture.py
def format_html(self) -> str:
    """Format log entries as HTML with color-coded levels.

    Returns:
        HTML string with styled log entries
    """
    # Delegates to the static formatter over a thread-safe snapshot.
    return self.format_logs_as_html(self.get_logs())
format_logs_as_html(logs) staticmethod

Format a list of log entries as HTML with color-coded levels.

Parameters:

Name Type Description Default
logs list[dict[str, str]]

List of log entry dictionaries

required

Returns:

Type Description
str

HTML string with styled log entries

Source code in src/gui/utils/log_capture.py
@staticmethod
def format_logs_as_html(logs: list[dict[str, str]]) -> str:
    """Format a list of log entries as HTML with color-coded levels.

    Args:
        logs: List of log entry dictionaries

    Returns:
        HTML string with styled log entries
    """
    if not logs:
        return "<p>No logs captured.</p>"

    html_parts: list[str] = []
    level_colors = {
        "WARNING": "#DAA520",  # Yellow (goldenrod)
        "ERROR": "#F44336",  # Red
        "DEBUG": "#2196F3",  # Blue
        "CRITICAL": "#9C27B0",  # Purple
    }
    # S8-F8.1: WCAG 1.4.1 — text badges prevent color-only log level identification
    level_badges = {
        "WARNING": "[WARN]",
        "ERROR": "[ERR]",
        "DEBUG": "[DBG]",
        "CRITICAL": "[CRIT]",
        "INFO": "[INFO]",
    }

    for entry in logs:
        level = entry["level"]
        color = level_colors.get(level, "#666666")
        badge = level_badges.get(level, f"[{level}]")
        html_parts.append(
            f'<div style="margin-bottom: 8px;">'
            f'<span style="color: #666;">{entry["timestamp"]}</span> '
            # S8-F8.1: WCAG 1.4.1 — text badge + color (not color alone)
            f'<span style="color: {color}; font-weight: bold;">{badge}</span> '
            # S8-F8.1: WCAG 1.4.3 — #696969 contrast ratio 5.9:1 (passes AA)
            f'<span style="color: #696969;">{entry["module"]}</span> '
            f'<span style="color: inherit;">{entry["message"]}</span>'
            f"</div>"
        )

    return f'<section role="log" aria-label="Debug logs">{"".join(html_parts)}</section>'
get_logs()

Retrieve all captured log entries.

Returns:

Type Description
list[dict[str, str]]

List of log entry dictionaries

Source code in src/gui/utils/log_capture.py
def get_logs(self) -> list[dict[str, str]]:
    """Retrieve all captured log entries.

    Returns:
        List of log entry dictionaries
    """
    with self._lock:
        return list(self._buffer)  # copy so callers cannot mutate the buffer
get_new_logs_since(index)

Return log entries added since the given index (for incremental polling).

The caller tracks the last-seen index and passes it on each poll. Only entries at positions >= index are returned, allowing a Streamlit fragment or polling loop to render only new content on each re-run.

Parameters:

Name Type Description Default
index int

Number of entries already seen (0 = return all entries)

required

Returns:

Type Description
list[dict[str, str]]

List of new log entry dictionaries since index

Source code in src/gui/utils/log_capture.py
def get_new_logs_since(self, index: int) -> list[dict[str, str]]:
    """Return log entries added since the given index (for incremental polling).

    The caller tracks the last-seen index and passes it on each poll.
    Only entries at positions >= index are returned, allowing a Streamlit
    fragment or polling loop to render only new content on each re-run.

    Args:
        index: Number of entries already seen (0 = return all entries)

    Returns:
        List of new log entry dictionaries since index
    """
    with self._lock:
        # Slice copies the tail, so callers never see later mutations.
        return list(self._buffer[index:])
log_count()

Return the current number of buffered log entries.

Returns:

Type Description
int

Number of entries in the buffer

Source code in src/gui/utils/log_capture.py
def log_count(self) -> int:
    """Return the current number of buffered log entries.

    Returns:
        Number of entries in the buffer
    """
    with self._lock:  # length read under lock for a consistent count
        return len(self._buffer)

run_cli

Lightweight CLI wrapper for the Agents-eval application.

This wrapper handles help and basic argument parsing quickly without loading heavy dependencies. It only imports the main application when actual processing is needed.

Functions

cli_main()

Run the CLI application entry point.

Parses arguments, selects the execution engine, runs the pipeline, and logs the artifact summary.

Source code in src/run_cli.py
def cli_main() -> None:
    """Run the CLI application entry point.

    Parses arguments, selects the execution engine, runs the pipeline,
    and logs the artifact summary.
    """
    import sys

    args = parse_args(argv[1:])
    engine = args.pop("engine")
    # `or False` normalizes explicit None values (unset flags) to False.
    cc_teams = args.pop("cc_teams", False) or False
    generate_report_flag = args.pop("generate_report", False) or False
    no_llm_suggestions = args.pop("no_llm_suggestions", False) or False

    # Reason: main() expects a JudgeSettings object, not raw provider/model strings.
    # Mirrors SweepRunner._build_judge_settings() logic.
    judge_provider = args.pop("judge_provider", None)
    judge_model = args.pop("judge_model", None)
    judge_settings = None
    if judge_provider or judge_model:
        from app.judge.evaluation_pipeline import JudgeSettings

        kwargs: dict[str, Any] = {}
        if judge_provider:
            kwargs["tier2_provider"] = judge_provider
        if judge_model:
            kwargs["tier2_model"] = judge_model
        judge_settings = JudgeSettings(**kwargs)
    args["judge_settings"] = judge_settings

    if engine == "cc" and not shutil.which("claude"):
        print(
            "error: --engine=cc requires the 'claude' CLI to be installed and on PATH",
            file=sys.stderr,
        )
        # Fix: use sys.exit() instead of the bare `exit()` builtin — `exit`
        # is injected by the `site` module and is not guaranteed to exist
        # (e.g. under `python -S` or frozen builds); sys is imported above.
        sys.exit(1)

    from asyncio import run

    from app.app import main
    from app.utils.artifact_registry import get_artifact_registry
    from app.utils.log import logger

    logger.info(f"Used arguments: {args}")

    # The Claude Code engine runs up front; its result is threaded into
    # main() for unified reporting.
    cc_result_obj = _run_cc_engine(args, cc_teams) if engine == "cc" else None

    try:
        result_dict = run(main(**args, engine=engine, cc_result=cc_result_obj, cc_teams=cc_teams))
        if generate_report_flag and result_dict:
            _maybe_generate_report(result_dict, no_llm_suggestions)
    finally:
        # Artifact summary is logged even when the pipeline raises.
        logger.info(get_artifact_registry().format_summary_block())

parse_args(argv)

Parse command line arguments into a dictionary.

Parameters:

Name Type Description Default
argv list[str]

List of CLI argument strings (without the program name).

required

Returns:

Type Description
dict[str, Any]

Dictionary of explicitly-provided arguments (plus engine default).

Example

parse_args(["--chat-provider", "ollama", "--include-researcher"])

Source code in src/run_cli.py
def parse_args(argv: list[str]) -> dict[str, Any]:
    """Parse command line arguments into a dictionary.

    Args:
        argv: List of CLI argument strings (without the program name).

    Returns:
        Dictionary of explicitly-provided arguments (plus engine default).

    Example:
        >>> parse_args(["--chat-provider", "ollama", "--include-researcher"])
        {'chat_provider': 'ollama', 'include_researcher': True, 'engine': 'mas'}
    """
    namespace = _parser.parse_args(argv)
    # Drop unset options so downstream code falls back to its own defaults.
    return {name: value for name, value in vars(namespace).items() if value is not None}

run_gui

This module sets up and runs a Streamlit application for a Multi-Agent System.

The application uses a sidebar tab layout with five navigation sections: - Run Research App: execution controls (provider, engine, paper, query, run button) - Settings: configuration options for provider and sub-agents - Evaluation Results: evaluation results and baseline comparison - Agent Graph: visual representation of agent interactions - Trace Viewer: SQLite browser for traces.db execution data

The main function loads the configuration, renders the UI components, and handles the execution of the Multi-Agent System based on user input.

Functions: - main(): Main function to set up and run the Streamlit application.

Classes

Functions

get_session_state_defaults()

Get default values for session state.

Returns:

Type Description
dict[str, str | bool]

Dict with default provider and sub-agent configuration flags

Source code in src/run_gui.py
def get_session_state_defaults() -> dict[str, str | bool]:
    """
    Get default values for session state.

    Returns:
        Dict with default provider and sub-agent configuration flags
    """
    # All optional sub-agents start disabled; provider uses the app default.
    defaults: dict[str, str | bool] = {"chat_provider": CHAT_DEFAULT_PROVIDER}
    for flag in ("include_researcher", "include_analyst", "include_synthesiser"):
        defaults[flag] = False
    return defaults

initialize_session_state()

Initialize session state with default values if not already set.

Uses st.session_state to persist user selections across page navigation.

Source code in src/run_gui.py
def initialize_session_state() -> None:
    """
    Initialize session state with default values if not already set.

    Uses st.session_state to persist user selections across page navigation.
    """
    # Only fill in missing keys so existing user selections survive reruns.
    for key, default in get_session_state_defaults().items():
        if key not in st.session_state:
            st.session_state[key] = default

run_sweep

CLI entry point for MAS composition sweep.

Run automated benchmarking across multiple agent compositions with statistical analysis of results.

Classes

Functions

main()

Synchronous main entry point.

Returns:

Name Type Description
int int

Exit code (0 for success, 1 for error).

Source code in src/run_sweep.py
def main() -> int:
    """Synchronous wrapper around the async sweep entry point.

    Returns:
        int: Exit code (0 for success, 1 for error).
    """
    exit_code: int = asyncio.run(main_async())
    return exit_code

main_async() async

Async main entry point.

Returns:

Name Type Description
int int

Exit code (0 for success, 1 for error).

Source code in src/run_sweep.py
async def main_async() -> int:
    """Async entry point for the sweep CLI.

    Returns:
        int: Exit code (0 for success, 1 for error).
    """
    args = parse_args()

    try:
        # A config file, when given, takes precedence over individual flags.
        if args.config:
            config = _load_config_from_file(args.config)
        else:
            config = _build_config_from_args(args)

        if config is None:
            return 1

        # Run sweep
        logger.info(f"Starting sweep with {len(config.compositions)} compositions")
        logger.info(f"Provider: {config.chat_provider}")
        logger.info(f"Papers: {config.paper_ids}")
        logger.info(f"Repetitions: {config.repetitions}")
        logger.info(f"Output: {config.output_dir}")

        results = await run_sweep(config)
        total = len(results)
        plural = "s" if total != 1 else ""
        print(f"\nSweep complete: {total} evaluation{plural} succeeded.")

        logger.info(f"Sweep completed with {total} total evaluations")
        logger.info(f"Results saved to {config.output_dir}")

        # Log artifact summary at end of sweep (AC7)
        from app.utils.artifact_registry import get_artifact_registry

        logger.info(get_artifact_registry().format_summary_block())

        return 0

    except Exception as e:
        # CLI boundary: report the failure and convert it to an exit code.
        logger.error(f"Sweep failed with {type(e).__name__}: {e}", exc_info=True)
        return 1

parse_args()

Parse command line arguments.

Returns:

Type Description
Namespace

argparse.Namespace: Parsed arguments.

Source code in src/run_sweep.py
def parse_args() -> argparse.Namespace:
    """Parse command line arguments.

    Returns:
        argparse.Namespace: Parsed arguments.
    """
    parser = argparse.ArgumentParser(
        description="Run MAS composition sweep with configurable parameters"
    )

    # Config file option
    parser.add_argument(
        "--config",
        type=Path,
        help="Path to sweep configuration JSON file",
    )

    # Individual parameter options (override config file)
    parser.add_argument(
        "--paper-ids",
        type=str,
        help="Comma-separated list of paper IDs (e.g., '1,2,3' or '1105.1072')",
    )
    parser.add_argument(
        "--repetitions",
        type=int,
        default=3,
        help="Number of repetitions per composition (default: 3)",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        help="Output directory for results (default: results/sweeps/<timestamp>)",
    )
    # NOTE(review): help text says all 8 compositions are already the default,
    # so this flag appears informational — confirm in the config builder.
    parser.add_argument(
        "--all-compositions",
        action="store_true",
        help="Use all 2^3=8 agent compositions (default)",
    )
    parser.add_argument(
        "--chat-provider",
        type=str,
        choices=list(PROVIDER_REGISTRY.keys()),
        default=CHAT_DEFAULT_PROVIDER,
        help=f"LLM provider to use for MAS agents (default: {CHAT_DEFAULT_PROVIDER})",
    )
    # "auto" means the judge inherits --chat-provider (per help text below).
    parser.add_argument(
        "--judge-provider",
        type=str,
        default="auto",
        help="LLM provider for Tier 2 judge (default: auto, inherits --chat-provider)",
    )
    parser.add_argument(
        "--judge-model",
        type=str,
        default=None,
        help="LLM model for Tier 2 judge (default: inherits chat model when auto)",
    )
    parser.add_argument(
        "--engine",
        type=str,
        choices=["mas", "cc"],
        default="mas",
        help="Execution engine: 'mas' for MAS pipeline (default), 'cc' for Claude Code headless",
    )
    # NOTE(review): only meaningful with --engine=cc — presumably validated
    # downstream; confirm.
    parser.add_argument(
        "--cc-teams",
        action="store_true",
        default=False,
        help="Use Claude Code Agent Teams mode (requires --engine=cc)",
    )

    return parser.parse_args()