├── .claude ├── commands │ ├── execute-prp.md │ └── generate-prp.md └── settings.local.json ├── .devcontainer ├── setup_dev │ └── devcontainer.json ├── setup_dev_claude │ └── devcontainer.json └── setup_dev_ollama │ └── devcontainer.json ├── .env.example ├── .github ├── dependabot.yaml ├── scripts │ ├── create_pr.sh │ └── delete_branch_pr_tag.sh └── workflows │ ├── bump-my-version.yaml │ ├── codeql.yaml │ ├── generate-deploy-mkdocs-ghpages.yaml │ ├── links-fail-fast.yaml │ ├── pytest.yaml │ ├── ruff.yaml │ ├── summarize-jobs-reusable.yaml │ └── write-llms-txt.yaml ├── .gitignore ├── .gitmessage ├── .streamlit └── config.toml ├── .vscode ├── extensions.json └── settings.json ├── AGENTS.md ├── CHANGELOG.md ├── CLAUDE.md ├── Dockerfile ├── LICENSE.md ├── Makefile ├── README.md ├── assets └── images │ ├── c4-multi-agent-system.png │ ├── customer-journey-activity-dark.png │ ├── customer-journey-activity-light.png │ └── metrics-eval-sweep.png ├── context └── PRPs │ ├── coordination_quality.md │ ├── features │ ├── coordination_quality.md │ └── tool_efficiency.md │ ├── templates │ ├── feature_base.md │ └── prp_base.md │ └── tool_efficiency.md ├── docs ├── PRD.md ├── SprintPlan.md ├── UserStory.md ├── architecture │ ├── c4-multi-agent-system.plantuml │ ├── customer-journey-activity-dark │ ├── customer-journey-activity-light.plantuml │ └── metrics-eval-sweep.plantuml └── llms.txt ├── mkdocs.yaml ├── pyproject.toml ├── src ├── app │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── agent_system.py │ │ └── llm_model_funs.py │ ├── config │ │ ├── __init__.py │ │ ├── config_app.py │ │ ├── config_chat.json │ │ ├── config_eval.json │ │ └── data_models.py │ ├── evals │ │ ├── __init__.py │ │ └── metrics.py │ ├── main.py │ ├── py.typed │ └── utils │ │ ├── __init__.py │ │ ├── error_messages.py │ │ ├── load_configs.py │ │ ├── load_settings.py │ │ ├── log.py │ │ ├── login.py │ │ └── utils.py ├── examples │ ├── config.json │ ├── run_simple_agent_no_tools.py │ ├── run_simple_agent_system.py │ ├── run_simple_agent_tools.py │ └── utils │ │ ├── agent_simple_no_tools.py │ │ ├── agent_simple_system.py │ │ ├── agent_simple_tools.py │ │ ├── data_models.py │ │ ├── tools.py │ │ └── utils.py ├── gui │ ├── components │ │ ├── footer.py │ │ ├── header.py │ │ ├── output.py │ │ ├── prompts.py │ │ └── sidebar.py │ ├── config │ │ ├── config.py │ │ ├── styling.py │ │ └── text.py │ └── pages │ │ ├── home.py │ │ ├── prompts.py │ │ ├── run_app.py │ │ └── settings.py └── run_gui.py ├── tests ├── test_agent_system.py ├── test_env.py ├── test_metrics_output_similarity.py ├── test_metrics_time_taken.py └── test_provider_config.py └── uv.lock /.claude/commands/execute-prp.md: -------------------------------------------------------------------------------- 1 | # Execute Product Requirements Prompt (PRP) 2 | 3 | Implement a feature using using the PRP file. 4 | 5 | ## PRP File: $ARGUMENTS 6 | 7 | ## Execution Process 8 | 9 | 1. **Load PRP** 10 | - Read the specified PRP file 11 | - Understand all context and requirements 12 | - Follow all instructions in the PRP and extend the research if needed 13 | - Ensure you have all needed context to implement the PRP fully 14 | - Do more web searches and codebase exploration as needed 15 | 16 | 2. **ULTRATHINK** 17 | - Think hard before you execute the plan. Create a comprehensive plan addressing all requirements. 18 | - Break down complex tasks into smaller, manageable steps using your todos tools. 19 | - Use the TodoWrite tool to create and track your implementation plan. 
20 | - Identify implementation patterns from existing code to follow. 21 | 22 | 3. **Execute the plan** 23 | - Execute the PRP 24 | - Implement all the code 25 | 26 | 4. **Validate** 27 | - Run each validation command 28 | - Fix any failures 29 | - Re-run until all pass 30 | 31 | 5. **Complete** 32 | - Ensure all checklist items done 33 | - Run final validation suite 34 | - Report completion status 35 | - Read the PRP again to ensure you have implemented everything 36 | 37 | 6. **Reference the PRP** 38 | - You can always reference the PRP again if needed 39 | 40 | Note: If validation fails, use error patterns in the PRP to fix and retry. 41 | -------------------------------------------------------------------------------- /.claude/commands/generate-prp.md: -------------------------------------------------------------------------------- 1 | # Create Product Requirements Prompt (PRP) 2 | 3 | ## Feature file: $ARGUMENTS 4 | 5 | Generate a complete PRP (Product Requirements Prompt) for general feature implementation with thorough research. Ensure context is passed to the AI agent to enable self-validation and iterative refinement. Read the feature file first to understand what needs to be created, how the examples provided help, and any other considerations. 6 | 7 | The AI agent only gets the context you are appending to the PRP and training data. Assume the AI agent has access to the codebase and the same knowledge cutoff as you, so it's important that your research findings are included or referenced in the PRP. The agent has web-search capabilities, so pass URLs to documentation and examples. 8 | 9 | - Use `/context/PRPs` as `$base_path` 10 | - Extract only the filename from `$ARGUMENTS` into `$file_name` 11 | 12 | ## Research Process 13 | 14 | 1. **Codebase Analysis** 15 | - Search for similar features/patterns in the codebase 16 | - Identify files to reference in PRP 17 | - Note existing conventions to follow 18 | - Check test patterns for validation approach 19 | 20 | 2. **External Research** 21 | - Search for similar features/patterns online 22 | - Library documentation (include specific URLs) 23 | - Implementation examples (GitHub/StackOverflow/blogs) 24 | - Best practices and common pitfalls 25 | 26 | 3. **User Clarification** (if needed) 27 | - Specific patterns to mirror and where to find them? 28 | - Integration requirements and where to find them? 
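For reference, the repository's Makefile wraps this command (and its counterpart `execute-prp`) behind make targets, so a typical invocation with one of the feature files under `context/PRPs/features/` looks roughly like this:

```bash
# Generate a PRP from a feature definition (resolved against /context/PRPs/features)
make prp_gen_claude ARGS="tool_efficiency.md"

# Execute the generated PRP afterwards
make prp_exe_claude ARGS="tool_efficiency.md"
```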
29 | 30 | ## PRP Generation 31 | 32 | - Use `${base_path}/templates/prp_base.md` as the template 33 | 34 | ### Critical Context to Include and Pass to the AI Agent as Part of the PRP 35 | 36 | - **Documentation**: URLs with specific sections 37 | - **Code Examples**: Real snippets from codebase 38 | - **Gotchas**: Library quirks, version issues 39 | - **Patterns**: Existing approaches to follow 40 | 41 | ### Implementation Blueprint 42 | 43 | - Start with pseudocode showing approach 44 | - Reference real files for patterns 45 | - Include error handling strategy 46 | - List the tasks required to fulfill the PRP, in the order they should be completed 47 | 48 | ### Validation Gates (Must Be Executable), e.g. for Python 49 | 50 | ```bash 51 | # Syntax/Style 52 | make ruff 53 | make type_check 54 | 55 | # Unit Tests 56 | make coverage_all 57 | ``` 58 | 59 | ***CRITICAL: AFTER YOU ARE DONE RESEARCHING AND EXPLORING THE CODEBASE AND BEFORE YOU START WRITING THE PRP*** 60 | 61 | ***ULTRATHINK ABOUT THE PRP AND PLAN YOUR APPROACH, THEN START WRITING THE PRP*** 62 | 63 | ## Output 64 | 65 | - Save the result to `${base_path}/${file_name}` 66 | 67 | ## Quality Checklist 68 | 69 | - [ ] All necessary context included 70 | - [ ] Validation gates are executable by AI 71 | - [ ] References existing patterns 72 | - [ ] Clear implementation path 73 | - [ ] Error handling documented 74 | 75 | Score the PRP on a scale of 1-10 (confidence level that Claude Code will succeed in a one-pass implementation). 76 | 77 | Remember: The goal is one-pass implementation success through comprehensive context. 78 | -------------------------------------------------------------------------------- /.claude/settings.local.json: -------------------------------------------------------------------------------- 1 | { 2 | "permissions": { 3 | "allow": [ 4 | "Bash(cat:*)", 5 | "Bash(find:*)", 6 | "Bash(git:diff*)", 7 | "Bash(git:status*)", 8 | "Bash(grep:*)", 9 | "Bash(ls:*)", 10 | "Bash(mkdir:*)", 11 | "Bash(source:*)", 12 | "Bash(touch:*)", 13 | "Bash(tree:*)", 14 | "Bash(uv run:*)", 15 | "Edit(AGENTS.md)", 16 | "Edit(docs/**/*.md)", 17 | "Edit(src/**/*.py)", 18 | "Edit(src/**/*.json)", 19 | "Edit(tests/**/*.py)", 20 | "WebFetch(domain:docs.anthropic.com)" 21 | ], 22 | "deny": [ 23 | "Bash(mv:*)", 24 | "Bash(rm:*)" 25 | ] 26 | } 27 | } -------------------------------------------------------------------------------- /.devcontainer/setup_dev/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "make setup_dev", 3 | "image": "mcr.microsoft.com/vscode/devcontainers/python:3.13", 4 | "postCreateCommand": "make setup_dev" 5 | } -------------------------------------------------------------------------------- /.devcontainer/setup_dev_claude/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "make setup_dev_claude", 3 | "image": "mcr.microsoft.com/vscode/devcontainers/python:3.13", 4 | "features": { 5 | "ghcr.io/devcontainers/features/node:1": {} 6 | }, 7 | "customizations": { 8 | "vscode": { 9 | "extensions": [ 10 | "anthropic.claude-code" 11 | ] 12 | } 13 | }, 14 | "postCreateCommand": "make setup_dev_claude" 15 | } -------------------------------------------------------------------------------- /.devcontainer/setup_dev_ollama/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "make setup_dev_ollama", 3 | "image": 
"mcr.microsoft.com/vscode/devcontainers/python:3.13", 4 | "postCreateCommand": "make setup_dev_ollama" 5 | } -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # inference EP 2 | ANTHROPIC_API_KEY="sk-abc-xyz" 3 | GEMINI_API_KEY="xyz" 4 | GITHUB_API_KEY="ghp_xyz" 5 | GROK_API_KEY="xai-xyz" 6 | HUGGINGFACE_API_KEY="hf_xyz" 7 | OPENROUTER_API_KEY="sk-or-v1-xyz" 8 | PERPLEXITY_API_KEY="" 9 | RESTACK_API_KEY="xyz" 10 | TOGETHER_API_KEY="xyz" 11 | 12 | # tools 13 | TAVILY_API_KEY="" 14 | 15 | # log/mon/trace 16 | AGENTOPS_API_KEY="x-y-z-x-y" 17 | LOGFIRE_API_KEY="pylf_v1_xx_y" # LOGFIRE_TOKEN 18 | WANDB_API_KEY="xyz" 19 | 20 | # eval 21 | -------------------------------------------------------------------------------- /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | version: 2 4 | updates: 5 | - package-ecosystem: "pip" 6 | directory: "/" 7 | schedule: 8 | interval: "weekly" 9 | ... 10 | -------------------------------------------------------------------------------- /.github/scripts/create_pr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 1 base ref, 2 target ref, 3 title suffix 3 | # 4 current version, 5 bumped 4 | 5 | pr_title="PR $2 $3" 6 | pr_body="PR automatically created from \`$1\` to bump from \`$4\` to \`$5\` on \`$2\`. Tag \`v$5\` will be created and has to be deleted manually if PR gets closed without merge." 7 | 8 | gh pr create \ 9 | --base $1 \ 10 | --head $2 \ 11 | --title "${pr_title}" \ 12 | --body "${pr_body}" 13 | # --label "bump" 14 | -------------------------------------------------------------------------------- /.github/scripts/delete_branch_pr_tag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 1 repo, 2 target ref, 3 current version 3 | 4 | tag_to_delete="v$3" 5 | branch_del_api_call="repos/$1/git/refs/heads/$2" 6 | del_msg="'$2' force deletion attempted." 7 | close_msg="Closing PR '$2' to rollback after failure" 8 | 9 | echo "Tag $tag_to_delete for $del_msg" 10 | git tag -d "$tag_to_delete" 11 | echo "PR for $del_msg" 12 | gh pr close "$2" --comment "$close_msg" 13 | echo "Branch $del_msg" 14 | gh api "$branch_del_api_call" -X DELETE && \ 15 | echo "Branch without error return deleted." -------------------------------------------------------------------------------- /.github/workflows/bump-my-version.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: bump-my-version 3 | 4 | on: 5 | # pull_request: 6 | # types: [closed] 7 | # branches: [main] 8 | workflow_dispatch: 9 | inputs: 10 | bump_type: 11 | description: '[major|minor|patch]' 12 | required: true 13 | default: 'patch' 14 | type: choice 15 | options: 16 | - 'major' 17 | - 'minor' 18 | - 'patch' 19 | 20 | env: 21 | BRANCH_NEW: "bump-${{ github.run_number }}-${{ github.ref_name }}" 22 | SKIP_PR_HINT: "[skip ci bump]" 23 | SCRIPT_PATH: ".github/scripts" 24 | 25 | jobs: 26 | bump_my_version: 27 | # TODO bug? currently resulting in: Unrecognized named-value: 'env'. 
28 | # https://stackoverflow.com/questions/61238849/github-actions-if-contains-function-not-working-with-env-variable/61240761 29 | # if: !contains( 30 | # github.event.pull_request.title, 31 | # ${{ env.SKIP_PR_HINT }} 32 | # ) 33 | # TODO check for PR closed by bot to avoid PR creation loop 34 | # github.actor != 'github-actions' 35 | if: > 36 | github.event_name == 'workflow_dispatch' || 37 | ( github.event.pull_request.merged == true && 38 | github.event.pull_request.closed_by != 'github-actions' ) 39 | runs-on: ubuntu-latest 40 | outputs: 41 | branch_new: ${{ steps.create_branch.outputs.branch_new }} 42 | summary_data: ${{ steps.set_summary.outputs.summary_data }} 43 | permissions: 44 | actions: read 45 | checks: write 46 | contents: write 47 | pull-requests: write 48 | steps: 49 | 50 | - name: Checkout repo 51 | uses: actions/checkout@v4 52 | with: 53 | fetch-depth: 1 54 | 55 | - name: Set git cfg and create branch 56 | id: create_branch 57 | run: | 58 | git config user.email "bumped@qte77.gha" 59 | git config user.name "bump-my-version" 60 | git checkout -b "${{ env.BRANCH_NEW }}" 61 | echo "branch_new=${{ env.BRANCH_NEW }}" >> $GITHUB_OUTPUT 62 | 63 | - name: Bump version 64 | id: bump 65 | uses: callowayproject/bump-my-version@0.29.0 66 | env: 67 | BUMPVERSION_TAG: "true" 68 | with: 69 | args: ${{ inputs.bump_type }} 70 | branch: ${{ env.BRANCH_NEW }} 71 | 72 | - name: "Create PR '${{ env.BRANCH_NEW }}'" 73 | if: steps.bump.outputs.bumped == 'true' 74 | env: 75 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 76 | run: | 77 | src="${{ env.SCRIPT_PATH }}/create_pr.sh" 78 | chmod +x "$src" 79 | $src "${{ github.ref_name }}" "${{ env.BRANCH_NEW }}" "${{ env.SKIP_PR_HINT }}" "${{ steps.bump.outputs.previous-version }}" "${{ steps.bump.outputs.current-version }}" 80 | 81 | - name: Delete branch, PR and tag in case of failure or cancel 82 | if: failure() || cancelled() 83 | env: 84 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 85 | run: | 86 | src="${{ env.SCRIPT_PATH }}/delete_branch_pr_tag.sh" 87 | chmod +x "$src" 88 | $src "${{ github.repository }}" "${{ env.BRANCH_NEW }}" "${{ steps.bump.outputs.current-version }}" 89 | 90 | - name: Set summary data 91 | id: set_summary 92 | if: ${{ always() }} 93 | run: echo "summary_data=${GITHUB_STEP_SUMMARY}" >> $GITHUB_OUTPUT 94 | 95 | generate_summary: 96 | name: Generate Summary Report 97 | if: ${{ always() }} 98 | needs: bump_my_version 99 | uses: ./.github/workflows/summarize-jobs-reusable.yaml 100 | with: 101 | branch_to_summarize: ${{ needs.bump_my_version.outputs.branch_new }} 102 | summary_data: ${{ needs.bump_my_version.outputs.summary_data }} 103 | ... 
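Because the pull-request trigger above is commented out, this workflow is normally started by hand. Assuming an authenticated GitHub CLI, a manual dispatch might look like this:

```bash
# Trigger a patch-level bump via workflow_dispatch
gh workflow run bump-my-version.yaml -f bump_type=patch

# Follow the most recent run until it finishes
gh run watch
```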
104 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # https://github.blog/changelog/2023-01-18-code-scanning-codeql-action-v1-is-now-deprecated/ 3 | name: "CodeQL" 4 | 5 | on: 6 | push: 7 | pull_request: 8 | types: [closed] 9 | branches: [ main ] 10 | schedule: 11 | - cron: '27 11 * * 0' 12 | workflow_dispatch: 13 | 14 | jobs: 15 | analyze: 16 | name: Analyze 17 | runs-on: ubuntu-latest 18 | permissions: 19 | actions: read 20 | contents: read 21 | security-events: write 22 | 23 | steps: 24 | - name: Checkout repository 25 | uses: actions/checkout@v4 26 | 27 | - name: Initialize CodeQL 28 | uses: github/codeql-action/init@v3 29 | with: 30 | languages: python 31 | 32 | - name: Autobuild 33 | uses: github/codeql-action/autobuild@v3 34 | # if autobuild fails 35 | #- run: | 36 | # make bootstrap 37 | # make release 38 | 39 | - name: Perform CodeQL Analysis 40 | uses: github/codeql-action/analyze@v3 41 | #- name: sarif 42 | # uses: github/codeql-action/upload-sarif@v2 43 | ... 44 | -------------------------------------------------------------------------------- /.github/workflows/generate-deploy-mkdocs-ghpages.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Deploy Docs 3 | 4 | on: 5 | pull_request: 6 | types: [closed] 7 | branches: [main] 8 | workflow_dispatch: 9 | 10 | env: 11 | DOCSTRINGS_FILE: "docstrings.md" 12 | DOC_DIR: "docs" 13 | SRC_DIR: "src" 14 | SITE_DIR: "site" 15 | IMG_DIR: "assets/images" 16 | 17 | jobs: 18 | build-and-deploy: 19 | runs-on: ubuntu-latest 20 | permissions: 21 | contents: read 22 | pages: write 23 | id-token: write 24 | environment: 25 | name: github-pages 26 | steps: 27 | 28 | - name: Checkout the repository 29 | uses: actions/checkout@v4.0.0 30 | with: 31 | ref: 32 | ${{ 33 | github.event.pull_request.merged == true && 34 | 'main' || 35 | github.ref_name 36 | }} 37 | fetch-depth: 0 38 | 39 | - uses: actions/configure-pages@v5.0.0 40 | 41 | # caching instead of actions/cache@v4.0.0 42 | # https://docs.astral.sh/uv/guides/integration/github/#caching 43 | - name: Install uv with cache dependency glob 44 | uses: astral-sh/setup-uv@v5.0.0 45 | with: 46 | enable-cache: true 47 | cache-dependency-glob: "uv.lock" 48 | 49 | # setup python from pyproject.toml using uv 50 | # instead of using actions/setup-python@v5.0.0 51 | # https://docs.astral.sh/uv/guides/integration/github/#setting-up-python 52 | - name: "Set up Python" 53 | run: uv python install 54 | 55 | - name: Install only doc deps 56 | run: uv sync --only-group docs # --frozen 57 | 58 | - name: Get repo info and stream into mkdocs.yaml 59 | id: repo_info 60 | run: | 61 | REPO_INFO=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ 62 | -H "Accept: application/vnd.github.v3+json" \ 63 | https://api.github.com/repos/${{ github.repository }}) 64 | REPO_URL="${{ github.server_url }}/${{ github.repository }}" 65 | REPO_URL=$(echo ${REPO_URL} | sed 's|/|\\/|g') 66 | SITE_NAME=$(sed '1!d' README.md | sed '0,/# /{s/# //}') 67 | SITE_DESC=$(echo $REPO_INFO | jq -r .description) 68 | sed -i "s//${REPO_URL}/g" mkdocs.yaml 69 | sed -i "s//${SITE_NAME}/g" mkdocs.yaml 70 | sed -i "s//${SITE_DESC}/g" mkdocs.yaml 71 | 72 | - name: Copy text files to be included 73 | run: | 74 | CFG_PATH="src/app/config" 75 | mkdir -p "${DOC_DIR}/${CFG_PATH}" 76 | cp README.md "${DOC_DIR}/index.md" 77 | cp 
{CHANGELOG,LICENSE}.md "${DOC_DIR}" 78 | # Auxiliary files 79 | cp .env.example "${DOC_DIR}" 80 | cp "${CFG_PATH}/config_chat.json" "${DOC_DIR}/${CFG_PATH}" 81 | 82 | - name: Generate code docstrings concat file 83 | run: | 84 | PREFIX="::: " 85 | find "${SRC_DIR}" -type f -name "*.py" \ 86 | -type f -not -name "__*__*" -printf "%P\n" | \ 87 | sed 's/\//./g' | sed 's/\.py$//' | \ 88 | sed "s/^/${PREFIX}/" | sort > \ 89 | "${DOC_DIR}/${DOCSTRINGS_FILE}" 90 | 91 | - name: Build documentation 92 | run: uv run --locked --only-group docs mkdocs build 93 | 94 | - name: Copy image files to be included 95 | run: | 96 | # copy images, mkdocs does not by default 97 | # mkdocs also overwrites pre-made directories 98 | dir="${{ env.SITE_DIR }}/${{ env.IMG_DIR }}" 99 | if [ -d "${{ env.IMG_DIR }}" ]; then 100 | mkdir -p "${dir}" 101 | cp "${{ env.IMG_DIR }}"/* "${dir}" 102 | fi 103 | 104 | # - name: Push to gh-pages 105 | # run: uv run mkdocs gh-deploy --force 106 | 107 | - name: Upload artifact 108 | uses: actions/upload-pages-artifact@v3.0.0 109 | with: 110 | path: "${{ env.SITE_DIR }}" 111 | 112 | - name: Deploy to GitHub Pages 113 | id: deployment 114 | uses: actions/deploy-pages@v4.0.0 115 | ... 116 | -------------------------------------------------------------------------------- /.github/workflows/links-fail-fast.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # https://github.com/lycheeverse/lychee-action 3 | # https://github.com/marketplace/actions/lychee-broken-link-checker 4 | name: "Link Checker" 5 | 6 | on: 7 | workflow_dispatch: 8 | push: 9 | branches-ignore: [main] 10 | pull_request: 11 | types: [closed] 12 | branches: [main] 13 | schedule: 14 | - cron: "00 00 * * 0" 15 | 16 | jobs: 17 | linkChecker: 18 | runs-on: ubuntu-latest 19 | permissions: 20 | issues: write 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Link Checker 26 | id: lychee 27 | uses: lycheeverse/lychee-action@v2 28 | 29 | - name: Create Issue From File 30 | if: steps.lychee.outputs.exit_code != 0 31 | uses: peter-evans/create-issue-from-file@v5 32 | with: 33 | title: lychee Link Checker Report 34 | content-filepath: ./lychee/out.md 35 | labels: report, automated issue 36 | ... 37 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yaml: -------------------------------------------------------------------------------- 1 | name: pytest 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | test: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v4 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: '3.12' 17 | 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install pytest 22 | 23 | - name: Run tests 24 | run: pytest 25 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # https://github.com/astral-sh/ruff-action 3 | # https://github.com/astral-sh/ruff 4 | name: ruff 5 | on: 6 | push: 7 | pull_request: 8 | types: [closed] 9 | branches: [main] 10 | schedule: 11 | - cron: "0 0 * * 0" 12 | workflow_dispatch: 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | - uses: astral-sh/ruff-action@v3 19 | ... 
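The same checks can be run locally before pushing; the `make ruff` recipe further down in the Makefile is roughly equivalent to:

```bash
uv run ruff format
uv run ruff check --fix
```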
20 | -------------------------------------------------------------------------------- /.github/workflows/summarize-jobs-reusable.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # https://ecanarys.com/supercharging-github-actions-with-job-summaries-and-pull-request-comments/ 3 | # FIXME currently bug in gha summaries ? $GITHUB_STEP_SUMMARY files are empty 4 | # https://github.com/orgs/community/discussions/110283 5 | # https://github.com/orgs/community/discussions/67991 6 | # Possible workaround 7 | # echo ${{ fromJSON(step).name }}" >> $GITHUB_STEP_SUMMARY 8 | # echo ${{ fromJSON(step).outcome }}" >> $GITHUB_STEP_SUMMARY 9 | # echo ${{ fromJSON(step).conclusion }}" 10 | 11 | name: Summarize workflow jobs 12 | 13 | on: 14 | workflow_call: 15 | outputs: 16 | summary: 17 | description: "Outputs summaries of jobs in a workflow" 18 | value: ${{ jobs.generate_summary.outputs.summary }} 19 | inputs: 20 | branch_to_summarize: 21 | required: false 22 | default: 'main' 23 | type: string 24 | summary_data: 25 | required: false 26 | type: string 27 | 28 | jobs: 29 | generate_summary: 30 | name: Generate Summary 31 | runs-on: ubuntu-latest 32 | permissions: 33 | contents: read 34 | actions: read 35 | checks: read 36 | pull-requests: none 37 | outputs: 38 | summary: ${{ steps.add_changed_files.outputs.summary }} 39 | steps: 40 | 41 | - name: Add general information 42 | id: general_info 43 | run: | 44 | echo "# Job Summaries" >> $GITHUB_STEP_SUMMARY 45 | echo "Job: `${{ github.job }}`" >> $GITHUB_STEP_SUMMARY 46 | echo "Date: $(date +'%Y-%m-%d %H:%M:%S')" >> $GITHUB_STEP_SUMMARY 47 | 48 | - name: Add step states 49 | id: step_states 50 | run: | 51 | echo "### Steps:" >> $GITHUB_STEP_SUMMARY 52 | # loop summary_data if valid json 53 | if jq -e . >/dev/null 2>&1 <<< "${{ inputs.summary_data }}"; then 54 | jq -r ' 55 | .steps[] 56 | | select(.conclusion != null) 57 | | "- **\(.name)**: \( 58 | if .conclusion == "success" then ":white_check_mark:" 59 | elif .conclusion == "failure" then ":x:" 60 | else ":warning:" end 61 | )" 62 | ' <<< "${{ inputs.summary_data }}" >> $GITHUB_STEP_SUMMARY 63 | else 64 | echo "Invalid JSON in summary data." >> $GITHUB_STEP_SUMMARY 65 | fi 66 | 67 | - name: Checkout repo 68 | uses: actions/checkout@v4 69 | with: 70 | ref: "${{ inputs.branch_to_summarize }}" 71 | fetch-depth: 0 72 | 73 | - name: Add changed files since last push 74 | id: add_changed_files 75 | run: | 76 | # Get the tags 77 | # Use disabled lines to get last two commits 78 | # current=$(git show -s --format=%ci HEAD) 79 | # previous=$(git show -s --format=%ci HEAD~1) 80 | # git diff --name-only HEAD^ HEAD >> $GITHUB_STEP_SUMMARY 81 | version_tag_regex="^v[0-9]+\.[0-9]+\.[0-9]+$" # v0.0.0 82 | tags=$(git tag --sort=-version:refname | \ 83 | grep -E "${version_tag_regex}" || echo "") 84 | 85 | # Get latest and previous tags 86 | latest_tag=$(echo "${tags}" | head -n 1) 87 | previous_tag=$(echo "${tags}" | head -n 2 | tail -n 1) 88 | 89 | echo "tags: latest '${latest_tag}', previous '${previous_tag}'" 90 | 91 | # Write to summary 92 | error_msg="No files to output. 
Tag not found:" 93 | echo ${{ steps.step_states.outputs.summary }} >> $GITHUB_STEP_SUMMARY 94 | echo "## Changed files on '${{ inputs.branch_to_summarize }}'" >> $GITHUB_STEP_SUMMARY 95 | 96 | if [ -z "${latest_tag}" ]; then 97 | echo "${error_msg} latest" >> $GITHUB_STEP_SUMMARY 98 | elif [ -z "${previous_tag}" ]; then 99 | echo "${error_msg} previous" >> $GITHUB_STEP_SUMMARY 100 | elif [ "${latest_tag}" == "${previous_tag}" ]; then 101 | echo "Latest and previous tags are the same: '${latest_tag}'" >> $GITHUB_STEP_SUMMARY 102 | else 103 | # Get commit dates and hashes 104 | latest_date=$(git log -1 --format=%ci $latest_tag) 105 | previous_date=$(git log -1 --format=%ci $previous_tag) 106 | current_hash=$(git rev-parse --short $latest_tag) 107 | previous_hash=$(git rev-parse --short $previous_tag) 108 | 109 | # Append summary to the job summary 110 | echo "Latest Tag Commit: '${latest_tag}' (${current_hash}) ${latest_date}" >> $GITHUB_STEP_SUMMARY 111 | echo "Previous Tag Commit: '${previous_tag}' (${previous_hash}) ${previous_date}" >> $GITHUB_STEP_SUMMARY 112 | echo "Files changed:" >> $GITHUB_STEP_SUMMARY 113 | echo '```' >> $GITHUB_STEP_SUMMARY 114 | git diff --name-only $previous_tag..$latest_tag >> $GITHUB_STEP_SUMMARY 115 | echo '```' >> $GITHUB_STEP_SUMMARY 116 | fi 117 | 118 | - name: Output error message in case of failure or cancel 119 | if: failure() || cancelled() 120 | run: | 121 | if [ "${{ job.status }}" == "cancelled" ]; then 122 | out_msg="## Workflow was cancelled" 123 | else 124 | out_msg="## Error in previous step" 125 | fi 126 | echo $out_msg >> $GITHUB_STEP_SUMMARY 127 | ... -------------------------------------------------------------------------------- /.github/workflows/write-llms-txt.yaml: -------------------------------------------------------------------------------- 1 | # TODO use local installation of repo to text 2 | # https://github.com/itsitgroup/repo2txt 3 | name: Write repo llms.txt 4 | 5 | on: 6 | push: 7 | branches: [main] 8 | workflow_dispatch: 9 | inputs: 10 | LLMS_TXT_PATH: 11 | description: 'Path to the directory to save llsm.txt' 12 | required: true 13 | default: 'docs' 14 | type: string 15 | LLMS_TXT_NAME: 16 | description: 'Path to the directory to save llsm.txt' 17 | required: true 18 | default: 'llms.txt' 19 | type: string 20 | CONVERTER_URL: 21 | description: '[uithub|gittodoc]' # |repo2txt 22 | required: true 23 | default: 'uithub.com' 24 | type: choice 25 | options: 26 | - 'uithub.com' 27 | - 'gittodoc.com' 28 | # - 'repo2txt.com' 29 | 30 | jobs: 31 | generate-file: 32 | runs-on: ubuntu-latest 33 | 34 | steps: 35 | - name: Checkout repo 36 | uses: actions/checkout@v4 37 | 38 | - name: Construct and create llms.txt path 39 | id: construct_and_create_llms_txt_path 40 | run: | 41 | LLMS_TXT_PATH="${{ inputs.LLMS_TXT_PATH }}" 42 | LLMS_TXT_PATH="${LLMS_TXT_PATH:-docs}" 43 | LLMS_TXT_NAME="${{ inputs.LLMS_TXT_NAME }}" 44 | LLMS_TXT_NAME="${LLMS_TXT_NAME:-llms.txt}" 45 | echo "LLMS_TXT_FULL=${LLMS_TXT_PATH}/${LLMS_TXT_NAME}" >> $GITHUB_OUTPUT 46 | mkdir -p "${LLMS_TXT_PATH}" 47 | 48 | - name: Fetch TXT from URL 49 | run: | 50 | LLMS_TXT_FULL=${{ steps.construct_and_create_llms_txt_path.outputs.LLMS_TXT_FULL }} 51 | URL="https://${{ inputs.CONVERTER_URL }}/${{ github.repository }}" 52 | echo "Fetching content from: ${URL}" 53 | echo "Saving content to: ${LLMS_TXT_FULL}" 54 | curl -s "${URL}" > "${LLMS_TXT_FULL}" 55 | 56 | - name: Commit and push file 57 | run: | 58 | LLMS_TXT_FULL=${{ steps.construct_and_create_llms_txt_path.outputs.LLMS_TXT_FULL 
}} 59 | commit_msg="feat(docs): Add/Update ${LLMS_TXT_FULL}, a flattened repo as single text file, inspired by [llmstxt.org](https://llmstxt.org/)." 60 | git config user.name "github-actions" 61 | git config user.email "github-actions@github.com" 62 | git add "${LLMS_TXT_FULL}" 63 | git commit -m "${commit_msg}" 64 | git push 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python bytecode 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # environment 6 | .venv/ 7 | *.env 8 | unset_env.sh 9 | 10 | # Distribution / packaging 11 | build/ 12 | dist/ 13 | *.egg-info/ 14 | 15 | # Testing 16 | .pytest_cache/ 17 | .coverage 18 | 19 | # Logs 20 | *.log 21 | /logs 22 | 23 | # Traces 24 | scalene-profiles 25 | profile.html 26 | profile.json 27 | 28 | # OS generated files 29 | .DS_Store 30 | Thumbs.db 31 | 32 | # IDE specific files (adjust as needed) 33 | # .vscode/ 34 | # .idea/ 35 | 36 | # mkdocs 37 | reference/ 38 | site/ 39 | 40 | # linting 41 | .ruff_cache 42 | 43 | # type checking 44 | .mypy_cache/ 45 | 46 | # project specific 47 | wandb/ 48 | -------------------------------------------------------------------------------- /.gitmessage: -------------------------------------------------------------------------------- 1 | #<--- 72 characters ---------------------------------------------------> 2 | # 3 | # Conventional Commits, semantic commit messages for humans and machines 4 | # https://www.conventionalcommits.org/en/v1.0.0/ 5 | # Lint your conventional commits 6 | # https://github.com/conventional-changelog/commitlint/tree/master/%40 \ 7 | # commitlint/config-conventional 8 | # Common types can be (based on Angular convention) 9 | # build, chore, ci, docs, feat, fix, perf, refactor, revert, style, test 10 | # https://github.com/conventional-changelog/commitlint/tree/master/%40 11 | # Footer 12 | # https://git-scm.com/docs/git-interpret-trailers 13 | # 14 | #<--- pattern ---------------------------------------------------------> 15 | # 16 | # [(Scope)][!]: \ 17 | # 18 | # short description: [()]: 19 | # 20 | # ! 
after scope in header indicates breaking change 21 | # 22 | # [optional body] 23 | # 24 | # - with bullets points 25 | # 26 | # [optional footer(s)] 27 | # 28 | # [BREAKING CHANGE:, Refs:, Resolves:, Addresses:, Reviewed by:] 29 | # 30 | #<--- usage -----------------------------------------------------------> 31 | # 32 | # Set locally (in the repository) 33 | # `git config commit.template .gitmessage` 34 | # 35 | # Set globally 36 | # `git config --global commit.template .gitmessage` 37 | # 38 | #<--- 72 characters ---------------------------------------------------> -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | primaryColor="#f92aad" 3 | backgroundColor="#0b0c10" 4 | secondaryBackgroundColor="#1f2833" 5 | textColor="#66fcf1" 6 | font="monospace" 7 | 8 | [server] 9 | # enableCORS = false 10 | enableXsrfProtection = true 11 | 12 | [browser] 13 | gatherUsageStats = false 14 | 15 | [client] 16 | # toolbarMode = "minimal" 17 | showErrorDetails = true 18 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "charliermarsh.ruff", 4 | "davidanson.vscode-markdownlint", 5 | "donjayamanne.githistory", 6 | "editorconfig.editorconfig", 7 | "gruntfuggly.todo-tree", 8 | "mhutchie.git-graph", 9 | "PKief.material-icon-theme", 10 | "redhat.vscode-yaml", 11 | "tamasfe.even-better-toml", 12 | "yzhang.markdown-all-in-one", 13 | 14 | "github.copilot", 15 | "github.copilot-chat", 16 | "github.vscode-github-actions", 17 | "ms-azuretools.vscode-docker", 18 | "ms-python.debugpy", 19 | "ms-python.python", 20 | "ms-python.vscode-pylance", 21 | "ms-vscode.makefile-tools", 22 | ] 23 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.lineNumbers": "on", 3 | "editor.wordWrap": "on", 4 | "explorer.confirmDelete": true, 5 | "files.autoSave": "onFocusChange", 6 | "git.autofetch": true, 7 | "git.enableSmartCommit": true, 8 | "makefile.configureOnOpen": false, 9 | "markdownlint.config": { 10 | "MD024": false, 11 | "MD033": false 12 | }, 13 | "python.analysis.extraPaths": ["./venv/lib/python3.13/site-packages"], 14 | "python.defaultInterpreterPath": "./.venv/bin/python", 15 | "python.analysis.typeCheckingMode": "strict", 16 | "python.analysis.diagnosticSeverityOverrides": { 17 | "reportMissingTypeStubs": "none", 18 | "reportUnknownMemberType": "none", 19 | "reportUnknownVariableType": "none" 20 | }, 21 | "redhat.telemetry.enabled": false 22 | } -------------------------------------------------------------------------------- /AGENTS.md: -------------------------------------------------------------------------------- 1 | # Agent instructions for `Agents-eval` repository 2 | 3 | As proposed by [agentsmd.net](https://agentsmd.net/) and used by [wandb weave AGENTS.md](https://github.com/wandb/weave/blob/master/AGENTS.md). 4 | 5 | ## Core Rules & AI Behavior 6 | 7 | * When you learn something new about the codebase or introduce a new concept, **update this file (`AGENTS.md`)** to reflect the new knowledge. This is YOUR FILE! It should grow and evolve with you. 
8 | * If something doesn't make sense architecturally, from a developer experience standpoint, or product-wise, please add it to the **`Requests to Humans`** section below. 9 | * Always follow the established coding patterns, conventions, and architectural decisions documented here and in the `docs/` directory. 10 | * **Never assume missing context.** Ask questions if you are uncertain about requirements or implementation details. 11 | * **Never hallucinate libraries or functions.** Only use known, verified Python packages listed in `pyproject.toml`. 12 | * **Always confirm file paths and module names** exist before referencing them in code or tests. 13 | * **Never delete or overwrite existing code** unless explicitly instructed to or as part of a documented refactoring task. 14 | 15 | ## Architecture Overview 16 | 17 | This is a multi-agent evaluation system for assessing agentic AI systems. The project uses **PydanticAI** as the core framework for agent orchestration and is designed for evaluation purposes, not for production agent deployment. 18 | 19 | ### Data Flow 20 | 21 | 1. User input → Manager Agent 22 | 2. Manager delegates to Researcher Agent (with DuckDuckGo search) 23 | 3. Researcher results → Analyst Agent for validation 24 | 4. Validated data → Synthesizer Agent for report generation 25 | 5. Results evaluated using configurable metrics 26 | 27 | ### Key Dependencies 28 | 29 | * **PydanticAI**: Agent framework and orchestration 30 | * **uv**: Fast Python dependency management 31 | * **Streamlit**: GUI framework 32 | * **Ruff**: Code formatting and linting 33 | * **MyPy**: Static type checking 34 | 35 | ## Codebase Structure & Modularity 36 | 37 | ### Main Components 38 | 39 | * `src/app/`: The core application logic. This is where most of your work will be. 40 | * `main.py`: The main entry point for the CLI application. 41 | * `agents/agent_system.py`: Defines the multi-agent system, their interactions, and orchestration. **This is the central logic for agent behavior.** 42 | * `config/data_models.py`: Contains all **Pydantic** models that define the data contracts. This is a critical file for understanding data flow. 43 | * `config/config_chat.json`: Holds provider settings and system prompts for agents. 44 | * `config/config_eval.json`: Defines evaluation metrics and their weights. 45 | * `evals/metrics.py`: Implements the evaluation metrics. 46 | * `src/gui/`: Contains the source code for the Streamlit GUI. 47 | * `docs/`: Contains project documentation, including the Product Requirements Document (`PRD.md`) and the C4 architecture model. 48 | * `tests/`: Contains all tests for the project, written using **pytest**. 49 | 50 | ### Code Organization Rules 51 | 52 | * **Never create a file longer than 500 lines of code.** If a file approaches this limit, refactor by splitting it into smaller, more focused modules or helper files. 53 | * Organize code into clearly separated modules grouped by feature. 54 | * Use clear, consistent, and absolute imports within packages. 55 | 56 | ## Development Commands & Environment 57 | 58 | ### Environment Setup 59 | 60 | The project requirements are stated in `pyproject.toml`. Your development environment should be set up automatically using the provided `Makefile`, which configures the virtual environment. 61 | 62 | * `make setup_dev`: Install all dev dependencies. 63 | * `make setup_dev_claude`: Setup dev environment with Claude Code CLI. 64 | * `make setup_dev_ollama`: Setup dev environment with Ollama local LLM. 
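To make the data flow from the Architecture Overview above concrete, the sketch below shows the delegation chain with PydanticAI. It is a minimal illustration, not the project's actual implementation: the model string, prompts, and wiring are assumptions, and the real orchestration (including the manager agent, usage limits, and retries) lives in `src/app/agents/agent_system.py` with settings from `config_chat.json`.

```python
from pydantic import BaseModel
from pydantic_ai import Agent


class ResearchResult(BaseModel):
    """Illustrative data contract; the real models live in src/app/config/data_models.py."""

    summary: str
    sources: list[str]


# Illustrative agents: the real researcher also carries a DuckDuckGo search
# tool, and providers/prompts come from src/app/config/config_chat.json.
researcher = Agent(
    "openai:gpt-4o",  # assumed provider/model string, not the project default
    output_type=ResearchResult,  # `result_type` on older PydanticAI releases
    system_prompt="Research the query and cite sources.",
)
analyst = Agent(
    "openai:gpt-4o",
    system_prompt="Validate the research findings for accuracy.",
)
synthesizer = Agent(
    "openai:gpt-4o",
    system_prompt="Write a concise report from the validated findings.",
)


def run_pipeline(query: str) -> str:
    """Manager-style orchestration: research -> analysis -> synthesis."""
    research = researcher.run_sync(query)  # `.output` is `.data` on older releases
    analysis = analyst.run_sync(f"Validate these findings: {research.output}")
    report = synthesizer.run_sync(f"Summarize for the user: {analysis.output}")
    return report.output
```

A real run additionally needs a provider API key from `.env` (see `.env.example`).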
65 | 66 | ### Running the Application 67 | 68 | * `make run_cli`: Run the CLI application. 69 | * `make run_cli ARGS="--help"`: Run CLI with specific arguments. 70 | * `make run_gui`: Run the Streamlit GUI. 71 | 72 | ### Testing and Code Quality 73 | 74 | * `make test_all`: Run all tests with pytest. 75 | * `make coverage_all`: Run tests and generate a coverage report. 76 | * `make ruff`: Format code and fix linting issues with Ruff. 77 | * `make type_check`: Run mypy static type checking on `src/app/`. 78 | 79 | ## Testing & Reliability 80 | 81 | * **Always create Pytest unit tests** for new features (functions, classes, etc.). 82 | * Tests must live in the `tests/` folder, mirroring the `src/app` structure. 83 | * After updating any logic, check whether existing unit tests need to be updated. If so, do it. 84 | * For each new feature, include at least: 85 | * 1 test for the expected use case (happy path). 86 | * 1 test for a known edge case. 87 | * 1 test for an expected failure case (e.g., invalid input). 88 | * **To run a specific test file or function, use `uv run pytest` directly:** 89 | * `uv run pytest tests/test_specific_file.py` 90 | * `uv run pytest tests/test_specific_file.py::test_function` 91 | 92 | ## Style, Patterns & Documentation 93 | 94 | ### Coding Style 95 | 96 | * **Use Pydantic** models in `src/app/config/data_models.py` for all data validation and data contracts. **Always use or update these models** when modifying data flows. 97 | * Use the predefined error message functions from `src/app/utils/error_messages.py` for consistency. 98 | * When writing complex logic, **add an inline `# Reason:` comment** explaining the *why*, not just the *what*. 99 | * Comment non-obvious code to ensure it is understandable to a mid-level developer. 100 | 101 | ### Documentation 102 | 103 | * Write **docstrings for every function, class, and method** using the Google style format. This is critical as the documentation site is built automatically from docstrings. 104 | 105 | ```python 106 | def example_function(param1: int) -> str: 107 | """A brief summary of the function. 108 | 109 | Args: 110 | param1 (int): A description of the first parameter. 111 | 112 | Returns: 113 | str: A description of the return value. 114 | """ 115 | return "example" 116 | ``` 117 | 118 | * Update this `AGENTS.md` file when introducing new patterns or concepts. 119 | * Document significant architectural decisions in `docs/ADR.md`. 120 | * Document all significant changes, features, and bug fixes in `docs/CHANGELOG.md`. 121 | 122 | ## Code Review & PR Guidelines 123 | 124 | ### PR Requirements 125 | 126 | * **Title Format**: Commit messages and PR titles must follow the **Conventional Commits** specification, as outlined in the `.gitmessage` template. 127 | * Provide detailed PR summaries including the purpose of the changes and the testing performed. 128 | 129 | ### Pre-commit Checklist 130 | 131 | 1. Run the linter and formatter: `make ruff`. 132 | 2. Ensure all tests pass: `make test_all`. 133 | 3. Ensure static type checks pass: `make type_check`. 134 | 4. Update documentation as described below. 135 | 136 | ## Requests to Humans 137 | 138 | This section contains a list of questions, clarifications, or tasks that AI agents wish to have humans complete or elaborate on. 139 | 140 | * [ ] The `agent_system.py` module has a `NotImplementedError` for streaming with Pydantic model outputs. Please clarify the intended approach for streaming structured data. 
141 | * [ ] The `llm_model_funs.py` module has `NotImplementedError` for the Gemini and HuggingFace providers. Please provide the correct implementation or remove them if they are not supported. 142 | * [ ] The `agent_system.py` module contains a `FIXME` note regarding the use of a try-catch context manager. Please review and implement the intended error handling. 143 | * [ ] Add TypeScript testing guidelines (if a TypeScript frontend is planned for the future). 144 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## Guiding Principles 9 | 10 | - Changelogs are for humans, not machines. 11 | - There should be an entry for every single version. 12 | - The same types of changes should be grouped. 13 | - Versions and sections should be linkable. 14 | - The latest version comes first. 15 | - The release date of each version is displayed. 16 | - Mention whether you follow Semantic Versioning. 17 | 18 | ## Types of changes 19 | 20 | - `Added` for new features. 21 | - `Changed` for changes in existing functionality. 22 | - `Deprecated` for soon-to-be removed features. 23 | - `Removed` for now removed features. 24 | - `Fixed` for any bug fixes. 25 | - `Security` in case of vulnerabilities. 26 | 27 | ## [Unreleased] 28 | 29 | ### Added 30 | 31 | - Claude code functionality, commands and settings 32 | 33 | ## [1.1.0] - 2025-07-05 34 | 35 | ### Added 36 | 37 | - Makefile command and devcontainer.json for claude code usage 38 | 39 | ### Changed 40 | 41 | - Moved streamlit_gui and examples to /src 42 | - Moved app to /src/app 43 | 44 | ## [1.0.0] - 2025-03-18 45 | 46 | ### 2025-03-18 47 | 48 | - refactor(agent,streamlit): Convert main and run_manager functions again to async for streamli output 49 | - fix(prompts): Update system prompts for manager,researcher and synthesiser roles to remove complexity 50 | - chore(workflows): Update action versions in GitHub workflows for consistency 51 | - chore(workflows): Update action versions for deploy docs to pgh-pages 52 | - docs(deps): Add documentation dependencies for MkDocs and related plugins to pyproject.toml 53 | 54 | ### 2025-03-17 55 | 56 | - feat(main,agent): refactor entry point to support async execution and enhance login handling 57 | - feat(cli,login,log): refactor entry point to integrate Typer, enhance logging, added login every run 58 | - feat(streamlit): replace load_config with load_app_config, enhance sidebar rendering, and improve output rendering with type support 59 | - feat(streamlit): enhance render_output function with detailed docstring and improve query handling in run_app 60 | - feat(streamlit): enhance render_output function with additional info parameter and improve output handling in run_app 61 | - feat(streamlit,app): add Typer dependency, update main entry point for async execution, add streamlit provider input 62 | - feat(agent): update configuration and improve agent system setup with enhanced error handling and new environment variables 63 | - feat(config,login,catch): add inference settings with usage limits and result retries, enhance login function to initialize environment and handle exceptions, comment out raise in 
error handling context to prevent unintended crashes 64 | - feat(login,catch): integrate logfire configuration in login function and improve error handling context 65 | 66 | ### 2025-03-16 67 | 68 | - feta(devconatiner): Refactor devcontainer setup: remove old configurations and add new setup targets for development and Ollama 69 | - feat(devcontainer): Changed from vscode to astral-sh devcontainer 70 | - feat(devcontainer): Changed to vscode container, added postcreatecommand make setup_env 71 | - feat(devcontainer): restructure environment setup with new devcontainer configurations 72 | - feat(devcontainer): update environment names for clarity in devcontainer configurations 73 | - refactor(agent): Added AgentConfig class for better agent configuration management, Refactored main function for streamlined agent initialization. 74 | - feat(config,agents): Update model providers and enhance configuration management, examples: Added new model providers: Gemini and OpenRouter, src: Enabled streaming responses in the agent system 75 | - chore: Remove unused prompt files, update configuration, and enhance logging setup 76 | - refactor(exception,logfire): Enhance error handling and update model configurations in agent system 77 | 78 | ### 2025-03-14 79 | 80 | - feat(scalene): Add profiling support and update dependencies 81 | - refactor(Makefile): Improve target descriptions and organization 82 | 83 | ### 2025-03-13 84 | 85 | - refactor(API,except): .env.example, add OpenRouter configuration, enhance error handling in run_simple_agent_system.py, and update ModelConfig to allow optional API key. 86 | - feat(streamlit): add Streamlit app structure with header, footer, sidebar, and main content components 87 | - feat(streamlit): enhance Streamlit app with detailed docstrings, improved header/footer, and refined main content layout 88 | - feat(makefile,streamlit): update Makefile commands for CLI and GUI execution, and modify README for usage instructions, add streamlit config.toml 89 | - feat(streamlit): restructure Streamlit app by removing unused components, adding new header, footer, sidebar, and output components, and updating configuration settings 90 | - chore: replace app entrypoint with main, remove unused tools and tests, and update makefile for linting and type checking 91 | - chore: Enhance makefile with coverage and help commands, update mkdocs.yaml and pyproject.toml for improved project structure and documentation 92 | - test: Update makefile for coverage reporting, modify pyproject.toml to include pytest-cov, and adjust dependency settings 93 | - test: Add coverage support with pytest-cov and update makefile for coverage reporting 94 | - test: makefile for coverage reporting, update dependencies in pyproject.toml for improved testing and coverage support 95 | - chore: Remove redundant help command from makefile 96 | - refactor(agent,async): Refactor agent tests to use async fixtures and update verification methods for async results 97 | - fix(Dockerfile): Remove unnecessary user creation and pip install commands from Dockerfile 98 | - feat(agent): Update dependencies and add new example structures; remove obsolete files 99 | - chore(structure): simplified agents.py 100 | - fix(pyproject): Replace pydantic-ai with pydantic-ai-slim and update dependencies 101 | - feat(examples): add new examples and data models; update configuration structure 102 | - feat(agent): update dependencies, enhance examples, and introduce new data models for research and analysis agents 103 | - 
feat(examples): enhance prompts structure and refactor research agent integration 104 | - feat(examples): improve documentation and enhance error handling in agent examples 105 | - feat(agent): Added data models and configuration for research and analysis agents, Added System C4 plantuml 106 | - feat(weave,dependencies): update dependencies and integrate Weave for enhanced functionality in the agent system 107 | - feat(agent): initialize agentops with API key and default tags for enhanced agent functionality 108 | - feat(agent): integrate logfire for logging and configure initial logging settings 109 | - feat(agent): adjust usage limits for ollama provider to enhance performance 110 | - feat(agent): refine system prompts and enhance data model structure for improved agent interactions 111 | - feat(agent): update system prompts for improved clarity and accuracy; add example environment configuration 112 | - feat(agent): enhance agent system with synthesiser functionality and update prompts for improved coordination 113 | - feat(agent): add Grok and Gemini API configurations; initialize logging and agent operations 114 | - feat(agent): improve documentation and refactor model configuration handling for agent system 115 | - feat(agent): update environment configuration, enhance logging, and refine agent management functionality 116 | - feat(agent): refactor login handling, update model retrieval, and enhance agent configuration 117 | 118 | ## [0.0.2] - 2025-01-20 119 | 120 | ### Added 121 | 122 | - PRD.md 123 | - C4 architecture diagrams: system context, code 124 | - tests: basic agent evals, config.json 125 | 126 | ### Changed 127 | 128 | - make recipes 129 | 130 | ## [0.0.1] - 2025-01-20 131 | 132 | ### Added 133 | 134 | - Makefile: setup, test, ruff 135 | - devcontainer: python only, w/o Jetbrains clutter from default devcontainer 136 | - ollama: server and model download successful 137 | - agent: tools use full run red 138 | - pytest: e2e runm final result red 139 | - Readme: basic project info 140 | - pyproject.toml 141 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | @AGENTS.md -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG APP_ROOT="/src" 2 | ARG PYTHON_VERSION="3.12" 3 | ARG USER="appuser" 4 | 5 | 6 | # Stage 1: Builder Image 7 | FROM python:${PYTHON_VERSION}-slim AS builder 8 | LABEL author="qte77" 9 | LABEL builder=true 10 | ENV PYTHONDONTWRITEBYTECODE=1 \ 11 | PYTHONUNBUFFERED=1 12 | COPY pyproject.toml uv.lock / 13 | RUN set -xe \ 14 | && pip install --no-cache-dir uv \ 15 | && uv sync --frozen 16 | 17 | 18 | # Stage 2: Runtime Image 19 | FROM python:${PYTHON_VERSION}-slim AS runtime 20 | LABEL author="qte77" 21 | LABEL runtime=true 22 | 23 | ARG APP_ROOT 24 | ARG USER 25 | ENV PYTHONDONTWRITEBYTECODE=1 \ 26 | PYTHONUNBUFFERED=1 \ 27 | PYTHONPATH=${APP_ROOT} \ 28 | PATH="${APP_ROOT}:${PATH}" 29 | # WANDB_KEY=${WANDB_KEY} \ 30 | # WANDB_DISABLE_CODE=true 31 | 32 | USER ${USER} 33 | WORKDIR ${APP_ROOT} 34 | COPY --from=builder /.venv .venv 35 | COPY --chown=${USER}:${USER} ${APP_ROOT} . 36 | 37 | CMD [ \ 38 | "uv", "run", \ 39 | "--locked", "--no-sync", \ 40 | "python", "-m", "." 
\ 41 | ] 42 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # BSD 3-Clause License 2 | 3 | Copyright (c) 2025 qte77 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # This Makefile automates the build, test, and clean processes for the project. 2 | # It provides a convenient way to run common tasks using the 'make' command. 3 | # It is designed to work with the 'uv' tool for managing Python environments and dependencies. 4 | # Run `make help` to see all available recipes. 5 | 6 | .SILENT: 7 | .ONESHELL: 8 | .PHONY: all setup_prod setup_dev setup_prod_ollama setup_dev_ollama setup_dev_claude setup_claude_code setup_ollama start_ollama stop_ollama clean_ollama ruff run_cli run_gui run_profile prp_gen_claude prp_exe_claude test_all coverage_all type_check output_unset_app_env_sh help 9 | # .DEFAULT: setup_dev_ollama 10 | .DEFAULT_GOAL := setup_dev_ollama 11 | 12 | SRC_PATH := src 13 | APP_PATH := $(SRC_PATH)/app 14 | GUI_PATH_ST := $(SRC_PATH)/run_gui.py 15 | CHAT_CFG_FILE := $(APP_PATH)/config_chat.json 16 | OLLAMA_SETUP_URL := https://ollama.com/install.sh 17 | OLLAMA_MODEL_NAME := $$(jq -r '.providers.ollama.model_name' $(CHAT_CFG_FILE)) 18 | PRP_DEF_PATH := /context/PRPs/features 19 | PRP_CLAUDE_GEN_CMD := generate-prp 20 | PRP_CLAUDE_EXE_CMD := execute-prp 21 | 22 | # construct the full path to the PRP definition file 23 | define CLAUDE_PRP_RUNNER 24 | echo "Starting Claude Code PRP runner ..." 25 | # 1. Extract arguments and validate that they are not empty. 26 | prp_file=$(firstword $(strip $(1))) 27 | cmd_prp=$(firstword $(strip $(2))) 28 | if [ -z "$${prp_file}" ]; then 29 | echo "Error: ARGS for PRP filename is empty. Please provide a PRP filename." 
30 | exit 1 31 | fi 32 | if [ -z "$${cmd_prp}" ]; then 33 | echo "Error: ARGS for command is empty. Please provide a command." 34 | exit 2 35 | fi 36 | cmd_prp="/project:$${cmd_prp} $(PRP_DEF_PATH)/$${prp_file}" 37 | cmd_cost="/cost" 38 | echo "Executing command '$${cmd_prp}' ..." 39 | claude -p "$${cmd_prp}" 2>&1 40 | claude -p "$${cmd_cost}" 2>&1 41 | endef 42 | 43 | setup_prod: ## Install uv and prod dependencies 44 | echo "Setting up prod environment ..." 45 | pip install uv -q 46 | uv sync --frozen 47 | 48 | setup_dev: ## Install uv and all dev dependencies 49 | echo "Setting up dev environment ..." 50 | pip install uv -q 51 | uv sync --all-groups 52 | 53 | setup_prod_ollama: ## Install prod deps, download Ollama and start the server 54 | $(MAKE) -s setup_prod 55 | $(MAKE) -s setup_ollama 56 | $(MAKE) -s start_ollama 57 | 58 | setup_dev_ollama: ## Install dev deps, download Ollama and start the server 59 | $(MAKE) -s setup_dev 60 | $(MAKE) -s setup_ollama 61 | $(MAKE) -s start_ollama 62 | 63 | setup_dev_claude: ## Install dev deps and set up Claude Code CLI 64 | $(MAKE) -s setup_dev 65 | $(MAKE) -s setup_claude_code 66 | 67 | setup_claude_code: ## Set up Claude Code CLI; Node.js and npm have to be present 68 | echo "Setting up Claude Code ..." 69 | npm install -g @anthropic-ai/claude-code 70 | claude config set --global preferredNotifChannel terminal_bell 71 | echo "npm version: $$(npm --version)" 72 | claude --version 73 | 74 | # Ollama BINDIR in /usr/local/bin /usr/bin /bin 75 | setup_ollama: ## Download Ollama and pull the model; the install script may also start the server 76 | echo "Downloading Ollama binary... Using '$(OLLAMA_SETUP_URL)'." 77 | # script does start server but not consistently 78 | curl -fsSL $(OLLAMA_SETUP_URL) | sh 79 | echo "Pulling model '$(OLLAMA_MODEL_NAME)' ..." 80 | ollama pull $(OLLAMA_MODEL_NAME) 81 | 82 | start_ollama: ## Start local Ollama server, default 127.0.0.1:11434 83 | ollama serve 84 | 85 | stop_ollama: ## Stop local Ollama server 86 | echo "Stopping Ollama server..." 87 | pkill ollama 88 | 89 | clean_ollama: ## Remove local Ollama from system 90 | echo "Searching for Ollama binary..." 91 | for BINDIR in /usr/local/bin /usr/bin /bin; do 92 | if echo $$PATH | grep -q $$BINDIR; then 93 | echo "Ollama binary found in '$$BINDIR'" 94 | BIN="$$BINDIR/ollama" 95 | break 96 | fi 97 | done 98 | echo "Cleaning up..." 99 | rm -f "$$BIN" 100 | 101 | ruff: ## Lint: Format and check with ruff 102 | uv run ruff format 103 | uv run ruff check --fix 104 | 105 | run_cli: ## Run app on CLI only 106 | path=$$(echo "$(APP_PATH)" | tr '/' '.') 107 | uv run python -m $${path}.main $(ARGS) 108 | 109 | run_gui: ## Run app with Streamlit GUI 110 | uv run streamlit run $(GUI_PATH_ST) 111 | 112 | run_profile: ## Profile app with scalene 113 | uv run scalene --outfile \ 114 | "$(APP_PATH)/scalene-profiles/profile-$$(date +%Y%m%d-%H%M%S)" \ 115 | "$(APP_PATH)/main.py" 116 | 117 | prp_gen_claude: ## Generate the PRP from the file passed in ARGS 118 | $(call CLAUDE_PRP_RUNNER, $(ARGS), $(PRP_CLAUDE_GEN_CMD)) 119 | 120 | prp_exe_claude: ## Execute the PRP from the file passed in ARGS 121 | $(call CLAUDE_PRP_RUNNER, $(ARGS), $(PRP_CLAUDE_EXE_CMD)) 122 | 123 | test_all: ## Run all tests 124 | uv run pytest 125 | 126 | coverage_all: ## Get test coverage 127 | uv run coverage run -m pytest || true 128 | uv run coverage report -m 129 | 130 | type_check: ## Check for static typing errors 131 | uv run mypy $(APP_PATH) 132 | 133 | output_unset_app_env_sh: ## Generate a script that unsets app API-key environment variables 134 | uf="./unset_env.sh" 135 | echo "Outputting '$${uf}' ..." 
136 | printenv | awk -F= '/_API_KEY=/ {print "unset " $$1}' > $$uf 137 | 138 | help: ## Displays this message with available recipes 139 | # TODO add stackoverflow source 140 | echo "Usage: make [recipe]" 141 | echo "Recipes:" 142 | awk '/^[a-zA-Z0-9_-]+:.*?##/ { 143 | helpMessage = match($$0, /## (.*)/) 144 | if (helpMessage) { 145 | recipe = $$1 146 | sub(/:/, "", recipe) 147 | printf " \033[36m%-20s\033[0m %s\n", recipe, substr($$0, RSTART + 3, RLENGTH) 148 | } 149 | }' $(MAKEFILE_LIST) 150 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Agents-eval 2 | 3 | This project aims to implement an evaluation pipeline to assess the effectiveness of open-source agentic AI systems across various use cases, focusing on use case agnostic metrics that measure core capabilities such as task decomposition, tool integration, adaptability, and overall performance. 4 | 5 | ![License](https://img.shields.io/badge/license-BSD3Clause-green.svg) 6 | ![Version](https://img.shields.io/badge/version-1.1.0-58f4c2) 7 | [![CodeQL](https://github.com/qte77/Agents-eval/actions/workflows/codeql.yaml/badge.svg)](https://github.com/qte77/Agents-eval/actions/workflows/codeql.yaml) 8 | [![CodeFactor](https://www.codefactor.io/repository/github/qte77/Agents-eval/badge)](https://www.codefactor.io/repository/github/qte77/Agents-eval) 9 | [![ruff](https://github.com/qte77/Agents-eval/actions/workflows/ruff.yaml/badge.svg)](https://github.com/qte77/Agents-eval/actions/workflows/ruff.yaml) 10 | [![pytest](https://github.com/qte77/Agents-eval/actions/workflows/pytest.yaml/badge.svg)](https://github.com/qte77/Agents-eval/actions/workflows/pytest.yaml) 11 | [![Link Checker](https://github.com/qte77/Agents-eval/actions/workflows/links-fail-fast.yaml/badge.svg)](https://github.com/qte77/Agents-eval/actions/workflows/links-fail-fast.yaml) 12 | [![Deploy Docs](https://github.com/qte77/Agents-eval/actions/workflows/generate-deploy-mkdocs-ghpages.yaml/badge.svg)](https://github.com/qte77/Agents-eval/actions/workflows/generate-deploy-mkdocs-ghpages.yaml) 13 | 14 | **DevEx** [![vscode.dev](https://img.shields.io/static/v1?logo=visualstudiocode&label=&message=vscode.dev&labelColor=2c2c32&color=007acc&logoColor=007acc)](https://vscode.dev/github/qte77/Agents-eval) 15 | [![Codespace Dev](https://img.shields.io/static/v1?logo=visualstudiocode&label=&message=Codespace%20Dev&labelColor=2c2c32&color=007acc&logoColor=007acc)](https://github.com/codespaces/new?repo=qte77/Agents-eval&devcontainer_path=.devcontainer/setup_dev/devcontainer.json) 16 | [![Codespace Dev Claude Code](https://img.shields.io/static/v1?logo=visualstudiocode&label=&message=Codespace%20Dev%20Claude%20Code&labelColor=2c2c32&color=007acc&logoColor=007acc)](https://github.com/codespaces/new?repo=qte77/Agents-eval&devcontainer_path=.devcontainer/setup_dev_claude/devcontainer.json) 17 | [![Codespace Dev Ollama](https://img.shields.io/static/v1?logo=visualstudiocode&label=&message=Codespace%20Dev%20Ollama&labelColor=2c2c32&color=007acc&logoColor=007acc)](https://github.com/codespaces/new?repo=qte77/Agents-eval&devcontainer_path=.devcontainer/setup_dev_ollama/devcontainer.json) 18 | [![TalkToGithub](https://img.shields.io/badge/TalkToGithub-7a83ff.svg)](https://talktogithub.com/qte77/Agents-eval) 19 | [![llms.txt (UitHub)](https://img.shields.io/badge/llms.txt-uithub-800080.svg)](https://github.com/qte77/Agents-eval) 20 | [![llms.txt 
(GitToDoc)](https://img.shields.io/badge/llms.txt-GitToDoc-fe4a60.svg)](https://gittodoc.com/qte77/Agents-eval) 21 | 22 | ## Status 23 | 24 | (DRAFT) (WIP) ----> Not fully implemented yet 25 | 26 | For version history have a look at the [CHANGELOG](CHANGELOG.md). 27 | 28 | ## Setup and Usage 29 | 30 | - `make setup_prod` 31 | - `make setup_dev` or `make setup_dev_claude` or `make setup_dev_ollama` 32 | - `make run_cli` or `make run_cli ARGS="--help"` 33 | - `make run_gui` 34 | - `make test_all` 35 | 36 | ### Configuration 37 | 38 | - [config_app.py](src/app/config/config_app.py) contains configuration constants for the application. 39 | - [config_chat.json](src/app/config/config_chat.json) contains inference provider configuration and prompts. inference endpoints used should adhere to [OpenAI Model Spec 2024-05-08](https://cdn.openai.com/spec/model-spec-2024-05-08.html) which is used by [pydantic-ai OpenAI-compatible Models](https://ai.pydantic.dev/models/#openai-compatible-models). 40 | - [config_eval.json](src/app/config/config_eval.json) contains evaluation metrics and their weights. 41 | - [data_models.py](src/app/config/data_models.py) contains the pydantic data models for agent system configuration and results. 42 | 43 | ### Environment 44 | 45 | [.env.example](.env.example) contains examples for usage of API keys and variables. 46 | 47 | ```text 48 | # inference EP 49 | GEMINI_API_KEY="xyz" 50 | 51 | # tools 52 | TAVILY_API_KEY="" 53 | 54 | # log/mon/trace 55 | WANDB_API_KEY="xyz" 56 | ``` 57 | 58 | ### Customer Journey 59 | 60 |
61 | Show Customer Journey 62 | Customer Journey 63 | Customer Journey 64 |
65 | 66 | ### Note 67 | 68 | 1. The contained chat configuration uses free inference endpoints which are subject to change by the providers. See lists such as [free-llm-api-resources](https://github.com/cheahjs/free-llm-api-resources) to find other providers. 69 | 2. The contained chat configuration uses models which are also subject to change by the providers and have to be updated from time to time. 70 | 3. LLM-as-judge is also subject to the chat configuration. 71 | 72 | ## Documentation 73 | 74 | [Agents-eval](https://qte77.github.io/Agents-eval) 75 | 76 | ### Project Outline 77 | 78 | `# TODO` 79 | 80 | ### Agents 81 | 82 | #### Manager Agent 83 | 84 | - **Description**: Oversees research and analysis tasks, coordinating the efforts of the research, analysis, and synthesizer agents to provide comprehensive answers to user queries. Delegates tasks and ensures the accuracy of the information. 85 | - **Responsibilities**: 86 | - Coordinates the research, analysis, and synthesis agents. 87 | - Delegates research tasks to the Research Agent. 88 | - Delegates analysis tasks to the Analysis Agent. 89 | - Delegates synthesis tasks to the Synthesizer Agent. 90 | - Ensures the accuracy of the information. 91 | - **Location**: [src/app/agents/agent_system.py](https://github.com/qte77/Agents-eval/blob/main/src/app/agents/agent_system.py) 92 | 93 | #### Researcher Agent 94 | 95 | - **Description**: Gathers and analyzes data relevant to a given topic, utilizing search tools to collect data and verifying the accuracy of assumptions, facts, and conclusions. 96 | - **Responsibilities**: 97 | - Gathers and analyzes data relevant to the topic. 98 | - Uses search tools to collect data. 99 | - Checks the accuracy of assumptions, facts, and conclusions. 100 | - **Tools**: 101 | - [DuckDuckGo Search Tool](https://ai.pydantic.dev/common-tools/#duckduckgo-search-tool) 102 | - **Location**: [src/app/agents/agent_system.py](https://github.com/qte77/Agents-eval/blob/main/src/app/agents/agent_system.py) 103 | 104 | #### Analyst Agent 105 | 106 | - **Description**: Checks the accuracy of assumptions, facts, and conclusions in the provided data, providing relevant feedback and ensuring data integrity. 107 | - **Responsibilities**: 108 | - Checks the accuracy of assumptions, facts, and conclusions. 109 | - Provides relevant feedback if the result is not approved. 110 | - Ensures data integrity. 111 | - **Location**: [src/app/agents/agent_system.py](https://github.com/qte77/Agents-eval/blob/main/src/app/agents/agent_system.py) 112 | 113 | #### Synthesizer Agent 114 | 115 | - **Description**: Outputs a well-formatted scientific report using the data provided, maintaining the original facts, conclusions, and sources. 116 | - **Responsibilities**: 117 | - Outputs a well-formatted scientific report using the provided data. 118 | - Maintains the original facts, conclusions, and sources. 119 | - **Location**: [src/app/agents/agent_system.py](https://github.com/qte77/Agents-eval/blob/main/src/app/agents/agent_system.py) 120 | 121 | ### Datasets used 122 | 123 | `# TODO` 124 | 125 | ### Evaluation metrics 126 | 127 | `# TODO` 128 | 129 | - Time to complete task (time_taken) 130 | - Task success rate (task_success) 131 | - Agent coordination (coordination_quality) 132 | - Tool usage efficiency (tool_efficiency) 133 | - Plan coherence (planning_rational) 134 | - Text response quality (text_similarity) 135 | - Autonomy vs. 
human intervention (HITL, user feedback) 136 | - Reactivity (adapts to changes in tasks and environments) 137 | - Memory consistency 138 | 139 | ### Evaluation Metrics Baseline 140 | 141 | As configured in [config_eval.json](src/app/config/config_eval.json). 142 | 143 | ```json 144 | { 145 | "evaluators_and_weights": { 146 | "planning_rational": "1/6", 147 | "task_success": "1/6", 148 | "tool_efficiency": "1/6", 149 | "coordination_quality": "1/6", 150 | "time_taken": "1/6", 151 | "text_similarity": "1/6" 152 | } 153 | } 154 | ``` 155 | 156 | ### Eval Metrics Sweep 157 | 158 |
159 | Eval Metrics Sweep 160 | Eval Metrics Sweep 161 |
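The baseline weights above are combined into a single composite score per run. A minimal sketch of such a weighted aggregation, assuming every metric has already been normalized to the 0.0-1.0 range; function and variable names are illustrative and not taken from `src/app/evals/metrics.py`:

```python
from fractions import Fraction

# Weights as configured in config_eval.json ("1/6" each in the baseline)
EVALUATORS_AND_WEIGHTS = {
    "planning_rational": "1/6",
    "task_success": "1/6",
    "tool_efficiency": "1/6",
    "coordination_quality": "1/6",
    "time_taken": "1/6",
    "text_similarity": "1/6",
}


def composite_score(metric_results: dict[str, float]) -> float:
    """Combine normalized per-metric scores (0.0-1.0) into one weighted score."""
    weights = {name: float(Fraction(w)) for name, w in EVALUATORS_AND_WEIGHTS.items()}
    total = sum(weights.values())
    return sum(weights[name] * metric_results.get(name, 0.0) for name in weights) / total


# Hypothetical example run
print(composite_score({
    "planning_rational": 0.8, "task_success": 1.0, "tool_efficiency": 0.7,
    "coordination_quality": 0.6, "time_taken": 0.9, "text_similarity": 0.75,
}))  # ~0.79
```

A sweep as in the diagram above then simply repeats this aggregation for different weight configurations and numbers of runs.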
162 | 163 | ### Tools available 164 | 165 | Other pydantic-ai agents and [pydantic-ai DuckDuckGo Search Tool](https://ai.pydantic.dev/common-tools/#duckduckgo-search-tool). 166 | 167 | ### Agentic System Architecture 168 | 169 |
170 | Show Agentic System Architecture 171 | Agentic System C4-Arch 172 |
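To make the diagram concrete, here is a minimal sketch of the tool-based delegation pattern it depicts, with the manager agent delegating to a research agent that carries the DuckDuckGo search tool. Model ids, prompts, and the `ResearchResult` shape are placeholders, and parameter names may differ between pydantic-ai versions; see `src/app/agents/agent_system.py` for the actual implementation.

```python
from pydantic import BaseModel
from pydantic_ai import Agent, RunContext
from pydantic_ai.common_tools.duckduckgo import duckduckgo_search_tool


class ResearchResult(BaseModel):
    topic: str
    findings: list[str]


# Placeholder model id; the real provider and model come from config_chat.json
research_agent = Agent(
    "openai:gpt-4o-mini",
    output_type=ResearchResult,
    tools=[duckduckgo_search_tool()],
    system_prompt="Gather and verify information on the given topic.",
)

manager_agent = Agent(
    "openai:gpt-4o-mini",
    system_prompt="Coordinate research, analysis, and synthesis to answer the query.",
)


@manager_agent.tool
async def delegate_research(ctx: RunContext[None], query: str) -> ResearchResult:
    """Delegate a research task to the research agent, sharing the usage budget."""
    result = await research_agent.run(query, usage=ctx.usage)
    return result.output
```

The same pattern repeats for the analysis and synthesiser agents, which is why delegation appears as optional tool calls in the diagram.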
173 | 174 | ### Project Repo Structure 175 | 176 |
177 | Show Repo Structure 178 | ```sh 179 | |- .claude # claude code config and commands 180 | |- .devcontainer # pre-configured dev env 181 | |- .github # workflows 182 | |- .streamlit # config.toml 183 | |- .vscode # extensions, settings 184 | |- assets/images 185 | |- docs 186 | |- src # source code 187 | |- app 188 | |- agents 189 | |- config 190 | |- evals 191 | |- utils 192 | |- __init__.py 193 | |- main.py 194 | \- py.typed 195 | |- examples 196 | |- gui 197 | \- run_gui.py 198 | |- tests 199 | |- .env.example # example env vars 200 | |- .gitignore 201 | |- .gitmessage 202 | |- AGENTS.md # common file like agentsmd.com 203 | |- CHANGELOG.md # short project history 204 | |- CLAUDE.md # points to AGENTS.md 205 | |- Dockerfile # create app image 206 | |- LICENSE.md 207 | |- Makefile # helper scripts 208 | |- mkdocs.yaml # docs from docstrings 209 | |- pyproject.toml # project settings 210 | |- README.md # project description 211 | \- uv.lock # resolved package versions 212 | ``` 213 |
214 | 215 | ## Landscape overview 216 | 217 | ### Agentic System Frameworks 218 | 219 | - [PydanticAI](https://github.com/pydantic/pydantic-ai) 220 | - [restack](https://www.restack.io/) 221 | - [smolAgents](https://github.com/huggingface/smolagents) 222 | - [AutoGen](https://github.com/microsoft/autogen) 223 | - [Semantic Kernel](https://github.com/microsoft/semantic-kernel) 224 | - [CrewAI](https://github.com/crewAIInc/crewAI) 225 | - [Langchain](https://github.com/langchain-ai/langchain) 226 | - [Langflow](https://github.com/langflow-ai/langflow) 227 | 228 | ### Agent-builder 229 | 230 | - [Archon](https://github.com/coleam00/Archon) 231 | - [Agentstack](https://github.com/AgentOps-AI/AgentStack) 232 | 233 | ### Evaluation 234 | 235 | - Focusing on agentic systems 236 | - [AgentNeo](https://github.com/raga-ai-hub/agentneo) 237 | - [AutoGenBench](https://github.com/microsoft/autogen/blob/0.2/samples/tools/autogenbench) 238 | - [Langchain AgentEvals](https://github.com/langchain-ai/agentevals) 239 | - [Mosaic AI Agent Evaluation](https://docs.databricks.com/en/generative-ai/agent-evaluation/index.html) 240 | - [RagaAI-Catalyst](https://github.com/raga-ai-hub/RagaAI-Catalyst) 241 | - [AgentBench](https://github.com/THUDM/AgentBench) 242 | - RAG oriented 243 | - [RAGAs](https://github.com/explodinggradients/ragas) 244 | - LLM apps 245 | - [DeepEval](https://github.com/confident-ai/deepeval) 246 | - [Langchain OpenEvals](https://github.com/langchain-ai/openevals) 247 | - [MLFlow LLM Evaluate](https://mlflow.org/docs/latest/llms/llm-evaluate/index.html) 248 | - [DeepEval (DeepSeek)]( github.com/confident-ai/deepeval) 249 | 250 | ### Observation, Monitoring, Tracing 251 | 252 | - [AgentOps - Agency](https://www.agentops.ai/) 253 | - [arize](https://arize.com/) 254 | - [Langtrace](https://www.langtrace.ai/) 255 | - [LangSmith - Langchain](https://www.langchain.com/langsmith) 256 | - [Weave - Weights & Biases](https://wandb.ai/site/weave/) 257 | - [Pydantic- Logfire](https://pydantic.dev/logfire) 258 | 259 | ### Datasets 260 | 261 | - [awesome-reasoning - Collection of datasets](https://github.com/neurallambda/awesome-reasoning) 262 | 263 | #### Scientific 264 | 265 | - [SWIF2T](https://arxiv.org/abs/2405.20477), Automated Focused Feedback Generation for Scientific Writing Assistance, 2024, 300 peer reviews citing weaknesses in scientific papers and conduct human evaluation 266 | - [PeerRead](https://github.com/allenai/PeerRead), A Dataset of Peer Reviews (PeerRead): Collection, Insights and NLP Applications, 2018, 14K paper drafts and the corresponding accept/reject decisions, over 10K textual peer reviews written by experts for a subset of the papers, structured JSONL, clear labels 267 | - [BigSurvey](https://www.ijcai.org/proceedings/2022/0591.pdf), Generating a Structured Summary of Numerous Academic Papers: Dataset and Method, 2022, 7K survey papers and 430K referenced papers abstracts 268 | - [SciXGen](https://arxiv.org/abs/2110.10774), A Scientific Paper Dataset for Context-Aware Text Generation, 2021, 205k papers 269 | - [scientific_papers](https://huggingface.co/datasets/armanc/scientific_papers), 2018, two sets of long and structured documents, obtained from ArXiv and PubMed OpenAccess, 300k+ papers, total disk 7GB 270 | 271 | #### Reasoning, Deduction, Commonsense, Logic 272 | 273 | - [LIAR](https://www.cs.ucsb.edu/~william/data/liar_dataset.zip), fake news detection, only 12.8k records, single label 274 | - [X-Fact](https://github.com/utahnlp/x-fact/), Benchmark Dataset for 
Multilingual Fact Checking, 31.1k records, large, multilingual 275 | - [MultiFC](https://www.copenlu.com/publication/2019_emnlp_augenstein/), A Real-World Multi-Domain Dataset for Evidence-Based Fact Checking of Claims, 34.9k records 276 | - [FEVER](https://fever.ai/dataset/fever.html), Fact Extraction and VERification, 185.4k records 277 | - TODO GSM8K, bAbI, CommonsenseQA, DROP, LogiQA, MNLI 278 | 279 | #### Planning, Execution 280 | 281 | - [Plancraft](https://arxiv.org/abs/2412.21033), an evaluation dataset for planning with LLM agents, both a text-only and multi-modal interface 282 | - [IDAT](https://arxiv.org/abs/2407.08898), A Multi-Modal Dataset and Toolkit for Building and Evaluating Interactive Task-Solving Agents 283 | - [PDEBench](https://github.com/pdebench/PDEBench), set of benchmarks for scientific machine learning 284 | - [MatSci-NLP](https://arxiv.org/abs/2305.08264), evaluating the performance of natural language processing (NLP) models on materials science text 285 | - TODO BigBench Hard, FSM Game 286 | 287 | #### Tool Use, Function Invocation 288 | 289 | - [Trelis Function Calling](https://huggingface.co/datasets/Trelis/function_calling_v3) 290 | - [KnowLM Tool](https://huggingface.co/datasets/zjunlp/KnowLM-Tool) 291 | - [StatLLM](https://arxiv.org/abs/2502.17657), statistical analysis tasks, LLM-generated SAS code, and human evaluation scores 292 | - TODO ToolComp 293 | 294 | ### Benchmarks 295 | 296 | - [SciArena: A New Platform for Evaluating Foundation Models in Scientific Literature Tasks](https://allenai.org/blog/sciarena) 297 | - [AgentEvals CORE-Bench Leaderboard](https://huggingface.co/spaces/agent-evals/core_leaderboard) 298 | - [Berkeley Function-Calling Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html) 299 | - [Chatbot Arena LLM Leaderboard](https://lmsys.org/projects/) 300 | - [GAIA Leaderboard](https://gaia-benchmark-leaderboard.hf.space/) 301 | - [GalileoAI Agent Leaderboard](https://huggingface.co/spaces/galileo-ai/agent-leaderboard) 302 | - [WebDev Arena Leaderboard](https://web.lmarena.ai/leaderboard) 303 | - [MiniWoB++: a web interaction benchmark for reinforcement learning](https://miniwob.farama.org/) 304 | 305 | ### Research Agents 306 | 307 | - [Ai2 Scholar QA](https://qa.allen.ai/chat) 308 | 309 | ## Further Reading 310 | 311 | - [[2506.18096] Deep Research Agents: A Systematic Examination And Roadmap](https://arxiv.org/abs/2506.18096), [gh / ai-agents-2030 / awesome-deep-research-agent](https://github.com/ai-agents-2030/awesome-deep-research-agent) 312 | - [[2504.19678] From LLM Reasoning to Autonomous AI Agents: A Comprehensive Review](https://arxiv.org/abs/2504.19678) 313 | - [[2503.21460] Large Language Model Agent: A Survey on Methodology, Applications and Challenges](https://arxiv.org/abs/2503.21460) 314 | - [[2503.16416] Survey on Evaluation of LLM-based Agents](https://arxiv.org/abs/2503.16416) 315 | - [[2503.13657] Why Do Multi-Agent LLM Systems Fail?](https://arxiv.org/abs/2503.13657) 316 | - [[2502.14776] SurveyX: Academic Survey Automation via Large Language Models](https://arxiv.org/abs/2502.14776) 317 | - [[2502.05957] AutoAgent: A Fully-Automated and Zero-Code Framework for LLM Agents](https://arxiv.org/abs/2502.05957) 318 | - [[2502.02649] Fully Autonomous AI Agents Should Not be Developed](https://arxiv.org/abs/2502.02649) 319 | - [[2501.16150] AI Agents for Computer Use: A Review of Instruction-based Computer Control, GUI Automation, and Operator Assistants](https://arxiv.org/abs/2501.16150) 320 | - [[2501.06590] 
ChemAgent](https://arxiv.org/abs/2501.06590) 321 | - [[2501.06322] Multi-Agent Collaboration Mechanisms: A Survey of LLMs](https://arxiv.org/abs/2501.06322) 322 | - [[2501.04227] Agent Laboratory: Using LLM Agents as Research Assitants](https://arxiv.org/abs/2501.04227), [AgentRxiv:Towards Collaborative Autonomous Research](https://agentrxiv.github.io/) 323 | - [[2501.00881] Agentic Systems: A Guide to Transforming Industries with Vertical AI Agents](https://arxiv.org/abs/2501.00881) 324 | - [[2412.04093] Practical Considerations for Agentic LLM Systems](https://arxiv.org/abs/2412.04093) 325 | - [[2411.13768] Evaluation-driven Approach to LLM Agents](https://arxiv.org/abs/2411.13768) 326 | - [[2411.10478] Large Language Models for Constructing and Optimizing Machine Learning Workflows: A Survey](https://arxiv.org/abs/2411.10478) 327 | - [[2411.05285] A taxonomy of agentops for enabling observability of foundation model based agents](https://arxiv.org/abs/2411.05285) 328 | - [[2410.22457] Advancing Agentic Systems: Dynamic Task Decomposition, Tool Integration and Evaluation using Novel Metrics and Dataset](https://arxiv.org/abs/2410.22457) 329 | - [[2408.06361] Large Language Model Agent in Financial Trading: A Survey](https://arxiv.org/abs/2408.06361) 330 | - [[2408.06292] The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery](https://arxiv.org/abs/2408.06292) 331 | - [[2404.13501] A Survey on the Memory Mechanism of Large Language Model based Agents](https://arxiv.org/pdf/2404.13501) 332 | - [[2402.06360] CoSearchAgent: A Lightweight Collaborative Search Agent with Large Language Models](https://arxiv.org/abs/2402.06360) 333 | - [[2402.02716] Understanding the planning of LLM agents: A survey](https://arxiv.org/abs/2402.02716) 334 | - [[2402.01030] Executable Code Actions Elicit Better LLM Agents](https://arxiv.org/abs/2402.01030) 335 | - [[2308.11432] A Survey on Large Language Model based Autonomous Agents](https://arxiv.org/abs/2308.11432) 336 | -------------------------------------------------------------------------------- /assets/images/c4-multi-agent-system.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/assets/images/c4-multi-agent-system.png -------------------------------------------------------------------------------- /assets/images/customer-journey-activity-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/assets/images/customer-journey-activity-dark.png -------------------------------------------------------------------------------- /assets/images/customer-journey-activity-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/assets/images/customer-journey-activity-light.png -------------------------------------------------------------------------------- /assets/images/metrics-eval-sweep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/assets/images/metrics-eval-sweep.png -------------------------------------------------------------------------------- /context/PRPs/coordination_quality.md: 
-------------------------------------------------------------------------------- 1 | # Coordination Quality Feature PRP 2 | 3 | ## Goal 4 | 5 | Implement a comprehensive coordination quality measurement and monitoring system for the multi-agent evaluation framework to assess how effectively agents collaborate, delegate tasks, and maintain workflow integrity. 6 | 7 | ## Why 8 | 9 | - **Evaluation Completeness**: The coordination_quality metric is defined in `config_eval.json` (0.167 weight) but not implemented 10 | - **System Reliability**: Need to measure and improve agent coordination failures and bottlenecks 11 | - **Performance Optimization**: Identify coordination inefficiencies that impact overall system performance 12 | - **Research Value**: Provide quantitative data on multi-agent coordination patterns for evaluation research 13 | 14 | ## What 15 | 16 | A coordination quality monitoring system that measures: 17 | 18 | - Task delegation success rates between agents 19 | - Inter-agent communication efficiency and latency 20 | - Workflow completion rates and error recovery 21 | - Resource utilization across agent interactions 22 | - Coordination failure detection and analysis 23 | 24 | ### Success Criteria 25 | 26 | - [ ] Coordination quality metric implemented and functional in evaluation system 27 | - [ ] Real-time coordination monitoring dashboard 28 | - [ ] Coordination failure detection and alerting 29 | - [ ] Performance metrics collection and analysis 30 | - [ ] Integration with existing evaluation pipeline 31 | 32 | ## All Needed Context 33 | 34 | ### Documentation & References 35 | 36 | ```yaml 37 | - file: /workspaces/Agents-eval/src/app/agents/agent_system.py 38 | why: Core coordination logic, delegation patterns, tool-based coordination 39 | critical: Lines 91-99 show delegation pattern, _validate_model_return validation 40 | 41 | - file: /workspaces/Agents-eval/src/app/config/data_models.py 42 | why: Data contracts for coordination, Pydantic models for agent communication 43 | critical: ResearchResult, AnalysisResult, ResearchSummary models 44 | 45 | - file: /workspaces/Agents-eval/src/app/config/config_eval.json 46 | why: Coordination quality metric weight (0.167) defined but not implemented 47 | critical: Need to implement the missing coordination_quality metric 48 | 49 | - file: /workspaces/Agents-eval/src/app/evals/metrics.py 50 | why: Evaluation metrics implementation patterns 51 | critical: How other metrics are implemented and integrated 52 | 53 | - file: /workspaces/Agents-eval/src/app/config/config_chat.json 54 | why: Agent prompts defining coordination behavior and approval workflows 55 | critical: Manager agent orchestration prompts 56 | ``` 57 | 58 | ### Current Codebase Tree 59 | 60 | ```bash 61 | src/app/ 62 | ├── agents/ 63 | │ ├── agent_system.py # Core coordination logic 64 | │ └── llm_model_funs.py # Model management 65 | ├── config/ 66 | │ ├── config_app.py # Common app configuration 67 | │ ├── data_models.py # Coordination data contracts 68 | │ ├── config_chat.json # Agent coordination prompts 69 | │ └── config_eval.json # Evaluation metrics (coordination_quality: 0.167) 70 | ├── evals/ 71 | │ └── metrics.py # Evaluation metrics implementation 72 | ├── utils/ 73 | │ ├── error_messages.py # Error handling patterns 74 | │ └── log.py # Logging utilities 75 | └── main.py # Entry point 76 | ``` 77 | 78 | ### Desired Codebase Tree 79 | 80 | ```bash 81 | src/app/ 82 | ├── evals/ 83 | │ ├── [existing folders unchanged] 84 | │ ├── coordination_quality/ 85 | │ │ 
├── __init__.py 86 | │ │ ├── quality_metrics.py # Coordination quality measurement 87 | │ │ ├── monitoring.py # Real-time coordination monitoring 88 | │ │ └── analyzer.py # Coordination pattern analysis 89 | │ └── metrics.py # Updated with coordination_quality implementation 90 | └── [existing files unchanged] 91 | ``` 92 | 93 | ### Known Gotchas & Library Quirks 94 | 95 | ```python 96 | # CRITICAL: PydanticAI coordination patterns 97 | # - Tool-based delegation via @agent.tool decorator 98 | # - Usage tracking shared via RunContext 99 | # - Streaming with Pydantic models has NotImplementedError in agent_system.py 100 | 101 | # GOTCHA: Validation requirements 102 | # - All agent communication must use _validate_model_return() 103 | # - Pydantic models required for type safety 104 | # - Error handling must follow utils/error_messages.py patterns 105 | 106 | # LIBRARY QUIRK: PydanticAI Usage Limits 107 | # - UsageLimits shared across agents via RunContext 108 | # - Coordination can fail if usage limits exceeded 109 | # - Need to track usage per coordination step 110 | ``` 111 | 112 | ## Implementation Blueprint 113 | 114 | ### Data Models and Structure 115 | 116 | ```python 117 | # coordination/quality_metrics.py 118 | class CoordinationMetrics(BaseModel): 119 | """Coordination quality metrics data model.""" 120 | 121 | delegation_success_rate: float 122 | communication_latency: float 123 | workflow_completion_rate: float 124 | error_recovery_rate: float 125 | resource_utilization: float 126 | coordination_score: float 127 | 128 | class CoordinationEvent(BaseModel): 129 | """Individual coordination event tracking.""" 130 | 131 | timestamp: datetime 132 | source_agent: str 133 | target_agent: str 134 | event_type: str # delegation, response, error, retry 135 | success: bool 136 | latency_ms: float 137 | error_message: str | None = None 138 | ``` 139 | 140 | ### List of Tasks 141 | 142 | ```yaml 143 | Task 1: 144 | CREATE src/app/coordination/__init__.py: 145 | - EMPTY file for Python package 146 | 147 | Task 2: 148 | CREATE src/app/coordination/quality_metrics.py: 149 | - IMPLEMENT CoordinationMetrics and CoordinationEvent models 150 | - IMPLEMENT calculate_coordination_quality() function 151 | - PATTERN: Follow existing Pydantic models in data_models.py 152 | 153 | Task 3: 154 | CREATE src/app/coordination/monitoring.py: 155 | - IMPLEMENT CoordinationMonitor class 156 | - TRACK delegation events, latency, success rates 157 | - PATTERN: Use existing logging patterns from utils/log.py 158 | 159 | Task 4: 160 | CREATE src/app/coordination/analyzer.py: 161 | - IMPLEMENT coordination pattern analysis 162 | - DETECT coordination failures and bottlenecks 163 | - GENERATE coordination quality reports 164 | 165 | Task 5: 166 | MODIFY src/app/agents/agent_system.py: 167 | - FIND _add_tools_to_manager_agent function 168 | - INJECT coordination monitoring into delegation tools 169 | - PRESERVE existing delegation patterns 170 | 171 | Task 6: 172 | MODIFY src/app/evals/metrics.py: 173 | - IMPLEMENT coordination_quality metric function 174 | - INTEGRATE with existing metrics calculation 175 | - MIRROR pattern from other metric implementations 176 | 177 | Task 7: 178 | CREATE tests/test_coordination_quality.py: 179 | - TEST coordination metrics calculation 180 | - TEST monitoring functionality 181 | - TEST integration with evaluation pipeline 182 | ``` 183 | 184 | ### Per Task Pseudocode 185 | 186 | ```python 187 | # Task 2: quality_metrics.py 188 | class CoordinationMetrics(BaseModel): 189 | 
delegation_success_rate: float = Field(ge=0.0, le=1.0) 190 | communication_latency: float = Field(ge=0.0) 191 | workflow_completion_rate: float = Field(ge=0.0, le=1.0) 192 | error_recovery_rate: float = Field(ge=0.0, le=1.0) 193 | resource_utilization: float = Field(ge=0.0, le=1.0) 194 | coordination_score: float = Field(ge=0.0, le=1.0) 195 | 196 | def calculate_coordination_quality(events: list[CoordinationEvent]) -> CoordinationMetrics: 197 | """Calculate coordination quality from event history.""" 198 | # PATTERN: Weighted average of coordination dimensions 199 | # CRITICAL: Handle empty events list gracefully 200 | if not events: 201 | return CoordinationMetrics(...) 202 | 203 | # Calculate individual metrics 204 | success_rate = sum(e.success for e in events) / len(events) 205 | avg_latency = sum(e.latency_ms for e in events) / len(events) 206 | # ... other calculations 207 | 208 | # Weighted coordination score 209 | coordination_score = ( 210 | success_rate * 0.3 + 211 | normalized_latency * 0.2 + 212 | completion_rate * 0.3 + 213 | recovery_rate * 0.2 214 | ) 215 | 216 | return CoordinationMetrics( 217 | coordination_score=coordination_score, 218 | # ... other metrics 219 | ) 220 | 221 | # Task 3: monitoring.py 222 | class CoordinationMonitor: 223 | def __init__(self): 224 | self.events: list[CoordinationEvent] = [] 225 | self.logger = logger # From utils/log.py 226 | 227 | async def track_delegation(self, source: str, target: str, func: Callable): 228 | """Track delegation with timing and success monitoring.""" 229 | start_time = time.time() 230 | 231 | try: 232 | result = await func() 233 | # PATTERN: Log successful coordination 234 | self.logger.info(f"Delegation {source} -> {target} successful") 235 | 236 | # Record successful event 237 | self._record_event( 238 | source_agent=source, 239 | target_agent=target, 240 | event_type="delegation", 241 | success=True, 242 | latency_ms=(time.time() - start_time) * 1000 243 | ) 244 | 245 | return result 246 | 247 | except Exception as e: 248 | # PATTERN: Log coordination failures 249 | self.logger.error(f"Delegation {source} -> {target} failed: {str(e)}") 250 | 251 | # Record failed event 252 | self._record_event( 253 | source_agent=source, 254 | target_agent=target, 255 | event_type="delegation", 256 | success=False, 257 | latency_ms=(time.time() - start_time) * 1000, 258 | error_message=str(e) 259 | ) 260 | 261 | raise 262 | 263 | # Task 5: agent_system.py integration 264 | # MODIFY delegate_research function 265 | @manager_agent.tool 266 | async def delegate_research(ctx: RunContext[None], query: str) -> ResearchResult: 267 | """Delegate research task to ResearchAgent.""" 268 | # INJECT: Coordination monitoring 269 | monitor = CoordinationMonitor() 270 | 271 | async def _research_task(): 272 | result = await research_agent.run(query, usage=ctx.usage) 273 | return _validate_model_return(str(result.output), ResearchResult) 274 | 275 | # PATTERN: Track delegation with monitoring 276 | return await monitor.track_delegation("manager", "researcher", _research_task) 277 | ``` 278 | 279 | ### Integration Points 280 | 281 | ```yaml 282 | EVALUATION_SYSTEM: 283 | - modify: src/app/evals/metrics.py 284 | - pattern: "def coordination_quality(result: Any) -> float:" 285 | - integration: "Add to evaluation pipeline alongside existing metrics" 286 | 287 | CONFIGURATION: 288 | - modify: src/app/config/config_eval.json 289 | - pattern: "coordination_quality metric already defined with weight 0.167" 290 | - validation: "Ensure metric returns float 
between 0.0 and 1.0" 291 | 292 | LOGGING: 293 | - integrate: src/app/utils/log.py 294 | - pattern: "Use existing logger for coordination events" 295 | - level: "INFO for successful coordination, ERROR for failures" 296 | ``` 297 | 298 | ## Validation Loop 299 | 300 | ### Level 1: Syntax & Style 301 | 302 | ```bash 303 | # Run these FIRST - fix any errors before proceeding 304 | make ruff # Format and fix linting issues 305 | make type_check # Type checking with mypy 306 | 307 | # Expected: No errors. If errors, READ the error and fix. 308 | ``` 309 | 310 | ### Level 2: Unit Tests 311 | 312 | ```python 313 | # CREATE tests/test_coordination_quality.py 314 | def test_coordination_metrics_calculation(): 315 | """Test coordination quality calculation with sample events.""" 316 | events = [ 317 | CoordinationEvent( 318 | timestamp=datetime.now(), 319 | source_agent="manager", 320 | target_agent="researcher", 321 | event_type="delegation", 322 | success=True, 323 | latency_ms=150.0 324 | ), 325 | # ... more test events 326 | ] 327 | 328 | metrics = calculate_coordination_quality(events) 329 | assert 0.0 <= metrics.coordination_score <= 1.0 330 | assert metrics.delegation_success_rate >= 0.0 331 | 332 | def test_coordination_monitoring(): 333 | """Test coordination monitoring functionality.""" 334 | monitor = CoordinationMonitor() 335 | 336 | # Test successful delegation tracking 337 | async def dummy_task(): 338 | return "success" 339 | 340 | result = await monitor.track_delegation("manager", "researcher", dummy_task) 341 | assert result == "success" 342 | assert len(monitor.events) == 1 343 | assert monitor.events[0].success is True 344 | 345 | def test_coordination_quality_metric(): 346 | """Test integration with evaluation metrics.""" 347 | # PATTERN: Test similar to other metrics in the evaluation system 348 | sample_result = {"coordination_events": [...]} 349 | quality_score = coordination_quality(sample_result) 350 | assert isinstance(quality_score, float) 351 | assert 0.0 <= quality_score <= 1.0 352 | ``` 353 | 354 | ```bash 355 | # Run and iterate until passing: 356 | make test_all 357 | # If failing: Read error, understand root cause, fix code, re-run 358 | ``` 359 | 360 | ### Level 3: Integration Test 361 | 362 | ```bash 363 | # Test the coordination quality in full evaluation 364 | make run_cli ARGS="--query 'test coordination quality' --eval" 365 | 366 | # Expected: Coordination quality metric appears in evaluation results 367 | # If error: Check logs for coordination monitoring issues 368 | ``` 369 | 370 | ## Final Validation Checklist 371 | 372 | - [ ] All tests pass: `make test_all` 373 | - [ ] No linting errors: `make ruff` 374 | - [ ] No type errors: `make type_check` 375 | - [ ] Coordination quality metric integrated in evaluation pipeline 376 | - [ ] Coordination monitoring tracks delegation events 377 | - [ ] Error cases handled gracefully with proper logging 378 | - [ ] Performance impact minimal (< 5% overhead) 379 | - [ ] Documentation updated in AGENTS.md if needed 380 | 381 | ## Anti-Patterns to Avoid 382 | 383 | - ❌ Don't break existing delegation patterns in agent_system.py 384 | - ❌ Don't ignore coordination failures - log and track them 385 | - ❌ Don't add excessive monitoring overhead that slows coordination 386 | - ❌ Don't hardcode coordination thresholds - make them configurable 387 | - ❌ Don't skip validation of coordination metrics calculation 388 | - ❌ Don't assume all coordination events are successful - handle failures gracefully 389 | 
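As a closing sketch for the integration points above, one possible shape of the `coordination_quality` entry point in `src/app/evals/metrics.py`. How the run result carries its event history is an assumption (a `coordination_events` attribute or key), not something the existing codebase defines, and the import path follows the desired tree above (the task list in this PRP uses `src/app/coordination/` instead).

```python
from typing import Any

from app.evals.coordination_quality.quality_metrics import (  # assumed module path
    calculate_coordination_quality,
)


def coordination_quality(result: Any) -> float:
    """Return a 0.0-1.0 coordination score for the evaluation pipeline."""
    # Assumption: the agent run attaches its CoordinationEvent history to the result.
    events = getattr(result, "coordination_events", None)
    if events is None and isinstance(result, dict):
        events = result.get("coordination_events")
    if not events:
        return 0.0  # neutral score when no coordination was observed
    metrics = calculate_coordination_quality(events)
    return max(0.0, min(1.0, metrics.coordination_score))
```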
-------------------------------------------------------------------------------- /context/PRPs/features/coordination_quality.md: -------------------------------------------------------------------------------- 1 | # Feature description for: coordination_quality 2 | 3 | As put forward by [context-engineering-intro](https://github.com/qte77/context-engineering-intro). 4 | 5 | ## FEATURE 6 | 7 | coordination_quality 8 | 9 | ## EXAMPLES 10 | 11 | [Provide and explain examples that you have in the `PRPs/examples/` folder] 12 | 13 | ## DOCUMENTATION 14 | 15 | [List out any documentation (web pages, sources for an MCP server like Crawl4AI RAG, etc.) that will need to be referenced during development] 16 | 17 | ## OTHER CONSIDERATIONS 18 | 19 | [Any other considerations or specific requirements - great place to include gotchas that you see AI coding assistants miss with your projects a lot] 20 | -------------------------------------------------------------------------------- /context/PRPs/features/tool_efficiency.md: -------------------------------------------------------------------------------- 1 | # Feature description for: tool_efficiency 2 | 3 | As put forward by [context-engineering-intro](https://github.com/qte77/context-engineering-intro). 4 | 5 | ## FEATURE 6 | 7 | tool_efficiency 8 | 9 | ## EXAMPLES 10 | 11 | [Provide and explain examples that you have in the `PRPs/examples/` folder] 12 | 13 | ## DOCUMENTATION 14 | 15 | [List out any documentation (web pages, sources for an MCP server like Crawl4AI RAG, etc.) that will need to be referenced during development] 16 | 17 | ## OTHER CONSIDERATIONS 18 | 19 | [Any other considerations or specific requirements - great place to include gotchas that you see AI coding assistants miss with your projects a lot] 20 | -------------------------------------------------------------------------------- /context/PRPs/templates/feature_base.md: -------------------------------------------------------------------------------- 1 | # Feature description for: [ Initial template for new features ] 2 | 3 | As put forward by [context-engineering-intro](https://github.com/qte77/context-engineering-intro). 4 | 5 | ## FEATURE 6 | 7 | [Insert your feature here] 8 | 9 | ## EXAMPLES 10 | 11 | [Provide and explain examples that you have in the `PRPs/examples/` folder] 12 | 13 | ## DOCUMENTATION 14 | 15 | [List out any documentation (web pages, sources for an MCP server like Crawl4AI RAG, etc.) that will need to be referenced during development] 16 | 17 | ## OTHER CONSIDERATIONS 18 | 19 | [Any other considerations or specific requirements - great place to include gotchas that you see AI coding assistants miss with your projects a lot] 20 | -------------------------------------------------------------------------------- /context/PRPs/templates/prp_base.md: -------------------------------------------------------------------------------- 1 | # "Base PRP Template v2 - Context-Rich with Validation Loops" 2 | 3 | ## Purpose 4 | 5 | Template optimized for AI agents to implement features with sufficient context and self-validation capabilities to achieve working code through iterative refinement. 6 | 7 | ## Core Principles 8 | 9 | 1. **Context is King**: Include ALL necessary documentation, examples, and caveats 10 | 2. **Validation Loops**: Provide executable tests/lints the AI can run and fix 11 | 3. **Information Dense**: Use keywords and patterns from the codebase 12 | 4. **Progressive Success**: Start simple, validate, then enhance 13 | 5. 
**Global rules**: Be sure to follow all rules in CLAUDE.md 14 | 15 | --- 16 | 17 | ## Goal 18 | 19 | [What needs to be built - be specific about the end state and desires] 20 | 21 | ## Why 22 | 23 | - [Business value and user impact] 24 | - [Integration with existing features] 25 | - [Problems this solves and for whom] 26 | 27 | ## What 28 | 29 | [User-visible behavior and technical requirements] 30 | 31 | ### Success Criteria 32 | 33 | - [ ] [Specific measurable outcomes] 34 | 35 | ## All Needed Context 36 | 37 | ### Documentation & References (list all context needed to implement the feature) 38 | 39 | ```yaml 40 | # MUST READ - Include these in your context window 41 | - url: [Official API docs URL] 42 | why: [Specific sections/methods you'll need] 43 | 44 | - file: [path/to/example.py] 45 | why: [Pattern to follow, gotchas to avoid] 46 | 47 | - doc: [Library documentation URL] 48 | section: [Specific section about common pitfalls] 49 | critical: [Key insight that prevents common errors] 50 | 51 | - docfile: [PRPs/ai_docs/file.md] 52 | why: [docs that the user has pasted in to the project] 53 | ``` 54 | 55 | ### Current Codebase tree (run `tree` in the root of the project) to get an overview of the codebase 56 | 57 | ```bash 58 | 59 | ``` 60 | 61 | ### Desired Codebase tree with files to be added and responsibility of file 62 | 63 | ```bash 64 | 65 | ``` 66 | 67 | ### Known Gotchas of our codebase & Library Quirks 68 | 69 | ```python 70 | # CRITICAL: [Library name] requires [specific setup] 71 | # Example: FastAPI requires async functions for endpoints 72 | # Example: This ORM doesn't support batch inserts over 1000 records 73 | # Example: We use pydantic v2 and 74 | ``` 75 | 76 | ## Implementation Blueprint 77 | 78 | ### Data models and structure 79 | 80 | Create the core data models, we ensure type safety and consistency. 81 | 82 | ```python 83 | Examples: 84 | - orm models 85 | - pydantic models 86 | - pydantic schemas 87 | - pydantic validators 88 | 89 | ``` 90 | 91 | ### list of tasks to be completed to fullfill the PRP in the order they should be completed 92 | 93 | ```yaml 94 | Task 1: 95 | MODIFY src/existing_module.py: 96 | - FIND pattern: "class OldImplementation" 97 | - INJECT after line containing "def __init__" 98 | - PRESERVE existing method signatures 99 | 100 | CREATE src/new_feature.py: 101 | - MIRROR pattern from: src/similar_feature.py 102 | - MODIFY class name and core logic 103 | - KEEP error handling pattern identical 104 | 105 | ...(...) 106 | 107 | Task N: 108 | ... 
109 | 110 | ``` 111 | 112 | ### Per task pseudocode as needed added to each task 113 | 114 | ```python 115 | 116 | # Task 1 117 | # Pseudocode with CRITICAL details dont write entire code 118 | async def new_feature(param: str) -> Result: 119 | # PATTERN: Always validate input first (see src/validators.py) 120 | validated = validate_input(param) # raises ValidationError 121 | 122 | # GOTCHA: This library requires connection pooling 123 | async with get_connection() as conn: # see src/db/pool.py 124 | # PATTERN: Use existing retry decorator 125 | @retry(attempts=3, backoff=exponential) 126 | async def _inner(): 127 | # CRITICAL: API returns 429 if >10 req/sec 128 | await rate_limiter.acquire() 129 | return await external_api.call(validated) 130 | 131 | result = await _inner() 132 | 133 | # PATTERN: Standardized response format 134 | return format_response(result) # see src/utils/responses.py 135 | ``` 136 | 137 | ### Integration Points 138 | 139 | ```yaml 140 | DATABASE: 141 | - migration: "Add column 'feature_enabled' to users table" 142 | - index: "CREATE INDEX idx_feature_lookup ON users(feature_id)" 143 | 144 | CONFIG: 145 | - add to: config/settings.py 146 | - pattern: "FEATURE_TIMEOUT = int(os.getenv('FEATURE_TIMEOUT', '30'))" 147 | 148 | ROUTES: 149 | - add to: src/api/routes.py 150 | - pattern: "router.include_router(feature_router, prefix='/feature')" 151 | ``` 152 | 153 | ## Validation Loop 154 | 155 | ### Level 1: Syntax & Style 156 | 157 | ```bash 158 | # Run these FIRST - fix any errors before proceeding 159 | ruff check src/new_feature.py --fix # Auto-fix what's possible 160 | mypy src/new_feature.py # Type checking 161 | 162 | # Expected: No errors. If errors, READ the error and fix. 163 | ``` 164 | 165 | ### Level 2: Unit Tests each new feature/file/function use existing test patterns 166 | 167 | ```python 168 | # CREATE test_new_feature.py with these test cases: 169 | def test_happy_path(): 170 | """Basic functionality works""" 171 | result = new_feature("valid_input") 172 | assert result.status == "success" 173 | 174 | def test_validation_error(): 175 | """Invalid input raises ValidationError""" 176 | with pytest.raises(ValidationError): 177 | new_feature("") 178 | 179 | def test_external_api_timeout(): 180 | """Handles timeouts gracefully""" 181 | with mock.patch('external_api.call', side_effect=TimeoutError): 182 | result = new_feature("valid") 183 | assert result.status == "error" 184 | assert "timeout" in result.message 185 | ``` 186 | 187 | ```bash 188 | # Run and iterate until passing: 189 | uv run pytest test_new_feature.py -v 190 | # If failing: Read error, understand root cause, fix code, re-run (never mock to pass) 191 | ``` 192 | 193 | ### Level 3: Integration Test 194 | 195 | ```bash 196 | # Start the service 197 | uv run python -m src.main --dev 198 | 199 | # Test the endpoint 200 | curl -X POST http://localhost:8000/feature \ 201 | -H "Content-Type: application/json" \ 202 | -d '{"param": "test_value"}' 203 | 204 | # Expected: {"status": "success", "data": {...}} 205 | # If error: Check logs at logs/app.log for stack trace 206 | ``` 207 | 208 | ## Final validation Checklist 209 | 210 | - [ ] All tests pass: `uv run pytest tests/ -v` 211 | - [ ] No linting errors: `uv run ruff check src/` 212 | - [ ] No type errors: `uv run mypy src/` 213 | - [ ] Manual test successful: [specific curl/command] 214 | - [ ] Error cases handled gracefully 215 | - [ ] Logs are informative but not verbose 216 | - [ ] Documentation updated if needed 217 | 218 | --- 219 | 220 | ## 
Anti-Patterns to Avoid 221 | 222 | - ❌ Don't create new patterns when existing ones work 223 | - ❌ Don't skip validation because "it should work" 224 | - ❌ Don't ignore failing tests - fix them 225 | - ❌ Don't use sync functions in async context 226 | - ❌ Don't hardcode values that should be config 227 | - ❌ Don't catch all exceptions - be specific 228 | -------------------------------------------------------------------------------- /docs/PRD.md: -------------------------------------------------------------------------------- 1 | # Product Requirements Document (PRD) for Agents-eval 2 | 3 | ## Overview 4 | 5 | **Agents-eval** is a project aimed at evaluating the effectiveness of open-source agentic AI systems across various use cases. The focus is on use case agnostic metrics that measure core capabilities such as task decomposition, tool integration, adaptability, and overall performance. 6 | 7 | ## Goals 8 | 9 | - **Evaluate Agentic AI Systems:** Provide a comprehensive evaluation pipeline to assess the performance of agentic AI systems. 10 | - **Metric Development:** Develop and implement metrics that are agnostic to specific use cases but measure core agentic capabilities. 11 | - **Continuous Improvement:** Promote continuous improvement through automated testing, version control, and documentation. 12 | 13 | ## Functional Requirements 14 | 15 | ### CLI 16 | 17 | - **Command Line Interface:** 18 | - Commands to start, stop, and check the status of the Ollama server or remote inference endpoints. 19 | - Commands to download or call models and run tests. 20 | 21 | ### Frontend (Streamlit) 22 | 23 | - **User Interface:** 24 | - Display test results and system performance metrics. 25 | 26 | ### (Optional) Backend (FastAPI) 27 | 28 | - **Agentic System Integration:** 29 | - Support for adding tools to agents using Pydantic-AI. 30 | - Ensure agents can use tools effectively and return expected results. 31 | - **Model Management:** 32 | - Ability to download, list, and manage models using the `ollama` Python package. 33 | - **API Endpoints:** 34 | - Endpoint to start and check the status of the Ollama server. 35 | - Endpoint to download and manage models. 36 | - Endpoint to run tests and return results. 37 | 38 | ## Non-Functional Requirements 39 | 40 | - **Maintainability:** 41 | - Use modular design patterns for easy updates and maintenance. 42 | - Implement logging and error handling for debugging and monitoring. 43 | - **Documentation:** 44 | - Comprehensive documentation for setup, usage, and testing. 45 | - **Scalability:** 46 | - Design the system to handle multiple concurrent requests. 47 | - **Performance:** 48 | - Ensure low latency in server responses and model downloads. 49 | - Optimize for memory usage and CPU/GPU utilization. 50 | - **Security:** 51 | - Implement secure communication between components. 52 | - Use environment variables for sensitive information. 53 | 54 | ## Assumptions 55 | 56 | - **Remote Inference Endpoints:** The project can use remote inference endpoints provided within a `config.json` and using API keys from `.env`. 57 | - **Local Ollama Server:** The project can make use of a local Ollama server for model hosting and inference. 58 | - **Python Environment:** The project uses Python 3.12 and related tools like `uv` for dependency management. 59 | - **GitHub Actions:** CI/CD pipelines are set up using GitHub Actions for automated testing, version bumping, and documentation deployment. 
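To illustrate the endpoint assumption above, a minimal sketch of reading a provider entry from a JSON config and its API key from the environment. The file name, key names, and provider id are illustrative; the application itself uses `config_chat.json` and the loaders under `src/app/utils/`.

```python
import json
import os
from pathlib import Path


def load_provider(config_path: Path, provider: str) -> dict:
    """Return provider settings merged with its API key from the environment."""
    config = json.loads(config_path.read_text())
    settings = config["providers"][provider]  # e.g. base_url and model_name
    api_key = os.environ.get(f"{provider.upper()}_API_KEY", "")  # e.g. GEMINI_API_KEY from .env
    return {**settings, "api_key": api_key}


# Hypothetical usage
# provider_cfg = load_provider(Path("config.json"), "gemini")
```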
60 | 61 | ## Constraints 62 | 63 | - **Hardware:** The project assumes access to appropriate hardware if running the Ollama server and models, including sufficient RAM and GPU capabilities. 64 | - **Software:** Requires Python 3.12, `uv`, and other dependencies listed in `pyproject.toml`. 65 | 66 | ## Main Dependencies 67 | 68 | - **Pydantic-AI:** For agent and tool management. 69 | - **Pytest:** For testing. 70 | - **Ollama:** For local model hosting and inference. 71 | - **Streamlit:** For frontend dashboard. 72 | - **Ruff:** For code linting. 73 | - **MkDocs:** For documentation generation. 74 | 75 | ## Future Enhancements 76 | 77 | - **Additional Metrics:** Develop more metrics to evaluate agentic systems. 78 | - **Integration with More Frameworks:** Expand compatibility with other popular agentic system frameworks such as LangChain, AutoGen, CrewAI, LangGraph, Semantic Kernel, and smolAgents. 79 | - **Performance Optimization:** Further optimize for latency and resource usage. 80 | - **User Feedback:** Implement a feedback loop for users to report issues or suggest improvements. 81 | -------------------------------------------------------------------------------- /docs/SprintPlan.md: -------------------------------------------------------------------------------- 1 | # Project Plan Outline 2 | 3 | ## Week 1 starting 2025-03-31: Metric Development and CLI Enhancements 4 | 5 | ### Milestones 6 | 7 | - Metric Development: Implement at least three new metrics for evaluating agentic AI systems. 8 | - CLI Streaming: Enhance the CLI to stream Pydantic-AI output. 9 | 10 | ### Tasks and Sequence 11 | 12 | - [ ] Research and Design New Metrics 13 | - Task Definition: Conduct literature review and design three new metrics that are agnostic to specific use cases but measure core agentic capabilities. 14 | - Sequence: Before implementing any code changes. 15 | - Definition of Done: A detailed document outlining the metrics, their mathematical formulations, and how they will be integrated into the evaluation pipeline. 16 | - [ ] Implement New Metrics 17 | - Task Definition: Write Python code to implement the new metrics, ensuring they are modular and easily integratable with existing evaluation logic. 18 | - Sequence: After completing the design document. 19 | - Definition of Done: Unit tests for each metric pass, and they are successfully integrated into the evaluation pipeline. 20 | - [ ] Enhance CLI for Streaming 21 | - Task Definition: Modify the CLI to stream Pydantic-AI output using asynchronous functions. 22 | - Sequence: Concurrently with metric implementation. 23 | - Definition of Done: The CLI can stream output from Pydantic-AI models without blocking, and tests demonstrate successful streaming. 24 | - [ ] Update Documentation 25 | - Task Definition: Update PRD.md and README.md to reflect new metrics and CLI enhancements. 26 | - Sequence: After completing metric implementation and CLI enhancements. 27 | - Definition of Done: PRD.md includes detailed descriptions of new metrics, and README.md provides instructions on how to use the enhanced CLI. 28 | 29 | ## Week 2 starting 2025-04-07: Streamlit GUI Enhancements and Testing 30 | 31 | ### Milestones 32 | 33 | - Streamlit GUI Output: Enhance the Streamlit GUI to display streamed output from Pydantic-AI. 34 | - Comprehensive Testing: Perform thorough testing of the entire system with new metrics and GUI enhancements.
35 | 36 | ### Tasks and Sequence 37 | 38 | - [ ] Enhance Streamlit GUI 39 | - Task Definition: Modify the Streamlit GUI to display the streamed output from Pydantic-AI models. 40 | - Sequence: Start of Week 2. 41 | - Definition of Done: The GUI can display streamed output without errors, and user interactions (e.g., selecting models, inputting queries) work as expected. 42 | - [ ] Integrate New Metrics into GUI 43 | - Task Definition: Ensure the Streamlit GUI can display results from the new metrics. 44 | - Sequence: After enhancing the GUI for streamed output. 45 | - Definition of Done: The GUI displays metric results clearly, and users can easily interpret the output. 46 | - [ ] Comprehensive System Testing 47 | - Task Definition: Perform end-to-end testing of the system, including new metrics and GUI enhancements. 48 | - Sequence: After integrating new metrics into the GUI. 49 | - Definition of Done: All tests pass without errors, and the system functions as expected in various scenarios. 50 | - [ ] Finalize Documentation and Deployment 51 | - Task Definition: Update MkDocs documentation to reflect all changes and deploy it to GitHub Pages. 52 | - Sequence: After completing system testing. 53 | - Definition of Done: Documentation is updated, and the latest version is live on GitHub Pages. 54 | 55 | ## Additional Considerations 56 | 57 | - Code Reviews: Schedule regular code reviews to ensure quality and adherence to project standards. 58 | - Feedback Loop: Establish a feedback loop with stakeholders to gather input on the new metrics and GUI enhancements. 59 | -------------------------------------------------------------------------------- /docs/UserStory.md: -------------------------------------------------------------------------------- 1 | # User Story for Agents-eval 2 | 3 | ## Introduction 4 | 5 | Agents-eval is designed to evaluate the effectiveness of open-source agentic AI systems across various use cases. This user story focuses on the perspective of Gez, an AI researcher who aims to assess and improve these systems using Agents-eval. 6 | 7 | ## User Profile 8 | 9 | - **Name:** Gez 10 | - **Role:** AI Researcher 11 | - **Goals:** 12 | - Evaluate the performance of agentic AI systems. 13 | - Identify areas for improvement in these systems. 14 | - Develop and integrate new metrics for evaluation. 15 | 16 | ## User Story 17 | 18 | **As** an AI researcher, 19 | **I want** to use Agents-eval to evaluate the effectiveness of agentic AI systems, 20 | **so that** I can assess their performance across different use cases and improve their capabilities. 21 | 22 | ### Acceptance Criteria 23 | 24 | 1. **Evaluation Pipeline:** 25 | - The system should provide a comprehensive evaluation pipeline that measures core agentic capabilities such as task decomposition, tool integration, adaptability, and overall performance. 26 | - The pipeline should support multiple agentic AI frameworks (e.g., Pydantic-AI, LangChain). 27 | 28 | 2. **Metric Development:** 29 | - The system should allow for the development and integration of new metrics that are agnostic to specific use cases. 30 | - These metrics should be modular and easily integratable with existing evaluation logic. 31 | 32 | 3. **CLI and GUI Interactions:** 33 | - The system should offer both a CLI and a Streamlit GUI for user interaction. 34 | - The CLI should support streaming output from Pydantic-AI models. 35 | - The Streamlit GUI should display streamed output and provide an intuitive interface for setting up and running evaluations. 
36 | 37 | 4. **Documentation and Feedback:** 38 | - The system should include comprehensive documentation for setup, usage, and testing. 39 | - There should be a feedback loop for users to report issues or suggest improvements. 40 | 41 | ## Example Scenario 42 | 43 | - **Scenario:** Gez wants to evaluate a research agent system using Agents-eval. 44 | - **Steps:** 45 | 1. She sets up the environment using the CLI or devcontainer. 46 | 2. She configures the agent system with the desired models and tools. 47 | 3. She runs the evaluation using the CLI or Streamlit GUI. 48 | 4. She views the results and metrics displayed by the system. 49 | 5. She provides feedback on the system's performance and suggests improvements. 50 | 51 | ## Benefits 52 | 53 | - **Improved Evaluation Capabilities:** Agents-eval provides a structured approach to evaluating agentic AI systems, allowing researchers to focus on improving these systems. 54 | - **Flexibility and Customization:** The system supports multiple frameworks and allows for the development of new metrics, making it adaptable to various research needs. 55 | - **Enhanced User Experience:** The combination of CLI and GUI interfaces offers flexibility in how users interact with the system, catering to different preferences and workflows. 56 | -------------------------------------------------------------------------------- /docs/architecture/c4-multi-agent-system.plantuml: -------------------------------------------------------------------------------- 1 | @startuml "Multi-Agent Research System - C4 System Context" 2 | !theme plain 3 | 4 | !include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Context.puml 5 | !include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Container.puml 6 | !include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Component.puml 7 | 8 | LAYOUT_WITH_LEGEND() 9 | 10 | title "Multi-Agent Research System" 11 | Person(user, "User", "Submits research queries") 12 | 13 | System_Boundary(research_system, "Supporting System") { 14 | Container(main_module, "Main Module", "Python", "Entry point that configures and runs the agent system") 15 | Container(utils, "Utilities", "Python", "Helper functions and data models") 16 | Container(config, "Configuration", "JSON", "Provider and model settings") 17 | } 18 | 19 | System_Boundary(agent_system, "Agent System") { 20 | Container(manager_agent, "Manager Agent", "pydantic-ai", "Coordinates research and analysis tasks") 21 | Container(research_agent, "Research Agent", "pydantic-ai", "Gathers information on topics") 22 | Container(analysis_agent, "Analysis Agent", "pydantic-ai", "Analyzes research") 23 | Container(synthesiser_agent, "Synthesiser Agent", "pydantic-ai", "Produces scientific reports") 24 | } 25 | 26 | System_Ext(llm_provider, "LLM Provider", "External inference service for AI models") 27 | System_Ext(search_api, "DuckDuckGo Search", "External search API") 28 | 29 | Rel(user, main_module, "Submits query", "CLI Input or GUI") 30 | Rel(main_module, config, "Loads", "Reads JSON config") 31 | Rel(main_module, agent_system, "Initializes and runs") 32 | 33 | Rel(manager_agent, research_agent, "Delegates research tasks to", "Optional Tool call") 34 | Rel(manager_agent, analysis_agent, "Delegates analysis tasks to", "Optional Tool call") 35 | Rel(manager_agent, synthesiser_agent, "Delegates synthesis tasks to", "Optional Tool call") 36 | 37 | Rel(research_agent, search_api, "Searches for information", "API call") 38 | 39 
| Rel(manager_agent, llm_provider, "Generates responses", "API call") 40 | Rel(research_agent, llm_provider, "Generates responses", "API call") 41 | Rel(analysis_agent, llm_provider, "Generates responses", "API call") 42 | Rel(synthesiser_agent, llm_provider, "Generates responses", "API call") 43 | 44 | Rel(agent_system, utils, "Uses", "Import") 45 | Rel(main_module, utils, "Uses", "Import") 46 | 47 | @enduml 48 | -------------------------------------------------------------------------------- /docs/architecture/customer-journey-activity-dark: -------------------------------------------------------------------------------- 1 | @startuml 2 | !theme amiga 3 | skinparam monochrome true 4 | 5 | title Customer Journey Activity Diagram for CLI and Streamlit 6 | 7 | start 8 | :Discover Agents-eval; 9 | if (Choose CLI?) then (yes) 10 | :Run CLI with `make run_cli`; 11 | :Interact with CLI for agent setup and execution; 12 | :View results and metrics in CLI output; 13 | else (no) 14 | :Run Streamlit GUI with `make run_gui`; 15 | :Interact with Streamlit for agent setup and execution; 16 | :View results and metrics in Streamlit dashboard; 17 | endif 18 | :Continue using and provide feedback; 19 | :Improve based on feedback; 20 | 21 | stop 22 | @enduml -------------------------------------------------------------------------------- /docs/architecture/customer-journey-activity-light.plantuml: -------------------------------------------------------------------------------- 1 | @startuml 2 | !theme plain 3 | 4 | title Customer Journey Activity Diagram for CLI and Streamlit 5 | 6 | start 7 | :Discover Agents-eval; 8 | if (Choose CLI?) then (yes) 9 | :Run CLI with `make run_cli`; 10 | :Interact with CLI for agent setup and execution; 11 | :View results and metrics in CLI output; 12 | else (no) 13 | :Run Streamlit GUI with `make run_gui`; 14 | :Interact with Streamlit for agent setup and execution; 15 | :View results and metrics in Streamlit dashboard; 16 | endif 17 | :Continue using and provide feedback; 18 | :Improve based on feedback; 19 | 20 | stop 21 | @enduml -------------------------------------------------------------------------------- /docs/architecture/metrics-eval-sweep.plantuml: -------------------------------------------------------------------------------- 1 | @startuml 2 | !theme plain 3 | skinparam ConditionEndStyle diamond 4 | skinparam ParticipantPadding 20 5 | skinparam BoxPadding 20 6 | 7 | participant "Sweep Engine" as SE 8 | participant "Agentic System" as AS 9 | participant "Evaluation Engine" as EE 10 | 11 | SE -> EE: Set baseline parameters 12 | 13 | group Sweep over parameter variations [Independent runs] 14 | 15 | group Vary number of runs [ numbers of runs ] 16 | loop for each run_number 17 | SE -> AS: Start runs 18 | AS -> EE: Execute runs 19 | EE--> SE: Send results 20 | end 21 | end 22 | 23 | group Sweep metrics weights [ metrics weights ] 24 | loop for each weight_config 25 | SE -> AS: Set weights and start runs 26 | AS -> EE: Execute runs 27 | EE--> SE: Send results 28 | end 29 | end 30 | 31 | end 32 | @enduml 33 | -------------------------------------------------------------------------------- /mkdocs.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # https://github.com/james-willett/mkdocs-material-youtube-tutorial 3 | # https://mkdocstrings.github.io/recipes/ 4 | # site info set in workflow 5 | site_name: '' 6 | site_description: '' 7 | repo_url: '' 8 | edit_uri: edit/main 9 | theme: 10 | name: material 11 | language: en 
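The metrics-eval-sweep diagram above describes a sweep engine that fixes baseline parameters and then independently varies the number of runs and the metric weights. A small, hypothetical driver for such a sweep might look like the following; `run_evaluation` and all parameter values are assumptions for illustration, not part of the repository.

```python
# Hypothetical sweep driver mirroring the metrics-eval-sweep sequence diagram.
from collections.abc import Callable
from typing import Any


def sweep(
    run_evaluation: Callable[[int, dict[str, float]], dict[str, Any]],
    run_numbers: list[int],
    weight_configs: list[dict[str, float]],
    baseline_weights: dict[str, float],
    baseline_runs: int = 3,
) -> list[dict[str, Any]]:
    """Collect results for each parameter variation as independent runs."""
    results: list[dict[str, Any]] = []
    # Group 1: vary the number of runs while keeping the baseline weights fixed.
    for n in run_numbers:
        results.append(
            {"runs": n, "weights": baseline_weights, "result": run_evaluation(n, baseline_weights)}
        )
    # Group 2: vary the metric weights while keeping the baseline run count fixed.
    for weights in weight_configs:
        results.append(
            {"runs": baseline_runs, "weights": weights, "result": run_evaluation(baseline_runs, weights)}
        )
    return results
```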
12 | features: 13 | - content.code.annotate 14 | - content.code.copy 15 | - content.tabs.link 16 | - navigation.footer 17 | - navigation.sections 18 | - navigation.tabs 19 | - navigation.top 20 | - toc.integrate 21 | - search.suggest 22 | - search.highlight 23 | palette: 24 | - media: "(prefers-color-scheme: light)" 25 | scheme: default 26 | toggle: 27 | # icon: material/brightness-7 28 | icon: material/toggle-switch-off-outline 29 | name: "Toggle Dark Mode" 30 | - media: "(prefers-color-scheme: dark)" 31 | scheme: slate 32 | toggle: 33 | # icon: material/brightness-4 34 | icon: material/toggle-switch 35 | name: "Toggle Light Mode" 36 | nav: 37 | - Home: index.md 38 | - PRD: PRD.md 39 | - User Story: UserStory.md 40 | - Sprint Plan: SprintPlan.md 41 | - Code: docstrings.md 42 | - Change Log: CHANGELOG.md 43 | - License: LICENSE.md 44 | - llms.txt: llms.txt 45 | plugins: 46 | - search: 47 | lang: en 48 | - autorefs 49 | - mkdocstrings: 50 | handlers: 51 | python: 52 | paths: [src] 53 | options: 54 | show_root_heading: true 55 | show_root_full_path: true 56 | show_object_full_path: false 57 | show_root_members_full_path: false 58 | show_category_heading: true 59 | show_submodules: true 60 | markdown_extensions: 61 | - attr_list 62 | - pymdownx.magiclink 63 | - pymdownx.tabbed 64 | - pymdownx.highlight: 65 | anchor_linenums: true 66 | - pymdownx.superfences 67 | - pymdownx.snippets: 68 | check_paths: true 69 | - pymdownx.tasklist: 70 | custom_checkbox: true 71 | - sane_lists 72 | - smarty 73 | - toc: 74 | permalink: true 75 | validation: 76 | links: 77 | not_found: warn 78 | anchors: warn 79 | # builds only if validation succeeds while 80 | # treating warnings as errors 81 | # also checks for broken links 82 | # strict: true 83 | ... 84 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | version = "1.1.0" 7 | name = "Agents-eval" 8 | description = "Assess the effectiveness of agentic AI systems across various use cases focusing on agnostic metrics that measure core agentic capabilities."
9 | authors = [ 10 | {name = "qte77", email = "qte@77.gh"} 11 | ] 12 | readme = "README.md" 13 | requires-python = "==3.13.*" 14 | license = "bsd-3-clause" 15 | dependencies = [ 16 | "agentops>=0.4.14", 17 | "logfire>=3.16.1", 18 | "loguru>=0.7.3", 19 | "pydantic>=2.10.6", 20 | # "pydantic-ai>=0.0.36", 21 | "pydantic-ai-slim[duckduckgo,openai,tavily]>=0.2.12", 22 | "pydantic-settings>=2.9.1", 23 | "scalene>=1.5.51", 24 | "weave>=0.51.49", 25 | ] 26 | 27 | # [project.urls] 28 | # Documentation = "" 29 | 30 | [dependency-groups] 31 | dev = [ 32 | # "commitizen>=4.4.1", 33 | "mypy>=1.16.0", 34 | "ruff>=0.11.12", 35 | ] 36 | gui = [ 37 | "streamlit>=1.43.1", 38 | ] 39 | test = [ 40 | "pytest>=8.3.4", 41 | "pytest-cov>=6.0.0", 42 | "pytest-asyncio>=0.25.3", 43 | "pytest-bdd>=8.1.0", 44 | "requests>=2.32.3", 45 | "ruff>=0.9.2", 46 | ] 47 | docs = [ 48 | "griffe>=1.5.1", 49 | "mkdocs>=1.6.1", 50 | "mkdocs-awesome-pages-plugin>=2.9.3", 51 | "mkdocs-gen-files>=0.5.0", 52 | "mkdocs-literate-nav>=0.6.1", 53 | "mkdocs-material>=9.5.44", 54 | "mkdocs-section-index>=0.3.8", 55 | "mkdocstrings[python]>=0.27.0", 56 | ] 57 | 58 | [tool.uv] 59 | package = true 60 | exclude-newer = "2025-05-31T00:00:00Z" 61 | 62 | [tool.hatch.build.targets.wheel] 63 | only-include = ["/README.md"] 64 | 65 | [tool.hatch.build.targets.sdist] 66 | include = ["/README.md", "/Makefile", "/tests"] 67 | 68 | [tool.logfire] 69 | ignore_no_config=true 70 | send_to_logfire="if-token-present" 71 | 72 | [[tool.mypy.overrides]] 73 | module = "agentops" 74 | ignore_missing_imports = true 75 | 76 | [tool.ruff] 77 | target-version = "py313" 78 | src = ["src", "tests"] 79 | 80 | [tool.ruff.format] 81 | docstring-code-format = true 82 | 83 | [tool.ruff.lint] 84 | # ignore = ["E203"] # Whitespace before ':' 85 | unfixable = ["B"] 86 | select = [ 87 | # pycodestyle 88 | "E", 89 | # Pyflakes 90 | "F", 91 | # pyupgrade 92 | "UP", 93 | # isort 94 | "I", 95 | ] 96 | 97 | [tool.ruff.lint.isort] 98 | known-first-party = ["src", "tests"] 99 | 100 | [tool.ruff.lint.pydocstyle] 101 | convention = "google" 102 | 103 | [tool.pytest.ini_options] 104 | addopts = "--strict-markers" 105 | # "function", "class", "module", "package", "session" 106 | asyncio_default_fixture_loop_scope = "function" 107 | pythonpath = ["src"] 108 | testpaths = ["tests/"] 109 | 110 | [tool.coverage] 111 | [tool.coverage.run] 112 | include = [ 113 | "tests/**/*.py", 114 | ] 115 | # omit = [] 116 | # branch = true 117 | 118 | [tool.coverage.report] 119 | show_missing = true 120 | exclude_lines = [ 121 | # 'pragma: no cover', 122 | 'raise AssertionError', 123 | 'raise NotImplementedError', 124 | ] 125 | omit = [ 126 | 'env/*', 127 | 'venv/*', 128 | '.venv/*', 129 | '*/virtualenv/*', 130 | '*/virtualenvs/*', 131 | '*/tests/*', 132 | ] 133 | 134 | [tool.bumpversion] 135 | current_version = "1.1.0" 136 | parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)" 137 | serialize = ["{major}.{minor}.{patch}"] 138 | commit = true 139 | tag = true 140 | allow_dirty = false 141 | ignore_missing_version = false 142 | sign_tags = false 143 | tag_name = "v{new_version}" 144 | tag_message = "Bump version: {current_version} → {new_version}" 145 | message = "Bump version: {current_version} → {new_version}" 146 | commit_args = "" 147 | 148 | [[tool.bumpversion.files]] 149 | filename = "pyproject.toml" 150 | search = 'version = "{current_version}"' 151 | replace = 'version = "{new_version}"' 152 | 153 | [[tool.bumpversion.files]] 154 | filename = "src/app/__init__.py" 155 | search = '__version__ = "{current_version}"' 
156 | replace = '__version__ = "{new_version}"' 157 | 158 | [[tool.bumpversion.files]] 159 | filename = "README.md" 160 | search = "version-{current_version}-58f4c2" 161 | replace = "version-{new_version}-58f4c2" 162 | 163 | [[tool.bumpversion.files]] 164 | filename = "CHANGELOG.md" 165 | search = """ 166 | ## [Unreleased] 167 | """ 168 | replace = """ 169 | ## [Unreleased] 170 | 171 | ## [{new_version}] - {now:%Y-%m-%d} 172 | """ 173 | -------------------------------------------------------------------------------- /src/app/__init__.py: -------------------------------------------------------------------------------- 1 | """Defines the application version.""" 2 | 3 | __version__ = "1.1.0" 4 | -------------------------------------------------------------------------------- /src/app/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/src/app/agents/__init__.py -------------------------------------------------------------------------------- /src/app/agents/agent_system.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agent system utilities for orchestrating multi-agent workflows. 3 | 4 | This module provides functions and helpers to create, configure, and run agent 5 | systems using Pydantic AI. It supports delegation of tasks to research, analysis, and 6 | synthesis agents, and manages agent configuration, environment setup, and execution. 7 | Args: 8 | provider (str): The name of the provider. provider_config (ProviderConfig): 9 | Configuration settings for the provider. 10 | api_key (str): API key for authentication with the provider. 11 | prompts (dict[str, str]): Configuration for prompts. 12 | include_researcher (bool): Flag to include the researcher agent. 13 | include_analyst (bool): Flag to include the analyst agent. 14 | include_synthesiser (bool): Flag to include the synthesiser agent. 15 | query (str | list[dict[str, str]]): The query or messages for the agent. 16 | chat_config (ChatConfig): The configuration object for agents and providers. 17 | usage_limits (UsageLimits): Usage limits for agent execution. 18 | pydantic_ai_stream (bool): Whether to use Pydantic AI streaming. 19 | 20 | Functions: 21 | get_manager: Initializes and returns a manager agent with the specified 22 | configuration. 23 | run_manager: Asynchronously runs the manager agent with the given query and 24 | provider. 25 | setup_agent_env: Sets up the environment for an agent by configuring provider 26 | settings, prompts, API key, and usage limits. 
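Taken together, these helpers form a small pipeline: setup_agent_env builds an EndpointConfig, get_manager turns it into a manager agent, and run_manager executes the query. A minimal, hypothetical wiring is sketched below; the provider name, query, and config path are assumptions, and the corresponding API key must already be available in the environment.

```python
# Hypothetical wiring of setup_agent_env, get_manager and run_manager.
from asyncio import run

from app.agents.agent_system import get_manager, run_manager, setup_agent_env
from app.config.data_models import AppEnv, ChatConfig
from app.utils.load_configs import load_config


async def demo() -> None:
    # Path, provider and query are assumptions for illustration only.
    chat_config = load_config("src/app/config/config_chat.json", ChatConfig)
    env = setup_agent_env("github", "What is agentic AI?", chat_config, AppEnv())
    manager = get_manager(
        env.provider,
        env.provider_config,
        env.api_key,
        env.prompts,
        include_researcher=True,
    )
    await run_manager(manager, env.query, env.provider, env.usage_limits)


if __name__ == "__main__":
    run(demo())
```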
27 | """ 28 | 29 | from pydantic import BaseModel, ValidationError 30 | from pydantic_ai import Agent, RunContext 31 | from pydantic_ai.common_tools.duckduckgo import duckduckgo_search_tool 32 | from pydantic_ai.messages import ModelRequest 33 | from pydantic_ai.usage import UsageLimits 34 | 35 | from app.agents.llm_model_funs import get_api_key, get_models, get_provider_config 36 | from app.config.data_models import ( 37 | AgentConfig, 38 | AnalysisResult, 39 | AppEnv, 40 | ChatConfig, 41 | EndpointConfig, 42 | ModelDict, 43 | ProviderConfig, 44 | ResearchResult, 45 | ResearchSummary, 46 | ResultBaseType, 47 | UserPromptType, 48 | ) 49 | from app.utils.error_messages import generic_exception, invalid_data_model_format 50 | from app.utils.log import logger 51 | 52 | 53 | def _add_tools_to_manager_agent( 54 | manager_agent: Agent[None, BaseModel], 55 | research_agent: Agent[None, BaseModel] | None = None, 56 | analysis_agent: Agent[None, BaseModel] | None = None, 57 | synthesis_agent: Agent[None, BaseModel] | None = None, 58 | ): 59 | """ 60 | Adds tools to the manager agent for delegating tasks to research, analysis, and 61 | synthesis agents. 62 | Args: 63 | manager_agent (Agent): The manager agent to which tools will be added. 64 | research_agent (Agent): The agent responsible for handling research tasks. 65 | analysis_agent (Agent, optional): The agent responsible for handling 66 | analysis tasks. Defaults to None. 67 | synthesis_agent (Agent, optional): The agent responsible for handling 68 | synthesis tasks. Defaults to None. 69 | Returns: 70 | None 71 | """ 72 | 73 | def _validate_model_return( 74 | result_output: str, 75 | result_model: type[ResultBaseType], 76 | ) -> ResultBaseType: 77 | """Validates the output against the expected model.""" 78 | try: 79 | return result_model.model_validate(result_output) 80 | except ValidationError as e: 81 | msg = invalid_data_model_format(str(e)) 82 | logger.error(msg) 83 | raise ValidationError(msg) 84 | except Exception as e: 85 | msg = generic_exception(str(e)) 86 | logger.exception(msg) 87 | raise Exception(msg) 88 | 89 | if research_agent is not None: 90 | 91 | @manager_agent.tool 92 | # TODO remove redundant tool creation 93 | # ignore "delegate_research" is not accessed because of decorator 94 | async def delegate_research( # type: ignore[reportUnusedFunction] 95 | ctx: RunContext[None], query: str 96 | ) -> ResearchResult: 97 | """Delegate research task to ResearchAgent.""" 98 | result = await research_agent.run(query, usage=ctx.usage) 99 | return _validate_model_return(str(result.output), ResearchResult) 100 | 101 | if analysis_agent is not None: 102 | 103 | @manager_agent.tool 104 | # ignore "delegate_research" is not accessed because of decorator 105 | async def delegate_analysis( # type: ignore[reportUnusedFunction] 106 | ctx: RunContext[None], query: str 107 | ) -> AnalysisResult: 108 | """Delegate analysis task to AnalysisAgent.""" 109 | result = await analysis_agent.run(query, usage=ctx.usage) 110 | return _validate_model_return(str(result.output), AnalysisResult) 111 | 112 | if synthesis_agent is not None: 113 | 114 | @manager_agent.tool 115 | # ignore "delegate_research" is not accessed because of decorator 116 | async def delegate_synthesis( # type: ignore[reportUnusedFunction] 117 | ctx: RunContext[None], query: str 118 | ) -> ResearchSummary: 119 | """Delegate synthesis task to AnalysisAgent.""" 120 | result = await synthesis_agent.run(query, usage=ctx.usage) 121 | return _validate_model_return(str(result.output), 
ResearchSummary) 122 | 123 | 124 | def _create_agent(agent_config: AgentConfig) -> Agent[None, BaseModel]: 125 | """Factory for creating configured agents""" 126 | 127 | return Agent( 128 | model=agent_config.model, 129 | output_type=agent_config.output_type, 130 | system_prompt=agent_config.system_prompt, 131 | tools=agent_config.tools, 132 | retries=agent_config.retries, 133 | ) 134 | 135 | 136 | def _create_manager( 137 | prompts: dict[str, str], 138 | models: ModelDict, 139 | ) -> Agent[None, BaseModel]: 140 | """ 141 | Creates and configures a manager Agent with associated researcher, analyst, 142 | and optionally synthesiser agents. 143 | Args: 144 | prompts (Dict[str, str]): Dictionary containing system prompts for each agent. 145 | model_manager (GeminiModel | OpenAIModel): Model to be used by the manager 146 | agent. 147 | model_researcher (GeminiModel | OpenAIModel | None, optional): Model to be used 148 | by the researcher agent. 149 | model_analyst (GeminiModel | OpenAIModel | None, optional): Model to be used by 150 | the analyst agent. Defaults to None. 151 | model_synthesiser (GeminiModel | OpenAIModel | None, optional): Model to be used 152 | by the synthesiser agent. Defaults to None. 153 | Returns: 154 | Agent: Configured manager agent with associated tools and agents. 155 | """ 156 | 157 | status = f"Creating manager({models.model_manager.model_name})" 158 | active_agents = [ 159 | agent 160 | for agent in [ 161 | f"researcher({models.model_researcher.model_name})" 162 | if models.model_researcher 163 | else None, 164 | f"analyst({models.model_analyst.model_name})" 165 | if models.model_analyst 166 | else None, 167 | f"synthesiser({models.model_synthesiser.model_name})" 168 | if models.model_synthesiser 169 | else None, 170 | ] 171 | if agent 172 | ] 173 | status += f" with agents: {', '.join(active_agents)}" if active_agents else "" 174 | logger.info(status) 175 | 176 | manager = _create_agent( 177 | AgentConfig.model_validate( 178 | { 179 | "model": models.model_manager, 180 | "output_type": ResearchResult, 181 | "system_prompt": prompts["system_prompt_manager"], 182 | } 183 | ) 184 | ) 185 | 186 | if models.model_researcher is None: 187 | researcher = None 188 | else: 189 | researcher = _create_agent( 190 | AgentConfig.model_validate( 191 | { 192 | "model": models.model_researcher, 193 | "output_type": ResearchResult, 194 | "system_prompt": prompts["system_prompt_researcher"], 195 | "tools": [duckduckgo_search_tool()], 196 | } 197 | ) 198 | ) 199 | 200 | if models.model_analyst is None: 201 | analyst = None 202 | else: 203 | analyst = _create_agent( 204 | AgentConfig.model_validate( 205 | { 206 | "model": models.model_analyst, 207 | "output_type": AnalysisResult, 208 | "system_prompt": prompts["system_prompt_analyst"], 209 | } 210 | ) 211 | ) 212 | 213 | if models.model_synthesiser is None: 214 | synthesiser = None 215 | else: 216 | synthesiser = _create_agent( 217 | AgentConfig.model_validate( 218 | { 219 | "model": models.model_synthesiser, 220 | "output_type": AnalysisResult, 221 | "system_prompt": prompts["system_prompt_synthesiser"], 222 | } 223 | ) 224 | ) 225 | 226 | _add_tools_to_manager_agent(manager, researcher, analyst, synthesiser) 227 | return manager 228 | 229 | 230 | def get_manager( 231 | provider: str, 232 | provider_config: ProviderConfig, 233 | api_key: str | None, 234 | prompts: dict[str, str], 235 | include_researcher: bool = False, 236 | include_analyst: bool = False, 237 | include_synthesiser: bool = False, 238 | ) -> Agent[None, BaseModel]: 239 
| """ 240 | Initializes and returns a Agent manager with the specified configuration. 241 | Args: 242 | provider (str): The name of the provider. 243 | provider_config (ProviderConfig): Configuration settings for the provider. 244 | api_key (str): API key for authentication with the provider. 245 | prompts (PromptsConfig): Configuration for prompts. 246 | include_researcher (bool, optional): Flag to include analyst model. 247 | Defaults to False. 248 | include_analyst (bool, optional): Flag to include analyst model. 249 | Defaults to False. 250 | include_synthesiser (bool, optional): Flag to include synthesiser model. 251 | Defaults to False. 252 | Returns: 253 | Agent: The initialized Agent manager. 254 | """ 255 | 256 | # FIXME context manager try-catch 257 | # with error_handling_context("get_manager()"): 258 | model_config = EndpointConfig.model_validate( 259 | { 260 | "provider": provider, 261 | "prompts": prompts, 262 | "api_key": api_key, 263 | "provider_config": provider_config, 264 | } 265 | ) 266 | models = get_models( 267 | model_config, include_researcher, include_analyst, include_synthesiser 268 | ) 269 | return _create_manager(prompts, models) 270 | 271 | 272 | async def run_manager( 273 | manager: Agent[None, BaseModel], 274 | query: UserPromptType, 275 | provider: str, 276 | usage_limits: UsageLimits | None, 277 | pydantic_ai_stream: bool = False, 278 | ) -> None: 279 | """ 280 | Asynchronously runs the manager with the given query and provider, handling errors 281 | and printing results. 282 | Args: 283 | manager (Agent): The system agent responsible for running the query. 284 | query (str): The query to be processed by the manager. 285 | provider (str): The provider to be used for the query. 286 | usage_limits (UsageLimits): The usage limits to be applied during the query 287 | execution. 288 | pydantic_ai_stream (bool, optional): Flag to enable or disable Pydantic AI 289 | stream. Defaults to False. 290 | Returns: 291 | None 292 | """ 293 | 294 | # FIXME context manager try-catch 295 | # with out ? 
error_handling_context("run_manager()"): 296 | model_name = getattr(manager, "model")._model_name 297 | mgr_cfg = {"user_prompt": query, "usage_limits": usage_limits} 298 | logger.info(f"Researching with {provider}({model_name}) and Topic: {query} ...") 299 | 300 | if pydantic_ai_stream: 301 | raise NotImplementedError( 302 | "Streaming currently only possible for Agents with " 303 | "output_type str not pydantic model" 304 | ) 305 | # logger.info("Streaming model response ...") 306 | # result = await manager.run(**mgr_cfg) 307 | # aync for chunk in result.stream_text(): # .run(**mgr_cfg) as result: 308 | # async with manager.run_stream(user_prompt=query) as stream: 309 | # async for chunk in stream.stream_text(): 310 | # logger.info(str(chunk)) 311 | # result = await stream.get_result() 312 | else: 313 | logger.info("Waiting for model response ...") 314 | # FIXME deprecated warning manager.run(), query unknown type 315 | # FIXME [call-overload] error: No overload variant of "run" of "Agent" 316 | # matches argument type "dict[str, list[dict[str, str]] | 317 | # Sequence[str | ImageUrl | AudioUrl | DocumentUrl | VideoUrl | 318 | # BinaryContent] | UsageLimits | None]" 319 | result = await manager.run(**mgr_cfg) # type: ignore[reportDeprecated,reportUnknownArgumentType,reportCallOverload,call-overload] 320 | 321 | logger.info(f"Result: {result}") 322 | logger.info(f"Usage statistics: {result.usage()}") 323 | 324 | 325 | def setup_agent_env( 326 | provider: str, 327 | query: UserPromptType, 328 | chat_config: ChatConfig | BaseModel, 329 | chat_env_config: AppEnv, 330 | ) -> EndpointConfig: 331 | """ 332 | Sets up the environment for an agent by configuring provider settings, prompts, 333 | API key, and usage limits. 334 | 335 | Args: 336 | provider (str): The name of the provider. 337 | query (UserPromptType): The messages or queries to be sent to the agent. 338 | chat_config (ChatConfig | BaseModel): The configuration object containing 339 | provider and prompt settings. 340 | chat_env_config (AppEnv): The application environment configuration 341 | containing API keys. 342 | 343 | Returns: 344 | EndpointConfig: The configuration object for the agent. 345 | """ 346 | 347 | if not isinstance(chat_config, ChatConfig): 348 | raise TypeError("'chat_config' of invalid type: ChatConfig expected") 349 | msg: str | None 350 | # FIXME context manager try-catch 351 | # with error_handling_context("setup_agent_env()"): 352 | provider_config = get_provider_config(provider, chat_config.providers) 353 | 354 | prompts = chat_config.prompts 355 | api_key = get_api_key(provider, chat_env_config) 356 | 357 | if provider.lower() == "ollama": 358 | # TODO move usage limits to config 359 | usage_limits = UsageLimits(request_limit=10, total_tokens_limit=100000) 360 | else: 361 | if api_key is None: 362 | msg = f"API key for provider '{provider}' is not set." 
363 | logger.error(msg) 364 | raise ValueError(msg) 365 | # TODO Separate Gemini request into function 366 | if provider.lower() == "gemini": 367 | if isinstance(query, str): 368 | query = ModelRequest.user_text_prompt(query) 369 | elif isinstance(query, list): # type: ignore[reportUnnecessaryIsInstance] 370 | # query = [ 371 | # ModelRequest.user_text_prompt( 372 | # str(msg.get("content", "")) 373 | # ) # type: ignore[reportUnknownArgumentType] 374 | # if isinstance(msg, dict) 375 | # else msg 376 | # for msg in query 377 | # ] 378 | raise NotImplementedError("Currently conflicting with UserPromptType") 379 | else: 380 | msg = f"Unsupported query type for Gemini: {type(query)}" 381 | logger.error(msg) 382 | raise TypeError(msg) 383 | # TODO move usage limits to config 384 | usage_limits = UsageLimits(request_limit=10, total_tokens_limit=10000) 385 | 386 | return EndpointConfig.model_validate( 387 | { 388 | "provider": provider, 389 | "query": query, 390 | "api_key": api_key, 391 | "prompts": prompts, 392 | "provider_config": provider_config, 393 | "usage_limits": usage_limits, 394 | } 395 | ) 396 | -------------------------------------------------------------------------------- /src/app/agents/llm_model_funs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions and classes for managing and instantiating LLM models and providers. 3 | 4 | This module provides functions to retrieve API keys, provider configurations, and 5 | to create model instances for supported LLM providers such as Gemini and OpenAI. 6 | It also includes logic for assembling model dictionaries for system agents. 7 | """ 8 | 9 | from pydantic import HttpUrl 10 | from pydantic_ai.models.gemini import GeminiModel 11 | from pydantic_ai.models.openai import OpenAIModel 12 | from pydantic_ai.providers.openai import OpenAIProvider 13 | 14 | from app.config.config_app import API_SUFFIX 15 | from app.config.data_models import AppEnv, EndpointConfig, ModelDict, ProviderConfig 16 | from app.utils.error_messages import generic_exception, get_key_error 17 | from app.utils.log import logger 18 | 19 | 20 | def get_api_key( 21 | provider: str, 22 | chat_env_config: AppEnv, 23 | ) -> str | None: 24 | """Retrieve API key from chat env config variable.""" 25 | 26 | provider = provider.upper() 27 | if provider == "OLLAMA": 28 | return None 29 | else: 30 | key_name = f"{provider}{API_SUFFIX}" 31 | if hasattr(chat_env_config, key_name): 32 | logger.info(f"Found API key for provider '{provider}'") 33 | return getattr(chat_env_config, key_name) 34 | else: 35 | raise KeyError( 36 | f"API key for provider '{provider}' not found in configuration." 
37 | ) 38 | 39 | 40 | def get_provider_config( 41 | provider: str, providers: dict[str, ProviderConfig] 42 | ) -> dict[str, str | HttpUrl]: 43 | """Retrieve configuration settings for the specified provider.""" 44 | 45 | try: 46 | model_name = providers[provider].model_name 47 | base_url = providers[provider].base_url 48 | except KeyError as e: 49 | msg = get_key_error(str(e)) 50 | logger.error(msg) 51 | raise KeyError(msg) 52 | except Exception as e: 53 | msg = generic_exception(str(e)) 54 | logger.exception(msg) 55 | raise Exception(msg) 56 | else: 57 | return { 58 | "model_name": model_name, 59 | "base_url": base_url, 60 | } 61 | 62 | 63 | def _create_model(endpoint_config: EndpointConfig) -> GeminiModel | OpenAIModel: 64 | """Create a model that uses model_name and base_url for inference API""" 65 | 66 | if endpoint_config.provider.lower() == "gemini": 67 | # FIXME EndpointConfig: TypeError: 'ModelRequest' object is not iterable. 68 | raise NotImplementedError( 69 | "Current typing raises TypeError: 'ModelRequest' object is not iterable." 70 | ) 71 | elif endpoint_config.provider.lower() == "huggingface": 72 | # FIXME HF not working with pydantic-ai OpenAI model 73 | raise NotImplementedError( 74 | "Hugging Face provider is not implemented yet. Please use Gemini or OpenAI." 75 | " https://huggingface.co/docs/inference-providers/providers/hf-inference" 76 | ) 77 | # headers = { 78 | # "Authorization": f"Bearer {endpoint_config.api_key}", 79 | # } 80 | # def query(payload): 81 | # response = requests.post(API_URL, headers=headers, json=payload) 82 | # return response.json() 83 | # query({"inputs": "", "parameters": {},}) 84 | else: 85 | base_url_str = str(endpoint_config.provider_config.base_url) 86 | return OpenAIModel( 87 | model_name=endpoint_config.provider_config.model_name, 88 | provider=OpenAIProvider( 89 | base_url=base_url_str, 90 | api_key=endpoint_config.api_key, 91 | ), 92 | ) 93 | 94 | 95 | def get_models( 96 | endpoint_config: EndpointConfig, 97 | include_researcher: bool = False, 98 | include_analyst: bool = False, 99 | include_synthesiser: bool = False, 100 | ) -> ModelDict: 101 | """ 102 | Get the models for the system agents. 103 | Args: 104 | endpoint_config (EndpointConfig): Configuration for the model. 105 | include_analyist (Optional[bool]): Whether to include the analyst model. 106 | Defaults to False. 107 | include_synthesiser (Optional[bool]): Whether to include the synthesiser model. 108 | Defaults to False. 109 | Returns: 110 | Dict[str, GeminiModel | OpenAIModel]: A dictionary containing the models for the 111 | system agents. 
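To make the provider plumbing concrete, the sketch below builds a model for the ollama entry shipped in config_chat.json by hand; the scaffolding around the internal _create_model helper is illustrative only, and some pydantic-ai versions may insist on a placeholder API key even for local endpoints.

```python
# Illustrative only: building a model for the "ollama" entry of config_chat.json.
from app.agents.llm_model_funs import _create_model
from app.config.data_models import EndpointConfig, ProviderConfig

endpoint = EndpointConfig.model_validate(
    {
        "provider": "ollama",
        "api_key": None,  # get_api_key() returns None for Ollama
        "prompts": {},
        "provider_config": ProviderConfig.model_validate(
            {
                "model_name": "granite3-dense",
                "base_url": "http://localhost:11434/v1",
            }
        ),
    }
)
model = _create_model(endpoint)  # an OpenAIModel pointed at the local endpoint
```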
112 | """ 113 | 114 | model = _create_model(endpoint_config) 115 | return ModelDict.model_validate( 116 | { 117 | "model_manager": model, 118 | "model_researcher": model if include_researcher else None, 119 | "model_analyst": model if include_analyst else None, 120 | "model_synthesiser": model if include_synthesiser else None, 121 | } 122 | ) 123 | -------------------------------------------------------------------------------- /src/app/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/src/app/config/__init__.py -------------------------------------------------------------------------------- /src/app/config/config_app.py: -------------------------------------------------------------------------------- 1 | """Configuration constants for the application.""" 2 | 3 | # MARK: chat env 4 | API_SUFFIX = "_API_KEY" 5 | CHAT_DEFAULT_PROVIDER = "github" 6 | 7 | 8 | # MARK: project 9 | PROJECT_NAME = "rd-mas-example" 10 | 11 | 12 | # MARK: paths 13 | CHAT_CONFIG_FILE = "config/config_chat.json" 14 | LOGS_PATH = "logs" 15 | EVAL_CONFIG_FILE = "config/config_eval.json" 16 | -------------------------------------------------------------------------------- /src/app/config/config_chat.json: -------------------------------------------------------------------------------- 1 | { 2 | "providers": { 3 | "huggingface": { 4 | "model_name": "facebook/bart-large-mnli", 5 | "base_url": "https://router.huggingface.co/hf-inference/models" 6 | }, 7 | "gemini": { 8 | "model_name": "gemini-1.5-flash-8b", 9 | "base_url": "https://generativelanguage.googleapis.com/v1beta" 10 | }, 11 | "github": { 12 | "model_name": "GPT-4o", 13 | "base_url": "https://models.inference.ai.azure.com" 14 | }, 15 | "grok": { 16 | "model_name": "grok-2-1212", 17 | "base_url": "https://api.x.ai/v1" 18 | }, 19 | "ollama": { 20 | "model_name": "granite3-dense", 21 | "base_url": "http://localhost:11434/v1" 22 | }, 23 | "openrouter": { 24 | "model_name": "google/gemini-2.0-flash-exp:free", 25 | "base_url": "https://openrouter.ai/api/v1" 26 | }, 27 | "perplexity": { 28 | "model_name": "sonar", 29 | "base_url": "https://api.perplexity.ai" 30 | }, 31 | "restack": { 32 | "model_name": "deepseek-chat", 33 | "base_url": "https://ai.restack.io" 34 | }, 35 | "together": { 36 | "model_name": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free", 37 | "base_url": "https://api.together.xyz/v1" 38 | } 39 | }, 40 | "inference": { 41 | "usage_limits": 10000, 42 | "usage_limits_ollama": 10000, 43 | "result_retries": 3, 44 | "result_retries_ollama": 3 45 | }, 46 | "prompts": { 47 | "system_prompt_manager": "You are a manager overseeing research and analysis tasks. Your role is to coordinate the efforts of the research, analysis and synthesiser agents to provide comprehensive answers to user queries. The researcher should gather and analyze data relevant to the topic. The whole result must be handed to the analyst, who will check it for accuracy of the assumptions, facts, and conclusions. If an analyst is present the researchers output has to be approved by the analyst. If the analyst does not approve of the researcher's result, all of the analyst's response and the topic must be handed back to the researcher to be refined. Repeat this loop until the analyst approves. 
If a synthesiser is present and once the analyst approves, the synthesiser should output a well-formatted scientific report using the data given.", 48 | "system_prompt_researcher": "You are a researcher. Gather and analyze data relevant to the topic. Use the search tool to gather data. Always check accuracy of assumptions, facts, and conclusions.", 49 | "system_prompt_analyst": "You are a research analyst. Use your analytical skills to check the accuracy of assumptions, facts, and conclusions in the data provided. Provide relevant feedback if you do not approve. Only approve if you do not have any feedback to give.", 50 | "system_prompt_synthesiser": "You are a scientific writing assistant. Your task is to output a well-formatted scientific report using the data given. Leave the provided facts, conclusions, and sources unchanged." 51 | } 52 | } -------------------------------------------------------------------------------- /src/app/config/config_eval.json: -------------------------------------------------------------------------------- 1 | { 2 | "metrics_and_weights": { 3 | "time_taken": 0.167, 4 | "task_success": 0.167, 5 | "coordination_quality": 0.167, 6 | "tool_efficiency": 0.167, 7 | "planning_rational": 0.167, 8 | "output_similarity": 0.167 9 | } 10 | } -------------------------------------------------------------------------------- /src/app/config/data_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data models for agent system configuration and results. 3 | 4 | This module defines Pydantic models for representing research and analysis results, 5 | summaries, provider and agent configurations, and model dictionaries used throughout 6 | the application. These models ensure type safety and validation for data exchanged 7 | between agents and system components.
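The equal metric weights in config_eval.json above map onto the EvalConfig model defined in this module; a small sketch of loading and checking them is given below, where the relative path is an assumption about the working directory.

```python
# Sketch: validating the shipped evaluation config against EvalConfig.
from app.config.data_models import EvalConfig
from app.utils.load_configs import load_config

# Path assumed relative to the repository root.
eval_config = load_config("src/app/config/config_eval.json", EvalConfig)
assert isinstance(eval_config, EvalConfig)
weights = eval_config.metrics_and_weights
print(sum(weights.values()))  # roughly 1.0 with six weights of 0.167
```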
8 | """ 9 | 10 | from typing import Any, TypeVar 11 | 12 | from pydantic import BaseModel, ConfigDict, HttpUrl, field_validator 13 | from pydantic_ai.messages import ModelRequest 14 | from pydantic_ai.models import Model 15 | from pydantic_ai.tools import Tool 16 | from pydantic_ai.usage import UsageLimits 17 | from pydantic_settings import BaseSettings, SettingsConfigDict 18 | 19 | type UserPromptType = ( 20 | str | list[dict[str, str]] | ModelRequest | None 21 | ) # (1) Input validation 22 | ResultBaseType = TypeVar( 23 | "ResultBaseType", bound=BaseModel 24 | ) # (2) Generic type for model results 25 | 26 | 27 | class ResearchResult(BaseModel): 28 | """Research results from the research agent.""" 29 | 30 | topic: str | dict[str, str] 31 | findings: list[str] | dict[str, str | list[str]] 32 | sources: list[str] | dict[str, str | list[str]] 33 | 34 | 35 | class AnalysisResult(BaseModel): 36 | """Analysis results from the analysis agent.""" 37 | 38 | insights: list[str] 39 | recommendations: list[str] 40 | approval: bool 41 | 42 | 43 | class ResearchSummary(BaseModel): 44 | """Expected model response of research on a topic""" 45 | 46 | topic: str 47 | key_points: list[str] 48 | key_points_explanation: list[str] 49 | conclusion: str 50 | sources: list[str] 51 | 52 | 53 | class ProviderConfig(BaseModel): 54 | """Configuration for a model provider""" 55 | 56 | model_name: str 57 | base_url: HttpUrl 58 | 59 | 60 | class ChatConfig(BaseModel): 61 | """Configuration settings for agents and model providers""" 62 | 63 | providers: dict[str, ProviderConfig] 64 | inference: dict[str, str | int] 65 | prompts: dict[str, str] 66 | 67 | 68 | class EndpointConfig(BaseModel): 69 | """Configuration for an agent""" 70 | 71 | provider: str 72 | query: UserPromptType = None 73 | api_key: str | None 74 | prompts: dict[str, str] 75 | provider_config: ProviderConfig 76 | usage_limits: UsageLimits | None = None 77 | 78 | 79 | class AgentConfig(BaseModel): 80 | """Configuration for an agent""" 81 | 82 | model: Model # (1) Instance expected 83 | output_type: type[BaseModel] # (2) Class expected 84 | system_prompt: str 85 | # FIXME tools: list[Callable[..., Awaitable[Any]]] 86 | tools: list[Any] = [] # (3) List of tools will be validated at creation 87 | retries: int = 3 88 | 89 | # Avoid pydantic.errors.PydanticSchemaGenerationError: 90 | # Unable to generate pydantic-core schema for . 91 | # Avoid Pydantic errors related to non-Pydantic types 92 | model_config = ConfigDict( 93 | arbitrary_types_allowed=True 94 | ) # (4) Suppress Error non-Pydantic types caused by 95 | 96 | @field_validator("tools", mode="before") 97 | def validate_tools(cls, v: list[Any]) -> list[Tool | None]: 98 | """Validate that all tools are instances of Tool.""" 99 | if not v: 100 | return [] 101 | if not all(isinstance(t, Tool) for t in v): 102 | raise ValueError("All tools must be Tool instances") 103 | return v 104 | 105 | 106 | class ModelDict(BaseModel): 107 | """Dictionary of models used to create agent systems""" 108 | 109 | model_manager: Model 110 | model_researcher: Model | None 111 | model_analyst: Model | None 112 | model_synthesiser: Model | None 113 | model_config = ConfigDict(arbitrary_types_allowed=True) 114 | 115 | 116 | class EvalConfig(BaseModel): 117 | metrics_and_weights: dict[str, float] 118 | 119 | 120 | class AppEnv(BaseSettings): 121 | """ 122 | Application environment settings loaded from environment variables or .env file. 
123 | 124 | This class uses Pydantic's BaseSettings to manage API keys and configuration 125 | for various inference endpoints, tools, and logging/monitoring services. 126 | Environment variables are loaded from a .env file by default. 127 | """ 128 | 129 | # Inference endpoints 130 | GEMINI_API_KEY: str = "" 131 | GITHUB_API_KEY: str = "" 132 | GROK_API_KEY: str = "" 133 | HUGGINGFACE_API_KEY: str = "" 134 | OPENROUTER_API_KEY: str = "" 135 | PERPLEXITY_API_KEY: str = "" 136 | RESTACK_API_KEY: str = "" 137 | TOGETHER_API_KEY: str = "" 138 | 139 | # Tools 140 | TAVILY_API_KEY: str = "" 141 | 142 | # Logging/Monitoring/Tracing 143 | AGENTOPS_API_KEY: str = "" 144 | LOGFIRE_API_KEY: str = "" 145 | WANDB_API_KEY: str = "" 146 | 147 | model_config = SettingsConfigDict( 148 | env_file=".env", env_file_encoding="utf-8", extra="ignore" 149 | ) 150 | -------------------------------------------------------------------------------- /src/app/evals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/src/app/evals/__init__.py -------------------------------------------------------------------------------- /src/app/evals/metrics.py: -------------------------------------------------------------------------------- 1 | def time_taken(start_time: float, end_time: float) -> float: 2 | """Calculate duration between start and end timestamps 3 | 4 | Args: 5 | start_time: Timestamp when execution started 6 | end_time: Timestamp when execution completed 7 | 8 | Returns: 9 | Duration in seconds with microsecond precision 10 | """ 11 | 12 | # TODO implement 13 | return end_time - start_time 14 | 15 | 16 | def output_similarity(agent_output: str, expected_answer: str) -> bool: 17 | """ 18 | Determine to what degree the agent's output matches the expected answer. 19 | 20 | Args: 21 | agent_output (str): The output produced by the agent. 22 | expected_answer (str): The correct or expected answer. 23 | 24 | Returns: 25 | bool: True if the output matches the expected answer, False otherwise. 26 | """ 27 | 28 | # TODO score instead of bool 29 | return agent_output.strip() == expected_answer.strip() 30 | -------------------------------------------------------------------------------- /src/app/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main entry point for the Agents-eval application. 3 | 4 | This module initializes the agentic system, loads configuration files, 5 | handles user input, and orchestrates the multi-agent workflow using 6 | asynchronous execution. It integrates logging, tracing, and authentication, 7 | and supports both CLI and programmatic execution. 
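The output_similarity metric above returns a strict boolean and carries a TODO to produce a score instead; a graded variant could be sketched with difflib from the standard library, where the function name and the 0 to 1 scale are assumptions rather than the project's decided design.

```python
# Sketch for the metrics.py TODO: a graded similarity score instead of a strict bool.
from difflib import SequenceMatcher


def output_similarity_score(agent_output: str, expected_answer: str) -> float:
    """Return a similarity ratio in [0.0, 1.0] between output and expected answer."""
    return SequenceMatcher(None, agent_output.strip(), expected_answer.strip()).ratio()


# e.g. output_similarity_score("42 is the answer", "42 is the answer!") ~ 0.97
```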
8 | """ 9 | 10 | from asyncio import run 11 | from pathlib import Path 12 | from sys import argv 13 | 14 | from logfire import span 15 | from weave import op 16 | 17 | from app.__init__ import __version__ 18 | from app.agents.agent_system import get_manager, run_manager, setup_agent_env 19 | from app.config.config_app import ( 20 | CHAT_CONFIG_FILE, 21 | CHAT_DEFAULT_PROVIDER, 22 | EVAL_CONFIG_FILE, 23 | PROJECT_NAME, 24 | ) 25 | from app.config.data_models import AppEnv, ChatConfig, EvalConfig 26 | from app.utils.error_messages import generic_exception 27 | from app.utils.load_configs import load_config 28 | from app.utils.log import logger 29 | from app.utils.login import login 30 | from app.utils.utils import parse_args 31 | 32 | 33 | @op() 34 | async def main( 35 | chat_provider: str = CHAT_DEFAULT_PROVIDER, 36 | query: str = "", 37 | include_researcher: bool = False, 38 | include_analyst: bool = False, 39 | include_synthesiser: bool = False, 40 | pydantic_ai_stream: bool = False, 41 | chat_config_file: str = CHAT_CONFIG_FILE, 42 | ) -> None: 43 | """ 44 | Main entry point for the application. 45 | 46 | Args: 47 | chat_provider (str): The inference chat_provider to be used. 48 | query (str): The query to be processed by the agent. 49 | include_researcher (bool): Whether to include the researcher in the process. 50 | include_analyst (bool): Whether to include the analyst in the process. 51 | include_synthesiser (bool): Whether to include the synthesiser in the process. 52 | pydantic_ai_stream (bool): Whether to use Pydantic AI streaming. 53 | chat_config_file (str): Full path to the configuration file. 54 | 55 | Returns: 56 | None 57 | """ 58 | 59 | logger.info(f"Starting app '{PROJECT_NAME}' v{__version__}") 60 | try: 61 | with span("main()"): 62 | if not chat_provider: 63 | chat_provider = input("Which inference chat_provider to use? ") 64 | if not query: 65 | query = input("What would you like to research? ") 66 | 67 | chat_config_path = Path(__file__).parent / CHAT_CONFIG_FILE 68 | eval_config_path = Path(__file__).parent / EVAL_CONFIG_FILE 69 | chat_config = load_config(chat_config_path, ChatConfig) 70 | eval_config = load_config(eval_config_path, EvalConfig) 71 | chat_env_config = AppEnv() 72 | agent_env = setup_agent_env( 73 | chat_provider, query, chat_config, chat_env_config 74 | ) 75 | # TODO remove noqa and type ignore for unused variable 76 | metrics_and_weights = eval_config.metrics_and_weights # noqa: F841 # type: ignore[reportUnusedVariable] 77 | 78 | # FIXME enhance login, not every run? 
79 | login(PROJECT_NAME, chat_env_config) 80 | 81 | manager = get_manager( 82 | agent_env.provider, 83 | agent_env.provider_config, 84 | agent_env.api_key, 85 | agent_env.prompts, 86 | include_researcher, 87 | include_analyst, 88 | include_synthesiser, 89 | ) 90 | await run_manager( 91 | manager, 92 | agent_env.query, 93 | agent_env.provider, 94 | agent_env.usage_limits, 95 | pydantic_ai_stream, 96 | ) 97 | logger.info(f"Exiting app '{PROJECT_NAME}'") 98 | 99 | except Exception as e: 100 | msg = generic_exception(f"Aborting app '{PROJECT_NAME}' with: {e}") 101 | logger.exception(msg) 102 | raise Exception(msg) from e 103 | 104 | 105 | if __name__ == "__main__": 106 | args = parse_args(argv[1:]) 107 | run(main(**args)) 108 | -------------------------------------------------------------------------------- /src/app/py.typed: -------------------------------------------------------------------------------- 1 | # PEP 561 – Distributing and Packaging Type Information 2 | # https://peps.python.org/pep-0561/ -------------------------------------------------------------------------------- /src/app/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/src/app/utils/__init__.py -------------------------------------------------------------------------------- /src/app/utils/error_messages.py: -------------------------------------------------------------------------------- 1 | """ 2 | Error message utilities for the Agents-eval application. 3 | 4 | This module provides concise helper functions for generating standardized 5 | error messages related to configuration loading and validation. 6 | """ 7 | 8 | from pathlib import Path 9 | 10 | 11 | def api_connection_error(error: str) -> str: 12 | """ 13 | Generate a error message for API connection error. 14 | """ 15 | return f"API connection error: {error}" 16 | 17 | 18 | def failed_to_load_config(error: str) -> str: 19 | """ 20 | Generate a error message for configuration loading failure. 21 | """ 22 | return f"Failed to load config: {error}" 23 | 24 | 25 | def file_not_found(file_path: str | Path) -> str: 26 | """ 27 | Generate an error message for a missing configuration file. 28 | """ 29 | return f"File not found: {file_path}" 30 | 31 | 32 | def generic_exception(error: str) -> str: 33 | """ 34 | Generate a generic error message. 35 | """ 36 | return f"Exception: {error}" 37 | 38 | 39 | def invalid_data_model_format(error: str) -> str: 40 | """ 41 | Generate an error message for invalid pydantic data model format. 42 | """ 43 | return f"Invalid pydantic data model format: {error}" 44 | 45 | 46 | def invalid_json(error: str) -> str: 47 | """ 48 | Generate an error message for invalid JSON in a configuration file. 49 | """ 50 | return f"Invalid JSON: {error}" 51 | 52 | 53 | def invalid_type(expected_type: str, actual_type: str) -> str: 54 | """ 55 | Generate an error message for invalid Type. 56 | """ 57 | return f"Type Error: Expected {expected_type}, got {actual_type} instead." 58 | 59 | 60 | def get_key_error(error: str) -> str: 61 | """ 62 | Generate a generic error message. 63 | """ 64 | return f"Key Error: {error}" 65 | -------------------------------------------------------------------------------- /src/app/utils/load_configs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration loading utilities. 
3 | 4 | Provides a generic function for loading and validating JSON configuration 5 | files against Pydantic models, with error handling and logging support. 6 | """ 7 | 8 | import json 9 | from pathlib import Path 10 | 11 | from pydantic import BaseModel, ValidationError 12 | 13 | from app.utils.error_messages import ( 14 | failed_to_load_config, 15 | file_not_found, 16 | invalid_data_model_format, 17 | invalid_json, 18 | ) 19 | from app.utils.log import logger 20 | 21 | 22 | def load_config(config_path: str | Path, data_model: type[BaseModel]) -> BaseModel: 23 | """ 24 | Generic configuration loader that validates against any Pydantic model. 25 | 26 | Args: 27 | config_path: Path to the JSON configuration file 28 | model: Pydantic model class for validation 29 | 30 | Returns: 31 | Validated configuration instance 32 | """ 33 | 34 | try: 35 | with open(config_path, encoding="utf-8") as f: 36 | data = json.load(f) 37 | return data_model.model_validate(data) 38 | except FileNotFoundError as e: 39 | msg = file_not_found(config_path) 40 | logger.error(msg) 41 | raise FileNotFoundError(msg) from e 42 | except json.JSONDecodeError as e: 43 | msg = invalid_json(str(e)) 44 | logger.error(msg) 45 | raise ValueError(msg) from e 46 | except ValidationError as e: 47 | msg = invalid_data_model_format(str(e)) 48 | logger.error(msg) 49 | raise ValidationError(msg) from e 50 | except Exception as e: 51 | msg = failed_to_load_config(str(e)) 52 | logger.exception(msg) 53 | raise Exception(msg) from e 54 | -------------------------------------------------------------------------------- /src/app/utils/load_settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions and classes for loading application settings and configuration. 3 | 4 | This module defines the AppEnv class for managing environment variables using Pydantic, 5 | and provides a function to load and validate application configuration from a JSON file. 6 | """ 7 | 8 | import json 9 | from pathlib import Path 10 | 11 | from pydantic_settings import BaseSettings, SettingsConfigDict 12 | 13 | from app.config.data_models import ChatConfig 14 | from app.utils.error_messages import ( 15 | failed_to_load_config, 16 | file_not_found, 17 | invalid_json, 18 | ) 19 | from app.utils.log import logger 20 | 21 | 22 | class AppEnv(BaseSettings): 23 | """ 24 | Application environment settings loaded from environment variables or .env file. 25 | 26 | This class uses Pydantic's BaseSettings to manage API keys and configuration 27 | for various inference endpoints, tools, and logging/monitoring services. 28 | Environment variables are loaded from a .env file by default. 29 | """ 30 | 31 | # Inference endpoints 32 | GEMINI_API_KEY: str = "" 33 | GITHUB_API_KEY: str = "" 34 | GROK_API_KEY: str = "" 35 | HUGGINGFACE_API_KEY: str = "" 36 | OPENROUTER_API_KEY: str = "" 37 | PERPLEXITY_API_KEY: str = "" 38 | RESTACK_API_KEY: str = "" 39 | TOGETHER_API_KEY: str = "" 40 | 41 | # Tools 42 | TAVILY_API_KEY: str = "" 43 | 44 | # Logging/Monitoring/Tracing 45 | AGENTOPS_API_KEY: str = "" 46 | LOGFIRE_TOKEN: str = "" 47 | WANDB_API_KEY: str = "" 48 | 49 | model_config = SettingsConfigDict( 50 | env_file=".env", env_file_encoding="utf-8", extra="ignore" 51 | ) 52 | 53 | 54 | chat_config = AppEnv() 55 | 56 | 57 | def load_config(config_path: str | Path) -> ChatConfig: 58 | """ 59 | Load and validate application configuration from a JSON file. 60 | 61 | Args: 62 | config_path (str): Path to the JSON configuration file. 
63 | 64 | Returns: 65 | ChatConfig: An instance of ChatConfig with validated configuration data. 66 | 67 | Raises: 68 | FileNotFoundError: If the configuration file does not exist. 69 | json.JSONDecodeError: If the file contains invalid JSON. 70 | Exception: For any other unexpected errors during loading or validation. 71 | """ 72 | 73 | try: 74 | with open(config_path) as f: 75 | config_data = json.load(f) 76 | except FileNotFoundError as e: 77 | msg = file_not_found(config_path) 78 | logger.error(msg) 79 | raise FileNotFoundError(msg) from e 80 | except json.JSONDecodeError as e: 81 | msg = invalid_json(str(e)) 82 | logger.error(msg) 83 | raise json.JSONDecodeError(msg, str(config_path), 0) from e 84 | except Exception as e: 85 | msg = failed_to_load_config(str(e)) 86 | logger.exception(msg) 87 | raise Exception(msg) from e 88 | 89 | return ChatConfig.model_validate(config_data) 90 | -------------------------------------------------------------------------------- /src/app/utils/log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Set up the logger with custom settings. 3 | Logs are written to a file with automatic rotation. 4 | """ 5 | 6 | from loguru import logger 7 | 8 | from app.config.config_app import LOGS_PATH 9 | 10 | logger.add( 11 | f"{LOGS_PATH}/{{time}}.log", 12 | rotation="1 MB", 13 | # level="DEBUG", 14 | retention="7 days", 15 | compression="zip", 16 | ) 17 | -------------------------------------------------------------------------------- /src/app/utils/login.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides utility functions for managing login state and initializing 3 | the environment for a given project. It includes functionality to load and save 4 | login state, perform a one-time login, and check if the user is logged in. 5 | """ 6 | 7 | from os import environ 8 | 9 | from agentops import init as agentops_init 10 | from logfire import configure as logfire_conf 11 | from wandb import login as wandb_login 12 | from weave import init as weave_init 13 | 14 | from app.agents.llm_model_funs import get_api_key 15 | from app.config.data_models import AppEnv 16 | from app.utils.error_messages import generic_exception 17 | from app.utils.log import logger 18 | 19 | 20 | def login(project_name: str, chat_env_config: AppEnv): 21 | """ 22 | Logs in to the workspace and initializes the environment for the given project. 23 | Args: 24 | project_name (str): The name of the project to initialize. 25 | chat_env_config (AppEnv): The application environment configuration 26 | containing the API keys. 
27 | Returns: 28 | None 29 | """ 30 | 31 | try: 32 | logger.info(f"Logging in to the workspaces for project: {project_name}") 33 | environ["AGENTOPS_LOGGING_TO_FILE"] = "FALSE" 34 | agentops_init( 35 | default_tags=[project_name], 36 | api_key=get_api_key("AGENTOPS", chat_env_config), 37 | ) 38 | logfire_conf(token=get_api_key("LOGFIRE", chat_env_config)) 39 | wandb_login(key=get_api_key("WANDB", chat_env_config)) 40 | weave_init(project_name) 41 | except Exception as e: 42 | msg = generic_exception(str(e)) 43 | logger.exception(e) 44 | raise Exception(msg) from e 45 | -------------------------------------------------------------------------------- /src/app/utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides utility functions and context managers for handling configurations, 3 | error handling, and setting up agent environments. 4 | 5 | Functions: 6 | load_config(config_path: str) -> Config: 7 | Load and validate configuration from a JSON file. 8 | 9 | print_research_Result(summary: Dict, usage: Usage) -> None: 10 | Output structured summary of the research topic. 11 | 12 | error_handling_context(operation_name: str, console: Console = None): 13 | Context manager for handling errors during operations. 14 | 15 | setup_agent_env(config: Config, console: Console = None) -> AgentConfig: 16 | Set up the agent environment based on the provided configuration. 17 | """ 18 | 19 | from pydantic_ai.usage import Usage 20 | 21 | from app.config.data_models import ResearchSummary 22 | from app.utils.log import logger 23 | 24 | 25 | def log_research_result(summary: ResearchSummary, usage: Usage) -> None: 26 | """ 27 | Prints the research summary and usage details in a formatted manner. 28 | 29 | Args: 30 | summary (Dict): A dictionary containing the research summary with keys 'topic', 31 | 'key_points', 'key_points_explanation', and 'conclusion'. 32 | usage (Usage): An object containing usage details to be printed. 33 | """ 34 | 35 | logger.info(f"\n=== Research Summary: {summary.topic} ===") 36 | logger.info("\nKey Points:") 37 | for i, point in enumerate(summary.key_points, 1): 38 | logger.info(f"{i}. {point}") 39 | logger.info("\nKey Points Explanation:") 40 | for i, point in enumerate(summary.key_points_explanation, 1): 41 | logger.info(f"{i}. {point}") 42 | logger.info(f"\nConclusion: {summary.conclusion}") 43 | logger.info(f"\nResponse structure: {list(dict(summary).keys())}") 44 | logger.info(usage) 45 | 46 | 47 | def parse_args(argv: list[str]) -> dict[str, str | bool]: 48 | """ 49 | Parse command line arguments into a dictionary. 50 | 51 | This function processes a list of command-line arguments, 52 | extracting recognized options and their values. 53 | Supported arguments include flags (e.g., --help, --include-researcher 54 | and key-value pairs (e.g., `--chat-provider=ollama`). 55 | If the `--help` flag is present, a list of available commands and their 56 | descriptions is printed, and an empty dictionary is returned. 57 | 58 | Recognized arguments as list[str] 59 | ``` 60 | --help Display help information and exit. 61 | --version Display version information. 62 | --chat-provider= Specify the chat provider to use. 63 | --query= Specify the query to process. 64 | --include-researcher Include the researcher agent. 65 | --include-analyst Include the analyst agent. 66 | --include-synthesiser Include the synthesiser agent. 67 | --no-stream Disable streaming output. 
68 | --chat-config-file= Specify the path to the chat configuration file. 69 | ``` 70 | 71 | Returns: 72 | `dict[str, str | bool]`: A dictionary mapping argument names 73 | (with leading '--' removed and hyphens replaced by underscores) 74 | to their values (`str` for key-value pairs, `bool` for flags). 75 | Returns an empty dict if `--help` is specified. 76 | 77 | Example: 78 | >>> `parse_args(['--chat-provider=ollama', '--include-researcher'])` 79 | returns `{'chat_provider': 'ollama', 'include_researcher': True}` 80 | """ 81 | 82 | commands = { 83 | "--help": "Display help information", 84 | "--version": "Display version information", 85 | "--chat-provider": "Specify the chat provider to use", 86 | "--query": "Specify the query to process", 87 | "--include-researcher": "Include the researcher agent", 88 | "--include-analyst": "Include the analyst agent", 89 | "--include-synthesiser": "Include the synthesiser agent", 90 | "--no-stream": "Disable streaming output", 91 | "--chat-config-file": "Specify the path to the chat configuration file", 92 | } 93 | parsed_args: dict[str, str | bool] = {} 94 | 95 | if "--help" in argv: 96 | print("Available commands:") 97 | for cmd, desc in commands.items(): 98 | print(f"{cmd}: {desc}") 99 | return parsed_args 100 | 101 | for arg in argv: 102 | if arg.split("=", 1)[0] in commands.keys(): 103 | key, value = arg.split("=", 1) if "=" in arg else (arg, True) 104 | key = key.lstrip("--").replace("-", "_") 105 | parsed_args[key] = value 106 | 107 | if parsed_args: 108 | logger.info(f"Used arguments: {parsed_args}") 109 | 110 | return parsed_args 111 | -------------------------------------------------------------------------------- /src/examples/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "providers": { 3 | "gemini": { 4 | "model_name": "gemini-1.5-flash-8b", 5 | "base_url": "https://generativelanguage.googleapis.com/v1beta" 6 | }, 7 | "github": { 8 | "model_name": "GPT-4o", 9 | "base_url": "https://models.inference.ai.azure.com" 10 | }, 11 | "huggingface": { 12 | "model_name": "Qwen/QwQ-32B-Preview", 13 | "base_url": "https://api-inference.huggingface.co/v1" 14 | }, 15 | "ollama": { 16 | "model_name": "granite3-dense", 17 | "base_url": "http://localhost:11434/v1" 18 | }, 19 | "openrouter": { 20 | "model_name": "google/gemini-2.0-flash-lite-preview-02-05:free", 21 | "base_url": "https://openrouter.ai/api/v1" 22 | }, 23 | "restack": { 24 | "model_name": "deepseek-chat", 25 | "base_url": "https://ai.restack.io" 26 | } 27 | }, 28 | "prompts": { 29 | "system_prompt": "You are a helpful research assistant. Extract key information about the topic and provide a structured summary.", 30 | "user_prompt": "Provide a research summary about", 31 | "system_prompt_researcher": "You are a manager overseeing research and analysis tasks. Your role is to coordinate the efforts of the research and analysis agents to provide comprehensive answers to user queries.", 32 | "system_prompt_manager": "You are a research assistant. Your task is to find relevant information about the topic provided. Use the search tool to gather data and synthesize it into a concise summary.", 33 | "system_prompt_analyst": "You are a data scientist. Your task is to analyze the data provided and extract meaningful insights. Use your analytical skills to identify trends, patterns, and correlations." 
34 |   } 35 | } -------------------------------------------------------------------------------- /src/examples/run_simple_agent_no_tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple example of using a Pydantic AI agent to generate a structured summary of a 3 | research topic. 4 | """ 5 | 6 | from os import path 7 | 8 | from .utils.agent_simple_no_tools import get_research 9 | from .utils.utils import ( 10 |     get_api_key, 11 |     get_provider_config, 12 |     load_config, 13 |     print_research_Result, 14 | ) 15 | 16 | CONFIG_FILE = "config.json" 17 | 18 | 19 | def main(): 20 |     """Main function to run the research agent.""" 21 | 22 |     config_path = path.join(path.dirname(__file__), CONFIG_FILE) 23 |     config = load_config(config_path) 24 | 25 |     provider = input("Which inference provider to use? ") 26 |     topic = input("What topic would you like to research? ") 27 | 28 |     api_key = get_api_key(provider) 29 |     provider_config = get_provider_config(provider, config) 30 | 31 |     result = get_research(topic, config.prompts, provider, provider_config, api_key) 32 |     print_research_Result(result.data, result.usage()) 33 | 34 | 35 | if __name__ == "__main__": 36 |     main() 37 | -------------------------------------------------------------------------------- /src/examples/run_simple_agent_system.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example demonstrates how to run a simple agent system that consists of a manager 3 | agent, a research agent, and an analysis agent. The manager agent delegates research 4 | and analysis tasks to the corresponding agents and combines the results to provide a 5 | comprehensive answer to the user query. 6 | https://ai.pydantic.dev/multi-agent-applications/#agent-delegation 7 | """ 8 | 9 | from asyncio import run 10 | from os import path 11 | 12 | from openai import UnprocessableEntityError 13 | from pydantic_ai.common_tools.duckduckgo import duckduckgo_search_tool 14 | from pydantic_ai.exceptions import UnexpectedModelBehavior, UsageLimitExceeded 15 | from pydantic_ai.models.openai import OpenAIModel 16 | from pydantic_ai.usage import UsageLimits 17 | 18 | from .utils.agent_simple_system import SystemAgent, add_tools_to_manager_agent 19 | from .utils.data_models import AnalysisResult, ResearchResult 20 | from .utils.utils import create_model, get_api_key, get_provider_config, load_config 21 | 22 | CONFIG_FILE = "config.json" 23 | 24 | 25 | def get_models(model_config: dict) -> tuple[OpenAIModel, OpenAIModel, OpenAIModel]: 26 |     """Get the models for the system agents.""" 27 |     model_researcher = create_model(**model_config) 28 |     model_analyst = create_model(**model_config) 29 |     model_manager = create_model(**model_config) 30 |     return model_researcher, model_analyst, model_manager 31 | 32 | 33 | def get_manager( 34 |     model_manager: OpenAIModel, 35 |     model_researcher: OpenAIModel, 36 |     model_analyst: OpenAIModel, 37 |     prompts: dict[str, str], 38 | ) -> SystemAgent: 39 |     """Build the researcher, analyst, and manager agents and return the manager.""" 40 |     researcher = SystemAgent( 41 |         model_researcher, 42 |         ResearchResult, 43 |         prompts["system_prompt_researcher"], 44 |         [duckduckgo_search_tool()], 45 |     ) 46 |     analyst = SystemAgent( 47 |         model_analyst, AnalysisResult, prompts["system_prompt_analyst"] 48 |     ) 49 |     manager = SystemAgent( 50 |         model_manager, ResearchResult, prompts["system_prompt_manager"] 51 |     ) 52 |     add_tools_to_manager_agent(manager, researcher, analyst) 53 |     return manager 54 | 55 | 56 | async def main(): 57 |     """Main function to 
run the research system.""" 58 | 59 |     provider = input("Which inference provider to use? ") 60 |     query = input("What would you like to research? ") 61 | 62 |     config_path = path.join(path.dirname(__file__), CONFIG_FILE) 63 |     config = load_config(config_path) 64 | 65 |     api_key = get_api_key(provider) 66 |     provider_config = get_provider_config(provider, config) 67 |     usage_limits = UsageLimits(request_limit=10, total_tokens_limit=4000) 68 | 69 |     model_config = { 70 |         "base_url": provider_config["base_url"], 71 |         "model_name": provider_config["model_name"], 72 |         "api_key": api_key, 73 |         "provider": provider, 74 |     } 75 |     manager = get_manager(*get_models(model_config), config.prompts) 76 | 77 |     print(f"\nResearching: {query}...") 78 | 79 |     try: 80 |         result = await manager.run(query, usage_limits=usage_limits) 81 |     except (UnexpectedModelBehavior, UnprocessableEntityError) as e: 82 |         print(f"Error: Model returned unexpected result: {e}") 83 |     except UsageLimitExceeded as e: 84 |         print(f"Usage limit exceeded: {e}") 85 |     else: 86 |         print(f"\nFindings: {result.data.findings}") 87 |         print(f"Sources: {result.data.sources}") 88 |         print("\nUsage statistics:") 89 |         print(result.usage()) 90 | 91 | 92 | if __name__ == "__main__": 93 |     run(main()) 94 | -------------------------------------------------------------------------------- /src/examples/run_simple_agent_tools.py: -------------------------------------------------------------------------------- 1 | """Run the dice game agent using simple tools.""" 2 | 3 | from os import path 4 | 5 | from .utils.agent_simple_tools import get_dice 6 | from .utils.utils import ( 7 |     get_api_key, 8 |     get_provider_config, 9 |     load_config, 10 | ) 11 | 12 | CONFIG_FILE = "config.json" 13 | system_prompt = ( 14 |     "You're a dice game, you should roll the die and see if the number " 15 |     "you get back matches the user's guess. If so, tell them they're a winner. " 16 |     "Use the player's name in the response." 17 | ) 18 | 19 | 20 | def main(): 21 |     """Run the dice game agent.""" 22 | 23 |     provider = input("Which inference provider to use? ") 24 |     player_name = input("Enter your name: ") 25 |     guess = input("Guess a number between 1 and 6: ") 26 | 27 |     config_path = path.join(path.dirname(__file__), CONFIG_FILE) 28 |     config = load_config(config_path) 29 | 30 |     api_key = get_api_key(provider) 31 |     provider_config = get_provider_config(provider, config) 32 | 33 |     result = get_dice( 34 |         player_name, guess, system_prompt, provider, api_key, provider_config 35 |     ) 36 |     print(result.data) 37 |     print(f"{result._result_tool_name=}") 38 |     print(result.usage()) 39 | 40 | 41 | if __name__ == "__main__": 42 |     main() 43 | -------------------------------------------------------------------------------- /src/examples/utils/agent_simple_no_tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains a function to create a research agent with the specified model, 3 | result type, and system prompt. 
4 | """ 5 | 6 | from sys import exit 7 | 8 | from openai import APIConnectionError 9 | from pydantic_ai import Agent 10 | from pydantic_ai.agent import AgentRunResult 11 | from pydantic_ai.models.openai import OpenAIModel 12 | 13 | from .data_models import ResearchSummary 14 | from .utils import create_model 15 | 16 | 17 | def _create_research_agent( 18 |     model: OpenAIModel, result_type: type[ResearchSummary], system_prompt: str 19 | ) -> Agent: 20 |     """ 21 |     Create a research agent with the specified model, result type, and system prompt. 22 |     """ 23 | 24 |     return Agent(model=model, result_type=result_type, system_prompt=system_prompt) 25 | 26 | 27 | def get_research( 28 |     topic: str, 29 |     prompts: dict[str, str], 30 |     provider: str, 31 |     provider_config: dict[str, str], 32 |     api_key: str, 33 | ) -> AgentRunResult: 34 |     """Run the research agent to generate a structured summary of a research topic.""" 35 | 36 |     model = create_model( 37 |         provider_config["base_url"], provider_config["model_name"], api_key, provider 38 |     ) 39 |     agent = _create_research_agent(model, ResearchSummary, prompts["system_prompt"]) 40 | 41 |     print(f"\nResearching {topic}...") 42 |     try: 43 |         result = agent.run_sync(f"{prompts['user_prompt']} {topic}") 44 |     except APIConnectionError as e: 45 |         print(f"Error connecting to API: {e}") 46 |         exit() 47 |     except Exception as e: 48 |         print(f"Unexpected error: {e}") 49 |         exit() 50 |     else: 51 |         return result 52 | -------------------------------------------------------------------------------- /src/examples/utils/agent_simple_system.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains a simple system of agents that can be used to research and analyze 3 | data. 4 | """ 5 | 6 | from pydantic_ai import Agent, RunContext 7 | from pydantic_ai.models.openai import OpenAIModel 8 | 9 | from .data_models import AnalysisResult, ResearchResult 10 | 11 | 12 | class SystemAgent(Agent): 13 |     """A generic system agent that can be used to research and analyze data.""" 14 | 15 |     def __init__( 16 |         self, 17 |         model: OpenAIModel, 18 |         result_type: type[ResearchResult | AnalysisResult], 19 |         system_prompt: str, 20 |         result_retries: int = 3, 21 |         tools: list | None = None, 22 |     ): 23 |         super().__init__( 24 |             model, 25 |             result_type=result_type, 26 |             system_prompt=system_prompt, 27 |             result_retries=result_retries, 28 |             tools=tools or [], 29 |         ) 30 | 31 | 32 | def add_tools_to_manager_agent( 33 |     manager_agent: SystemAgent, research_agent: SystemAgent, analysis_agent: SystemAgent 34 | ) -> None: 35 |     """Attach research and analysis delegation tools to the manager agent.""" 36 | 37 |     @manager_agent.tool 38 |     async def delegate_research(ctx: RunContext[None], query: str) -> ResearchResult: 39 |         """Delegate research task to ResearchAgent.""" 40 |         result = await research_agent.run(query, usage=ctx.usage) 41 |         return result.data 42 | 43 |     @manager_agent.tool 44 |     async def delegate_analysis(ctx: RunContext[None], data: str) -> AnalysisResult: 45 |         """Delegate analysis task to AnalysisAgent.""" 46 |         result = await analysis_agent.run(data, usage=ctx.usage) 47 |         return result.data 48 | -------------------------------------------------------------------------------- /src/examples/utils/agent_simple_tools.py: -------------------------------------------------------------------------------- 1 | """Simple agent for the dice game example.""" 2 | 3 | from openai import APIConnectionError 4 | from pydantic_ai import Agent, Tool 5 | from pydantic_ai.agent import AgentRunResult 6 | from 
pydantic_ai.models.openai import OpenAIModel 7 | 8 | from .tools import get_player_name, roll_die 9 | from .utils import create_model 10 | 11 | 12 | class _DiceGameAgent(Agent): 13 |     """Dice game agent.""" 14 | 15 |     def __init__(self, model: OpenAIModel, system_prompt: str): 16 |         super().__init__( 17 |             model=model, 18 |             deps_type=str, 19 |             system_prompt=system_prompt, 20 |             tools=[  # (1)! 21 |                 Tool(roll_die, takes_ctx=False), 22 |                 Tool(get_player_name, takes_ctx=True), 23 |             ], 24 |         ) 25 | 26 | 27 | def get_dice( 28 |     player_name: str, 29 |     guess: str, 30 |     system_prompt: str, 31 |     provider: str, 32 |     api_key: str, 33 |     config: dict, 34 | ) -> AgentRunResult: 35 |     """Run the dice game agent.""" 36 | 37 |     model = create_model(config["base_url"], config["model_name"], api_key, provider) 38 |     agent = _DiceGameAgent(model, system_prompt) 39 | 40 |     try: 41 |         # usage_limits=UsageLimits(request_limit=5, total_tokens_limit=300), 42 |         result = agent.run_sync(f"Player is guessing {guess}...", deps=player_name) 43 |     except APIConnectionError as e: 44 |         print(f"Error connecting to API: {e}") 45 |         exit() 46 |     except Exception as e: 47 |         print(f"Unexpected error: {e}") 48 |         exit() 49 |     else: 50 |         return result 51 | -------------------------------------------------------------------------------- /src/examples/utils/data_models.py: -------------------------------------------------------------------------------- 1 | """Example of a module with data models""" 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class ResearchResult(BaseModel): 7 |     """Research results from the research agent.""" 8 | 9 |     topic: str 10 |     findings: list[str] 11 |     sources: list[str] 12 | 13 | 14 | class AnalysisResult(BaseModel): 15 |     """Analysis results from the analysis agent.""" 16 | 17 |     insights: list[str] 18 |     recommendations: list[str] 19 | 20 | 21 | class ResearchSummary(BaseModel): 22 |     """Expected model response of research on a topic""" 23 | 24 |     topic: str 25 |     key_points: list[str] 26 |     key_points_explanation: list[str] 27 |     conclusion: str 28 | 29 | 30 | class ProviderConfig(BaseModel): 31 |     """Configuration for a model provider""" 32 | 33 |     model_name: str 34 |     base_url: str 35 | 36 | 37 | class Config(BaseModel): 38 |     """Configuration settings for the research agent and model providers""" 39 | 40 |     providers: dict[str, ProviderConfig] 41 |     prompts: dict[str, str] 42 | -------------------------------------------------------------------------------- /src/examples/utils/tools.py: -------------------------------------------------------------------------------- 1 | """Example tools for the utils example.""" 2 | 3 | from random import randint 4 | 5 | from pydantic_ai import RunContext 6 | 7 | 8 | def roll_die() -> str: 9 |     """Tool to roll a die.""" 10 | 11 |     # Return the rolled value as a string so the agent can compare it with 12 |     # the player's guess. 13 |     return str(randint(1, 6)) 14 | 15 | 16 | def get_player_name(ctx: RunContext[str]) -> str: 17 |     """Get the player's name from the context.""" 18 |     return ctx.deps 19 | -------------------------------------------------------------------------------- /src/examples/utils/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for running the research agent example.""" 2 | 3 | from json import load 4 | from os import getenv 5 | from sys import exit 6 | 7 | from dotenv import load_dotenv 8 | from pydantic import ValidationError 9 | from pydantic_ai.models.openai import OpenAIModel 10 | from pydantic_ai.providers.openai 
import OpenAIProvider 11 | from pydantic_ai.usage import Usage 12 | 13 | from .data_models import Config, ResearchSummary 14 | 15 | API_SUFFIX = "_API_KEY" 16 | 17 | 18 | def load_config(config_path: str) -> Config: 19 |     """Load and validate configuration from a JSON file.""" 20 | 21 |     try: 22 |         with open(config_path) as file: 23 |             config_data = load(file) 24 |         config = Config.model_validate(config_data) 25 |     except FileNotFoundError: 26 |         raise FileNotFoundError(f"Configuration file not found: {config_path}") 27 | 28 |     except ValidationError as e: 29 |         raise ValueError(f"Invalid configuration format: {e}") from e 30 | 31 |     except Exception as e: 32 |         raise Exception(f"Error loading configuration: {e}") from e 33 | 34 |     else: 35 |         return config 36 | 37 | 38 | def get_api_key(provider: str) -> str | None: 39 |     """Retrieve API key from environment variable.""" 40 | 41 |     # TODO replace with pydantic-settings ? 42 |     load_dotenv() 43 | 44 |     if provider.lower() == "ollama": 45 |         return None 46 |     else: 47 |         return getenv(f"{provider.upper()}{API_SUFFIX}") 48 | 49 | 50 | def get_provider_config(provider: str, config: Config) -> dict[str, str]: 51 |     """Retrieve configuration settings for the specified provider.""" 52 | 53 |     try: 54 |         model_name = config.providers[provider].model_name 55 |         base_url = config.providers[provider].base_url 56 |     except KeyError as e: 57 |         raise ValueError(f"Missing configuration for {provider}: {e}.") from e 58 | 59 |     except Exception as e: 60 |         raise Exception(f"Error loading provider configuration: {e}") from e 61 | 62 |     else: 63 |         return { 64 |             "model_name": model_name, 65 |             "base_url": base_url, 66 |         } 67 | 68 | 69 | def create_model( 70 |     base_url: str, 71 |     model_name: str, 72 |     api_key: str | None = None, 73 |     provider: str | None = None, 74 | ) -> OpenAIModel: 75 |     """Create a model that uses base_url as inference API""" 76 | 77 |     if api_key is None and (provider is None or provider.lower() != "ollama"): 78 |         raise ValueError("API key is required for model.") 79 | 80 |     else: 81 |         return OpenAIModel( 82 |             model_name, provider=OpenAIProvider(base_url=base_url, api_key=api_key) 83 |         ) 84 | 85 | 86 | def print_research_Result(summary: ResearchSummary, usage: Usage) -> None: 87 |     """Output structured summary of the research topic.""" 88 | 89 |     print(f"\n=== Research Summary: {summary.topic} ===") 90 |     print("\nKey Points:") 91 |     for i, point in enumerate(summary.key_points, 1): 92 |         print(f"{i}. {point}") 93 |     print("\nKey Points Explanation:") 94 |     for i, point in enumerate(summary.key_points_explanation, 1): 95 |         print(f"{i}. 
{point}") 96 | print(f"\nConclusion: {summary.conclusion}") 97 | 98 | print(f"\nResponse structure: {list(dict(summary).keys())}") 99 | print(usage) 100 | -------------------------------------------------------------------------------- /src/gui/components/footer.py: -------------------------------------------------------------------------------- 1 | from streamlit import caption, divider 2 | 3 | 4 | def render_footer(footer_caption: str): 5 | """Render the page footer.""" 6 | divider() 7 | caption(footer_caption) 8 | -------------------------------------------------------------------------------- /src/gui/components/header.py: -------------------------------------------------------------------------------- 1 | from streamlit import divider, title 2 | 3 | 4 | def render_header(header_title: str): 5 | """Render the page header with title.""" 6 | title(header_title) 7 | divider() 8 | -------------------------------------------------------------------------------- /src/gui/components/output.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from streamlit import empty, info 4 | 5 | 6 | def render_output( 7 | result: Any = None, info_str: str | None = None, type: str | None = None 8 | ): 9 | """ 10 | Renders the output in a Streamlit app based on the provided type. 11 | 12 | Args: 13 | result (Any, optional): The content to be displayed. Can be JSON, code 14 | markdown, or plain text. 15 | info (str, optional): The information message to be displayed if result is None. 16 | type (str, optional): The type of the result content. Can be 'json', 'code', 17 | 'md', or other for plain text. 18 | 19 | Returns: 20 | Out: None 21 | """ 22 | 23 | if result: 24 | output_container = empty() 25 | output_container.write(result) 26 | # match type: 27 | # case "json": 28 | # json(result) 29 | # case "code": 30 | # code(result) 31 | # case "md": 32 | # markdown(result) 33 | # case _: 34 | # text(result) 35 | # # st.write(result) 36 | else: 37 | info(info_str) 38 | -------------------------------------------------------------------------------- /src/gui/components/prompts.py: -------------------------------------------------------------------------------- 1 | from streamlit import text_area 2 | 3 | 4 | def render_prompt_editor( 5 | prompt_name: str, prompt_value: str, height: int = 150 6 | ) -> str | None: 7 | return text_area( 8 | f"{prompt_name.replace('_', ' ').title()}", value=prompt_value, height=height 9 | ) 10 | -------------------------------------------------------------------------------- /src/gui/components/sidebar.py: -------------------------------------------------------------------------------- 1 | from streamlit import sidebar 2 | 3 | from gui.config.config import PAGES 4 | 5 | 6 | def render_sidebar(sidebar_title: str): 7 | sidebar.title(sidebar_title) 8 | selected_page = sidebar.radio(" ", PAGES) 9 | 10 | # st.sidebar.divider() 11 | # st.sidebar.info(" ") 12 | return selected_page 13 | -------------------------------------------------------------------------------- /src/gui/config/config.py: -------------------------------------------------------------------------------- 1 | APP_PATH = "app" 2 | PAGES = ["Home", "Settings", "Prompts", "App"] 3 | PROMPTS_DEFAULT = { 4 | "system_prompt_manager": ( 5 | "You are a manager overseeing research and analysis tasks..." 6 | ), 7 | "system_prompt_researcher": ("You are a researcher. Gather and analyze data..."), 8 | "system_prompt_analyst": ( 9 | "You are a research analyst. 
Use your analytical skills..." 10 | ), 11 | "system_prompt_synthesiser": ( 12 | "You are a research synthesiser. Use your analytical skills..." 13 | ), 14 | } 15 | -------------------------------------------------------------------------------- /src/gui/config/styling.py: -------------------------------------------------------------------------------- 1 | from streamlit import markdown, set_page_config 2 | 3 | 4 | def add_custom_styling(page_title: str): 5 | set_page_config( 6 | page_title=f"{page_title}", 7 | page_icon="🤖", 8 | layout="wide", 9 | initial_sidebar_state="expanded", 10 | ) 11 | 12 | custom_css = """ 13 | 19 | """ 20 | markdown(custom_css, unsafe_allow_html=True) 21 | -------------------------------------------------------------------------------- /src/gui/config/text.py: -------------------------------------------------------------------------------- 1 | HOME_INFO = "Select 'App' to start using the system" 2 | HOME_HEADER = "Welcome to the Multi-Agent Research System" 3 | HOME_DESCRIPTION = """ 4 | This system allows you to: 5 | 6 | - Run research queries using multiple specialized agents 7 | - Configure agent settings and prompts 8 | - View detailed results from your research 9 | 10 | Use the sidebar to navigate between different sections of the application. 11 | """ 12 | PAGE_TITLE = "MAS Eval 👾⚗️🧠💡" 13 | PROMPTS_WARNING = "No prompts found. Using default prompts." 14 | PROMPTS_HEADER = "Agent Prompts" 15 | RUN_APP_HEADER = "Run Research App" 16 | RUN_APP_QUERY_PLACEHOLDER = "What would you like to research?" 17 | RUN_APP_PROVIDER_PLACEHOLDER = "Provider?" 18 | RUN_APP_BUTTON = "Run Query" 19 | RUN_APP_OUTPUT_PLACEHOLDER = "Run the agent to see results here" 20 | RUN_APP_QUERY_WARNING = "Please enter a query" 21 | RUN_APP_QUERY_RUN_INFO = "Running query: " 22 | SETTINGS_HEADER = "Settings" 23 | SETTINGS_PROVIDER_LABEL = "Select Provider" 24 | SETTINGS_PROVIDER_PLACEHOLDER = "Select Provider" 25 | SETTINGS_ADD_PROVIDER = "Add New Provider" 26 | SETTINGS_API_KEY_LABEL = "API Key" 27 | OUTPUT_SUBHEADER = "Output" 28 | -------------------------------------------------------------------------------- /src/gui/pages/home.py: -------------------------------------------------------------------------------- 1 | from streamlit import header, info, markdown 2 | 3 | from gui.config.text import HOME_DESCRIPTION, HOME_HEADER, HOME_INFO 4 | 5 | 6 | def render_home(): 7 | header(HOME_HEADER) 8 | markdown(HOME_DESCRIPTION) 9 | info(HOME_INFO) 10 | -------------------------------------------------------------------------------- /src/gui/pages/prompts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Streamlit component for editing agent system prompts. 3 | 4 | This module provides a function to render and edit prompt configurations 5 | for agent roles using a Streamlit-based UI. It validates the input configuration, 6 | displays warnings if prompts are missing, and allows interactive editing of each prompt. 
7 | """ 8 | 9 | from pydantic import BaseModel 10 | from streamlit import error, header, warning 11 | 12 | from app.config.data_models import ChatConfig 13 | from app.utils.error_messages import invalid_type 14 | from app.utils.log import logger 15 | from gui.components.prompts import render_prompt_editor 16 | from gui.config.config import PROMPTS_DEFAULT 17 | from gui.config.text import PROMPTS_HEADER, PROMPTS_WARNING 18 | 19 | 20 | def render_prompts(chat_config: ChatConfig | BaseModel): # -> dict[str, str]: 21 | """ 22 | Render and edit the prompt configuration for agent roles in the Streamlit UI. 23 | """ 24 | 25 | header(PROMPTS_HEADER) 26 | 27 | if not isinstance(chat_config, ChatConfig): 28 | msg = invalid_type("ChatConfig", type(chat_config).__name__) 29 | logger.error(msg) 30 | error(msg) 31 | return None 32 | 33 | # updated = False 34 | prompts = chat_config.prompts 35 | 36 | if not prompts: 37 | warning(PROMPTS_WARNING) 38 | prompts = PROMPTS_DEFAULT 39 | 40 | updated_prompts = prompts.copy() 41 | 42 | # Edit prompts 43 | for prompt_key, prompt_value in prompts.items(): 44 | new_value = render_prompt_editor(prompt_key, prompt_value, height=200) 45 | if new_value != prompt_value and new_value is not None: 46 | updated_prompts[prompt_key] = new_value 47 | # updated = True 48 | 49 | # return updated_prompts if updated else prompts 50 | -------------------------------------------------------------------------------- /src/gui/pages/run_app.py: -------------------------------------------------------------------------------- 1 | """ 2 | Streamlit interface for running the agentic system interactively. 3 | 4 | This module defines the render_app function, which provides a Streamlit-based UI 5 | for users to select a provider, enter a query, and execute the main agent workflow. 6 | Results and errors are displayed in real time, supporting asynchronous execution. 7 | """ 8 | 9 | from streamlit import button, exception, header, info, subheader, text_input, warning 10 | 11 | from app.main import main 12 | from app.utils.log import logger 13 | from gui.components.output import render_output 14 | from gui.config.text import ( 15 | OUTPUT_SUBHEADER, 16 | RUN_APP_BUTTON, 17 | RUN_APP_HEADER, 18 | RUN_APP_OUTPUT_PLACEHOLDER, 19 | RUN_APP_PROVIDER_PLACEHOLDER, 20 | RUN_APP_QUERY_PLACEHOLDER, 21 | RUN_APP_QUERY_RUN_INFO, 22 | RUN_APP_QUERY_WARNING, 23 | ) 24 | 25 | 26 | async def render_app(provider: str | None = None): 27 | """ 28 | Render the main app interface for running agentic queries via Streamlit. 29 | 30 | Displays input fields for provider and query, a button to trigger execution, 31 | and an area for output or error messages. Handles async invocation of the 32 | main agent workflow and logs any exceptions. 
33 | """ 34 | 35 | header(RUN_APP_HEADER) 36 | if provider is None: 37 | provider = text_input(RUN_APP_PROVIDER_PLACEHOLDER) 38 | query = text_input(RUN_APP_QUERY_PLACEHOLDER) 39 | 40 | subheader(OUTPUT_SUBHEADER) 41 | if button(RUN_APP_BUTTON): 42 | if query: 43 | info(f"{RUN_APP_QUERY_RUN_INFO} {query}") 44 | try: 45 | result = await main(chat_provider=provider, query=query) 46 | render_output(result) 47 | except Exception as e: 48 | render_output(None) 49 | exception(e) 50 | logger.exception(e) 51 | else: 52 | warning(RUN_APP_QUERY_WARNING) 53 | else: 54 | render_output(RUN_APP_OUTPUT_PLACEHOLDER) 55 | -------------------------------------------------------------------------------- /src/gui/pages/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Streamlit settings UI for provider and agent configuration. 3 | 4 | This module provides a function to render and edit agent system settings, 5 | including provider selection and related options, within the Streamlit GUI. 6 | It validates the input configuration and ensures correct typing before rendering. 7 | """ 8 | 9 | from streamlit import error, header, selectbox 10 | 11 | from app.config.data_models import BaseModel, ChatConfig 12 | from app.utils.error_messages import invalid_type 13 | from app.utils.log import logger 14 | from gui.config.text import SETTINGS_HEADER, SETTINGS_PROVIDER_LABEL 15 | 16 | 17 | def render_settings(chat_config: ChatConfig | BaseModel) -> str: 18 | """ 19 | Render and edit agent system settings in the Streamlit UI. 20 | 21 | Displays a header and a selectbox for choosing the inference provider. 22 | Validates that the input is a ChatConfig instance and displays an error if not. 23 | """ 24 | header(SETTINGS_HEADER) 25 | 26 | # updated = False 27 | # updated_config = config.copy() 28 | 29 | if not isinstance(chat_config, ChatConfig): 30 | msg = invalid_type("ChatConfig", type(chat_config).__name__) 31 | logger.error(msg) 32 | error(msg) 33 | return msg 34 | 35 | provider = selectbox( 36 | label=SETTINGS_PROVIDER_LABEL, 37 | options=chat_config.providers.keys(), 38 | ) 39 | 40 | # Run options 41 | # col1, col2 = st.columns(2) 42 | # with col1: 43 | # streamed_output = st.checkbox( 44 | # "Stream Output", value=config.get("streamed_output", False) 45 | # ) 46 | # with col2: 47 | # st.checkbox("Include Sources", value=True) # include_sources 48 | 49 | # Allow adding new providers 50 | # new_provider = st.text_input("Add New Provider") 51 | # api_key = st.text_input(f"{provider} API Key", type="password") 52 | # if st.button("Add Provider") and new_provider and new_provider not in providers: 53 | # providers.append(new_provider) 54 | # updated_config["providers"] = providers 55 | # updated_config["api_key"] = api_key 56 | # updated = True 57 | # st.success(f"Added provider: {new_provider}") 58 | 59 | # # Update config if changed 60 | # if ( 61 | # include_a != config.get("include_a", False) 62 | # or include_b != config.get("include_b", False) 63 | # or streamed_output != config.get("streamed_output", False) 64 | # ): 65 | # updated_config["include_a"] = include_a 66 | # updated_config["include_b"] = include_b 67 | # updated_config["streamed_output"] = streamed_output 68 | # updated = True 69 | 70 | return provider 71 | -------------------------------------------------------------------------------- /src/run_gui.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module sets up and runs a Streamlit application for 
a Multi-Agent System. 3 | 4 | The application includes the following components: 5 | - Header 6 | - Sidebar for configuration options 7 | - Main content area for prompts 8 | - Footer 9 | 10 | The main function loads the configuration, renders the UI components, and handles the 11 | execution of the Multi-Agent System based on user input. 12 | 13 | Functions: 14 | - run_app(): Placeholder function to run the main application logic. 15 | - main(): Main function to set up and run the Streamlit application. 16 | """ 17 | 18 | from asyncio import run 19 | from pathlib import Path 20 | 21 | from app.config.config_app import CHAT_CONFIG_FILE, CHAT_DEFAULT_PROVIDER 22 | from app.config.data_models import ChatConfig 23 | from app.utils.load_configs import load_config 24 | from app.utils.log import logger 25 | from gui.components.sidebar import render_sidebar 26 | from gui.config.config import APP_PATH 27 | from gui.config.styling import add_custom_styling 28 | from gui.config.text import PAGE_TITLE 29 | from gui.pages.home import render_home 30 | from gui.pages.prompts import render_prompts 31 | from gui.pages.run_app import render_app 32 | from gui.pages.settings import render_settings 33 | 34 | # TODO create sidebar tabs, move settings to page, 35 | # set readme.md as home, separate prompts into page 36 | 37 | chat_config_pfile = Path(__file__).parent / APP_PATH / CHAT_CONFIG_FILE 38 | chat_config = load_config(chat_config_pfile, ChatConfig) 39 | provider = CHAT_DEFAULT_PROVIDER 40 | logger.info(f"Default provider: {CHAT_DEFAULT_PROVIDER}") 41 | 42 | 43 | async def main(): 44 | add_custom_styling(PAGE_TITLE) 45 | selected_page = render_sidebar(PAGE_TITLE) 46 | 47 | if selected_page == "Home": 48 | render_home() 49 | elif selected_page == "Settings": 50 | # TODO temp save settings to be used in gui 51 | provider = render_settings(chat_config) 52 | logger.info(f"Page 'Settings' provider: {provider}") 53 | elif selected_page == "Prompts": 54 | render_prompts(chat_config) 55 | elif selected_page == "App": 56 | logger.info(f"Page 'App' provider: {CHAT_DEFAULT_PROVIDER}") 57 | await render_app(CHAT_DEFAULT_PROVIDER) 58 | 59 | 60 | if __name__ == "__main__": 61 | run(main()) 62 | -------------------------------------------------------------------------------- /tests/test_agent_system.py: -------------------------------------------------------------------------------- 1 | from app.agents.agent_system import get_manager 2 | from app.config.data_models import ProviderConfig 3 | 4 | 5 | def test_get_manager_minimal(): 6 | provider = "github" 7 | provider_config = ProviderConfig.model_validate( 8 | {"model_name": "test-model", "base_url": "http://test.com"} 9 | ) 10 | api_key = "test" 11 | prompts = {"system_prompt_manager": "test"} 12 | agent = get_manager(provider, provider_config, api_key, prompts) 13 | assert hasattr(agent, "run") 14 | -------------------------------------------------------------------------------- /tests/test_env.py: -------------------------------------------------------------------------------- 1 | from pytest import MonkeyPatch 2 | 3 | from app.config.data_models import AppEnv 4 | 5 | 6 | def test_app_env_loads_env_vars(monkeypatch: MonkeyPatch): 7 | monkeypatch.setenv("GEMINI_API_KEY", "test-gemini") 8 | env = AppEnv() 9 | assert env.GEMINI_API_KEY == "test-gemini" 10 | -------------------------------------------------------------------------------- /tests/test_metrics_output_similarity.py: -------------------------------------------------------------------------------- 1 | """ 2 | 
Tests for the output_similarity metric. 3 | 4 | This module verifies that the output_similarity metric correctly identifies when 5 | an agent's output matches the expected answer. 6 | """ 7 | 8 | from app.evals.metrics import output_similarity 9 | 10 | 11 | def test_output_similarity_exact_match(): 12 |     assert output_similarity("42", "42") is True 13 | 14 | 15 | def test_output_similarity_whitespace(): 16 |     assert output_similarity("  answer  ", "answer") is True 17 | 18 | 19 | def test_output_similarity_incorrect(): 20 |     assert output_similarity("foo", "bar") is False 21 | -------------------------------------------------------------------------------- /tests/test_metrics_time_taken.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the time_taken metric. 3 | 4 | This module verifies that the time_taken metric correctly computes the elapsed 5 | time between two timestamps, ensuring accurate measurement of agent execution 6 | duration for evaluation purposes. 7 | """ 8 | 9 | import asyncio 10 | import time 11 | 12 | import pytest 13 | 14 | from app.evals.metrics import time_taken 15 | 16 | 17 | @pytest.mark.asyncio 18 | async def test_time_taken_metric(): 19 |     """Scenario: Calculate time taken for agent execution""" 20 | 21 |     # Given: Start and end timestamps 22 |     start_time = time.perf_counter() 23 |     await asyncio.sleep(0.1) 24 |     end_time = time.perf_counter() 25 | 26 |     # When: Calculating time taken 27 |     result = time_taken(start_time, end_time) 28 | 29 |     # Then: Verify correct duration calculation 30 |     assert result == pytest.approx(0.1, abs=0.05) 31 | -------------------------------------------------------------------------------- /tests/test_provider_config.py: -------------------------------------------------------------------------------- 1 | from pytest import MonkeyPatch 2 | 3 | from app.config.data_models import ProviderConfig 4 | 5 | 6 | def test_provider_config_parsing(monkeypatch: MonkeyPatch): 7 |     pcfg = ProviderConfig.model_validate( 8 |         {"model_name": "foo", "base_url": "https://foo.bar"} 9 |     ) 10 |     assert pcfg.model_name == "foo" 11 |     assert pcfg.base_url == "https://foo.bar" 12 | --------------------------------------------------------------------------------
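A minimal usage sketch (not a file in this repository) of how the parse_args helper from src/app/utils/utils.py could be wired into a command line entry point; the file name cli_sketch.py and the __main__ guard are illustrative assumptions, while the shown behaviour follows the implementation above.

from sys import argv

from app.utils.utils import parse_args

if __name__ == "__main__":
    # Hypothetical invocation (assumed file name cli_sketch.py):
    #   python cli_sketch.py --chat-provider=ollama --include-researcher
    # parse_args returns {'chat_provider': 'ollama', 'include_researcher': True};
    # unrecognized options are skipped, and passing --help prints the available
    # commands and returns an empty dictionary instead.
    args = parse_args(argv[1:])
    print(args)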