├── .claude ├── commands │ ├── execute-prp.md │ └── generate-prp.md └── settings.local.json ├── .devcontainer ├── setup_dev │ └── devcontainer.json ├── setup_dev_claude │ └── devcontainer.json └── setup_dev_ollama │ └── devcontainer.json ├── .env.example ├── .github ├── dependabot.yaml ├── scripts │ ├── create_pr.sh │ └── delete_branch_pr_tag.sh └── workflows │ ├── bump-my-version.yaml │ ├── codeql.yaml │ ├── generate-deploy-mkdocs-ghpages.yaml │ ├── links-fail-fast.yaml │ ├── pytest.yaml │ ├── ruff.yaml │ ├── summarize-jobs-reusable.yaml │ └── write-llms-txt.yaml ├── .gitignore ├── .gitmessage ├── .streamlit └── config.toml ├── .vscode ├── extensions.json └── settings.json ├── AGENTS.md ├── CHANGELOG.md ├── CLAUDE.md ├── Dockerfile ├── LICENSE.md ├── Makefile ├── README.md ├── assets └── images │ ├── c4-multi-agent-system.png │ ├── customer-journey-activity-dark.png │ ├── customer-journey-activity-light.png │ └── metrics-eval-sweep.png ├── context └── PRPs │ ├── coordination_quality.md │ ├── features │ ├── coordination_quality.md │ └── tool_efficiency.md │ ├── templates │ ├── feature_base.md │ └── prp_base.md │ └── tool_efficiency.md ├── docs ├── PRD.md ├── SprintPlan.md ├── UserStory.md ├── architecture │ ├── c4-multi-agent-system.plantuml │ ├── customer-journey-activity-dark │ ├── customer-journey-activity-light.plantuml │ └── metrics-eval-sweep.plantuml └── llms.txt ├── mkdocs.yaml ├── pyproject.toml ├── src ├── app │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── agent_system.py │ │ └── llm_model_funs.py │ ├── config │ │ ├── __init__.py │ │ ├── config_app.py │ │ ├── config_chat.json │ │ ├── config_eval.json │ │ └── data_models.py │ ├── evals │ │ ├── __init__.py │ │ └── metrics.py │ ├── main.py │ ├── py.typed │ └── utils │ │ ├── __init__.py │ │ ├── error_messages.py │ │ ├── load_configs.py │ │ ├── load_settings.py │ │ ├── log.py │ │ ├── login.py │ │ └── utils.py ├── examples │ ├── config.json │ ├── run_simple_agent_no_tools.py │ ├── run_simple_agent_system.py │ ├── run_simple_agent_tools.py │ └── utils │ │ ├── agent_simple_no_tools.py │ │ ├── agent_simple_system.py │ │ ├── agent_simple_tools.py │ │ ├── data_models.py │ │ ├── tools.py │ │ └── utils.py ├── gui │ ├── components │ │ ├── footer.py │ │ ├── header.py │ │ ├── output.py │ │ ├── prompts.py │ │ └── sidebar.py │ ├── config │ │ ├── config.py │ │ ├── styling.py │ │ └── text.py │ └── pages │ │ ├── home.py │ │ ├── prompts.py │ │ ├── run_app.py │ │ └── settings.py └── run_gui.py ├── tests ├── test_agent_system.py ├── test_env.py ├── test_metrics_output_similarity.py ├── test_metrics_time_taken.py └── test_provider_config.py └── uv.lock /.claude/commands/execute-prp.md: -------------------------------------------------------------------------------- 1 | # Execute Product Requirements Prompt (PRP) 2 | 3 | Implement a feature using using the PRP file. 4 | 5 | ## PRP File: $ARGUMENTS 6 | 7 | ## Execution Process 8 | 9 | 1. **Load PRP** 10 | - Read the specified PRP file 11 | - Understand all context and requirements 12 | - Follow all instructions in the PRP and extend the research if needed 13 | - Ensure you have all needed context to implement the PRP fully 14 | - Do more web searches and codebase exploration as needed 15 | 16 | 2. **ULTRATHINK** 17 | - Think hard before you execute the plan. Create a comprehensive plan addressing all requirements. 18 | - Break down complex tasks into smaller, manageable steps using your todos tools. 19 | - Use the TodoWrite tool to create and track your implementation plan. 
20 | - Identify implementation patterns from existing code to follow. 21 | 22 | 3. **Execute the plan** 23 | - Execute the PRP 24 | - Implement all the code 25 | 26 | 4. **Validate** 27 | - Run each validation command 28 | - Fix any failures 29 | - Re-run until all pass 30 | 31 | 5. **Complete** 32 | - Ensure all checklist items done 33 | - Run final validation suite 34 | - Report completion status 35 | - Read the PRP again to ensure you have implemented everything 36 | 37 | 6. **Reference the PRP** 38 | - You can always reference the PRP again if needed 39 | 40 | Note: If validation fails, use error patterns in the PRP to fix and retry. 41 | -------------------------------------------------------------------------------- /.claude/commands/generate-prp.md: -------------------------------------------------------------------------------- 1 | # Create Product Requirements Prompt (PRP) 2 | 3 | ## Feature file: $ARGUMENTS 4 | 5 | Generate a complete PRP (Product Requirements Prompt) for general feature implementation with thorough research. Ensure context is passed to the AI agent to enable self-validation and iterative refinement. Read the feature file first to understand what needs to be created, how the examples provided help, and any other considerations. 6 | 7 | The AI agent only gets the context you are appending to the PRP and training data. Assume the AI agent has access to the codebase and the same knowledge cutoff as you, so it's important that your research findings are included or referenced in the PRP. The agent has web-search capabilities, so pass URLs to documentation and examples. 8 | 9 | - Use `/context/PRPs` as `$base_path` 10 | - Extract only the filename from `$ARGUMENTS` into `$file_name` 11 | 12 | ## Research Process 13 | 14 | 1. **Codebase Analysis** 15 | - Search for similar features/patterns in the codebase 16 | - Identify files to reference in PRP 17 | - Note existing conventions to follow 18 | - Check test patterns for validation approach 19 | 20 | 2. **External Research** 21 | - Search for similar features/patterns online 22 | - Library documentation (include specific URLs) 23 | - Implementation examples (GitHub/StackOverflow/blogs) 24 | - Best practices and common pitfalls 25 | 26 | 3. **User Clarification** (if needed) 27 | - Specific patterns to mirror and where to find them? 28 | - Integration requirements and where to find them? 
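For reference, the repository's Makefile wraps this command (and its counterpart `execute-prp`) behind make targets, so a typical invocation with one of the feature files under `context/PRPs/features/` looks roughly like this:

```bash
# Generate a PRP from a feature definition (resolved against /context/PRPs/features)
make prp_gen_claude ARGS="tool_efficiency.md"

# Execute the generated PRP afterwards
make prp_exe_claude ARGS="tool_efficiency.md"
```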
29 | 30 | ## PRP Generation 31 | 32 | - Use `${base_path}/templates/prp_base.md` as the template 33 | 34 | ### Critical Context to Include and Pass to the AI Agent as Part of the PRP 35 | 36 | - **Documentation**: URLs with specific sections 37 | - **Code Examples**: Real snippets from codebase 38 | - **Gotchas**: Library quirks, version issues 39 | - **Patterns**: Existing approaches to follow 40 | 41 | ### Implementation Blueprint 42 | 43 | - Start with pseudocode showing approach 44 | - Reference real files for patterns 45 | - Include error handling strategy 46 | - List the tasks required to fulfill the PRP, in the order they should be completed 47 | 48 | ### Validation Gates (Must Be Executable), e.g. for Python 49 | 50 | ```bash 51 | # Syntax/Style 52 | make ruff 53 | make type_check 54 | 55 | # Unit Tests 56 | make coverage_all 57 | ``` 58 | 59 | ***CRITICAL: AFTER YOU ARE DONE RESEARCHING AND EXPLORING THE CODEBASE AND BEFORE YOU START WRITING THE PRP*** 60 | 61 | ***ULTRATHINK ABOUT THE PRP AND PLAN YOUR APPROACH, THEN START WRITING THE PRP*** 62 | 63 | ## Output 64 | 65 | - Save the result to `${base_path}/${file_name}` 66 | 67 | ## Quality Checklist 68 | 69 | - [ ] All necessary context included 70 | - [ ] Validation gates are executable by AI 71 | - [ ] References existing patterns 72 | - [ ] Clear implementation path 73 | - [ ] Error handling documented 74 | 75 | Score the PRP on a scale of 1-10 (confidence level that Claude Code will succeed in a one-pass implementation). 76 | 77 | Remember: The goal is one-pass implementation success through comprehensive context. 78 | -------------------------------------------------------------------------------- /.claude/settings.local.json: -------------------------------------------------------------------------------- 1 | { 2 | "permissions": { 3 | "allow": [ 4 | "Bash(cat:*)", 5 | "Bash(find:*)", 6 | "Bash(git:diff*)", 7 | "Bash(git:status*)", 8 | "Bash(grep:*)", 9 | "Bash(ls:*)", 10 | "Bash(mkdir:*)", 11 | "Bash(source:*)", 12 | "Bash(touch:*)", 13 | "Bash(tree:*)", 14 | "Bash(uv run:*)", 15 | "Edit(AGENTS.md)", 16 | "Edit(docs/**/*.md)", 17 | "Edit(src/**/*.py)", 18 | "Edit(src/**/*.json)", 19 | "Edit(tests/**/*.py)", 20 | "WebFetch(domain:docs.anthropic.com)" 21 | ], 22 | "deny": [ 23 | "Bash(mv:*)", 24 | "Bash(rm:*)" 25 | ] 26 | } 27 | } -------------------------------------------------------------------------------- /.devcontainer/setup_dev/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "make setup_dev", 3 | "image": "mcr.microsoft.com/vscode/devcontainers/python:3.13", 4 | "postCreateCommand": "make setup_dev" 5 | } -------------------------------------------------------------------------------- /.devcontainer/setup_dev_claude/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "make setup_dev_claude", 3 | "image": "mcr.microsoft.com/vscode/devcontainers/python:3.13", 4 | "features": { 5 | "ghcr.io/devcontainers/features/node:1": {} 6 | }, 7 | "customizations": { 8 | "vscode": { 9 | "extensions": [ 10 | "anthropic.claude-code" 11 | ] 12 | } 13 | }, 14 | "postCreateCommand": "make setup_dev_claude" 15 | } -------------------------------------------------------------------------------- /.devcontainer/setup_dev_ollama/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "make setup_dev_ollama", 3 | "image": 
"mcr.microsoft.com/vscode/devcontainers/python:3.13", 4 | "postCreateCommand": "make setup_dev_ollama" 5 | } -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # inference EP 2 | ANTHROPIC_API_KEY="sk-abc-xyz" 3 | GEMINI_API_KEY="xyz" 4 | GITHUB_API_KEY="ghp_xyz" 5 | GROK_API_KEY="xai-xyz" 6 | HUGGINGFACE_API_KEY="hf_xyz" 7 | OPENROUTER_API_KEY="sk-or-v1-xyz" 8 | PERPLEXITY_API_KEY="" 9 | RESTACK_API_KEY="xyz" 10 | TOGETHER_API_KEY="xyz" 11 | 12 | # tools 13 | TAVILY_API_KEY="" 14 | 15 | # log/mon/trace 16 | AGENTOPS_API_KEY="x-y-z-x-y" 17 | LOGFIRE_API_KEY="pylf_v1_xx_y" # LOGFIRE_TOKEN 18 | WANDB_API_KEY="xyz" 19 | 20 | # eval 21 | -------------------------------------------------------------------------------- /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | version: 2 4 | updates: 5 | - package-ecosystem: "pip" 6 | directory: "/" 7 | schedule: 8 | interval: "weekly" 9 | ... 10 | -------------------------------------------------------------------------------- /.github/scripts/create_pr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 1 base ref, 2 target ref, 3 title suffix 3 | # 4 current version, 5 bumped 4 | 5 | pr_title="PR $2 $3" 6 | pr_body="PR automatically created from \`$1\` to bump from \`$4\` to \`$5\` on \`$2\`. Tag \`v$5\` will be created and has to be deleted manually if PR gets closed without merge." 7 | 8 | gh pr create \ 9 | --base $1 \ 10 | --head $2 \ 11 | --title "${pr_title}" \ 12 | --body "${pr_body}" 13 | # --label "bump" 14 | -------------------------------------------------------------------------------- /.github/scripts/delete_branch_pr_tag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 1 repo, 2 target ref, 3 current version 3 | 4 | tag_to_delete="v$3" 5 | branch_del_api_call="repos/$1/git/refs/heads/$2" 6 | del_msg="'$2' force deletion attempted." 7 | close_msg="Closing PR '$2' to rollback after failure" 8 | 9 | echo "Tag $tag_to_delete for $del_msg" 10 | git tag -d "$tag_to_delete" 11 | echo "PR for $del_msg" 12 | gh pr close "$2" --comment "$close_msg" 13 | echo "Branch $del_msg" 14 | gh api "$branch_del_api_call" -X DELETE && \ 15 | echo "Branch without error return deleted." -------------------------------------------------------------------------------- /.github/workflows/bump-my-version.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: bump-my-version 3 | 4 | on: 5 | # pull_request: 6 | # types: [closed] 7 | # branches: [main] 8 | workflow_dispatch: 9 | inputs: 10 | bump_type: 11 | description: '[major|minor|patch]' 12 | required: true 13 | default: 'patch' 14 | type: choice 15 | options: 16 | - 'major' 17 | - 'minor' 18 | - 'patch' 19 | 20 | env: 21 | BRANCH_NEW: "bump-${{ github.run_number }}-${{ github.ref_name }}" 22 | SKIP_PR_HINT: "[skip ci bump]" 23 | SCRIPT_PATH: ".github/scripts" 24 | 25 | jobs: 26 | bump_my_version: 27 | # TODO bug? currently resulting in: Unrecognized named-value: 'env'. 
28 | # https://stackoverflow.com/questions/61238849/github-actions-if-contains-function-not-working-with-env-variable/61240761 29 | # if: !contains( 30 | # github.event.pull_request.title, 31 | # ${{ env.SKIP_PR_HINT }} 32 | # ) 33 | # TODO check for PR closed by bot to avoid PR creation loop 34 | # github.actor != 'github-actions' 35 | if: > 36 | github.event_name == 'workflow_dispatch' || 37 | ( github.event.pull_request.merged == true && 38 | github.event.pull_request.closed_by != 'github-actions' ) 39 | runs-on: ubuntu-latest 40 | outputs: 41 | branch_new: ${{ steps.create_branch.outputs.branch_new }} 42 | summary_data: ${{ steps.set_summary.outputs.summary_data }} 43 | permissions: 44 | actions: read 45 | checks: write 46 | contents: write 47 | pull-requests: write 48 | steps: 49 | 50 | - name: Checkout repo 51 | uses: actions/checkout@v4 52 | with: 53 | fetch-depth: 1 54 | 55 | - name: Set git cfg and create branch 56 | id: create_branch 57 | run: | 58 | git config user.email "bumped@qte77.gha" 59 | git config user.name "bump-my-version" 60 | git checkout -b "${{ env.BRANCH_NEW }}" 61 | echo "branch_new=${{ env.BRANCH_NEW }}" >> $GITHUB_OUTPUT 62 | 63 | - name: Bump version 64 | id: bump 65 | uses: callowayproject/bump-my-version@0.29.0 66 | env: 67 | BUMPVERSION_TAG: "true" 68 | with: 69 | args: ${{ inputs.bump_type }} 70 | branch: ${{ env.BRANCH_NEW }} 71 | 72 | - name: "Create PR '${{ env.BRANCH_NEW }}'" 73 | if: steps.bump.outputs.bumped == 'true' 74 | env: 75 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 76 | run: | 77 | src="${{ env.SCRIPT_PATH }}/create_pr.sh" 78 | chmod +x "$src" 79 | $src "${{ github.ref_name }}" "${{ env.BRANCH_NEW }}" "${{ env.SKIP_PR_HINT }}" "${{ steps.bump.outputs.previous-version }}" "${{ steps.bump.outputs.current-version }}" 80 | 81 | - name: Delete branch, PR and tag in case of failure or cancel 82 | if: failure() || cancelled() 83 | env: 84 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 85 | run: | 86 | src="${{ env.SCRIPT_PATH }}/delete_branch_pr_tag.sh" 87 | chmod +x "$src" 88 | $src "${{ github.repository }}" "${{ env.BRANCH_NEW }}" "${{ steps.bump.outputs.current-version }}" 89 | 90 | - name: Set summary data 91 | id: set_summary 92 | if: ${{ always() }} 93 | run: echo "summary_data=${GITHUB_STEP_SUMMARY}" >> $GITHUB_OUTPUT 94 | 95 | generate_summary: 96 | name: Generate Summary Report 97 | if: ${{ always() }} 98 | needs: bump_my_version 99 | uses: ./.github/workflows/summarize-jobs-reusable.yaml 100 | with: 101 | branch_to_summarize: ${{ needs.bump_my_version.outputs.branch_new }} 102 | summary_data: ${{ needs.bump_my_version.outputs.summary_data }} 103 | ... 
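Because the pull-request trigger above is commented out, this workflow is normally started by hand. Assuming an authenticated GitHub CLI, a manual dispatch might look like this:

```bash
# Trigger a patch-level bump via workflow_dispatch
gh workflow run bump-my-version.yaml -f bump_type=patch

# Follow the most recent run until it finishes
gh run watch
```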
104 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # https://github.blog/changelog/2023-01-18-code-scanning-codeql-action-v1-is-now-deprecated/ 3 | name: "CodeQL" 4 | 5 | on: 6 | push: 7 | pull_request: 8 | types: [closed] 9 | branches: [ main ] 10 | schedule: 11 | - cron: '27 11 * * 0' 12 | workflow_dispatch: 13 | 14 | jobs: 15 | analyze: 16 | name: Analyze 17 | runs-on: ubuntu-latest 18 | permissions: 19 | actions: read 20 | contents: read 21 | security-events: write 22 | 23 | steps: 24 | - name: Checkout repository 25 | uses: actions/checkout@v4 26 | 27 | - name: Initialize CodeQL 28 | uses: github/codeql-action/init@v3 29 | with: 30 | languages: python 31 | 32 | - name: Autobuild 33 | uses: github/codeql-action/autobuild@v3 34 | # if autobuild fails 35 | #- run: | 36 | # make bootstrap 37 | # make release 38 | 39 | - name: Perform CodeQL Analysis 40 | uses: github/codeql-action/analyze@v3 41 | #- name: sarif 42 | # uses: github/codeql-action/upload-sarif@v2 43 | ... 44 | -------------------------------------------------------------------------------- /.github/workflows/generate-deploy-mkdocs-ghpages.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Deploy Docs 3 | 4 | on: 5 | pull_request: 6 | types: [closed] 7 | branches: [main] 8 | workflow_dispatch: 9 | 10 | env: 11 | DOCSTRINGS_FILE: "docstrings.md" 12 | DOC_DIR: "docs" 13 | SRC_DIR: "src" 14 | SITE_DIR: "site" 15 | IMG_DIR: "assets/images" 16 | 17 | jobs: 18 | build-and-deploy: 19 | runs-on: ubuntu-latest 20 | permissions: 21 | contents: read 22 | pages: write 23 | id-token: write 24 | environment: 25 | name: github-pages 26 | steps: 27 | 28 | - name: Checkout the repository 29 | uses: actions/checkout@v4.0.0 30 | with: 31 | ref: 32 | ${{ 33 | github.event.pull_request.merged == true && 34 | 'main' || 35 | github.ref_name 36 | }} 37 | fetch-depth: 0 38 | 39 | - uses: actions/configure-pages@v5.0.0 40 | 41 | # caching instead of actions/cache@v4.0.0 42 | # https://docs.astral.sh/uv/guides/integration/github/#caching 43 | - name: Install uv with cache dependency glob 44 | uses: astral-sh/setup-uv@v5.0.0 45 | with: 46 | enable-cache: true 47 | cache-dependency-glob: "uv.lock" 48 | 49 | # setup python from pyproject.toml using uv 50 | # instead of using actions/setup-python@v5.0.0 51 | # https://docs.astral.sh/uv/guides/integration/github/#setting-up-python 52 | - name: "Set up Python" 53 | run: uv python install 54 | 55 | - name: Install only doc deps 56 | run: uv sync --only-group docs # --frozen 57 | 58 | - name: Get repo info and stream into mkdocs.yaml 59 | id: repo_info 60 | run: | 61 | REPO_INFO=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ 62 | -H "Accept: application/vnd.github.v3+json" \ 63 | https://api.github.com/repos/${{ github.repository }}) 64 | REPO_URL="${{ github.server_url }}/${{ github.repository }}" 65 | REPO_URL=$(echo ${REPO_URL} | sed 's|/|\\/|g') 66 | SITE_NAME=$(sed '1!d' README.md | sed '0,/# /{s/# //}') 67 | SITE_DESC=$(echo $REPO_INFO | jq -r .description) 68 | sed -i "s//${REPO_URL}/g" mkdocs.yaml 69 | sed -i "s//${SITE_NAME}/g" mkdocs.yaml 70 | sed -i "s//${SITE_DESC}/g" mkdocs.yaml 71 | 72 | - name: Copy text files to be included 73 | run: | 74 | CFG_PATH="src/app/config" 75 | mkdir -p "${DOC_DIR}/${CFG_PATH}" 76 | cp README.md "${DOC_DIR}/index.md" 77 | cp 
{CHANGELOG,LICENSE}.md "${DOC_DIR}" 78 | # Auxiliary files 79 | cp .env.example "${DOC_DIR}" 80 | cp "${CFG_PATH}/config_chat.json" "${DOC_DIR}/${CFG_PATH}" 81 | 82 | - name: Generate code docstrings concat file 83 | run: | 84 | PREFIX="::: " 85 | find "${SRC_DIR}" -type f -name "*.py" \ 86 | -type f -not -name "__*__*" -printf "%P\n" | \ 87 | sed 's/\//./g' | sed 's/\.py$//' | \ 88 | sed "s/^/${PREFIX}/" | sort > \ 89 | "${DOC_DIR}/${DOCSTRINGS_FILE}" 90 | 91 | - name: Build documentation 92 | run: uv run --locked --only-group docs mkdocs build 93 | 94 | - name: Copy image files to be included 95 | run: | 96 | # copy images, mkdocs does not by default 97 | # mkdocs also overwrites pre-made directories 98 | dir="${{ env.SITE_DIR }}/${{ env.IMG_DIR }}" 99 | if [ -d "${{ env.IMG_DIR }}" ]; then 100 | mkdir -p "${dir}" 101 | cp "${{ env.IMG_DIR }}"/* "${dir}" 102 | fi 103 | 104 | # - name: Push to gh-pages 105 | # run: uv run mkdocs gh-deploy --force 106 | 107 | - name: Upload artifact 108 | uses: actions/upload-pages-artifact@v3.0.0 109 | with: 110 | path: "${{ env.SITE_DIR }}" 111 | 112 | - name: Deploy to GitHub Pages 113 | id: deployment 114 | uses: actions/deploy-pages@v4.0.0 115 | ... 116 | -------------------------------------------------------------------------------- /.github/workflows/links-fail-fast.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # https://github.com/lycheeverse/lychee-action 3 | # https://github.com/marketplace/actions/lychee-broken-link-checker 4 | name: "Link Checker" 5 | 6 | on: 7 | workflow_dispatch: 8 | push: 9 | branches-ignore: [main] 10 | pull_request: 11 | types: [closed] 12 | branches: [main] 13 | schedule: 14 | - cron: "00 00 * * 0" 15 | 16 | jobs: 17 | linkChecker: 18 | runs-on: ubuntu-latest 19 | permissions: 20 | issues: write 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Link Checker 26 | id: lychee 27 | uses: lycheeverse/lychee-action@v2 28 | 29 | - name: Create Issue From File 30 | if: steps.lychee.outputs.exit_code != 0 31 | uses: peter-evans/create-issue-from-file@v5 32 | with: 33 | title: lychee Link Checker Report 34 | content-filepath: ./lychee/out.md 35 | labels: report, automated issue 36 | ... 37 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yaml: -------------------------------------------------------------------------------- 1 | name: pytest 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | test: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v4 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: '3.12' 17 | 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install pytest 22 | 23 | - name: Run tests 24 | run: pytest 25 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # https://github.com/astral-sh/ruff-action 3 | # https://github.com/astral-sh/ruff 4 | name: ruff 5 | on: 6 | push: 7 | pull_request: 8 | types: [closed] 9 | branches: [main] 10 | schedule: 11 | - cron: "0 0 * * 0" 12 | workflow_dispatch: 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | - uses: astral-sh/ruff-action@v3 19 | ... 
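The same checks can be run locally before pushing; the `make ruff` recipe further down in the Makefile is roughly equivalent to:

```bash
uv run ruff format
uv run ruff check --fix
```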
20 | -------------------------------------------------------------------------------- /.github/workflows/summarize-jobs-reusable.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # https://ecanarys.com/supercharging-github-actions-with-job-summaries-and-pull-request-comments/ 3 | # FIXME currently bug in gha summaries ? $GITHUB_STEP_SUMMARY files are empty 4 | # https://github.com/orgs/community/discussions/110283 5 | # https://github.com/orgs/community/discussions/67991 6 | # Possible workaround 7 | # echo ${{ fromJSON(step).name }}" >> $GITHUB_STEP_SUMMARY 8 | # echo ${{ fromJSON(step).outcome }}" >> $GITHUB_STEP_SUMMARY 9 | # echo ${{ fromJSON(step).conclusion }}" 10 | 11 | name: Summarize workflow jobs 12 | 13 | on: 14 | workflow_call: 15 | outputs: 16 | summary: 17 | description: "Outputs summaries of jobs in a workflow" 18 | value: ${{ jobs.generate_summary.outputs.summary }} 19 | inputs: 20 | branch_to_summarize: 21 | required: false 22 | default: 'main' 23 | type: string 24 | summary_data: 25 | required: false 26 | type: string 27 | 28 | jobs: 29 | generate_summary: 30 | name: Generate Summary 31 | runs-on: ubuntu-latest 32 | permissions: 33 | contents: read 34 | actions: read 35 | checks: read 36 | pull-requests: none 37 | outputs: 38 | summary: ${{ steps.add_changed_files.outputs.summary }} 39 | steps: 40 | 41 | - name: Add general information 42 | id: general_info 43 | run: | 44 | echo "# Job Summaries" >> $GITHUB_STEP_SUMMARY 45 | echo "Job: `${{ github.job }}`" >> $GITHUB_STEP_SUMMARY 46 | echo "Date: $(date +'%Y-%m-%d %H:%M:%S')" >> $GITHUB_STEP_SUMMARY 47 | 48 | - name: Add step states 49 | id: step_states 50 | run: | 51 | echo "### Steps:" >> $GITHUB_STEP_SUMMARY 52 | # loop summary_data if valid json 53 | if jq -e . >/dev/null 2>&1 <<< "${{ inputs.summary_data }}"; then 54 | jq -r ' 55 | .steps[] 56 | | select(.conclusion != null) 57 | | "- **\(.name)**: \( 58 | if .conclusion == "success" then ":white_check_mark:" 59 | elif .conclusion == "failure" then ":x:" 60 | else ":warning:" end 61 | )" 62 | ' <<< "${{ inputs.summary_data }}" >> $GITHUB_STEP_SUMMARY 63 | else 64 | echo "Invalid JSON in summary data." >> $GITHUB_STEP_SUMMARY 65 | fi 66 | 67 | - name: Checkout repo 68 | uses: actions/checkout@v4 69 | with: 70 | ref: "${{ inputs.branch_to_summarize }}" 71 | fetch-depth: 0 72 | 73 | - name: Add changed files since last push 74 | id: add_changed_files 75 | run: | 76 | # Get the tags 77 | # Use disabled lines to get last two commits 78 | # current=$(git show -s --format=%ci HEAD) 79 | # previous=$(git show -s --format=%ci HEAD~1) 80 | # git diff --name-only HEAD^ HEAD >> $GITHUB_STEP_SUMMARY 81 | version_tag_regex="^v[0-9]+\.[0-9]+\.[0-9]+$" # v0.0.0 82 | tags=$(git tag --sort=-version:refname | \ 83 | grep -E "${version_tag_regex}" || echo "") 84 | 85 | # Get latest and previous tags 86 | latest_tag=$(echo "${tags}" | head -n 1) 87 | previous_tag=$(echo "${tags}" | head -n 2 | tail -n 1) 88 | 89 | echo "tags: latest '${latest_tag}', previous '${previous_tag}'" 90 | 91 | # Write to summary 92 | error_msg="No files to output. 
Tag not found:" 93 | echo ${{ steps.step_states.outputs.summary }} >> $GITHUB_STEP_SUMMARY 94 | echo "## Changed files on '${{ inputs.branch_to_summarize }}'" >> $GITHUB_STEP_SUMMARY 95 | 96 | if [ -z "${latest_tag}" ]; then 97 | echo "${error_msg} latest" >> $GITHUB_STEP_SUMMARY 98 | elif [ -z "${previous_tag}" ]; then 99 | echo "${error_msg} previous" >> $GITHUB_STEP_SUMMARY 100 | elif [ "${latest_tag}" == "${previous_tag}" ]; then 101 | echo "Latest and previous tags are the same: '${latest_tag}'" >> $GITHUB_STEP_SUMMARY 102 | else 103 | # Get commit dates and hashes 104 | latest_date=$(git log -1 --format=%ci $latest_tag) 105 | previous_date=$(git log -1 --format=%ci $previous_tag) 106 | current_hash=$(git rev-parse --short $latest_tag) 107 | previous_hash=$(git rev-parse --short $previous_tag) 108 | 109 | # Append summary to the job summary 110 | echo "Latest Tag Commit: '${latest_tag}' (${current_hash}) ${latest_date}" >> $GITHUB_STEP_SUMMARY 111 | echo "Previous Tag Commit: '${previous_tag}' (${previous_hash}) ${previous_date}" >> $GITHUB_STEP_SUMMARY 112 | echo "Files changed:" >> $GITHUB_STEP_SUMMARY 113 | echo '```' >> $GITHUB_STEP_SUMMARY 114 | git diff --name-only $previous_tag..$latest_tag >> $GITHUB_STEP_SUMMARY 115 | echo '```' >> $GITHUB_STEP_SUMMARY 116 | fi 117 | 118 | - name: Output error message in case of failure or cancel 119 | if: failure() || cancelled() 120 | run: | 121 | if [ "${{ job.status }}" == "cancelled" ]; then 122 | out_msg="## Workflow was cancelled" 123 | else 124 | out_msg="## Error in previous step" 125 | fi 126 | echo $out_msg >> $GITHUB_STEP_SUMMARY 127 | ... -------------------------------------------------------------------------------- /.github/workflows/write-llms-txt.yaml: -------------------------------------------------------------------------------- 1 | # TODO use local installation of repo to text 2 | # https://github.com/itsitgroup/repo2txt 3 | name: Write repo llms.txt 4 | 5 | on: 6 | push: 7 | branches: [main] 8 | workflow_dispatch: 9 | inputs: 10 | LLMS_TXT_PATH: 11 | description: 'Path to the directory to save llsm.txt' 12 | required: true 13 | default: 'docs' 14 | type: string 15 | LLMS_TXT_NAME: 16 | description: 'Path to the directory to save llsm.txt' 17 | required: true 18 | default: 'llms.txt' 19 | type: string 20 | CONVERTER_URL: 21 | description: '[uithub|gittodoc]' # |repo2txt 22 | required: true 23 | default: 'uithub.com' 24 | type: choice 25 | options: 26 | - 'uithub.com' 27 | - 'gittodoc.com' 28 | # - 'repo2txt.com' 29 | 30 | jobs: 31 | generate-file: 32 | runs-on: ubuntu-latest 33 | 34 | steps: 35 | - name: Checkout repo 36 | uses: actions/checkout@v4 37 | 38 | - name: Construct and create llms.txt path 39 | id: construct_and_create_llms_txt_path 40 | run: | 41 | LLMS_TXT_PATH="${{ inputs.LLMS_TXT_PATH }}" 42 | LLMS_TXT_PATH="${LLMS_TXT_PATH:-docs}" 43 | LLMS_TXT_NAME="${{ inputs.LLMS_TXT_NAME }}" 44 | LLMS_TXT_NAME="${LLMS_TXT_NAME:-llms.txt}" 45 | echo "LLMS_TXT_FULL=${LLMS_TXT_PATH}/${LLMS_TXT_NAME}" >> $GITHUB_OUTPUT 46 | mkdir -p "${LLMS_TXT_PATH}" 47 | 48 | - name: Fetch TXT from URL 49 | run: | 50 | LLMS_TXT_FULL=${{ steps.construct_and_create_llms_txt_path.outputs.LLMS_TXT_FULL }} 51 | URL="https://${{ inputs.CONVERTER_URL }}/${{ github.repository }}" 52 | echo "Fetching content from: ${URL}" 53 | echo "Saving content to: ${LLMS_TXT_FULL}" 54 | curl -s "${URL}" > "${LLMS_TXT_FULL}" 55 | 56 | - name: Commit and push file 57 | run: | 58 | LLMS_TXT_FULL=${{ steps.construct_and_create_llms_txt_path.outputs.LLMS_TXT_FULL 
}} 59 | commit_msg="feat(docs): Add/Update ${LLMS_TXT_FULL}, a flattened repo as single text file, inspired by [llmstxt.org](https://llmstxt.org/)." 60 | git config user.name "github-actions" 61 | git config user.email "github-actions@github.com" 62 | git add "${LLMS_TXT_FULL}" 63 | git commit -m "${commit_msg}" 64 | git push 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python bytecode 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # environment 6 | .venv/ 7 | *.env 8 | unset_env.sh 9 | 10 | # Distribution / packaging 11 | build/ 12 | dist/ 13 | *.egg-info/ 14 | 15 | # Testing 16 | .pytest_cache/ 17 | .coverage 18 | 19 | # Logs 20 | *.log 21 | /logs 22 | 23 | # Traces 24 | scalene-profiles 25 | profile.html 26 | profile.json 27 | 28 | # OS generated files 29 | .DS_Store 30 | Thumbs.db 31 | 32 | # IDE specific files (adjust as needed) 33 | # .vscode/ 34 | # .idea/ 35 | 36 | # mkdocs 37 | reference/ 38 | site/ 39 | 40 | # linting 41 | .ruff_cache 42 | 43 | # type checking 44 | .mypy_cache/ 45 | 46 | # project specific 47 | wandb/ 48 | -------------------------------------------------------------------------------- /.gitmessage: -------------------------------------------------------------------------------- 1 | #<--- 72 characters ---------------------------------------------------> 2 | # 3 | # Conventional Commits, semantic commit messages for humans and machines 4 | # https://www.conventionalcommits.org/en/v1.0.0/ 5 | # Lint your conventional commits 6 | # https://github.com/conventional-changelog/commitlint/tree/master/%40 \ 7 | # commitlint/config-conventional 8 | # Common types can be (based on Angular convention) 9 | # build, chore, ci, docs, feat, fix, perf, refactor, revert, style, test 10 | # https://github.com/conventional-changelog/commitlint/tree/master/%40 11 | # Footer 12 | # https://git-scm.com/docs/git-interpret-trailers 13 | # 14 | #<--- pattern ---------------------------------------------------------> 15 | # 16 | # [(Scope)][!]: \ 17 | # 18 | # short description: [()]: 19 | # 20 | # ! 
after scope in header indicates breaking change 21 | # 22 | # [optional body] 23 | # 24 | # - with bullets points 25 | # 26 | # [optional footer(s)] 27 | # 28 | # [BREAKING CHANGE:, Refs:, Resolves:, Addresses:, Reviewed by:] 29 | # 30 | #<--- usage -----------------------------------------------------------> 31 | # 32 | # Set locally (in the repository) 33 | # `git config commit.template .gitmessage` 34 | # 35 | # Set globally 36 | # `git config --global commit.template .gitmessage` 37 | # 38 | #<--- 72 characters ---------------------------------------------------> -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | primaryColor="#f92aad" 3 | backgroundColor="#0b0c10" 4 | secondaryBackgroundColor="#1f2833" 5 | textColor="#66fcf1" 6 | font="monospace" 7 | 8 | [server] 9 | # enableCORS = false 10 | enableXsrfProtection = true 11 | 12 | [browser] 13 | gatherUsageStats = false 14 | 15 | [client] 16 | # toolbarMode = "minimal" 17 | showErrorDetails = true 18 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "charliermarsh.ruff", 4 | "davidanson.vscode-markdownlint", 5 | "donjayamanne.githistory", 6 | "editorconfig.editorconfig", 7 | "gruntfuggly.todo-tree", 8 | "mhutchie.git-graph", 9 | "PKief.material-icon-theme", 10 | "redhat.vscode-yaml", 11 | "tamasfe.even-better-toml", 12 | "yzhang.markdown-all-in-one", 13 | 14 | "github.copilot", 15 | "github.copilot-chat", 16 | "github.vscode-github-actions", 17 | "ms-azuretools.vscode-docker", 18 | "ms-python.debugpy", 19 | "ms-python.python", 20 | "ms-python.vscode-pylance", 21 | "ms-vscode.makefile-tools", 22 | ] 23 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.lineNumbers": "on", 3 | "editor.wordWrap": "on", 4 | "explorer.confirmDelete": true, 5 | "files.autoSave": "onFocusChange", 6 | "git.autofetch": true, 7 | "git.enableSmartCommit": true, 8 | "makefile.configureOnOpen": false, 9 | "markdownlint.config": { 10 | "MD024": false, 11 | "MD033": false 12 | }, 13 | "python.analysis.extraPaths": ["./venv/lib/python3.13/site-packages"], 14 | "python.defaultInterpreterPath": "./.venv/bin/python", 15 | "python.analysis.typeCheckingMode": "strict", 16 | "python.analysis.diagnosticSeverityOverrides": { 17 | "reportMissingTypeStubs": "none", 18 | "reportUnknownMemberType": "none", 19 | "reportUnknownVariableType": "none" 20 | }, 21 | "redhat.telemetry.enabled": false 22 | } -------------------------------------------------------------------------------- /AGENTS.md: -------------------------------------------------------------------------------- 1 | # Agent instructions for `Agents-eval` repository 2 | 3 | As proposed by [agentsmd.net](https://agentsmd.net/) and used by [wandb weave AGENTS.md](https://github.com/wandb/weave/blob/master/AGENTS.md). 4 | 5 | ## Core Rules & AI Behavior 6 | 7 | * When you learn something new about the codebase or introduce a new concept, **update this file (`AGENTS.md`)** to reflect the new knowledge. This is YOUR FILE! It should grow and evolve with you. 
8 | * If something doesn't make sense architecturally, from a developer experience standpoint, or product-wise, please add it to the **`Requests to Humans`** section below. 9 | * Always follow the established coding patterns, conventions, and architectural decisions documented here and in the `docs/` directory. 10 | * **Never assume missing context.** Ask questions if you are uncertain about requirements or implementation details. 11 | * **Never hallucinate libraries or functions.** Only use known, verified Python packages listed in `pyproject.toml`. 12 | * **Always confirm file paths and module names** exist before referencing them in code or tests. 13 | * **Never delete or overwrite existing code** unless explicitly instructed to or as part of a documented refactoring task. 14 | 15 | ## Architecture Overview 16 | 17 | This is a multi-agent evaluation system for assessing agentic AI systems. The project uses **PydanticAI** as the core framework for agent orchestration and is designed for evaluation purposes, not for production agent deployment. 18 | 19 | ### Data Flow 20 | 21 | 1. User input → Manager Agent 22 | 2. Manager delegates to Researcher Agent (with DuckDuckGo search) 23 | 3. Researcher results → Analyst Agent for validation 24 | 4. Validated data → Synthesizer Agent for report generation 25 | 5. Results evaluated using configurable metrics 26 | 27 | ### Key Dependencies 28 | 29 | * **PydanticAI**: Agent framework and orchestration 30 | * **uv**: Fast Python dependency management 31 | * **Streamlit**: GUI framework 32 | * **Ruff**: Code formatting and linting 33 | * **MyPy**: Static type checking 34 | 35 | ## Codebase Structure & Modularity 36 | 37 | ### Main Components 38 | 39 | * `src/app/`: The core application logic. This is where most of your work will be. 40 | * `main.py`: The main entry point for the CLI application. 41 | * `agents/agent_system.py`: Defines the multi-agent system, their interactions, and orchestration. **This is the central logic for agent behavior.** 42 | * `config/data_models.py`: Contains all **Pydantic** models that define the data contracts. This is a critical file for understanding data flow. 43 | * `config/config_chat.json`: Holds provider settings and system prompts for agents. 44 | * `config/config_eval.json`: Defines evaluation metrics and their weights. 45 | * `evals/metrics.py`: Implements the evaluation metrics. 46 | * `src/gui/`: Contains the source code for the Streamlit GUI. 47 | * `docs/`: Contains project documentation, including the Product Requirements Document (`PRD.md`) and the C4 architecture model. 48 | * `tests/`: Contains all tests for the project, written using **pytest**. 49 | 50 | ### Code Organization Rules 51 | 52 | * **Never create a file longer than 500 lines of code.** If a file approaches this limit, refactor by splitting it into smaller, more focused modules or helper files. 53 | * Organize code into clearly separated modules grouped by feature. 54 | * Use clear, consistent, and absolute imports within packages. 55 | 56 | ## Development Commands & Environment 57 | 58 | ### Environment Setup 59 | 60 | The project requirements are stated in `pyproject.toml`. Your development environment should be set up automatically using the provided `Makefile`, which configures the virtual environment. 61 | 62 | * `make setup_dev`: Install all dev dependencies. 63 | * `make setup_dev_claude`: Setup dev environment with Claude Code CLI. 64 | * `make setup_dev_ollama`: Setup dev environment with Ollama local LLM. 
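To make the data flow from the Architecture Overview above concrete, the sketch below shows the delegation chain with PydanticAI. It is a minimal illustration, not the project's actual implementation: the model string, prompts, and wiring are assumptions, and the real orchestration (including the manager agent, usage limits, and retries) lives in `src/app/agents/agent_system.py` with settings from `config_chat.json`.

```python
from pydantic import BaseModel
from pydantic_ai import Agent


class ResearchResult(BaseModel):
    """Illustrative data contract; the real models live in src/app/config/data_models.py."""

    summary: str
    sources: list[str]


# Illustrative agents: the real researcher also carries a DuckDuckGo search
# tool, and providers/prompts come from src/app/config/config_chat.json.
researcher = Agent(
    "openai:gpt-4o",  # assumed provider/model string, not the project default
    output_type=ResearchResult,  # `result_type` on older PydanticAI releases
    system_prompt="Research the query and cite sources.",
)
analyst = Agent(
    "openai:gpt-4o",
    system_prompt="Validate the research findings for accuracy.",
)
synthesizer = Agent(
    "openai:gpt-4o",
    system_prompt="Write a concise report from the validated findings.",
)


def run_pipeline(query: str) -> str:
    """Manager-style orchestration: research -> analysis -> synthesis."""
    research = researcher.run_sync(query)  # `.output` is `.data` on older releases
    analysis = analyst.run_sync(f"Validate these findings: {research.output}")
    report = synthesizer.run_sync(f"Summarize for the user: {analysis.output}")
    return report.output
```

A real run additionally needs a provider API key from `.env` (see `.env.example`).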
65 | 66 | ### Running the Application 67 | 68 | * `make run_cli`: Run the CLI application. 69 | * `make run_cli ARGS="--help"`: Run CLI with specific arguments. 70 | * `make run_gui`: Run the Streamlit GUI. 71 | 72 | ### Testing and Code Quality 73 | 74 | * `make test_all`: Run all tests with pytest. 75 | * `make coverage_all`: Run tests and generate a coverage report. 76 | * `make ruff`: Format code and fix linting issues with Ruff. 77 | * `make type_check`: Run mypy static type checking on `src/app/`. 78 | 79 | ## Testing & Reliability 80 | 81 | * **Always create Pytest unit tests** for new features (functions, classes, etc.). 82 | * Tests must live in the `tests/` folder, mirroring the `src/app` structure. 83 | * After updating any logic, check whether existing unit tests need to be updated. If so, do it. 84 | * For each new feature, include at least: 85 | * 1 test for the expected use case (happy path). 86 | * 1 test for a known edge case. 87 | * 1 test for an expected failure case (e.g., invalid input). 88 | * **To run a specific test file or function, use `uv run pytest` directly:** 89 | * `uv run pytest tests/test_specific_file.py` 90 | * `uv run pytest tests/test_specific_file.py::test_function` 91 | 92 | ## Style, Patterns & Documentation 93 | 94 | ### Coding Style 95 | 96 | * **Use Pydantic** models in `src/app/config/data_models.py` for all data validation and data contracts. **Always use or update these models** when modifying data flows. 97 | * Use the predefined error message functions from `src/app/utils/error_messages.py` for consistency. 98 | * When writing complex logic, **add an inline `# Reason:` comment** explaining the *why*, not just the *what*. 99 | * Comment non-obvious code to ensure it is understandable to a mid-level developer. 100 | 101 | ### Documentation 102 | 103 | * Write **docstrings for every function, class, and method** using the Google style format. This is critical as the documentation site is built automatically from docstrings. 104 | 105 | ```python 106 | def example_function(param1: int) -> str: 107 | """A brief summary of the function. 108 | 109 | Args: 110 | param1 (int): A description of the first parameter. 111 | 112 | Returns: 113 | str: A description of the return value. 114 | """ 115 | return "example" 116 | ``` 117 | 118 | * Update this `AGENTS.md` file when introducing new patterns or concepts. 119 | * Document significant architectural decisions in `docs/ADR.md`. 120 | * Document all significant changes, features, and bug fixes in `docs/CHANGELOG.md`. 121 | 122 | ## Code Review & PR Guidelines 123 | 124 | ### PR Requirements 125 | 126 | * **Title Format**: Commit messages and PR titles must follow the **Conventional Commits** specification, as outlined in the `.gitmessage` template. 127 | * Provide detailed PR summaries including the purpose of the changes and the testing performed. 128 | 129 | ### Pre-commit Checklist 130 | 131 | 1. Run the linter and formatter: `make ruff`. 132 | 2. Ensure all tests pass: `make test_all`. 133 | 3. Ensure static type checks pass: `make type_check`. 134 | 4. Update documentation as described below. 135 | 136 | ## Requests to Humans 137 | 138 | This section contains a list of questions, clarifications, or tasks that AI agents wish to have humans complete or elaborate on. 139 | 140 | * [ ] The `agent_system.py` module has a `NotImplementedError` for streaming with Pydantic model outputs. Please clarify the intended approach for streaming structured data. 
141 | * [ ] The `llm_model_funs.py` module has `NotImplementedError` for the Gemini and HuggingFace providers. Please provide the correct implementation or remove them if they are not supported. 142 | * [ ] The `agent_system.py` module contains a `FIXME` note regarding the use of a try-catch context manager. Please review and implement the intended error handling. 143 | * [ ] Add TypeScript testing guidelines (if a TypeScript frontend is planned for the future). 144 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## Guiding Principles 9 | 10 | - Changelogs are for humans, not machines. 11 | - There should be an entry for every single version. 12 | - The same types of changes should be grouped. 13 | - Versions and sections should be linkable. 14 | - The latest version comes first. 15 | - The release date of each version is displayed. 16 | - Mention whether you follow Semantic Versioning. 17 | 18 | ## Types of changes 19 | 20 | - `Added` for new features. 21 | - `Changed` for changes in existing functionality. 22 | - `Deprecated` for soon-to-be removed features. 23 | - `Removed` for now removed features. 24 | - `Fixed` for any bug fixes. 25 | - `Security` in case of vulnerabilities. 26 | 27 | ## [Unreleased] 28 | 29 | ### Added 30 | 31 | - Claude code functionality, commands and settings 32 | 33 | ## [1.1.0] - 2025-07-05 34 | 35 | ### Added 36 | 37 | - Makefile command and devcontainer.json for claude code usage 38 | 39 | ### Changed 40 | 41 | - Moved streamlit_gui and examples to /src 42 | - Moved app to /src/app 43 | 44 | ## [1.0.0] - 2025-03-18 45 | 46 | ### 2025-03-18 47 | 48 | - refactor(agent,streamlit): Convert main and run_manager functions again to async for streamli output 49 | - fix(prompts): Update system prompts for manager,researcher and synthesiser roles to remove complexity 50 | - chore(workflows): Update action versions in GitHub workflows for consistency 51 | - chore(workflows): Update action versions for deploy docs to pgh-pages 52 | - docs(deps): Add documentation dependencies for MkDocs and related plugins to pyproject.toml 53 | 54 | ### 2025-03-17 55 | 56 | - feat(main,agent): refactor entry point to support async execution and enhance login handling 57 | - feat(cli,login,log): refactor entry point to integrate Typer, enhance logging, added login every run 58 | - feat(streamlit): replace load_config with load_app_config, enhance sidebar rendering, and improve output rendering with type support 59 | - feat(streamlit): enhance render_output function with detailed docstring and improve query handling in run_app 60 | - feat(streamlit): enhance render_output function with additional info parameter and improve output handling in run_app 61 | - feat(streamlit,app): add Typer dependency, update main entry point for async execution, add streamlit provider input 62 | - feat(agent): update configuration and improve agent system setup with enhanced error handling and new environment variables 63 | - feat(config,login,catch): add inference settings with usage limits and result retries, enhance login function to initialize environment and handle exceptions, comment out raise in 
error handling context to prevent unintended crashes 64 | - feat(login,catch): integrate logfire configuration in login function and improve error handling context 65 | 66 | ### 2025-03-16 67 | 68 | - feta(devconatiner): Refactor devcontainer setup: remove old configurations and add new setup targets for development and Ollama 69 | - feat(devcontainer): Changed from vscode to astral-sh devcontainer 70 | - feat(devcontainer): Changed to vscode container, added postcreatecommand make setup_env 71 | - feat(devcontainer): restructure environment setup with new devcontainer configurations 72 | - feat(devcontainer): update environment names for clarity in devcontainer configurations 73 | - refactor(agent): Added AgentConfig class for better agent configuration management, Refactored main function for streamlined agent initialization. 74 | - feat(config,agents): Update model providers and enhance configuration management, examples: Added new model providers: Gemini and OpenRouter, src: Enabled streaming responses in the agent system 75 | - chore: Remove unused prompt files, update configuration, and enhance logging setup 76 | - refactor(exception,logfire): Enhance error handling and update model configurations in agent system 77 | 78 | ### 2025-03-14 79 | 80 | - feat(scalene): Add profiling support and update dependencies 81 | - refactor(Makefile): Improve target descriptions and organization 82 | 83 | ### 2025-03-13 84 | 85 | - refactor(API,except): .env.example, add OpenRouter configuration, enhance error handling in run_simple_agent_system.py, and update ModelConfig to allow optional API key. 86 | - feat(streamlit): add Streamlit app structure with header, footer, sidebar, and main content components 87 | - feat(streamlit): enhance Streamlit app with detailed docstrings, improved header/footer, and refined main content layout 88 | - feat(makefile,streamlit): update Makefile commands for CLI and GUI execution, and modify README for usage instructions, add streamlit config.toml 89 | - feat(streamlit): restructure Streamlit app by removing unused components, adding new header, footer, sidebar, and output components, and updating configuration settings 90 | - chore: replace app entrypoint with main, remove unused tools and tests, and update makefile for linting and type checking 91 | - chore: Enhance makefile with coverage and help commands, update mkdocs.yaml and pyproject.toml for improved project structure and documentation 92 | - test: Update makefile for coverage reporting, modify pyproject.toml to include pytest-cov, and adjust dependency settings 93 | - test: Add coverage support with pytest-cov and update makefile for coverage reporting 94 | - test: makefile for coverage reporting, update dependencies in pyproject.toml for improved testing and coverage support 95 | - chore: Remove redundant help command from makefile 96 | - refactor(agent,async): Refactor agent tests to use async fixtures and update verification methods for async results 97 | - fix(Dockerfile): Remove unnecessary user creation and pip install commands from Dockerfile 98 | - feat(agent): Update dependencies and add new example structures; remove obsolete files 99 | - chore(structure): simplified agents.py 100 | - fix(pyproject): Replace pydantic-ai with pydantic-ai-slim and update dependencies 101 | - feat(examples): add new examples and data models; update configuration structure 102 | - feat(agent): update dependencies, enhance examples, and introduce new data models for research and analysis agents 103 | - 
feat(examples): enhance prompts structure and refactor research agent integration 104 | - feat(examples): improve documentation and enhance error handling in agent examples 105 | - feat(agent): Added data models and configuration for research and analysis agents, Added System C4 plantuml 106 | - feat(weave,dependencies): update dependencies and integrate Weave for enhanced functionality in the agent system 107 | - feat(agent): initialize agentops with API key and default tags for enhanced agent functionality 108 | - feat(agent): integrate logfire for logging and configure initial logging settings 109 | - feat(agent): adjust usage limits for ollama provider to enhance performance 110 | - feat(agent): refine system prompts and enhance data model structure for improved agent interactions 111 | - feat(agent): update system prompts for improved clarity and accuracy; add example environment configuration 112 | - feat(agent): enhance agent system with synthesiser functionality and update prompts for improved coordination 113 | - feat(agent): add Grok and Gemini API configurations; initialize logging and agent operations 114 | - feat(agent): improve documentation and refactor model configuration handling for agent system 115 | - feat(agent): update environment configuration, enhance logging, and refine agent management functionality 116 | - feat(agent): refactor login handling, update model retrieval, and enhance agent configuration 117 | 118 | ## [0.0.2] - 2025-01-20 119 | 120 | ### Added 121 | 122 | - PRD.md 123 | - C4 architecture diagrams: system context, code 124 | - tests: basic agent evals, config.json 125 | 126 | ### Changed 127 | 128 | - make recipes 129 | 130 | ## [0.0.1] - 2025-01-20 131 | 132 | ### Added 133 | 134 | - Makefile: setup, test, ruff 135 | - devcontainer: python only, w/o Jetbrains clutter from default devcontainer 136 | - ollama: server and model download successful 137 | - agent: tools use full run red 138 | - pytest: e2e runm final result red 139 | - Readme: basic project info 140 | - pyproject.toml 141 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | @AGENTS.md -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG APP_ROOT="/src" 2 | ARG PYTHON_VERSION="3.12" 3 | ARG USER="appuser" 4 | 5 | 6 | # Stage 1: Builder Image 7 | FROM python:${PYTHON_VERSION}-slim AS builder 8 | LABEL author="qte77" 9 | LABEL builder=true 10 | ENV PYTHONDONTWRITEBYTECODE=1 \ 11 | PYTHONUNBUFFERED=1 12 | COPY pyproject.toml uv.lock / 13 | RUN set -xe \ 14 | && pip install --no-cache-dir uv \ 15 | && uv sync --frozen 16 | 17 | 18 | # Stage 2: Runtime Image 19 | FROM python:${PYTHON_VERSION}-slim AS runtime 20 | LABEL author="qte77" 21 | LABEL runtime=true 22 | 23 | ARG APP_ROOT 24 | ARG USER 25 | ENV PYTHONDONTWRITEBYTECODE=1 \ 26 | PYTHONUNBUFFERED=1 \ 27 | PYTHONPATH=${APP_ROOT} \ 28 | PATH="${APP_ROOT}:${PATH}" 29 | # WANDB_KEY=${WANDB_KEY} \ 30 | # WANDB_DISABLE_CODE=true 31 | 32 | USER ${USER} 33 | WORKDIR ${APP_ROOT} 34 | COPY --from=builder /.venv .venv 35 | COPY --chown=${USER}:${USER} ${APP_ROOT} . 36 | 37 | CMD [ \ 38 | "uv", "run", \ 39 | "--locked", "--no-sync", \ 40 | "python", "-m", "." 
\ 41 | ] 42 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # BSD 3-Clause License 2 | 3 | Copyright (c) 2025 qte77 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # This Makefile automates the build, test, and clean processes for the project. 2 | # It provides a convenient way to run common tasks using the 'make' command. 3 | # It is designed to work with the 'uv' tool for managing Python environments and dependencies. 4 | # Run `make help` to see all available recipes. 5 | 6 | .SILENT: 7 | .ONESHELL: 8 | .PHONY: all setup_prod setup_dev setup_prod_ollama setup_dev_ollama setup_dev_claude setup_claude_code setup_ollama start_ollama stop_ollama clean_ollama ruff run_cli run_gui run_profile prp_gen_claude prp_exe_claude test_all coverage_all type_check output_unset_app_env_sh help 9 | # .DEFAULT: setup_dev_ollama 10 | .DEFAULT_GOAL := setup_dev_ollama 11 | 12 | SRC_PATH := src 13 | APP_PATH := $(SRC_PATH)/app 14 | GUI_PATH_ST := $(SRC_PATH)/run_gui.py 15 | CHAT_CFG_FILE := $(APP_PATH)/config_chat.json 16 | OLLAMA_SETUP_URL := https://ollama.com/install.sh 17 | OLLAMA_MODEL_NAME := $$(jq -r '.providers.ollama.model_name' $(CHAT_CFG_FILE)) 18 | PRP_DEF_PATH := /context/PRPs/features 19 | PRP_CLAUDE_GEN_CMD := generate-prp 20 | PRP_CLAUDE_EXE_CMD := execute-prp 21 | 22 | # construct the full path to the PRP definition file 23 | define CLAUDE_PRP_RUNNER 24 | echo "Starting Claude Code PRP runner ..." 25 | # 1. Extract arguments and validate that they are not empty. 26 | prp_file=$(firstword $(strip $(1))) 27 | cmd_prp=$(firstword $(strip $(2))) 28 | if [ -z "$${prp_file}" ]; then 29 | echo "Error: ARGS for PRP filename is empty. Please provide a PRP filename." 
30 | exit 1 31 | fi 32 | if [ -z "$${cmd_prp}" ]; then 33 | echo "Error: ARGS for command is empty. Please provide a command." 34 | exit 2 35 | fi 36 | cmd_prp="/project:$${cmd_prp} $(PRP_DEF_PATH)/$${prp_file}" 37 | cmd_cost="/cost" 38 | echo "Executing command '$${cmd_prp}' ..." 39 | claude -p "$${cmd_prp}" 2>&1 40 | claude -p "$${cmd_cost}" 2>&1 41 | endef 42 | 43 | setup_prod: ## Install uv and prod dependencies 44 | echo "Setting up prod environment ..." 45 | pip install uv -q 46 | uv sync --frozen 47 | 48 | setup_dev: ## Install uv and all dev dependencies 49 | echo "Setting up dev environment ..." 50 | pip install uv -q 51 | uv sync --all-groups 52 | 53 | setup_prod_ollama: ## Install prod deps, download Ollama and start the server 54 | $(MAKE) -s setup_prod 55 | $(MAKE) -s setup_ollama 56 | $(MAKE) -s start_ollama 57 | 58 | setup_dev_ollama: ## Install dev deps, download Ollama and start the server 59 | $(MAKE) -s setup_dev 60 | $(MAKE) -s setup_ollama 61 | $(MAKE) -s start_ollama 62 | 63 | setup_dev_claude: ## Install dev deps and set up Claude Code CLI 64 | $(MAKE) -s setup_dev 65 | $(MAKE) -s setup_claude_code 66 | 67 | setup_claude_code: ## Set up Claude Code CLI; Node.js and npm have to be present 68 | echo "Setting up Claude Code ..." 69 | npm install -g @anthropic-ai/claude-code 70 | claude config set --global preferredNotifChannel terminal_bell 71 | echo "npm version: $$(npm --version)" 72 | claude --version 73 | 74 | # Ollama BINDIR in /usr/local/bin /usr/bin /bin 75 | setup_ollama: ## Download Ollama and pull the model; the install script may also start the server 76 | echo "Downloading Ollama binary... Using '$(OLLAMA_SETUP_URL)'." 77 | # script does start server but not consistently 78 | curl -fsSL $(OLLAMA_SETUP_URL) | sh 79 | echo "Pulling model '$(OLLAMA_MODEL_NAME)' ..." 80 | ollama pull $(OLLAMA_MODEL_NAME) 81 | 82 | start_ollama: ## Start local Ollama server, default 127.0.0.1:11434 83 | ollama serve 84 | 85 | stop_ollama: ## Stop local Ollama server 86 | echo "Stopping Ollama server..." 87 | pkill ollama 88 | 89 | clean_ollama: ## Remove local Ollama from system 90 | echo "Searching for Ollama binary..." 91 | for BINDIR in /usr/local/bin /usr/bin /bin; do 92 | if echo $$PATH | grep -q $$BINDIR; then 93 | echo "Ollama binary found in '$$BINDIR'" 94 | BIN="$$BINDIR/ollama" 95 | break 96 | fi 97 | done 98 | echo "Cleaning up..." 99 | rm -f "$$BIN" 100 | 101 | ruff: ## Lint: Format and check with ruff 102 | uv run ruff format 103 | uv run ruff check --fix 104 | 105 | run_cli: ## Run app on CLI only 106 | path=$$(echo "$(APP_PATH)" | tr '/' '.') 107 | uv run python -m $${path}.main $(ARGS) 108 | 109 | run_gui: ## Run app with Streamlit GUI 110 | uv run streamlit run $(GUI_PATH_ST) 111 | 112 | run_profile: ## Profile app with scalene 113 | uv run scalene --outfile \ 114 | "$(APP_PATH)/scalene-profiles/profile-$$(date +%Y%m%d-%H%M%S)" \ 115 | "$(APP_PATH)/main.py" 116 | 117 | prp_gen_claude: ## Generate the PRP from the file passed in ARGS 118 | $(call CLAUDE_PRP_RUNNER, $(ARGS), $(PRP_CLAUDE_GEN_CMD)) 119 | 120 | prp_exe_claude: ## Execute the PRP from the file passed in ARGS 121 | $(call CLAUDE_PRP_RUNNER, $(ARGS), $(PRP_CLAUDE_EXE_CMD)) 122 | 123 | test_all: ## Run all tests 124 | uv run pytest 125 | 126 | coverage_all: ## Get test coverage 127 | uv run coverage run -m pytest || true 128 | uv run coverage report -m 129 | 130 | type_check: ## Check for static typing errors 131 | uv run mypy $(APP_PATH) 132 | 133 | output_unset_app_env_sh: ## Generate a script that unsets app API-key environment variables 134 | uf="./unset_env.sh" 135 | echo "Outputting '$${uf}' ..." 
136 | printenv | awk -F= '/_API_KEY=/ {print "unset " $$1}' > $$uf 137 | 138 | help: ## Displays this message with available recipes 139 | # TODO add stackoverflow source 140 | echo "Usage: make [recipe]" 141 | echo "Recipes:" 142 | awk '/^[a-zA-Z0-9_-]+:.*?##/ { 143 | helpMessage = match($$0, /## (.*)/) 144 | if (helpMessage) { 145 | recipe = $$1 146 | sub(/:/, "", recipe) 147 | printf " \033[36m%-20s\033[0m %s\n", recipe, substr($$0, RSTART + 3, RLENGTH) 148 | } 149 | }' $(MAKEFILE_LIST) 150 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Agents-eval 2 | 3 | This project aims to implement an evaluation pipeline to assess the effectiveness of open-source agentic AI systems across various use cases, focusing on use case agnostic metrics that measure core capabilities such as task decomposition, tool integration, adaptability, and overall performance. 4 | 5 | ![License](https://img.shields.io/badge/license-BSD3Clause-green.svg) 6 | ![Version](https://img.shields.io/badge/version-1.1.0-58f4c2) 7 | [![CodeQL](https://github.com/qte77/Agents-eval/actions/workflows/codeql.yaml/badge.svg)](https://github.com/qte77/Agents-eval/actions/workflows/codeql.yaml) 8 | [![CodeFactor](https://www.codefactor.io/repository/github/qte77/Agents-eval/badge)](https://www.codefactor.io/repository/github/qte77/Agents-eval) 9 | [![ruff](https://github.com/qte77/Agents-eval/actions/workflows/ruff.yaml/badge.svg)](https://github.com/qte77/Agents-eval/actions/workflows/ruff.yaml) 10 | [![pytest](https://github.com/qte77/Agents-eval/actions/workflows/pytest.yaml/badge.svg)](https://github.com/qte77/Agents-eval/actions/workflows/pytest.yaml) 11 | [![Link Checker](https://github.com/qte77/Agents-eval/actions/workflows/links-fail-fast.yaml/badge.svg)](https://github.com/qte77/Agents-eval/actions/workflows/links-fail-fast.yaml) 12 | [![Deploy Docs](https://github.com/qte77/Agents-eval/actions/workflows/generate-deploy-mkdocs-ghpages.yaml/badge.svg)](https://github.com/qte77/Agents-eval/actions/workflows/generate-deploy-mkdocs-ghpages.yaml) 13 | 14 | **DevEx** [![vscode.dev](https://img.shields.io/static/v1?logo=visualstudiocode&label=&message=vscode.dev&labelColor=2c2c32&color=007acc&logoColor=007acc)](https://vscode.dev/github/qte77/Agents-eval) 15 | [![Codespace Dev](https://img.shields.io/static/v1?logo=visualstudiocode&label=&message=Codespace%20Dev&labelColor=2c2c32&color=007acc&logoColor=007acc)](https://github.com/codespaces/new?repo=qte77/Agents-eval&devcontainer_path=.devcontainer/setup_dev/devcontainer.json) 16 | [![Codespace Dev Claude Code](https://img.shields.io/static/v1?logo=visualstudiocode&label=&message=Codespace%20Dev%20Claude%20Code&labelColor=2c2c32&color=007acc&logoColor=007acc)](https://github.com/codespaces/new?repo=qte77/Agents-eval&devcontainer_path=.devcontainer/setup_dev_claude/devcontainer.json) 17 | [![Codespace Dev Ollama](https://img.shields.io/static/v1?logo=visualstudiocode&label=&message=Codespace%20Dev%20Ollama&labelColor=2c2c32&color=007acc&logoColor=007acc)](https://github.com/codespaces/new?repo=qte77/Agents-eval&devcontainer_path=.devcontainer/setup_dev_ollama/devcontainer.json) 18 | [![TalkToGithub](https://img.shields.io/badge/TalkToGithub-7a83ff.svg)](https://talktogithub.com/qte77/Agents-eval) 19 | [![llms.txt (UitHub)](https://img.shields.io/badge/llms.txt-uithub-800080.svg)](https://github.com/qte77/Agents-eval) 20 | [![llms.txt 
(GitToDoc)](https://img.shields.io/badge/llms.txt-GitToDoc-fe4a60.svg)](https://gittodoc.com/qte77/Agents-eval) 21 | 22 | ## Status 23 | 24 | (DRAFT) (WIP) ----> Not fully implemented yet 25 | 26 | For version history have a look at the [CHANGELOG](CHANGELOG.md). 27 | 28 | ## Setup and Usage 29 | 30 | - `make setup_prod` 31 | - `make setup_dev` or `make setup_dev_claude` or `make setup_dev_ollama` 32 | - `make run_cli` or `make run_cli ARGS="--help"` 33 | - `make run_gui` 34 | - `make test_all` 35 | 36 | ### Configuration 37 | 38 | - [config_app.py](src/app/config/config_app.py) contains configuration constants for the application. 39 | - [config_chat.json](src/app/config/config_chat.json) contains inference provider configuration and prompts. inference endpoints used should adhere to [OpenAI Model Spec 2024-05-08](https://cdn.openai.com/spec/model-spec-2024-05-08.html) which is used by [pydantic-ai OpenAI-compatible Models](https://ai.pydantic.dev/models/#openai-compatible-models). 40 | - [config_eval.json](src/app/config/config_eval.json) contains evaluation metrics and their weights. 41 | - [data_models.py](src/app/config/data_models.py) contains the pydantic data models for agent system configuration and results. 42 | 43 | ### Environment 44 | 45 | [.env.example](.env.example) contains examples for usage of API keys and variables. 46 | 47 | ```text 48 | # inference EP 49 | GEMINI_API_KEY="xyz" 50 | 51 | # tools 52 | TAVILY_API_KEY="" 53 | 54 | # log/mon/trace 55 | WANDB_API_KEY="xyz" 56 | ``` 57 | 58 | ### Customer Journey 59 | 60 |
61 | Show Customer Journey 62 | Customer Journey 63 | Customer Journey 64 |
65 | 66 | ### Note 67 | 68 | 1. The contained chat configuration uses free inference endpoints which are subject to change by the providers. See lists such as [free-llm-api-resources](https://github.com/cheahjs/free-llm-api-resources) to find other providers. 69 | 2. The contained chat configuration uses models which are also subject to change by the providers and have to be updated from time to time. 70 | 3. LLM-as-judge is also subject to the chat configuration. 71 | 72 | ## Documentation 73 | 74 | [Agents-eval](https://qte77.github.io/Agents-eval) 75 | 76 | ### Project Outline 77 | 78 | `# TODO` 79 | 80 | ### Agents 81 | 82 | #### Manager Agent 83 | 84 | - **Description**: Oversees research and analysis tasks, coordinating the efforts of the research, analysis, and synthesizer agents to provide comprehensive answers to user queries. Delegates tasks and ensures the accuracy of the information. 85 | - **Responsibilities**: 86 | - Coordinates the research, analysis, and synthesis agents. 87 | - Delegates research tasks to the Research Agent. 88 | - Delegates analysis tasks to the Analysis Agent. 89 | - Delegates synthesis tasks to the Synthesizer Agent. 90 | - Ensures the accuracy of the information. 91 | - **Location**: [src/app/agents/agent_system.py](https://github.com/qte77/Agents-eval/blob/main/src/app/agents/agent_system.py) 92 | 93 | #### Researcher Agent 94 | 95 | - **Description**: Gathers and analyzes data relevant to a given topic, utilizing search tools to collect data and verifying the accuracy of assumptions, facts, and conclusions. 96 | - **Responsibilities**: 97 | - Gathers and analyzes data relevant to the topic. 98 | - Uses search tools to collect data. 99 | - Checks the accuracy of assumptions, facts, and conclusions. 100 | - **Tools**: 101 | - [DuckDuckGo Search Tool](https://ai.pydantic.dev/common-tools/#duckduckgo-search-tool) 102 | - **Location**: [src/app/agents/agent_system.py](https://github.com/qte77/Agents-eval/blob/main/src/app/agents/agent_system.py) 103 | 104 | #### Analyst Agent 105 | 106 | - **Description**: Checks the accuracy of assumptions, facts, and conclusions in the provided data, providing relevant feedback and ensuring data integrity. 107 | - **Responsibilities**: 108 | - Checks the accuracy of assumptions, facts, and conclusions. 109 | - Provides relevant feedback if the result is not approved. 110 | - Ensures data integrity. 111 | - **Location**: [src/app/agents/agent_system.py](https://github.com/qte77/Agents-eval/blob/main/src/app/agents/agent_system.py) 112 | 113 | #### Synthesizer Agent 114 | 115 | - **Description**: Outputs a well-formatted scientific report using the data provided, maintaining the original facts, conclusions, and sources. 116 | - **Responsibilities**: 117 | - Outputs a well-formatted scientific report using the provided data. 118 | - Maintains the original facts, conclusions, and sources. 119 | - **Location**: [src/app/agents/agent_system.py](https://github.com/qte77/Agents-eval/blob/main/src/app/agents/agent_system.py) 120 | 121 | ### Datasets used 122 | 123 | `# TODO` 124 | 125 | ### Evaluation metrics 126 | 127 | `# TODO` 128 | 129 | - Time to complete task (time_taken) 130 | - Task success rate (task_success) 131 | - Agent coordination (coordination_quality) 132 | - Tool usage efficiency (tool_efficiency) 133 | - Plan coherence (planning_rational) 134 | - Text response quality (text_similarity) 135 | - Autonomy vs. 
human intervention (HITL, user feedback) 136 | - Reactivity (adapts to changes in tasks and environments) 137 | - Memory consistency 138 | 139 | ### Evaluation Metrics Baseline 140 | 141 | As configured in [config_eval.json](src/app/config/config_eval.json). 142 | 143 | ```json 144 | { 145 | "evaluators_and_weights": { 146 | "planning_rational": "1/6", 147 | "task_success": "1/6", 148 | "tool_efficiency": "1/6", 149 | "coordination_quality": "1/6", 150 | "time_taken": "1/6", 151 | "text_similarity": "1/6" 152 | } 153 | } 154 | ``` 155 | 156 | ### Eval Metrics Sweep 157 | 158 |
159 | Eval Metrics Sweep 160 | Eval Metrics Sweep 161 |
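The baseline weights above are combined into a single composite score per run. A minimal sketch of such a weighted aggregation, assuming every metric has already been normalized to the 0.0-1.0 range; function and variable names are illustrative and not taken from `src/app/evals/metrics.py`:

```python
from fractions import Fraction

# Weights as configured in config_eval.json ("1/6" each in the baseline)
EVALUATORS_AND_WEIGHTS = {
    "planning_rational": "1/6",
    "task_success": "1/6",
    "tool_efficiency": "1/6",
    "coordination_quality": "1/6",
    "time_taken": "1/6",
    "text_similarity": "1/6",
}


def composite_score(metric_results: dict[str, float]) -> float:
    """Combine normalized per-metric scores (0.0-1.0) into one weighted score."""
    weights = {name: float(Fraction(w)) for name, w in EVALUATORS_AND_WEIGHTS.items()}
    total = sum(weights.values())
    return sum(weights[name] * metric_results.get(name, 0.0) for name in weights) / total


# Hypothetical example run
print(composite_score({
    "planning_rational": 0.8, "task_success": 1.0, "tool_efficiency": 0.7,
    "coordination_quality": 0.6, "time_taken": 0.9, "text_similarity": 0.75,
}))  # ~0.79
```

A sweep as in the diagram above then simply repeats this aggregation for different weight configurations and numbers of runs.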
162 | 163 | ### Tools available 164 | 165 | Other pydantic-ai agents and [pydantic-ai DuckDuckGo Search Tool](https://ai.pydantic.dev/common-tools/#duckduckgo-search-tool). 166 | 167 | ### Agentic System Architecture 168 | 169 |
170 | Show Agentic System Architecture 171 | Agentic System C4-Arch 172 |
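To make the diagram concrete, here is a minimal sketch of the tool-based delegation pattern it depicts, with the manager agent delegating to a research agent that carries the DuckDuckGo search tool. Model ids, prompts, and the `ResearchResult` shape are placeholders, and parameter names may differ between pydantic-ai versions; see `src/app/agents/agent_system.py` for the actual implementation.

```python
from pydantic import BaseModel
from pydantic_ai import Agent, RunContext
from pydantic_ai.common_tools.duckduckgo import duckduckgo_search_tool


class ResearchResult(BaseModel):
    topic: str
    findings: list[str]


# Placeholder model id; the real provider and model come from config_chat.json
research_agent = Agent(
    "openai:gpt-4o-mini",
    output_type=ResearchResult,
    tools=[duckduckgo_search_tool()],
    system_prompt="Gather and verify information on the given topic.",
)

manager_agent = Agent(
    "openai:gpt-4o-mini",
    system_prompt="Coordinate research, analysis, and synthesis to answer the query.",
)


@manager_agent.tool
async def delegate_research(ctx: RunContext[None], query: str) -> ResearchResult:
    """Delegate a research task to the research agent, sharing the usage budget."""
    result = await research_agent.run(query, usage=ctx.usage)
    return result.output
```

The same pattern repeats for the analysis and synthesiser agents, which is why delegation appears as optional tool calls in the diagram.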
173 | 174 | ### Project Repo Structure 175 | 176 |
177 | Show Repo Structure 178 | ```sh 179 | |- .claude # claude code config and commands 180 | |- .devcontainer # pre-configured dev env 181 | |- .github # workflows 182 | |- .streamlit # config.toml 183 | |- .vscode # extensions, settings 184 | |- assets/images 185 | |- docs 186 | |- src # source code 187 | |- app 188 | |- agents 189 | |- config 190 | |- evals 191 | |- utils 192 | |- __init__.py 193 | |- main.py 194 | \- py.typed 195 | |- examples 196 | |- gui 197 | \- run_gui.py 198 | |- tests 199 | |- .env.example # example env vars 200 | |- .gitignore 201 | |- .gitmessage 202 | |- AGENTS.md # common file like agentsmd.com 203 | |- CHANGELOG.md # short project history 204 | |- CLAUDE.md # points to AGENTS.md 205 | |- Dockerfile # create app image 206 | |- LICENSE.md 207 | |- Makefile # helper scripts 208 | |- mkdocs.yaml # docs from docstrings 209 | |- pyproject.toml # project settings 210 | |- README.md # project description 211 | \- uv.lock # resolved package versions 212 | ``` 213 |
214 | 215 | ## Landscape overview 216 | 217 | ### Agentic System Frameworks 218 | 219 | - [PydanticAI](https://github.com/pydantic/pydantic-ai) 220 | - [restack](https://www.restack.io/) 221 | - [smolAgents](https://github.com/huggingface/smolagents) 222 | - [AutoGen](https://github.com/microsoft/autogen) 223 | - [Semantic Kernel](https://github.com/microsoft/semantic-kernel) 224 | - [CrewAI](https://github.com/crewAIInc/crewAI) 225 | - [Langchain](https://github.com/langchain-ai/langchain) 226 | - [Langflow](https://github.com/langflow-ai/langflow) 227 | 228 | ### Agent-builder 229 | 230 | - [Archon](https://github.com/coleam00/Archon) 231 | - [Agentstack](https://github.com/AgentOps-AI/AgentStack) 232 | 233 | ### Evaluation 234 | 235 | - Focusing on agentic systems 236 | - [AgentNeo](https://github.com/raga-ai-hub/agentneo) 237 | - [AutoGenBench](https://github.com/microsoft/autogen/blob/0.2/samples/tools/autogenbench) 238 | - [Langchain AgentEvals](https://github.com/langchain-ai/agentevals) 239 | - [Mosaic AI Agent Evaluation](https://docs.databricks.com/en/generative-ai/agent-evaluation/index.html) 240 | - [RagaAI-Catalyst](https://github.com/raga-ai-hub/RagaAI-Catalyst) 241 | - [AgentBench](https://github.com/THUDM/AgentBench) 242 | - RAG oriented 243 | - [RAGAs](https://github.com/explodinggradients/ragas) 244 | - LLM apps 245 | - [DeepEval](https://github.com/confident-ai/deepeval) 246 | - [Langchain OpenEvals](https://github.com/langchain-ai/openevals) 247 | - [MLFlow LLM Evaluate](https://mlflow.org/docs/latest/llms/llm-evaluate/index.html) 248 | - [DeepEval (DeepSeek)]( github.com/confident-ai/deepeval) 249 | 250 | ### Observation, Monitoring, Tracing 251 | 252 | - [AgentOps - Agency](https://www.agentops.ai/) 253 | - [arize](https://arize.com/) 254 | - [Langtrace](https://www.langtrace.ai/) 255 | - [LangSmith - Langchain](https://www.langchain.com/langsmith) 256 | - [Weave - Weights & Biases](https://wandb.ai/site/weave/) 257 | - [Pydantic- Logfire](https://pydantic.dev/logfire) 258 | 259 | ### Datasets 260 | 261 | - [awesome-reasoning - Collection of datasets](https://github.com/neurallambda/awesome-reasoning) 262 | 263 | #### Scientific 264 | 265 | - [SWIF2T](https://arxiv.org/abs/2405.20477), Automated Focused Feedback Generation for Scientific Writing Assistance, 2024, 300 peer reviews citing weaknesses in scientific papers and conduct human evaluation 266 | - [PeerRead](https://github.com/allenai/PeerRead), A Dataset of Peer Reviews (PeerRead): Collection, Insights and NLP Applications, 2018, 14K paper drafts and the corresponding accept/reject decisions, over 10K textual peer reviews written by experts for a subset of the papers, structured JSONL, clear labels 267 | - [BigSurvey](https://www.ijcai.org/proceedings/2022/0591.pdf), Generating a Structured Summary of Numerous Academic Papers: Dataset and Method, 2022, 7K survey papers and 430K referenced papers abstracts 268 | - [SciXGen](https://arxiv.org/abs/2110.10774), A Scientific Paper Dataset for Context-Aware Text Generation, 2021, 205k papers 269 | - [scientific_papers](https://huggingface.co/datasets/armanc/scientific_papers), 2018, two sets of long and structured documents, obtained from ArXiv and PubMed OpenAccess, 300k+ papers, total disk 7GB 270 | 271 | #### Reasoning, Deduction, Commonsense, Logic 272 | 273 | - [LIAR](https://www.cs.ucsb.edu/~william/data/liar_dataset.zip), fake news detection, only 12.8k records, single label 274 | - [X-Fact](https://github.com/utahnlp/x-fact/), Benchmark Dataset for 
Multilingual Fact Checking, 31.1k records, large, multilingual 275 | - [MultiFC](https://www.copenlu.com/publication/2019_emnlp_augenstein/), A Real-World Multi-Domain Dataset for Evidence-Based Fact Checking of Claims, 34.9k records 276 | - [FEVER](https://fever.ai/dataset/fever.html), Fact Extraction and VERification, 185.4k records 277 | - TODO GSM8K, bAbI, CommonsenseQA, DROP, LogiQA, MNLI 278 | 279 | #### Planning, Execution 280 | 281 | - [Plancraft](https://arxiv.org/abs/2412.21033), an evaluation dataset for planning with LLM agents, both a text-only and multi-modal interface 282 | - [IDAT](https://arxiv.org/abs/2407.08898), A Multi-Modal Dataset and Toolkit for Building and Evaluating Interactive Task-Solving Agents 283 | - [PDEBench](https://github.com/pdebench/PDEBench), set of benchmarks for scientific machine learning 284 | - [MatSci-NLP](https://arxiv.org/abs/2305.08264), evaluating the performance of natural language processing (NLP) models on materials science text 285 | - TODO BigBench Hard, FSM Game 286 | 287 | #### Tool Use, Function Invocation 288 | 289 | - [Trelis Function Calling](https://huggingface.co/datasets/Trelis/function_calling_v3) 290 | - [KnowLM Tool](https://huggingface.co/datasets/zjunlp/KnowLM-Tool) 291 | - [StatLLM](https://arxiv.org/abs/2502.17657), statistical analysis tasks, LLM-generated SAS code, and human evaluation scores 292 | - TODO ToolComp 293 | 294 | ### Benchmarks 295 | 296 | - [SciArena: A New Platform for Evaluating Foundation Models in Scientific Literature Tasks](https://allenai.org/blog/sciarena) 297 | - [AgentEvals CORE-Bench Leaderboard](https://huggingface.co/spaces/agent-evals/core_leaderboard) 298 | - [Berkeley Function-Calling Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html) 299 | - [Chatbot Arena LLM Leaderboard](https://lmsys.org/projects/) 300 | - [GAIA Leaderboard](https://gaia-benchmark-leaderboard.hf.space/) 301 | - [GalileoAI Agent Leaderboard](https://huggingface.co/spaces/galileo-ai/agent-leaderboard) 302 | - [WebDev Arena Leaderboard](https://web.lmarena.ai/leaderboard) 303 | - [MiniWoB++: a web interaction benchmark for reinforcement learning](https://miniwob.farama.org/) 304 | 305 | ### Research Agents 306 | 307 | - [Ai2 Scholar QA](https://qa.allen.ai/chat) 308 | 309 | ## Further Reading 310 | 311 | - [[2506.18096] Deep Research Agents: A Systematic Examination And Roadmap](https://arxiv.org/abs/2506.18096), [gh / ai-agents-2030 / awesome-deep-research-agent](https://github.com/ai-agents-2030/awesome-deep-research-agent) 312 | - [[2504.19678] From LLM Reasoning to Autonomous AI Agents: A Comprehensive Review](https://arxiv.org/abs/2504.19678) 313 | - [[2503.21460] Large Language Model Agent: A Survey on Methodology, Applications and Challenges](https://arxiv.org/abs/2503.21460) 314 | - [[2503.16416] Survey on Evaluation of LLM-based Agents](https://arxiv.org/abs/2503.16416) 315 | - [[2503.13657] Why Do Multi-Agent LLM Systems Fail?](https://arxiv.org/abs/2503.13657) 316 | - [[2502.14776] SurveyX: Academic Survey Automation via Large Language Models](https://arxiv.org/abs/2502.14776) 317 | - [[2502.05957] AutoAgent: A Fully-Automated and Zero-Code Framework for LLM Agents](https://arxiv.org/abs/2502.05957) 318 | - [[2502.02649] Fully Autonomous AI Agents Should Not be Developed](https://arxiv.org/abs/2502.02649) 319 | - [[2501.16150] AI Agents for Computer Use: A Review of Instruction-based Computer Control, GUI Automation, and Operator Assistants](https://arxiv.org/abs/2501.16150) 320 | - [[2501.06590] 
ChemAgent](https://arxiv.org/abs/2501.06590) 321 | - [[2501.06322] Multi-Agent Collaboration Mechanisms: A Survey of LLMs](https://arxiv.org/abs/2501.06322) 322 | - [[2501.04227] Agent Laboratory: Using LLM Agents as Research Assitants](https://arxiv.org/abs/2501.04227), [AgentRxiv:Towards Collaborative Autonomous Research](https://agentrxiv.github.io/) 323 | - [[2501.00881] Agentic Systems: A Guide to Transforming Industries with Vertical AI Agents](https://arxiv.org/abs/2501.00881) 324 | - [[2412.04093] Practical Considerations for Agentic LLM Systems](https://arxiv.org/abs/2412.04093) 325 | - [[2411.13768] Evaluation-driven Approach to LLM Agents](https://arxiv.org/abs/2411.13768) 326 | - [[2411.10478] Large Language Models for Constructing and Optimizing Machine Learning Workflows: A Survey](https://arxiv.org/abs/2411.10478) 327 | - [[2411.05285] A taxonomy of agentops for enabling observability of foundation model based agents](https://arxiv.org/abs/2411.05285) 328 | - [[2410.22457] Advancing Agentic Systems: Dynamic Task Decomposition, Tool Integration and Evaluation using Novel Metrics and Dataset](https://arxiv.org/abs/2410.22457) 329 | - [[2408.06361] Large Language Model Agent in Financial Trading: A Survey](https://arxiv.org/abs/2408.06361) 330 | - [[2408.06292] The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery](https://arxiv.org/abs/2408.06292) 331 | - [[2404.13501] A Survey on the Memory Mechanism of Large Language Model based Agents](https://arxiv.org/pdf/2404.13501) 332 | - [[2402.06360] CoSearchAgent: A Lightweight Collaborative Search Agent with Large Language Models](https://arxiv.org/abs/2402.06360) 333 | - [[2402.02716] Understanding the planning of LLM agents: A survey](https://arxiv.org/abs/2402.02716) 334 | - [[2402.01030] Executable Code Actions Elicit Better LLM Agents](https://arxiv.org/abs/2402.01030) 335 | - [[2308.11432] A Survey on Large Language Model based Autonomous Agents](https://arxiv.org/abs/2308.11432) 336 | -------------------------------------------------------------------------------- /assets/images/c4-multi-agent-system.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/assets/images/c4-multi-agent-system.png -------------------------------------------------------------------------------- /assets/images/customer-journey-activity-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/assets/images/customer-journey-activity-dark.png -------------------------------------------------------------------------------- /assets/images/customer-journey-activity-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/assets/images/customer-journey-activity-light.png -------------------------------------------------------------------------------- /assets/images/metrics-eval-sweep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/assets/images/metrics-eval-sweep.png -------------------------------------------------------------------------------- /context/PRPs/coordination_quality.md: 
-------------------------------------------------------------------------------- 1 | # Coordination Quality Feature PRP 2 | 3 | ## Goal 4 | 5 | Implement a comprehensive coordination quality measurement and monitoring system for the multi-agent evaluation framework to assess how effectively agents collaborate, delegate tasks, and maintain workflow integrity. 6 | 7 | ## Why 8 | 9 | - **Evaluation Completeness**: The coordination_quality metric is defined in `config_eval.json` (0.167 weight) but not implemented 10 | - **System Reliability**: Need to measure and improve agent coordination failures and bottlenecks 11 | - **Performance Optimization**: Identify coordination inefficiencies that impact overall system performance 12 | - **Research Value**: Provide quantitative data on multi-agent coordination patterns for evaluation research 13 | 14 | ## What 15 | 16 | A coordination quality monitoring system that measures: 17 | 18 | - Task delegation success rates between agents 19 | - Inter-agent communication efficiency and latency 20 | - Workflow completion rates and error recovery 21 | - Resource utilization across agent interactions 22 | - Coordination failure detection and analysis 23 | 24 | ### Success Criteria 25 | 26 | - [ ] Coordination quality metric implemented and functional in evaluation system 27 | - [ ] Real-time coordination monitoring dashboard 28 | - [ ] Coordination failure detection and alerting 29 | - [ ] Performance metrics collection and analysis 30 | - [ ] Integration with existing evaluation pipeline 31 | 32 | ## All Needed Context 33 | 34 | ### Documentation & References 35 | 36 | ```yaml 37 | - file: /workspaces/Agents-eval/src/app/agents/agent_system.py 38 | why: Core coordination logic, delegation patterns, tool-based coordination 39 | critical: Lines 91-99 show delegation pattern, _validate_model_return validation 40 | 41 | - file: /workspaces/Agents-eval/src/app/config/data_models.py 42 | why: Data contracts for coordination, Pydantic models for agent communication 43 | critical: ResearchResult, AnalysisResult, ResearchSummary models 44 | 45 | - file: /workspaces/Agents-eval/src/app/config/config_eval.json 46 | why: Coordination quality metric weight (0.167) defined but not implemented 47 | critical: Need to implement the missing coordination_quality metric 48 | 49 | - file: /workspaces/Agents-eval/src/app/evals/metrics.py 50 | why: Evaluation metrics implementation patterns 51 | critical: How other metrics are implemented and integrated 52 | 53 | - file: /workspaces/Agents-eval/src/app/config/config_chat.json 54 | why: Agent prompts defining coordination behavior and approval workflows 55 | critical: Manager agent orchestration prompts 56 | ``` 57 | 58 | ### Current Codebase Tree 59 | 60 | ```bash 61 | src/app/ 62 | ├── agents/ 63 | │ ├── agent_system.py # Core coordination logic 64 | │ └── llm_model_funs.py # Model management 65 | ├── config/ 66 | │ ├── config_app.py # Common app configuration 67 | │ ├── data_models.py # Coordination data contracts 68 | │ ├── config_chat.json # Agent coordination prompts 69 | │ └── config_eval.json # Evaluation metrics (coordination_quality: 0.167) 70 | ├── evals/ 71 | │ └── metrics.py # Evaluation metrics implementation 72 | ├── utils/ 73 | │ ├── error_messages.py # Error handling patterns 74 | │ └── log.py # Logging utilities 75 | └── main.py # Entry point 76 | ``` 77 | 78 | ### Desired Codebase Tree 79 | 80 | ```bash 81 | src/app/ 82 | ├── evals/ 83 | │ ├── [existing folders unchanged] 84 | │ ├── coordination_quality/ 85 | │ │ 
├── __init__.py 86 | │ │ ├── quality_metrics.py # Coordination quality measurement 87 | │ │ ├── monitoring.py # Real-time coordination monitoring 88 | │ │ └── analyzer.py # Coordination pattern analysis 89 | │ └── metrics.py # Updated with coordination_quality implementation 90 | └── [existing files unchanged] 91 | ``` 92 | 93 | ### Known Gotchas & Library Quirks 94 | 95 | ```python 96 | # CRITICAL: PydanticAI coordination patterns 97 | # - Tool-based delegation via @agent.tool decorator 98 | # - Usage tracking shared via RunContext 99 | # - Streaming with Pydantic models has NotImplementedError in agent_system.py 100 | 101 | # GOTCHA: Validation requirements 102 | # - All agent communication must use _validate_model_return() 103 | # - Pydantic models required for type safety 104 | # - Error handling must follow utils/error_messages.py patterns 105 | 106 | # LIBRARY QUIRK: PydanticAI Usage Limits 107 | # - UsageLimits shared across agents via RunContext 108 | # - Coordination can fail if usage limits exceeded 109 | # - Need to track usage per coordination step 110 | ``` 111 | 112 | ## Implementation Blueprint 113 | 114 | ### Data Models and Structure 115 | 116 | ```python 117 | # coordination/quality_metrics.py 118 | class CoordinationMetrics(BaseModel): 119 | """Coordination quality metrics data model.""" 120 | 121 | delegation_success_rate: float 122 | communication_latency: float 123 | workflow_completion_rate: float 124 | error_recovery_rate: float 125 | resource_utilization: float 126 | coordination_score: float 127 | 128 | class CoordinationEvent(BaseModel): 129 | """Individual coordination event tracking.""" 130 | 131 | timestamp: datetime 132 | source_agent: str 133 | target_agent: str 134 | event_type: str # delegation, response, error, retry 135 | success: bool 136 | latency_ms: float 137 | error_message: str | None = None 138 | ``` 139 | 140 | ### List of Tasks 141 | 142 | ```yaml 143 | Task 1: 144 | CREATE src/app/coordination/__init__.py: 145 | - EMPTY file for Python package 146 | 147 | Task 2: 148 | CREATE src/app/coordination/quality_metrics.py: 149 | - IMPLEMENT CoordinationMetrics and CoordinationEvent models 150 | - IMPLEMENT calculate_coordination_quality() function 151 | - PATTERN: Follow existing Pydantic models in data_models.py 152 | 153 | Task 3: 154 | CREATE src/app/coordination/monitoring.py: 155 | - IMPLEMENT CoordinationMonitor class 156 | - TRACK delegation events, latency, success rates 157 | - PATTERN: Use existing logging patterns from utils/log.py 158 | 159 | Task 4: 160 | CREATE src/app/coordination/analyzer.py: 161 | - IMPLEMENT coordination pattern analysis 162 | - DETECT coordination failures and bottlenecks 163 | - GENERATE coordination quality reports 164 | 165 | Task 5: 166 | MODIFY src/app/agents/agent_system.py: 167 | - FIND _add_tools_to_manager_agent function 168 | - INJECT coordination monitoring into delegation tools 169 | - PRESERVE existing delegation patterns 170 | 171 | Task 6: 172 | MODIFY src/app/evals/metrics.py: 173 | - IMPLEMENT coordination_quality metric function 174 | - INTEGRATE with existing metrics calculation 175 | - MIRROR pattern from other metric implementations 176 | 177 | Task 7: 178 | CREATE tests/test_coordination_quality.py: 179 | - TEST coordination metrics calculation 180 | - TEST monitoring functionality 181 | - TEST integration with evaluation pipeline 182 | ``` 183 | 184 | ### Per Task Pseudocode 185 | 186 | ```python 187 | # Task 2: quality_metrics.py 188 | class CoordinationMetrics(BaseModel): 189 | 
delegation_success_rate: float = Field(ge=0.0, le=1.0) 190 | communication_latency: float = Field(ge=0.0) 191 | workflow_completion_rate: float = Field(ge=0.0, le=1.0) 192 | error_recovery_rate: float = Field(ge=0.0, le=1.0) 193 | resource_utilization: float = Field(ge=0.0, le=1.0) 194 | coordination_score: float = Field(ge=0.0, le=1.0) 195 | 196 | def calculate_coordination_quality(events: list[CoordinationEvent]) -> CoordinationMetrics: 197 | """Calculate coordination quality from event history.""" 198 | # PATTERN: Weighted average of coordination dimensions 199 | # CRITICAL: Handle empty events list gracefully 200 | if not events: 201 | return CoordinationMetrics(...) 202 | 203 | # Calculate individual metrics 204 | success_rate = sum(e.success for e in events) / len(events) 205 | avg_latency = sum(e.latency_ms for e in events) / len(events) 206 | # ... other calculations 207 | 208 | # Weighted coordination score 209 | coordination_score = ( 210 | success_rate * 0.3 + 211 | normalized_latency * 0.2 + 212 | completion_rate * 0.3 + 213 | recovery_rate * 0.2 214 | ) 215 | 216 | return CoordinationMetrics( 217 | coordination_score=coordination_score, 218 | # ... other metrics 219 | ) 220 | 221 | # Task 3: monitoring.py 222 | class CoordinationMonitor: 223 | def __init__(self): 224 | self.events: list[CoordinationEvent] = [] 225 | self.logger = logger # From utils/log.py 226 | 227 | async def track_delegation(self, source: str, target: str, func: Callable): 228 | """Track delegation with timing and success monitoring.""" 229 | start_time = time.time() 230 | 231 | try: 232 | result = await func() 233 | # PATTERN: Log successful coordination 234 | self.logger.info(f"Delegation {source} -> {target} successful") 235 | 236 | # Record successful event 237 | self._record_event( 238 | source_agent=source, 239 | target_agent=target, 240 | event_type="delegation", 241 | success=True, 242 | latency_ms=(time.time() - start_time) * 1000 243 | ) 244 | 245 | return result 246 | 247 | except Exception as e: 248 | # PATTERN: Log coordination failures 249 | self.logger.error(f"Delegation {source} -> {target} failed: {str(e)}") 250 | 251 | # Record failed event 252 | self._record_event( 253 | source_agent=source, 254 | target_agent=target, 255 | event_type="delegation", 256 | success=False, 257 | latency_ms=(time.time() - start_time) * 1000, 258 | error_message=str(e) 259 | ) 260 | 261 | raise 262 | 263 | # Task 5: agent_system.py integration 264 | # MODIFY delegate_research function 265 | @manager_agent.tool 266 | async def delegate_research(ctx: RunContext[None], query: str) -> ResearchResult: 267 | """Delegate research task to ResearchAgent.""" 268 | # INJECT: Coordination monitoring 269 | monitor = CoordinationMonitor() 270 | 271 | async def _research_task(): 272 | result = await research_agent.run(query, usage=ctx.usage) 273 | return _validate_model_return(str(result.output), ResearchResult) 274 | 275 | # PATTERN: Track delegation with monitoring 276 | return await monitor.track_delegation("manager", "researcher", _research_task) 277 | ``` 278 | 279 | ### Integration Points 280 | 281 | ```yaml 282 | EVALUATION_SYSTEM: 283 | - modify: src/app/evals/metrics.py 284 | - pattern: "def coordination_quality(result: Any) -> float:" 285 | - integration: "Add to evaluation pipeline alongside existing metrics" 286 | 287 | CONFIGURATION: 288 | - modify: src/app/config/config_eval.json 289 | - pattern: "coordination_quality metric already defined with weight 0.167" 290 | - validation: "Ensure metric returns float 
between 0.0 and 1.0" 291 | 292 | LOGGING: 293 | - integrate: src/app/utils/log.py 294 | - pattern: "Use existing logger for coordination events" 295 | - level: "INFO for successful coordination, ERROR for failures" 296 | ``` 297 | 298 | ## Validation Loop 299 | 300 | ### Level 1: Syntax & Style 301 | 302 | ```bash 303 | # Run these FIRST - fix any errors before proceeding 304 | make ruff # Format and fix linting issues 305 | make type_check # Type checking with mypy 306 | 307 | # Expected: No errors. If errors, READ the error and fix. 308 | ``` 309 | 310 | ### Level 2: Unit Tests 311 | 312 | ```python 313 | # CREATE tests/test_coordination_quality.py 314 | def test_coordination_metrics_calculation(): 315 | """Test coordination quality calculation with sample events.""" 316 | events = [ 317 | CoordinationEvent( 318 | timestamp=datetime.now(), 319 | source_agent="manager", 320 | target_agent="researcher", 321 | event_type="delegation", 322 | success=True, 323 | latency_ms=150.0 324 | ), 325 | # ... more test events 326 | ] 327 | 328 | metrics = calculate_coordination_quality(events) 329 | assert 0.0 <= metrics.coordination_score <= 1.0 330 | assert metrics.delegation_success_rate >= 0.0 331 | 332 | def test_coordination_monitoring(): 333 | """Test coordination monitoring functionality.""" 334 | monitor = CoordinationMonitor() 335 | 336 | # Test successful delegation tracking 337 | async def dummy_task(): 338 | return "success" 339 | 340 | result = await monitor.track_delegation("manager", "researcher", dummy_task) 341 | assert result == "success" 342 | assert len(monitor.events) == 1 343 | assert monitor.events[0].success is True 344 | 345 | def test_coordination_quality_metric(): 346 | """Test integration with evaluation metrics.""" 347 | # PATTERN: Test similar to other metrics in the evaluation system 348 | sample_result = {"coordination_events": [...]} 349 | quality_score = coordination_quality(sample_result) 350 | assert isinstance(quality_score, float) 351 | assert 0.0 <= quality_score <= 1.0 352 | ``` 353 | 354 | ```bash 355 | # Run and iterate until passing: 356 | make test_all 357 | # If failing: Read error, understand root cause, fix code, re-run 358 | ``` 359 | 360 | ### Level 3: Integration Test 361 | 362 | ```bash 363 | # Test the coordination quality in full evaluation 364 | make run_cli ARGS="--query 'test coordination quality' --eval" 365 | 366 | # Expected: Coordination quality metric appears in evaluation results 367 | # If error: Check logs for coordination monitoring issues 368 | ``` 369 | 370 | ## Final Validation Checklist 371 | 372 | - [ ] All tests pass: `make test_all` 373 | - [ ] No linting errors: `make ruff` 374 | - [ ] No type errors: `make type_check` 375 | - [ ] Coordination quality metric integrated in evaluation pipeline 376 | - [ ] Coordination monitoring tracks delegation events 377 | - [ ] Error cases handled gracefully with proper logging 378 | - [ ] Performance impact minimal (< 5% overhead) 379 | - [ ] Documentation updated in AGENTS.md if needed 380 | 381 | ## Anti-Patterns to Avoid 382 | 383 | - ❌ Don't break existing delegation patterns in agent_system.py 384 | - ❌ Don't ignore coordination failures - log and track them 385 | - ❌ Don't add excessive monitoring overhead that slows coordination 386 | - ❌ Don't hardcode coordination thresholds - make them configurable 387 | - ❌ Don't skip validation of coordination metrics calculation 388 | - ❌ Don't assume all coordination events are successful - handle failures gracefully 389 | 
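As a closing sketch for the integration points above, one possible shape of the `coordination_quality` entry point in `src/app/evals/metrics.py`. How the run result carries its event history is an assumption (a `coordination_events` attribute or key), not something the existing codebase defines, and the import path follows the desired tree above (the task list in this PRP uses `src/app/coordination/` instead).

```python
from typing import Any

from app.evals.coordination_quality.quality_metrics import (  # assumed module path
    calculate_coordination_quality,
)


def coordination_quality(result: Any) -> float:
    """Return a 0.0-1.0 coordination score for the evaluation pipeline."""
    # Assumption: the agent run attaches its CoordinationEvent history to the result.
    events = getattr(result, "coordination_events", None)
    if events is None and isinstance(result, dict):
        events = result.get("coordination_events")
    if not events:
        return 0.0  # neutral score when no coordination was observed
    metrics = calculate_coordination_quality(events)
    return max(0.0, min(1.0, metrics.coordination_score))
```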
-------------------------------------------------------------------------------- /context/PRPs/features/coordination_quality.md: -------------------------------------------------------------------------------- 1 | # Feature description for: coordination_quality 2 | 3 | As put forward by [context-engineering-intro](https://github.com/qte77/context-engineering-intro). 4 | 5 | ## FEATURE 6 | 7 | coordination_quality 8 | 9 | ## EXAMPLES 10 | 11 | [Provide and explain examples that you have in the `PRPs/examples/` folder] 12 | 13 | ## DOCUMENTATION 14 | 15 | [List out any documentation (web pages, sources for an MCP server like Crawl4AI RAG, etc.) that will need to be referenced during development] 16 | 17 | ## OTHER CONSIDERATIONS 18 | 19 | [Any other considerations or specific requirements - great place to include gotchas that you see AI coding assistants miss with your projects a lot] 20 | -------------------------------------------------------------------------------- /context/PRPs/features/tool_efficiency.md: -------------------------------------------------------------------------------- 1 | # Feature description for: tool_efficiency 2 | 3 | As put forward by [context-engineering-intro](https://github.com/qte77/context-engineering-intro). 4 | 5 | ## FEATURE 6 | 7 | tool_efficiency 8 | 9 | ## EXAMPLES 10 | 11 | [Provide and explain examples that you have in the `PRPs/examples/` folder] 12 | 13 | ## DOCUMENTATION 14 | 15 | [List out any documentation (web pages, sources for an MCP server like Crawl4AI RAG, etc.) that will need to be referenced during development] 16 | 17 | ## OTHER CONSIDERATIONS 18 | 19 | [Any other considerations or specific requirements - great place to include gotchas that you see AI coding assistants miss with your projects a lot] 20 | -------------------------------------------------------------------------------- /context/PRPs/templates/feature_base.md: -------------------------------------------------------------------------------- 1 | # Feature description for: [ Initial template for new features ] 2 | 3 | As put forward by [context-engineering-intro](https://github.com/qte77/context-engineering-intro). 4 | 5 | ## FEATURE 6 | 7 | [Insert your feature here] 8 | 9 | ## EXAMPLES 10 | 11 | [Provide and explain examples that you have in the `PRPs/examples/` folder] 12 | 13 | ## DOCUMENTATION 14 | 15 | [List out any documentation (web pages, sources for an MCP server like Crawl4AI RAG, etc.) that will need to be referenced during development] 16 | 17 | ## OTHER CONSIDERATIONS 18 | 19 | [Any other considerations or specific requirements - great place to include gotchas that you see AI coding assistants miss with your projects a lot] 20 | -------------------------------------------------------------------------------- /context/PRPs/templates/prp_base.md: -------------------------------------------------------------------------------- 1 | # "Base PRP Template v2 - Context-Rich with Validation Loops" 2 | 3 | ## Purpose 4 | 5 | Template optimized for AI agents to implement features with sufficient context and self-validation capabilities to achieve working code through iterative refinement. 6 | 7 | ## Core Principles 8 | 9 | 1. **Context is King**: Include ALL necessary documentation, examples, and caveats 10 | 2. **Validation Loops**: Provide executable tests/lints the AI can run and fix 11 | 3. **Information Dense**: Use keywords and patterns from the codebase 12 | 4. **Progressive Success**: Start simple, validate, then enhance 13 | 5. 
**Global rules**: Be sure to follow all rules in CLAUDE.md 14 | 15 | --- 16 | 17 | ## Goal 18 | 19 | [What needs to be built - be specific about the end state and desires] 20 | 21 | ## Why 22 | 23 | - [Business value and user impact] 24 | - [Integration with existing features] 25 | - [Problems this solves and for whom] 26 | 27 | ## What 28 | 29 | [User-visible behavior and technical requirements] 30 | 31 | ### Success Criteria 32 | 33 | - [ ] [Specific measurable outcomes] 34 | 35 | ## All Needed Context 36 | 37 | ### Documentation & References (list all context needed to implement the feature) 38 | 39 | ```yaml 40 | # MUST READ - Include these in your context window 41 | - url: [Official API docs URL] 42 | why: [Specific sections/methods you'll need] 43 | 44 | - file: [path/to/example.py] 45 | why: [Pattern to follow, gotchas to avoid] 46 | 47 | - doc: [Library documentation URL] 48 | section: [Specific section about common pitfalls] 49 | critical: [Key insight that prevents common errors] 50 | 51 | - docfile: [PRPs/ai_docs/file.md] 52 | why: [docs that the user has pasted in to the project] 53 | ``` 54 | 55 | ### Current Codebase tree (run `tree` in the root of the project) to get an overview of the codebase 56 | 57 | ```bash 58 | 59 | ``` 60 | 61 | ### Desired Codebase tree with files to be added and responsibility of file 62 | 63 | ```bash 64 | 65 | ``` 66 | 67 | ### Known Gotchas of our codebase & Library Quirks 68 | 69 | ```python 70 | # CRITICAL: [Library name] requires [specific setup] 71 | # Example: FastAPI requires async functions for endpoints 72 | # Example: This ORM doesn't support batch inserts over 1000 records 73 | # Example: We use pydantic v2 and 74 | ``` 75 | 76 | ## Implementation Blueprint 77 | 78 | ### Data models and structure 79 | 80 | Create the core data models, we ensure type safety and consistency. 81 | 82 | ```python 83 | Examples: 84 | - orm models 85 | - pydantic models 86 | - pydantic schemas 87 | - pydantic validators 88 | 89 | ``` 90 | 91 | ### list of tasks to be completed to fullfill the PRP in the order they should be completed 92 | 93 | ```yaml 94 | Task 1: 95 | MODIFY src/existing_module.py: 96 | - FIND pattern: "class OldImplementation" 97 | - INJECT after line containing "def __init__" 98 | - PRESERVE existing method signatures 99 | 100 | CREATE src/new_feature.py: 101 | - MIRROR pattern from: src/similar_feature.py 102 | - MODIFY class name and core logic 103 | - KEEP error handling pattern identical 104 | 105 | ...(...) 106 | 107 | Task N: 108 | ... 
109 | 110 | ``` 111 | 112 | ### Per task pseudocode as needed added to each task 113 | 114 | ```python 115 | 116 | # Task 1 117 | # Pseudocode with CRITICAL details dont write entire code 118 | async def new_feature(param: str) -> Result: 119 | # PATTERN: Always validate input first (see src/validators.py) 120 | validated = validate_input(param) # raises ValidationError 121 | 122 | # GOTCHA: This library requires connection pooling 123 | async with get_connection() as conn: # see src/db/pool.py 124 | # PATTERN: Use existing retry decorator 125 | @retry(attempts=3, backoff=exponential) 126 | async def _inner(): 127 | # CRITICAL: API returns 429 if >10 req/sec 128 | await rate_limiter.acquire() 129 | return await external_api.call(validated) 130 | 131 | result = await _inner() 132 | 133 | # PATTERN: Standardized response format 134 | return format_response(result) # see src/utils/responses.py 135 | ``` 136 | 137 | ### Integration Points 138 | 139 | ```yaml 140 | DATABASE: 141 | - migration: "Add column 'feature_enabled' to users table" 142 | - index: "CREATE INDEX idx_feature_lookup ON users(feature_id)" 143 | 144 | CONFIG: 145 | - add to: config/settings.py 146 | - pattern: "FEATURE_TIMEOUT = int(os.getenv('FEATURE_TIMEOUT', '30'))" 147 | 148 | ROUTES: 149 | - add to: src/api/routes.py 150 | - pattern: "router.include_router(feature_router, prefix='/feature')" 151 | ``` 152 | 153 | ## Validation Loop 154 | 155 | ### Level 1: Syntax & Style 156 | 157 | ```bash 158 | # Run these FIRST - fix any errors before proceeding 159 | ruff check src/new_feature.py --fix # Auto-fix what's possible 160 | mypy src/new_feature.py # Type checking 161 | 162 | # Expected: No errors. If errors, READ the error and fix. 163 | ``` 164 | 165 | ### Level 2: Unit Tests each new feature/file/function use existing test patterns 166 | 167 | ```python 168 | # CREATE test_new_feature.py with these test cases: 169 | def test_happy_path(): 170 | """Basic functionality works""" 171 | result = new_feature("valid_input") 172 | assert result.status == "success" 173 | 174 | def test_validation_error(): 175 | """Invalid input raises ValidationError""" 176 | with pytest.raises(ValidationError): 177 | new_feature("") 178 | 179 | def test_external_api_timeout(): 180 | """Handles timeouts gracefully""" 181 | with mock.patch('external_api.call', side_effect=TimeoutError): 182 | result = new_feature("valid") 183 | assert result.status == "error" 184 | assert "timeout" in result.message 185 | ``` 186 | 187 | ```bash 188 | # Run and iterate until passing: 189 | uv run pytest test_new_feature.py -v 190 | # If failing: Read error, understand root cause, fix code, re-run (never mock to pass) 191 | ``` 192 | 193 | ### Level 3: Integration Test 194 | 195 | ```bash 196 | # Start the service 197 | uv run python -m src.main --dev 198 | 199 | # Test the endpoint 200 | curl -X POST http://localhost:8000/feature \ 201 | -H "Content-Type: application/json" \ 202 | -d '{"param": "test_value"}' 203 | 204 | # Expected: {"status": "success", "data": {...}} 205 | # If error: Check logs at logs/app.log for stack trace 206 | ``` 207 | 208 | ## Final validation Checklist 209 | 210 | - [ ] All tests pass: `uv run pytest tests/ -v` 211 | - [ ] No linting errors: `uv run ruff check src/` 212 | - [ ] No type errors: `uv run mypy src/` 213 | - [ ] Manual test successful: [specific curl/command] 214 | - [ ] Error cases handled gracefully 215 | - [ ] Logs are informative but not verbose 216 | - [ ] Documentation updated if needed 217 | 218 | --- 219 | 220 | ## 
Anti-Patterns to Avoid 221 | 222 | - ❌ Don't create new patterns when existing ones work 223 | - ❌ Don't skip validation because "it should work" 224 | - ❌ Don't ignore failing tests - fix them 225 | - ❌ Don't use sync functions in async context 226 | - ❌ Don't hardcode values that should be config 227 | - ❌ Don't catch all exceptions - be specific 228 | -------------------------------------------------------------------------------- /docs/PRD.md: -------------------------------------------------------------------------------- 1 | # Product Requirements Document (PRD) for Agents-eval 2 | 3 | ## Overview 4 | 5 | **Agents-eval** is a project aimed at evaluating the effectiveness of open-source agentic AI systems across various use cases. The focus is on use case agnostic metrics that measure core capabilities such as task decomposition, tool integration, adaptability, and overall performance. 6 | 7 | ## Goals 8 | 9 | - **Evaluate Agentic AI Systems:** Provide a comprehensive evaluation pipeline to assess the performance of agentic AI systems. 10 | - **Metric Development:** Develop and implement metrics that are agnostic to specific use cases but measure core agentic capabilities. 11 | - **Continuous Improvement:** Promote continuous improvement through automated testing, version control, and documentation. 12 | 13 | ## Functional Requirements 14 | 15 | ### CLI 16 | 17 | - **Command Line Interface:** 18 | - Commands to start, stop, and check the status of the Ollama server or remote inference endpoints. 19 | - Commands to download or call models and run tests. 20 | 21 | ### Frontend (Streamlit) 22 | 23 | - **User Interface:** 24 | - Display test results and system performance metrics. 25 | 26 | ### (Optional) Backend (FastAPI) 27 | 28 | - **Agentic System Integration:** 29 | - Support for adding tools to agents using Pydantic-AI. 30 | - Ensure agents can use tools effectively and return expected results. 31 | - **Model Management:** 32 | - Ability to download, list, and manage models using the `ollama` Python package. 33 | - **API Endpoints:** 34 | - Endpoint to start and check the status of the Ollama server. 35 | - Endpoint to download and manage models. 36 | - Endpoint to run tests and return results. 37 | 38 | ## Non-Functional Requirements 39 | 40 | - **Maintainability:** 41 | - Use modular design patterns for easy updates and maintenance. 42 | - Implement logging and error handling for debugging and monitoring. 43 | - **Documentation:** 44 | - Comprehensive documentation for setup, usage, and testing. 45 | - **Scalability:** 46 | - Design the system to handle multiple concurrent requests. 47 | - **Performance:** 48 | - Ensure low latency in server responses and model downloads. 49 | - Optimize for memory usage and CPU/GPU utilization. 50 | - **Security:** 51 | - Implement secure communication between components. 52 | - Use environment variables for sensitive information. 53 | 54 | ## Assumptions 55 | 56 | - **Remote Inference Endpoints:** The project can use remote inference endpoints provided within a `config.json` and using API keys from `.env`. 57 | - **Local Ollama Server:** The project can make use of a local Ollama server for model hosting and inference. 58 | - **Python Environment:** The project uses Python 3.12 and related tools like `uv` for dependency management. 59 | - **GitHub Actions:** CI/CD pipelines are set up using GitHub Actions for automated testing, version bumping, and documentation deployment. 
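To illustrate the endpoint assumption above, a minimal sketch of reading a provider entry from a JSON config and its API key from the environment. The file name, key names, and provider id are illustrative; the application itself uses `config_chat.json` and the loaders under `src/app/utils/`.

```python
import json
import os
from pathlib import Path


def load_provider(config_path: Path, provider: str) -> dict:
    """Return provider settings merged with its API key from the environment."""
    config = json.loads(config_path.read_text())
    settings = config["providers"][provider]  # e.g. base_url and model_name
    api_key = os.environ.get(f"{provider.upper()}_API_KEY", "")  # e.g. GEMINI_API_KEY from .env
    return {**settings, "api_key": api_key}


# Hypothetical usage
# provider_cfg = load_provider(Path("config.json"), "gemini")
```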
60 | 61 | ## Constraints 62 | 63 | - **Hardware:** The project assumes access to appropriate hardware if running the Ollama server and models, including sufficient RAM and GPU capabilities. 64 | - **Software:** Requires Python 3.12, `uv`, and other dependencies listed in `pyproject.toml`. 65 | 66 | ## Main Dependencies 67 | 68 | - **Pydantic-AI:** For agent and tool management. 69 | - **Pytest:** For testing. 70 | - **Ollama:** For local model hosting and inference. 71 | - **Streamlit:** For frontend dashboard. 72 | - **Ruff:** For code linting. 73 | - **MkDocs:** For documentation generation. 74 | 75 | ## Future Enhancements 76 | 77 | - **Additional Metrics:** Develop more metrics to evaluate agentic systems. 78 | - **Integration with More Frameworks:** Expand compatibility with other popular agentic system frameworks such as LangChain, AutoGen, CrewAI, LangGraph, Semantic Kernel, and smolAgents. 79 | - **Performance Optimization:** Further optimize for latency and resource usage. 80 | - **User Feedback:** Implement a feedback loop for users to report issues or suggest improvements. 81 | -------------------------------------------------------------------------------- /docs/SprintPlan.md: -------------------------------------------------------------------------------- 1 | # Project Plan Outline 2 | 3 | ## Week 1 starting 2025-03-31: Metric Development and CLI Enhancements 4 | 5 | ### Milestones 6 | 7 | - Metric Development: Implement at least three new metrics for evaluating agentic AI systems. 8 | - CLI Streaming: Enhance the CLI to stream Pydantic-AI output. 9 | 10 | ### Tasks and Sequence 11 | 12 | - [ ] Research and Design New Metrics 13 | - Task Definition: Conduct literature review and design three new metrics that are agnostic to specific use cases but measure core agentic capabilities. 14 | - Sequence: Before implementing any code changes. 15 | - Definition of Done: A detailed document outlining the metrics, their mathematical formulations, and how they will be integrated into the evaluation pipeline. 16 | - [ ] Implement New Metrics 17 | - Task Definition: Write Python code to implement the new metrics, ensuring they are modular and easily integratable with existing evaluation logic. 18 | - Sequence: After completing the design document. 19 | - Definition of Done: Unit tests for each metric pass, and they are successfully integrated into the evaluation pipeline. 20 | - [ ] Enhance CLI for Streaming 21 | - Task Definition: Modify the CLI to stream Pydantic-AI output using asynchronous functions. 22 | - Sequence: Concurrently with metric implementation. 23 | - Definition of Done: The CLI can stream output from Pydantic-AI models without blocking, and tests demonstrate successful streaming. 24 | - [ ] Update Documentation 25 | - Task Definition: Update PRD.md and README.md to reflect new metrics and CLI enhancements. 26 | - Sequence: After completing metric implementation and CLI enhancements. 27 | - Definition of Done: PRD.md includes detailed descriptions of new metrics, and README.md provides instructions on how to use the enhanced CLI. 28 | 29 | ## Week 2 starting 2025-04-07: Streamlit GUI Enhancements and Testing 30 | 31 | ### Milestones 32 | 33 | - Streamlit GUI Output: Enhance the Streamlit GUI to display streamed output from Pydantic-AI. 34 | - Comprehensive Testing: Perform thorough testing of the entire system with new metrics and GUI enhancements.
35 | 36 | ### Tasks and Sequence 37 | 38 | - [ ] Enhance Streamlit GUI 39 | - Task Definition: Modify the Streamlit GUI to display the streamed output from Pydantic-AI models. 40 | - Sequence: Start of Week 2. 41 | - Definition of Done: The GUI can display streamed output without errors, and user interactions (e.g., selecting models, inputting queries) work as expected. 42 | - [ ] Integrate New Metrics into GUI 43 | - Task Definition: Ensure the Streamlit GUI can display results from the new metrics. 44 | - Sequence: After enhancing the GUI for streamed output. 45 | - Definition of Done: The GUI displays metric results clearly, and users can easily interpret the output. 46 | - [ ] Comprehensive System Testing 47 | - Task Definition: Perform end-to-end testing of the system, including new metrics and GUI enhancements. 48 | - Sequence: After integrating new metrics into the GUI. 49 | - Definition of Done: All tests pass without errors, and the system functions as expected in various scenarios. 50 | - [ ] Finalize Documentation and Deployment 51 | - Task Definition: Update MkDocs documentation to reflect all changes and deploy it to GitHub Pages. 52 | - Sequence: After completing system testing. 53 | - Definition of Done: Documentation is updated, and the latest version is live on GitHub Pages. 54 | 55 | ## Additional Considerations 56 | 57 | - Code Reviews: Schedule regular code reviews to ensure quality and adherence to project standards. 58 | - Feedback Loop: Establish a feedback loop with stakeholders to gather input on the new metrics and GUI enhancements. 59 | -------------------------------------------------------------------------------- /docs/UserStory.md: -------------------------------------------------------------------------------- 1 | # User Story for Agents-eval 2 | 3 | ## Introduction 4 | 5 | Agents-eval is designed to evaluate the effectiveness of open-source agentic AI systems across various use cases. This user story focuses on the perspective of Gez, an AI researcher who aims to assess and improve these systems using Agents-eval. 6 | 7 | ## User Profile 8 | 9 | - **Name:** Gez 10 | - **Role:** AI Researcher 11 | - **Goals:** 12 | - Evaluate the performance of agentic AI systems. 13 | - Identify areas for improvement in these systems. 14 | - Develop and integrate new metrics for evaluation. 15 | 16 | ## User Story 17 | 18 | **As** an AI researcher, 19 | **I want** to use Agents-eval to evaluate the effectiveness of agentic AI systems, 20 | **so that** I can assess their performance across different use cases and improve their capabilities. 21 | 22 | ### Acceptance Criteria 23 | 24 | 1. **Evaluation Pipeline:** 25 | - The system should provide a comprehensive evaluation pipeline that measures core agentic capabilities such as task decomposition, tool integration, adaptability, and overall performance. 26 | - The pipeline should support multiple agentic AI frameworks (e.g., Pydantic-AI, LangChain). 27 | 28 | 2. **Metric Development:** 29 | - The system should allow for the development and integration of new metrics that are agnostic to specific use cases. 30 | - These metrics should be modular and easily integratable with existing evaluation logic. 31 | 32 | 3. **CLI and GUI Interactions:** 33 | - The system should offer both a CLI and a Streamlit GUI for user interaction. 34 | - The CLI should support streaming output from Pydantic-AI models. 35 | - The Streamlit GUI should display streamed output and provide an intuitive interface for setting up and running evaluations. 
36 | 37 | 4. **Documentation and Feedback:** 38 | - The system should include comprehensive documentation for setup, usage, and testing. 39 | - There should be a feedback loop for users to report issues or suggest improvements. 40 | 41 | ## Example Scenario 42 | 43 | - **Scenario:** Gez wants to evaluate a research agent system using Agents-eval. 44 | - **Steps:** 45 | 1. She sets up the environment using the CLI or devcontainer. 46 | 2. She configures the agent system with the desired models and tools. 47 | 3. She runs the evaluation using the CLI or Streamlit GUI. 48 | 4. She views the results and metrics displayed by the system. 49 | 5. She provides feedback on the system's performance and suggests improvements. 50 | 51 | ## Benefits 52 | 53 | - **Improved Evaluation Capabilities:** Agents-eval provides a structured approach to evaluating agentic AI systems, allowing researchers to focus on improving these systems. 54 | - **Flexibility and Customization:** The system supports multiple frameworks and allows for the development of new metrics, making it adaptable to various research needs. 55 | - **Enhanced User Experience:** The combination of CLI and GUI interfaces offers flexibility in how users interact with the system, catering to different preferences and workflows. 56 | -------------------------------------------------------------------------------- /docs/architecture/c4-multi-agent-system.plantuml: -------------------------------------------------------------------------------- 1 | @startuml "Multi-Agent Research System - C4 System Context" 2 | !theme plain 3 | 4 | !include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Context.puml 5 | !include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Container.puml 6 | !include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Component.puml 7 | 8 | LAYOUT_WITH_LEGEND() 9 | 10 | title "Multi-Agent Research System" 11 | Person(user, "User", "Submits research queries") 12 | 13 | System_Boundary(research_system, "Supporting System") { 14 | Container(main_module, "Main Module", "Python", "Entry point that configures and runs the agent system") 15 | Container(utils, "Utilities", "Python", "Helper functions and data models") 16 | Container(config, "Configuration", "JSON", "Provider and model settings") 17 | } 18 | 19 | System_Boundary(agent_system, "Agent System") { 20 | Container(manager_agent, "Manager Agent", "pydantic-ai", "Coordinates research and analysis tasks") 21 | Container(research_agent, "Research Agent", "pydantic-ai", "Gathers information on topics") 22 | Container(analysis_agent, "Analysis Agent", "pydantic-ai", "Analyzes research") 23 | Container(synthesiser_agent, "Synthesiser Agent", "pydantic-ai", "Produces scientific reports") 24 | } 25 | 26 | System_Ext(llm_provider, "LLM Provider", "External inference service for AI models") 27 | System_Ext(search_api, "DuckDuckGo Search", "External search API") 28 | 29 | Rel(user, main_module, "Submits query", "CLI Input or GUI") 30 | Rel(main_module, config, "Loads", "Reads JSON config") 31 | Rel(main_module, agent_system, "Initializes and runs") 32 | 33 | Rel(manager_agent, research_agent, "Delegates research tasks to", "Optional Tool call") 34 | Rel(manager_agent, analysis_agent, "Delegates analysis tasks to", "Optional Tool call") 35 | Rel(manager_agent, synthesiser_agent, "Delegates synthesis tasks to", "Optional Tool call") 36 | 37 | Rel(research_agent, search_api, "Searches for information", "API call") 38 | 39 
| Rel(manager_agent, llm_provider, "Generates responses", "API call") 40 | Rel(research_agent, llm_provider, "Generates responses", "API call") 41 | Rel(analysis_agent, llm_provider, "Generates responses", "API call") 42 | Rel(synthesiser_agent, llm_provider, "Generates responses", "API call") 43 | 44 | Rel(agent_system, utils, "Uses", "Import") 45 | Rel(main_module, utils, "Uses", "Import") 46 | 47 | @enduml 48 | -------------------------------------------------------------------------------- /docs/architecture/customer-journey-activity-dark: -------------------------------------------------------------------------------- 1 | @startuml 2 | !theme amiga 3 | skinparam monochrome true 4 | 5 | title Customer Journey Activity Diagram for CLI and Streamlit 6 | 7 | start 8 | :Discover Agents-eval; 9 | if (Choose CLI?) then (yes) 10 | :Run CLI with `make run_cli`; 11 | :Interact with CLI for agent setup and execution; 12 | :View results and metrics in CLI output; 13 | else (no) 14 | :Run Streamlit GUI with `make run_gui`; 15 | :Interact with Streamlit for agent setup and execution; 16 | :View results and metrics in Streamlit dashboard; 17 | endif 18 | :Continue using and provide feedback; 19 | :Improve based on feedback; 20 | 21 | stop 22 | @enduml -------------------------------------------------------------------------------- /docs/architecture/customer-journey-activity-light.plantuml: -------------------------------------------------------------------------------- 1 | @startuml 2 | !theme plain 3 | 4 | title Customer Journey Activity Diagram for CLI and Streamlit 5 | 6 | start 7 | :Discover Agents-eval; 8 | if (Choose CLI?) then (yes) 9 | :Run CLI with `make run_cli`; 10 | :Interact with CLI for agent setup and execution; 11 | :View results and metrics in CLI output; 12 | else (no) 13 | :Run Streamlit GUI with `make run_gui`; 14 | :Interact with Streamlit for agent setup and execution; 15 | :View results and metrics in Streamlit dashboard; 16 | endif 17 | :Continue using and provide feedback; 18 | :Improve based on feedback; 19 | 20 | stop 21 | @enduml -------------------------------------------------------------------------------- /docs/architecture/metrics-eval-sweep.plantuml: -------------------------------------------------------------------------------- 1 | @startuml 2 | !theme plain 3 | skinparam ConditionEndStyle diamond 4 | skinparam ParticipantPadding 20 5 | skinparam BoxPadding 20 6 | 7 | participant "Sweep Engine" as SE 8 | participant "Agentic System" as AS 9 | participant "Evaluation Engine" as EE 10 | 11 | SE -> EE: Set baseline parameters 12 | 13 | group Sweep over parameter variations [Independent runs] 14 | 15 | group Vary number of runs [ numbers of runs ] 16 | loop for each run_number 17 | SE -> AS: Start runs 18 | AS -> EE: Execute runs 19 | EE--> SE: Send results 20 | end 21 | end 22 | 23 | group Sweep metrics weights [ metrics weights ] 24 | loop for each weight_config 25 | SE -> AS: Set weights and start runs 26 | AS -> EE: Execute runs 27 | EE--> SE: Send results 28 | end 29 | end 30 | 31 | end 32 | @enduml 33 | -------------------------------------------------------------------------------- /mkdocs.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # https://github.com/james-willett/mkdocs-material-youtube-tutorial 3 | # https://mkdocstrings.github.io/recipes/ 4 | # site info set in workflow 5 | site_name: '' 6 | site_description: '' 7 | repo_url: '' 8 | edit_uri: edit/main 9 | theme: 10 | name: material 11 | language: en 
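The metrics-eval-sweep diagram above describes a sweep engine that fixes baseline parameters and then independently varies the number of runs and the metric weights. A small, hypothetical driver for such a sweep might look like the following; `run_evaluation` and all parameter values are assumptions for illustration, not part of the repository.

```python
# Hypothetical sweep driver mirroring the metrics-eval-sweep sequence diagram.
from collections.abc import Callable
from typing import Any


def sweep(
    run_evaluation: Callable[[int, dict[str, float]], dict[str, Any]],
    run_numbers: list[int],
    weight_configs: list[dict[str, float]],
    baseline_weights: dict[str, float],
    baseline_runs: int = 3,
) -> list[dict[str, Any]]:
    """Collect results for each parameter variation as independent runs."""
    results: list[dict[str, Any]] = []
    # Group 1: vary the number of runs while keeping the baseline weights fixed.
    for n in run_numbers:
        results.append(
            {"runs": n, "weights": baseline_weights, "result": run_evaluation(n, baseline_weights)}
        )
    # Group 2: vary the metric weights while keeping the baseline run count fixed.
    for weights in weight_configs:
        results.append(
            {"runs": baseline_runs, "weights": weights, "result": run_evaluation(baseline_runs, weights)}
        )
    return results
```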
12 | features: 13 | - content.code.annotate 14 | - content.code.copy 15 | - content.tabs.link 16 | - navigation.footer 17 | - navigation.sections 18 | - navigation.tabs 19 | - navigation.top 20 | - toc.integrate 21 | - search.suggest 22 | - search.highlight 23 | palette: 24 | - media: "(prefers-color-scheme: light)" 25 | scheme: default 26 | toggle: 27 | # icon: material/brightness-7 28 | icon: material/toggle-switch-off-outline 29 | name: "Toggle Dark Mode" 30 | - media: "(prefers-color-scheme: dark)" 31 | scheme: slate 32 | toggle: 33 | # icon: material/brightness-4 34 | icon: material/toggle-switch 35 | name: "Toggle Light Mode" 36 | nav: 37 | - Home: index.md 38 | - PRD: PRD.md 39 | - User Story: UserStory.md 40 | - Sprint Plan: SprintPlan.md 41 | - Code: docstrings.md 42 | - Change Log: CHANGELOG.md 43 | - License: LICENSE.md 44 | - llms.txt: llms.txt 45 | plugins: 46 | - search: 47 | lang: en 48 | - autorefs 49 | - mkdocstrings: 50 | handlers: 51 | python: 52 | paths: [src] 53 | options: 54 | show_root_heading: true 55 | show_root_full_path: true 56 | show_object_full_path: false 57 | show_root_members_full_path: false 58 | show_category_heading: true 59 | show_submodules: true 60 | markdown_extensions: 61 | - attr_list 62 | - pymdownx.magiclink 63 | - pymdownx.tabbed 64 | - pymdownx.highlight: 65 | anchor_linenums: true 66 | - pymdownx.superfences 67 | - pymdownx.snippets: 68 | check_paths: true 69 | - pymdownx.tasklist: 70 | custom_checkbox: true 71 | - sane_lists 72 | - smarty 73 | - toc: 74 | permalink: true 75 | validation: 76 | links: 77 | not_found: warn 78 | anchors: warn 79 | # builds only if validation succeeds while 80 | # treating warnings as errors 81 | # also checks for broken links 82 | # strict: true 83 | ... 84 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | version = "1.1.0" 7 | name = "Agents-eval" 8 | description = "Assess the effectiveness of agentic AI systems across various use cases focusing on agnostic metrics that measure core agentic capabilities."
9 | authors = [ 10 | {name = "qte77", email = "qte@77.gh"} 11 | ] 12 | readme = "README.md" 13 | requires-python = "==3.13.*" 14 | license = "bsd-3-clause" 15 | dependencies = [ 16 | "agentops>=0.4.14", 17 | "logfire>=3.16.1", 18 | "loguru>=0.7.3", 19 | "pydantic>=2.10.6", 20 | # "pydantic-ai>=0.0.36", 21 | "pydantic-ai-slim[duckduckgo,openai,tavily]>=0.2.12", 22 | "pydantic-settings>=2.9.1", 23 | "scalene>=1.5.51", 24 | "weave>=0.51.49", 25 | ] 26 | 27 | # [project.urls] 28 | # Documentation = "" 29 | 30 | [dependency-groups] 31 | dev = [ 32 | # "commitizen>=4.4.1", 33 | "mypy>=1.16.0", 34 | "ruff>=0.11.12", 35 | ] 36 | gui = [ 37 | "streamlit>=1.43.1", 38 | ] 39 | test = [ 40 | "pytest>=8.3.4", 41 | "pytest-cov>=6.0.0", 42 | "pytest-asyncio>=0.25.3", 43 | "pytest-bdd>=8.1.0", 44 | "requests>=2.32.3", 45 | "ruff>=0.9.2", 46 | ] 47 | docs = [ 48 | "griffe>=1.5.1", 49 | "mkdocs>=1.6.1", 50 | "mkdocs-awesome-pages-plugin>=2.9.3", 51 | "mkdocs-gen-files>=0.5.0", 52 | "mkdocs-literate-nav>=0.6.1", 53 | "mkdocs-material>=9.5.44", 54 | "mkdocs-section-index>=0.3.8", 55 | "mkdocstrings[python]>=0.27.0", 56 | ] 57 | 58 | [tool.uv] 59 | package = true 60 | exclude-newer = "2025-05-31T00:00:00Z" 61 | 62 | [tool.hatch.build.targets.wheel] 63 | only-include = ["/README.md"] 64 | 65 | [tool.hatch.build.targets.sdist] 66 | include = ["/README.md", "/Makefile", "/tests"] 67 | 68 | [tool.logfire] 69 | ignore_no_config=true 70 | send_to_logfire="if-token-present" 71 | 72 | [[tool.mypy.overrides]] 73 | module = "agentops" 74 | ignore_missing_imports = true 75 | 76 | [tool.ruff] 77 | target-version = "py313" 78 | src = ["src", "tests"] 79 | 80 | [tool.ruff.format] 81 | docstring-code-format = true 82 | 83 | [tool.ruff.lint] 84 | # ignore = ["E203"] # Whitespace before ':' 85 | unfixable = ["B"] 86 | select = [ 87 | # pycodestyle 88 | "E", 89 | # Pyflakes 90 | "F", 91 | # pyupgrade 92 | "UP", 93 | # isort 94 | "I", 95 | ] 96 | 97 | [tool.ruff.lint.isort] 98 | known-first-party = ["src", "tests"] 99 | 100 | [tool.ruff.lint.pydocstyle] 101 | convention = "google" 102 | 103 | [tool.pytest.ini_options] 104 | addopts = "--strict-markers" 105 | # "function", "class", "module", "package", "session" 106 | asyncio_default_fixture_loop_scope = "function" 107 | pythonpath = ["src"] 108 | testpaths = ["tests/"] 109 | 110 | [tool.coverage] 111 | [tool.coverage.run] 112 | include = [ 113 | "tests/**/*.py", 114 | ] 115 | # omit = [] 116 | # branch = true 117 | 118 | [tool.coverage.report] 119 | show_missing = true 120 | exclude_lines = [ 121 | # 'pragma: no cover', 122 | 'raise AssertionError', 123 | 'raise NotImplementedError', 124 | ] 125 | omit = [ 126 | 'env/*', 127 | 'venv/*', 128 | '.venv/*', 129 | '*/virtualenv/*', 130 | '*/virtualenvs/*', 131 | '*/tests/*', 132 | ] 133 | 134 | [tool.bumpversion] 135 | current_version = "1.1.0" 136 | parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)" 137 | serialize = ["{major}.{minor}.{patch}"] 138 | commit = true 139 | tag = true 140 | allow_dirty = false 141 | ignore_missing_version = false 142 | sign_tags = false 143 | tag_name = "v{new_version}" 144 | tag_message = "Bump version: {current_version} → {new_version}" 145 | message = "Bump version: {current_version} → {new_version}" 146 | commit_args = "" 147 | 148 | [[tool.bumpversion.files]] 149 | filename = "pyproject.toml" 150 | search = 'version = "{current_version}"' 151 | replace = 'version = "{new_version}"' 152 | 153 | [[tool.bumpversion.files]] 154 | filename = "src/app/__init__.py" 155 | search = '__version__ = "{current_version}"' 
156 | replace = '__version__ = "{new_version}"' 157 | 158 | [[tool.bumpversion.files]] 159 | filename = "README.md" 160 | search = "version-{current_version}-58f4c2" 161 | replace = "version-{new_version}-58f4c2" 162 | 163 | [[tool.bumpversion.files]] 164 | filename = "CHANGELOG.md" 165 | search = """ 166 | ## [Unreleased] 167 | """ 168 | replace = """ 169 | ## [Unreleased] 170 | 171 | ## [{new_version}] - {now:%Y-%m-%d} 172 | """ 173 | -------------------------------------------------------------------------------- /src/app/__init__.py: -------------------------------------------------------------------------------- 1 | """Defines the application version.""" 2 | 3 | __version__ = "1.1.0" 4 | -------------------------------------------------------------------------------- /src/app/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/src/app/agents/__init__.py -------------------------------------------------------------------------------- /src/app/agents/agent_system.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agent system utilities for orchestrating multi-agent workflows. 3 | 4 | This module provides functions and helpers to create, configure, and run agent 5 | systems using Pydantic AI. It supports delegation of tasks to research, analysis, and 6 | synthesis agents, and manages agent configuration, environment setup, and execution. 7 | Args: 8 | provider (str): The name of the provider. provider_config (ProviderConfig): 9 | Configuration settings for the provider. 10 | api_key (str): API key for authentication with the provider. 11 | prompts (dict[str, str]): Configuration for prompts. 12 | include_researcher (bool): Flag to include the researcher agent. 13 | include_analyst (bool): Flag to include the analyst agent. 14 | include_synthesiser (bool): Flag to include the synthesiser agent. 15 | query (str | list[dict[str, str]]): The query or messages for the agent. 16 | chat_config (ChatConfig): The configuration object for agents and providers. 17 | usage_limits (UsageLimits): Usage limits for agent execution. 18 | pydantic_ai_stream (bool): Whether to use Pydantic AI streaming. 19 | 20 | Functions: 21 | get_manager: Initializes and returns a manager agent with the specified 22 | configuration. 23 | run_manager: Asynchronously runs the manager agent with the given query and 24 | provider. 25 | setup_agent_env: Sets up the environment for an agent by configuring provider 26 | settings, prompts, API key, and usage limits. 
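Taken together, these helpers form a small pipeline: setup_agent_env builds an EndpointConfig, get_manager turns it into a manager agent, and run_manager executes the query. A minimal, hypothetical wiring is sketched below; the provider name, query, and config path are assumptions, and the corresponding API key must already be available in the environment.

```python
# Hypothetical wiring of setup_agent_env, get_manager and run_manager.
from asyncio import run

from app.agents.agent_system import get_manager, run_manager, setup_agent_env
from app.config.data_models import AppEnv, ChatConfig
from app.utils.load_configs import load_config


async def demo() -> None:
    # Path, provider and query are assumptions for illustration only.
    chat_config = load_config("src/app/config/config_chat.json", ChatConfig)
    env = setup_agent_env("github", "What is agentic AI?", chat_config, AppEnv())
    manager = get_manager(
        env.provider,
        env.provider_config,
        env.api_key,
        env.prompts,
        include_researcher=True,
    )
    await run_manager(manager, env.query, env.provider, env.usage_limits)


if __name__ == "__main__":
    run(demo())
```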
27 | """ 28 | 29 | from pydantic import BaseModel, ValidationError 30 | from pydantic_ai import Agent, RunContext 31 | from pydantic_ai.common_tools.duckduckgo import duckduckgo_search_tool 32 | from pydantic_ai.messages import ModelRequest 33 | from pydantic_ai.usage import UsageLimits 34 | 35 | from app.agents.llm_model_funs import get_api_key, get_models, get_provider_config 36 | from app.config.data_models import ( 37 | AgentConfig, 38 | AnalysisResult, 39 | AppEnv, 40 | ChatConfig, 41 | EndpointConfig, 42 | ModelDict, 43 | ProviderConfig, 44 | ResearchResult, 45 | ResearchSummary, 46 | ResultBaseType, 47 | UserPromptType, 48 | ) 49 | from app.utils.error_messages import generic_exception, invalid_data_model_format 50 | from app.utils.log import logger 51 | 52 | 53 | def _add_tools_to_manager_agent( 54 | manager_agent: Agent[None, BaseModel], 55 | research_agent: Agent[None, BaseModel] | None = None, 56 | analysis_agent: Agent[None, BaseModel] | None = None, 57 | synthesis_agent: Agent[None, BaseModel] | None = None, 58 | ): 59 | """ 60 | Adds tools to the manager agent for delegating tasks to research, analysis, and 61 | synthesis agents. 62 | Args: 63 | manager_agent (Agent): The manager agent to which tools will be added. 64 | research_agent (Agent): The agent responsible for handling research tasks. 65 | analysis_agent (Agent, optional): The agent responsible for handling 66 | analysis tasks. Defaults to None. 67 | synthesis_agent (Agent, optional): The agent responsible for handling 68 | synthesis tasks. Defaults to None. 69 | Returns: 70 | None 71 | """ 72 | 73 | def _validate_model_return( 74 | result_output: str, 75 | result_model: type[ResultBaseType], 76 | ) -> ResultBaseType: 77 | """Validates the output against the expected model.""" 78 | try: 79 | return result_model.model_validate(result_output) 80 | except ValidationError as e: 81 | msg = invalid_data_model_format(str(e)) 82 | logger.error(msg) 83 | raise ValidationError(msg) 84 | except Exception as e: 85 | msg = generic_exception(str(e)) 86 | logger.exception(msg) 87 | raise Exception(msg) 88 | 89 | if research_agent is not None: 90 | 91 | @manager_agent.tool 92 | # TODO remove redundant tool creation 93 | # ignore "delegate_research" is not accessed because of decorator 94 | async def delegate_research( # type: ignore[reportUnusedFunction] 95 | ctx: RunContext[None], query: str 96 | ) -> ResearchResult: 97 | """Delegate research task to ResearchAgent.""" 98 | result = await research_agent.run(query, usage=ctx.usage) 99 | return _validate_model_return(str(result.output), ResearchResult) 100 | 101 | if analysis_agent is not None: 102 | 103 | @manager_agent.tool 104 | # ignore "delegate_research" is not accessed because of decorator 105 | async def delegate_analysis( # type: ignore[reportUnusedFunction] 106 | ctx: RunContext[None], query: str 107 | ) -> AnalysisResult: 108 | """Delegate analysis task to AnalysisAgent.""" 109 | result = await analysis_agent.run(query, usage=ctx.usage) 110 | return _validate_model_return(str(result.output), AnalysisResult) 111 | 112 | if synthesis_agent is not None: 113 | 114 | @manager_agent.tool 115 | # ignore "delegate_research" is not accessed because of decorator 116 | async def delegate_synthesis( # type: ignore[reportUnusedFunction] 117 | ctx: RunContext[None], query: str 118 | ) -> ResearchSummary: 119 | """Delegate synthesis task to AnalysisAgent.""" 120 | result = await synthesis_agent.run(query, usage=ctx.usage) 121 | return _validate_model_return(str(result.output), 
ResearchSummary) 122 | 123 | 124 | def _create_agent(agent_config: AgentConfig) -> Agent[None, BaseModel]: 125 | """Factory for creating configured agents""" 126 | 127 | return Agent( 128 | model=agent_config.model, 129 | output_type=agent_config.output_type, 130 | system_prompt=agent_config.system_prompt, 131 | tools=agent_config.tools, 132 | retries=agent_config.retries, 133 | ) 134 | 135 | 136 | def _create_manager( 137 | prompts: dict[str, str], 138 | models: ModelDict, 139 | ) -> Agent[None, BaseModel]: 140 | """ 141 | Creates and configures a manager Agent with associated researcher, analyst, 142 | and optionally synthesiser agents. 143 | Args: 144 | prompts (Dict[str, str]): Dictionary containing system prompts for each agent. 145 | model_manager (GeminiModel | OpenAIModel): Model to be used by the manager 146 | agent. 147 | model_researcher (GeminiModel | OpenAIModel | None, optional): Model to be used 148 | by the researcher agent. 149 | model_analyst (GeminiModel | OpenAIModel | None, optional): Model to be used by 150 | the analyst agent. Defaults to None. 151 | model_synthesiser (GeminiModel | OpenAIModel | None, optional): Model to be used 152 | by the synthesiser agent. Defaults to None. 153 | Returns: 154 | Agent: Configured manager agent with associated tools and agents. 155 | """ 156 | 157 | status = f"Creating manager({models.model_manager.model_name})" 158 | active_agents = [ 159 | agent 160 | for agent in [ 161 | f"researcher({models.model_researcher.model_name})" 162 | if models.model_researcher 163 | else None, 164 | f"analyst({models.model_analyst.model_name})" 165 | if models.model_analyst 166 | else None, 167 | f"synthesiser({models.model_synthesiser.model_name})" 168 | if models.model_synthesiser 169 | else None, 170 | ] 171 | if agent 172 | ] 173 | status += f" with agents: {', '.join(active_agents)}" if active_agents else "" 174 | logger.info(status) 175 | 176 | manager = _create_agent( 177 | AgentConfig.model_validate( 178 | { 179 | "model": models.model_manager, 180 | "output_type": ResearchResult, 181 | "system_prompt": prompts["system_prompt_manager"], 182 | } 183 | ) 184 | ) 185 | 186 | if models.model_researcher is None: 187 | researcher = None 188 | else: 189 | researcher = _create_agent( 190 | AgentConfig.model_validate( 191 | { 192 | "model": models.model_researcher, 193 | "output_type": ResearchResult, 194 | "system_prompt": prompts["system_prompt_researcher"], 195 | "tools": [duckduckgo_search_tool()], 196 | } 197 | ) 198 | ) 199 | 200 | if models.model_analyst is None: 201 | analyst = None 202 | else: 203 | analyst = _create_agent( 204 | AgentConfig.model_validate( 205 | { 206 | "model": models.model_analyst, 207 | "output_type": AnalysisResult, 208 | "system_prompt": prompts["system_prompt_analyst"], 209 | } 210 | ) 211 | ) 212 | 213 | if models.model_synthesiser is None: 214 | synthesiser = None 215 | else: 216 | synthesiser = _create_agent( 217 | AgentConfig.model_validate( 218 | { 219 | "model": models.model_synthesiser, 220 | "output_type": AnalysisResult, 221 | "system_prompt": prompts["system_prompt_synthesiser"], 222 | } 223 | ) 224 | ) 225 | 226 | _add_tools_to_manager_agent(manager, researcher, analyst, synthesiser) 227 | return manager 228 | 229 | 230 | def get_manager( 231 | provider: str, 232 | provider_config: ProviderConfig, 233 | api_key: str | None, 234 | prompts: dict[str, str], 235 | include_researcher: bool = False, 236 | include_analyst: bool = False, 237 | include_synthesiser: bool = False, 238 | ) -> Agent[None, BaseModel]: 239 
| """ 240 | Initializes and returns a Agent manager with the specified configuration. 241 | Args: 242 | provider (str): The name of the provider. 243 | provider_config (ProviderConfig): Configuration settings for the provider. 244 | api_key (str): API key for authentication with the provider. 245 | prompts (PromptsConfig): Configuration for prompts. 246 | include_researcher (bool, optional): Flag to include analyst model. 247 | Defaults to False. 248 | include_analyst (bool, optional): Flag to include analyst model. 249 | Defaults to False. 250 | include_synthesiser (bool, optional): Flag to include synthesiser model. 251 | Defaults to False. 252 | Returns: 253 | Agent: The initialized Agent manager. 254 | """ 255 | 256 | # FIXME context manager try-catch 257 | # with error_handling_context("get_manager()"): 258 | model_config = EndpointConfig.model_validate( 259 | { 260 | "provider": provider, 261 | "prompts": prompts, 262 | "api_key": api_key, 263 | "provider_config": provider_config, 264 | } 265 | ) 266 | models = get_models( 267 | model_config, include_researcher, include_analyst, include_synthesiser 268 | ) 269 | return _create_manager(prompts, models) 270 | 271 | 272 | async def run_manager( 273 | manager: Agent[None, BaseModel], 274 | query: UserPromptType, 275 | provider: str, 276 | usage_limits: UsageLimits | None, 277 | pydantic_ai_stream: bool = False, 278 | ) -> None: 279 | """ 280 | Asynchronously runs the manager with the given query and provider, handling errors 281 | and printing results. 282 | Args: 283 | manager (Agent): The system agent responsible for running the query. 284 | query (str): The query to be processed by the manager. 285 | provider (str): The provider to be used for the query. 286 | usage_limits (UsageLimits): The usage limits to be applied during the query 287 | execution. 288 | pydantic_ai_stream (bool, optional): Flag to enable or disable Pydantic AI 289 | stream. Defaults to False. 290 | Returns: 291 | None 292 | """ 293 | 294 | # FIXME context manager try-catch 295 | # with out ? 
error_handling_context("run_manager()"): 296 | model_name = getattr(manager, "model")._model_name 297 | mgr_cfg = {"user_prompt": query, "usage_limits": usage_limits} 298 | logger.info(f"Researching with {provider}({model_name}) and Topic: {query} ...") 299 | 300 | if pydantic_ai_stream: 301 | raise NotImplementedError( 302 | "Streaming currently only possible for Agents with " 303 | "output_type str not pydantic model" 304 | ) 305 | # logger.info("Streaming model response ...") 306 | # result = await manager.run(**mgr_cfg) 307 | # aync for chunk in result.stream_text(): # .run(**mgr_cfg) as result: 308 | # async with manager.run_stream(user_prompt=query) as stream: 309 | # async for chunk in stream.stream_text(): 310 | # logger.info(str(chunk)) 311 | # result = await stream.get_result() 312 | else: 313 | logger.info("Waiting for model response ...") 314 | # FIXME deprecated warning manager.run(), query unknown type 315 | # FIXME [call-overload] error: No overload variant of "run" of "Agent" 316 | # matches argument type "dict[str, list[dict[str, str]] | 317 | # Sequence[str | ImageUrl | AudioUrl | DocumentUrl | VideoUrl | 318 | # BinaryContent] | UsageLimits | None]" 319 | result = await manager.run(**mgr_cfg) # type: ignore[reportDeprecated,reportUnknownArgumentType,reportCallOverload,call-overload] 320 | 321 | logger.info(f"Result: {result}") 322 | logger.info(f"Usage statistics: {result.usage()}") 323 | 324 | 325 | def setup_agent_env( 326 | provider: str, 327 | query: UserPromptType, 328 | chat_config: ChatConfig | BaseModel, 329 | chat_env_config: AppEnv, 330 | ) -> EndpointConfig: 331 | """ 332 | Sets up the environment for an agent by configuring provider settings, prompts, 333 | API key, and usage limits. 334 | 335 | Args: 336 | provider (str): The name of the provider. 337 | query (UserPromptType): The messages or queries to be sent to the agent. 338 | chat_config (ChatConfig | BaseModel): The configuration object containing 339 | provider and prompt settings. 340 | chat_env_config (AppEnv): The application environment configuration 341 | containing API keys. 342 | 343 | Returns: 344 | EndpointConfig: The configuration object for the agent. 345 | """ 346 | 347 | if not isinstance(chat_config, ChatConfig): 348 | raise TypeError("'chat_config' of invalid type: ChatConfig expected") 349 | msg: str | None 350 | # FIXME context manager try-catch 351 | # with error_handling_context("setup_agent_env()"): 352 | provider_config = get_provider_config(provider, chat_config.providers) 353 | 354 | prompts = chat_config.prompts 355 | api_key = get_api_key(provider, chat_env_config) 356 | 357 | if provider.lower() == "ollama": 358 | # TODO move usage limits to config 359 | usage_limits = UsageLimits(request_limit=10, total_tokens_limit=100000) 360 | else: 361 | if api_key is None: 362 | msg = f"API key for provider '{provider}' is not set." 
363 | logger.error(msg) 364 | raise ValueError(msg) 365 | # TODO Separate Gemini request into function 366 | if provider.lower() == "gemini": 367 | if isinstance(query, str): 368 | query = ModelRequest.user_text_prompt(query) 369 | elif isinstance(query, list): # type: ignore[reportUnnecessaryIsInstance] 370 | # query = [ 371 | # ModelRequest.user_text_prompt( 372 | # str(msg.get("content", "")) 373 | # ) # type: ignore[reportUnknownArgumentType] 374 | # if isinstance(msg, dict) 375 | # else msg 376 | # for msg in query 377 | # ] 378 | raise NotImplementedError("Currently conflicting with UserPromptType") 379 | else: 380 | msg = f"Unsupported query type for Gemini: {type(query)}" 381 | logger.error(msg) 382 | raise TypeError(msg) 383 | # TODO move usage limits to config 384 | usage_limits = UsageLimits(request_limit=10, total_tokens_limit=10000) 385 | 386 | return EndpointConfig.model_validate( 387 | { 388 | "provider": provider, 389 | "query": query, 390 | "api_key": api_key, 391 | "prompts": prompts, 392 | "provider_config": provider_config, 393 | "usage_limits": usage_limits, 394 | } 395 | ) 396 | -------------------------------------------------------------------------------- /src/app/agents/llm_model_funs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions and classes for managing and instantiating LLM models and providers. 3 | 4 | This module provides functions to retrieve API keys, provider configurations, and 5 | to create model instances for supported LLM providers such as Gemini and OpenAI. 6 | It also includes logic for assembling model dictionaries for system agents. 7 | """ 8 | 9 | from pydantic import HttpUrl 10 | from pydantic_ai.models.gemini import GeminiModel 11 | from pydantic_ai.models.openai import OpenAIModel 12 | from pydantic_ai.providers.openai import OpenAIProvider 13 | 14 | from app.config.config_app import API_SUFFIX 15 | from app.config.data_models import AppEnv, EndpointConfig, ModelDict, ProviderConfig 16 | from app.utils.error_messages import generic_exception, get_key_error 17 | from app.utils.log import logger 18 | 19 | 20 | def get_api_key( 21 | provider: str, 22 | chat_env_config: AppEnv, 23 | ) -> str | None: 24 | """Retrieve API key from chat env config variable.""" 25 | 26 | provider = provider.upper() 27 | if provider == "OLLAMA": 28 | return None 29 | else: 30 | key_name = f"{provider}{API_SUFFIX}" 31 | if hasattr(chat_env_config, key_name): 32 | logger.info(f"Found API key for provider '{provider}'") 33 | return getattr(chat_env_config, key_name) 34 | else: 35 | raise KeyError( 36 | f"API key for provider '{provider}' not found in configuration." 
37 | ) 38 | 39 | 40 | def get_provider_config( 41 | provider: str, providers: dict[str, ProviderConfig] 42 | ) -> dict[str, str | HttpUrl]: 43 | """Retrieve configuration settings for the specified provider.""" 44 | 45 | try: 46 | model_name = providers[provider].model_name 47 | base_url = providers[provider].base_url 48 | except KeyError as e: 49 | msg = get_key_error(str(e)) 50 | logger.error(msg) 51 | raise KeyError(msg) 52 | except Exception as e: 53 | msg = generic_exception(str(e)) 54 | logger.exception(msg) 55 | raise Exception(msg) 56 | else: 57 | return { 58 | "model_name": model_name, 59 | "base_url": base_url, 60 | } 61 | 62 | 63 | def _create_model(endpoint_config: EndpointConfig) -> GeminiModel | OpenAIModel: 64 | """Create a model that uses model_name and base_url for inference API""" 65 | 66 | if endpoint_config.provider.lower() == "gemini": 67 | # FIXME EndpointConfig: TypeError: 'ModelRequest' object is not iterable. 68 | raise NotImplementedError( 69 | "Current typing raises TypeError: 'ModelRequest' object is not iterable." 70 | ) 71 | elif endpoint_config.provider.lower() == "huggingface": 72 | # FIXME HF not working with pydantic-ai OpenAI model 73 | raise NotImplementedError( 74 | "Hugging Face provider is not implemented yet. Please use Gemini or OpenAI." 75 | " https://huggingface.co/docs/inference-providers/providers/hf-inference" 76 | ) 77 | # headers = { 78 | # "Authorization": f"Bearer {endpoint_config.api_key}", 79 | # } 80 | # def query(payload): 81 | # response = requests.post(API_URL, headers=headers, json=payload) 82 | # return response.json() 83 | # query({"inputs": "", "parameters": {},}) 84 | else: 85 | base_url_str = str(endpoint_config.provider_config.base_url) 86 | return OpenAIModel( 87 | model_name=endpoint_config.provider_config.model_name, 88 | provider=OpenAIProvider( 89 | base_url=base_url_str, 90 | api_key=endpoint_config.api_key, 91 | ), 92 | ) 93 | 94 | 95 | def get_models( 96 | endpoint_config: EndpointConfig, 97 | include_researcher: bool = False, 98 | include_analyst: bool = False, 99 | include_synthesiser: bool = False, 100 | ) -> ModelDict: 101 | """ 102 | Get the models for the system agents. 103 | Args: 104 | endpoint_config (EndpointConfig): Configuration for the model. 105 | include_analyist (Optional[bool]): Whether to include the analyst model. 106 | Defaults to False. 107 | include_synthesiser (Optional[bool]): Whether to include the synthesiser model. 108 | Defaults to False. 109 | Returns: 110 | Dict[str, GeminiModel | OpenAIModel]: A dictionary containing the models for the 111 | system agents. 
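To make the provider plumbing concrete, the sketch below builds a model for the ollama entry shipped in config_chat.json by hand; the scaffolding around the internal _create_model helper is illustrative only, and some pydantic-ai versions may insist on a placeholder API key even for local endpoints.

```python
# Illustrative only: building a model for the "ollama" entry of config_chat.json.
from app.agents.llm_model_funs import _create_model
from app.config.data_models import EndpointConfig, ProviderConfig

endpoint = EndpointConfig.model_validate(
    {
        "provider": "ollama",
        "api_key": None,  # get_api_key() returns None for Ollama
        "prompts": {},
        "provider_config": ProviderConfig.model_validate(
            {
                "model_name": "granite3-dense",
                "base_url": "http://localhost:11434/v1",
            }
        ),
    }
)
model = _create_model(endpoint)  # an OpenAIModel pointed at the local endpoint
```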
112 | """ 113 | 114 | model = _create_model(endpoint_config) 115 | return ModelDict.model_validate( 116 | { 117 | "model_manager": model, 118 | "model_researcher": model if include_researcher else None, 119 | "model_analyst": model if include_analyst else None, 120 | "model_synthesiser": model if include_synthesiser else None, 121 | } 122 | ) 123 | -------------------------------------------------------------------------------- /src/app/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/src/app/config/__init__.py -------------------------------------------------------------------------------- /src/app/config/config_app.py: -------------------------------------------------------------------------------- 1 | """Configuration constants for the application.""" 2 | 3 | # MARK: chat env 4 | API_SUFFIX = "_API_KEY" 5 | CHAT_DEFAULT_PROVIDER = "github" 6 | 7 | 8 | # MARK: project 9 | PROJECT_NAME = "rd-mas-example" 10 | 11 | 12 | # MARK: paths 13 | CHAT_CONFIG_FILE = "config/config_chat.json" 14 | LOGS_PATH = "logs" 15 | EVAL_CONFIG_FILE = "config/config_eval.json" 16 | -------------------------------------------------------------------------------- /src/app/config/config_chat.json: -------------------------------------------------------------------------------- 1 | { 2 | "providers": { 3 | "huggingface": { 4 | "model_name": "facebook/bart-large-mnli", 5 | "base_url": "https://router.huggingface.co/hf-inference/models" 6 | }, 7 | "gemini": { 8 | "model_name": "gemini-1.5-flash-8b", 9 | "base_url": "https://generativelanguage.googleapis.com/v1beta" 10 | }, 11 | "github": { 12 | "model_name": "GPT-4o", 13 | "base_url": "https://models.inference.ai.azure.com" 14 | }, 15 | "grok": { 16 | "model_name": "grok-2-1212", 17 | "base_url": "https://api.x.ai/v1" 18 | }, 19 | "ollama": { 20 | "model_name": "granite3-dense", 21 | "base_url": "http://localhost:11434/v1" 22 | }, 23 | "openrouter": { 24 | "model_name": "google/gemini-2.0-flash-exp:free", 25 | "base_url": "https://openrouter.ai/api/v1" 26 | }, 27 | "perplexity": { 28 | "model_name": "sonar", 29 | "base_url": "https://api.perplexity.ai" 30 | }, 31 | "restack": { 32 | "model_name": "deepseek-chat", 33 | "base_url": "https://ai.restack.io" 34 | }, 35 | "together": { 36 | "model_name": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free", 37 | "base_url": "https://api.together.xyz/v1" 38 | } 39 | }, 40 | "inference": { 41 | "usage_limits": 10000, 42 | "usage_limits_ollama": 10000, 43 | "result_retries": 3, 44 | "result_retries_ollama": 3 45 | }, 46 | "prompts": { 47 | "system_prompt_manager": "You are a manager overseeing research and analysis tasks. Your role is to coordinate the efforts of the research, analysis and synthesiser agents to provide comprehensive answers to user queries. The researcher should gather and analyze data relevant to the topic. The whole result must be handed to the analyst, who will check it for accuracy of the assumptions, facts, and conclusions. If an analyst is present the researchers output has to be approved by the analyst. If the analyst does not approve of the researcher's result, all of the analyst's response and the topic must be handed back to the researcher to be refined. Repeat this loop until the analyst approves. 
If a synthesiser is present and once the analyst approves, the synthesiser should output a well-formatted scientific report using the data given.", 48 | "system_prompt_researcher": "You are a researcher. Gather and analyze data relevant to the topic. Use the search tool to gather data. Always check accuracy of assumptions, facts, and conclusions.", 49 | "system_prompt_analyst": "You are a research analyst. Use your analytical skills to check the accuracy of assumptions, facts, and conclusions in the data provided. Provide relevant feedback if you do not approve. Only approve if you do not have any feedback to give.", 50 | "system_prompt_synthesiser": "You are a scientific writing assistant. Your task is to output a well-formatted scientific report using the data given. Leave the provided facts, conclusions, and sources unchanged." 51 | } 52 | } -------------------------------------------------------------------------------- /src/app/config/config_eval.json: -------------------------------------------------------------------------------- 1 | { 2 | "metrics_and_weights": { 3 | "time_taken": 0.167, 4 | "task_success": 0.167, 5 | "coordination_quality": 0.167, 6 | "tool_efficiency": 0.167, 7 | "planning_rational": 0.167, 8 | "output_similarity": 0.167 9 | } 10 | } -------------------------------------------------------------------------------- /src/app/config/data_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data models for agent system configuration and results. 3 | 4 | This module defines Pydantic models for representing research and analysis results, 5 | summaries, provider and agent configurations, and model dictionaries used throughout 6 | the application. These models ensure type safety and validation for data exchanged 7 | between agents and system components.
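The equal metric weights in config_eval.json above map onto the EvalConfig model defined in this module; a small sketch of loading and checking them is given below, where the relative path is an assumption about the working directory.

```python
# Sketch: validating the shipped evaluation config against EvalConfig.
from app.config.data_models import EvalConfig
from app.utils.load_configs import load_config

# Path assumed relative to the repository root.
eval_config = load_config("src/app/config/config_eval.json", EvalConfig)
assert isinstance(eval_config, EvalConfig)
weights = eval_config.metrics_and_weights
print(sum(weights.values()))  # roughly 1.0 with six weights of 0.167
```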
8 | """ 9 | 10 | from typing import Any, TypeVar 11 | 12 | from pydantic import BaseModel, ConfigDict, HttpUrl, field_validator 13 | from pydantic_ai.messages import ModelRequest 14 | from pydantic_ai.models import Model 15 | from pydantic_ai.tools import Tool 16 | from pydantic_ai.usage import UsageLimits 17 | from pydantic_settings import BaseSettings, SettingsConfigDict 18 | 19 | type UserPromptType = ( 20 | str | list[dict[str, str]] | ModelRequest | None 21 | ) # (1) Input validation 22 | ResultBaseType = TypeVar( 23 | "ResultBaseType", bound=BaseModel 24 | ) # (2) Generic type for model results 25 | 26 | 27 | class ResearchResult(BaseModel): 28 | """Research results from the research agent.""" 29 | 30 | topic: str | dict[str, str] 31 | findings: list[str] | dict[str, str | list[str]] 32 | sources: list[str] | dict[str, str | list[str]] 33 | 34 | 35 | class AnalysisResult(BaseModel): 36 | """Analysis results from the analysis agent.""" 37 | 38 | insights: list[str] 39 | recommendations: list[str] 40 | approval: bool 41 | 42 | 43 | class ResearchSummary(BaseModel): 44 | """Expected model response of research on a topic""" 45 | 46 | topic: str 47 | key_points: list[str] 48 | key_points_explanation: list[str] 49 | conclusion: str 50 | sources: list[str] 51 | 52 | 53 | class ProviderConfig(BaseModel): 54 | """Configuration for a model provider""" 55 | 56 | model_name: str 57 | base_url: HttpUrl 58 | 59 | 60 | class ChatConfig(BaseModel): 61 | """Configuration settings for agents and model providers""" 62 | 63 | providers: dict[str, ProviderConfig] 64 | inference: dict[str, str | int] 65 | prompts: dict[str, str] 66 | 67 | 68 | class EndpointConfig(BaseModel): 69 | """Configuration for an agent""" 70 | 71 | provider: str 72 | query: UserPromptType = None 73 | api_key: str | None 74 | prompts: dict[str, str] 75 | provider_config: ProviderConfig 76 | usage_limits: UsageLimits | None = None 77 | 78 | 79 | class AgentConfig(BaseModel): 80 | """Configuration for an agent""" 81 | 82 | model: Model # (1) Instance expected 83 | output_type: type[BaseModel] # (2) Class expected 84 | system_prompt: str 85 | # FIXME tools: list[Callable[..., Awaitable[Any]]] 86 | tools: list[Any] = [] # (3) List of tools will be validated at creation 87 | retries: int = 3 88 | 89 | # Avoid pydantic.errors.PydanticSchemaGenerationError: 90 | # Unable to generate pydantic-core schema for . 91 | # Avoid Pydantic errors related to non-Pydantic types 92 | model_config = ConfigDict( 93 | arbitrary_types_allowed=True 94 | ) # (4) Suppress Error non-Pydantic types caused by 95 | 96 | @field_validator("tools", mode="before") 97 | def validate_tools(cls, v: list[Any]) -> list[Tool | None]: 98 | """Validate that all tools are instances of Tool.""" 99 | if not v: 100 | return [] 101 | if not all(isinstance(t, Tool) for t in v): 102 | raise ValueError("All tools must be Tool instances") 103 | return v 104 | 105 | 106 | class ModelDict(BaseModel): 107 | """Dictionary of models used to create agent systems""" 108 | 109 | model_manager: Model 110 | model_researcher: Model | None 111 | model_analyst: Model | None 112 | model_synthesiser: Model | None 113 | model_config = ConfigDict(arbitrary_types_allowed=True) 114 | 115 | 116 | class EvalConfig(BaseModel): 117 | metrics_and_weights: dict[str, float] 118 | 119 | 120 | class AppEnv(BaseSettings): 121 | """ 122 | Application environment settings loaded from environment variables or .env file. 
123 | 124 | This class uses Pydantic's BaseSettings to manage API keys and configuration 125 | for various inference endpoints, tools, and logging/monitoring services. 126 | Environment variables are loaded from a .env file by default. 127 | """ 128 | 129 | # Inference endpoints 130 | GEMINI_API_KEY: str = "" 131 | GITHUB_API_KEY: str = "" 132 | GROK_API_KEY: str = "" 133 | HUGGINGFACE_API_KEY: str = "" 134 | OPENROUTER_API_KEY: str = "" 135 | PERPLEXITY_API_KEY: str = "" 136 | RESTACK_API_KEY: str = "" 137 | TOGETHER_API_KEY: str = "" 138 | 139 | # Tools 140 | TAVILY_API_KEY: str = "" 141 | 142 | # Logging/Monitoring/Tracing 143 | AGENTOPS_API_KEY: str = "" 144 | LOGFIRE_API_KEY: str = "" 145 | WANDB_API_KEY: str = "" 146 | 147 | model_config = SettingsConfigDict( 148 | env_file=".env", env_file_encoding="utf-8", extra="ignore" 149 | ) 150 | -------------------------------------------------------------------------------- /src/app/evals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/src/app/evals/__init__.py -------------------------------------------------------------------------------- /src/app/evals/metrics.py: -------------------------------------------------------------------------------- 1 | def time_taken(start_time: float, end_time: float) -> float: 2 | """Calculate duration between start and end timestamps 3 | 4 | Args: 5 | start_time: Timestamp when execution started 6 | end_time: Timestamp when execution completed 7 | 8 | Returns: 9 | Duration in seconds with microsecond precision 10 | """ 11 | 12 | # TODO implement 13 | return end_time - start_time 14 | 15 | 16 | def output_similarity(agent_output: str, expected_answer: str) -> bool: 17 | """ 18 | Determine to what degree the agent's output matches the expected answer. 19 | 20 | Args: 21 | agent_output (str): The output produced by the agent. 22 | expected_answer (str): The correct or expected answer. 23 | 24 | Returns: 25 | bool: True if the output matches the expected answer, False otherwise. 26 | """ 27 | 28 | # TODO score instead of bool 29 | return agent_output.strip() == expected_answer.strip() 30 | -------------------------------------------------------------------------------- /src/app/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main entry point for the Agents-eval application. 3 | 4 | This module initializes the agentic system, loads configuration files, 5 | handles user input, and orchestrates the multi-agent workflow using 6 | asynchronous execution. It integrates logging, tracing, and authentication, 7 | and supports both CLI and programmatic execution. 
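The output_similarity metric above returns a strict boolean and carries a TODO to produce a score instead; a graded variant could be sketched with difflib from the standard library, where the function name and the 0 to 1 scale are assumptions rather than the project's decided design.

```python
# Sketch for the metrics.py TODO: a graded similarity score instead of a strict bool.
from difflib import SequenceMatcher


def output_similarity_score(agent_output: str, expected_answer: str) -> float:
    """Return a similarity ratio in [0.0, 1.0] between output and expected answer."""
    return SequenceMatcher(None, agent_output.strip(), expected_answer.strip()).ratio()


# e.g. output_similarity_score("42 is the answer", "42 is the answer!") ~ 0.97
```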
8 | """ 9 | 10 | from asyncio import run 11 | from pathlib import Path 12 | from sys import argv 13 | 14 | from logfire import span 15 | from weave import op 16 | 17 | from app.__init__ import __version__ 18 | from app.agents.agent_system import get_manager, run_manager, setup_agent_env 19 | from app.config.config_app import ( 20 | CHAT_CONFIG_FILE, 21 | CHAT_DEFAULT_PROVIDER, 22 | EVAL_CONFIG_FILE, 23 | PROJECT_NAME, 24 | ) 25 | from app.config.data_models import AppEnv, ChatConfig, EvalConfig 26 | from app.utils.error_messages import generic_exception 27 | from app.utils.load_configs import load_config 28 | from app.utils.log import logger 29 | from app.utils.login import login 30 | from app.utils.utils import parse_args 31 | 32 | 33 | @op() 34 | async def main( 35 | chat_provider: str = CHAT_DEFAULT_PROVIDER, 36 | query: str = "", 37 | include_researcher: bool = False, 38 | include_analyst: bool = False, 39 | include_synthesiser: bool = False, 40 | pydantic_ai_stream: bool = False, 41 | chat_config_file: str = CHAT_CONFIG_FILE, 42 | ) -> None: 43 | """ 44 | Main entry point for the application. 45 | 46 | Args: 47 | chat_provider (str): The inference chat_provider to be used. 48 | query (str): The query to be processed by the agent. 49 | include_researcher (bool): Whether to include the researcher in the process. 50 | include_analyst (bool): Whether to include the analyst in the process. 51 | include_synthesiser (bool): Whether to include the synthesiser in the process. 52 | pydantic_ai_stream (bool): Whether to use Pydantic AI streaming. 53 | chat_config_file (str): Full path to the configuration file. 54 | 55 | Returns: 56 | None 57 | """ 58 | 59 | logger.info(f"Starting app '{PROJECT_NAME}' v{__version__}") 60 | try: 61 | with span("main()"): 62 | if not chat_provider: 63 | chat_provider = input("Which inference chat_provider to use? ") 64 | if not query: 65 | query = input("What would you like to research? ") 66 | 67 | chat_config_path = Path(__file__).parent / CHAT_CONFIG_FILE 68 | eval_config_path = Path(__file__).parent / EVAL_CONFIG_FILE 69 | chat_config = load_config(chat_config_path, ChatConfig) 70 | eval_config = load_config(eval_config_path, EvalConfig) 71 | chat_env_config = AppEnv() 72 | agent_env = setup_agent_env( 73 | chat_provider, query, chat_config, chat_env_config 74 | ) 75 | # TODO remove noqa and type ignore for unused variable 76 | metrics_and_weights = eval_config.metrics_and_weights # noqa: F841 # type: ignore[reportUnusedVariable] 77 | 78 | # FIXME enhance login, not every run? 
79 | login(PROJECT_NAME, chat_env_config) 80 | 81 | manager = get_manager( 82 | agent_env.provider, 83 | agent_env.provider_config, 84 | agent_env.api_key, 85 | agent_env.prompts, 86 | include_researcher, 87 | include_analyst, 88 | include_synthesiser, 89 | ) 90 | await run_manager( 91 | manager, 92 | agent_env.query, 93 | agent_env.provider, 94 | agent_env.usage_limits, 95 | pydantic_ai_stream, 96 | ) 97 | logger.info(f"Exiting app '{PROJECT_NAME}'") 98 | 99 | except Exception as e: 100 | msg = generic_exception(f"Aborting app '{PROJECT_NAME}' with: {e}") 101 | logger.exception(msg) 102 | raise Exception(msg) from e 103 | 104 | 105 | if __name__ == "__main__": 106 | args = parse_args(argv[1:]) 107 | run(main(**args)) 108 | -------------------------------------------------------------------------------- /src/app/py.typed: -------------------------------------------------------------------------------- 1 | # PEP 561 – Distributing and Packaging Type Information 2 | # https://peps.python.org/pep-0561/ -------------------------------------------------------------------------------- /src/app/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qte77/Agents-eval/7401b21b53bd8307e7fe8465b466595a7687f8c8/src/app/utils/__init__.py -------------------------------------------------------------------------------- /src/app/utils/error_messages.py: -------------------------------------------------------------------------------- 1 | """ 2 | Error message utilities for the Agents-eval application. 3 | 4 | This module provides concise helper functions for generating standardized 5 | error messages related to configuration loading and validation. 6 | """ 7 | 8 | from pathlib import Path 9 | 10 | 11 | def api_connection_error(error: str) -> str: 12 | """ 13 | Generate a error message for API connection error. 14 | """ 15 | return f"API connection error: {error}" 16 | 17 | 18 | def failed_to_load_config(error: str) -> str: 19 | """ 20 | Generate a error message for configuration loading failure. 21 | """ 22 | return f"Failed to load config: {error}" 23 | 24 | 25 | def file_not_found(file_path: str | Path) -> str: 26 | """ 27 | Generate an error message for a missing configuration file. 28 | """ 29 | return f"File not found: {file_path}" 30 | 31 | 32 | def generic_exception(error: str) -> str: 33 | """ 34 | Generate a generic error message. 35 | """ 36 | return f"Exception: {error}" 37 | 38 | 39 | def invalid_data_model_format(error: str) -> str: 40 | """ 41 | Generate an error message for invalid pydantic data model format. 42 | """ 43 | return f"Invalid pydantic data model format: {error}" 44 | 45 | 46 | def invalid_json(error: str) -> str: 47 | """ 48 | Generate an error message for invalid JSON in a configuration file. 49 | """ 50 | return f"Invalid JSON: {error}" 51 | 52 | 53 | def invalid_type(expected_type: str, actual_type: str) -> str: 54 | """ 55 | Generate an error message for invalid Type. 56 | """ 57 | return f"Type Error: Expected {expected_type}, got {actual_type} instead." 58 | 59 | 60 | def get_key_error(error: str) -> str: 61 | """ 62 | Generate a generic error message. 63 | """ 64 | return f"Key Error: {error}" 65 | -------------------------------------------------------------------------------- /src/app/utils/load_configs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration loading utilities. 
3 | 4 | Provides a generic function for loading and validating JSON configuration 5 | files against Pydantic models, with error handling and logging support. 6 | """ 7 | 8 | import json 9 | from pathlib import Path 10 | 11 | from pydantic import BaseModel, ValidationError 12 | 13 | from app.utils.error_messages import ( 14 | failed_to_load_config, 15 | file_not_found, 16 | invalid_data_model_format, 17 | invalid_json, 18 | ) 19 | from app.utils.log import logger 20 | 21 | 22 | def load_config(config_path: str | Path, data_model: type[BaseModel]) -> BaseModel: 23 | """ 24 | Generic configuration loader that validates against any Pydantic model. 25 | 26 | Args: 27 | config_path: Path to the JSON configuration file 28 | model: Pydantic model class for validation 29 | 30 | Returns: 31 | Validated configuration instance 32 | """ 33 | 34 | try: 35 | with open(config_path, encoding="utf-8") as f: 36 | data = json.load(f) 37 | return data_model.model_validate(data) 38 | except FileNotFoundError as e: 39 | msg = file_not_found(config_path) 40 | logger.error(msg) 41 | raise FileNotFoundError(msg) from e 42 | except json.JSONDecodeError as e: 43 | msg = invalid_json(str(e)) 44 | logger.error(msg) 45 | raise ValueError(msg) from e 46 | except ValidationError as e: 47 | msg = invalid_data_model_format(str(e)) 48 | logger.error(msg) 49 | raise ValidationError(msg) from e 50 | except Exception as e: 51 | msg = failed_to_load_config(str(e)) 52 | logger.exception(msg) 53 | raise Exception(msg) from e 54 | -------------------------------------------------------------------------------- /src/app/utils/load_settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions and classes for loading application settings and configuration. 3 | 4 | This module defines the AppEnv class for managing environment variables using Pydantic, 5 | and provides a function to load and validate application configuration from a JSON file. 6 | """ 7 | 8 | import json 9 | from pathlib import Path 10 | 11 | from pydantic_settings import BaseSettings, SettingsConfigDict 12 | 13 | from app.config.data_models import ChatConfig 14 | from app.utils.error_messages import ( 15 | failed_to_load_config, 16 | file_not_found, 17 | invalid_json, 18 | ) 19 | from app.utils.log import logger 20 | 21 | 22 | class AppEnv(BaseSettings): 23 | """ 24 | Application environment settings loaded from environment variables or .env file. 25 | 26 | This class uses Pydantic's BaseSettings to manage API keys and configuration 27 | for various inference endpoints, tools, and logging/monitoring services. 28 | Environment variables are loaded from a .env file by default. 29 | """ 30 | 31 | # Inference endpoints 32 | GEMINI_API_KEY: str = "" 33 | GITHUB_API_KEY: str = "" 34 | GROK_API_KEY: str = "" 35 | HUGGINGFACE_API_KEY: str = "" 36 | OPENROUTER_API_KEY: str = "" 37 | PERPLEXITY_API_KEY: str = "" 38 | RESTACK_API_KEY: str = "" 39 | TOGETHER_API_KEY: str = "" 40 | 41 | # Tools 42 | TAVILY_API_KEY: str = "" 43 | 44 | # Logging/Monitoring/Tracing 45 | AGENTOPS_API_KEY: str = "" 46 | LOGFIRE_TOKEN: str = "" 47 | WANDB_API_KEY: str = "" 48 | 49 | model_config = SettingsConfigDict( 50 | env_file=".env", env_file_encoding="utf-8", extra="ignore" 51 | ) 52 | 53 | 54 | chat_config = AppEnv() 55 | 56 | 57 | def load_config(config_path: str | Path) -> ChatConfig: 58 | """ 59 | Load and validate application configuration from a JSON file. 60 | 61 | Args: 62 | config_path (str): Path to the JSON configuration file. 
63 | 64 | Returns: 65 | ChatConfig: An instance of ChatConfig with validated configuration data. 66 | 67 | Raises: 68 | FileNotFoundError: If the configuration file does not exist. 69 | json.JSONDecodeError: If the file contains invalid JSON. 70 | Exception: For any other unexpected errors during loading or validation. 71 | """ 72 | 73 | try: 74 | with open(config_path) as f: 75 | config_data = json.load(f) 76 | except FileNotFoundError as e: 77 | msg = file_not_found(config_path) 78 | logger.error(msg) 79 | raise FileNotFoundError(msg) from e 80 | except json.JSONDecodeError as e: 81 | msg = invalid_json(str(e)) 82 | logger.error(msg) 83 | raise json.JSONDecodeError(msg, str(config_path), 0) from e 84 | except Exception as e: 85 | msg = failed_to_load_config(str(e)) 86 | logger.exception(msg) 87 | raise Exception(msg) from e 88 | 89 | return ChatConfig.model_validate(config_data) 90 | -------------------------------------------------------------------------------- /src/app/utils/log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Set up the logger with custom settings. 3 | Logs are written to a file with automatic rotation. 4 | """ 5 | 6 | from loguru import logger 7 | 8 | from app.config.config_app import LOGS_PATH 9 | 10 | logger.add( 11 | f"{LOGS_PATH}/{{time}}.log", 12 | rotation="1 MB", 13 | # level="DEBUG", 14 | retention="7 days", 15 | compression="zip", 16 | ) 17 | -------------------------------------------------------------------------------- /src/app/utils/login.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides utility functions for managing login state and initializing 3 | the environment for a given project. It includes functionality to load and save 4 | login state, perform a one-time login, and check if the user is logged in. 5 | """ 6 | 7 | from os import environ 8 | 9 | from agentops import init as agentops_init 10 | from logfire import configure as logfire_conf 11 | from wandb import login as wandb_login 12 | from weave import init as weave_init 13 | 14 | from app.agents.llm_model_funs import get_api_key 15 | from app.config.data_models import AppEnv 16 | from app.utils.error_messages import generic_exception 17 | from app.utils.log import logger 18 | 19 | 20 | def login(project_name: str, chat_env_config: AppEnv): 21 | """ 22 | Logs in to the workspace and initializes the environment for the given project. 23 | Args: 24 | project_name (str): The name of the project to initialize. 25 | chat_env_config (AppEnv): The application environment configuration 26 | containing the API keys. 
27 | Returns: 28 | None 29 | """ 30 | 31 | try: 32 | logger.info(f"Logging in to the workspaces for project: {project_name}") 33 | environ["AGENTOPS_LOGGING_TO_FILE"] = "FALSE" 34 | agentops_init( 35 | default_tags=[project_name], 36 | api_key=get_api_key("AGENTOPS", chat_env_config), 37 | ) 38 | logfire_conf(token=get_api_key("LOGFIRE", chat_env_config)) 39 | wandb_login(key=get_api_key("WANDB", chat_env_config)) 40 | weave_init(project_name) 41 | except Exception as e: 42 | msg = generic_exception(str(e)) 43 | logger.exception(e) 44 | raise Exception(msg) from e 45 | -------------------------------------------------------------------------------- /src/app/utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides utility functions and context managers for handling configurations, 3 | error handling, and setting up agent environments. 4 | 5 | Functions: 6 | load_config(config_path: str) -> Config: 7 | Load and validate configuration from a JSON file. 8 | 9 | print_research_Result(summary: Dict, usage: Usage) -> None: 10 | Output structured summary of the research topic. 11 | 12 | error_handling_context(operation_name: str, console: Console = None): 13 | Context manager for handling errors during operations. 14 | 15 | setup_agent_env(config: Config, console: Console = None) -> AgentConfig: 16 | Set up the agent environment based on the provided configuration. 17 | """ 18 | 19 | from pydantic_ai.usage import Usage 20 | 21 | from app.config.data_models import ResearchSummary 22 | from app.utils.log import logger 23 | 24 | 25 | def log_research_result(summary: ResearchSummary, usage: Usage) -> None: 26 | """ 27 | Prints the research summary and usage details in a formatted manner. 28 | 29 | Args: 30 | summary (Dict): A dictionary containing the research summary with keys 'topic', 31 | 'key_points', 'key_points_explanation', and 'conclusion'. 32 | usage (Usage): An object containing usage details to be printed. 33 | """ 34 | 35 | logger.info(f"\n=== Research Summary: {summary.topic} ===") 36 | logger.info("\nKey Points:") 37 | for i, point in enumerate(summary.key_points, 1): 38 | logger.info(f"{i}. {point}") 39 | logger.info("\nKey Points Explanation:") 40 | for i, point in enumerate(summary.key_points_explanation, 1): 41 | logger.info(f"{i}. {point}") 42 | logger.info(f"\nConclusion: {summary.conclusion}") 43 | logger.info(f"\nResponse structure: {list(dict(summary).keys())}") 44 | logger.info(usage) 45 | 46 | 47 | def parse_args(argv: list[str]) -> dict[str, str | bool]: 48 | """ 49 | Parse command line arguments into a dictionary. 50 | 51 | This function processes a list of command-line arguments, 52 | extracting recognized options and their values. 53 | Supported arguments include flags (e.g., --help, --include-researcher 54 | and key-value pairs (e.g., `--chat-provider=ollama`). 55 | If the `--help` flag is present, a list of available commands and their 56 | descriptions is printed, and an empty dictionary is returned. 57 | 58 | Recognized arguments as list[str] 59 | ``` 60 | --help Display help information and exit. 61 | --version Display version information. 62 | --chat-provider= Specify the chat provider to use. 63 | --query= Specify the query to process. 64 | --include-researcher Include the researcher agent. 65 | --include-analyst Include the analyst agent. 66 | --include-synthesiser Include the synthesiser agent. 67 | --no-stream Disable streaming output. 
68 | --chat-config-file= Specify the path to the chat configuration file. 69 | ``` 70 | 71 | Returns: 72 | `dict[str, str | bool]`: A dictionary mapping argument names 73 | (with leading '--' removed and hyphens replaced by underscores) 74 | to their values (`str` for key-value pairs, `bool` for flags). 75 | Returns an empty dict if `--help` is specified. 76 | 77 | Example: 78 | >>> `parse_args(['--chat-provider=ollama', '--include-researcher'])` 79 | returns `{'chat_provider': 'ollama', 'include_researcher': True}` 80 | """ 81 | 82 | commands = { 83 | "--help": "Display help information", 84 | "--version": "Display version information", 85 | "--chat-provider": "Specify the chat provider to use", 86 | "--query": "Specify the query to process", 87 | "--include-researcher": "Include the researcher agent", 88 | "--include-analyst": "Include the analyst agent", 89 | "--include-synthesiser": "Include the synthesiser agent", 90 | "--no-stream": "Disable streaming output", 91 | "--chat-config-file": "Specify the path to the chat configuration file", 92 | } 93 | parsed_args: dict[str, str | bool] = {} 94 | 95 | if "--help" in argv: 96 | print("Available commands:") 97 | for cmd, desc in commands.items(): 98 | print(f"{cmd}: {desc}") 99 | return parsed_args 100 | 101 | for arg in argv: 102 | if arg.split("=", 1)[0] in commands.keys(): 103 | key, value = arg.split("=", 1) if "=" in arg else (arg, True) 104 | key = key.lstrip("--").replace("-", "_") 105 | parsed_args[key] = value 106 | 107 | if parsed_args: 108 | logger.info(f"Used arguments: {parsed_args}") 109 | 110 | return parsed_args 111 | -------------------------------------------------------------------------------- /src/examples/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "providers": { 3 | "gemini": { 4 | "model_name": "gemini-1.5-flash-8b", 5 | "base_url": "https://generativelanguage.googleapis.com/v1beta" 6 | }, 7 | "github": { 8 | "model_name": "GPT-4o", 9 | "base_url": "https://models.inference.ai.azure.com" 10 | }, 11 | "huggingface": { 12 | "model_name": "Qwen/QwQ-32B-Preview", 13 | "base_url": "https://api-inference.huggingface.co/v1" 14 | }, 15 | "ollama": { 16 | "model_name": "granite3-dense", 17 | "base_url": "http://localhost:11434/v1" 18 | }, 19 | "openrouter": { 20 | "model_name": "google/gemini-2.0-flash-lite-preview-02-05:free", 21 | "base_url": "https://openrouter.ai/api/v1" 22 | }, 23 | "restack": { 24 | "model_name": "deepseek-chat", 25 | "base_url": "https://ai.restack.io" 26 | } 27 | }, 28 | "prompts": { 29 | "system_prompt": "You are a helpful research assistant. Extract key information about the topic and provide a structured summary.", 30 | "user_prompt": "Provide a research summary about", 31 | "system_prompt_researcher": "You are a manager overseeing research and analysis tasks. Your role is to coordinate the efforts of the research and analysis agents to provide comprehensive answers to user queries.", 32 | "system_prompt_manager": "You are a research assistant. Your task is to find relevant information about the topic provided. Use the search tool to gather data and synthesize it into a concise summary.", 33 | "system_prompt_analyst": "You are a data scientist. Your task is to analyze the data provided and extract meaningful insights. Use your analytical skills to identify trends, patterns, and correlations." 
34 |   } 35 | } -------------------------------------------------------------------------------- /src/examples/run_simple_agent_no_tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple example of using a Pydantic AI agent to generate a structured summary of a 3 | research topic. 4 | """ 5 | 6 | from os import path 7 | 8 | from .utils.agent_simple_no_tools import get_research 9 | from .utils.utils import ( 10 |     get_api_key, 11 |     get_provider_config, 12 |     load_config, 13 |     print_research_Result, 14 | ) 15 | 16 | CONFIG_FILE = "config.json" 17 | 18 | 19 | def main(): 20 |     """Main function to run the research agent.""" 21 | 22 |     config_path = path.join(path.dirname(__file__), CONFIG_FILE) 23 |     config = load_config(config_path) 24 | 25 |     provider = input("Which inference provider to use? ") 26 |     topic = input("What topic would you like to research? ") 27 | 28 |     api_key = get_api_key(provider) 29 |     provider_config = get_provider_config(provider, config) 30 | 31 |     result = get_research(topic, config.prompts, provider, provider_config, api_key) 32 |     print_research_Result(result.data, result.usage()) 33 | 34 | 35 | if __name__ == "__main__": 36 |     main() 37 | -------------------------------------------------------------------------------- /src/examples/run_simple_agent_system.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example demonstrates how to run a simple agent system that consists of a manager 3 | agent, a research agent, and an analysis agent. The manager agent delegates research 4 | and analysis tasks to the corresponding agents and combines the results to provide a 5 | comprehensive answer to the user query. 6 | https://ai.pydantic.dev/multi-agent-applications/#agent-delegation 7 | """ 8 | 9 | from asyncio import run 10 | from os import path 11 | 12 | from openai import UnprocessableEntityError 13 | from pydantic_ai.common_tools.duckduckgo import duckduckgo_search_tool 14 | from pydantic_ai.exceptions import UnexpectedModelBehavior, UsageLimitExceeded 15 | from pydantic_ai.models.openai import OpenAIModel 16 | from pydantic_ai.usage import UsageLimits 17 | 18 | from .utils.agent_simple_system import SystemAgent, add_tools_to_manager_agent 19 | from .utils.data_models import AnalysisResult, ResearchResult 20 | from .utils.utils import create_model, get_api_key, get_provider_config, load_config 21 | 22 | CONFIG_FILE = "config.json" 23 | 24 | 25 | def get_models(model_config: dict) -> tuple[OpenAIModel, OpenAIModel, OpenAIModel]: 26 |     """Get the models for the system agents.""" 27 |     model_researcher = create_model(**model_config) 28 |     model_analyst = create_model(**model_config) 29 |     model_manager = create_model(**model_config) 30 |     return model_researcher, model_analyst, model_manager 31 | 32 | 33 | def get_manager( 34 |     model_manager: OpenAIModel, 35 |     model_researcher: OpenAIModel, 36 |     model_analyst: OpenAIModel, 37 |     prompts: dict[str, str], 38 | ) -> SystemAgent: 39 |     """Build the researcher, analyst, and manager agents and return the manager.""" 40 |     researcher = SystemAgent( 41 |         model_researcher, 42 |         ResearchResult, 43 |         prompts["system_prompt_researcher"], 44 |         [duckduckgo_search_tool()], 45 |     ) 46 |     analyst = SystemAgent( 47 |         model_analyst, AnalysisResult, prompts["system_prompt_analyst"] 48 |     ) 49 |     manager = SystemAgent( 50 |         model_manager, ResearchResult, prompts["system_prompt_manager"] 51 |     ) 52 |     add_tools_to_manager_agent(manager, researcher, analyst) 53 |     return manager 54 | 55 | 56 | async def main(): 57 |     """Main function to 
run the research system.""" 58 | 59 |     provider = input("Which inference provider to use? ") 60 |     query = input("What would you like to research? ") 61 | 62 |     config_path = path.join(path.dirname(__file__), CONFIG_FILE) 63 |     config = load_config(config_path) 64 | 65 |     api_key = get_api_key(provider) 66 |     provider_config = get_provider_config(provider, config) 67 |     usage_limits = UsageLimits(request_limit=10, total_tokens_limit=4000) 68 | 69 |     model_config = { 70 |         "base_url": provider_config["base_url"], 71 |         "model_name": provider_config["model_name"], 72 |         "api_key": api_key, 73 |         "provider": provider, 74 |     } 75 |     manager = get_manager(*get_models(model_config), config.prompts) 76 | 77 |     print(f"\nResearching: {query}...") 78 | 79 |     try: 80 |         result = await manager.run(query, usage_limits=usage_limits) 81 |     except (UnexpectedModelBehavior, UnprocessableEntityError) as e: 82 |         print(f"Error: Model returned unexpected result: {e}") 83 |     except UsageLimitExceeded as e: 84 |         print(f"Usage limit exceeded: {e}") 85 |     else: 86 |         print(f"\nFindings: {result.data.findings}") 87 |         print(f"Sources: {result.data.sources}") 88 |         print("\nUsage statistics:") 89 |         print(result.usage()) 90 | 91 | 92 | if __name__ == "__main__": 93 |     run(main()) 94 | -------------------------------------------------------------------------------- /src/examples/run_simple_agent_tools.py: -------------------------------------------------------------------------------- 1 | """Run the dice game agent using simple tools.""" 2 | 3 | from os import path 4 | 5 | from .utils.agent_simple_tools import get_dice 6 | from .utils.utils import ( 7 |     get_api_key, 8 |     get_provider_config, 9 |     load_config, 10 | ) 11 | 12 | CONFIG_FILE = "config.json" 13 | system_prompt = ( 14 |     "You're a dice game, you should roll the die and see if the number " 15 |     "you get back matches the user's guess. If so, tell them they're a winner. " 16 |     "Use the player's name in the response." 17 | ) 18 | 19 | 20 | def main(): 21 |     """Run the dice game agent.""" 22 | 23 |     provider = input("Which inference provider to use? ") 24 |     player_name = input("Enter your name: ") 25 |     guess = input("Guess a number between 1 and 6: ") 26 | 27 |     config_path = path.join(path.dirname(__file__), CONFIG_FILE) 28 |     config = load_config(config_path) 29 | 30 |     api_key = get_api_key(provider) 31 |     provider_config = get_provider_config(provider, config) 32 | 33 |     result = get_dice( 34 |         player_name, guess, system_prompt, provider, api_key, provider_config 35 |     ) 36 |     print(result.data) 37 |     print(f"{result._result_tool_name=}") 38 |     print(result.usage()) 39 | 40 | 41 | if __name__ == "__main__": 42 |     main() 43 | -------------------------------------------------------------------------------- /src/examples/utils/agent_simple_no_tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains a function to create a research agent with the specified model, 3 | result type, and system prompt. 
4 | """ 5 | 6 | from sys import exit 7 | 8 | from openai import APIConnectionError 9 | from pydantic_ai import Agent 10 | from pydantic_ai.agent import AgentRunResult 11 | from pydantic_ai.models.openai import OpenAIModel 12 | 13 | from .data_models import ResearchSummary 14 | from .utils import create_model 15 | 16 | 17 | def _create_research_agent( 18 |     model: OpenAIModel, result_type: type[ResearchSummary], system_prompt: str 19 | ) -> Agent: 20 |     """ 21 |     Create a research agent with the specified model, result type, and system prompt. 22 |     """ 23 | 24 |     return Agent(model=model, result_type=result_type, system_prompt=system_prompt) 25 | 26 | 27 | def get_research( 28 |     topic: str, 29 |     prompts: dict[str, str], 30 |     provider: str, 31 |     provider_config: dict[str, str], 32 |     api_key: str, 33 | ) -> AgentRunResult: 34 |     """Run the research agent to generate a structured summary of a research topic.""" 35 | 36 |     model = create_model( 37 |         provider_config["base_url"], provider_config["model_name"], api_key, provider 38 |     ) 39 |     agent = _create_research_agent(model, ResearchSummary, prompts["system_prompt"]) 40 | 41 |     print(f"\nResearching {topic}...") 42 |     try: 43 |         result = agent.run_sync(f"{prompts['user_prompt']} {topic}") 44 |     except APIConnectionError as e: 45 |         print(f"Error connecting to API: {e}") 46 |         exit() 47 |     except Exception as e: 48 |         print(f"Unexpected error: {e}") 49 |         exit() 50 |     else: 51 |         return result 52 | -------------------------------------------------------------------------------- /src/examples/utils/agent_simple_system.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains a simple system of agents that can be used to research and analyze 3 | data. 4 | """ 5 | 6 | from pydantic_ai import Agent, RunContext 7 | from pydantic_ai.models.openai import OpenAIModel 8 | 9 | from .data_models import AnalysisResult, ResearchResult 10 | 11 | 12 | class SystemAgent(Agent): 13 |     """A generic system agent that can be used to research and analyze data.""" 14 | 15 |     def __init__( 16 |         self, 17 |         model: OpenAIModel, 18 |         result_type: type[ResearchResult | AnalysisResult], 19 |         system_prompt: str, 20 |         result_retries: int = 3, 21 |         tools: list | None = None, 22 |     ): 23 |         super().__init__( 24 |             model, 25 |             result_type=result_type, 26 |             system_prompt=system_prompt, 27 |             result_retries=result_retries, 28 |             tools=tools or [], 29 |         ) 30 | 31 | 32 | def add_tools_to_manager_agent( 33 |     manager_agent: SystemAgent, research_agent: SystemAgent, analysis_agent: SystemAgent 34 | ) -> None: 35 |     """Attach research and analysis delegation tools to the manager agent.""" 36 | 37 |     @manager_agent.tool 38 |     async def delegate_research(ctx: RunContext[None], query: str) -> ResearchResult: 39 |         """Delegate research task to ResearchAgent.""" 40 |         result = await research_agent.run(query, usage=ctx.usage) 41 |         return result.data 42 | 43 |     @manager_agent.tool 44 |     async def delegate_analysis(ctx: RunContext[None], data: str) -> AnalysisResult: 45 |         """Delegate analysis task to AnalysisAgent.""" 46 |         result = await analysis_agent.run(data, usage=ctx.usage) 47 |         return result.data 48 | -------------------------------------------------------------------------------- /src/examples/utils/agent_simple_tools.py: -------------------------------------------------------------------------------- 1 | """Simple agent for the dice game example.""" 2 | 3 | from openai import APIConnectionError 4 | from pydantic_ai import Agent, Tool 5 | from pydantic_ai.agent import AgentRunResult 6 | from 
pydantic_ai.models.openai import OpenAIModel 7 | 8 | from .tools import get_player_name, roll_die 9 | from .utils import create_model 10 | 11 | 12 | class _DiceGameAgent(Agent): 13 |     """Dice game agent.""" 14 | 15 |     def __init__(self, model: OpenAIModel, system_prompt: str): 16 |         super().__init__( 17 |             model=model, 18 |             deps_type=str, 19 |             system_prompt=system_prompt, 20 |             tools=[  # (1)! 21 |                 Tool(roll_die, takes_ctx=False), 22 |                 Tool(get_player_name, takes_ctx=True), 23 |             ], 24 |         ) 25 | 26 | 27 | def get_dice( 28 |     player_name: str, 29 |     guess: str, 30 |     system_prompt: str, 31 |     provider: str, 32 |     api_key: str, 33 |     config: dict, 34 | ) -> AgentRunResult: 35 |     """Run the dice game agent.""" 36 | 37 |     model = create_model(config["base_url"], config["model_name"], api_key, provider) 38 |     agent = _DiceGameAgent(model, system_prompt) 39 | 40 |     try: 41 |         # usage_limits=UsageLimits(request_limit=5, total_tokens_limit=300), 42 |         result = agent.run_sync(f"Player is guessing {guess}...", deps=player_name) 43 |     except APIConnectionError as e: 44 |         print(f"Error connecting to API: {e}") 45 |         exit() 46 |     except Exception as e: 47 |         print(f"Unexpected error: {e}") 48 |         exit() 49 |     else: 50 |         return result 51 | -------------------------------------------------------------------------------- /src/examples/utils/data_models.py: -------------------------------------------------------------------------------- 1 | """Example of a module with data models""" 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class ResearchResult(BaseModel): 7 |     """Research results from the research agent.""" 8 | 9 |     topic: str 10 |     findings: list[str] 11 |     sources: list[str] 12 | 13 | 14 | class AnalysisResult(BaseModel): 15 |     """Analysis results from the analysis agent.""" 16 | 17 |     insights: list[str] 18 |     recommendations: list[str] 19 | 20 | 21 | class ResearchSummary(BaseModel): 22 |     """Expected model response of research on a topic""" 23 | 24 |     topic: str 25 |     key_points: list[str] 26 |     key_points_explanation: list[str] 27 |     conclusion: str 28 | 29 | 30 | class ProviderConfig(BaseModel): 31 |     """Configuration for a model provider""" 32 | 33 |     model_name: str 34 |     base_url: str 35 | 36 | 37 | class Config(BaseModel): 38 |     """Configuration settings for the research agent and model providers""" 39 | 40 |     providers: dict[str, ProviderConfig] 41 |     prompts: dict[str, str] 42 | -------------------------------------------------------------------------------- /src/examples/utils/tools.py: -------------------------------------------------------------------------------- 1 | """Example tools for the utils example.""" 2 | 3 | from random import randint 4 | 5 | from pydantic_ai import RunContext 6 | 7 | 8 | def roll_die() -> str: 9 |     """Tool to roll a die.""" 10 | 11 |     # Return the rolled value as a string so the agent can compare it with 12 |     # the player's guess. 13 |     return str(randint(1, 6)) 14 | 15 | 16 | def get_player_name(ctx: RunContext[str]) -> str: 17 |     """Get the player's name from the context.""" 18 |     return ctx.deps 19 | -------------------------------------------------------------------------------- /src/examples/utils/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for running the research agent example.""" 2 | 3 | from json import load 4 | from os import getenv 5 | from sys import exit 6 | 7 | from dotenv import load_dotenv 8 | from pydantic import ValidationError 9 | from pydantic_ai.models.openai import OpenAIModel 10 | from pydantic_ai.providers.openai 
import OpenAIProvider 11 | from pydantic_ai.usage import Usage 12 | 13 | from .data_models import Config, ResearchSummary 14 | 15 | API_SUFFIX = "_API_KEY" 16 | 17 | 18 | def load_config(config_path: str) -> Config: 19 |     """Load and validate configuration from a JSON file.""" 20 | 21 |     try: 22 |         with open(config_path) as file: 23 |             config_data = load(file) 24 |         config = Config.model_validate(config_data) 25 |     except FileNotFoundError: 26 |         raise FileNotFoundError(f"Configuration file not found: {config_path}") 27 | 28 |     except ValidationError as e: 29 |         raise ValueError(f"Invalid configuration format: {e}") from e 30 | 31 |     except Exception as e: 32 |         raise Exception(f"Error loading configuration: {e}") from e 33 | 34 |     else: 35 |         return config 36 | 37 | 38 | def get_api_key(provider: str) -> str | None: 39 |     """Retrieve API key from environment variable.""" 40 | 41 |     # TODO replace with pydantic-settings ? 42 |     load_dotenv() 43 | 44 |     if provider.lower() == "ollama": 45 |         return None 46 |     else: 47 |         return getenv(f"{provider.upper()}{API_SUFFIX}") 48 | 49 | 50 | def get_provider_config(provider: str, config: Config) -> dict[str, str]: 51 |     """Retrieve configuration settings for the specified provider.""" 52 | 53 |     try: 54 |         model_name = config.providers[provider].model_name 55 |         base_url = config.providers[provider].base_url 56 |     except KeyError as e: 57 |         raise ValueError(f"Missing configuration for {provider}: {e}.") from e 58 | 59 |     except Exception as e: 60 |         raise Exception(f"Error loading provider configuration: {e}") from e 61 | 62 |     else: 63 |         return { 64 |             "model_name": model_name, 65 |             "base_url": base_url, 66 |         } 67 | 68 | 69 | def create_model( 70 |     base_url: str, 71 |     model_name: str, 72 |     api_key: str | None = None, 73 |     provider: str | None = None, 74 | ) -> OpenAIModel: 75 |     """Create a model that uses base_url as inference API""" 76 | 77 |     if api_key is None and (provider is None or provider.lower() != "ollama"): 78 |         raise ValueError("API key is required for model.") 79 | 80 |     else: 81 |         return OpenAIModel( 82 |             model_name, provider=OpenAIProvider(base_url=base_url, api_key=api_key) 83 |         ) 84 | 85 | 86 | def print_research_Result(summary: ResearchSummary, usage: Usage) -> None: 87 |     """Output structured summary of the research topic.""" 88 | 89 |     print(f"\n=== Research Summary: {summary.topic} ===") 90 |     print("\nKey Points:") 91 |     for i, point in enumerate(summary.key_points, 1): 92 |         print(f"{i}. {point}") 93 |     print("\nKey Points Explanation:") 94 |     for i, point in enumerate(summary.key_points_explanation, 1): 95 |         print(f"{i}. 
{point}") 96 | print(f"\nConclusion: {summary.conclusion}") 97 | 98 | print(f"\nResponse structure: {list(dict(summary).keys())}") 99 | print(usage) 100 | -------------------------------------------------------------------------------- /src/gui/components/footer.py: -------------------------------------------------------------------------------- 1 | from streamlit import caption, divider 2 | 3 | 4 | def render_footer(footer_caption: str): 5 | """Render the page footer.""" 6 | divider() 7 | caption(footer_caption) 8 | -------------------------------------------------------------------------------- /src/gui/components/header.py: -------------------------------------------------------------------------------- 1 | from streamlit import divider, title 2 | 3 | 4 | def render_header(header_title: str): 5 | """Render the page header with title.""" 6 | title(header_title) 7 | divider() 8 | -------------------------------------------------------------------------------- /src/gui/components/output.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from streamlit import empty, info 4 | 5 | 6 | def render_output( 7 | result: Any = None, info_str: str | None = None, type: str | None = None 8 | ): 9 | """ 10 | Renders the output in a Streamlit app based on the provided type. 11 | 12 | Args: 13 | result (Any, optional): The content to be displayed. Can be JSON, code 14 | markdown, or plain text. 15 | info (str, optional): The information message to be displayed if result is None. 16 | type (str, optional): The type of the result content. Can be 'json', 'code', 17 | 'md', or other for plain text. 18 | 19 | Returns: 20 | Out: None 21 | """ 22 | 23 | if result: 24 | output_container = empty() 25 | output_container.write(result) 26 | # match type: 27 | # case "json": 28 | # json(result) 29 | # case "code": 30 | # code(result) 31 | # case "md": 32 | # markdown(result) 33 | # case _: 34 | # text(result) 35 | # # st.write(result) 36 | else: 37 | info(info_str) 38 | -------------------------------------------------------------------------------- /src/gui/components/prompts.py: -------------------------------------------------------------------------------- 1 | from streamlit import text_area 2 | 3 | 4 | def render_prompt_editor( 5 | prompt_name: str, prompt_value: str, height: int = 150 6 | ) -> str | None: 7 | return text_area( 8 | f"{prompt_name.replace('_', ' ').title()}", value=prompt_value, height=height 9 | ) 10 | -------------------------------------------------------------------------------- /src/gui/components/sidebar.py: -------------------------------------------------------------------------------- 1 | from streamlit import sidebar 2 | 3 | from gui.config.config import PAGES 4 | 5 | 6 | def render_sidebar(sidebar_title: str): 7 | sidebar.title(sidebar_title) 8 | selected_page = sidebar.radio(" ", PAGES) 9 | 10 | # st.sidebar.divider() 11 | # st.sidebar.info(" ") 12 | return selected_page 13 | -------------------------------------------------------------------------------- /src/gui/config/config.py: -------------------------------------------------------------------------------- 1 | APP_PATH = "app" 2 | PAGES = ["Home", "Settings", "Prompts", "App"] 3 | PROMPTS_DEFAULT = { 4 | "system_prompt_manager": ( 5 | "You are a manager overseeing research and analysis tasks..." 6 | ), 7 | "system_prompt_researcher": ("You are a researcher. Gather and analyze data..."), 8 | "system_prompt_analyst": ( 9 | "You are a research analyst. 
Use your analytical skills..." 10 | ), 11 | "system_prompt_synthesiser": ( 12 | "You are a research synthesiser. Use your analytical skills..." 13 | ), 14 | } 15 | -------------------------------------------------------------------------------- /src/gui/config/styling.py: -------------------------------------------------------------------------------- 1 | from streamlit import markdown, set_page_config 2 | 3 | 4 | def add_custom_styling(page_title: str): 5 | set_page_config( 6 | page_title=f"{page_title}", 7 | page_icon="🤖", 8 | layout="wide", 9 | initial_sidebar_state="expanded", 10 | ) 11 | 12 | custom_css = """ 13 | 19 | """ 20 | markdown(custom_css, unsafe_allow_html=True) 21 | -------------------------------------------------------------------------------- /src/gui/config/text.py: -------------------------------------------------------------------------------- 1 | HOME_INFO = "Select 'App' to start using the system" 2 | HOME_HEADER = "Welcome to the Multi-Agent Research System" 3 | HOME_DESCRIPTION = """ 4 | This system allows you to: 5 | 6 | - Run research queries using multiple specialized agents 7 | - Configure agent settings and prompts 8 | - View detailed results from your research 9 | 10 | Use the sidebar to navigate between different sections of the application. 11 | """ 12 | PAGE_TITLE = "MAS Eval 👾⚗️🧠💡" 13 | PROMPTS_WARNING = "No prompts found. Using default prompts." 14 | PROMPTS_HEADER = "Agent Prompts" 15 | RUN_APP_HEADER = "Run Research App" 16 | RUN_APP_QUERY_PLACEHOLDER = "What would you like to research?" 17 | RUN_APP_PROVIDER_PLACEHOLDER = "Provider?" 18 | RUN_APP_BUTTON = "Run Query" 19 | RUN_APP_OUTPUT_PLACEHOLDER = "Run the agent to see results here" 20 | RUN_APP_QUERY_WARNING = "Please enter a query" 21 | RUN_APP_QUERY_RUN_INFO = "Running query: " 22 | SETTINGS_HEADER = "Settings" 23 | SETTINGS_PROVIDER_LABEL = "Select Provider" 24 | SETTINGS_PROVIDER_PLACEHOLDER = "Select Provider" 25 | SETTINGS_ADD_PROVIDER = "Add New Provider" 26 | SETTINGS_API_KEY_LABEL = "API Key" 27 | OUTPUT_SUBHEADER = "Output" 28 | -------------------------------------------------------------------------------- /src/gui/pages/home.py: -------------------------------------------------------------------------------- 1 | from streamlit import header, info, markdown 2 | 3 | from gui.config.text import HOME_DESCRIPTION, HOME_HEADER, HOME_INFO 4 | 5 | 6 | def render_home(): 7 | header(HOME_HEADER) 8 | markdown(HOME_DESCRIPTION) 9 | info(HOME_INFO) 10 | -------------------------------------------------------------------------------- /src/gui/pages/prompts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Streamlit component for editing agent system prompts. 3 | 4 | This module provides a function to render and edit prompt configurations 5 | for agent roles using a Streamlit-based UI. It validates the input configuration, 6 | displays warnings if prompts are missing, and allows interactive editing of each prompt. 
7 | """ 8 | 9 | from pydantic import BaseModel 10 | from streamlit import error, header, warning 11 | 12 | from app.config.data_models import ChatConfig 13 | from app.utils.error_messages import invalid_type 14 | from app.utils.log import logger 15 | from gui.components.prompts import render_prompt_editor 16 | from gui.config.config import PROMPTS_DEFAULT 17 | from gui.config.text import PROMPTS_HEADER, PROMPTS_WARNING 18 | 19 | 20 | def render_prompts(chat_config: ChatConfig | BaseModel): # -> dict[str, str]: 21 | """ 22 | Render and edit the prompt configuration for agent roles in the Streamlit UI. 23 | """ 24 | 25 | header(PROMPTS_HEADER) 26 | 27 | if not isinstance(chat_config, ChatConfig): 28 | msg = invalid_type("ChatConfig", type(chat_config).__name__) 29 | logger.error(msg) 30 | error(msg) 31 | return None 32 | 33 | # updated = False 34 | prompts = chat_config.prompts 35 | 36 | if not prompts: 37 | warning(PROMPTS_WARNING) 38 | prompts = PROMPTS_DEFAULT 39 | 40 | updated_prompts = prompts.copy() 41 | 42 | # Edit prompts 43 | for prompt_key, prompt_value in prompts.items(): 44 | new_value = render_prompt_editor(prompt_key, prompt_value, height=200) 45 | if new_value != prompt_value and new_value is not None: 46 | updated_prompts[prompt_key] = new_value 47 | # updated = True 48 | 49 | # return updated_prompts if updated else prompts 50 | -------------------------------------------------------------------------------- /src/gui/pages/run_app.py: -------------------------------------------------------------------------------- 1 | """ 2 | Streamlit interface for running the agentic system interactively. 3 | 4 | This module defines the render_app function, which provides a Streamlit-based UI 5 | for users to select a provider, enter a query, and execute the main agent workflow. 6 | Results and errors are displayed in real time, supporting asynchronous execution. 7 | """ 8 | 9 | from streamlit import button, exception, header, info, subheader, text_input, warning 10 | 11 | from app.main import main 12 | from app.utils.log import logger 13 | from gui.components.output import render_output 14 | from gui.config.text import ( 15 | OUTPUT_SUBHEADER, 16 | RUN_APP_BUTTON, 17 | RUN_APP_HEADER, 18 | RUN_APP_OUTPUT_PLACEHOLDER, 19 | RUN_APP_PROVIDER_PLACEHOLDER, 20 | RUN_APP_QUERY_PLACEHOLDER, 21 | RUN_APP_QUERY_RUN_INFO, 22 | RUN_APP_QUERY_WARNING, 23 | ) 24 | 25 | 26 | async def render_app(provider: str | None = None): 27 | """ 28 | Render the main app interface for running agentic queries via Streamlit. 29 | 30 | Displays input fields for provider and query, a button to trigger execution, 31 | and an area for output or error messages. Handles async invocation of the 32 | main agent workflow and logs any exceptions. 
33 | """ 34 | 35 | header(RUN_APP_HEADER) 36 | if provider is None: 37 | provider = text_input(RUN_APP_PROVIDER_PLACEHOLDER) 38 | query = text_input(RUN_APP_QUERY_PLACEHOLDER) 39 | 40 | subheader(OUTPUT_SUBHEADER) 41 | if button(RUN_APP_BUTTON): 42 | if query: 43 | info(f"{RUN_APP_QUERY_RUN_INFO} {query}") 44 | try: 45 | result = await main(chat_provider=provider, query=query) 46 | render_output(result) 47 | except Exception as e: 48 | render_output(None) 49 | exception(e) 50 | logger.exception(e) 51 | else: 52 | warning(RUN_APP_QUERY_WARNING) 53 | else: 54 | render_output(RUN_APP_OUTPUT_PLACEHOLDER) 55 | -------------------------------------------------------------------------------- /src/gui/pages/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Streamlit settings UI for provider and agent configuration. 3 | 4 | This module provides a function to render and edit agent system settings, 5 | including provider selection and related options, within the Streamlit GUI. 6 | It validates the input configuration and ensures correct typing before rendering. 7 | """ 8 | 9 | from streamlit import error, header, selectbox 10 | 11 | from app.config.data_models import BaseModel, ChatConfig 12 | from app.utils.error_messages import invalid_type 13 | from app.utils.log import logger 14 | from gui.config.text import SETTINGS_HEADER, SETTINGS_PROVIDER_LABEL 15 | 16 | 17 | def render_settings(chat_config: ChatConfig | BaseModel) -> str: 18 | """ 19 | Render and edit agent system settings in the Streamlit UI. 20 | 21 | Displays a header and a selectbox for choosing the inference provider. 22 | Validates that the input is a ChatConfig instance and displays an error if not. 23 | """ 24 | header(SETTINGS_HEADER) 25 | 26 | # updated = False 27 | # updated_config = config.copy() 28 | 29 | if not isinstance(chat_config, ChatConfig): 30 | msg = invalid_type("ChatConfig", type(chat_config).__name__) 31 | logger.error(msg) 32 | error(msg) 33 | return msg 34 | 35 | provider = selectbox( 36 | label=SETTINGS_PROVIDER_LABEL, 37 | options=chat_config.providers.keys(), 38 | ) 39 | 40 | # Run options 41 | # col1, col2 = st.columns(2) 42 | # with col1: 43 | # streamed_output = st.checkbox( 44 | # "Stream Output", value=config.get("streamed_output", False) 45 | # ) 46 | # with col2: 47 | # st.checkbox("Include Sources", value=True) # include_sources 48 | 49 | # Allow adding new providers 50 | # new_provider = st.text_input("Add New Provider") 51 | # api_key = st.text_input(f"{provider} API Key", type="password") 52 | # if st.button("Add Provider") and new_provider and new_provider not in providers: 53 | # providers.append(new_provider) 54 | # updated_config["providers"] = providers 55 | # updated_config["api_key"] = api_key 56 | # updated = True 57 | # st.success(f"Added provider: {new_provider}") 58 | 59 | # # Update config if changed 60 | # if ( 61 | # include_a != config.get("include_a", False) 62 | # or include_b != config.get("include_b", False) 63 | # or streamed_output != config.get("streamed_output", False) 64 | # ): 65 | # updated_config["include_a"] = include_a 66 | # updated_config["include_b"] = include_b 67 | # updated_config["streamed_output"] = streamed_output 68 | # updated = True 69 | 70 | return provider 71 | -------------------------------------------------------------------------------- /src/run_gui.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module sets up and runs a Streamlit application for 
a Multi-Agent System. 3 | 4 | The application includes the following components: 5 | - Header 6 | - Sidebar for configuration options 7 | - Main content area for prompts 8 | - Footer 9 | 10 | The main function loads the configuration, renders the UI components, and handles the 11 | execution of the Multi-Agent System based on user input. 12 | 13 | Functions: 14 | - run_app(): Placeholder function to run the main application logic. 15 | - main(): Main function to set up and run the Streamlit application. 16 | """ 17 | 18 | from asyncio import run 19 | from pathlib import Path 20 | 21 | from app.config.config_app import CHAT_CONFIG_FILE, CHAT_DEFAULT_PROVIDER 22 | from app.config.data_models import ChatConfig 23 | from app.utils.load_configs import load_config 24 | from app.utils.log import logger 25 | from gui.components.sidebar import render_sidebar 26 | from gui.config.config import APP_PATH 27 | from gui.config.styling import add_custom_styling 28 | from gui.config.text import PAGE_TITLE 29 | from gui.pages.home import render_home 30 | from gui.pages.prompts import render_prompts 31 | from gui.pages.run_app import render_app 32 | from gui.pages.settings import render_settings 33 | 34 | # TODO create sidebar tabs, move settings to page, 35 | # set readme.md as home, separate prompts into page 36 | 37 | chat_config_pfile = Path(__file__).parent / APP_PATH / CHAT_CONFIG_FILE 38 | chat_config = load_config(chat_config_pfile, ChatConfig) 39 | provider = CHAT_DEFAULT_PROVIDER 40 | logger.info(f"Default provider: {CHAT_DEFAULT_PROVIDER}") 41 | 42 | 43 | async def main(): 44 | add_custom_styling(PAGE_TITLE) 45 | selected_page = render_sidebar(PAGE_TITLE) 46 | 47 | if selected_page == "Home": 48 | render_home() 49 | elif selected_page == "Settings": 50 | # TODO temp save settings to be used in gui 51 | provider = render_settings(chat_config) 52 | logger.info(f"Page 'Settings' provider: {provider}") 53 | elif selected_page == "Prompts": 54 | render_prompts(chat_config) 55 | elif selected_page == "App": 56 | logger.info(f"Page 'App' provider: {CHAT_DEFAULT_PROVIDER}") 57 | await render_app(CHAT_DEFAULT_PROVIDER) 58 | 59 | 60 | if __name__ == "__main__": 61 | run(main()) 62 | -------------------------------------------------------------------------------- /tests/test_agent_system.py: -------------------------------------------------------------------------------- 1 | from app.agents.agent_system import get_manager 2 | from app.config.data_models import ProviderConfig 3 | 4 | 5 | def test_get_manager_minimal(): 6 | provider = "github" 7 | provider_config = ProviderConfig.model_validate( 8 | {"model_name": "test-model", "base_url": "http://test.com"} 9 | ) 10 | api_key = "test" 11 | prompts = {"system_prompt_manager": "test"} 12 | agent = get_manager(provider, provider_config, api_key, prompts) 13 | assert hasattr(agent, "run") 14 | -------------------------------------------------------------------------------- /tests/test_env.py: -------------------------------------------------------------------------------- 1 | from pytest import MonkeyPatch 2 | 3 | from app.config.data_models import AppEnv 4 | 5 | 6 | def test_app_env_loads_env_vars(monkeypatch: MonkeyPatch): 7 | monkeypatch.setenv("GEMINI_API_KEY", "test-gemini") 8 | env = AppEnv() 9 | assert env.GEMINI_API_KEY == "test-gemini" 10 | -------------------------------------------------------------------------------- /tests/test_metrics_output_similarity.py: -------------------------------------------------------------------------------- 1 | """ 2 | 
Tests for the output_similarity metric. 3 | 4 | This module verifies that the output_similarity metric correctly identifies when 5 | an agent's output matches the expected answer. 6 | """ 7 | 8 | from app.evals.metrics import output_similarity 9 | 10 | 11 | def test_output_similarity_exact_match(): 12 |     assert output_similarity("42", "42") is True 13 | 14 | 15 | def test_output_similarity_whitespace(): 16 |     assert output_similarity("  answer  ", "answer") is True 17 | 18 | 19 | def test_output_similarity_incorrect(): 20 |     assert output_similarity("foo", "bar") is False 21 | -------------------------------------------------------------------------------- /tests/test_metrics_time_taken.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the time_taken metric. 3 | 4 | This module verifies that the time_taken metric correctly computes the elapsed 5 | time between two timestamps, ensuring accurate measurement of agent execution 6 | duration for evaluation purposes. 7 | """ 8 | 9 | import asyncio 10 | import time 11 | 12 | import pytest 13 | 14 | from app.evals.metrics import time_taken 15 | 16 | 17 | @pytest.mark.asyncio 18 | async def test_time_taken_metric(): 19 |     """Scenario: Calculate time taken for agent execution""" 20 | 21 |     # Given: Start and end timestamps 22 |     start_time = time.perf_counter() 23 |     await asyncio.sleep(0.1) 24 |     end_time = time.perf_counter() 25 | 26 |     # When: Calculating time taken 27 |     result = time_taken(start_time, end_time) 28 | 29 |     # Then: Verify correct duration calculation 30 |     assert result == pytest.approx(0.1, abs=0.05) 31 | -------------------------------------------------------------------------------- /tests/test_provider_config.py: -------------------------------------------------------------------------------- 1 | from pytest import MonkeyPatch 2 | 3 | from app.config.data_models import ProviderConfig 4 | 5 | 6 | def test_provider_config_parsing(monkeypatch: MonkeyPatch): 7 |     pcfg = ProviderConfig.model_validate( 8 |         {"model_name": "foo", "base_url": "https://foo.bar"} 9 |     ) 10 |     assert pcfg.model_name == "foo" 11 |     assert pcfg.base_url == "https://foo.bar" 12 | --------------------------------------------------------------------------------
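A minimal usage sketch (not a file in this repository) of how the parse_args helper from src/app/utils/utils.py could be wired into a command line entry point; the file name cli_sketch.py and the __main__ guard are illustrative assumptions, while the shown behaviour follows the implementation above.

from sys import argv

from app.utils.utils import parse_args

if __name__ == "__main__":
    # Hypothetical invocation (assumed file name cli_sketch.py):
    #   python cli_sketch.py --chat-provider=ollama --include-researcher
    # parse_args returns {'chat_provider': 'ollama', 'include_researcher': True};
    # unrecognized options are skipped, and passing --help prints the available
    # commands and returns an empty dictionary instead.
    args = parse_args(argv[1:])
    print(args)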