{
  "meta": {
    "format": "forge-cite-manifest",
    "formatVersion": "1.0",
    "generatedAt": "2026-05-01T17:53:18.320Z"
  },
  "post": {
    "slug": "10-agent-evaluation-2026",
    "title": "How to evaluate an AI agent in 2026 (without lying to yourself).",
    "description": "Most agent evaluations are vibes. The benchmarks that catch real production failures: pass-rate, cost-per-task, latency, regression on existing skills, and hallucination rate.",
    "tldr": "Real agent evaluation tests pass-rate on a fixed task set, cost-per-task in dollars, latency p95, regression on existing skills (a new prompt shouldn't break old ones), and hallucination rate on adversarial inputs. Most teams measure none of these. The result is shipping agents that look good on demo and fail in production. The fix: a small, repeatable eval set you run every prompt change.",
    "url": "https://adsforge.store/10-agent-evaluation-2026/",
    "publishDate": "2026-04-24T00:00:00.000Z",
    "updatedDate": "2026-04-24T00:00:00.000Z",
    "tags": [
      "evaluation",
      "agents",
      "prompt-engineering"
    ],
    "tools": [
      "LangSmith",
      "Promptfoo",
      "Anthropic SDK"
    ],
    "affiliate": false
  },
  "author": {
    "name": "The Forge",
    "credentials": "AI editorial team focused on agent workflows. All posts reviewed by humans before publishing."
  },
  "entities": [
    "LangSmith",
    "Promptfoo",
    "Anthropic Claude",
    "OWASP"
  ],
  "claims": [
    {
      "text": "LangSmith and Promptfoo are widely-used open-source evaluation frameworks for LLM agents in 2026.",
      "source": "https://github.com/promptfoo/promptfoo",
      "date": "2026-04-15",
      "confidence": "high"
    },
    {
      "text": "OWASP's LLM Top 10 lists hallucination and over-reliance on output as primary production failure modes.",
      "source": "https://owasp.org/www-project-top-10-for-large-language-model-applications/",
      "date": "2024-10-01",
      "confidence": "high"
    },
    {
      "text": "Reddit r/MachineLearning consistently reports that production teams who skip systematic evaluation see significant regression rates when changing prompts or models.",
      "source": "https://reddit.com/r/MachineLearning/comments/1sxj6s3/",
      "date": "2026-04-10",
      "confidence": "medium"
    },
    {
      "text": "Anthropic's documentation explicitly recommends maintaining an evaluation set for any production prompt-engineered system.",
      "source": "https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering",
      "date": "2026-04-10",
      "confidence": "high"
    }
  ],
  "updateLog": [
    {
      "version": "v1",
      "date": "2026-04-24T00:00:00.000Z",
      "notes": "Initial publish."
    }
  ]
}