{
  "meta": {
    "format": "forge-cite-manifest",
    "formatVersion": "1.0",
    "generatedAt": "2026-05-01T17:53:18.318Z"
  },
  "post": {
    "slug": "08-self-hosted-llm-2026",
    "title": "Self-hosted LLMs in 2026: when does it make sense vs paying for the API?",
    "description": "Local Llama / Qwen / DeepSeek vs Anthropic / OpenAI API. Cost crossover, privacy trade-offs, the operational reality.",
    "tldr": "Self-hosting becomes cheaper than paying API per-call at roughly 10 million tokens per month, depending on model size and hardware. Below that, API economics win. Above it, ops complexity dominates: GPU cooling, model swap latency, batching infrastructure. Hybrid setups (cheap local for triage, API for high-stakes) outperform pure plays in most production workloads.",
    "url": "https://adsforge.store/08-self-hosted-llm-2026/",
    "publishDate": "2026-04-26T00:00:00.000Z",
    "updatedDate": "2026-04-26T00:00:00.000Z",
    "tags": [
      "local-models",
      "evaluation",
      "agents",
      "claude",
      "openai"
    ],
    "tools": [
      "Ollama",
      "vLLM",
      "Llama",
      "Qwen",
      "DeepSeek"
    ],
    "affiliate": false
  },
  "author": {
    "name": "The Forge",
    "credentials": "AI editorial team focused on agent workflows. All posts reviewed by humans before publishing."
  },
  "entities": [
    "Llama",
    "Qwen",
    "DeepSeek",
    "vLLM",
    "Ollama",
    "Anthropic Claude"
  ],
  "claims": [
    {
      "text": "Anthropic Claude Haiku 4.5 pricing as of May 2026 is approximately $1 per million input tokens and $5 per million output tokens.",
      "source": "https://www.anthropic.com/pricing",
      "date": "2026-05-01",
      "confidence": "high"
    },
    {
      "text": "Llama 3 70B and Qwen2 72B are open-weight models commonly used for self-hosting, requiring approximately 80GB of GPU VRAM at FP8 quantisation.",
      "source": "https://en.wikipedia.org/wiki/Llama_(language_model)",
      "date": "2026-04-20",
      "confidence": "medium"
    },
    {
      "text": "Reddit r/LocalLLaMA benchmarks consistently show self-hosted 70B-class models matching or exceeding GPT-3.5-class API performance on common tasks.",
      "source": "https://reddit.com/r/LocalLLaMA/comments/1sxj6s3/",
      "date": "2026-04-15",
      "confidence": "medium"
    },
    {
      "text": "vLLM and SGLang are the leading open-source serving stacks for self-hosted LLMs in 2026, with native batching and continuous-batching support.",
      "source": "https://github.com/vllm-project/vllm",
      "date": "2026-03-10",
      "confidence": "high"
    }
  ],
  "updateLog": [
    {
      "version": "v1",
      "date": "2026-04-26T00:00:00.000Z",
      "notes": "Initial publish."
    }
  ]
}