<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/">
  <channel>
    <title>Promptfoo on Egor Lynko</title>
    <link>https://yegor.me/tags/promptfoo/</link>
    <description>Recent content in Promptfoo on Egor Lynko</description>
    <generator>Hugo</generator>
    <language>en-us</language>
    <lastBuildDate>Sat, 06 Jun 2026 08:00:00 +0200</lastBuildDate>
    <atom:link href="https://yegor.me/tags/promptfoo/index.xml" rel="self" type="application/rss+xml" />
    <item>
      <title>Put your LLM output under test, or every prompt tweak is a silent regression</title>
      <link>https://yegor.me/posts/put-your-llm-output-under-test/</link>
      <pubDate>Sat, 06 Jun 2026 08:00:00 +0200</pubDate>
      <guid>https://yegor.me/posts/put-your-llm-output-under-test/</guid>
      <description>&lt;p&gt;In our platform, we run a multi-step AI agent that generates content for customers in the sports world. The agent is never fixed: we change the underlying models, reword prompts, restructure workflow steps, adjust the agent logic. Any of those can quietly make the output worse, and for a while the only safeguard was a human, reading generated content and forming an opinion. That doesn&amp;rsquo;t scale at all, is highly subjective, and misses slow drift completely.&lt;/p&gt;</description>
    </item>
  </channel>
</rss>
