Testing API
Comprehensive testing framework for prompts with regression detection, semantic scoring, and performance tracking.
Overview
The Testing API provides a robust framework for validating prompt behavior, detecting regressions, and tracking quality metrics over time. Test cases can be run individually or in batches, with automatic scoring and detailed comparison analytics.
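For orientation, here is a minimal sketch of the basic flow — create a test case, run it, and read the result — assuming a configured graphql-request client (the client variable and IDs below are placeholders); the operations themselves are documented in the Queries and Mutations sections that follow.
import { gql } from 'graphql-request'

// Placeholder: `client` is assumed to be a configured GraphQLClient.
const CREATE_TEST_CASE = gql`
  mutation CreateTestCase($input: TestCaseInput!) {
    createTestCase(input: $input) { id }
  }
`
const RUN_TEST_CASE = gql`
  mutation RunTestCase($input: RunTestCaseInput!) {
    runTestCase(input: $input) { passed score semanticScore isRegression }
  }
`

// 1. Create a test case for an existing prompt (IDs are illustrative).
const { createTestCase } = await client.request(CREATE_TEST_CASE, {
  input: {
    promptId: "cm123456789",
    name: "Smoke test",
    input: { text: "I love this product!" },
    expectedOutput: "positive"
  }
})

// 2. Run it against the latest prompt version and inspect the result.
const { runTestCase } = await client.request(RUN_TEST_CASE, {
  input: { testCaseId: createTestCase.id }
})
console.log(runTestCase.passed, runTestCase.score)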
Test Case Schema
TestCase Type
| Parameter | Type | Description |
|---|---|---|
| id (required) | ID | Unique identifier for the test case |
| promptId (required) | ID | ID of the prompt being tested |
| name (required) | string | Descriptive name for the test case |
| description | string | Optional detailed description |
| input (required) | JSON | Input data for the test execution |
| expectedOutput | string | Expected output for comparison |
| totalRuns | number | Total number of times this test has been run |
| passRate | number | Percentage of passing test runs (0-100) |
| avgScore | number | Average similarity score across all runs |
| lastRun | DateTime | Timestamp of the most recent execution |
| lastPassed | boolean | Whether the most recent run passed |
| createdAt | DateTime | When the test case was created |
| updatedAt | DateTime | When the test case was last modified |
TestCaseExecution Type
| Parameter | Type | Description |
|---|---|---|
| id (required) | ID | Execution record ID |
| testCaseId (required) | ID | Reference to the test case |
| executionId (required) | ID | Reference to the prompt execution |
| passed (required) | boolean | Whether the test passed (score >= 0.8) |
| score | number | Overall similarity score (0-1) |
| actualOutput (required) | string | Actual output from the prompt |
| expectedOutput | string | Expected output for comparison |
| outputDiff | JSON | Character-level diff between outputs |
| exactMatch (required) | boolean | Whether outputs matched exactly |
| semanticScore | number | Semantic similarity score using embeddings (0-1) |
| lengthDiff | number | Difference in output length (characters) |
| tokenDiff | number | Difference in token count |
| isRegression (required) | boolean | Whether this run represents a quality regression |
| regressionType | string | Type of regression: FAILED, SCORE_DROP, or LENGTH_CHANGE |
| promptVersion (required) | number | Prompt version used for this test |
| runDuration | number | Test execution duration in milliseconds |
| createdAt | DateTime | When the test was executed |
Queries
Get All Test Cases
Retrieve all test cases for a specific prompt.
query GetTestCases($promptId: ID!) {
testCases(promptId: $promptId) {
id
name
description
input
expectedOutput
totalRuns
passRate
avgScore
lastRun
lastPassed
createdAt
updatedAt
}
}
Get Single Test Case
query GetTestCase($id: ID!) {
testCase(id: $id) {
id
name
description
input
expectedOutput
totalRuns
passRate
avgScore
prompt {
id
name
}
}
}
Get Test Case Executions
Retrieve execution history for a test case with pagination.
query GetTestCaseExecutions($testCaseId: ID!, $limit: Int, $offset: Int) {
testCaseExecutions(testCaseId: $testCaseId, limit: $limit, offset: $offset) {
id
passed
score
actualOutput
expectedOutput
exactMatch
semanticScore
lengthDiff
tokenDiff
isRegression
regressionType
promptVersion
runDuration
createdAt
}
}
Get Test Case Statistics
Get detailed statistics, trends, and regression data for a test case.
query GetTestCaseStats($testCaseId: ID!) {
testCaseStats(testCaseId: $testCaseId) {
totalRuns
passRate
avgScore
recentRuns {
id
passed
score
promptVersion
createdAt
}
regressions {
id
passed
score
regressionType
promptVersion
createdAt
}
trendData {
date
totalRuns
passRate
avgScore
}
}
}
Mutations
Create Test Case
Create a new test case for a prompt.
mutation CreateTestCase($input: TestCaseInput!) {
createTestCase(input: $input) {
id
name
description
input
expectedOutput
createdAt
}
}
# Variables
{
"input": {
"promptId": "cm123456789",
"name": "Positive sentiment detection",
"description": "Tests detection of positive sentiment in customer feedback",
"input": {
"text": "I absolutely love this product! Best purchase ever."
},
"expectedOutput": "positive"
}
}
| Parameter | Type | Description |
|---|---|---|
| promptId (required) | ID | ID of the prompt to test |
| name (required) | string | Test case name |
| description | string | Optional description |
| input (required) | JSON | Input data matching the prompt schema |
| expectedOutput | string | Expected output for comparison |
Update Test Case
mutation UpdateTestCase($id: ID!, $input: TestCaseInput!) {
updateTestCase(id: $id, input: $input) {
id
name
description
input
expectedOutput
updatedAt
}
}
Delete Test Case
mutation DeleteTestCase($id: ID!) {
deleteTestCase(id: $id)
}
Run Test Case
Execute a test case against the latest (or specified) prompt version.
mutation RunTestCase($input: RunTestCaseInput!) {
runTestCase(input: $input) {
testCaseExecutionId
executionId
passed
score
actualOutput
expectedOutput
outputDiff
exactMatch
semanticScore
lengthDiff
tokenDiff
latencyMs
costUsd
isRegression
regressionType
}
}
# Variables
{
"input": {
"testCaseId": "cm123456789",
"promptVersion": 3 // Optional: defaults to latest
}
}
Batch Run Tests
Run multiple test cases in a single operation. Useful for regression testing after prompt updates.
mutation BatchRunTests($input: BatchRunTestsInput!) {
batchRunTests(input: $input) {
testCaseExecutionId
executionId
passed
score
actualOutput
isRegression
regressionType
}
}
# Variables
{
"input": {
"promptId": "cm123456789",
"testCaseIds": [
"test1",
"test2",
"test3"
],
"promptVersion": 4 // Optional
}
}
Update Expected Output
Update the expected output for a test case. Useful when accepting new behavior as the baseline.
mutation UpdateExpectedOutput($testCaseId: ID!, $output: String!) {
updateExpectedOutput(testCaseId: $testCaseId, output: $output) {
id
expectedOutput
updatedAt
}
}
Test Scoring System
Test cases are evaluated using multiple comparison methods to provide a comprehensive quality assessment.
Scoring Components
Exact Match (Boolean)
Simple string equality check. True if outputs match exactly after trimming whitespace.
Semantic Score (0-1)
Cosine similarity between output embeddings. Measures semantic meaning rather than exact wording. Calculated using the text-embedding-3-small model.
Length Difference
Character count difference between actual and expected outputs. Helps identify verbose or terse responses.
Token Difference
Token count difference. Important for cost and context window tracking.
Pass/Fail Criteria
A test passes if either of the following is true (a scoring sketch follows this list):
- Exact match is true, OR
- Semantic score is >= 0.8 (80% similarity)
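The platform computes these scores server-side; purely as an illustration of how the pieces fit together, the sketch below derives an exact-match flag, a semantic score from text-embedding-3-small embeddings (via the official openai Node SDK), and the pass decision. The cosineSimilarity helper and the scoreOutputs wrapper are illustrative names, not part of this API.
import OpenAI from 'openai'

const openai = new OpenAI()

// Cosine similarity between two embedding vectors.
function cosineSimilarity(a, b) {
  let dot = 0, normA = 0, normB = 0
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i]
    normA += a[i] * a[i]
    normB += b[i] * b[i]
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB))
}

// Illustrative scoring: exact match, semantic score, and the pass rule.
async function scoreOutputs(actual, expected) {
  // Exact match: string equality after trimming whitespace.
  const exactMatch = actual.trim() === expected.trim()

  // Semantic score: cosine similarity of text-embedding-3-small embeddings.
  const { data } = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: [actual, expected]
  })
  const semanticScore = cosineSimilarity(data[0].embedding, data[1].embedding)

  // Pass if outputs match exactly OR semantic similarity is >= 0.8.
  const passed = exactMatch || semanticScore >= 0.8
  return { exactMatch, semanticScore, passed }
}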
Regression Detection
The system automatically detects regressions by comparing each test run against historical performance. Three types of regressions are tracked:
Regression Types
| Regression Type | Description |
|---|---|
| FAILED | A test that previously passed now fails (score < 0.8) |
| SCORE_DROP | The semantic score dropped by more than 10% compared to the recent average |
| LENGTH_CHANGE | The output length changed significantly (>30%) compared to the recent average |
Regression Detection Logic
// Test fails when it previously passed
if (!currentPassed && historicalPassRate > 0.8) {
regressionType = 'FAILED'
isRegression = true
}
// Score drops significantly
if (currentScore < recentAvgScore * 0.9) {
regressionType = 'SCORE_DROP'
isRegression = true
}
// Output length changes dramatically
const lengthChange = Math.abs(currentLength - recentAvgLength) / recentAvgLength
if (lengthChange > 0.3) {
regressionType = 'LENGTH_CHANGE'
isRegression = true
}
Testing Workflow
1. Create Test Cases
Start by creating test cases that represent important use cases for your prompt.
// Create multiple test cases for different scenarios
const testCases = [
{
name: "Positive sentiment",
input: { text: "I love this!" },
expectedOutput: "positive"
},
{
name: "Negative sentiment",
input: { text: "This is terrible." },
expectedOutput: "negative"
},
{
name: "Neutral sentiment",
input: { text: "It's okay." },
expectedOutput: "neutral"
}
]
for (const tc of testCases) {
await createTestCase({ promptId, ...tc })
}
2. Run Tests After Changes
After updating a prompt, run all test cases to check for regressions.
// Batch run all tests for a prompt
const results = await batchRunTests({
promptId: "cm123456789",
testCaseIds: ["test1", "test2", "test3"]
})
// Check for failures or regressions
const failures = results.filter(r => !r.passed)
const regressions = results.filter(r => r.isRegression)
if (failures.length > 0) {
console.log('Failed tests:', failures.length)
}
if (regressions.length > 0) {
console.log('Regressions detected:', regressions)
}
3. Analyze Results
Use test case statistics to track quality trends over time.
const stats = await getTestCaseStats({ testCaseId })
console.log('Pass rate:', stats.passRate + '%')
console.log('Average score:', stats.avgScore)
console.log('Recent regressions:', stats.regressions.length)
// Analyze trend data
stats.trendData.forEach(point => {
console.log(`${point.date}: ${point.passRate}% pass rate`)
})
4. Update Baselines
When new behavior is intentional, update the expected output to establish a new baseline.
// Accept new output as the expected baseline
await updateExpectedOutput({
testCaseId: "test123",
output: newExpectedOutput
})
Best Practices
A/B Testing
A/B testing enables you to compare different model configurations (branches) against the same test case to find the optimal setup for your use case. Test branches run in parallel and are automatically ranked based on quality, speed, cost, or a balanced score.
Branch Schema
TestBranch Type
| Parameter | Type | Description |
|---|---|---|
| id (required) | ID | Unique branch identifier |
| testCaseId (required) | ID | Parent test case ID |
| name (required) | string | Branch name (e.g., "GPT-4 High Temp") |
| description | string | Optional branch description |
| provider (required) | string | LLM provider: openai, anthropic, or gemini |
| model (required) | string | Model identifier |
| temperature | number | Temperature (0-2, default: 0.7) |
| maxTokens | number | Max output tokens (default: 2048) |
| topP | number | Top-p sampling (0-1, default: 1.0) |
| isControl | boolean | Whether this is the control group |
| isActive | boolean | Whether the branch is active for testing |
| totalRuns | number | Total executions for this branch |
| avgLatency | number | Average response time (ms) |
| avgCost | number | Average cost per execution |
| avgQualityScore | number | Average quality score (0-1) |
| avgTokensPerSec | number | Average throughput |
| consistencyScore | number | Output consistency metric (0-1) |
| passRate | number | Percentage passing the quality threshold |
TestBranchExecution Type
| Parameter | Type | Description |
|---|---|---|
| id (required) | ID | Execution record ID |
| branchId (required) | ID | Branch that was executed |
| executionId | ID | Reference to the prompt execution |
| output (required) | string | Generated output |
| error | string | Error message if the execution failed |
| status (required) | string | success or error |
| latencyMs (required) | number | Response time in milliseconds |
| timeToFirstToken | number | TTFT in milliseconds |
| tokensPerSecond | number | Throughput metric |
| inputTokens (required) | number | Input token count |
| outputTokens (required) | number | Output token count |
| totalTokens (required) | number | Total token count |
| estimatedCost (required) | number | Estimated cost in USD |
| qualityScore (required) | number | Quality score (0-1) |
| similarityScore | number | Semantic similarity score |
| clarityScore | number | Output readability score |
| validJson | boolean | Whether the output is valid JSON |
| consistencyHash | string | Hash for consistency tracking |
| promptVersion (required) | number | Prompt version used |
A/B Testing Queries
Get Test Branches
List all branches for a test case with their performance metrics.
query GetTestBranches($testCaseId: ID!) {
testBranches(testCaseId: $testCaseId) {
id
name
description
provider
model
temperature
maxTokens
topP
isControl
isActive
totalRuns
avgLatency
avgCost
avgQualityScore
avgTokensPerSec
consistencyScore
passRate
createdAt
}
}
Get Branch Execution History
Retrieve detailed execution history for a specific branch.
query GetBranchExecutions($branchId: ID!, $limit: Int, $offset: Int) {
testBranchExecutions(branchId: $branchId, limit: $limit, offset: $offset) {
id
output
status
latencyMs
timeToFirstToken
tokensPerSecond
inputTokens
outputTokens
estimatedCost
qualityScore
similarityScore
clarityScore
validJson
promptVersion
createdAt
}
}
A/B Testing Mutations
Create Test Branch
Create a new branch with a specific model configuration. A test case can have a maximum of 5 branches.
mutation CreateTestBranch($input: TestBranchInput!) {
createTestBranch(input: $input) {
id
name
model
temperature
}
}
# Variables
{
"input": {
"testCaseId": "tc_123",
"name": "GPT-4 High Temperature",
"description": "Testing GPT-4 with higher temperature for creativity",
"provider": "openai",
"model": "gpt-4-turbo-preview",
"temperature": 0.9,
"maxTokens": 1000,
"topP": 0.95,
"isControl": false
}
}
| Parameter | Type | Description |
|---|---|---|
| testCaseId (required) | ID | Test case ID |
| name (required) | string | Branch name |
| description | string | Optional description |
| provider (required) | string | openai, anthropic, or gemini |
| model (required) | string | Model identifier |
| temperature | number | Temperature (0-2) |
| maxTokens | number | Max output tokens |
| topP | number | Top-p sampling (0-1) |
| isControl | boolean | Mark as control group |
| otherParams | JSON | Additional provider-specific params |
Update Test Branch
mutation UpdateTestBranch($id: ID!, $input: TestBranchInput!) {
updateTestBranch(id: $id, input: $input) {
id
name
temperature
maxTokens
}
}
Delete Test Branch
mutation DeleteTestBranch($id: ID!) {
deleteTestBranch(id: $id)
}
Run A/B Test
Execute an A/B test across multiple branches and automatically determine the winner based on your criteria.
mutation RunABTest($input: RunABTestInput!) {
runABTest(input: $input) {
branches {
branchId
branchName
model
execution {
id
output
status
}
qualityScore
latencyMs
estimatedCost
tokensPerSecond
rank
}
winner {
branchId
branchName
model
qualityScore
latencyMs
estimatedCost
}
criteria
}
}
# Variables
{
"input": {
"testCaseId": "tc_123",
"branchIds": ["branch_1", "branch_2", "branch_3"],
"promptVersion": 2,
"winnerCriteria": "balanced"
}
}
| Parameter | Type | Description |
|---|---|---|
| testCaseId (required) | ID | Test case to run |
| branchIds (required) | [ID!]! | Array of branch IDs (2-3 recommended) |
| promptVersion | Int | Specific prompt version (defaults to latest) |
| winnerCriteria | string | Winner selection: best_quality, fastest, cheapest, best_value, or balanced (default: best_quality) |
Winner Criteria
The A/B test winner is determined automatically based on the selected criteria:
best_quality (Default)
Selects the branch with the highest quality score. Best for prioritizing output accuracy.
fastest
Selects the branch with the lowest latency. Best for real-time applications.
cheapest
Selects the branch with the lowest estimated cost. Best for cost optimization.
best_value
Selects the branch with the best quality-to-cost ratio, calculated as the quality score divided by estimated cost.
balanced
Composite score: 40% quality, 30% speed, 30% cost. Best for balanced performance across all metrics.
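The API computes these rankings for you, and the exact normalization it uses is not documented here. Conceptually, though, a balanced ranking can be approximated as in the sketch below, which scores speed and cost relative to the best branch in the comparison and applies the 40/30/30 weights (rankBalanced and abTestBranches are illustrative names, not part of this API; non-zero latency and cost are assumed).
// Illustrative only: rank branch results by a 40% quality / 30% speed / 30% cost blend.
// `results` is assumed to be an array of { branchName, qualityScore, latencyMs, estimatedCost }.
function rankBalanced(results) {
  const bestLatency = Math.min(...results.map(r => r.latencyMs))
  const bestCost = Math.min(...results.map(r => r.estimatedCost))
  return results
    .map(r => ({
      ...r,
      // Quality is already 0-1; speed and cost are scored relative to the best branch.
      balancedScore:
        0.4 * r.qualityScore +
        0.3 * (bestLatency / r.latencyMs) +
        0.3 * (bestCost / r.estimatedCost)
    }))
    .sort((a, b) => b.balancedScore - a.balancedScore)
}

const ranked = rankBalanced(abTestBranches) // abTestBranches: branch metrics from runABTest
console.log('Winner by balanced score:', ranked[0].branchName)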
A/B Testing Metrics
Each branch execution tracks comprehensive metrics for comparison; a sketch of deriving the aggregated metrics client-side follows the lists below.
Per-Execution Metrics
- Quality Score (0-1): Similarity to expected output using semantic comparison
- Latency (ms): Total response time from request to completion
- Cost (USD): Estimated cost based on token usage and model pricing
- Tokens/Second: Throughput metric for real-time performance
- Clarity Score (0-1): Output readability based on sentence structure
- Consistency Hash: For detecting output variations across runs
- Valid JSON: Whether output is parseable JSON (useful for structured outputs)
Aggregated Branch Metrics
- Total Runs: Number of executions for this branch
- Average Metrics: Mean quality, latency, cost, and throughput
- Pass Rate: Percentage passing quality threshold (70%+)
- Consistency Score: Variance in outputs (lower = more consistent)
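The aggregates are returned on the TestBranch type, but when you need a custom window (for example, only the most recent runs) most of them can also be derived client-side from testBranchExecutions. A rough sketch, using the TestBranchExecution fields documented above and the 70% quality threshold from the pass-rate definition (the helper name is illustrative):
// Illustrative helper: derive aggregate metrics from TestBranchExecution records.
function aggregateBranchMetrics(executions) {
  const ok = executions.filter(e => e.status === 'success')
  const avg = values => values.reduce((sum, v) => sum + v, 0) / (values.length || 1)

  return {
    totalRuns: executions.length,
    avgLatency: avg(ok.map(e => e.latencyMs)),
    avgCost: avg(ok.map(e => e.estimatedCost)),
    avgQualityScore: avg(ok.map(e => e.qualityScore)),
    avgTokensPerSec: avg(ok.map(e => e.tokensPerSecond ?? 0)),
    // Pass rate: percentage of successful runs at or above the 0.7 quality threshold.
    passRate: 100 * ok.filter(e => e.qualityScore >= 0.7).length / (ok.length || 1)
  }
}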
A/B Testing Workflow
Complete A/B Test Example
import { gql } from 'graphql-request'
// 1. Create a test case
const CREATE_TEST = gql`
mutation CreateTestCase($input: TestCaseInput!) {
createTestCase(input: $input) {
id
}
}
`
const testCase = await client.request(CREATE_TEST, {
input: {
promptId: "prompt_123",
name: "Sentiment Analysis Test",
input: {
text: "This product exceeded my expectations!"
},
expectedOutput: "positive"
}
})
// 2. Create test branches with different configurations
const CREATE_BRANCH = gql`
mutation CreateBranch($input: TestBranchInput!) {
createTestBranch(input: $input) {
id
}
}
`
const branches = await Promise.all([
// GPT-4 Turbo
client.request(CREATE_BRANCH, {
input: {
testCaseId: testCase.createTestCase.id,
name: "GPT-4 Turbo",
provider: "openai",
model: "gpt-4-turbo-preview",
temperature: 0.7
}
}),
// Claude Sonnet
client.request(CREATE_BRANCH, {
input: {
testCaseId: testCase.createTestCase.id,
name: "Claude Sonnet",
provider: "anthropic",
model: "claude-3-sonnet-20240229",
temperature: 0.7
}
}),
// Gemini Pro
client.request(CREATE_BRANCH, {
input: {
testCaseId: testCase.createTestCase.id,
name: "Gemini Pro",
provider: "gemini",
model: "gemini-pro",
temperature: 0.7
}
})
])
// 3. Run A/B test
const RUN_AB_TEST = gql`
mutation RunABTest($input: RunABTestInput!) {
runABTest(input: $input) {
winner {
branchId
branchName
model
qualityScore
latencyMs
estimatedCost
}
branches {
branchName
model
qualityScore
latencyMs
estimatedCost
rank
}
criteria
}
}
`
const result = await client.request(RUN_AB_TEST, {
input: {
testCaseId: testCase.createTestCase.id,
branchIds: branches.map(b => b.createTestBranch.id),
winnerCriteria: "balanced"
}
})
// 4. Analyze results
console.log('Winner:', result.runABTest.winner.branchName)
console.log(' Model:', result.runABTest.winner.model)
console.log(' Quality:', result.runABTest.winner.qualityScore)
console.log(' Latency:', result.runABTest.winner.latencyMs + 'ms')
console.log(' Cost: $' + result.runABTest.winner.estimatedCost)
console.log('\nAll Results:')
result.runABTest.branches.forEach(branch => {
console.log(`${branch.rank}. ${branch.branchName}`)
console.log(` Quality: ${branch.qualityScore}`)
console.log(` Speed: ${branch.latencyMs}ms`)
console.log(` Cost: $${branch.estimatedCost}`)
})
// 5. Get detailed execution history
const GET_EXECUTIONS = gql`
query GetBranchExecutions($branchId: ID!, $limit: Int) {
testBranchExecutions(branchId: $branchId, limit: $limit) {
output
qualityScore
latencyMs
estimatedCost
createdAt
}
}
`
const winnerId = result.runABTest.winner.branchId
const executions = await client.request(GET_EXECUTIONS, {
branchId: winnerId,
limit: 10
})
console.log('\nRecent executions for winner:')
executions.testBranchExecutions.forEach((exec, i) => {
console.log(`${i + 1}. Quality: ${exec.qualityScore}, Latency: ${exec.latencyMs}ms`)
})
Running Multiple Test Iterations
For statistically significant results, run A/B tests multiple times and analyze aggregate metrics.
// Run the same A/B test 5 times
const iterations = 5
const results = []
for (let i = 0; i < iterations; i++) {
const result = await client.request(RUN_AB_TEST, {
input: {
testCaseId: testCase.createTestCase.id,
branchIds: branches.map(b => b.createTestBranch.id),
winnerCriteria: "balanced"
}
})
results.push(result.runABTest)
// Small delay between iterations
await new Promise(resolve => setTimeout(resolve, 1000))
}
// Aggregate results
const winnerCounts = {}
results.forEach(r => {
const winner = r.winner.branchName
winnerCounts[winner] = (winnerCounts[winner] || 0) + 1
})
console.log('Winner distribution across', iterations, 'runs:')
Object.entries(winnerCounts).forEach(([name, count]) => {
console.log(` ${name}: ${count}/${iterations} (${(count/iterations*100).toFixed(1)}%)`)
})
// Calculate average metrics per branch
const branchStats = {}
results.forEach(result => {
result.branches.forEach(branch => {
if (!branchStats[branch.branchName]) {
branchStats[branch.branchName] = {
quality: [],
latency: [],
cost: []
}
}
branchStats[branch.branchName].quality.push(branch.qualityScore)
branchStats[branch.branchName].latency.push(branch.latencyMs)
branchStats[branch.branchName].cost.push(branch.estimatedCost)
})
})
console.log('\nAverage metrics across all runs:')
Object.entries(branchStats).forEach(([name, stats]) => {
const avgQuality = stats.quality.reduce((a, b) => a + b) / stats.quality.length
const avgLatency = stats.latency.reduce((a, b) => a + b) / stats.latency.length
const avgCost = stats.cost.reduce((a, b) => a + b) / stats.cost.length
console.log(`${name}:`)
console.log(` Avg Quality: ${avgQuality.toFixed(3)}`)
console.log(` Avg Latency: ${avgLatency.toFixed(0)}ms`)
console.log(` Avg Cost: $${avgCost.toFixed(5)}`)
})
A/B Testing Best Practices
Example: Complete Testing Flow
import { gql } from 'graphql-request'
// 1. Create test cases for a sentiment analysis prompt
const CREATE_TEST = gql`
mutation CreateTestCase($input: TestCaseInput!) {
createTestCase(input: $input) {
id
name
}
}
`
const testCases = [
{
promptId: "cm123",
name: "Strongly positive",
input: { text: "This is absolutely amazing! Best ever!" },
expectedOutput: "positive"
},
{
promptId: "cm123",
name: "Subtly negative",
input: { text: "I guess it's okay, but I expected better." },
expectedOutput: "negative"
}
]
const createdTests = await Promise.all(
testCases.map(tc => client.request(CREATE_TEST, { input: tc }))
)
// 2. Make changes to the prompt
// ... update prompt content ...
// 3. Run all tests against the new version
const BATCH_RUN = gql`
mutation BatchRun($input: BatchRunTestsInput!) {
batchRunTests(input: $input) {
testCaseExecutionId
passed
score
isRegression
regressionType
actualOutput
}
}
`
const results = await client.request(BATCH_RUN, {
input: {
promptId: "cm123",
testCaseIds: createdTests.map(t => t.createTestCase.id)
}
})
// 4. Analyze results
const summary = {
total: results.batchRunTests.length,
passed: results.batchRunTests.filter(r => r.passed).length,
regressions: results.batchRunTests.filter(r => r.isRegression)
}
console.log(`Test Results: ${summary.passed}/${summary.total} passed`)
if (summary.regressions.length > 0) {
console.log('Regressions detected:')
summary.regressions.forEach(r => {
console.log(` - Type: ${r.regressionType}, Score: ${r.score}`)
})
}
// 5. Get detailed stats for monitoring
const STATS = gql`
query GetStats($testCaseId: ID!) {
testCaseStats(testCaseId: $testCaseId) {
totalRuns
passRate
avgScore
trendData {
date
passRate
avgScore
}
}
}
`
for (const test of createdTests) {
const stats = await client.request(STATS, {
testCaseId: test.createTestCase.id
})
console.log(`${test.createTestCase.name}:`)
console.log(` Pass rate: ${stats.testCaseStats.passRate}%`)
console.log(` Avg score: ${stats.testCaseStats.avgScore}`)
}