Testing API

Comprehensive testing framework for prompts with regression detection, semantic scoring, and performance tracking.

Overview

The Testing API provides a robust framework for validating prompt behavior, detecting regressions, and tracking quality metrics over time. Test cases can be run individually or in batches, with automatic scoring and detailed comparison analytics.

Test cases are scoped to individual prompts and automatically track version changes, making it easy to identify when prompt updates cause regressions.
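The code samples throughout this page assume a graphql-request client. A minimal setup sketch is shown below; the endpoint URL and auth header are placeholders, not the literal values for your deployment.

import { GraphQLClient } from 'graphql-request'

// Placeholder endpoint and API key -- substitute the values for your deployment
const client = new GraphQLClient('https://your-workspace.example.com/api/graphql', {
  headers: { Authorization: `Bearer ${process.env.API_KEY}` }
})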

Test Case Schema

TestCase Type

Parameter | Type | Description
id (required) | ID | Unique identifier for the test case
promptId (required) | ID | ID of the prompt being tested
name (required) | string | Descriptive name for the test case
description | string | Optional detailed description
input (required) | JSON | Input data for the test execution
expectedOutput | string | Expected output for comparison
totalRuns | number | Total number of times this test has been run
passRate | number | Percentage of passing test runs (0-100)
avgScore | number | Average similarity score across all runs
lastRun | DateTime | Timestamp of the most recent execution
lastPassed | boolean | Whether the most recent run passed
createdAt | DateTime | When the test case was created
updatedAt | DateTime | When the test case was last modified

TestCaseExecution Type

Parameter | Type | Description
id (required) | ID | Execution record ID
testCaseId (required) | ID | Reference to the test case
executionId (required) | ID | Reference to the prompt execution
passed (required) | boolean | Whether the test passed (score >= 0.8)
score | number | Overall similarity score (0-1)
actualOutput (required) | string | Actual output from the prompt
expectedOutput | string | Expected output for comparison
outputDiff | JSON | Character-level diff between outputs
exactMatch (required) | boolean | Whether outputs matched exactly
semanticScore | number | Semantic similarity score using embeddings (0-1)
lengthDiff | number | Difference in output length (characters)
tokenDiff | number | Difference in token count
isRegression (required) | boolean | Whether this represents a quality regression
regressionType | string | Type of regression: FAILED, SCORE_DROP, LENGTH_CHANGE
promptVersion (required) | number | Prompt version used for this test
runDuration | number | Test execution duration in milliseconds
createdAt | DateTime | When the test was executed

Queries

Get All Test Cases

Retrieve all test cases for a specific prompt.

query GetTestCases($promptId: ID!) {
  testCases(promptId: $promptId) {
    id
    name
    description
    input
    expectedOutput
    totalRuns
    passRate
    avgScore
    lastRun
    lastPassed
    createdAt
    updatedAt
  }
}
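As a quick usage sketch (assuming the query document above is stored in GET_TEST_CASES and the prompt ID is a placeholder):

const { testCases } = await client.request(GET_TEST_CASES, { promptId: 'cm123456789' })
testCases.forEach(tc => {
  console.log(`${tc.name}: ${tc.passRate}% pass rate over ${tc.totalRuns} runs`)
})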

Get Single Test Case

query GetTestCase($id: ID!) {
  testCase(id: $id) {
    id
    name
    description
    input
    expectedOutput
    totalRuns
    passRate
    avgScore
    prompt {
      id
      name
    }
  }
}

Get Test Case Executions

Retrieve execution history for a test case with pagination.

query GetTestCaseExecutions($testCaseId: ID!, $limit: Int, $offset: Int) {
  testCaseExecutions(testCaseId: $testCaseId, limit: $limit, offset: $offset) {
    id
    passed
    score
    actualOutput
    expectedOutput
    exactMatch
    semanticScore
    lengthDiff
    tokenDiff
    isRegression
    regressionType
    promptVersion
    runDuration
    createdAt
  }
}
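Execution history can grow large, so you will usually page through it with limit and offset. A simple paging loop might look like the following sketch (it assumes the query above is stored in GET_TEST_CASE_EXECUTIONS and that a page shorter than the limit means no more records remain):

// Page through a test case's execution history, 50 records at a time
const limit = 50
let offset = 0
const allExecutions = []

while (true) {
  const { testCaseExecutions } = await client.request(GET_TEST_CASE_EXECUTIONS, {
    testCaseId: 'cm123456789',
    limit,
    offset
  })
  allExecutions.push(...testCaseExecutions)
  if (testCaseExecutions.length < limit) break // last page reached
  offset += limit
}

console.log(`Fetched ${allExecutions.length} executions`)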

Get Test Case Statistics

Get detailed statistics, trends, and regression data for a test case.

query GetTestCaseStats($testCaseId: ID!) {
  testCaseStats(testCaseId: $testCaseId) {
    totalRuns
    passRate
    avgScore
    recentRuns {
      id
      passed
      score
      promptVersion
      createdAt
    }
    regressions {
      id
      passed
      score
      regressionType
      promptVersion
      createdAt
    }
    trendData {
      date
      totalRuns
      passRate
      avgScore
    }
  }
}

Mutations

Create Test Case

Create a new test case for a prompt.

mutation CreateTestCase($input: TestCaseInput!) {
  createTestCase(input: $input) {
    id
    name
    description
    input
    expectedOutput
    createdAt
  }
}

# Variables
{
  "input": {
    "promptId": "cm123456789",
    "name": "Positive sentiment detection",
    "description": "Tests detection of positive sentiment in customer feedback",
    "input": {
      "text": "I absolutely love this product! Best purchase ever."
    },
    "expectedOutput": "positive"
  }
}

Parameter | Type | Description
promptId (required) | ID | ID of the prompt to test
name (required) | string | Test case name
description | string | Optional description
input (required) | JSON | Input data matching the prompt schema
expectedOutput | string | Expected output for comparison

Update Test Case

mutation UpdateTestCase($id: ID!, $input: TestCaseInput!) {
  updateTestCase(id: $id, input: $input) {
    id
    name
    description
    input
    expectedOutput
    updatedAt
  }
}
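Example variables (IDs and values are illustrative; since the mutation takes a full TestCaseInput, the required fields are included again):

# Variables
{
  "id": "test123",
  "input": {
    "promptId": "cm123456789",
    "name": "Positive sentiment detection (v2)",
    "input": {
      "text": "I absolutely love this product! Best purchase ever."
    },
    "expectedOutput": "positive"
  }
}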

Delete Test Case

mutation DeleteTestCase($id: ID!) {
  deleteTestCase(id: $id)
}

Run Test Case

Execute a test case against the latest (or specified) prompt version.

mutation RunTestCase($input: RunTestCaseInput!) {
  runTestCase(input: $input) {
    testCaseExecutionId
    executionId
    passed
    score
    actualOutput
    expectedOutput
    outputDiff
    exactMatch
    semanticScore
    lengthDiff
    tokenDiff
    latencyMs
    costUsd
    isRegression
    regressionType
  }
}

# Variables
{
  "input": {
    "testCaseId": "cm123456789",
    "promptVersion": 3  // Optional: defaults to latest
  }
}
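A minimal sketch of running a single test and inspecting the result (assumes the mutation above is stored in RUN_TEST_CASE; the test case ID is a placeholder):

const { runTestCase } = await client.request(RUN_TEST_CASE, {
  input: { testCaseId: 'cm123456789' }
})

console.log(runTestCase.passed ? 'PASS' : 'FAIL', 'score:', runTestCase.score)
if (runTestCase.isRegression) {
  console.warn('Regression detected:', runTestCase.regressionType)
}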

Batch Run Tests

Run multiple test cases in a single operation. Useful for regression testing after prompt updates.

mutation BatchRunTests($input: BatchRunTestsInput!) {
  batchRunTests(input: $input) {
    testCaseExecutionId
    executionId
    passed
    score
    actualOutput
    isRegression
    regressionType
  }
}

# Variables
{
  "input": {
    "promptId": "cm123456789",
    "testCaseIds": [
      "test1",
      "test2",
      "test3"
    ],
    "promptVersion": 4  // Optional
  }
}

Update Expected Output

Update the expected output for a test case. Useful when accepting new behavior as the baseline.

mutation UpdateExpectedOutput($testCaseId: ID!, $output: String!) {
  updateExpectedOutput(testCaseId: $testCaseId, output: $output) {
    id
    expectedOutput
    updatedAt
  }
}
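Example variables (the ID and output are placeholders):

# Variables
{
  "testCaseId": "test123",
  "output": "positive"
}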

Test Scoring System

Test cases are evaluated using multiple comparison methods to provide a comprehensive quality assessment.

Scoring Components

Exact Match (Boolean)

Simple string equality check. True if outputs match exactly after trimming whitespace.

Semantic Score (0-1)

Cosine similarity between output embeddings. Measures semantic meaning rather than exact wording. Calculated using text-embedding-3-small model.
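Conceptually, the score is the cosine of the angle between the two output embedding vectors. A minimal sketch of that computation is shown below (the embedding step itself is handled by the platform):

// Cosine similarity between two embedding vectors of equal length
function cosineSimilarity(a, b) {
  let dot = 0, normA = 0, normB = 0
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i]
    normA += a[i] * a[i]
    normB += b[i] * b[i]
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB))
}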

Length Difference

Character count difference between actual and expected outputs. Helps identify verbose or terse responses.

Token Difference

Token count difference. Important for cost and context window tracking.

Pass/Fail Criteria

A test passes if either:

  1. Exact match is true, OR
  2. Semantic score is >= 0.8 (80% similarity)

The semantic scoring approach allows for natural language variation while still catching significant deviations in meaning or intent.
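In other words, the pass decision reduces to a check along these lines:

// A run passes on an exact string match or sufficient semantic similarity
const PASS_THRESHOLD = 0.8
const passed = exactMatch || semanticScore >= PASS_THRESHOLD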

Regression Detection

The system automatically detects regressions by comparing each test run against historical performance. Three types of regressions are tracked:

Regression Types

Type | Description
FAILED | Test that previously passed now fails (score < 0.8)
SCORE_DROP | Semantic score dropped by more than 10% compared to the recent average
LENGTH_CHANGE | Output length changed significantly (>30%) compared to the recent average

Regression Detection Logic

// Baseline values (historicalPassRate, recentAvgScore, recentAvgLength)
// are computed from the most recent successful runs of this test case.
let isRegression = false
let regressionType = null

// Test fails when it previously passed
if (!currentPassed && historicalPassRate > 0.8) {
  regressionType = 'FAILED'
  isRegression = true
}

// Score drops significantly
if (currentScore < recentAvgScore * 0.9) {
  regressionType = 'SCORE_DROP'
  isRegression = true
}

// Output length changes dramatically
const lengthChange = Math.abs(currentLength - recentAvgLength) / recentAvgLength
if (lengthChange > 0.3) {
  regressionType = 'LENGTH_CHANGE'
  isRegression = true
}

Regressions are calculated based on the last 5 successful test runs. New test cases with fewer than 2 historical runs won't trigger regression detection.

Testing Workflow

1. Create Test Cases

Start by creating test cases that represent important use cases for your prompt.

// Create multiple test cases for different scenarios
const testCases = [
  {
    name: "Positive sentiment",
    input: { text: "I love this!" },
    expectedOutput: "positive"
  },
  {
    name: "Negative sentiment",
    input: { text: "This is terrible." },
    expectedOutput: "negative"
  },
  {
    name: "Neutral sentiment",
    input: { text: "It's okay." },
    expectedOutput: "neutral"
  }
]

for (const tc of testCases) {
  await createTestCase({ promptId, ...tc })
}

2. Run Tests After Changes

After updating a prompt, run all test cases to check for regressions.

// Batch run all tests for a prompt
const results = await batchRunTests({
  promptId: "cm123456789",
  testCaseIds: ["test1", "test2", "test3"]
})

// Check for failures or regressions
const failures = results.filter(r => !r.passed)
const regressions = results.filter(r => r.isRegression)

if (failures.length > 0) {
  console.log('Failed tests:', failures.length)
}

if (regressions.length > 0) {
  console.log('Regressions detected:', regressions)
}

3. Analyze Results

Use test case statistics to track quality trends over time.

const stats = await getTestCaseStats({ testCaseId })

console.log('Pass rate:', stats.passRate + '%')
console.log('Average score:', stats.avgScore)
console.log('Recent regressions:', stats.regressions.length)

// Analyze trend data
stats.trendData.forEach(point => {
  console.log(`${point.date}: ${point.passRate}% pass rate`)
})

4. Update Baselines

When new behavior is intentional, update the expected output to establish a new baseline.

// Accept new output as the expected baseline
await updateExpectedOutput({
  testCaseId: "test123",
  output: newExpectedOutput
})

Best Practices

  • Test Edge Cases - Create test cases for boundary conditions, unusual inputs, and error scenarios, not just happy paths.
  • Maintain Test Coverage - Aim for test cases covering all major prompt behaviors. Add new tests whenever you fix a bug or add functionality.
  • Run Tests Before Deployment - Always run the full test suite before deploying prompt changes to production. Check both pass rates and regression flags.
  • Monitor Trends - Use trend data to identify gradual quality degradation over time, even if individual tests still pass.
  • Review Regressions Carefully - Not all regressions are bad. LENGTH_CHANGE might indicate more concise responses. Always review the actual output.
  • Version Testing - You can run tests against specific prompt versions to compare behavior across updates. Useful for A/B testing.

A/B Testing

A/B testing enables you to compare different model configurations (branches) against the same test case to find the optimal setup for your use case. Test branches run in parallel and are automatically ranked based on quality, speed, cost, or a balanced score.

Each test case can have up to 5 branches. Branches are independent configurations that can use different models, providers, or parameters while testing against the same input.

Branch Schema

TestBranch Type

Parameter | Type | Description
id (required) | ID | Unique branch identifier
testCaseId (required) | ID | Parent test case ID
name (required) | string | Branch name (e.g., "GPT-4 High Temp")
description | string | Optional branch description
provider (required) | string | LLM provider: openai, anthropic, gemini
model (required) | string | Model identifier
temperature | number | Temperature (0-2, default: 0.7)
maxTokens | number | Max output tokens (default: 2048)
topP | number | Top-p sampling (0-1, default: 1.0)
isControl | boolean | Whether this is the control group
isActive | boolean | Whether the branch is active for testing
totalRuns | number | Total executions for this branch
avgLatency | number | Average response time (ms)
avgCost | number | Average cost per execution
avgQualityScore | number | Average quality score (0-1)
avgTokensPerSec | number | Average throughput
consistencyScore | number | Output consistency metric (0-1)
passRate | number | Percentage of runs passing the quality threshold

TestBranchExecution Type

Parameter | Type | Description
id (required) | ID | Execution record ID
branchId (required) | ID | Branch that was executed
executionId | ID | Reference to the prompt execution
output (required) | string | Generated output
error | string | Error message if the execution failed
status (required) | string | success or error
latencyMs (required) | number | Response time in milliseconds
timeToFirstToken | number | TTFT in milliseconds
tokensPerSecond | number | Throughput metric
inputTokens (required) | number | Input token count
outputTokens (required) | number | Output token count
totalTokens (required) | number | Total token count
estimatedCost (required) | number | Estimated cost in USD
qualityScore (required) | number | Quality score (0-1)
similarityScore | number | Semantic similarity score
clarityScore | number | Output readability score
validJson | boolean | Whether the output is valid JSON
consistencyHash | string | Hash for consistency tracking
promptVersion (required) | number | Prompt version used

A/B Testing Queries

Get Test Branches

List all branches for a test case with their performance metrics.

query GetTestBranches($testCaseId: ID!) {
  testBranches(testCaseId: $testCaseId) {
    id
    name
    description
    provider
    model
    temperature
    maxTokens
    topP
    isControl
    isActive
    totalRuns
    avgLatency
    avgCost
    avgQualityScore
    avgTokensPerSec
    consistencyScore
    passRate
    createdAt
  }
}
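The aggregated metrics make it easy to compare branches client-side, for example ranking active branches by average quality (assumes the query above is stored in GET_TEST_BRANCHES; the test case ID is a placeholder):

const { testBranches } = await client.request(GET_TEST_BRANCHES, { testCaseId: 'tc_123' })

testBranches
  .filter(b => b.isActive)
  .sort((a, b) => b.avgQualityScore - a.avgQualityScore)
  .forEach(b => {
    console.log(`${b.name} (${b.model}): quality ${b.avgQualityScore}, $${b.avgCost}/run`)
  })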

Get Branch Execution History

Retrieve detailed execution history for a specific branch.

query GetBranchExecutions($branchId: ID!, $limit: Int, $offset: Int) {
  testBranchExecutions(branchId: $branchId, limit: $limit, offset: $offset) {
    id
    output
    status
    latencyMs
    timeToFirstToken
    tokensPerSecond
    inputTokens
    outputTokens
    estimatedCost
    qualityScore
    similarityScore
    clarityScore
    validJson
    promptVersion
    createdAt
  }
}

A/B Testing Mutations

Create Test Branch

Create a new branch with specific model configuration. Maximum 5 branches per test case.

mutation CreateTestBranch($input: TestBranchInput!) {
  createTestBranch(input: $input) {
    id
    name
    model
    temperature
  }
}

# Variables
{
  "input": {
    "testCaseId": "tc_123",
    "name": "GPT-4 High Temperature",
    "description": "Testing GPT-4 with higher temperature for creativity",
    "provider": "openai",
    "model": "gpt-4-turbo-preview",
    "temperature": 0.9,
    "maxTokens": 1000,
    "topP": 0.95,
    "isControl": false
  }
}

Parameter | Type | Description
testCaseId (required) | ID | Test case ID
name (required) | string | Branch name
description | string | Optional description
provider (required) | string | openai, anthropic, or gemini
model (required) | string | Model identifier
temperature | number | Temperature (0-2)
maxTokens | number | Max output tokens
topP | number | Top-p sampling (0-1)
isControl | boolean | Mark as the control group
otherParams | JSON | Additional provider-specific params

Update Test Branch

mutation UpdateTestBranch($id: ID!, $input: TestBranchInput!) {
  updateTestBranch(id: $id, input: $input) {
    id
    name
    temperature
    maxTokens
  }
}

Delete Test Branch

mutation DeleteTestBranch($id: ID!) {
  deleteTestBranch(id: $id)
}

Run A/B Test

Execute an A/B test across multiple branches and automatically determine the winner based on your criteria.

mutation RunABTest($input: RunABTestInput!) {
  runABTest(input: $input) {
    branches {
      branchId
      branchName
      model
      execution {
        id
        output
        status
      }
      qualityScore
      latencyMs
      estimatedCost
      tokensPerSecond
      rank
    }
    winner {
      branchId
      branchName
      model
      qualityScore
      latencyMs
      estimatedCost
    }
    criteria
  }
}

# Variables
{
  "input": {
    "testCaseId": "tc_123",
    "branchIds": ["branch_1", "branch_2", "branch_3"],
    "promptVersion": 2,
    "winnerCriteria": "balanced"
  }
}

Parameter | Type | Description
testCaseId (required) | ID | Test case to run
branchIds (required) | [ID!]! | Array of branch IDs (2-3 recommended)
promptVersion | Int | Specific prompt version (defaults to latest)
winnerCriteria | string | Winner selection: best_quality, fastest, cheapest, best_value, balanced (default: best_quality)

Winner Criteria

The A/B test winner is determined automatically based on the selected criteria:

best_quality (Default)

Selects the branch with the highest quality score. Best for prioritizing output accuracy.

fastest

Selects the branch with the lowest latency. Best for real-time applications.

cheapest

Selects the branch with the lowest estimated cost. Best for cost optimization.

best_value

Selects the branch with the best quality-to-cost ratio. Calculates quality score divided by cost.

balanced

Composite score: 40% quality, 30% speed, 30% cost. Best for balanced performance across all metrics.
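As an illustration only (the exact normalization is internal to the platform), a balanced composite along these lines rewards quality while scoring each branch's speed and cost relative to the best branch in the run:

// Illustrative balanced score: 40% quality, 30% speed, 30% cost.
// Latency and cost are normalized against the lowest value in the run,
// so the fastest/cheapest branch gets a full speed/cost sub-score of 1.
function balancedScore(branch, branches) {
  const minLatency = Math.min(...branches.map(b => b.latencyMs))
  const minCost = Math.min(...branches.map(b => b.estimatedCost))
  const speedScore = minLatency / branch.latencyMs
  const costScore = minCost / branch.estimatedCost
  return 0.4 * branch.qualityScore + 0.3 * speedScore + 0.3 * costScore
}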

A/B Testing Metrics

Each branch execution tracks comprehensive metrics for comparison:

Per-Execution Metrics

  • Quality Score (0-1): Similarity to expected output using semantic comparison
  • Latency (ms): Total response time from request to completion
  • Cost (USD): Estimated cost based on token usage and model pricing
  • Tokens/Second: Throughput metric for real-time performance
  • Clarity Score (0-1): Output readability based on sentence structure
  • Consistency Hash: For detecting output variations across runs
  • Valid JSON: Whether output is parseable JSON (useful for structured outputs)

Aggregated Branch Metrics

  • Total Runs: Number of executions for this branch
  • Average Metrics: Mean quality, latency, cost, and throughput
  • Pass Rate: Percentage passing quality threshold (70%+)
  • Consistency Score: Based on the variance of outputs across runs (lower variance = more consistent)

Run A/B tests multiple times to get statistically significant results. Single runs can be affected by LLM non-determinism, even with temperature set to 0.

A/B Testing Workflow

Complete A/B Test Example

import { gql } from 'graphql-request'

// 1. Create a test case
const CREATE_TEST = gql`
  mutation CreateTestCase($input: TestCaseInput!) {
    createTestCase(input: $input) {
      id
    }
  }
`

const testCase = await client.request(CREATE_TEST, {
  input: {
    promptId: "prompt_123",
    name: "Sentiment Analysis Test",
    input: {
      text: "This product exceeded my expectations!"
    },
    expectedOutput: "positive"
  }
})

// 2. Create test branches with different configurations
const CREATE_BRANCH = gql`
  mutation CreateBranch($input: TestBranchInput!) {
    createTestBranch(input: $input) {
      id
    }
  }
`

const branches = await Promise.all([
  // GPT-4 Turbo
  client.request(CREATE_BRANCH, {
    input: {
      testCaseId: testCase.createTestCase.id,
      name: "GPT-4 Turbo",
      provider: "openai",
      model: "gpt-4-turbo-preview",
      temperature: 0.7
    }
  }),
  // Claude Sonnet
  client.request(CREATE_BRANCH, {
    input: {
      testCaseId: testCase.createTestCase.id,
      name: "Claude Sonnet",
      provider: "anthropic",
      model: "claude-3-sonnet-20240229",
      temperature: 0.7
    }
  }),
  // Gemini Pro
  client.request(CREATE_BRANCH, {
    input: {
      testCaseId: testCase.createTestCase.id,
      name: "Gemini Pro",
      provider: "gemini",
      model: "gemini-pro",
      temperature: 0.7
    }
  })
])

// 3. Run A/B test
const RUN_AB_TEST = gql`
  mutation RunABTest($input: RunABTestInput!) {
    runABTest(input: $input) {
      winner {
        branchId
        branchName
        model
        qualityScore
        latencyMs
        estimatedCost
      }
      branches {
        branchName
        model
        qualityScore
        latencyMs
        estimatedCost
        rank
      }
      criteria
    }
  }
`

const result = await client.request(RUN_AB_TEST, {
  input: {
    testCaseId: testCase.createTestCase.id,
    branchIds: branches.map(b => b.createTestBranch.id),
    winnerCriteria: "balanced"
  }
})

// 4. Analyze results
console.log('Winner:', result.runABTest.winner.branchName)
console.log('  Model:', result.runABTest.winner.model)
console.log('  Quality:', result.runABTest.winner.qualityScore)
console.log('  Latency:', result.runABTest.winner.latencyMs + 'ms')
console.log('  Cost: $' + result.runABTest.winner.estimatedCost)

console.log('\nAll Results:')
result.runABTest.branches.forEach(branch => {
  console.log(`${branch.rank}. ${branch.branchName}`)
  console.log(`   Quality: ${branch.qualityScore}`)
  console.log(`   Speed: ${branch.latencyMs}ms`)
  console.log(`   Cost: $${branch.estimatedCost}`)
})

// 5. Get detailed execution history
const GET_EXECUTIONS = gql`
  query GetBranchExecutions($branchId: ID!, $limit: Int) {
    testBranchExecutions(branchId: $branchId, limit: $limit) {
      output
      qualityScore
      latencyMs
      estimatedCost
      createdAt
    }
  }
`

// The winning branch's ID is returned in the A/B test result
const winnerId = result.runABTest.winner.branchId
const executions = await client.request(GET_EXECUTIONS, {
  branchId: winnerId,
  limit: 10
})

console.log('\nRecent executions for winner:')
executions.testBranchExecutions.forEach((exec, i) => {
  console.log(`${i + 1}. Quality: ${exec.qualityScore}, Latency: ${exec.latencyMs}ms`)
})

Running Multiple Test Iterations

For statistically significant results, run A/B tests multiple times and analyze aggregate metrics.

// Run the same A/B test 5 times
const iterations = 5
const results = []

for (let i = 0; i < iterations; i++) {
  const result = await client.request(RUN_AB_TEST, {
    input: {
      testCaseId: testCase.createTestCase.id,
      branchIds: branches.map(b => b.createTestBranch.id),
      winnerCriteria: "balanced"
    }
  })
  results.push(result.runABTest)

  // Small delay between iterations
  await new Promise(resolve => setTimeout(resolve, 1000))
}

// Aggregate results
const winnerCounts = {}
results.forEach(r => {
  const winner = r.winner.branchName
  winnerCounts[winner] = (winnerCounts[winner] || 0) + 1
})

console.log('Winner distribution across', iterations, 'runs:')
Object.entries(winnerCounts).forEach(([name, count]) => {
  console.log(`  ${name}: ${count}/${iterations} (${(count/iterations*100).toFixed(1)}%)`)
})

// Calculate average metrics per branch
const branchStats = {}
results.forEach(result => {
  result.branches.forEach(branch => {
    if (!branchStats[branch.branchName]) {
      branchStats[branch.branchName] = {
        quality: [],
        latency: [],
        cost: []
      }
    }
    branchStats[branch.branchName].quality.push(branch.qualityScore)
    branchStats[branch.branchName].latency.push(branch.latencyMs)
    branchStats[branch.branchName].cost.push(branch.estimatedCost)
  })
})

console.log('\nAverage metrics across all runs:')
Object.entries(branchStats).forEach(([name, stats]) => {
  const avgQuality = stats.quality.reduce((a, b) => a + b) / stats.quality.length
  const avgLatency = stats.latency.reduce((a, b) => a + b) / stats.latency.length
  const avgCost = stats.cost.reduce((a, b) => a + b) / stats.cost.length

  console.log(`${name}:`)
  console.log(`  Avg Quality: ${avgQuality.toFixed(3)}`)
  console.log(`  Avg Latency: ${avgLatency.toFixed(0)}ms`)
  console.log(`  Avg Cost: $${avgCost.toFixed(5)}`)
})

A/B Testing Best Practices

  • Test Representative Inputs - Use test cases that reflect real-world usage patterns. Edge cases and outliers can skew results.
  • Run Multiple Iterations - LLMs are non-deterministic. Run tests 3-5 times to get reliable comparisons, even with temperature=0.
  • Choose Appropriate Criteria - Use "fastest" for real-time apps, "cheapest" for high-volume use cases, "balanced" when you need good all-around performance.
  • Monitor Consistency - Check consistency scores to identify models that produce highly variable outputs. Lower consistency may indicate unpredictable behavior.
  • Consider All Metrics - The fastest model might have poor quality. The best quality might be prohibitively expensive. Review all metrics, not just the winner.
  • Test at Different Temperatures - Create branches with the same model but different temperatures to find the sweet spot between creativity and consistency.
  • Use Control Groups - Mark your current production configuration as the control (isControl: true) to easily identify and compare against your baseline.

Example: Complete Testing Flow

import { gql } from 'graphql-request'

// 1. Create test cases for a sentiment analysis prompt
const CREATE_TEST = gql`
  mutation CreateTestCase($input: TestCaseInput!) {
    createTestCase(input: $input) {
      id
      name
    }
  }
`

const testCases = [
  {
    promptId: "cm123",
    name: "Strongly positive",
    input: { text: "This is absolutely amazing! Best ever!" },
    expectedOutput: "positive"
  },
  {
    promptId: "cm123",
    name: "Subtly negative",
    input: { text: "I guess it's okay, but I expected better." },
    expectedOutput: "negative"
  }
]

const createdTests = await Promise.all(
  testCases.map(tc => client.request(CREATE_TEST, { input: tc }))
)

// 2. Make changes to the prompt
// ... update prompt content ...

// 3. Run all tests against the new version
const BATCH_RUN = gql`
  mutation BatchRun($input: BatchRunTestsInput!) {
    batchRunTests(input: $input) {
      testCaseExecutionId
      passed
      score
      isRegression
      regressionType
      actualOutput
    }
  }
`

const results = await client.request(BATCH_RUN, {
  input: {
    promptId: "cm123",
    testCaseIds: createdTests.map(t => t.createTestCase.id)
  }
})

// 4. Analyze results
const summary = {
  total: results.batchRunTests.length,
  passed: results.batchRunTests.filter(r => r.passed).length,
  regressions: results.batchRunTests.filter(r => r.isRegression)
}

console.log(`Test Results: ${summary.passed}/${summary.total} passed`)

if (summary.regressions.length > 0) {
  console.log('Regressions detected:')
  summary.regressions.forEach(r => {
    console.log(`  - Type: ${r.regressionType}, Score: ${r.score}`)
  })
}

// 5. Get detailed stats for monitoring
const STATS = gql`
  query GetStats($testCaseId: ID!) {
    testCaseStats(testCaseId: $testCaseId) {
      totalRuns
      passRate
      avgScore
      trendData {
        date
        passRate
        avgScore
      }
    }
  }
`

for (const test of createdTests) {
  const stats = await client.request(STATS, {
    testCaseId: test.createTestCase.id
  })
  console.log(`${test.createTestCase.name}:`)
  console.log(`  Pass rate: ${stats.testCaseStats.passRate}%`)
  console.log(`  Avg score: ${stats.testCaseStats.avgScore}`)
}

Next Steps

Prompts API

Learn how to create and manage prompts that tests validate

Prompts API →

Execution API

Understand prompt execution and how tests leverage executions

Execution API →