Testing API
Comprehensive testing framework for prompts with regression detection, semantic scoring, and performance tracking.
Overview
The Testing API provides a robust framework for validating prompt behavior, detecting regressions, and tracking quality metrics over time. Test cases can be run individually or in batches, with automatic scoring and detailed comparison analytics.
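For orientation, here is a minimal sketch of the basic flow — create a test case, run it, and read the result — assuming a configured graphql-request client (the client variable and IDs below are placeholders); the operations themselves are documented in the Queries and Mutations sections that follow.
import { gql } from 'graphql-request'

// Placeholder: `client` is assumed to be a configured GraphQLClient.
const CREATE_TEST_CASE = gql`
  mutation CreateTestCase($input: TestCaseInput!) {
    createTestCase(input: $input) { id }
  }
`
const RUN_TEST_CASE = gql`
  mutation RunTestCase($input: RunTestCaseInput!) {
    runTestCase(input: $input) { passed score semanticScore isRegression }
  }
`

// 1. Create a test case for an existing prompt (IDs are illustrative).
const { createTestCase } = await client.request(CREATE_TEST_CASE, {
  input: {
    promptId: "cm123456789",
    name: "Smoke test",
    input: { text: "I love this product!" },
    expectedOutput: "positive"
  }
})

// 2. Run it against the latest prompt version and inspect the result.
const { runTestCase } = await client.request(RUN_TEST_CASE, {
  input: { testCaseId: createTestCase.id }
})
console.log(runTestCase.passed, runTestCase.score)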
Test Case Schema
TestCase Type
| Parameter | Type | Description |
|---|---|---|
| id (required) | ID | Unique identifier for the test case |
| promptId (required) | ID | ID of the prompt being tested |
| name (required) | string | Descriptive name for the test case |
| description | string | Optional detailed description |
| input (required) | JSON | Input data for the test execution |
| expectedOutput | string | Expected output for comparison |
| totalRuns | number | Total number of times this test has been run |
| passRate | number | Percentage of passing test runs (0-100) |
| avgScore | number | Average similarity score across all runs |
| lastRun | DateTime | Timestamp of the most recent execution |
| lastPassed | boolean | Whether the most recent run passed |
| createdAt | DateTime | When the test case was created |
| updatedAt | DateTime | When the test case was last modified |
TestCaseExecution Type
| Parameter | Type | Description |
|---|---|---|
| id (required) | ID | Execution record ID |
| testCaseId (required) | ID | Reference to the test case |
| executionId (required) | ID | Reference to the prompt execution |
| passed (required) | boolean | Whether the test passed (score >= 0.8) |
| score | number | Overall similarity score (0-1) |
| actualOutput (required) | string | Actual output from the prompt |
| expectedOutput | string | Expected output for comparison |
| outputDiff | JSON | Character-level diff between outputs |
| exactMatch (required) | boolean | Whether outputs matched exactly |
| semanticScore | number | Semantic similarity score using embeddings (0-1) |
| lengthDiff | number | Difference in output length (characters) |
| tokenDiff | number | Difference in token count |
| isRegression (required) | boolean | Whether this run represents a quality regression |
| regressionType | string | Type of regression: FAILED, SCORE_DROP, or LENGTH_CHANGE |
| promptVersion (required) | number | Prompt version used for this test |
| runDuration | number | Test execution duration in milliseconds |
| createdAt | DateTime | When the test was executed |
Queries
Get All Test Cases
Retrieve all test cases for a specific prompt.
query GetTestCases($promptId: ID!) {
testCases(promptId: $promptId) {
id
name
description
input
expectedOutput
totalRuns
passRate
avgScore
lastRun
lastPassed
createdAt
updatedAt
}
}
Get Single Test Case
query GetTestCase($id: ID!) {
testCase(id: $id) {
id
name
description
input
expectedOutput
totalRuns
passRate
avgScore
prompt {
id
name
}
}
}
Get Test Case Executions
Retrieve execution history for a test case with pagination.
query GetTestCaseExecutions($testCaseId: ID!, $limit: Int, $offset: Int) {
testCaseExecutions(testCaseId: $testCaseId, limit: $limit, offset: $offset) {
id
passed
score
actualOutput
expectedOutput
exactMatch
semanticScore
lengthDiff
tokenDiff
isRegression
regressionType
promptVersion
runDuration
createdAt
}
}
Get Test Case Statistics
Get detailed statistics, trends, and regression data for a test case.
query GetTestCaseStats($testCaseId: ID!) {
testCaseStats(testCaseId: $testCaseId) {
totalRuns
passRate
avgScore
recentRuns {
id
passed
score
promptVersion
createdAt
}
regressions {
id
passed
score
regressionType
promptVersion
createdAt
}
trendData {
date
totalRuns
passRate
avgScore
}
}
}
Mutations
Create Test Case
Create a new test case for a prompt.
mutation CreateTestCase($input: TestCaseInput!) {
createTestCase(input: $input) {
id
name
description
input
expectedOutput
createdAt
}
}
# Variables
{
"input": {
"promptId": "cm123456789",
"name": "Positive sentiment detection",
"description": "Tests detection of positive sentiment in customer feedback",
"input": {
"text": "I absolutely love this product! Best purchase ever."
},
"expectedOutput": "positive"
}
}
| Parameter | Type | Description |
|---|---|---|
| promptId (required) | ID | ID of the prompt to test |
| name (required) | string | Test case name |
| description | string | Optional description |
| input (required) | JSON | Input data matching the prompt schema |
| expectedOutput | string | Expected output for comparison |
Update Test Case
mutation UpdateTestCase($id: ID!, $input: TestCaseInput!) {
updateTestCase(id: $id, input: $input) {
id
name
description
input
expectedOutput
updatedAt
}
}
Delete Test Case
mutation DeleteTestCase($id: ID!) {
deleteTestCase(id: $id)
}
Run Test Case
Execute a test case against the latest (or specified) prompt version.
mutation RunTestCase($input: RunTestCaseInput!) {
runTestCase(input: $input) {
testCaseExecutionId
executionId
passed
score
actualOutput
expectedOutput
outputDiff
exactMatch
semanticScore
lengthDiff
tokenDiff
latencyMs
costUsd
isRegression
regressionType
}
}
# Variables
{
"input": {
"testCaseId": "cm123456789",
"promptVersion": 3 // Optional: defaults to latest
}
}
Batch Run Tests
Run multiple test cases in a single operation. Useful for regression testing after prompt updates.
mutation BatchRunTests($input: BatchRunTestsInput!) {
batchRunTests(input: $input) {
testCaseExecutionId
executionId
passed
score
actualOutput
isRegression
regressionType
}
}
# Variables
{
"input": {
"promptId": "cm123456789",
"testCaseIds": [
"test1",
"test2",
"test3"
],
"promptVersion": 4 // Optional
}
}
Update Expected Output
Update the expected output for a test case. Useful when accepting new behavior as the baseline.
mutation UpdateExpectedOutput($testCaseId: ID!, $output: String!) {
updateExpectedOutput(testCaseId: $testCaseId, output: $output) {
id
expectedOutput
updatedAt
}
}
Test Scoring System
Test cases are evaluated using multiple comparison methods to provide a comprehensive quality assessment.
Scoring Components
Exact Match (Boolean)
Simple string equality check. True if outputs match exactly after trimming whitespace.
Semantic Score (0-1)
Cosine similarity between output embeddings. Measures semantic meaning rather than exact wording. Calculated using the text-embedding-3-small model.
Length Difference
Character count difference between actual and expected outputs. Helps identify verbose or terse responses.
Token Difference
Token count difference. Important for cost and context window tracking.
Pass/Fail Criteria
A test passes if either of the following is true (a scoring sketch follows this list):
- Exact match is true, OR
- Semantic score is >= 0.8 (80% similarity)
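The platform computes these scores server-side; purely as an illustration of how the pieces fit together, the sketch below derives an exact-match flag, a semantic score from text-embedding-3-small embeddings (via the official openai Node SDK), and the pass decision. The cosineSimilarity helper and the scoreOutputs wrapper are illustrative names, not part of this API.
import OpenAI from 'openai'

const openai = new OpenAI()

// Cosine similarity between two embedding vectors.
function cosineSimilarity(a, b) {
  let dot = 0, normA = 0, normB = 0
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i]
    normA += a[i] * a[i]
    normB += b[i] * b[i]
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB))
}

// Illustrative scoring: exact match, semantic score, and the pass rule.
async function scoreOutputs(actual, expected) {
  // Exact match: string equality after trimming whitespace.
  const exactMatch = actual.trim() === expected.trim()

  // Semantic score: cosine similarity of text-embedding-3-small embeddings.
  const { data } = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: [actual, expected]
  })
  const semanticScore = cosineSimilarity(data[0].embedding, data[1].embedding)

  // Pass if outputs match exactly OR semantic similarity is >= 0.8.
  const passed = exactMatch || semanticScore >= 0.8
  return { exactMatch, semanticScore, passed }
}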
Regression Detection
The system automatically detects regressions by comparing each test run against historical performance. Three types of regressions are tracked:
Regression Types
| Regression Type | Description |
|---|---|
| FAILED | A test that previously passed now fails (score < 0.8) |
| SCORE_DROP | The semantic score dropped by more than 10% compared to the recent average |
| LENGTH_CHANGE | The output length changed significantly (>30%) compared to the recent average |
Regression Detection Logic
// Test fails when it previously passed
if (!currentPassed && historicalPassRate > 0.8) {
regressionType = 'FAILED'
isRegression = true
}
// Score drops significantly
if (currentScore < recentAvgScore * 0.9) {
regressionType = 'SCORE_DROP'
isRegression = true
}
// Output length changes dramatically
const lengthChange = Math.abs(currentLength - recentAvgLength) / recentAvgLength
if (lengthChange > 0.3) {
regressionType = 'LENGTH_CHANGE'
isRegression = true
}
Testing Workflow
1. Create Test Cases
Start by creating test cases that represent important use cases for your prompt.
// Create multiple test cases for different scenarios
const testCases = [
{
name: "Positive sentiment",
input: { text: "I love this!" },
expectedOutput: "positive"
},
{
name: "Negative sentiment",
input: { text: "This is terrible." },
expectedOutput: "negative"
},
{
name: "Neutral sentiment",
input: { text: "It's okay." },
expectedOutput: "neutral"
}
]
for (const tc of testCases) {
await createTestCase({ promptId, ...tc })
}
2. Run Tests After Changes
After updating a prompt, run all test cases to check for regressions.
// Batch run all tests for a prompt
const results = await batchRunTests({
promptId: "cm123456789",
testCaseIds: ["test1", "test2", "test3"]
})
// Check for failures or regressions
const failures = results.filter(r => !r.passed)
const regressions = results.filter(r => r.isRegression)
if (failures.length > 0) {
console.log('Failed tests:', failures.length)
}
if (regressions.length > 0) {
console.log('Regressions detected:', regressions)
}
3. Analyze Results
Use test case statistics to track quality trends over time.
const stats = await getTestCaseStats({ testCaseId })
console.log('Pass rate:', stats.passRate + '%')
console.log('Average score:', stats.avgScore)
console.log('Recent regressions:', stats.regressions.length)
// Analyze trend data
stats.trendData.forEach(point => {
console.log(`${point.date}: ${point.passRate}% pass rate`)
})
4. Update Baselines
When new behavior is intentional, update the expected output to establish a new baseline.
// Accept new output as the expected baseline
await updateExpectedOutput({
testCaseId: "test123",
output: newExpectedOutput
})
Best Practices
A/B Testing
A/B testing enables you to compare different model configurations (branches) against the same test case to find the optimal setup for your use case. Test branches run in parallel and are automatically ranked based on quality, speed, cost, or a balanced score.
Branch Schema
TestBranch Type
| Parameter | Type | Description |
|---|---|---|
| id (required) | ID | Unique branch identifier |
| testCaseId (required) | ID | Parent test case ID |
| name (required) | string | Branch name (e.g., "GPT-4 High Temp") |
| description | string | Optional branch description |
| provider (required) | string | LLM provider: openai, anthropic, or gemini |
| model (required) | string | Model identifier |
| temperature | number | Temperature (0-2, default: 0.7) |
| maxTokens | number | Max output tokens (default: 2048) |
| topP | number | Top-p sampling (0-1, default: 1.0) |
| isControl | boolean | Whether this is the control group |
| isActive | boolean | Whether the branch is active for testing |
| totalRuns | number | Total executions for this branch |
| avgLatency | number | Average response time (ms) |
| avgCost | number | Average cost per execution |
| avgQualityScore | number | Average quality score (0-1) |
| avgTokensPerSec | number | Average throughput |
| consistencyScore | number | Output consistency metric (0-1) |
| passRate | number | Percentage passing the quality threshold |
TestBranchExecution Type
| Parameter | Type | Description |
|---|---|---|
| id (required) | ID | Execution record ID |
| branchId (required) | ID | Branch that was executed |
| executionId | ID | Reference to the prompt execution |
| output (required) | string | Generated output |
| error | string | Error message if the execution failed |
| status (required) | string | success or error |
| latencyMs (required) | number | Response time in milliseconds |
| timeToFirstToken | number | TTFT in milliseconds |
| tokensPerSecond | number | Throughput metric |
| inputTokens (required) | number | Input token count |
| outputTokens (required) | number | Output token count |
| totalTokens (required) | number | Total token count |
| estimatedCost (required) | number | Estimated cost in USD |
| qualityScore (required) | number | Quality score (0-1) |
| similarityScore | number | Semantic similarity score |
| clarityScore | number | Output readability score |
| validJson | boolean | Whether the output is valid JSON |
| consistencyHash | string | Hash for consistency tracking |
| promptVersion (required) | number | Prompt version used |
A/B Testing Queries
Get Test Branches
List all branches for a test case with their performance metrics.
query GetTestBranches($testCaseId: ID!) {
testBranches(testCaseId: $testCaseId) {
id
name
description
provider
model
temperature
maxTokens
topP
isControl
isActive
totalRuns
avgLatency
avgCost
avgQualityScore
avgTokensPerSec
consistencyScore
passRate
createdAt
}
}
Get Branch Execution History
Retrieve detailed execution history for a specific branch.
query GetBranchExecutions($branchId: ID!, $limit: Int, $offset: Int) {
testBranchExecutions(branchId: $branchId, limit: $limit, offset: $offset) {
id
output
status
latencyMs
timeToFirstToken
tokensPerSecond
inputTokens
outputTokens
estimatedCost
qualityScore
similarityScore
clarityScore
validJson
promptVersion
createdAt
}
}
A/B Testing Mutations
Create Test Branch
Create a new branch with a specific model configuration. A test case can have a maximum of 5 branches.
mutation CreateTestBranch($input: TestBranchInput!) {
createTestBranch(input: $input) {
id
name
model
temperature
}
}
# Variables
{
"input": {
"testCaseId": "tc_123",
"name": "GPT-4 High Temperature",
"description": "Testing GPT-4 with higher temperature for creativity",
"provider": "openai",
"model": "gpt-4-turbo-preview",
"temperature": 0.9,
"maxTokens": 1000,
"topP": 0.95,
"isControl": false
}
}
| Parameter | Type | Description |
|---|---|---|
| testCaseId (required) | ID | Test case ID |
| name (required) | string | Branch name |
| description | string | Optional description |
| provider (required) | string | openai, anthropic, or gemini |
| model (required) | string | Model identifier |
| temperature | number | Temperature (0-2) |
| maxTokens | number | Max output tokens |
| topP | number | Top-p sampling (0-1) |
| isControl | boolean | Mark as control group |
| otherParams | JSON | Additional provider-specific params |
Update Test Branch
mutation UpdateTestBranch($id: ID!, $input: TestBranchInput!) {
updateTestBranch(id: $id, input: $input) {
id
name
temperature
maxTokens
}
}
Delete Test Branch
mutation DeleteTestBranch($id: ID!) {
deleteTestBranch(id: $id)
}
Run A/B Test
Execute an A/B test across multiple branches and automatically determine the winner based on your criteria.
mutation RunABTest($input: RunABTestInput!) {
runABTest(input: $input) {
branches {
branchId
branchName
model
execution {
id
output
status
}
qualityScore
latencyMs
estimatedCost
tokensPerSecond
rank
}
winner {
branchId
branchName
model
qualityScore
latencyMs
estimatedCost
}
criteria
}
}
# Variables
{
"input": {
"testCaseId": "tc_123",
"branchIds": ["branch_1", "branch_2", "branch_3"],
"promptVersion": 2,
"winnerCriteria": "balanced"
}
}
| Parameter | Type | Description |
|---|---|---|
| testCaseId (required) | ID | Test case to run |
| branchIds (required) | [ID!]! | Array of branch IDs (2-3 recommended) |
| promptVersion | Int | Specific prompt version (defaults to latest) |
| winnerCriteria | string | Winner selection: best_quality, fastest, cheapest, best_value, or balanced (default: best_quality) |
Winner Criteria
The A/B test winner is determined automatically based on the selected criteria:
best_quality (Default)
Selects the branch with the highest quality score. Best for prioritizing output accuracy.
fastest
Selects the branch with the lowest latency. Best for real-time applications.
cheapest
Selects the branch with the lowest estimated cost. Best for cost optimization.
best_value
Selects the branch with the best quality-to-cost ratio, calculated as the quality score divided by estimated cost.
balanced
Composite score: 40% quality, 30% speed, 30% cost. Best for balanced performance across all metrics.
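The API computes these rankings for you, and the exact normalization it uses is not documented here. Conceptually, though, a balanced ranking can be approximated as in the sketch below, which scores speed and cost relative to the best branch in the comparison and applies the 40/30/30 weights (rankBalanced and abTestBranches are illustrative names, not part of this API; non-zero latency and cost are assumed).
// Illustrative only: rank branch results by a 40% quality / 30% speed / 30% cost blend.
// `results` is assumed to be an array of { branchName, qualityScore, latencyMs, estimatedCost }.
function rankBalanced(results) {
  const bestLatency = Math.min(...results.map(r => r.latencyMs))
  const bestCost = Math.min(...results.map(r => r.estimatedCost))
  return results
    .map(r => ({
      ...r,
      // Quality is already 0-1; speed and cost are scored relative to the best branch.
      balancedScore:
        0.4 * r.qualityScore +
        0.3 * (bestLatency / r.latencyMs) +
        0.3 * (bestCost / r.estimatedCost)
    }))
    .sort((a, b) => b.balancedScore - a.balancedScore)
}

const ranked = rankBalanced(abTestBranches) // abTestBranches: branch metrics from runABTest
console.log('Winner by balanced score:', ranked[0].branchName)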
A/B Testing Metrics
Each branch execution tracks comprehensive metrics for comparison; a sketch of deriving the aggregated metrics client-side follows the lists below.
Per-Execution Metrics
- Quality Score (0-1): Similarity to expected output using semantic comparison
- Latency (ms): Total response time from request to completion
- Cost (USD): Estimated cost based on token usage and model pricing
- Tokens/Second: Throughput metric for real-time performance
- Clarity Score (0-1): Output readability based on sentence structure
- Consistency Hash: For detecting output variations across runs
- Valid JSON: Whether output is parseable JSON (useful for structured outputs)
Aggregated Branch Metrics
- Total Runs: Number of executions for this branch
- Average Metrics: Mean quality, latency, cost, and throughput
- Pass Rate: Percentage passing quality threshold (70%+)
- Consistency Score: Variance in outputs (lower = more consistent)
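The aggregates are returned on the TestBranch type, but when you need a custom window (for example, only the most recent runs) most of them can also be derived client-side from testBranchExecutions. A rough sketch, using the TestBranchExecution fields documented above and the 70% quality threshold from the pass-rate definition (the helper name is illustrative):
// Illustrative helper: derive aggregate metrics from TestBranchExecution records.
function aggregateBranchMetrics(executions) {
  const ok = executions.filter(e => e.status === 'success')
  const avg = values => values.reduce((sum, v) => sum + v, 0) / (values.length || 1)

  return {
    totalRuns: executions.length,
    avgLatency: avg(ok.map(e => e.latencyMs)),
    avgCost: avg(ok.map(e => e.estimatedCost)),
    avgQualityScore: avg(ok.map(e => e.qualityScore)),
    avgTokensPerSec: avg(ok.map(e => e.tokensPerSecond ?? 0)),
    // Pass rate: percentage of successful runs at or above the 0.7 quality threshold.
    passRate: 100 * ok.filter(e => e.qualityScore >= 0.7).length / (ok.length || 1)
  }
}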
A/B Testing Workflow
Complete A/B Test Example
import { gql } from 'graphql-request'
// 1. Create a test case
const CREATE_TEST = gql`
mutation CreateTestCase($input: TestCaseInput!) {
createTestCase(input: $input) {
id
}
}
`
const testCase = await client.request(CREATE_TEST, {
input: {
promptId: "prompt_123",
name: "Sentiment Analysis Test",
input: {
text: "This product exceeded my expectations!"
},
expectedOutput: "positive"
}
})
// 2. Create test branches with different configurations
const CREATE_BRANCH = gql`
mutation CreateBranch($input: TestBranchInput!) {
createTestBranch(input: $input) {
id
}
}
`
const branches = await Promise.all([
// GPT-4 Turbo
client.request(CREATE_BRANCH, {
input: {
testCaseId: testCase.createTestCase.id,
name: "GPT-4 Turbo",
provider: "openai",
model: "gpt-4-turbo-preview",
temperature: 0.7
}
}),
// Claude Sonnet
client.request(CREATE_BRANCH, {
input: {
testCaseId: testCase.createTestCase.id,
name: "Claude Sonnet",
provider: "anthropic",
model: "claude-3-sonnet-20240229",
temperature: 0.7
}
}),
// Gemini Pro
client.request(CREATE_BRANCH, {
input: {
testCaseId: testCase.createTestCase.id,
name: "Gemini Pro",
provider: "gemini",
model: "gemini-pro",
temperature: 0.7
}
})
])
// 3. Run A/B test
const RUN_AB_TEST = gql`
mutation RunABTest($input: RunABTestInput!) {
runABTest(input: $input) {
winner {
branchId
branchName
model
qualityScore
latencyMs
estimatedCost
}
branches {
branchName
model
qualityScore
latencyMs
estimatedCost
rank
}
criteria
}
}
`
const result = await client.request(RUN_AB_TEST, {
input: {
testCaseId: testCase.createTestCase.id,
branchIds: branches.map(b => b.createTestBranch.id),
winnerCriteria: "balanced"
}
})
// 4. Analyze results
console.log('Winner:', result.runABTest.winner.branchName)
console.log(' Model:', result.runABTest.winner.model)
console.log(' Quality:', result.runABTest.winner.qualityScore)
console.log(' Latency:', result.runABTest.winner.latencyMs + 'ms')
console.log(' Cost: $' + result.runABTest.winner.estimatedCost)
console.log('\nAll Results:')
result.runABTest.branches.forEach(branch => {
console.log(`${branch.rank}. ${branch.branchName}`)
console.log(` Quality: ${branch.qualityScore}`)
console.log(` Speed: ${branch.latencyMs}ms`)
console.log(` Cost: $${branch.estimatedCost}`)
})
// 5. Get detailed execution history
const GET_EXECUTIONS = gql`
query GetBranchExecutions($branchId: ID!, $limit: Int) {
testBranchExecutions(branchId: $branchId, limit: $limit) {
output
qualityScore
latencyMs
estimatedCost
createdAt
}
}
`
const winnerId = result.runABTest.winner.branchId
const executions = await client.request(GET_EXECUTIONS, {
branchId: winnerId,
limit: 10
})
console.log('\nRecent executions for winner:')
executions.testBranchExecutions.forEach((exec, i) => {
console.log(`${i + 1}. Quality: ${exec.qualityScore}, Latency: ${exec.latencyMs}ms`)
})
Running Multiple Test Iterations
For statistically significant results, run A/B tests multiple times and analyze aggregate metrics.
// Run the same A/B test 5 times
const iterations = 5
const results = []
for (let i = 0; i < iterations; i++) {
const result = await client.request(RUN_AB_TEST, {
input: {
testCaseId: testCase.createTestCase.id,
branchIds: branches.map(b => b.createTestBranch.id),
winnerCriteria: "balanced"
}
})
results.push(result.runABTest)
// Small delay between iterations
await new Promise(resolve => setTimeout(resolve, 1000))
}
// Aggregate results
const winnerCounts = {}
results.forEach(r => {
const winner = r.winner.branchName
winnerCounts[winner] = (winnerCounts[winner] || 0) + 1
})
console.log('Winner distribution across', iterations, 'runs:')
Object.entries(winnerCounts).forEach(([name, count]) => {
console.log(` ${name}: ${count}/${iterations} (${(count/iterations*100).toFixed(1)}%)`)
})
// Calculate average metrics per branch
const branchStats = {}
results.forEach(result => {
result.branches.forEach(branch => {
if (!branchStats[branch.branchName]) {
branchStats[branch.branchName] = {
quality: [],
latency: [],
cost: []
}
}
branchStats[branch.branchName].quality.push(branch.qualityScore)
branchStats[branch.branchName].latency.push(branch.latencyMs)
branchStats[branch.branchName].cost.push(branch.estimatedCost)
})
})
console.log('\nAverage metrics across all runs:')
Object.entries(branchStats).forEach(([name, stats]) => {
const avgQuality = stats.quality.reduce((a, b) => a + b) / stats.quality.length
const avgLatency = stats.latency.reduce((a, b) => a + b) / stats.latency.length
const avgCost = stats.cost.reduce((a, b) => a + b) / stats.cost.length
console.log(`${name}:`)
console.log(` Avg Quality: ${avgQuality.toFixed(3)}`)
console.log(` Avg Latency: ${avgLatency.toFixed(0)}ms`)
console.log(` Avg Cost: $${avgCost.toFixed(5)}`)
})
A/B Testing Best Practices
Example: Complete Testing Flow
import { gql } from 'graphql-request'
// 1. Create test cases for a sentiment analysis prompt
const CREATE_TEST = gql`
mutation CreateTestCase($input: TestCaseInput!) {
createTestCase(input: $input) {
id
name
}
}
`
const testCases = [
{
promptId: "cm123",
name: "Strongly positive",
input: { text: "This is absolutely amazing! Best ever!" },
expectedOutput: "positive"
},
{
promptId: "cm123",
name: "Subtly negative",
input: { text: "I guess it's okay, but I expected better." },
expectedOutput: "negative"
}
]
const createdTests = await Promise.all(
testCases.map(tc => client.request(CREATE_TEST, { input: tc }))
)
// 2. Make changes to the prompt
// ... update prompt content ...
// 3. Run all tests against the new version
const BATCH_RUN = gql`
mutation BatchRun($input: BatchRunTestsInput!) {
batchRunTests(input: $input) {
testCaseExecutionId
passed
score
isRegression
regressionType
actualOutput
}
}
`
const results = await client.request(BATCH_RUN, {
input: {
promptId: "cm123",
testCaseIds: createdTests.map(t => t.createTestCase.id)
}
})
// 4. Analyze results
const summary = {
total: results.batchRunTests.length,
passed: results.batchRunTests.filter(r => r.passed).length,
regressions: results.batchRunTests.filter(r => r.isRegression)
}
console.log(`Test Results: ${summary.passed}/${summary.total} passed`)
if (summary.regressions.length > 0) {
console.log('Regressions detected:')
summary.regressions.forEach(r => {
console.log(` - Type: ${r.regressionType}, Score: ${r.score}`)
})
}
// 5. Get detailed stats for monitoring
const STATS = gql`
query GetStats($testCaseId: ID!) {
testCaseStats(testCaseId: $testCaseId) {
totalRuns
passRate
avgScore
trendData {
date
passRate
avgScore
}
}
}
`
for (const test of createdTests) {
const stats = await client.request(STATS, {
testCaseId: test.createTestCase.id
})
console.log(`${test.createTestCase.name}:`)
console.log(` Pass rate: ${stats.testCaseStats.passRate}%`)
console.log(` Avg score: ${stats.testCaseStats.avgScore}`)
}