Evaluation & Testing Guide¶
OrmAI includes a comprehensive evaluation and replay framework for testing agent interactions with your database.
Overview¶
The eval framework helps you:
- Record and replay tool calls
- Verify determinism across runs
- Check invariants (no cross-tenant data, budget compliance)
- Test policy enforcement
- Benchmark performance
Recording Tool Calls¶
Basic Recording¶
from ormai.eval import CallRecorder
recorder = CallRecorder()
# Wrap your toolset
recorded_toolset = recorder.wrap(toolset)
# Execute operations (they're now recorded)
await recorded_toolset.query(ctx, model="Order", ...)
await recorded_toolset.get(ctx, model="User", id="u-123")
await recorded_toolset.create(ctx, model="Order", data={...})
# Get recorded calls
calls = recorder.get_calls()
print(f"Recorded {len(calls)} calls")
Call Record Structure¶
@dataclass
class CallRecord:
    id: str
    timestamp: datetime
    tool_name: str
    inputs: dict
    principal: dict
    result: ToolResult
    execution_time_ms: float
    metadata: dict
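Because each record carries its inputs, principal, result, and timing, a recording can double as a lightweight trace. As a minimal sketch (assuming get_calls() returns CallRecord instances as described above; the 100 ms threshold is an arbitrary example value), you can scan a session for failed or slow calls:
# Scan a recorded session for failed or slow calls.
SLOW_THRESHOLD_MS = 100  # hypothetical threshold, tune for your workload
for call in recorder.get_calls():
    if not call.result.success:
        print(f"{call.id} {call.tool_name}: failed with {call.result.error}")
    elif call.execution_time_ms > SLOW_THRESHOLD_MS:
        print(f"{call.id} {call.tool_name}: {call.execution_time_ms:.1f}ms")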
Saving Recordings¶
# Save to file
recorder.save("./recordings/session_001.jsonl")
# Load from file
recorder = CallRecorder.load("./recordings/session_001.jsonl")
Replay Engine¶
Basic Replay¶
from ormai.eval import ReplayEngine
engine = ReplayEngine(toolset)
# Load recorded calls
calls = CallRecorder.load("./recordings/session_001.jsonl").get_calls()
# Replay all calls
results = await engine.replay(calls)
for original, replayed in results:
    print(f"Tool: {original.tool_name}")
    print(f"Original success: {original.result.success}")
    print(f"Replayed success: {replayed.success}")
Replay with Context Override¶
# Replay as different user
results = await engine.replay(
    calls,
    ctx_override=RunContext(
        principal=Principal(tenant_id="test-tenant", user_id="test-user"),
        db=test_session,
    ),
)
Determinism Checking¶
Basic Check¶
from ormai.eval import DeterminismChecker
checker = DeterminismChecker(toolset)
# Replay the recorded calls multiple times and compare the results
is_deterministic = await checker.check(
    calls,
    num_runs=3,  # Replay 3 times
)
if not is_deterministic:
    print("Non-deterministic behavior detected!")
    for diff in checker.get_diffs():
        print(f"Call {diff.call_id}: {diff.description}")
Comparison Options¶
checker = DeterminismChecker(
    toolset,
    compare_options={
        "ignore_fields": ["created_at", "updated_at", "id"],
        "ignore_order": True,  # Don't compare row order
        "tolerance": 0.001,  # Float comparison tolerance
    },
)
Invariant Testing¶
Built-in Invariants¶
from ormai.eval import (
    EvalHarness,
    no_cross_tenant_data,
    no_denied_fields,
    response_within_budget,
)
harness = EvalHarness(toolset, policy)
# Test with invariants
result = await harness.run(
    ctx,
    tool="query",
    kwargs={"model": "Order", "limit": 100},
    invariants=[
        no_cross_tenant_data,
        no_denied_fields,
        response_within_budget,
    ],
)
if result.violations:
    for violation in result.violations:
        print(f"Invariant violated: {violation.name}")
        print(f" Details: {violation.message}")
Custom Invariants¶
from ormai.eval import Invariant, InvariantResult
class MaxRowsInvariant(Invariant):
    name = "max_rows"
    def __init__(self, max_rows: int):
        self.max_rows = max_rows
    def check(
        self,
        ctx: RunContext,
        tool_name: str,
        inputs: dict,
        result: ToolResult,
    ) -> InvariantResult:
        if result.success and len(result.data.get("rows", [])) > self.max_rows:
            return InvariantResult(
                passed=False,
                message=f"Returned {len(result.data['rows'])} rows, max is {self.max_rows}",
            )
        return InvariantResult(passed=True)
# Use custom invariant
result = await harness.run(
    ctx,
    tool="query",
    kwargs={"model": "Order"},
    invariants=[MaxRowsInvariant(max_rows=50)],
)
Invariant for All Calls¶
class NoSensitiveDataInvariant(Invariant):
    name = "no_sensitive_data"
    sensitive_fields = ["ssn", "password", "secret"]
    def check(self, ctx, tool_name, inputs, result) -> InvariantResult:
        if not result.success:
            return InvariantResult(passed=True)
        rows = result.data.get("rows", [result.data])
        for row in rows:
            for field in self.sensitive_fields:
                if field in row and row[field] not in [None, "[REDACTED]", "***"]:
                    return InvariantResult(
                        passed=False,
                        message=f"Sensitive field '{field}' exposed in response",
                    )
        return InvariantResult(passed=True)
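A custom invariant like this is passed the same way as the built-in ones, per call or per suite entry. A short sketch reusing the harness.run() shape from above:
# Combine a built-in invariant with the custom one on a single call.
result = await harness.run(
    ctx,
    tool="query",
    kwargs={"model": "User", "limit": 20},
    invariants=[no_cross_tenant_data, NoSensitiveDataInvariant()],
)
assert not result.violations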
Eval Harness¶
Comprehensive Testing¶
from ormai.eval import EvalHarness
harness = EvalHarness(
    toolset=toolset,
    policy=policy,
    adapter=adapter,
)
# Run test suite
results = await harness.run_suite([
    # Test queries
    {
        "name": "query_orders",
        "tool": "query",
        "kwargs": {"model": "Order", "limit": 10},
        "invariants": [no_cross_tenant_data],
        "expected": {"success": True, "min_rows": 0},
    },
    # Test forbidden model
    {
        "name": "query_forbidden",
        "tool": "query",
        "kwargs": {"model": "SecretModel"},
        "expected": {"success": False, "error_code": "MODEL_NOT_ALLOWED"},
    },
    # Test write
    {
        "name": "create_order",
        "tool": "create",
        "kwargs": {"model": "Order", "data": {"status": "pending"}},
        "invariants": [no_denied_fields],
        "expected": {"success": True},
    },
])
# Report
print(f"Passed: {results.passed}/{results.total}")
for failure in results.failures:
    print(f"FAILED: {failure.name}")
    print(f" Reason: {failure.reason}")
Test Fixtures¶
@harness.fixture
async def sample_orders(ctx):
"""Create sample orders for testing."""
orders = []
for i in range(5):
result = await toolset.create(
ctx,
model="Order",
data={"status": "pending", "total": 1000 * (i + 1)},
)
orders.append(result.data)
return orders
# Use in tests
results = await harness.run_suite([
    {
        "name": "query_with_fixtures",
        "tool": "query",
        "kwargs": {"model": "Order"},
        "fixtures": ["sample_orders"],
        "expected": {"min_rows": 5},
    },
])
Performance Benchmarking¶
from ormai.eval import Benchmark
benchmark = Benchmark(toolset)
# Benchmark a query
stats = await benchmark.run(
    tool="query",
    kwargs={"model": "Order", "limit": 100},
    iterations=100,
)
print(f"Mean: {stats.mean_ms:.2f}ms")
print(f"P50: {stats.p50_ms:.2f}ms")
print(f"P95: {stats.p95_ms:.2f}ms")
print(f"P99: {stats.p99_ms:.2f}ms")
print(f"Min: {stats.min_ms:.2f}ms")
print(f"Max: {stats.max_ms:.2f}ms")
Comparative Benchmarks¶
# Compare different configurations
results = await benchmark.compare([
    {
        "name": "limit_10",
        "tool": "query",
        "kwargs": {"model": "Order", "limit": 10},
    },
    {
        "name": "limit_100",
        "tool": "query",
        "kwargs": {"model": "Order", "limit": 100},
    },
    {
        "name": "limit_1000",
        "tool": "query",
        "kwargs": {"model": "Order", "limit": 1000},
    },
])
for name, stats in results.items():
    print(f"{name}: {stats.mean_ms:.2f}ms mean")
Integration with pytest¶
# tests/test_tools.py
import pytest
from ormai.eval import EvalHarness, no_cross_tenant_data
@pytest.fixture
async def harness(toolset, policy, adapter):
    return EvalHarness(toolset, policy, adapter)
@pytest.fixture
async def ctx(db_session):
    return RunContext(
        principal=Principal(tenant_id="test", user_id="test-user"),
        db=db_session,
    )
async def test_query_respects_tenant_scope(harness, ctx):
    result = await harness.run(
        ctx,
        tool="query",
        kwargs={"model": "Order"},
        invariants=[no_cross_tenant_data],
    )
    assert not result.violations
async def test_denied_model_rejected(harness, ctx):
    result = await harness.run(
        ctx,
        tool="query",
        kwargs={"model": "SecretModel"},
    )
    assert not result.result.success
    assert "MODEL_NOT_ALLOWED" in str(result.result.error)
async def test_field_masking_applied(harness, ctx):
    result = await harness.run(
        ctx,
        tool="query",
        kwargs={"model": "User", "select": ["email"]},
    )
    for row in result.result.data["rows"]:
        # Email should be masked
        assert "***" in row.get("email", "")
CI/CD Integration¶
# .github/workflows/eval.yml
name: Evaluation Tests
on: [push, pull_request]
jobs:
  eval:
    runs-on: ubuntu-latest
    services:
      postgres:
        image: postgres:15
        env:
          POSTGRES_PASSWORD: test
          POSTGRES_DB: test
        ports:
          - 5432:5432  # expose Postgres to the runner
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - run: pip install -e .[test]
      - name: Run eval tests
        run: pytest tests/eval/ -v
        env:
          DATABASE_URL: postgres://postgres:test@localhost/test
      - name: Run benchmarks
        run: python -m ormai.eval benchmark --config ./benchmark.yaml
      - name: Upload results
        uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: ./eval-results/
Best Practices¶
- Record production samples - Use real query patterns for testing (see the sketch after this list)
- Test all invariants - Especially cross-tenant and field policies
- Benchmark regularly - Catch performance regressions
- Use fixtures - Consistent test data
- Test edge cases - Empty results, max limits, errors
- Replay across versions - Ensure backward compatibility
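For recording production samples, one low-friction approach is to wrap the live toolset behind an opt-in flag and flush recordings to dated files that CI can replay later. In this sketch the ORMAI_RECORD environment variable and the recordings path are hypothetical conventions, not part of OrmAI:
import os
from datetime import date
from ormai.eval import CallRecorder
recorder = CallRecorder()
# Record only when explicitly enabled, e.g. for a small slice of traffic.
if os.environ.get("ORMAI_RECORD") == "1":
    toolset = recorder.wrap(toolset)
def flush_recordings() -> None:
    """Persist the current session so it can be replayed in CI later."""
    recorder.save(f"./recordings/prod_{date.today().isoformat()}.jsonl")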
Next Steps¶
- Multi-Tenant Setup - Test tenant isolation
- Policies - Policy testing
- Audit Logging - Verify audit trails