Skip to content

Metrics API

Trajectory Metrics

from agentgate import node_f1, edge_f1, tool_edit_distance

trace = adapter.run("Book a flight")
expected = ["search_flights", "book_flight"]

node_f1(trace, expected)            # 1.0 — all expected tools called
edge_f1(trace, expected)            # 1.0 — correct ordering
tool_edit_distance(trace, expected)  # 0.0 — exact match

Side Effects & Repetition

from agentgate import side_effect_rate, repetition_rate

side_effect_rate(trace, allowed=["search", "book"], mutating=["cancel"])
repetition_rate(trace)  # fraction of consecutive duplicate steps

SABER Deviation

from agentgate import decisive_deviation_score

score = decisive_deviation_score(
    trace, expected_tools=["search", "book"],
    mutating_tools=["book", "cancel"],
)
# Mutating deviations weighted 5x

Trajectory Analysis

from agentgate import step_credit, critical_steps, trajectory_redundancy, trajectory_efficiency

credits = step_credit(trace, expected)          # [1.0, -0.5, 0.5, 1.0]
critical = critical_steps(trace, expected)       # [2, 5]
redundancy = trajectory_redundancy(trace, expected)  # 0.1
efficiency = trajectory_efficiency(trace, expected)  # 0.85

Confidence Calibration

from agentgate import trajectory_confidence, detect_handoff_errors

conf = trajectory_confidence(trace)  # 0.0-1.0
errors = detect_handoff_errors(trace)
# [HandoffError("data_gap", step=3, detail="empty input")]

Reproducibility

from agentgate import variance_report, reproducibility_score

results = suite.run(agent, runs=10)
report = variance_report(results.results)
# VarianceReport(consistency=0.8, pass_rate=0.8)

score = reproducibility_score(results.results)  # 0.0-1.0

Cost

from agentgate import token_cost, cost_score, pareto_frontier, AgentResult

cost = token_cost(trace)  # $0.0045
efficiency = cost_score(trace, max_tokens=10000)  # 0.85

results = [
    AgentResult("GPT-4o", success_rate=0.92, cost=45.0),
    AgentResult("DeepSeek", success_rate=0.78, cost=1.5),
]
frontier = pareto_frontier(results)

Memory Evaluation

from agentgate import MemoryProbe, memory_consistency_suite

probes = [MemoryProbe(
    fact="User birthday is March 5",
    queries=["When is my birthday?"],
    expected_answers=["march 5"],
    competency="AR",
)]
suite = memory_consistency_suite(probes)

Multi-Agent

from agentgate import MultiAgentTrace, collaboration_quality, coordination_efficiency

mat = MultiAgentTrace(
    traces={"planner": t1, "executor": t2},
    coordination_protocol="star",
    total_messages=5,
)
quality = collaboration_quality(mat)       # 0.0-1.0
efficiency = coordination_efficiency(mat)  # 0.0-1.0

Silent Failure Detection

from agentgate import full_silent_failure_scan

report = full_silent_failure_scan(
    trace,
    expected_tools=["search", "book"],
    max_tool_repeats=2,
    required_output_keywords=["confirmation"],
)
print(report.failure_types)  # ['drift', 'cycles']