#!/usr/bin/env python3 """Tool Trajectory verification script with detailed logging. This script verifies the research agent tools work correctly by: 1. Testing each tool individually with logging 2. Verifying the tool call sequence (trajectory) 3. Outputting detailed logs for debugging Usage: uv run python scripts/verify_tool_trajectory.py """ from __future__ import annotations import logging import sys from dataclasses import dataclass, field from datetime import datetime from pathlib import Path from typing import Any from rich.console import Console from rich.logging import RichHandler from rich.panel import Panel from rich.table import Table logging.basicConfig( level=logging.DEBUG, format="%(message)s", handlers=[RichHandler(rich_tracebacks=True, show_path=False)], ) log = logging.getLogger("tool_trajectory") console = Console() @dataclass class ToolCall: tool_name: str input_args: dict[str, Any] output: str duration_ms: float success: bool error: str | None = None @dataclass class ToolTrajectory: calls: list[ToolCall] = field(default_factory=list) start_time: datetime = field(default_factory=datetime.now) def add_call(self, call: ToolCall) -> None: self.calls.append(call) log.info( f"[{len(self.calls)}] {call.tool_name} " f"({'OK' if call.success else 'FAIL'}) " f"[{call.duration_ms:.0f}ms]" ) def summary(self) -> str: total = len(self.calls) success = sum(1 for c in self.calls if c.success) return f"Total: {total}, Success: {success}, Failed: {total - success}" def test_tool( trajectory: ToolTrajectory, tool_name: str, tool_func: Any, args: dict[str, Any], ) -> bool: log.debug(f"Testing {tool_name} with args: {args}") start = datetime.now() try: result = tool_func.invoke(args) duration = (datetime.now() - start).total_seconds() * 1000 call = ToolCall( tool_name=tool_name, input_args=args, output=result[:500] if len(result) > 500 else result, duration_ms=duration, success=True, ) trajectory.add_call(call) return True except Exception as e: duration = (datetime.now() - start).total_seconds() * 1000 call = ToolCall( tool_name=tool_name, input_args=args, output="", duration_ms=duration, success=False, error=str(e), ) trajectory.add_call(call) log.error(f"Error in {tool_name}: {e}") return False def main() -> int: console.print( Panel( "[bold cyan]Tool Trajectory Verification[/bold cyan]\n" "[dim]Testing research agent tools with detailed logging[/dim]", title="Verification Started", ) ) from research_agent.tools import ( arxiv_search, github_code_search, library_docs_search, tavily_search, think_tool, ) trajectory = ToolTrajectory() console.print("\n[bold]Phase 1: Individual Tool Tests[/bold]\n") test_cases = [ ("think_tool", think_tool, {"reflection": "Testing reflection capability"}), ( "tavily_search", tavily_search, {"query": "context engineering", "max_results": 1}, ), ( "arxiv_search", arxiv_search, {"query": "large language model", "max_results": 2}, ), ( "github_code_search", github_code_search, {"query": "useState(", "max_results": 2}, ), ] for tool_name, tool_func, args in test_cases: console.print(f" Testing: [cyan]{tool_name}[/cyan]...") test_tool(trajectory, tool_name, tool_func, args) console.print("\n[bold]Phase 2: Tool Trajectory Analysis[/bold]\n") table = Table(title="Tool Call Trajectory") table.add_column("#", style="cyan", width=3) table.add_column("Tool", style="green") table.add_column("Status", style="yellow") table.add_column("Duration", style="blue") table.add_column("Output Preview", style="dim", max_width=50) for i, call in enumerate(trajectory.calls, 1): status = ( "[green]OK[/green]" if call.success else f"[red]FAIL: {call.error}[/red]" ) output_preview = ( call.output[:50] + "..." if len(call.output) > 50 else call.output ) output_preview = output_preview.replace("\n", " ") table.add_row( str(i), call.tool_name, status, f"{call.duration_ms:.0f}ms", output_preview, ) console.print(table) console.print("\n[bold]Phase 3: Verification Summary[/bold]\n") total_calls = len(trajectory.calls) success_calls = sum(1 for c in trajectory.calls if c.success) failed_calls = total_calls - success_calls summary_table = Table(show_header=False) summary_table.add_column("Metric", style="bold") summary_table.add_column("Value") summary_table.add_row("Total Tool Calls", str(total_calls)) summary_table.add_row("Successful", f"[green]{success_calls}[/green]") summary_table.add_row( "Failed", f"[red]{failed_calls}[/red]" if failed_calls > 0 else "[green]0[/green]", ) summary_table.add_row( "Total Duration", f"{sum(c.duration_ms for c in trajectory.calls):.0f}ms", ) console.print(summary_table) log_path = Path("research_workspace") / "tool_trajectory.log" log_path.parent.mkdir(parents=True, exist_ok=True) with open(log_path, "w") as f: f.write(f"Tool Trajectory Log - {datetime.now().isoformat()}\n") f.write("=" * 60 + "\n\n") for i, call in enumerate(trajectory.calls, 1): f.write(f"[{i}] {call.tool_name}\n") f.write(f" Args: {call.input_args}\n") f.write(f" Success: {call.success}\n") f.write(f" Duration: {call.duration_ms:.0f}ms\n") if call.error: f.write(f" Error: {call.error}\n") f.write(f" Output:\n{call.output}\n") f.write("-" * 40 + "\n") console.print(f"\n[dim]Log saved to: {log_path}[/dim]") if failed_calls > 0: console.print( Panel( f"[red]Verification FAILED[/red]\n" f"{failed_calls} tool(s) failed. Check logs above.", border_style="red", ) ) return 1 console.print( Panel( "[green]Verification PASSED[/green]\n" "All tools executed successfully with correct trajectory.", border_style="green", ) ) return 0 if __name__ == "__main__": sys.exit(main())