- Context_Engineering.md: 에이전트 컨텍스트 엔지니어링 개념 정리 문서 추가 - Context_Engineering_Research.ipynb: 연구 노트북 업데이트 - deepagents_sourcecode/: docstring과 주석을 한국어로 번역
503 lines
16 KiB
Python
Executable File
503 lines
16 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Harbor용 LangSmith 연동 CLI입니다.
|
|
|
|
CLI for LangSmith integration with Harbor.
|
|
|
|
Provides commands for:
|
|
- Creating LangSmith datasets from Harbor tasks
|
|
- Creating experiment sessions
|
|
- Adding feedback from Harbor job results to LangSmith traces
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import datetime
|
|
import json
|
|
import os
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import aiohttp
|
|
import toml
|
|
from dotenv import load_dotenv
|
|
from harbor.models.dataset_item import DownloadedDatasetItem
|
|
from harbor.registry.client import RegistryClient
|
|
from langsmith import Client
|
|
|
|
from deepagents_harbor.tracing import create_example_id_from_instruction
|
|
|
|
# Run load_dotenv() before the environment lookups below so that any values it
# loads are visible to them.
load_dotenv()

# Base URL for raw LangSmith HTTP calls; LANGSMITH_ENDPOINT overrides the default.
LANGSMITH_API_URL = os.environ.get("LANGSMITH_ENDPOINT", "https://api.smith.langchain.com")

# Headers sent with every raw HTTP request made by this module.
HEADERS = {"x-api-key": os.environ.get("LANGSMITH_API_KEY")}
|
|
|
|
|
|
# ============================================================================
|
|
# CREATE DATASET
|
|
# ============================================================================
|
|
|
|
|
|
def _read_instruction(task_path: Path) -> str:
    """Read the instruction.md file from a task directory.

    Args:
        task_path: Path to the task directory.

    Returns:
        The contents of ``instruction.md`` decoded as UTF-8, or an empty
        string when no such regular file exists.
    """
    instruction_file = task_path / "instruction.md"
    if not instruction_file.is_file():
        return ""
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    return instruction_file.read_text(encoding="utf-8")
|
|
|
|
|
|
def _read_task_metadata(task_path: Path) -> dict:
    """Load the ``task.toml`` file of a task directory.

    Args:
        task_path: Path to the task directory.

    Returns:
        The parsed contents of ``task.toml``, or an empty dict when the file
        does not exist.
    """
    metadata_file = task_path / "task.toml"
    return toml.load(metadata_file) if metadata_file.exists() else {}
|
|
|
|
|
|
def _read_solution(task_path: Path) -> str | None:
    """Read the solution script from a task directory.

    Args:
        task_path: Path to the task directory.

    Returns:
        The contents of ``solution/solve.sh`` decoded as UTF-8 if that file
        exists, None otherwise.
    """
    solution_file = task_path / "solution" / "solve.sh"
    if not solution_file.is_file():
        return None
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    return solution_file.read_text(encoding="utf-8")
|
|
|
|
|
|
def _scan_downloaded_tasks(downloaded_tasks: list[DownloadedDatasetItem]) -> list:
    """Turn downloaded Harbor tasks into LangSmith example dictionaries.

    Args:
        downloaded_tasks: List of DownloadedDatasetItem objects from Harbor.

    Returns:
        One example dict per task that has a non-empty instruction. Each dict
        has an ``id``, an ``inputs`` mapping (task id, task name, instruction,
        metadata) and an ``outputs`` mapping that carries the reference
        solution when one was found.
    """
    collected = []

    for item in downloaded_tasks:
        root = item.downloaded_path

        instruction_text = _read_instruction(root)
        task_config = _read_task_metadata(root)
        reference = _read_solution(root)
        name = item.id.name
        identifier = str(item.id)

        # A task with no instruction text produces no example.
        if not instruction_text:
            continue

        # The example ID is derived from the instruction content.
        example_id = create_example_id_from_instruction(instruction_text)

        # The reference solution is optional; leave outputs empty without it.
        outputs = {"reference_solution": reference} if reference else {}

        collected.append(
            {
                "id": example_id,
                "inputs": {
                    "task_id": identifier,
                    "task_name": name,
                    "instruction": instruction_text,
                    "metadata": task_config.get("metadata", {}),
                },
                "outputs": outputs,
            }
        )

        if reference:
            solution_status = "with solution"
        else:
            solution_status = "without solution"
        print(f"Added task: {name} (ID: {identifier}, Example ID: {example_id}) [{solution_status}]")

    return collected
|
|
|
|
|
|
def create_dataset(dataset_name: str, version: str = "head", overwrite: bool = False) -> None:
    """Create a LangSmith dataset from Harbor tasks.

    The tasks are downloaded from the Harbor registry into a temporary
    directory, converted into example dictionaries, and uploaded to a newly
    created LangSmith dataset of the same name.

    Args:
        dataset_name: Dataset name (used for both Harbor download and LangSmith dataset)
        version: Harbor dataset version (default: 'head')
        overwrite: Whether to overwrite cached remote tasks
    """
    langsmith_client = Client()

    # The downloaded files are only needed while they are being scanned: the
    # resulting examples hold plain strings and dicts. Scoping the directory
    # to this block removes it afterwards instead of leaking it on disk.
    with tempfile.TemporaryDirectory(prefix="harbor_tasks_") as tmp_dir:
        output_dir = Path(tmp_dir)
        print(f"Using temporary directory: {output_dir}")

        # Download from Harbor registry
        print(f"Downloading dataset '{dataset_name}@{version}' from Harbor registry...")
        registry_client = RegistryClient()
        downloaded_tasks = registry_client.download_dataset(
            name=dataset_name,
            version=version,
            overwrite=overwrite,
            output_dir=output_dir,
        )

        print(f"Downloaded {len(downloaded_tasks)} tasks")
        examples = _scan_downloaded_tasks(downloaded_tasks)

    print(f"\nFound {len(examples)} tasks")

    # Create the dataset
    print(f"\nCreating LangSmith dataset: {dataset_name}")
    dataset = langsmith_client.create_dataset(dataset_name=dataset_name)

    print(f"Dataset created with ID: {dataset.id}")

    # Add examples to the dataset
    print(f"\nAdding {len(examples)} examples to dataset...")
    langsmith_client.create_examples(dataset_id=dataset.id, examples=examples)

    print(f"\nSuccessfully created dataset '{dataset_name}' with {len(examples)} examples")
    print(f"Dataset ID: {dataset.id}")
|
|
|
|
|
|
# ============================================================================
|
|
# CREATE EXPERIMENT
|
|
# ============================================================================
|
|
|
|
|
|
async def _create_experiment_session(
    dataset_id: str, name: str, session: aiohttp.ClientSession
) -> dict:
    """POST a new experiment session to the LangSmith ``/sessions`` endpoint.

    Args:
        dataset_id: LangSmith dataset ID sent as ``reference_dataset_id``.
        name: Name for the experiment session.
        session: aiohttp ClientSession used to send the request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the response status is anything other than 200.
    """
    payload = {
        # Timezone-aware UTC timestamp in ISO 8601 form.
        "start_time": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "reference_dataset_id": dataset_id,
        "name": name,
    }
    endpoint = f"{LANGSMITH_API_URL}/sessions"

    async with session.post(endpoint, headers=HEADERS, json=payload) as reply:
        if reply.status != 200:
            raise Exception(
                f"Failed to create experiment: {reply.status} {await reply.text()}"
            )
        return await reply.json()
|
|
|
|
|
|
async def _get_dataset_by_name(dataset_name: str, session: aiohttp.ClientSession) -> dict:
    """Get a LangSmith dataset by name.

    Args:
        dataset_name: Name of the dataset to retrieve
        session: aiohttp ClientSession for making requests

    Returns:
        The first dataset entry of the decoded JSON response.

    Raises:
        Exception: If the response status is not 200, or if the response
            contains no datasets.
    """
    async with session.get(
        f"{LANGSMITH_API_URL}/datasets",
        # Pass the query through ``params`` so the name is URL-encoded; names
        # containing spaces, '&' or '#' would otherwise corrupt the query.
        params={"name": dataset_name, "limit": "1"},
        headers=HEADERS,
    ) as response:
        if response.status != 200:
            raise Exception(f"Failed to get dataset: {response.status} {await response.text()}")

        datasets = await response.json()
        if not datasets:
            raise Exception(f"Dataset '{dataset_name}' not found")
        return datasets[0]
|
|
|
|
|
|
async def create_experiment_async(dataset_name: str, experiment_name: str | None = None) -> str:
    """Look up a dataset by name and open a new experiment session for it.

    Args:
        dataset_name: Name of the LangSmith dataset to create the experiment for.
        experiment_name: Optional experiment name. When None, a name of the
            form ``harbor-experiment-<UTC timestamp>`` is generated.

    Returns:
        The ``id`` field of the created experiment session.
    """
    async with aiohttp.ClientSession() as http:
        # Resolve the dataset name to its ID.
        dataset_id = (await _get_dataset_by_name(dataset_name, http))["id"]
        print(f"Found dataset '{dataset_name}' with ID: {dataset_id}")

        # Fall back to a timestamped name when the caller supplied none.
        if experiment_name is None:
            now_utc = datetime.datetime.now(datetime.timezone.utc)
            stamp = now_utc.strftime("%Y-%m-%d_%H-%M-%S")
            experiment_name = f"harbor-experiment-{stamp}"

        print(f"Creating experiment session: {experiment_name}")
        created = await _create_experiment_session(dataset_id, experiment_name, http)
        session_id = created["id"]
        tenant_id = created["tenant_id"]

        compare_url = (
            f"https://smith.langchain.com/o/{tenant_id}/datasets/{dataset_id}"
            f"/compare?selectedSessions={session_id}"
        )
        print("✓ Experiment created successfully!")
        print(f" Session ID: {session_id}")
        print(f" View at: {compare_url}")
        print("\nTo run Harbor with this experiment, use:")
        print(f" LANGSMITH_EXPERIMENT={experiment_name} harbor run ...")

        return session_id
|
|
|
|
|
|
def create_experiment(dataset_name: str, experiment_name: str | None = None) -> str:
    """Run ``create_experiment_async`` to completion and return its result.

    Args:
        dataset_name: Name of the LangSmith dataset to create the experiment for.
        experiment_name: Optional experiment name, passed through unchanged.

    Returns:
        Whatever ``create_experiment_async`` returns (the experiment session ID).
    """
    pending = create_experiment_async(dataset_name, experiment_name)
    return asyncio.run(pending)
|
|
|
|
|
|
# ============================================================================
|
|
# ADD FEEDBACK
|
|
# ============================================================================
|
|
|
|
|
|
def _extract_reward(trial_dir: Path) -> float:
    """Extract reward from trial's result.json.

    Args:
        trial_dir: Path to the trial directory.

    Returns:
        The value at ``verifier_result.rewards.reward`` as a float. Returns
        0.0 when ``result.json`` is absent or when any level of that path is
        missing or null.

    Raises:
        OSError: If ``result.json`` exists but cannot be read.
        ValueError: If ``result.json`` is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``), or the stored
            reward cannot be converted to a float.
    """
    result_path = trial_dir / "result.json"
    if not result_path.exists():
        # If task completed but no result.json, assume reward 0.0 as default
        # because it was likely due to an exception.
        return 0.0

    with open(result_path, encoding="utf-8") as f:
        result = json.load(f)

    # ``or {}`` also covers keys that are present but explicitly null.
    verifier_result = result.get("verifier_result") or {}
    rewards = verifier_result.get("rewards") or {}
    reward = rewards.get("reward")

    # Apply the same 0.0 default as the missing-file case instead of leaking
    # None out of a function declared to return float.
    if reward is None:
        return 0.0
    return float(reward)
|
|
|
|
|
|
def _process_trial(
    client: Client,
    trial_dir: Path,
    project_name: str,
    dry_run: bool = False,
) -> dict:
    """Process a single trial and attach its reward to the matching trace.

    The trace is looked up by the trial directory's name, matched against the
    ``trial_name`` metadata of root runs in the given project.

    Args:
        client: LangSmith client used to list runs and feedback and to create feedback.
        trial_dir: Path to the trial directory; its name is the trial name.
        project_name: LangSmith project name to search for the trace.
        dry_run: If True, report what would be done without creating feedback.

    Returns:
        A dict with a ``status`` of ``"success"``, ``"skipped"`` or
        ``"error"`` and a human-readable ``message``. Failures are reported
        through the ``"error"`` status rather than raised, so one bad trial
        does not abort a batch.
    """
    trial_name = trial_dir.name

    # Find the trace by trial_name metadata
    try:
        # Build filter to match trial_name in metadata
        filter_query = f'and(eq(metadata_key, "trial_name"), eq(metadata_value, "{trial_name}"))'

        # Fetch runs matching the filter
        runs = list(
            client.list_runs(
                project_name=project_name,
                filter=filter_query,
                is_root=True,
            )
        )
    except Exception as e:
        return {"status": "error", "message": f"Failed to fetch trace: {e}"}

    if not runs:
        return {"status": "error", "message": f"No trace found for trial_name {trial_name}"}

    if len(runs) > 1:
        return {"status": "error", "message": f"Multiple traces found for trial_name {trial_name}"}

    run_id = str(runs[0].id)

    # Check if feedback already exists. This check is best-effort: if it
    # fails, processing continues, but the failure is reported instead of
    # being silently discarded.
    try:
        feedback_list = list(client.list_feedback(run_ids=[run_id]))
        if any(fb.key == "harbor_reward" for fb in feedback_list):
            return {"status": "skipped", "message": "Feedback already exists"}
    except Exception as e:
        print(f" ! Could not check existing feedback for {trial_name}: {e}")

    # Extract reward; an unreadable or malformed result file becomes an error
    # status (json.JSONDecodeError is a ValueError).
    try:
        reward = _extract_reward(trial_dir)
    except (OSError, ValueError) as e:
        return {"status": "error", "message": f"Failed to read reward: {e}"}

    if dry_run:
        return {
            "status": "success",
            "message": f"Would add harbor_reward feedback: {reward}",
        }

    # Report upload failures the same way as lookup failures.
    try:
        client.create_feedback(
            run_id=run_id,
            key="harbor_reward",
            score=reward,
        )
    except Exception as e:
        return {"status": "error", "message": f"Failed to add feedback: {e}"}

    return {
        "status": "success",
        "message": f"Added harbor_reward feedback: {reward}",
    }
|
|
|
|
|
|
def add_feedback(job_folder: Path, project_name: str, dry_run: bool = False) -> None:
    """Process every trial directory in a job folder and print a summary.

    Each immediate subdirectory of ``job_folder`` is treated as one trial and
    handed to ``_process_trial``; the outcome of each is printed and counted.

    Args:
        job_folder: Path to the Harbor job folder.
        project_name: LangSmith project name to search for traces.
        dry_run: If True, show what would be done without making changes.
    """
    print(f"Processing job folder: {job_folder}")
    print(f"LangSmith project: {project_name}")
    if dry_run:
        print("DRY RUN MODE - No changes will be made")
    print()

    # Every immediate subdirectory counts as a trial.
    trial_dirs = [entry for entry in job_folder.iterdir() if entry.is_dir()]
    total = len(trial_dirs)
    print(f"Found {total} trial directories\n")

    results = {"success": 0, "skipped": 0, "error": 0}
    # Marker printed for each known status; any other status counts as an error.
    markers = {"success": "✓", "skipped": "⊘"}
    client = Client()

    for position, trial_dir in enumerate(trial_dirs, 1):
        print(f"[{position}/{total}] Processing {trial_dir.name}...")

        outcome = _process_trial(
            trial_dir=trial_dir,
            project_name=project_name,
            client=client,
            dry_run=dry_run,
        )
        status = outcome["status"]
        message = outcome["message"]

        bucket = status if status in markers else "error"
        print(f" {markers.get(status, '✗')} {message}")
        results[bucket] += 1

    # Print summary
    rule = "=" * 80
    print(f"\n{rule}")
    print("SUMMARY")
    print(f"{rule}")
    print(f"Total trials: {total}")
    print(f"Successfully updated: {results['success']}")
    print(f"Skipped (already has feedback): {results['skipped']}")
    print(f"Errors: {results['error']}")
|
|
|
|
|
|
def main() -> int:
    """Main CLI entrypoint with subcommands.

    Parses the command line and dispatches to ``create_dataset``,
    ``create_experiment`` or ``add_feedback``.

    Returns:
        Process exit status: 1 when the ``add-feedback`` job folder is missing
        or is not a directory, 0 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Harbor-LangSmith integration CLI for managing datasets, experiments, and feedback.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands", required=True)

    # ========================================================================
    # create-dataset subcommand
    # ========================================================================
    dataset_parser = subparsers.add_parser(
        "create-dataset",
        help="Create a LangSmith dataset from Harbor tasks",
    )
    dataset_parser.add_argument(
        "dataset_name",
        type=str,
        help="Dataset name (e.g., 'terminal-bench')",
    )
    dataset_parser.add_argument(
        "--version",
        type=str,
        default="head",
        help="Dataset version (default: 'head')",
    )
    dataset_parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite cached remote tasks",
    )

    # ========================================================================
    # create-experiment subcommand
    # ========================================================================
    experiment_parser = subparsers.add_parser(
        "create-experiment",
        help="Create an experiment session for a dataset",
    )
    experiment_parser.add_argument(
        "dataset_name",
        type=str,
        help="Dataset name (must already exist in LangSmith)",
    )
    experiment_parser.add_argument(
        "--name",
        type=str,
        help="Name for the experiment (auto-generated if not provided)",
    )

    # ========================================================================
    # add-feedback subcommand
    # ========================================================================
    feedback_parser = subparsers.add_parser(
        "add-feedback",
        help="Add Harbor reward feedback to LangSmith traces",
    )
    feedback_parser.add_argument(
        "job_folder",
        type=Path,
        help="Path to the job folder (e.g., jobs/terminal-bench/2025-12-02__16-25-40)",
    )
    feedback_parser.add_argument(
        "--project-name",
        type=str,
        required=True,
        help="LangSmith project name to search for traces",
    )
    feedback_parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be done without making changes",
    )

    args = parser.parse_args()

    # Route to appropriate command
    if args.command == "create-dataset":
        create_dataset(
            dataset_name=args.dataset_name,
            version=args.version,
            overwrite=args.overwrite,
        )
    elif args.command == "create-experiment":
        create_experiment(
            dataset_name=args.dataset_name,
            experiment_name=args.name,
        )
    elif args.command == "add-feedback":
        if not args.job_folder.exists():
            print(f"Error: Job folder does not exist: {args.job_folder}")
            return 1
        # add_feedback iterates the folder's entries, so a path that exists
        # but is a regular file must be rejected here as well.
        if not args.job_folder.is_dir():
            print(f"Error: Job folder is not a directory: {args.job_folder}")
            return 1
        add_feedback(
            job_folder=args.job_folder,
            project_name=args.project_name,
            dry_run=args.dry_run,
        )

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status. SystemExit is
    # raised directly because the bare exit() helper is provided by the site
    # module and is not guaranteed to be available (e.g. under ``python -S``).
    raise SystemExit(main())
|