diff --git a/.claude/research-ralph-loop.local.md b/.claude/research-ralph-loop.local.md deleted file mode 100644 index 8505620..0000000 --- a/.claude/research-ralph-loop.local.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -active: true -iteration: 1 -max_iterations: 1 -completion_promise: "RESEARCH_COMPLETE" -started_at: "2026-01-12T06:46:24.266812+00:00" -findings_count: 0 -coverage_score: 0.0 ---- - -## Research Iteration 1/1 - -### Original Query -test query - -### Previous Work -Check `research_workspace/` for previous findings. -Read TODO.md for tracked progress. - -### Instructions -1. Review existing findings -2. Identify knowledge gaps -3. Conduct targeted searches using: web -4. Update research files with new findings -5. Update TODO.md with progress - -### Completion Criteria -Output `RESEARCH_COMPLETE` ONLY when: -- Coverage score >= 0.5 (current: 0.00) -- All major aspects addressed -- Findings cross-validated with 2+ sources -- DO NOT lie to exit - -### Current Stats -- Iteration: 1 -- Findings: 0 -- Coverage: 0.00% - diff --git a/Context_Engineering_Research.ipynb b/Context_Engineering_Research.ipynb index 8951e73..1c39068 100644 --- a/Context_Engineering_Research.ipynb +++ b/Context_Engineering_Research.ipynb @@ -107,10 +107,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "offloading_demo", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "토큰 임계값: 20,000\n", + "축출 경로: /large_tool_results\n", + "미리보기 줄 수: 10\n" + ] + } + ], "source": [ "from context_engineering_research_agent.context_strategies.offloading import (\n", " ContextOffloadingStrategy,\n", @@ -130,10 +140,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "offloading_test", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "짧은 콘텐츠: 600 자 → 축출 대상: False\n", + "대용량 콘텐츠: 210,000 자 → 축출 대상: True\n" + ] + } + ], "source": [ "strategy = ContextOffloadingStrategy(config=config)\n", "\n", @@ -167,10 +186,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "reduction_demo", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "임계값: 85.0%\n", + "컨텍스트 윈도우: 200,000 토큰\n", + "Compaction 대상 나이: 10 메시지\n", + "최소 유지 메시지: 5\n" + ] + } + ], "source": [ "from context_engineering_research_agent.context_strategies.reduction import (\n", " ContextReductionStrategy,\n", @@ -192,10 +222,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "reduction_test", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "컨텍스트 사용률: 25.0%\n", + "축소 필요: False\n" + ] + } + ], "source": [ "from langchain_core.messages import AIMessage, HumanMessage\n", "\n", @@ -234,10 +273,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "retrieval_demo", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "기본 읽기 제한: 500 줄\n", + "grep 최대 결과: 100\n", + "glob 최대 결과: 100\n", + "줄 길이 제한: 2000 자\n" + ] + } + ], "source": [ "from context_engineering_research_agent.context_strategies.retrieval import (\n", " ContextRetrievalStrategy,\n", @@ -283,10 +333,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "isolation_demo", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "기본 모델: gpt-4.1\n", + "범용 에이전트 포함: True\n", + "제외 상태 키: ('messages', 'todos', 'structured_response')\n" + ] + } + ], "source": [ "from context_engineering_research_agent.context_strategies.isolation import (\n", " ContextIsolationStrategy,\n", @@ -327,10 +387,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "caching_demo", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "최소 캐싱 토큰: 1,024\n", + "캐시 컨트롤 타입: ephemeral\n", + "시스템 프롬프트 캐싱: True\n", + "도구 캐싱: True\n" + ] + } + ], "source": [ "from context_engineering_research_agent.context_strategies.caching import (\n", " ContextCachingStrategy,\n", @@ -352,10 +423,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "caching_test", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "짧은 콘텐츠: 11 자 → 캐싱 대상: False\n", + "긴 콘텐츠: 5,500 자 → 캐싱 대상: True\n" + ] + } + ], "source": [ "strategy = ContextCachingStrategy(config=config)\n", "\n", @@ -380,15 +460,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "agent_create", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "에이전트 타입: CompiledStateGraph\n" + ] + } + ], "source": [ "from context_engineering_research_agent import create_context_aware_agent\n", "\n", "agent = create_context_aware_agent(\n", - " model_name=\"gpt-4.1\",\n", + " model=\"gpt-4.1\",\n", " enable_offloading=True,\n", " enable_reduction=True,\n", " enable_caching=True,\n", @@ -431,26 +519,6 @@ "| 8 | Poisoning | 검증되지 않은 사실이 메모리를 오염 | 출처 태깅 / 검증 게이트 / 격리 |\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "comparison_setup", - "metadata": {}, - "outputs": [], - "source": [ - "from context_engineering_research_agent.context_strategies.offloading import (\n", - " ContextOffloadingStrategy, OffloadingConfig\n", - ")\n", - "from context_engineering_research_agent.context_strategies.reduction import (\n", - " ContextReductionStrategy, ReductionConfig\n", - ")\n", - "from langchain_core.messages import AIMessage, HumanMessage, ToolMessage\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"전략 비교를 위한 테스트 데이터 생성\")\n", - "print(\"=\" * 60)" - ] - }, { "cell_type": "markdown", "id": "exp1_offloading", @@ -463,10 +531,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "exp1_code", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "작은 결과 크기: 23 자\n", + "대용량 결과 크기: 305,889 자\n", + "\n", + "[Offloading 비활성화 시]\n", + " 작은 결과 축출: False\n", + " 대용량 결과 축출: False\n", + " → 대용량 결과가 컨텍스트에 그대로 포함됨\n", + "\n", + "[Offloading 활성화 시]\n", + " 작은 결과 축출: False\n", + " 대용량 결과 축출: True\n", + " → 대용량 결과는 파일로 저장, 미리보기만 컨텍스트에 포함\n", + "\n", + "미리보기 크기: 6,159 자 (원본의 2.0%)\n" + ] + } + ], "source": [ "small_result = \"검색 결과: 항목 1, 항목 2, 항목 3\"\n", "large_result = \"\\n\".join([f\"검색 결과 {i}: \" + \"상세 내용 \" * 100 for i in range(500)])\n", @@ -510,10 +599,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "exp2_code", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Reduction 비활성화 시]\n", + " 메시지 수: 85\n", + " 추정 토큰: 2,972\n", + " → 모든 도구 호출/결과가 컨텍스트에 유지됨\n", + "\n", + "[Reduction 활성화 시 - Compaction]\n", + " 메시지 수: 85 → 60\n", + " 추정 토큰: 2,972 → 2,350\n", + " 절약된 토큰: 622 (20.9%)\n", + " → 오래된 도구 호출/결과가 제거되어 컨텍스트 효율화\n" + ] + } + ], "source": [ "messages_with_tools = []\n", "for i in range(30):\n", @@ -559,10 +665,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "exp3_code", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================================================\n", + "시나리오: 복잡한 연구 작업 수행\n", + "============================================================\n", + "\n", + "[시나리오 설정]\n", + " 대화 턴 수: 50\n", + " 도구 호출 수: 40\n", + " 대용량 결과 수: 5\n", + " 평균 결과 크기: 100k 자\n", + "\n", + "[모든 전략 비활성화 시]\n", + " 예상 컨텍스트 크기: 537,000 자 (~134,250 토큰)\n", + " 문제: 컨텍스트 윈도우 초과 가능성 높음\n", + "\n", + "[Offloading만 활성화 시]\n", + " 예상 컨텍스트 크기: 42,000 자 (~10,500 토큰)\n", + " 절약: 495,000 자 (92.2%)\n", + "\n", + "[Offloading + Reduction 활성화 시]\n", + " 예상 컨텍스트 크기: 25,200 자 (~6,300 토큰)\n", + " 총 절약: 511,800 자 (95.3%)\n", + "\n", + "[+ Caching 활성화 시 추가 효과]\n", + " 시스템 프롬프트 캐싱으로 반복 호출 비용 90% 절감\n", + " 응답 속도 향상\n" + ] + } + ], "source": [ "print(\"=\" * 60)\n", "print(\"시나리오: 복잡한 연구 작업 수행\")\n", @@ -616,10 +754,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "exp4_code", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "에이전트 생성 비교\n", + "============================================================\n", + "\n", + "[기본 (모두 비활성화)]\n", + " Offloading: ❌\n", + " Reduction: ❌\n", + " Caching: ❌\n", + " 에이전트 타입: CompiledStateGraph\n", + "\n", + "[Offloading만]\n", + " Offloading: ✅\n", + " Reduction: ❌\n", + " Caching: ❌\n", + " 에이전트 타입: CompiledStateGraph\n", + "\n", + "[Reduction만]\n", + " Offloading: ❌\n", + " Reduction: ✅\n", + " Caching: ❌\n", + " 에이전트 타입: CompiledStateGraph\n", + "\n", + "[모두 활성화]\n", + " Offloading: ✅\n", + " Reduction: ✅\n", + " Caching: ✅\n", + " 에이전트 타입: CompiledStateGraph\n", + "\n", + "============================================================\n", + "모든 에이전트가 성공적으로 생성되었습니다.\n" + ] + } + ], "source": [ "from context_engineering_research_agent import create_context_aware_agent\n", "\n", @@ -635,7 +809,7 @@ "\n", "for cfg in configs:\n", " agent = create_context_aware_agent(\n", - " model_name=\"gpt-4.1\",\n", + " model=\"gpt-4.1\",\n", " enable_offloading=cfg[\"offloading\"],\n", " enable_reduction=cfg[\"reduction\"],\n", " enable_caching=cfg[\"caching\"],\n", @@ -650,751 +824,6 @@ "print(\"모든 에이전트가 성공적으로 생성되었습니다.\")" ] }, - { - "cell_type": "markdown", - "id": "exp5_8_real_intro", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## 실험 5~8: 실패 모드 실험 (실제 실행 + 로그 기반)\n", - "\n", - "이 섹션은 앞선 “순수 파이썬 시뮬레이션”을 넘어서,\n", - "실제로 `langchain.agents.create_agent` + **Middleware**를 조합해 실행하면서\n", - "메시지/툴콜/툴결과 로그를 확인합니다.\n", - "\n", - "참고(공식 built-in middleware): https://docs.langchain.com/oss/python/langchain/middleware/built-in\n", - "\n", - "- Tool selection: `LLMToolSelectorMiddleware`\n", - "- Tool call limiting: `ToolCallLimitMiddleware`\n", - "\n", - "또한 deepagents에서 제공하는 **FilesystemMiddleware**(파일 툴 스택)를 함께 사용합니다.\n" - ] - }, - { - "cell_type": "code", - "id": "mw_real_helpers", - "metadata": {}, - "execution_count": null, - "outputs": [], - "source": [ - "from __future__ import annotations\n", - "\n", - "import json\n", - "import uuid\n", - "from collections.abc import Callable\n", - "from dataclasses import dataclass\n", - "from typing import Any\n", - "\n", - "from deepagents.backends import StateBackend\n", - "from deepagents.backends.utils import create_file_data\n", - "from deepagents.middleware.filesystem import FilesystemMiddleware\n", - "from langchain.agents import create_agent\n", - "from langchain.agents.middleware import LLMToolSelectorMiddleware, ToolCallLimitMiddleware\n", - "from langchain.agents.middleware.types import AgentMiddleware\n", - "from langchain_core.language_models import BaseChatModel\n", - "from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage\n", - "from langchain_core.outputs import ChatGeneration, ChatResult\n", - "from langchain_core.runnables import RunnableLambda\n", - "from langchain_core.tools import tool\n", - "from langgraph.types import Overwrite\n", - "\n", - "\n", - "def _extract_valid_tool_names_from_schema(schema: dict[str, Any]) -> list[str]:\n", - " \"\"\"Extract tool-name enum from the JSON schema used by LLMToolSelectorMiddleware.\"\"\"\n", - " # The schema is produced from a Literal enum of tool names.\n", - " # We look for any 'enum' list nested in the schema.\n", - " enums: list[str] = []\n", - "\n", - " def walk(node: Any) -> None:\n", - " if isinstance(node, dict):\n", - " if 'enum' in node and isinstance(node['enum'], list):\n", - " for v in node['enum']:\n", - " if isinstance(v, str):\n", - " enums.append(v)\n", - " for v in node.values():\n", - " walk(v)\n", - " elif isinstance(node, list):\n", - " for v in node:\n", - " walk(v)\n", - "\n", - " walk(schema)\n", - " # Deduplicate while preserving order\n", - " seen: set[str] = set()\n", - " out: list[str] = []\n", - " for name in enums:\n", - " if name not in seen:\n", - " seen.add(name)\n", - " out.append(name)\n", - " return out\n", - "\n", - "\n", - "class DeterministicStructuredSelectorModel(BaseChatModel):\n", - " \"\"\"Offline tool-selection model compatible with `with_structured_output(schema)`.\n", - "\n", - " This is used to drive LangChain's `LLMToolSelectorMiddleware` without API keys.\n", - " \"\"\"\n", - "\n", - " def __init__(self, selector: Callable[[str, list[str]], list[str]]):\n", - " super().__init__()\n", - " self._selector = selector\n", - "\n", - " @property\n", - " def _llm_type(self) -> str:\n", - " return 'deterministic-structured-selector'\n", - "\n", - " @property\n", - " def _identifying_params(self) -> dict[str, Any]:\n", - " return {}\n", - "\n", - " def _generate(\n", - " self,\n", - " messages: list[BaseMessage],\n", - " stop: list[str] | None = None,\n", - " run_manager=None,\n", - " **kwargs: Any,\n", - " ) -> ChatResult:\n", - " # Not used by the tool selector middleware (it calls with_structured_output).\n", - " _ = (messages, stop, run_manager, kwargs)\n", - " return ChatResult(generations=[ChatGeneration(message=AIMessage(content='{}'))])\n", - "\n", - " def with_structured_output(self, schema: dict[str, Any], **kwargs: Any): # type: ignore[override]\n", - " _ = kwargs\n", - " valid = _extract_valid_tool_names_from_schema(schema)\n", - "\n", - " def _invoke(msgs: list[Any]) -> dict[str, Any]:\n", - " # msgs contains a system dict + last HumanMessage.\n", - " last_user = ''\n", - " for m in reversed(msgs):\n", - " if isinstance(m, HumanMessage):\n", - " last_user = m.content\n", - " break\n", - " selected = self._selector(last_user, valid)\n", - " return {'tools': selected}\n", - "\n", - " return RunnableLambda(_invoke)\n", - "\n", - "\n", - "class HeuristicToolCallingModel(BaseChatModel):\n", - " \"\"\"Offline tool-calling model that reacts to the *currently available tools*.\n", - "\n", - " This makes the effect of tool-selection middleware observable without external LLM calls.\n", - " \"\"\"\n", - "\n", - " def __init__(self, *, confusion_threshold: int = 10):\n", - " super().__init__()\n", - " self._bound_tool_names: list[str] = []\n", - " self._confusion_threshold = confusion_threshold\n", - "\n", - " @property\n", - " def _llm_type(self) -> str:\n", - " return 'heuristic-tool-calling'\n", - "\n", - " @property\n", - " def _identifying_params(self) -> dict[str, Any]:\n", - " return {'confusion_threshold': self._confusion_threshold}\n", - "\n", - " def bind_tools(self, tools: list[Any], **kwargs: Any): # noqa: ANN401\n", - " _ = kwargs\n", - " # Tools may include dict tool specs; filter those out.\n", - " self._bound_tool_names = [t.name for t in tools if hasattr(t, 'name')]\n", - " return self\n", - "\n", - " def _generate(\n", - " self,\n", - " messages: list[BaseMessage],\n", - " stop: list[str] | None = None,\n", - " run_manager=None,\n", - " **kwargs: Any,\n", - " ) -> ChatResult:\n", - " _ = (stop, run_manager, kwargs)\n", - "\n", - " # If the last message is a tool output, produce a final response.\n", - " if messages and isinstance(messages[-1], ToolMessage):\n", - " tool_msg = messages[-1]\n", - " return ChatResult(\n", - " generations=[\n", - " ChatGeneration(\n", - " message=AIMessage(\n", - " content=(\n", - " f\"[final] saw tool={tool_msg.name} status={tool_msg.status}\\n\"\n", - " f\"{tool_msg.content}\".strip()\n", - " )\n", - " )\n", - " )\n", - " ]\n", - " )\n", - "\n", - " # Find the last user message.\n", - " user_text = ''\n", - " for m in reversed(messages):\n", - " if isinstance(m, HumanMessage):\n", - " user_text = m.content\n", - " break\n", - "\n", - " tool_count = len(self._bound_tool_names)\n", - "\n", - " # Confusion heuristic: if too many tools, prefer (wrong) web_search.\n", - " if tool_count >= self._confusion_threshold and 'web_search' in self._bound_tool_names:\n", - " chosen = 'web_search'\n", - " args = {'query': user_text}\n", - " else:\n", - " # Prefer filesystem listing if available.\n", - " chosen = 'ls' if 'ls' in self._bound_tool_names else (self._bound_tool_names[0] if self._bound_tool_names else 'ls')\n", - " args = {'path': '/project'}\n", - "\n", - " tool_call_id = f\"call_{uuid.uuid4().hex[:8]}\"\n", - " msg = AIMessage(\n", - " content=f\"[debug] tool_count={tool_count} chosen={chosen}\",\n", - " tool_calls=[{'id': tool_call_id, 'name': chosen, 'args': args, 'type': 'tool_call'}],\n", - " )\n", - " return ChatResult(generations=[ChatGeneration(message=msg)])\n", - "\n", - "\n", - "def _print_messages(messages: list[BaseMessage]) -> None:\n", - " for i, m in enumerate(messages):\n", - " if isinstance(m, HumanMessage):\n", - " print(f\"{i:02d} HUMAN: {m.content}\")\n", - " elif isinstance(m, SystemMessage):\n", - " print(f\"{i:02d} SYSTEM: {m.content}\")\n", - " elif isinstance(m, AIMessage):\n", - " tool_calls = getattr(m, 'tool_calls', None) or []\n", - " print(f\"{i:02d} AI: {m.content}\")\n", - " for tc in tool_calls:\n", - " print(f\" tool_call: name={tc.get('name')} id={tc.get('id')} args={tc.get('args')}\")\n", - " elif isinstance(m, ToolMessage):\n", - " print(f\"{i:02d} TOOL: name={m.name} status={m.status} id={m.tool_call_id}\")\n", - " print(\" content:\", str(m.content)[:200])\n", - " else:\n", - " print(f\"{i:02d} {type(m).__name__}: {getattr(m, 'content', '')}\")\n", - "\n", - "\n", - "def _sample_files() -> dict[str, dict[str, Any]]:\n", - " return {\n", - " '/project/README.md': create_file_data(\"\"\"# Demo\n", - "This is a demo file.\"\"\"),\n", - " '/project/src/main.py': create_file_data(\"\"\"print(\\\"hello\\\")\n", - "\"\"\"),\n", - " '/project/src/utils.py': create_file_data(\"\"\"def add(a, b):\n", - " return a + b\n", - "\"\"\"),\n", - " }\n", - "\n", - "\n", - "\n", - "def _make_agent(*, model: BaseChatModel, tools: list[Any], middleware: list[AgentMiddleware]):\n", - " # Use StateBackend so FilesystemMiddleware can operate on in-memory `files`.\n", - " backend = lambda rt: StateBackend(rt) # noqa: E731\n", - " mw = [FilesystemMiddleware(backend=backend), *middleware]\n", - " return create_agent(model=model, tools=tools, middleware=mw)\n" - ] - }, - { - "cell_type": "markdown", - "id": "exp5_confusion", - "metadata": {}, - "source": [ - "### 실험 5: Context Confusion (도구 과다/유사 도구)\n", - "\n", - "도구가 많고 설명이 유사해질수록(특히 파일/검색류처럼 겹치는 기능이 많을수록) “올바른 도구 선택”이 흔들릴 수 있습니다.\n", - "\n", - "이 실험은 **도구 설명 기반의 단순 스코어링(lexical overlap)**으로 도구를 고르는 상황을 가정해:\n", - "\n", - "- 도구가 적을 때 vs 많을 때\n", - "- 유사한 도구가 많은 경우\n", - "\n", - "선택이 얼마나 불안정해지는지(상위 후보 점수 차이가 거의 없어지는지) 보여주고,\n", - "완화책으로 **도구 로딩 제한**과 **계층적 액션 스페이스(카테고리→도구)**를 시뮬레이션합니다.\n" - ] - }, - { - "cell_type": "code", - "id": "exp5_confusion_code", - "metadata": {}, - "execution_count": null, - "outputs": [], - "source": [ - "from __future__ import annotations\n", - "\n", - "from dataclasses import dataclass\n", - "\n", - "\n", - "@dataclass(frozen=True)\n", - "class ToyTool:\n", - " name: str\n", - " description: str\n", - " category: str\n", - "\n", - "\n", - "def score_tool(query: str, tool: ToyTool) -> int:\n", - " q = set(query.lower().split())\n", - " d = set(tool.description.lower().split())\n", - " # 아주 단순한 overlap 점수(결정론적)\n", - " return len(q & d)\n", - "\n", - "\n", - "def rank_tools(query: str, tools: list[ToyTool]) -> list[tuple[int, ToyTool]]:\n", - " ranked = [(score_tool(query, t), t) for t in tools]\n", - " ranked.sort(key=lambda x: (x[0], x[1].name), reverse=True)\n", - " return ranked\n", - "\n", - "\n", - "def show_top(query: str, tools: list[ToyTool], top_k: int = 8) -> None:\n", - " ranked = rank_tools(query, tools)\n", - " print(f\"Query: {query}\")\n", - " print(\"Top candidates:\")\n", - " for score, tool in ranked[:top_k]:\n", - " print(f\" - {tool.name:18} score={score:2} ({tool.category})\")\n", - " top_scores = [s for s, _ in ranked[:top_k]]\n", - " gap = (top_scores[0] - top_scores[1]) if len(top_scores) > 1 else 0\n", - " print(f\"Top-1 vs Top-2 score gap: {gap}\")\n", - " print()\n", - "\n", - "\n", - "query = \"list files in directory and show file names\"\n", - "\n", - "small_toolset = [\n", - " ToyTool(\"ls\", \"list files in a directory\", \"filesystem\"),\n", - " ToyTool(\"read_file\", \"read a file from the filesystem\", \"filesystem\"),\n", - " ToyTool(\"web_search\", \"search the web for information\", \"web\"),\n", - "]\n", - "\n", - "large_similar_toolset = [\n", - " ToyTool(\"ls\", \"list files in a directory\", \"filesystem\"),\n", - " ToyTool(\"list_files\", \"list files in a directory and show file names\", \"filesystem\"),\n", - " ToyTool(\"list_dir\", \"list directory files and file names\", \"filesystem\"),\n", - " ToyTool(\"dir\", \"show directory listing and files\", \"filesystem\"),\n", - " ToyTool(\"glob\", \"find files matching a pattern\", \"filesystem\"),\n", - " ToyTool(\"grep\", \"search for a pattern in files\", \"filesystem\"),\n", - " ToyTool(\"read_file\", \"read a file from the filesystem\", \"filesystem\"),\n", - " ToyTool(\"cat\", \"print file content\", \"filesystem\"),\n", - " ToyTool(\"web_search\", \"search the web for information\", \"web\"),\n", - " ToyTool(\"fetch_url\", \"fetch a url and convert html to markdown\", \"web\"),\n", - "]\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"[A] 도구가 적을 때\")\n", - "print(\"=\" * 60)\n", - "show_top(query, small_toolset)\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"[B] 유사 도구가 많을 때(Confusion 유발)\")\n", - "print(\"=\" * 60)\n", - "show_top(query, large_similar_toolset)\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"[C] 완화책 1: 도구 로딩 제한(카테고리 필터링)\")\n", - "print(\"=\" * 60)\n", - "filesystem_only = [t for t in large_similar_toolset if t.category == \"filesystem\"]\n", - "show_top(query, filesystem_only)\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"[D] 완화책 2: 계층적 액션 스페이스(카테고리→도구)\")\n", - "print(\"=\" * 60)\n", - "chosen_category = \"filesystem\" if (\"file\" in query.lower() or \"directory\" in query.lower()) else \"web\"\n", - "print(f\"Chosen category: {chosen_category}\")\n", - "ranked = rank_tools(query, [t for t in large_similar_toolset if t.category == chosen_category])\n", - "print(f\"Chosen tool: {ranked[0][1].name}\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "exp5_confusion_real_md", - "metadata": {}, - "source": [ - "#### (실행) LLMToolSelectorMiddleware로 도구 선택 제한 적용\n", - "\n", - "- Baseline: 도구가 너무 많아 `web_search`로 잘못 빠짐(Confusion)\n", - "- With tool selection: `LLMToolSelectorMiddleware(max_tools=5)`가 tool set을 줄여 `ls`로 유도\n" - ] - }, - { - "cell_type": "code", - "id": "exp5_confusion_real_code", - "metadata": {}, - "execution_count": null, - "outputs": [], - "source": [ - "# 많은 더미 도구(유사/잡다한 도구)를 추가해 Confusion 상황을 만든다.\n", - "\n", - "@tool\n", - "def web_search(query: str) -> str:\n", - " \"\"\"Dummy web_search tool for the experiment.\"\"\"\n", - " return f\"(dummy) web_search results for query={query!r}\"\n", - "\n", - "\n", - "def _dummy_tool_factory(n: int):\n", - " @tool(f\"dummy_tool_{n}\", description=\"dummy tool\")\n", - " def _t(x: str = \"\") -> str:\n", - " return f\"dummy {n} {x}\".strip()\n", - "\n", - " return _t\n", - "\n", - "\n", - "dummy_tools = [_dummy_tool_factory(i) for i in range(25)]\n", - "all_tools = [web_search, *dummy_tools]\n", - "\n", - "# Selection model: choose filesystem tools when user asks about files/directories.\n", - "selector_model = DeterministicStructuredSelectorModel(\n", - " selector=lambda q, valid: [\n", - " name\n", - " for name in [\"ls\", \"read_file\", \"glob\", \"grep\"]\n", - " if name in valid\n", - " ]\n", - ")\n", - "\n", - "user = HumanMessage(content=\"/project 아래 파일 목록을 보여줘\")\n", - "state = {\"messages\": [user], \"files\": _sample_files()}\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"[Baseline] tool selection 없음\")\n", - "print(\"=\" * 60)\n", - "agent_baseline = _make_agent(model=HeuristicToolCallingModel(confusion_threshold=10), tools=all_tools, middleware=[])\n", - "result_baseline = agent_baseline.invoke(state, {\"configurable\": {\"thread_id\": \"exp5_baseline\"}})\n", - "_print_messages(result_baseline[\"messages\"])\n", - "\n", - "print(\"\\n\" + \"=\" * 60)\n", - "print(\"[With LLMToolSelectorMiddleware] max_tools=5\")\n", - "print(\"=\" * 60)\n", - "agent_selected = _make_agent(\n", - " model=HeuristicToolCallingModel(confusion_threshold=10),\n", - " tools=all_tools,\n", - " middleware=[LLMToolSelectorMiddleware(model=selector_model, max_tools=5)],\n", - ")\n", - "result_selected = agent_selected.invoke(state, {\"configurable\": {\"thread_id\": \"exp5_selected\"}})\n", - "_print_messages(result_selected[\"messages\"])\n" - ] - }, - { - "cell_type": "markdown", - "id": "exp6_clash", - "metadata": {}, - "source": [ - "### 실험 6: Context Clash (모순되는 연속 관찰)\n", - "\n", - "연속된 도구 결과가 서로 모순될 때(예: 같은 키에 대해 다른 값), 모델은 어떤 값을 믿어야 할지 혼란스러워지고\n", - "이후 행동이 꼬일 수 있습니다.\n", - "\n", - "이 실험은:\n", - "\n", - "- 관찰을 상태(state)에 병합할 때 충돌을 감지\n", - "- “최신값 우선” 같은 임시 규칙 대신, **재검증/불확실성 표기**를 남기는 완화책\n", - "\n", - "을 시뮬레이션합니다.\n" - ] - }, - { - "cell_type": "code", - "id": "exp6_clash_code", - "metadata": {}, - "execution_count": null, - "outputs": [], - "source": [ - "from __future__ import annotations\n", - "\n", - "\n", - "def merge_observation(state: dict[str, object], observation: dict[str, object], *, source: str):\n", - " # 관찰 병합 + 충돌 감지\n", - " conflicts: list[str] = []\n", - " new_state = dict(state)\n", - " for k, v in observation.items():\n", - " if k in new_state and new_state[k] != v:\n", - " conflicts.append(f\"{k}: '{new_state[k]}' vs '{v}' (source={source})\")\n", - " new_state[k] = v\n", - " return new_state, conflicts\n", - "\n", - "\n", - "state: dict[str, object] = {}\n", - "\n", - "obs1 = {\"latest_version\": \"1.2.0\", \"release_date\": \"2025-01-01\"}\n", - "obs2 = {\"latest_version\": \"1.3.0\", \"release_date\": \"2025-01-01\"} # version만 충돌\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"[A] 충돌 없이 병합\")\n", - "print(\"=\" * 60)\n", - "state, c1 = merge_observation(state, obs1, source=\"tool_call_1\")\n", - "print(\"state:\", state)\n", - "print(\"conflicts:\", c1)\n", - "print()\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"[B] 모순 관찰 입력(Clash)\")\n", - "print(\"=\" * 60)\n", - "state2, c2 = merge_observation(state, obs2, source=\"tool_call_2\")\n", - "print(\"state:\", state2)\n", - "print(\"conflicts:\")\n", - "for c in c2:\n", - " print(\" -\", c)\n", - "print()\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"[C] 완화책: 충돌을 로그/검증 큐로 분리\")\n", - "print(\"=\" * 60)\n", - "final_state = dict(state)\n", - "conflict_log: list[str] = []\n", - "verification_queue: list[dict[str, object]] = []\n", - "\n", - "_, conflicts = merge_observation(final_state, obs2, source=\"tool_call_2\")\n", - "if conflicts:\n", - " conflict_log.extend(conflicts)\n", - " verification_queue.append({\"key\": \"latest_version\", \"candidates\": [\"1.2.0\", \"1.3.0\"]})\n", - "\n", - "print(\"conflict_log:\", conflict_log)\n", - "print(\"verification_queue:\", verification_queue)\n", - "\n", - "# (가정) 추가 검증 결과(tool_call_3)\n", - "verified = {\"latest_version\": \"1.3.0\"}\n", - "final_state, _ = merge_observation(final_state, verified, source=\"tool_call_3\")\n", - "print(\"verified final_state:\", final_state)\n" - ] - }, - { - "cell_type": "markdown", - "id": "exp6_clash_real_md", - "metadata": {}, - "source": [ - "#### (실행) 충돌 감지 미들웨어로 모순 관찰(Clash) 처리\n", - "\n", - "- Baseline: 두 소스가 서로 다른 값을 주면 “마지막 값”으로 덮어써 버림\n", - "- With clash detection: 충돌을 감지해 **verify tool 호출**을 유도\n" - ] - }, - { - "cell_type": "code", - "id": "exp6_clash_real_code", - "metadata": {}, - "execution_count": null, - "outputs": [], - "source": [ - "from __future__ import annotations\n", - "\n", - "from langchain.agents.middleware.types import AgentState\n", - "from langgraph.runtime import Runtime\n", - "\n", - "\n", - "@tool(description=\"Get latest version from source A\")\n", - "def get_version_source_a() -> str:\n", - " return json.dumps({\"latest_version\": \"1.2.0\", \"source\": \"a\"})\n", - "\n", - "\n", - "@tool(description=\"Get latest version from source B\")\n", - "def get_version_source_b() -> str:\n", - " return json.dumps({\"latest_version\": \"1.3.0\", \"source\": \"b\"})\n", - "\n", - "\n", - "@tool(description=\"Verify latest version from an authoritative source\")\n", - "def verify_latest_version() -> str:\n", - " return json.dumps({\"latest_version\": \"1.3.0\", \"source\": \"verified\"})\n", - "\n", - "\n", - "class ClashDetectionMiddleware(AgentMiddleware):\n", - " \"\"\"Detect conflicting JSON facts in the last tool messages and request verification.\"\"\"\n", - "\n", - " def before_model(self, state: AgentState, runtime: Runtime[Any]) -> dict[str, Any] | None: # noqa: ARG002\n", - " messages = state.get(\"messages\", [])\n", - " # Do not trigger if already verified\n", - " for m in reversed(messages):\n", - " if isinstance(m, ToolMessage) and m.name == \"verify_latest_version\":\n", - " return None\n", - "\n", - " # Collect last two version tool messages\n", - " version_msgs: list[ToolMessage] = []\n", - " for m in reversed(messages):\n", - " if isinstance(m, ToolMessage) and m.name in {\"get_version_source_a\", \"get_version_source_b\"}:\n", - " version_msgs.append(m)\n", - " if len(version_msgs) >= 2:\n", - " break\n", - "\n", - " if len(version_msgs) < 2:\n", - " return None\n", - "\n", - " try:\n", - " a = json.loads(str(version_msgs[0].content))\n", - " b = json.loads(str(version_msgs[1].content))\n", - " except json.JSONDecodeError:\n", - " return None\n", - "\n", - " va = a.get(\"latest_version\")\n", - " vb = b.get(\"latest_version\")\n", - " if va and vb and va != vb:\n", - " patched = list(messages)\n", - " patched.append(\n", - " SystemMessage(\n", - " content=(\n", - " \"CONFLICT_DETECTED: latest_version has conflicting values. \"\n", - " \"Call verify_latest_version and use its result.\"\n", - " )\n", - " )\n", - " )\n", - " return {\"messages\": Overwrite(patched)}\n", - "\n", - " return None\n", - "\n", - "\n", - "class VersionResearchModel(BaseChatModel):\n", - " def bind_tools(self, tools: list[Any], **kwargs: Any): # noqa: ANN401\n", - " _ = kwargs\n", - " self._tool_names = [t.name for t in tools if hasattr(t, 'name')]\n", - " return self\n", - "\n", - " @property\n", - " def _llm_type(self) -> str:\n", - " return 'version-research'\n", - "\n", - " @property\n", - " def _identifying_params(self) -> dict[str, Any]:\n", - " return {}\n", - "\n", - " def _generate(self, messages: list[BaseMessage], stop=None, run_manager=None, **kwargs: Any) -> ChatResult:\n", - " _ = (stop, run_manager, kwargs)\n", - "\n", - " # Count tool results\n", - " have_a = any(isinstance(m, ToolMessage) and m.name == 'get_version_source_a' for m in messages)\n", - " have_b = any(isinstance(m, ToolMessage) and m.name == 'get_version_source_b' for m in messages)\n", - " have_v = any(isinstance(m, ToolMessage) and m.name == 'verify_latest_version' for m in messages)\n", - " conflict = any(isinstance(m, SystemMessage) and 'CONFLICT_DETECTED' in m.content for m in messages)\n", - "\n", - " if not have_a:\n", - " tcid = f\"call_{uuid.uuid4().hex[:8]}\"\n", - " msg = AIMessage(content='call source a', tool_calls=[{'id': tcid, 'name': 'get_version_source_a', 'args': {}, 'type': 'tool_call'}])\n", - " return ChatResult(generations=[ChatGeneration(message=msg)])\n", - "\n", - " if not have_b:\n", - " tcid = f\"call_{uuid.uuid4().hex[:8]}\"\n", - " msg = AIMessage(content='call source b', tool_calls=[{'id': tcid, 'name': 'get_version_source_b', 'args': {}, 'type': 'tool_call'}])\n", - " return ChatResult(generations=[ChatGeneration(message=msg)])\n", - "\n", - " if conflict and not have_v:\n", - " tcid = f\"call_{uuid.uuid4().hex[:8]}\"\n", - " msg = AIMessage(content='verify', tool_calls=[{'id': tcid, 'name': 'verify_latest_version', 'args': {}, 'type': 'tool_call'}])\n", - " return ChatResult(generations=[ChatGeneration(message=msg)])\n", - "\n", - " # Finalize: choose last seen latest_version\n", - " latest = None\n", - " for m in reversed(messages):\n", - " if isinstance(m, ToolMessage):\n", - " try:\n", - " data = json.loads(str(m.content))\n", - " except json.JSONDecodeError:\n", - " continue\n", - " if 'latest_version' in data:\n", - " latest = data['latest_version']\n", - " break\n", - " return ChatResult(generations=[ChatGeneration(message=AIMessage(content=f\"FINAL latest_version={latest}\"))])\n", - "\n", - "\n", - "user = HumanMessage(content=\"패키지 X의 최신 버전을 확인해줘\")\n", - "state = {\"messages\": [user]}\n", - "\n", - "tools = [get_version_source_a, get_version_source_b, verify_latest_version]\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"[Baseline] clash detection 없음\")\n", - "print(\"=\" * 60)\n", - "agent_baseline = create_agent(model=VersionResearchModel(), tools=tools, middleware=[])\n", - "res1 = agent_baseline.invoke(state, {\"configurable\": {\"thread_id\": \"exp6_baseline\"}})\n", - "_print_messages(res1[\"messages\"])\n", - "\n", - "print(\"\\n\" + \"=\" * 60)\n", - "print(\"[With ClashDetectionMiddleware]\")\n", - "print(\"=\" * 60)\n", - "agent_clash = create_agent(model=VersionResearchModel(), tools=tools, middleware=[ClashDetectionMiddleware()])\n", - "res2 = agent_clash.invoke(state, {\"configurable\": {\"thread_id\": \"exp6_clash\"}})\n", - "_print_messages(res2[\"messages\"])\n" - ] - }, - { - "cell_type": "markdown", - "id": "exp7_distraction", - "metadata": {}, - "source": [ - "### 실험 7: Context Distraction (장기 로그에서 반복 행동 쏠림)\n", - "\n", - "긴 실행 기록이 쌓일수록, 모델이 “새 계획”보다 “이미 했던 행동”을 반복하는 쪽으로 쏠릴 수 있습니다.\n", - "\n", - "이 실험은 LLM을 직접 호출하지 않고, 단순화된 정책으로:\n", - "\n", - "- 로그가 길수록 과거 빈도 높은 행동을 더 강하게 재선택\n", - "\n", - "되는 현상을 시뮬레이션하고,\n", - "완화책으로 **명시적 계획(todo/next step)**를 “강제 입력”했을 때 분포가 다시 목표 중심으로 돌아오는 모습을 보여줍니다.\n" - ] - }, - { - "cell_type": "code", - "id": "exp7_distraction_code", - "metadata": {}, - "execution_count": null, - "outputs": [], - "source": [ - "from __future__ import annotations\n", - "\n", - "import math\n", - "from collections import Counter\n", - "\n", - "\n", - "def softmax(xs: list[float]) -> list[float]:\n", - " m = max(xs)\n", - " exps = [math.exp(x - m) for x in xs]\n", - " s = sum(exps)\n", - " return [e / s for e in exps]\n", - "\n", - "\n", - "def entropy(ps: list[float]) -> float:\n", - " return -sum(p * math.log(p + 1e-12) for p in ps)\n", - "\n", - "\n", - "def action_distribution(actions: list[str], *, sharpness: float) -> dict[str, float]:\n", - " counts = Counter(actions)\n", - " keys = sorted(counts)\n", - " logits = [sharpness * math.log(counts[k]) for k in keys]\n", - " probs = softmax(logits)\n", - " return dict(zip(keys, probs, strict=True))\n", - "\n", - "\n", - "def show_dist(title: str, dist: dict[str, float]) -> None:\n", - " keys = sorted(dist, key=lambda k: dist[k], reverse=True)\n", - " ps = [dist[k] for k in keys]\n", - " print(title)\n", - " for k in keys[:6]:\n", - " print(f\" - {k:14} p={dist[k]:.3f}\")\n", - " print(f\" entropy={entropy(ps):.3f}\")\n", - " print()\n", - "\n", - "\n", - "# 과거 로그(반복 행동이 많은 상황)\n", - "actions = (\n", - " [\"web_search\"] * 40\n", - " + [\"read_file\"] * 20\n", - " + [\"ls\"] * 15\n", - " + [\"edit_file\"] * 5\n", - " + [\"write_todos\"] * 2\n", - ")\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"[A] 짧은 컨텍스트(덜 쏠림)\")\n", - "print(\"=\" * 60)\n", - "short_ctx = actions[:20]\n", - "show_dist(\"short_ctx\", action_distribution(short_ctx, sharpness=1.0))\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"[B] 긴 컨텍스트(더 쏠림 / 반복 행동 강화)\")\n", - "print(\"=\" * 60)\n", - "long_ctx = actions\n", - "show_dist(\"long_ctx\", action_distribution(long_ctx, sharpness=2.5))\n", - "\n", - "print(\"=\" * 60)\n", - "print(\"[C] 완화책: '다음 행동'을 계획으로 고정(강제 next step)\")\n", - "print(\"=\" * 60)\n", - "next_step = \"write_todos\" # 예: 계획 갱신을 강제\n", - "base = action_distribution(long_ctx, sharpness=2.5)\n", - "boost = 0.35\n", - "base[next_step] = base.get(next_step, 0.0) + boost\n", - "s = sum(base.values())\n", - "fixed = {k: v / s for k, v in base.items()}\n", - "show_dist(\"long_ctx + forced_next_step\", fixed)\n" - ] - }, { "cell_type": "markdown", "id": "exp7_distraction_real_md", @@ -1408,10 +837,56 @@ }, { "cell_type": "code", + "execution_count": 19, "id": "exp7_distraction_real_code", "metadata": {}, - "execution_count": null, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================================================\n", + "[Baseline] 제한 없음\n", + "============================================================\n", + "00 HUMAN: Context engineering을 조사해줘\n", + "01 AI: search loop 1\n", + " tool_call: name=web_search id=call_76fe1482 args={'query': 'context engineering'}\n", + "02 TOOL: name=web_search status=success id=call_76fe1482\n", + " content: (dummy) result for 'context engineering'\n", + "03 AI: search loop 2\n", + " tool_call: name=web_search id=call_9e523f0a args={'query': 'context engineering'}\n", + "04 TOOL: name=web_search status=success id=call_9e523f0a\n", + " content: (dummy) result for 'context engineering'\n", + "05 AI: search loop 3\n", + " tool_call: name=web_search id=call_ea484b66 args={'query': 'context engineering'}\n", + "06 TOOL: name=web_search status=success id=call_ea484b66\n", + " content: (dummy) result for 'context engineering'\n", + "07 AI: switch to todos\n", + " tool_call: name=write_todos id=call_61a16615 args={'todos': ['summarize findings']}\n", + "08 TOOL: name=write_todos status=success id=call_61a16615\n", + " content: {\"todos\": [\"summarize findings\"]}\n", + "09 AI: FINAL todo list written\n", + "\n", + "============================================================\n", + "[With ToolCallLimitMiddleware] web_search run_limit=1\n", + "============================================================\n", + "00 HUMAN: Context engineering을 조사해줘\n", + "01 AI: search loop 1\n", + " tool_call: name=web_search id=call_281f3c81 args={'query': 'context engineering'}\n", + "02 TOOL: name=web_search status=success id=call_281f3c81\n", + " content: (dummy) result for 'context engineering'\n", + "03 AI: search loop 2\n", + " tool_call: name=web_search id=call_9b876f09 args={'query': 'context engineering'}\n", + "04 TOOL: name=web_search status=error id=call_9b876f09\n", + " content: Tool call limit exceeded. Do not call 'web_search' again.\n", + "05 AI: switch to todos\n", + " tool_call: name=write_todos id=call_5a2def37 args={'todos': ['summarize findings']}\n", + "06 TOOL: name=write_todos status=success id=call_5a2def37\n", + " content: {\"todos\": [\"summarize findings\"]}\n", + "07 AI: FINAL todo list written\n" + ] + } + ], "source": [ "from __future__ import annotations\n", "\n", @@ -1426,7 +901,7 @@ " return json.dumps({\"todos\": todos})\n", "\n", "\n", - "class LoopingSearchModel(BaseChatModel):\n", + "class LoopingSearchModel(BaseChatModel):\n", " def bind_tools(self, tools: list[Any], **kwargs: Any): # noqa: ANN401\n", " _ = kwargs\n", " self._tool_names = [t.name for t in tools if hasattr(t, 'name')]\n", @@ -1515,10 +990,36 @@ }, { "cell_type": "code", + "execution_count": 20, "id": "exp8_poisoning_code", "metadata": {}, - "execution_count": null, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================================================\n", + "[A] 정상 메모리\n", + "============================================================\n", + "blind plan: Install the package.\n", + "verified-only plan: Install the package.\n", + "\n", + "============================================================\n", + "[B] 오염된 메모리(Poisoning)\n", + "============================================================\n", + "blind plan: Install the package.\n", + "verified-only plan: Install the package.\n", + "\n", + "============================================================\n", + "[C] 완화책: 출처 없는 사실은 검증 요청으로 라우팅\n", + "============================================================\n", + "needs_verification:\n", + " - package_installed='yes' source=None verified=False\n", + "\n", + "→ 정책: tool로 재확인 후에만 state/memory에 반영\n" + ] + } + ], "source": [ "from __future__ import annotations\n", "\n", @@ -1595,10 +1096,41 @@ }, { "cell_type": "code", + "execution_count": 21, "id": "exp8_poisoning_real_code", "metadata": {}, - "execution_count": null, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================================================\n", + "[Baseline] verification gate 없음\n", + "============================================================\n", + "00 HUMAN: 패키지 X 설치가 필요한지 판단해줘\n", + "01 AI: guess\n", + " tool_call: name=guess_install_status id=call_ccef8e23 args={}\n", + "02 TOOL: name=guess_install_status status=success id=call_ccef8e23\n", + " content: {\"package_installed\": \"yes\", \"verified\": false, \"source\": \"guess\"}\n", + "03 AI: FINAL decision=SKIP (source=guess)\n", + "\n", + "============================================================\n", + "[With VerificationGateMiddleware]\n", + "============================================================\n", + "00 HUMAN: 패키지 X 설치가 필요한지 판단해줘\n", + "01 AI: guess\n", + " tool_call: name=guess_install_status id=call_70cc8071 args={}\n", + "02 TOOL: name=guess_install_status status=success id=call_70cc8071\n", + " content: {\"package_installed\": \"yes\", \"verified\": false, \"source\": \"guess\"}\n", + "03 SYSTEM: UNVERIFIED_FACT_BLOCKED: Do not trust guess_install_status. Call scan_install_status and decide based on verified=true only.\n", + "04 AI: scan\n", + " tool_call: name=scan_install_status id=call_9e23662c args={}\n", + "05 TOOL: name=scan_install_status status=success id=call_9e23662c\n", + " content: {\"package_installed\": \"no\", \"verified\": true, \"source\": \"scan\"}\n", + "06 AI: FINAL decision=INSTALL (source=scan)\n" + ] + } + ], "source": [ "from __future__ import annotations\n", "\n", @@ -1718,21 +1250,6 @@ "_print_messages(res2['messages'])\n" ] }, - { - "cell_type": "markdown", - "id": "exp5_recommendation", - "metadata": {}, - "source": [ - "### 권장 설정\n", - "\n", - "| 사용 사례 | Offloading | Reduction | Caching | 이유 |\n", - "|----------|------------|-----------|---------|------|\n", - "| **짧은 대화** | ❌ | ❌ | ✅ | 오버헤드 최소화 |\n", - "| **일반 작업** | ✅ | ❌ | ✅ | 대용량 결과 대비 |\n", - "| **장시간 연구** | ✅ | ✅ | ✅ | 모든 최적화 활용 |\n", - "| **디버깅** | ❌ | ❌ | ❌ | 전체 컨텍스트 확인 |" - ] - }, { "cell_type": "markdown", "id": "summary", @@ -1779,13 +1296,21 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "deepagent-context-engineering (3.13.9)", "language": "python", "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.12.1" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.9" } }, "nbformat": 4,