Files
goose/scripts/test_compaction.sh

376 lines
12 KiB
Bash
Executable File

#!/bin/bash
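# Compaction smoke test script
# Exercises three compaction paths:
#   1. manual compaction (trigger prompt)
#   2. auto compaction (threshold-based)
#   3. compaction after an out-of-context error (injected by the provider error proxy)
#
# Run from the directory that contains target/ and scripts/: the goose binary
# and the error proxy are resolved relative to the current working directory.
# A .env file in that directory, if present, is loaded into the environment.
#
# Environment variable overrides:
#   COMPACTION_PROVIDER - Override the provider for tests 1 & 2 (default: use system default)
#   COMPACTION_MODEL    - Override the model for tests 1 & 2 (default: use system default)
#   SKIP_BUILD          - Skip cargo build if set
# Test 3 always uses the anthropic provider with the claude-haiku-4-5 model.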
# Load KEY=VALUE pairs from ./.env (if present) into the environment.
# Lines are parsed one at a time instead of via `export $(... | xargs)`, which
# word-splits values and, when the file holds no assignments, degenerates into
# a bare `export` that prints the whole environment.
if [ -f .env ]; then
  env_double_quoted_re='^"(.*)"$'
  env_single_quoted_re="^'(.*)'\$"
  env_assignment_re='^[[:space:]]*([A-Za-z_][A-Za-z0-9_]*)=(.*)$'
  while IFS= read -r env_line || [ -n "$env_line" ]; do
    # Tolerate CRLF line endings and an optional leading "export ".
    env_line=${env_line%$'\r'}
    env_line=${env_line#export }
    # Comments, blank lines and anything that is not NAME=VALUE are skipped.
    if [[ "$env_line" =~ $env_assignment_re ]]; then
      env_key=${BASH_REMATCH[1]}
      env_value=${BASH_REMATCH[2]}
      # Drop trailing whitespace, then one pair of matching surrounding quotes.
      env_value=${env_value%"${env_value##*[![:space:]]}"}
      if [[ "$env_value" =~ $env_double_quoted_re || "$env_value" =~ $env_single_quoted_re ]]; then
        env_value=${BASH_REMATCH[1]}
      fi
      export "$env_key=$env_value"
    fi
  done < .env
  unset env_line env_key env_value
  unset env_double_quoted_re env_single_quoted_re env_assignment_re
fi

# Build the goose binary unless the caller opted out with SKIP_BUILD.
if [ -z "${SKIP_BUILD:-}" ]; then
  echo "Building goose..."
  # Abort on a failed build: continuing would test a stale or missing binary.
  if ! cargo build --bin goose; then
    echo "✗ FAILED: cargo build failed; aborting compaction tests" >&2
    exit 1
  fi
  echo ""
else
  echo "Skipping build (SKIP_BUILD is set)..."
  echo ""
fi

# NOTE: despite the name, SCRIPT_DIR is the current working directory, not the
# directory of this script; every path below is resolved relative to it.
SCRIPT_DIR=$(pwd)
GOOSE_BIN="$SCRIPT_DIR/target/debug/goose"

# Fail early with one clear message instead of three confusing test failures.
if [ ! -x "$GOOSE_BIN" ]; then
  echo "✗ FAILED: goose binary not found or not executable: $GOOSE_BIN" >&2
  exit 1
fi

# Apply provider/model overrides if set
if [ -n "${COMPACTION_PROVIDER:-}" ]; then
  echo "Using override provider: $COMPACTION_PROVIDER"
  export GOOSE_PROVIDER="$COMPACTION_PROVIDER"
fi
if [ -n "${COMPACTION_MODEL:-}" ]; then
  echo "Using override model: $COMPACTION_MODEL"
  export GOOSE_MODEL="$COMPACTION_MODEL"
fi
# Blank separator line only when at least one override was announced.
if [ -n "${COMPACTION_PROVIDER:-}" ] || [ -n "${COMPACTION_MODEL:-}" ]; then
  echo ""
fi
#######################################
# Validate the compaction structure of an exported session.
# Exports the session as JSON and checks that it contains a summary message
# (assistant role, userVisible=false, agentVisible=true). Also reports, without
# failing, whether hidden original messages and a preserved user message exist.
# Globals:
#   GOOSE_BIN (read) - command used to export the session
# Arguments:
#   $1 - session id to export
#   $2 - human-readable test name used in the success message
# Outputs:
#   Progress and failure messages on stdout
# Returns:
#   0 if the structure is valid, or if jq is unavailable (validation skipped);
#   1 if the export fails, the 'conversation' field is missing, or no summary
#   message is found
#######################################
validate_compaction() {
  local session_id=$1
  local test_name=$2
  local session_json message_count
  local has_summary has_hidden_originals has_preserved_user

  echo "Validating compaction structure for session: $session_id"

  # Export the session to JSON. The variable is declared separately from the
  # assignment: `local var=$(cmd)` reports the status of `local` (always 0),
  # which would make this failure branch unreachable. stderr is merged in so
  # the error text is available for the failure message.
  if ! session_json=$("$GOOSE_BIN" session export --format json --session-id "$session_id" 2>&1); then
    echo "✗ FAILED: Could not export session JSON"
    echo " Error: $session_json"
    return 1
  fi

  # Without jq the structure cannot be inspected; treated as a soft pass.
  if ! command -v jq &> /dev/null; then
    echo "⚠ WARNING: jq not available, cannot validate compaction structure"
    return 0
  fi

  # Check basic structure
  if ! printf '%s\n' "$session_json" | jq -e '.conversation' > /dev/null 2>&1; then
    echo "✗ FAILED: Session JSON missing 'conversation' field"
    return 1
  fi

  message_count=$(printf '%s\n' "$session_json" | jq '.conversation | length' 2>/dev/null)
  echo " Session has $message_count messages"

  # Look for a summary message (assistant role with userVisible=false, agentVisible=true)
  has_summary=$(printf '%s\n' "$session_json" | jq '[.conversation[] | select(.role == "assistant" and .metadata.userVisible == false and .metadata.agentVisible == true)] | length > 0' 2>/dev/null)
  if [ "$has_summary" != "true" ]; then
    echo "✗ FAILED: No summary message found (expected assistant message with userVisible=false, agentVisible=true)"
    return 1
  fi
  echo "✓ Found summary message with correct visibility flags"

  # Check for original messages with userVisible=true, agentVisible=false.
  # Their absence is only a warning, not a failure.
  has_hidden_originals=$(printf '%s\n' "$session_json" | jq '[.conversation[] | select(.metadata.userVisible == true and .metadata.agentVisible == false)] | length > 0' 2>/dev/null)
  if [ "$has_hidden_originals" != "true" ]; then
    echo "⚠ WARNING: No original messages found with userVisible=true, agentVisible=false"
    echo " This might be OK if all messages were compacted"
  else
    echo "✓ Found original messages hidden from agent (userVisible=true, agentVisible=false)"
  fi

  # For auto-compaction, check for the preserved user message
  # (userVisible=true, agentVisible=true). Informational only.
  has_preserved_user=$(printf '%s\n' "$session_json" | jq '[.conversation[] | select(.role == "user" and .metadata.userVisible == true and .metadata.agentVisible == true)] | length > 0' 2>/dev/null)
  if [ "$has_preserved_user" == "true" ]; then
    echo "✓ Found preserved user message (userVisible=true, agentVisible=true)"
  fi

  echo "✓ SUCCESS: Compaction structure is valid for $test_name"
  return 0
}
# Print the suite banner followed by a blank separator line.
printf '%s\n' \
  "==================================================" \
  "COMPACTION SMOKE TESTS" \
  "==================================================" \
  ""
# RESULTS accumulates one "✓ ..." / "✗ ..." line per test for the final summary.
declare -a RESULTS=()
# ==================================================
# TEST 1: Manual Compaction
# ==================================================
# Creates a session in a scratch directory, resumes it with a compaction
# trigger prompt, then looks for compaction wording in the combined output
# before validating the exported session structure.
printf '%s\n' \
  "---------------------------------------------------" \
  "TEST 1: Manual Compaction via trigger prompt" \
  "---------------------------------------------------"
TESTDIR=$(mktemp -d)
printf '%s\n' "hello world" > "$TESTDIR/hello.txt"
printf '%s\n' "Test directory: $TESTDIR" ""
OUTPUT=$(mktemp)
printf '%s\n' "Step 1: Creating session with initial messages..."
(cd "$TESTDIR" && "$GOOSE_BIN" run --with-builtin developer --text "list files and read hello.txt" 2>&1) | tee "$OUTPUT"

if command -v jq &> /dev/null; then
  # The session created above is taken to be the first entry of the listing.
  SESSION_ID=$("$GOOSE_BIN" session list --format json 2>/dev/null | jq -r '.[0].id' 2>/dev/null)
  if [[ -n "$SESSION_ID" && "$SESSION_ID" != "null" ]]; then
    printf '%s\n' "" "Session created: $SESSION_ID" "Step 2: Sending manual compaction trigger..."
    # Send the manual compact trigger prompt; output is appended to the same log
    (cd "$TESTDIR" && "$GOOSE_BIN" run --resume --session-id "$SESSION_ID" --text "Please compact this conversation" 2>&1) | tee -a "$OUTPUT"
    printf '%s\n' "" "Checking for compaction evidence..."
    if ! grep -qi "compacting\|compacted\|compaction" "$OUTPUT"; then
      printf '%s\n' "✗ FAILED: Manual compaction was not triggered"
      RESULTS+=("✗ Manual Compaction")
    else
      printf '%s\n' "✓ SUCCESS: Manual compaction was triggered"
      if validate_compaction "$SESSION_ID" "manual compaction"; then
        RESULTS+=("✓ Manual Compaction")
      else
        RESULTS+=("✗ Manual Compaction (structure validation failed)")
      fi
    fi
  else
    printf '%s\n' "✗ FAILED: Could not create session"
    RESULTS+=("✗ Manual Compaction (no session)")
  fi
else
  printf '%s\n' "✗ FAILED: jq is required for this test"
  RESULTS+=("✗ Manual Compaction (jq required)")
fi

# Scratch files are removed on every path.
rm -f "$OUTPUT"
rm -rf "$TESTDIR"
printf '%s\n' "" ""
# ==================================================
# TEST 2: Auto Compaction
# ==================================================
# Lowers the auto-compact threshold, creates a session with a long response,
# then resumes it and looks for auto-compaction wording in the combined output
# before validating the exported session structure.
printf '%s\n' \
  "---------------------------------------------------" \
  "TEST 2: Auto Compaction via threshold (0.005)" \
  "---------------------------------------------------"
TESTDIR=$(mktemp -d)
printf '%s\n' "test content" > "$TESTDIR/test.txt"
printf '%s\n' "Test directory: $TESTDIR" ""
# Set auto-compact threshold very low (.5%) to trigger it quickly
export GOOSE_AUTO_COMPACT_THRESHOLD=0.005
OUTPUT=$(mktemp)
LONG_RESPONSE_PROMPT="Count from 1 to 200, one number per line."
printf '%s\n' "Step 1: Creating session with first message (generating tokens for threshold)..."
(cd "$TESTDIR" && "$GOOSE_BIN" run --text "$LONG_RESPONSE_PROMPT" 2>&1) | tee "$OUTPUT"

if command -v jq &> /dev/null; then
  # The session created above is taken to be the first entry of the listing.
  SESSION_ID=$("$GOOSE_BIN" session list --format json 2>/dev/null | jq -r '.[0].id' 2>/dev/null)
  if [[ -n "$SESSION_ID" && "$SESSION_ID" != "null" ]]; then
    printf '%s\n' "" "Session created: $SESSION_ID" "Step 2: Sending second message (should trigger auto-compact)..."
    # Send second message - auto-compaction should trigger before processing this
    (cd "$TESTDIR" && "$GOOSE_BIN" run --resume --session-id "$SESSION_ID" --text "hi again" 2>&1) | tee -a "$OUTPUT"
    printf '%s\n' "" "Checking for auto-compaction evidence..."
    if ! grep -qi "auto.*compact\|exceeded.*auto.*compact.*threshold" "$OUTPUT"; then
      printf '%s\n' \
        "✗ FAILED: Auto compaction was not triggered" \
        " Expected to see auto-compact messages with threshold of 0.005"
      RESULTS+=("✗ Auto Compaction")
    else
      printf '%s\n' "✓ SUCCESS: Auto compaction was triggered"
      if validate_compaction "$SESSION_ID" "auto compaction"; then
        RESULTS+=("✓ Auto Compaction")
      else
        RESULTS+=("✗ Auto Compaction (structure validation failed)")
      fi
    fi
  else
    printf '%s\n' "✗ FAILED: Could not create session"
    RESULTS+=("✗ Auto Compaction (no session)")
  fi
else
  printf '%s\n' "✗ FAILED: jq is required for this test"
  RESULTS+=("✗ Auto Compaction (jq required)")
fi

# Drop the threshold override and remove scratch files.
unset GOOSE_AUTO_COMPACT_THRESHOLD
rm -f "$OUTPUT"
rm -rf "$TESTDIR"
printf '%s\n' "" ""
# ==================================================
# TEST 3: Out-of-Context Error Compaction
# ==================================================
# Starts the provider error proxy in context-length error mode, points the
# anthropic provider at it, runs a single prompt, and checks that the output
# mentions context length / compaction before validating the session structure.
echo "---------------------------------------------------"
echo "TEST 3: Compaction via out-of-context error (proxy)"
echo "---------------------------------------------------"
TESTDIR=$(mktemp -d)
echo "test content" > "$TESTDIR/test.txt"
echo "Test directory: $TESTDIR"
echo ""
# Use a random port to avoid conflicts
PROXY_PORT=$((9000 + RANDOM % 1000))
PROXY_DIR="$SCRIPT_DIR/scripts/provider-error-proxy"
OUTPUT=$(mktemp)
PROXY_LOG=$(mktemp)
PROXY_SETUP_LOG=$(mktemp)
PROXY_PID=""

# Stop the error proxy started by this test, if any.
# Signals the proxy's process group first (the proxy is launched under job
# control so that it leads its own group), falls back to the bare PID, and
# finally sweeps for any remaining `uv run` on this port. Safe to call when no
# proxy was started.
stop_error_proxy() {
  [ -n "$PROXY_PID" ] || return 0
  kill -- "-$PROXY_PID" 2>/dev/null || kill "$PROXY_PID" 2>/dev/null || true
  # Also explicitly kill any remaining UV processes on this port
  pkill -f "uv run.*--port $PROXY_PORT" 2>/dev/null || true
  wait "$PROXY_PID" 2>/dev/null || true
  PROXY_PID=""
}

if ! command -v jq &> /dev/null; then
  # jq is needed to look up the session id; fail with an accurate reason.
  echo "✗ FAILED: jq is required for this test"
  RESULTS+=("✗ Out-of-Context Test Error (jq required)")
else
  # Pre-install proxy dependencies (so first run doesn't take forever)
  echo "Installing proxy dependencies..."
  export UV_INDEX_URL="https://pypi.org/simple"
  # pipefail (scoped to this subshell) makes the pipeline report a `uv sync`
  # failure; without it only tee's exit status would be seen.
  if ! (cd "$PROXY_DIR" && set -o pipefail && uv sync 2>&1 | tee "$PROXY_SETUP_LOG"); then
    echo "✗ FAILED: Could not install proxy dependencies"
    echo "Setup log:"
    cat "$PROXY_SETUP_LOG"
    RESULTS+=("✗ Out-of-Context Error (dependency install failed)")
  else
    echo "✓ Dependencies installed"
    # Start the error proxy in context-length error mode (3 errors)
    echo "Starting error proxy on port $PROXY_PORT with context-length error mode..."
    # Job control is enabled only for the launch so the background job gets its
    # own process group (PGID == PROXY_PID); that is what lets the whole
    # uv/python tree be signalled with `kill -- -PID` later.
    set -m
    (cd "$PROXY_DIR" && UV_INDEX_URL="https://pypi.org/simple" uv run proxy.py --port "$PROXY_PORT" --mode "c 3" --no-stdin > "$PROXY_LOG" 2>&1) &
    PROXY_PID=$!
    set +m

    # Wait for proxy to be ready (check if port is listening); up to 60 tries
    # with a 0.5s pause between them.
    echo "Waiting for proxy to be ready..."
    PROXY_READY=false
    for _ in {1..60}; do
      if kill -0 "$PROXY_PID" 2>/dev/null; then
        # Check if port is listening using /dev/tcp
        if timeout 1 bash -c "echo -n > /dev/tcp/localhost/$PROXY_PORT" 2>/dev/null; then
          PROXY_READY=true
          echo "✓ Proxy is ready on port $PROXY_PORT"
          break
        fi
      else
        echo "✗ FAILED: Error proxy process died"
        break
      fi
      sleep 0.5
    done

    # Check if proxy is running and ready
    if [ "$PROXY_READY" != "true" ]; then
      echo "✗ FAILED: Error proxy failed to become ready"
      echo "Proxy log:"
      cat "$PROXY_LOG"
      # Tear down the whole proxy process tree, not just the wrapper subshell.
      stop_error_proxy
      RESULTS+=("✗ Out-of-Context Test Error (proxy failed)")
    else
      # Configure provider to use proxy and skip backoff
      export ANTHROPIC_HOST="http://localhost:$PROXY_PORT"
      export GOOSE_PROVIDER_SKIP_BACKOFF=true
      export GOOSE_PROVIDER=anthropic
      export GOOSE_MODEL=claude-haiku-4-5
      echo "Step 1: Creating session (should trigger context-length error and compaction)..."
      (cd "$TESTDIR" && "$GOOSE_BIN" run --text "hello world" 2>&1) | tee "$OUTPUT"
      # The session created above is taken to be the first entry of the listing.
      SESSION_ID=$("$GOOSE_BIN" session list --format json 2>/dev/null | jq -r '.[0].id' 2>/dev/null)
      if [ -z "$SESSION_ID" ] || [ "$SESSION_ID" = "null" ]; then
        echo "✗ FAILED: Could not create session"
        RESULTS+=("✗ Out-of-Context Test Error (no session)")
      else
        echo ""
        echo "Session created: $SESSION_ID"
        echo "Checking for compaction evidence..."
        # Check for compaction in the output
        if grep -qi "context.*length\|compacting\|compacted\|compaction" "$OUTPUT"; then
          echo "✓ SUCCESS: Out-of-context Test error triggered compaction"
          if validate_compaction "$SESSION_ID" "out-of-context error compaction"; then
            RESULTS+=("✓ Out-of-Context Test Error")
          else
            RESULTS+=("✗ Out-of-Context Test Error (structure validation failed)")
          fi
        else
          echo "✗ FAILED: No evidence of compaction after context-length error"
          echo " Output:"
          cat "$OUTPUT"
          RESULTS+=("✗ Out-of-Context Test Error")
        fi
      fi
      # Clean up
      echo ""
      echo "Stopping error proxy..."
      stop_error_proxy
      # These are unset rather than restored, which also drops any
      # COMPACTION_* override; the remaining visible steps do not read them.
      unset ANTHROPIC_HOST
      unset GOOSE_PROVIDER_SKIP_BACKOFF
      unset GOOSE_PROVIDER
      unset GOOSE_MODEL
    fi
  fi
  # Exported before the dependency install, so clear it on every path.
  unset UV_INDEX_URL
fi
rm -f "$OUTPUT" "$PROXY_LOG" "$PROXY_SETUP_LOG"
rm -rf "$TESTDIR"
echo ""
echo ""
# ==================================================
# Summary
# ==================================================
# Prints every recorded result line, then exits with status 1 if any of them
# carries the failure marker.
printf '%s\n' \
  "==================================================" \
  "TEST SUMMARY" \
  "=================================================="
for outcome in "${RESULTS[@]}"; do
  printf '%s\n' "$outcome"
done

# Count failure markers across all results (joined into one line).
FAILURE_COUNT=$(printf '%s\n' "${RESULTS[*]}" | grep -o "✗" | wc -l | tr -d ' ')

echo ""
if [ "$FAILURE_COUNT" -gt 0 ]; then
  echo "$FAILURE_COUNT test(s) failed!"
  exit 1
fi
echo "✅ All tests passed!"