diff options
| author | main <main@swarm.moe> | 2026-03-24 13:17:59 -0400 |
|---|---|---|
| committer | main <main@swarm.moe> | 2026-03-24 13:17:59 -0400 |
| commit | 53797d1f9bbaf73778cbb9dd6ad2f857ba1a88e2 (patch) | |
| tree | 69b17b86e72b5f292bde42adf839a8ed8cf8005c /crates/phone-opus/tests/mcp_hardening.rs | |
| parent | 690b4851ea0afd8b214ddaa5450eec3a8c3a7ec9 (diff) | |
| download | phone_opus-53797d1f9bbaf73778cbb9dd6ad2f857ba1a88e2.zip | |
Reuse consult context per cwd by default
Diffstat (limited to 'crates/phone-opus/tests/mcp_hardening.rs')
| -rw-r--r-- | crates/phone-opus/tests/mcp_hardening.rs | 261 |
1 files changed, 171 insertions, 90 deletions
diff --git a/crates/phone-opus/tests/mcp_hardening.rs b/crates/phone-opus/tests/mcp_hardening.rs index 06861f8..f6e0e73 100644 --- a/crates/phone-opus/tests/mcp_hardening.rs +++ b/crates/phone-opus/tests/mcp_hardening.rs @@ -280,6 +280,40 @@ fn seed_caller_claude_home(home: &Path) -> TestResult { Ok(()) } +fn write_fake_claude_stdout(path: &Path, result: &str, session_id: &str, uuid: &str) -> TestResult { + must( + fs::write( + path, + serde_json::to_string(&json!({ + "type": "result", + "subtype": "success", + "is_error": false, + "duration_ms": 1234, + "duration_api_ms": 1200, + "num_turns": 2, + "result": result, + "stop_reason": "end_turn", + "session_id": session_id, + "total_cost_usd": 0.125, + "usage": { + "input_tokens": 10, + "output_tokens": 5 + }, + "modelUsage": { + "claude-opus-4-6": { + "inputTokens": 10, + "outputTokens": 5 + } + }, + "permission_denials": [], + "fast_mode_state": "off", + "uuid": uuid + }))?, + ), + "write fake stdout", + ) +} + #[test] fn cold_start_exposes_consult_and_ops_tools() -> TestResult { let root = temp_root("cold_start")?; @@ -314,6 +348,14 @@ fn cold_start_exposes_consult_and_ops_tools() -> TestResult { consult_tool["inputSchema"]["properties"]["background"].is_null(), "consult schema should not advertise background: {consult_tool:#}" ); + assert!( + consult_tool["inputSchema"]["properties"]["session_id"].is_null(), + "consult schema should not advertise session_id: {consult_tool:#}" + ); + assert_eq!( + consult_tool["inputSchema"]["properties"]["fresh_context"]["type"].as_str(), + Some("boolean") + ); let health = harness.call_tool(3, "health_snapshot", json!({}))?; assert_tool_ok(&health); @@ -322,14 +364,18 @@ fn cold_start_exposes_consult_and_ops_tools() -> TestResult { } #[test] -fn consult_can_resume_a_prior_session_with_read_only_toolset_and_requested_working_directory() --> TestResult { +fn consult_reuses_context_per_cwd_by_default_and_fresh_context_opts_out() -> TestResult { let root = temp_root("consult_success")?; let state_home = root.join("state-home"); let sandbox = root.join("sandbox"); + let sibling_sandbox = root.join("sibling-sandbox"); let caller_home = root.join("caller-home"); must(fs::create_dir_all(&state_home), "create state home")?; must(fs::create_dir_all(&sandbox), "create sandbox")?; + must( + fs::create_dir_all(&sibling_sandbox), + "create sibling sandbox", + )?; must(fs::create_dir_all(&caller_home), "create caller home")?; seed_caller_claude_home(&caller_home)?; @@ -342,39 +388,11 @@ fn consult_can_resume_a_prior_session_with_read_only_toolset_and_requested_worki let cwd_probe_error_file = root.join("cwd-write-probe.err"); let credential_probe_file = root.join("credential-write-probe.txt"); let credential_probe_error_file = root.join("credential-write-probe.err"); - let resumed_session = "81f218eb-568b-409b-871b-f6e86d8f666f"; + let remembered_session = "81f218eb-568b-409b-871b-f6e86d8f666f"; + let fresh_session = "dbd3b6c2-4757-4b45-a8f0-f3d877e1a13f"; + let sibling_session = "d9a9a472-a091-4268-a7dd-9f31cf61f87e"; write_fake_claude_script(&fake_claude)?; - must( - fs::write( - &stdout_file, - serde_json::to_string(&json!({ - "type": "result", - "subtype": "success", - "is_error": false, - "duration_ms": 1234, - "duration_api_ms": 1200, - "num_turns": 2, - "result": "oracle", - "stop_reason": "end_turn", - "session_id": resumed_session, - "total_cost_usd": 0.125, - "usage": { - "input_tokens": 10, - "output_tokens": 5 - }, - "modelUsage": { - "claude-opus-4-6": { - "inputTokens": 10, - "outputTokens": 5 - } - }, - "permission_denials": [], - "fast_mode_state": "off", - "uuid": "uuid-123" - }))?, - ), - "write fake stdout", - )?; + write_fake_claude_stdout(&stdout_file, "oracle", remembered_session, "uuid-123")?; let claude_bin = fake_claude.display().to_string(); let stdout_path = stdout_file.display().to_string(); @@ -420,43 +438,135 @@ fn consult_can_resume_a_prior_session_with_read_only_toolset_and_requested_worki json!({ "prompt": "say oracle", "cwd": sandbox.display().to_string(), - "session_id": resumed_session, + "session_id": "not-a-uuid", "background": true }), )?; assert_tool_ok(&consult); assert_eq!(tool_content(&consult)["response"].as_str(), Some("oracle")); - assert!(tool_content(&consult)["mode"].is_null()); - assert!(tool_content(&consult)["job_id"].is_null()); assert_eq!( - tool_content(&consult)["session_mode"].as_str(), - Some("resumed") + tool_content(&consult)["context_mode"].as_str(), + Some("fresh") ); + assert!(tool_content(&consult)["reused_session_id"].is_null()); assert_eq!( - tool_content(&consult)["requested_session_id"].as_str(), - Some(resumed_session) + tool_content(&consult)["session_id"].as_str(), + Some(remembered_session) ); + let first_args = must(fs::read_to_string(&args_file), "read first fake args file")?; + assert!(!first_args.contains("--resume")); + assert!(!first_args.contains("not-a-uuid")); + + write_fake_claude_stdout( + &stdout_file, + "oracle reused", + remembered_session, + "uuid-124", + )?; + let reused = harness.call_tool( + 4, + "consult", + json!({ + "prompt": "say oracle reused", + "cwd": sandbox.display().to_string() + }), + )?; + assert_tool_ok(&reused); assert_eq!( - tool_content(&consult)["prompt_prefix_injected"].as_bool(), - Some(true) + tool_content(&reused)["response"].as_str(), + Some("oracle reused") ); assert_eq!( - tool_content(&consult)["cwd"].as_str(), - Some(sandbox.display().to_string().as_str()) + tool_content(&reused)["context_mode"].as_str(), + Some("reused") ); - assert_eq!(tool_content(&consult)["num_turns"].as_u64(), Some(2)); assert_eq!( - tool_content(&consult)["session_id"].as_str(), - Some(resumed_session) + tool_content(&reused)["reused_session_id"].as_str(), + Some(remembered_session) + ); + let reused_args = must(fs::read_to_string(&args_file), "read reused fake args file")?; + assert!(reused_args.contains("--resume")); + assert!(reused_args.contains(remembered_session)); + + write_fake_claude_stdout(&stdout_file, "oracle fresh", fresh_session, "uuid-125")?; + let fresh = harness.call_tool( + 5, + "consult", + json!({ + "prompt": "say oracle fresh", + "cwd": sandbox.display().to_string(), + "fresh_context": true + }), + )?; + assert_tool_ok(&fresh); + assert_eq!( + tool_content(&fresh)["response"].as_str(), + Some("oracle fresh") ); + assert_eq!(tool_content(&fresh)["context_mode"].as_str(), Some("fresh")); + assert!(tool_content(&fresh)["reused_session_id"].is_null()); + let fresh_args = must(fs::read_to_string(&args_file), "read fresh fake args file")?; + assert!(!fresh_args.contains("--resume")); + + write_fake_claude_stdout( + &stdout_file, + "oracle after fresh", + fresh_session, + "uuid-126", + )?; + let after_fresh = harness.call_tool( + 6, + "consult", + json!({ + "prompt": "say oracle after fresh", + "cwd": sandbox.display().to_string() + }), + )?; + assert_tool_ok(&after_fresh); + assert_eq!( + tool_content(&after_fresh)["context_mode"].as_str(), + Some("reused") + ); + assert_eq!( + tool_content(&after_fresh)["reused_session_id"].as_str(), + Some(fresh_session) + ); + let after_fresh_args = must( + fs::read_to_string(&args_file), + "read after-fresh fake args file", + )?; + assert!(after_fresh_args.contains("--resume")); + assert!(after_fresh_args.contains(fresh_session)); + + write_fake_claude_stdout(&stdout_file, "oracle sibling", sibling_session, "uuid-127")?; + let sibling = harness.call_tool( + 7, + "consult", + json!({ + "prompt": "say oracle sibling", + "cwd": sibling_sandbox.display().to_string() + }), + )?; + assert_tool_ok(&sibling); + assert_eq!( + tool_content(&sibling)["context_mode"].as_str(), + Some("fresh") + ); + assert!(tool_content(&sibling)["reused_session_id"].is_null()); + let sibling_args = must( + fs::read_to_string(&args_file), + "read sibling fake args file", + )?; + assert!(!sibling_args.contains("--resume")); + let persisted_output_path = must_some( - tool_content(&consult)["persisted_output_path"] + tool_content(&after_fresh)["persisted_output_path"] .as_str() .map(str::to_owned), "persisted output path", )?; assert!(persisted_output_path.starts_with("/tmp/phone_opus-consults/")); - assert!(persisted_output_path.contains(resumed_session)); + assert!(persisted_output_path.contains(fresh_session)); let persisted_output = must( fs::read_to_string(&persisted_output_path), "read persisted consult output", @@ -465,14 +575,18 @@ fn consult_can_resume_a_prior_session_with_read_only_toolset_and_requested_worki serde_json::from_str(&persisted_output), "parse persisted consult output", )?; - assert_eq!(persisted_output["response"].as_str(), Some("oracle")); assert_eq!( - persisted_output["requested_session_id"].as_str(), - Some(resumed_session) + persisted_output["response"].as_str(), + Some("oracle after fresh") + ); + assert_eq!(persisted_output["context_mode"].as_str(), Some("reused")); + assert_eq!( + persisted_output["reused_session_id"].as_str(), + Some(fresh_session) ); let pwd = must(fs::read_to_string(&pwd_file), "read fake pwd file")?; - assert_eq!(pwd.trim(), sandbox.display().to_string()); + assert_eq!(pwd.trim(), sibling_sandbox.display().to_string()); let args = must(fs::read_to_string(&args_file), "read fake args file")?; let lines = args.lines().collect::<Vec<_>>(); @@ -493,13 +607,11 @@ fn consult_can_resume_a_prior_session_with_read_only_toolset_and_requested_worki assert!(lines.contains(&"--dangerously-skip-permissions")); assert!(!lines.contains(&"--permission-mode")); assert!(!lines.contains(&"dontAsk")); - assert!(lines.contains(&"--resume")); - assert!(lines.contains(&resumed_session)); + assert!(!lines.contains(&"--resume")); assert!(!lines.contains(&"--max-turns")); assert!(args.contains(PROMPT_PREFIX)); - assert!(args.contains("The real prompt follows.")); let prefix_index = must_some(args.find(PROMPT_PREFIX), "prefixed consult prompt")?; - let user_prompt_index = must_some(args.find("say oracle"), "user prompt inside args")?; + let user_prompt_index = must_some(args.find("say oracle sibling"), "user prompt inside args")?; assert!(prefix_index < user_prompt_index); let env_dump = must(fs::read_to_string(&env_file), "read fake env file")?; @@ -572,7 +684,7 @@ fn consult_can_resume_a_prior_session_with_read_only_toolset_and_requested_worki )?; assert_eq!(credential_probe.trim(), "write_succeeded"); - let telemetry = harness.call_tool(4, "telemetry_snapshot", json!({}))?; + let telemetry = harness.call_tool(8, "telemetry_snapshot", json!({}))?; assert_tool_ok(&telemetry); let hot_methods = tool_content(&telemetry)["hot_methods"] .as_array() @@ -644,37 +756,6 @@ fn background_surfaces_are_hidden_from_public_mcp() -> TestResult { } #[test] -fn consult_rejects_invalid_session_handles() -> TestResult { - let root = temp_root("consult_invalid_session")?; - let state_home = root.join("state-home"); - must(fs::create_dir_all(&state_home), "create state home")?; - - let mut harness = McpHarness::spawn(&state_home, &[])?; - let _ = harness.initialize()?; - harness.notify_initialized()?; - - let consult = harness.call_tool( - 3, - "consult", - json!({ - "prompt": "fail", - "session_id": "not-a-uuid" - }), - )?; - assert_tool_error(&consult); - assert_eq!( - tool_content(&consult)["fault"]["class"].as_str(), - Some("protocol") - ); - assert!( - tool_content(&consult)["fault"]["detail"] - .as_str() - .is_some_and(|value| value.contains("session_id must be a valid UUID")) - ); - Ok(()) -} - -#[test] fn consult_surfaces_downstream_cli_failures() -> TestResult { let root = temp_root("consult_failure")?; let state_home = root.join("state-home"); |