From 2c219204d627634442d46c38d1b5df806f77f4c1 Mon Sep 17 00:00:00 2001 From: main Date: Wed, 25 Mar 2026 00:43:57 -0400 Subject: Disable public resume behavior --- crates/phone-opus/tests/mcp_hardening.rs | 312 ++++++++++++------------------- 1 file changed, 118 insertions(+), 194 deletions(-) (limited to 'crates/phone-opus/tests') diff --git a/crates/phone-opus/tests/mcp_hardening.rs b/crates/phone-opus/tests/mcp_hardening.rs index 0b32442..b35e687 100644 --- a/crates/phone-opus/tests/mcp_hardening.rs +++ b/crates/phone-opus/tests/mcp_hardening.rs @@ -378,8 +378,8 @@ fn cold_start_exposes_consult_and_ops_tools() -> TestResult { "consult schema should not advertise session_id: {consult_tool:#}" ); assert_eq!( - consult_tool["inputSchema"]["properties"]["fresh_context"]["type"].as_str(), - Some("boolean") + consult_tool["inputSchema"]["properties"]["fresh_context"], + Value::Null ); let health = harness.call_tool(3, "health_snapshot", json!({}))?; @@ -389,18 +389,13 @@ fn cold_start_exposes_consult_and_ops_tools() -> TestResult { } #[test] -fn consult_reuses_context_per_cwd_by_default_and_fresh_context_opts_out() -> TestResult { +fn consult_is_one_shot_and_hides_session_state() -> TestResult { let root = temp_root("consult_success")?; let state_home = root.join("state-home"); let sandbox = root.join("sandbox"); - let sibling_sandbox = root.join("sibling-sandbox"); let caller_home = root.join("caller-home"); must(fs::create_dir_all(&state_home), "create state home")?; must(fs::create_dir_all(&sandbox), "create sandbox")?; - must( - fs::create_dir_all(&sibling_sandbox), - "create sibling sandbox", - )?; must(fs::create_dir_all(&caller_home), "create caller home")?; seed_caller_claude_home(&caller_home)?; @@ -413,11 +408,10 @@ fn consult_reuses_context_per_cwd_by_default_and_fresh_context_opts_out() -> Tes let cwd_probe_error_file = root.join("cwd-write-probe.err"); let credential_probe_file = root.join("credential-write-probe.txt"); let credential_probe_error_file = 
root.join("credential-write-probe.err"); - let remembered_session = "81f218eb-568b-409b-871b-f6e86d8f666f"; - let fresh_session = "dbd3b6c2-4757-4b45-a8f0-f3d877e1a13f"; - let sibling_session = "d9a9a472-a091-4268-a7dd-9f31cf61f87e"; + let first_observed_session = "81f218eb-568b-409b-871b-f6e86d8f666f"; + let second_observed_session = "dbd3b6c2-4757-4b45-a8f0-f3d877e1a13f"; write_fake_claude_script(&fake_claude)?; - write_fake_claude_stream_success(&stdout_file, "oracle", remembered_session, "uuid-123")?; + write_fake_claude_stream_success(&stdout_file, "oracle", first_observed_session, "uuid-123")?; let claude_bin = fake_claude.display().to_string(); let stdout_path = stdout_file.display().to_string(); @@ -469,144 +463,73 @@ fn consult_reuses_context_per_cwd_by_default_and_fresh_context_opts_out() -> Tes )?; assert_tool_ok(&consult); assert_eq!(tool_content(&consult)["response"].as_str(), Some("oracle")); - assert_eq!( - tool_content(&consult)["context_mode"].as_str(), - Some("fresh") - ); - assert!( - tool_content(&consult)["planned_session_id"] - .as_str() - .is_some_and(|value| !value.is_empty()) - ); + assert!(tool_content(&consult)["context_mode"].is_null()); + assert!(tool_content(&consult)["planned_session_id"].is_null()); assert!(tool_content(&consult)["reused_session_id"].is_null()); - assert_eq!( - tool_content(&consult)["observed_session_id"].as_str(), - Some(remembered_session) - ); - assert_eq!( - tool_content(&consult)["session_id"].as_str(), - Some(remembered_session) - ); + assert!(tool_content(&consult)["observed_session_id"].is_null()); + assert!(tool_content(&consult)["session_id"].is_null()); let first_args = must(fs::read_to_string(&args_file), "read first fake args file")?; - assert!(first_args.contains("--session-id")); - assert!( - tool_content(&consult)["planned_session_id"] - .as_str() - .is_some_and(|value| first_args.contains(value)) - ); + let first_lines = first_args.lines().collect::<Vec<_>>(); +
assert!(first_lines.contains(&"--session-id")); assert!(!first_args.contains("--resume")); assert!(!first_args.contains("not-a-uuid")); + let first_session_id = must_some( + first_lines + .windows(2) + .find_map(|window| (window[0] == "--session-id").then_some(window[1].to_owned())), + "first one-shot session id", + )?; + assert!(uuid::Uuid::parse_str(&first_session_id).is_ok()); write_fake_claude_stream_success( &stdout_file, - "oracle reused", - remembered_session, + "oracle again", + second_observed_session, "uuid-124", )?; - let reused = harness.call_tool( + let repeated = harness.call_tool( 4, "consult", json!({ - "prompt": "say oracle reused", + "prompt": "say oracle again", "cwd": sandbox.display().to_string() }), )?; - assert_tool_ok(&reused); + assert_tool_ok(&repeated); assert_eq!( - tool_content(&reused)["response"].as_str(), - Some("oracle reused") + tool_content(&repeated)["response"].as_str(), + Some("oracle again") ); - assert_eq!( - tool_content(&reused)["context_mode"].as_str(), - Some("reused") - ); - assert_eq!( - tool_content(&reused)["reused_session_id"].as_str(), - Some(remembered_session) - ); - let reused_args = must(fs::read_to_string(&args_file), "read reused fake args file")?; - assert!(reused_args.contains("--resume")); - assert!(reused_args.contains(remembered_session)); - - write_fake_claude_stream_success(&stdout_file, "oracle fresh", fresh_session, "uuid-125")?; - let fresh = harness.call_tool( - 5, - "consult", - json!({ - "prompt": "say oracle fresh", - "cwd": sandbox.display().to_string(), - "fresh_context": true - }), - )?; - assert_tool_ok(&fresh); - assert_eq!( - tool_content(&fresh)["response"].as_str(), - Some("oracle fresh") - ); - assert_eq!(tool_content(&fresh)["context_mode"].as_str(), Some("fresh")); - assert!(tool_content(&fresh)["reused_session_id"].is_null()); - let fresh_args = must(fs::read_to_string(&args_file), "read fresh fake args file")?; - assert!(!fresh_args.contains("--resume")); - - 
write_fake_claude_stream_success( - &stdout_file, - "oracle after fresh", - fresh_session, - "uuid-126", - )?; - let after_fresh = harness.call_tool( - 6, - "consult", - json!({ - "prompt": "say oracle after fresh", - "cwd": sandbox.display().to_string() - }), - )?; - assert_tool_ok(&after_fresh); - assert_eq!( - tool_content(&after_fresh)["context_mode"].as_str(), - Some("reused") - ); - assert_eq!( - tool_content(&after_fresh)["reused_session_id"].as_str(), - Some(fresh_session) - ); - let after_fresh_args = must( + assert!(tool_content(&repeated)["context_mode"].is_null()); + assert!(tool_content(&repeated)["planned_session_id"].is_null()); + assert!(tool_content(&repeated)["reused_session_id"].is_null()); + assert!(tool_content(&repeated)["observed_session_id"].is_null()); + assert!(tool_content(&repeated)["session_id"].is_null()); + let repeated_args = must( fs::read_to_string(&args_file), - "read after-fresh fake args file", + "read repeated fake args file", )?; - assert!(after_fresh_args.contains("--resume")); - assert!(after_fresh_args.contains(fresh_session)); - - write_fake_claude_stream_success(&stdout_file, "oracle sibling", sibling_session, "uuid-127")?; - let sibling = harness.call_tool( - 7, - "consult", - json!({ - "prompt": "say oracle sibling", - "cwd": sibling_sandbox.display().to_string() - }), + let repeated_lines = repeated_args.lines().collect::<Vec<_>>(); + assert!(repeated_lines.contains(&"--session-id")); + assert!(!repeated_args.contains("--resume")); + let repeated_session_id = must_some( + repeated_lines + .windows(2) + .find_map(|window| (window[0] == "--session-id").then_some(window[1].to_owned())), + "repeated one-shot session id", )?; - assert_tool_ok(&sibling); - assert_eq!( - tool_content(&sibling)["context_mode"].as_str(), - Some("fresh") - ); - assert!(tool_content(&sibling)["reused_session_id"].is_null()); - let sibling_args = must( - fs::read_to_string(&args_file), - "read sibling fake args file", - )?; -
assert!(!sibling_args.contains("--resume")); + assert!(uuid::Uuid::parse_str(&repeated_session_id).is_ok()); + assert_ne!(repeated_session_id, first_session_id); let persisted_output_path = must_some( - tool_content(&after_fresh)["persisted_output_path"] + tool_content(&repeated)["persisted_output_path"] .as_str() .map(str::to_owned), "persisted output path", )?; assert!(persisted_output_path.starts_with("/tmp/phone_opus-consults/")); - assert!(persisted_output_path.contains(fresh_session)); + assert!(!persisted_output_path.contains(first_observed_session)); + assert!(!persisted_output_path.contains(second_observed_session)); let persisted_output = must( fs::read_to_string(&persisted_output_path), "read persisted consult output", @@ -615,18 +538,37 @@ fn consult_reuses_context_per_cwd_by_default_and_fresh_context_opts_out() -> Tes serde_json::from_str(&persisted_output), "parse persisted consult output", )?; + assert_eq!(persisted_output["response"].as_str(), Some("oracle again")); + assert!(persisted_output["context_mode"].is_null()); + assert!(persisted_output["planned_session_id"].is_null()); + assert!(persisted_output["reused_session_id"].is_null()); + assert!(persisted_output["session_id"].is_null()); + assert!(persisted_output["observed_session_id"].is_null()); + + let consult_context_index = must( + fs::read_to_string( + state_home + .join("phone_opus") + .join("mcp") + .join("consult_contexts.json"), + ), + "read consult context index", + )?; + let consult_context_index: Value = must( + serde_json::from_str(&consult_context_index), + "parse consult context index", + )?; assert_eq!( - persisted_output["response"].as_str(), - Some("oracle after fresh") + consult_context_index["by_cwd"][sandbox.display().to_string()]["session_id"].as_str(), + Some(second_observed_session) ); - assert_eq!(persisted_output["context_mode"].as_str(), Some("reused")); assert_eq!( - persisted_output["reused_session_id"].as_str(), - Some(fresh_session) + 
consult_context_index["by_cwd"][sandbox.display().to_string()]["state"].as_str(), + Some("confirmed") ); let pwd = must(fs::read_to_string(&pwd_file), "read fake pwd file")?; - assert_eq!(pwd.trim(), sibling_sandbox.display().to_string()); + assert_eq!(pwd.trim(), sandbox.display().to_string()); let args = must(fs::read_to_string(&args_file), "read fake args file")?; let lines = args.lines().collect::<Vec<_>>(); @@ -653,7 +595,7 @@ fn consult_reuses_context_per_cwd_by_default_and_fresh_context_opts_out() -> Tes assert!(!lines.contains(&"--max-turns")); assert!(args.contains(PROMPT_PREFIX)); let prefix_index = must_some(args.find(PROMPT_PREFIX), "prefixed consult prompt")?; - let user_prompt_index = must_some(args.find("say oracle sibling"), "user prompt inside args")?; + let user_prompt_index = must_some(args.find("say oracle again"), "user prompt inside args")?; assert!(prefix_index < user_prompt_index); let env_dump = must(fs::read_to_string(&env_file), "read fake env file")?; @@ -726,7 +668,7 @@ fn consult_reuses_context_per_cwd_by_default_and_fresh_context_opts_out() -> Tes )?; assert_eq!(credential_probe.trim(), "write_succeeded"); - let telemetry = harness.call_tool(8, "telemetry_snapshot", json!({}))?; + let telemetry = harness.call_tool(5, "telemetry_snapshot", json!({}))?; assert_tool_ok(&telemetry); let hot_methods = tool_content(&telemetry)["hot_methods"] .as_array() @@ -831,10 +773,10 @@ fn consult_surfaces_downstream_cli_failures() -> TestResult { .is_some_and(|value| value.contains("permission denied by fake claude")) ); assert_eq!( - tool_content(&consult)["context"]["consult"]["context_mode"].as_str(), - Some("fresh") + tool_content(&consult)["context"]["consult"]["cwd"].as_str(), + Some(std::env::current_dir()?.display().to_string().as_str()) ); - assert!(tool_content(&consult)["context"]["consult"]["reused_session_id"].is_null()); + assert!(tool_content(&consult)["context"]["consult"]["planned_session_id"].is_null()); Ok(()) } @@ -876,20 +818,16 @@ fn
silent_claude_processes_fail_fast_instead_of_wedging() -> TestResult { .is_some_and(|value| value.contains("produced no stream output within 100 ms")) ); assert!(elapsed < std::time::Duration::from_secs(3)); - assert_eq!( - tool_content(&consult)["context"]["consult"]["context_mode"].as_str(), - Some("fresh") - ); assert!( - tool_content(&consult)["context"]["consult"]["planned_session_id"] + tool_content(&consult)["context"]["consult"]["retry_hint"] .as_str() - .is_some_and(|value| !value.is_empty()) + .is_some_and(|value| value.contains("fresh one-shot call")) ); Ok(()) } #[test] -fn quota_failures_surface_resume_context_for_same_cwd() -> TestResult { +fn quota_failures_hide_session_state_on_public_surface() -> TestResult { let root = temp_root("consult_quota_failure")?; let state_home = root.join("state-home"); let sandbox = root.join("sandbox"); @@ -926,10 +864,7 @@ fn quota_failures_surface_resume_context_for_same_cwd() -> TestResult { }), )?; assert_tool_ok(&first); - assert_eq!( - tool_content(&first)["session_id"].as_str(), - Some(remembered_session) - ); + assert!(tool_content(&first)["session_id"].is_null()); let quota_env = [ ("HOME", caller_home_path.as_str()), @@ -962,26 +897,11 @@ fn quota_failures_surface_resume_context_for_same_cwd() -> TestResult { tool_content(&failed)["context"]["consult"]["cwd"].as_str(), Some(sandbox.display().to_string().as_str()) ); - assert_eq!( - tool_content(&failed)["context"]["consult"]["context_mode"].as_str(), - Some("reused") - ); - assert_eq!( - tool_content(&failed)["context"]["consult"]["planned_session_id"].as_str(), - Some(remembered_session) - ); - assert_eq!( - tool_content(&failed)["context"]["consult"]["reused_session_id"].as_str(), - Some(remembered_session) - ); - assert_eq!( - tool_content(&failed)["context"]["consult"]["observed_session_id"].as_str(), - Some(remembered_session) - ); - assert_eq!( - tool_content(&failed)["context"]["consult"]["resume_session_id"].as_str(), - Some(remembered_session) - ); + 
assert!(tool_content(&failed)["context"]["consult"]["context_mode"].is_null()); + assert!(tool_content(&failed)["context"]["consult"]["planned_session_id"].is_null()); + assert!(tool_content(&failed)["context"]["consult"]["reused_session_id"].is_null()); + assert!(tool_content(&failed)["context"]["consult"]["observed_session_id"].is_null()); + assert!(tool_content(&failed)["context"]["consult"]["resume_session_id"].is_null()); assert_eq!( tool_content(&failed)["context"]["consult"]["quota_limited"].as_bool(), Some(true) @@ -993,7 +913,15 @@ fn quota_failures_surface_resume_context_for_same_cwd() -> TestResult { assert!( tool_content(&failed)["context"]["consult"]["retry_hint"] .as_str() - .is_some_and(|value| value.contains(remembered_session)) + .is_some_and(|value| value.contains("retry the consult")) + ); + assert!( + failed["result"]["content"] + .as_array() + .into_iter() + .flatten() + .filter_map(|entry| entry["text"].as_str()) + .any(|text| text.contains("quota_reset: 4pm (America/New_York)")) ); assert!( failed["result"]["content"] @@ -1001,16 +929,13 @@ fn quota_failures_surface_resume_context_for_same_cwd() -> TestResult { .into_iter() .flatten() .filter_map(|entry| entry["text"].as_str()) - .any(|text| { - text.contains("resume_session: 84b9d462-5af9-4a4e-8e44-379a8d0c46d7") - && text.contains("quota_reset: 4pm (America/New_York)") - }) + .all(|text| !text.contains("session")) ); Ok(()) } #[test] -fn fresh_failures_capture_streamed_session_ids_eagerly() -> TestResult { +fn fresh_failures_keep_internal_session_state_without_public_leakage() -> TestResult { let root = temp_root("consult_fresh_stream_failure")?; let state_home = root.join("state-home"); let sandbox = root.join("sandbox"); @@ -1054,42 +979,41 @@ fn fresh_failures_capture_streamed_session_ids_eagerly() -> TestResult { }), )?; assert_tool_error(&failed); - assert_eq!( - tool_content(&failed)["context"]["consult"]["context_mode"].as_str(), - Some("fresh") - ); - assert_eq!( - 
tool_content(&failed)["context"]["consult"]["observed_session_id"].as_str(), - Some(init_session) - ); - assert_eq!( - tool_content(&failed)["context"]["consult"]["resume_session_id"].as_str(), - Some(init_session) - ); + assert!(tool_content(&failed)["context"]["consult"]["context_mode"].is_null()); + assert!(tool_content(&failed)["context"]["consult"]["observed_session_id"].is_null()); + assert!(tool_content(&failed)["context"]["consult"]["resume_session_id"].is_null()); assert_eq!( tool_content(&failed)["context"]["consult"]["quota_reset_hint"].as_str(), Some("9pm (America/New_York)") ); - let planned_session = must_some( - tool_content(&failed)["context"]["consult"]["planned_session_id"] - .as_str() - .map(str::to_owned), - "planned session id on failure", - )?; + assert!(tool_content(&failed)["context"]["consult"]["planned_session_id"].is_null()); let args = must(fs::read_to_string(&args_file), "read fresh failure args")?; assert!(args.contains("--session-id")); - assert!(args.contains(&planned_session)); assert!(!args.contains("--resume")); + let consult_context_index = must( + fs::read_to_string( + state_home + .join("phone_opus") + .join("mcp") + .join("consult_contexts.json"), + ), + "read consult context index after failure", + )?; + let consult_context_index: Value = must( + serde_json::from_str(&consult_context_index), + "parse consult context index after failure", + )?; + assert_eq!( + consult_context_index["by_cwd"][sandbox.display().to_string()]["session_id"].as_str(), + Some(init_session) + ); assert!( failed["result"]["content"] .as_array() .into_iter() .flatten() .filter_map(|entry| entry["text"].as_str()) - .any(|text| { - text.contains("observed_session: 550e8400-e29b-41d4-a716-446655440000") - && text.contains("resume_session: 550e8400-e29b-41d4-a716-446655440000") - }) + .all(|text| !text.contains("session")) ); Ok(()) } -- cgit v1.2.3