From 2fc866a3ce50b6ba9c5e84e0ad2f8c77517361ff Mon Sep 17 00:00:00 2001 From: main Date: Tue, 31 Mar 2026 13:21:05 -0400 Subject: Excise hidden consult machinery --- crates/phone-opus/tests/mcp_hardening.rs | 291 ++----------------------------- 1 file changed, 17 insertions(+), 274 deletions(-) (limited to 'crates/phone-opus/tests') diff --git a/crates/phone-opus/tests/mcp_hardening.rs b/crates/phone-opus/tests/mcp_hardening.rs index 754ee79..c826092 100644 --- a/crates/phone-opus/tests/mcp_hardening.rs +++ b/crates/phone-opus/tests/mcp_hardening.rs @@ -423,7 +423,7 @@ fn cold_start_exposes_consult_and_ops_tools() -> TestResult { } #[test] -fn consult_is_one_shot_and_hides_session_state() -> TestResult { +fn consult_runs_blocking_in_sandbox() -> TestResult { let root = temp_root("consult_success")?; let state_home = root.join("state-home"); let sandbox = root.join("sandbox"); @@ -442,10 +442,9 @@ fn consult_is_one_shot_and_hides_session_state() -> TestResult { let cwd_probe_error_file = root.join("cwd-write-probe.err"); let credential_probe_file = root.join("credential-write-probe.txt"); let credential_probe_error_file = root.join("credential-write-probe.err"); - let first_observed_session = "81f218eb-568b-409b-871b-f6e86d8f666f"; - let second_observed_session = "dbd3b6c2-4757-4b45-a8f0-f3d877e1a13f"; + let observed_session = "81f218eb-568b-409b-871b-f6e86d8f666f"; write_fake_claude_script(&fake_claude)?; - write_fake_claude_json_success(&stdout_file, "oracle", first_observed_session, "uuid-123")?; + write_fake_claude_json_success(&stdout_file, "oracle", observed_session, "uuid-123")?; let claude_bin = fake_claude.display().to_string(); let stdout_path = stdout_file.display().to_string(); @@ -490,80 +489,31 @@ fn consult_is_one_shot_and_hides_session_state() -> TestResult { "consult", json!({ "prompt": "say oracle", - "cwd": sandbox.display().to_string(), - "session_id": "not-a-uuid", - "background": true + "cwd": sandbox.display().to_string() }), )?; 
assert_tool_ok(&consult); assert_eq!(tool_content(&consult)["response"].as_str(), Some("oracle")); - assert!(tool_content(&consult)["context_mode"].is_null()); - assert!(tool_content(&consult)["planned_session_id"].is_null()); - assert!(tool_content(&consult)["reused_session_id"].is_null()); - assert!(tool_content(&consult)["observed_session_id"].is_null()); - assert!(tool_content(&consult)["session_id"].is_null()); - let first_args = must(fs::read_to_string(&args_file), "read first fake args file")?; - let first_lines = first_args.lines().collect::<Vec<_>>(); - assert!(first_lines.contains(&"--session-id")); - assert!(!first_args.contains("--resume")); - assert!(!first_args.contains("not-a-uuid")); - let first_session_id = must_some( - first_lines - .windows(2) - .find_map(|window| (window[0] == "--session-id").then_some(window[1].to_owned())), - "first one-shot session id", - )?; - assert!(uuid::Uuid::parse_str(&first_session_id).is_ok()); - - write_fake_claude_json_success( - &stdout_file, - "oracle again", - second_observed_session, - "uuid-124", - )?; - let repeated = harness.call_tool( - 4, - "consult", - json!({ - "prompt": "say oracle again", - "cwd": sandbox.display().to_string() - }), - )?; - assert_tool_ok(&repeated); - assert_eq!( - tool_content(&repeated)["response"].as_str(), - Some("oracle again") - ); - assert!(tool_content(&repeated)["context_mode"].is_null()); - assert!(tool_content(&repeated)["planned_session_id"].is_null()); - assert!(tool_content(&repeated)["reused_session_id"].is_null()); - assert!(tool_content(&repeated)["observed_session_id"].is_null()); - assert!(tool_content(&repeated)["session_id"].is_null()); - let repeated_args = must( - fs::read_to_string(&args_file), - "read repeated fake args file", - )?; - let repeated_lines = repeated_args.lines().collect::<Vec<_>>(); - assert!(repeated_lines.contains(&"--session-id")); - assert!(!repeated_args.contains("--resume")); - let repeated_session_id = must_some( - repeated_lines + let args = 
must(fs::read_to_string(&args_file), "read fake args file")?; + let lines = args.lines().collect::<Vec<_>>(); + assert!(lines.contains(&"--session-id")); + assert!(!args.contains("--resume")); + let session_id = must_some( + lines .windows(2) .find_map(|window| (window[0] == "--session-id").then_some(window[1].to_owned())), - "repeated one-shot session id", + "one-shot session id", )?; - assert!(uuid::Uuid::parse_str(&repeated_session_id).is_ok()); - assert_ne!(repeated_session_id, first_session_id); + assert!(uuid::Uuid::parse_str(&session_id).is_ok()); let persisted_output_path = must_some( - tool_content(&repeated)["persisted_output_path"] + tool_content(&consult)["persisted_output_path"] .as_str() .map(str::to_owned), "persisted output path", )?; assert!(persisted_output_path.starts_with("/tmp/phone_opus-consults/")); - assert!(!persisted_output_path.contains(first_observed_session)); - assert!(!persisted_output_path.contains(second_observed_session)); + assert!(!persisted_output_path.contains(observed_session)); let persisted_output = must( fs::read_to_string(&persisted_output_path), "read persisted consult output", @@ -572,40 +522,11 @@ fn consult_is_one_shot_and_hides_session_state() -> TestResult { serde_json::from_str(&persisted_output), "parse persisted consult output", )?; - assert_eq!(persisted_output["response"].as_str(), Some("oracle again")); - assert!(persisted_output["context_mode"].is_null()); - assert!(persisted_output["planned_session_id"].is_null()); - assert!(persisted_output["reused_session_id"].is_null()); - assert!(persisted_output["session_id"].is_null()); - assert!(persisted_output["observed_session_id"].is_null()); - - let consult_context_index = must( - fs::read_to_string( - state_home - .join("phone_opus") - .join("mcp") - .join("consult_contexts.json"), - ), - "read consult context index", - )?; - let consult_context_index: Value = must( - serde_json::from_str(&consult_context_index), - "parse consult context index", - )?; - assert_eq!( - 
consult_context_index["by_cwd"][sandbox.display().to_string()]["session_id"].as_str(), - Some(second_observed_session) - ); - assert_eq!( - consult_context_index["by_cwd"][sandbox.display().to_string()]["state"].as_str(), - Some("confirmed") - ); + assert_eq!(persisted_output["response"].as_str(), Some("oracle")); let pwd = must(fs::read_to_string(&pwd_file), "read fake pwd file")?; assert_eq!(pwd.trim(), sandbox.display().to_string()); - let args = must(fs::read_to_string(&args_file), "read fake args file")?; - let lines = args.lines().collect::<Vec<_>>(); assert!(lines.contains(&"-p")); assert!(lines.contains(&"--output-format")); assert!(lines.contains(&"json")); @@ -628,7 +549,7 @@ fn consult_is_one_shot_and_hides_session_state() -> TestResult { assert!(!lines.contains(&"--max-turns")); assert!(args.contains(PROMPT_PREFIX)); let prefix_index = must_some(args.find(PROMPT_PREFIX), "prefixed consult prompt")?; - let user_prompt_index = must_some(args.find("say oracle again"), "user prompt inside args")?; + let user_prompt_index = must_some(args.find("say oracle"), "user prompt inside args")?; assert!(prefix_index < user_prompt_index); let env_dump = must(fs::read_to_string(&env_file), "read fake env file")?; @@ -802,63 +723,6 @@ fn transcript_progress_prevents_false_stall_timeout() -> TestResult { Ok(()) } -#[test] -fn background_surfaces_are_hidden_from_public_mcp() -> TestResult { - let root = temp_root("consult_hidden_background")?; - let state_home = root.join("state-home"); - must(fs::create_dir_all(&state_home), "create state home")?; - - let mut harness = McpHarness::spawn(&state_home, &[])?; - let _ = harness.initialize()?; - harness.notify_initialized()?; - - let consult_job = harness.call_tool( - 3, - "consult_job", - json!({ - "job_id": "00000000-0000-0000-0000-000000000000" - }), - )?; - assert_tool_error(&consult_job); - assert!( - consult_job["result"]["content"] - .as_array() - .into_iter() - .flatten() - .filter_map(|entry| entry["text"].as_str()) - 
.any(|text| text.contains("unknown tool `consult_job`")) - ); - - let consult_wait = harness.call_tool( - 4, - "consult_wait", - json!({ - "job_id": "00000000-0000-0000-0000-000000000000" - }), - )?; - assert_tool_error(&consult_wait); - assert!( - consult_wait["result"]["content"] - .as_array() - .into_iter() - .flatten() - .filter_map(|entry| entry["text"].as_str()) - .any(|text| text.contains("unknown tool `consult_wait`")) - ); - - let consult_jobs = harness.call_tool(5, "consult_jobs", json!({}))?; - assert_tool_error(&consult_jobs); - assert!( - consult_jobs["result"]["content"] - .as_array() - .into_iter() - .flatten() - .filter_map(|entry| entry["text"].as_str()) - .any(|text| text.contains("unknown tool `consult_jobs`")) - ); - Ok(()) -} - #[test] fn consult_surfaces_downstream_cli_failures() -> TestResult { let root = temp_root("consult_failure")?; @@ -896,12 +760,11 @@ fn consult_surfaces_downstream_cli_failures() -> TestResult { tool_content(&consult)["context"]["consult"]["cwd"].as_str(), Some(std::env::current_dir()?.display().to_string().as_str()) ); - assert!(tool_content(&consult)["context"]["consult"]["planned_session_id"].is_null()); Ok(()) } #[test] -fn quota_failures_hide_session_state_on_public_surface() -> TestResult { +fn quota_failures_surface_reset_hint() -> TestResult { let root = temp_root("consult_quota_failure")?; let state_home = root.join("state-home"); let sandbox = root.join("sandbox"); @@ -912,34 +775,10 @@ fn quota_failures_hide_session_state_on_public_surface() -> TestResult { seed_caller_claude_home(&caller_home)?; let fake_claude = root.join("claude"); - let stdout_file = root.join("stdout.json"); - let remembered_session = "84b9d462-5af9-4a4e-8e44-379a8d0c46d7"; write_fake_claude_script(&fake_claude)?; - write_fake_claude_json_success(&stdout_file, "ok", remembered_session, "uuid-remembered")?; let claude_bin = fake_claude.display().to_string(); - let stdout_path = stdout_file.display().to_string(); let caller_home_path = 
caller_home.display().to_string(); - let env = [ - ("HOME", caller_home_path.as_str()), - ("PHONE_OPUS_CLAUDE_BIN", claude_bin.as_str()), - ("PHONE_OPUS_TEST_STDOUT_FILE", stdout_path.as_str()), - ]; - let mut harness = McpHarness::spawn(&state_home, &env)?; - let _ = harness.initialize()?; - harness.notify_initialized()?; - - let first = harness.call_tool( - 3, - "consult", - json!({ - "prompt": "seed remembered session", - "cwd": sandbox.display().to_string() - }), - )?; - assert_tool_ok(&first); - assert!(tool_content(&first)["session_id"].is_null()); - let quota_env = [ ("HOME", caller_home_path.as_str()), ("PHONE_OPUS_CLAUDE_BIN", claude_bin.as_str()), @@ -949,7 +788,6 @@ fn quota_failures_hide_session_state_on_public_surface() -> TestResult { "You've hit your limit · resets 4pm (America/New_York)", ), ]; - drop(harness); let mut harness = McpHarness::spawn(&state_home, &quota_env)?; let _ = harness.initialize()?; harness.notify_initialized()?; @@ -971,11 +809,6 @@ fn quota_failures_hide_session_state_on_public_surface() -> TestResult { tool_content(&failed)["context"]["consult"]["cwd"].as_str(), Some(sandbox.display().to_string().as_str()) ); - assert!(tool_content(&failed)["context"]["consult"]["context_mode"].is_null()); - assert!(tool_content(&failed)["context"]["consult"]["planned_session_id"].is_null()); - assert!(tool_content(&failed)["context"]["consult"]["reused_session_id"].is_null()); - assert!(tool_content(&failed)["context"]["consult"]["observed_session_id"].is_null()); - assert!(tool_content(&failed)["context"]["consult"]["resume_session_id"].is_null()); assert_eq!( tool_content(&failed)["context"]["consult"]["quota_limited"].as_bool(), Some(true) @@ -1008,96 +841,6 @@ fn quota_failures_hide_session_state_on_public_surface() -> TestResult { Ok(()) } -#[test] -fn fresh_failures_keep_internal_session_state_without_public_leakage() -> TestResult { - let root = temp_root("consult_fresh_json_failure")?; - let state_home = root.join("state-home"); - let 
sandbox = root.join("sandbox"); - let caller_home = root.join("caller-home"); - let fake_claude = root.join("claude"); - let stdout_file = root.join("stdout.json"); - let args_file = root.join("args.txt"); - must(fs::create_dir_all(&state_home), "create state home")?; - must(fs::create_dir_all(&sandbox), "create sandbox")?; - must(fs::create_dir_all(&caller_home), "create caller home")?; - seed_caller_claude_home(&caller_home)?; - write_fake_claude_script(&fake_claude)?; - must(fs::write(&stdout_file, ""), "write empty fake stdout")?; - - let claude_bin = fake_claude.display().to_string(); - let stdout_path = stdout_file.display().to_string(); - let args_path = args_file.display().to_string(); - let caller_home_path = caller_home.display().to_string(); - let env = [ - ("HOME", caller_home_path.as_str()), - ("PHONE_OPUS_CLAUDE_BIN", claude_bin.as_str()), - ("PHONE_OPUS_TEST_STDOUT_FILE", stdout_path.as_str()), - ("PHONE_OPUS_TEST_ARGS_FILE", args_path.as_str()), - ("PHONE_OPUS_TEST_EXIT_CODE", "17"), - ( - "PHONE_OPUS_TEST_STDERR", - "You've hit your limit · resets 9pm (America/New_York)", - ), - ]; - let mut harness = McpHarness::spawn(&state_home, &env)?; - let _ = harness.initialize()?; - harness.notify_initialized()?; - - let failed = harness.call_tool( - 3, - "consult", - json!({ - "prompt": "fresh expensive audit", - "cwd": sandbox.display().to_string() - }), - )?; - assert_tool_error(&failed); - assert!(tool_content(&failed)["context"]["consult"]["context_mode"].is_null()); - assert!(tool_content(&failed)["context"]["consult"]["observed_session_id"].is_null()); - assert!(tool_content(&failed)["context"]["consult"]["resume_session_id"].is_null()); - assert_eq!( - tool_content(&failed)["context"]["consult"]["quota_reset_hint"].as_str(), - Some("9pm (America/New_York)") - ); - assert!(tool_content(&failed)["context"]["consult"]["planned_session_id"].is_null()); - let args = must(fs::read_to_string(&args_file), "read fresh failure args")?; - 
assert!(args.contains("--session-id")); - assert!(!args.contains("--resume")); - let planned_session_id = must_some( - args.lines() - .collect::<Vec<_>>() - .windows(2) - .find_map(|window| (window[0] == "--session-id").then_some(window[1].to_owned())), - "planned session id", - )?; - let consult_context_index = must( - fs::read_to_string( - state_home - .join("phone_opus") - .join("mcp") - .join("consult_contexts.json"), - ), - "read consult context index after failure", - )?; - let consult_context_index: Value = must( - serde_json::from_str(&consult_context_index), - "parse consult context index after failure", - )?; - assert_eq!( - consult_context_index["by_cwd"][sandbox.display().to_string()]["session_id"].as_str(), - Some(planned_session_id.as_str()) - ); - assert!( - failed["result"]["content"] - .as_array() - .into_iter() - .flatten() - .filter_map(|entry| entry["text"].as_str()) - .all(|text| !text.contains("session")) - ); - Ok(()) -} - #[test] fn consult_never_replays_after_worker_transport_failure() -> TestResult { let root = temp_root("consult_no_replay")?; -- cgit v1.2.3