swarm repositories / source
about | summary | refs | log | tree | commit | diff
path: root/crates/phone-opus/tests
diff options
context:
space:
mode:
author: main <main@swarm.moe> 2026-03-24 13:17:59 -0400
committer: main <main@swarm.moe> 2026-03-24 13:17:59 -0400
commit: 53797d1f9bbaf73778cbb9dd6ad2f857ba1a88e2 (patch)
tree: 69b17b86e72b5f292bde42adf839a8ed8cf8005c /crates/phone-opus/tests
parent: 690b4851ea0afd8b214ddaa5450eec3a8c3a7ec9 (diff)
download: phone_opus-53797d1f9bbaf73778cbb9dd6ad2f857ba1a88e2.zip
Reuse consult context per cwd by default
Diffstat (limited to 'crates/phone-opus/tests')
-rw-r--r-- crates/phone-opus/tests/mcp_hardening.rs 261
1 files changed, 171 insertions, 90 deletions
diff --git a/crates/phone-opus/tests/mcp_hardening.rs b/crates/phone-opus/tests/mcp_hardening.rs
index 06861f8..f6e0e73 100644
--- a/crates/phone-opus/tests/mcp_hardening.rs
+++ b/crates/phone-opus/tests/mcp_hardening.rs
@@ -280,6 +280,40 @@ fn seed_caller_claude_home(home: &Path) -> TestResult {
Ok(())
}
+fn write_fake_claude_stdout(path: &Path, result: &str, session_id: &str, uuid: &str) -> TestResult {
+ must(
+ fs::write(
+ path,
+ serde_json::to_string(&json!({
+ "type": "result",
+ "subtype": "success",
+ "is_error": false,
+ "duration_ms": 1234,
+ "duration_api_ms": 1200,
+ "num_turns": 2,
+ "result": result,
+ "stop_reason": "end_turn",
+ "session_id": session_id,
+ "total_cost_usd": 0.125,
+ "usage": {
+ "input_tokens": 10,
+ "output_tokens": 5
+ },
+ "modelUsage": {
+ "claude-opus-4-6": {
+ "inputTokens": 10,
+ "outputTokens": 5
+ }
+ },
+ "permission_denials": [],
+ "fast_mode_state": "off",
+ "uuid": uuid
+ }))?,
+ ),
+ "write fake stdout",
+ )
+}
+
#[test]
fn cold_start_exposes_consult_and_ops_tools() -> TestResult {
let root = temp_root("cold_start")?;
@@ -314,6 +348,14 @@ fn cold_start_exposes_consult_and_ops_tools() -> TestResult {
consult_tool["inputSchema"]["properties"]["background"].is_null(),
"consult schema should not advertise background: {consult_tool:#}"
);
+ assert!(
+ consult_tool["inputSchema"]["properties"]["session_id"].is_null(),
+ "consult schema should not advertise session_id: {consult_tool:#}"
+ );
+ assert_eq!(
+ consult_tool["inputSchema"]["properties"]["fresh_context"]["type"].as_str(),
+ Some("boolean")
+ );
let health = harness.call_tool(3, "health_snapshot", json!({}))?;
assert_tool_ok(&health);
@@ -322,14 +364,18 @@ fn cold_start_exposes_consult_and_ops_tools() -> TestResult {
}
#[test]
-fn consult_can_resume_a_prior_session_with_read_only_toolset_and_requested_working_directory()
--> TestResult {
+fn consult_reuses_context_per_cwd_by_default_and_fresh_context_opts_out() -> TestResult {
let root = temp_root("consult_success")?;
let state_home = root.join("state-home");
let sandbox = root.join("sandbox");
+ let sibling_sandbox = root.join("sibling-sandbox");
let caller_home = root.join("caller-home");
must(fs::create_dir_all(&state_home), "create state home")?;
must(fs::create_dir_all(&sandbox), "create sandbox")?;
+ must(
+ fs::create_dir_all(&sibling_sandbox),
+ "create sibling sandbox",
+ )?;
must(fs::create_dir_all(&caller_home), "create caller home")?;
seed_caller_claude_home(&caller_home)?;
@@ -342,39 +388,11 @@ fn consult_can_resume_a_prior_session_with_read_only_toolset_and_requested_worki
let cwd_probe_error_file = root.join("cwd-write-probe.err");
let credential_probe_file = root.join("credential-write-probe.txt");
let credential_probe_error_file = root.join("credential-write-probe.err");
- let resumed_session = "81f218eb-568b-409b-871b-f6e86d8f666f";
+ let remembered_session = "81f218eb-568b-409b-871b-f6e86d8f666f";
+ let fresh_session = "dbd3b6c2-4757-4b45-a8f0-f3d877e1a13f";
+ let sibling_session = "d9a9a472-a091-4268-a7dd-9f31cf61f87e";
write_fake_claude_script(&fake_claude)?;
- must(
- fs::write(
- &stdout_file,
- serde_json::to_string(&json!({
- "type": "result",
- "subtype": "success",
- "is_error": false,
- "duration_ms": 1234,
- "duration_api_ms": 1200,
- "num_turns": 2,
- "result": "oracle",
- "stop_reason": "end_turn",
- "session_id": resumed_session,
- "total_cost_usd": 0.125,
- "usage": {
- "input_tokens": 10,
- "output_tokens": 5
- },
- "modelUsage": {
- "claude-opus-4-6": {
- "inputTokens": 10,
- "outputTokens": 5
- }
- },
- "permission_denials": [],
- "fast_mode_state": "off",
- "uuid": "uuid-123"
- }))?,
- ),
- "write fake stdout",
- )?;
+ write_fake_claude_stdout(&stdout_file, "oracle", remembered_session, "uuid-123")?;
let claude_bin = fake_claude.display().to_string();
let stdout_path = stdout_file.display().to_string();
@@ -420,43 +438,135 @@ fn consult_can_resume_a_prior_session_with_read_only_toolset_and_requested_worki
json!({
"prompt": "say oracle",
"cwd": sandbox.display().to_string(),
- "session_id": resumed_session,
+ "session_id": "not-a-uuid",
"background": true
}),
)?;
assert_tool_ok(&consult);
assert_eq!(tool_content(&consult)["response"].as_str(), Some("oracle"));
- assert!(tool_content(&consult)["mode"].is_null());
- assert!(tool_content(&consult)["job_id"].is_null());
assert_eq!(
- tool_content(&consult)["session_mode"].as_str(),
- Some("resumed")
+ tool_content(&consult)["context_mode"].as_str(),
+ Some("fresh")
);
+ assert!(tool_content(&consult)["reused_session_id"].is_null());
assert_eq!(
- tool_content(&consult)["requested_session_id"].as_str(),
- Some(resumed_session)
+ tool_content(&consult)["session_id"].as_str(),
+ Some(remembered_session)
);
+ let first_args = must(fs::read_to_string(&args_file), "read first fake args file")?;
+ assert!(!first_args.contains("--resume"));
+ assert!(!first_args.contains("not-a-uuid"));
+
+ write_fake_claude_stdout(
+ &stdout_file,
+ "oracle reused",
+ remembered_session,
+ "uuid-124",
+ )?;
+ let reused = harness.call_tool(
+ 4,
+ "consult",
+ json!({
+ "prompt": "say oracle reused",
+ "cwd": sandbox.display().to_string()
+ }),
+ )?;
+ assert_tool_ok(&reused);
assert_eq!(
- tool_content(&consult)["prompt_prefix_injected"].as_bool(),
- Some(true)
+ tool_content(&reused)["response"].as_str(),
+ Some("oracle reused")
);
assert_eq!(
- tool_content(&consult)["cwd"].as_str(),
- Some(sandbox.display().to_string().as_str())
+ tool_content(&reused)["context_mode"].as_str(),
+ Some("reused")
);
- assert_eq!(tool_content(&consult)["num_turns"].as_u64(), Some(2));
assert_eq!(
- tool_content(&consult)["session_id"].as_str(),
- Some(resumed_session)
+ tool_content(&reused)["reused_session_id"].as_str(),
+ Some(remembered_session)
+ );
+ let reused_args = must(fs::read_to_string(&args_file), "read reused fake args file")?;
+ assert!(reused_args.contains("--resume"));
+ assert!(reused_args.contains(remembered_session));
+
+ write_fake_claude_stdout(&stdout_file, "oracle fresh", fresh_session, "uuid-125")?;
+ let fresh = harness.call_tool(
+ 5,
+ "consult",
+ json!({
+ "prompt": "say oracle fresh",
+ "cwd": sandbox.display().to_string(),
+ "fresh_context": true
+ }),
+ )?;
+ assert_tool_ok(&fresh);
+ assert_eq!(
+ tool_content(&fresh)["response"].as_str(),
+ Some("oracle fresh")
);
+ assert_eq!(tool_content(&fresh)["context_mode"].as_str(), Some("fresh"));
+ assert!(tool_content(&fresh)["reused_session_id"].is_null());
+ let fresh_args = must(fs::read_to_string(&args_file), "read fresh fake args file")?;
+ assert!(!fresh_args.contains("--resume"));
+
+ write_fake_claude_stdout(
+ &stdout_file,
+ "oracle after fresh",
+ fresh_session,
+ "uuid-126",
+ )?;
+ let after_fresh = harness.call_tool(
+ 6,
+ "consult",
+ json!({
+ "prompt": "say oracle after fresh",
+ "cwd": sandbox.display().to_string()
+ }),
+ )?;
+ assert_tool_ok(&after_fresh);
+ assert_eq!(
+ tool_content(&after_fresh)["context_mode"].as_str(),
+ Some("reused")
+ );
+ assert_eq!(
+ tool_content(&after_fresh)["reused_session_id"].as_str(),
+ Some(fresh_session)
+ );
+ let after_fresh_args = must(
+ fs::read_to_string(&args_file),
+ "read after-fresh fake args file",
+ )?;
+ assert!(after_fresh_args.contains("--resume"));
+ assert!(after_fresh_args.contains(fresh_session));
+
+ write_fake_claude_stdout(&stdout_file, "oracle sibling", sibling_session, "uuid-127")?;
+ let sibling = harness.call_tool(
+ 7,
+ "consult",
+ json!({
+ "prompt": "say oracle sibling",
+ "cwd": sibling_sandbox.display().to_string()
+ }),
+ )?;
+ assert_tool_ok(&sibling);
+ assert_eq!(
+ tool_content(&sibling)["context_mode"].as_str(),
+ Some("fresh")
+ );
+ assert!(tool_content(&sibling)["reused_session_id"].is_null());
+ let sibling_args = must(
+ fs::read_to_string(&args_file),
+ "read sibling fake args file",
+ )?;
+ assert!(!sibling_args.contains("--resume"));
+
let persisted_output_path = must_some(
- tool_content(&consult)["persisted_output_path"]
+ tool_content(&after_fresh)["persisted_output_path"]
.as_str()
.map(str::to_owned),
"persisted output path",
)?;
assert!(persisted_output_path.starts_with("/tmp/phone_opus-consults/"));
- assert!(persisted_output_path.contains(resumed_session));
+ assert!(persisted_output_path.contains(fresh_session));
let persisted_output = must(
fs::read_to_string(&persisted_output_path),
"read persisted consult output",
@@ -465,14 +575,18 @@ fn consult_can_resume_a_prior_session_with_read_only_toolset_and_requested_worki
serde_json::from_str(&persisted_output),
"parse persisted consult output",
)?;
- assert_eq!(persisted_output["response"].as_str(), Some("oracle"));
assert_eq!(
- persisted_output["requested_session_id"].as_str(),
- Some(resumed_session)
+ persisted_output["response"].as_str(),
+ Some("oracle after fresh")
+ );
+ assert_eq!(persisted_output["context_mode"].as_str(), Some("reused"));
+ assert_eq!(
+ persisted_output["reused_session_id"].as_str(),
+ Some(fresh_session)
);
let pwd = must(fs::read_to_string(&pwd_file), "read fake pwd file")?;
- assert_eq!(pwd.trim(), sandbox.display().to_string());
+ assert_eq!(pwd.trim(), sibling_sandbox.display().to_string());
let args = must(fs::read_to_string(&args_file), "read fake args file")?;
let lines = args.lines().collect::<Vec<_>>();
@@ -493,13 +607,11 @@ fn consult_can_resume_a_prior_session_with_read_only_toolset_and_requested_worki
assert!(lines.contains(&"--dangerously-skip-permissions"));
assert!(!lines.contains(&"--permission-mode"));
assert!(!lines.contains(&"dontAsk"));
- assert!(lines.contains(&"--resume"));
- assert!(lines.contains(&resumed_session));
+ assert!(!lines.contains(&"--resume"));
assert!(!lines.contains(&"--max-turns"));
assert!(args.contains(PROMPT_PREFIX));
- assert!(args.contains("The real prompt follows."));
let prefix_index = must_some(args.find(PROMPT_PREFIX), "prefixed consult prompt")?;
- let user_prompt_index = must_some(args.find("say oracle"), "user prompt inside args")?;
+ let user_prompt_index = must_some(args.find("say oracle sibling"), "user prompt inside args")?;
assert!(prefix_index < user_prompt_index);
let env_dump = must(fs::read_to_string(&env_file), "read fake env file")?;
@@ -572,7 +684,7 @@ fn consult_can_resume_a_prior_session_with_read_only_toolset_and_requested_worki
)?;
assert_eq!(credential_probe.trim(), "write_succeeded");
- let telemetry = harness.call_tool(4, "telemetry_snapshot", json!({}))?;
+ let telemetry = harness.call_tool(8, "telemetry_snapshot", json!({}))?;
assert_tool_ok(&telemetry);
let hot_methods = tool_content(&telemetry)["hot_methods"]
.as_array()
@@ -644,37 +756,6 @@ fn background_surfaces_are_hidden_from_public_mcp() -> TestResult {
}
#[test]
-fn consult_rejects_invalid_session_handles() -> TestResult {
- let root = temp_root("consult_invalid_session")?;
- let state_home = root.join("state-home");
- must(fs::create_dir_all(&state_home), "create state home")?;
-
- let mut harness = McpHarness::spawn(&state_home, &[])?;
- let _ = harness.initialize()?;
- harness.notify_initialized()?;
-
- let consult = harness.call_tool(
- 3,
- "consult",
- json!({
- "prompt": "fail",
- "session_id": "not-a-uuid"
- }),
- )?;
- assert_tool_error(&consult);
- assert_eq!(
- tool_content(&consult)["fault"]["class"].as_str(),
- Some("protocol")
- );
- assert!(
- tool_content(&consult)["fault"]["detail"]
- .as_str()
- .is_some_and(|value| value.contains("session_id must be a valid UUID"))
- );
- Ok(())
-}
-
-#[test]
fn consult_surfaces_downstream_cli_failures() -> TestResult {
let root = temp_root("consult_failure")?;
let state_home = root.join("state-home");