From f309ff8642c2e788cd7936ec1063f444fd671f3d Mon Sep 17 00:00:00 2001 From: Yeachan-Heo Date: Sun, 12 Apr 2026 07:00:07 +0000 Subject: [PATCH] Stop repo lanes from executing the wrong task payload The next repo-local sweep target was ROADMAP #71: a claw-code lane accepted an unrelated KakaoTalk/image-analysis prompt even though the lane itself was supposed to be repo-scoped work. This extends the existing prompt-misdelivery guardrail with an optional structured task receipt so worker boot can reject visible wrong-task context before the lane continues executing. Constraint: Keep the fix inside the existing worker_boot / WorkerSendPrompt control surface instead of inventing a new external OMX-only protocol Rejected: Treat wrong-task receipts as generic shell misdelivery | loses the expected-vs-observed task context needed to debug contaminated lanes Confidence: high Scope-risk: narrow Reversibility: clean Directive: If task-receipt fields change later, update the WorkerSendPrompt schema, worker payload serialization, and wrong-task regression together Tested: cargo fmt --all --check; cargo clippy --workspace --all-targets -- -D warnings; cargo test --workspace; architect review APPROVE Not-tested: External orchestrators that have not yet started populating the optional task_receipt field --- ROADMAP.md | 2 + rust/crates/runtime/src/worker_boot.rs | 176 +++++++++++++++++- .../crates/runtime/tests/integration_tests.rs | 2 +- rust/crates/tools/src/lib.rs | 27 ++- 4 files changed, 195 insertions(+), 12 deletions(-) diff --git a/ROADMAP.md b/ROADMAP.md index 7cbfffa..fcf5681 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -513,3 +513,5 @@ Model name prefix now wins unconditionally over env-var presence. Regression tes 69. **Lane stop summaries have no minimum quality floor** — **done (verified 2026-04-12):** completed lane persistence in `rust/crates/tools/src/lib.rs` now normalizes vague/control-only stop summaries into a contextual fallback that includes the lane target and status, while preserving structured metadata about whether the quality floor fired (`qualityFloorApplied`, `rawSummary`, `reasons`, `wordCount`). Regression coverage locks both the pass-through path for good summaries and the fallback path for mushy summaries like `commit push everyting, keep sweeping $ralph`. **Original filing below.** 70. **Install-source ambiguity misleads real users** — **done (verified 2026-04-12):** repo-local Rust guidance now makes the source of truth explicit in `claw doctor` and `claw --help`, naming `ultraworkers/claw-code` as the canonical repo and warning that `cargo install claw-code` installs a deprecated stub rather than the `claw` binary. Regression coverage locks both the new doctor JSON check and the help-text warning. **Original filing below.** + +71. **Wrong-task prompt receipt is not detected before execution** — **done (verified 2026-04-12):** worker boot prompt dispatch now accepts an optional structured `task_receipt` (`repo`, `task_kind`, `source_surface`, `expected_artifacts`, `objective_preview`) and treats mismatched visible prompt context as a `WrongTask` prompt-delivery failure before execution continues. The prompt-delivery payload now records `observed_prompt_preview` plus the expected receipt, and regression coverage locks both the existing shell/wrong-target paths and the new KakaoTalk-style wrong-task mismatch case. **Original filing below.** diff --git a/rust/crates/runtime/src/worker_boot.rs b/rust/crates/runtime/src/worker_boot.rs index d133bfa..a7eacf5 100644 --- a/rust/crates/runtime/src/worker_boot.rs +++ b/rust/crates/runtime/src/worker_boot.rs @@ -92,6 +92,7 @@ pub enum WorkerTrustResolution { pub enum WorkerPromptTarget { Shell, WrongTarget, + WrongTask, Unknown, } @@ -108,10 +109,24 @@ pub enum WorkerEventPayload { observed_target: WorkerPromptTarget, #[serde(skip_serializing_if = "Option::is_none")] observed_cwd: Option, + #[serde(skip_serializing_if = "Option::is_none")] + observed_prompt_preview: Option, + #[serde(skip_serializing_if = "Option::is_none")] + task_receipt: Option, recovery_armed: bool, }, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct WorkerTaskReceipt { + pub repo: String, + pub task_kind: String, + pub source_surface: String, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub expected_artifacts: Vec, + pub objective_preview: String, +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct WorkerEvent { pub seq: u64, @@ -134,6 +149,7 @@ pub struct Worker { pub prompt_delivery_attempts: u32, pub prompt_in_flight: bool, pub last_prompt: Option, + pub expected_receipt: Option, pub replay_prompt: Option, pub last_error: Option, pub created_at: u64, @@ -182,6 +198,7 @@ impl WorkerRegistry { prompt_delivery_attempts: 0, prompt_in_flight: false, last_prompt: None, + expected_receipt: None, replay_prompt: None, last_error: None, created_at: ts, @@ -257,6 +274,7 @@ impl WorkerRegistry { &lowered, worker.last_prompt.as_deref(), &worker.cwd, + worker.expected_receipt.as_ref(), ) }) .flatten() @@ -272,6 +290,10 @@ impl WorkerRegistry { "worker prompt landed in the wrong target instead of {}: {}", worker.cwd, prompt_preview ), + WorkerPromptTarget::WrongTask => format!( + "worker prompt receipt mismatched the expected task context for {}: {}", + worker.cwd, prompt_preview + ), WorkerPromptTarget::Unknown => format!( "worker prompt delivery failed before reaching coding agent: {prompt_preview}" ), @@ -291,6 +313,8 @@ impl WorkerRegistry { prompt_preview: prompt_preview.clone(), observed_target: observation.target, observed_cwd: observation.observed_cwd.clone(), + observed_prompt_preview: observation.observed_prompt_preview.clone(), + task_receipt: worker.expected_receipt.clone(), recovery_armed: false, }), ); @@ -306,6 +330,8 @@ impl WorkerRegistry { prompt_preview, observed_target: observation.target, observed_cwd: observation.observed_cwd, + observed_prompt_preview: observation.observed_prompt_preview, + task_receipt: worker.expected_receipt.clone(), recovery_armed: true, }), ); @@ -374,7 +400,12 @@ impl WorkerRegistry { Ok(worker.clone()) } - pub fn send_prompt(&self, worker_id: &str, prompt: Option<&str>) -> Result { + pub fn send_prompt( + &self, + worker_id: &str, + prompt: Option<&str>, + task_receipt: Option, + ) -> Result { let mut inner = self.inner.lock().expect("worker registry lock poisoned"); let worker = inner .workers @@ -398,6 +429,7 @@ impl WorkerRegistry { worker.prompt_delivery_attempts += 1; worker.prompt_in_flight = true; worker.last_prompt = Some(next_prompt.clone()); + worker.expected_receipt = task_receipt; worker.replay_prompt = None; worker.last_error = None; worker.status = WorkerStatus::Running; @@ -548,6 +580,7 @@ fn prompt_misdelivery_is_relevant(worker: &Worker) -> bool { struct PromptDeliveryObservation { target: WorkerPromptTarget, observed_cwd: Option, + observed_prompt_preview: Option, } fn push_event( @@ -699,6 +732,7 @@ fn detect_prompt_misdelivery( lowered: &str, prompt: Option<&str>, expected_cwd: &str, + expected_receipt: Option<&WorkerTaskReceipt>, ) -> Option { let Some(prompt) = prompt else { return None; @@ -713,12 +747,30 @@ fn detect_prompt_misdelivery( return None; } let prompt_visible = lowered.contains(&prompt_snippet); + let observed_prompt_preview = detect_prompt_echo(screen_text); + + if let Some(receipt) = expected_receipt { + let receipt_visible = task_receipt_visible(lowered, receipt); + let mismatched_prompt_visible = observed_prompt_preview + .as_deref() + .map(str::to_ascii_lowercase) + .is_some_and(|preview| !preview.contains(&prompt_snippet)); + + if (prompt_visible || mismatched_prompt_visible) && !receipt_visible { + return Some(PromptDeliveryObservation { + target: WorkerPromptTarget::WrongTask, + observed_cwd: detect_observed_shell_cwd(screen_text), + observed_prompt_preview, + }); + } + } if let Some(observed_cwd) = detect_observed_shell_cwd(screen_text) { if prompt_visible && !cwd_matches_observed_target(expected_cwd, &observed_cwd) { return Some(PromptDeliveryObservation { target: WorkerPromptTarget::WrongTarget, observed_cwd: Some(observed_cwd), + observed_prompt_preview, }); } } @@ -736,6 +788,7 @@ fn detect_prompt_misdelivery( (shell_error && prompt_visible).then_some(PromptDeliveryObservation { target: WorkerPromptTarget::Shell, observed_cwd: None, + observed_prompt_preview, }) } @@ -748,10 +801,38 @@ fn prompt_preview(prompt: &str) -> String { format!("{}…", preview.trim_end()) } +fn detect_prompt_echo(screen_text: &str) -> Option { + screen_text.lines().find_map(|line| { + line.trim_start() + .strip_prefix('›') + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(str::to_string) + }) +} + +fn task_receipt_visible(lowered_screen_text: &str, receipt: &WorkerTaskReceipt) -> bool { + let expected_tokens = [ + receipt.repo.to_ascii_lowercase(), + receipt.task_kind.to_ascii_lowercase(), + receipt.source_surface.to_ascii_lowercase(), + receipt.objective_preview.to_ascii_lowercase(), + ]; + + expected_tokens + .iter() + .all(|token| lowered_screen_text.contains(token)) + && receipt + .expected_artifacts + .iter() + .all(|artifact| lowered_screen_text.contains(&artifact.to_ascii_lowercase())) +} + fn prompt_misdelivery_detail(observation: &PromptDeliveryObservation) -> &'static str { match observation.target { WorkerPromptTarget::Shell => "shell misdelivery detected", WorkerPromptTarget::WrongTarget => "prompt landed in wrong target", + WorkerPromptTarget::WrongTask => "prompt receipt mismatched expected task context", WorkerPromptTarget::Unknown => "prompt delivery failure detected", } } @@ -865,7 +946,7 @@ mod tests { WorkerFailureKind::TrustGate ); - let send_before_resolve = registry.send_prompt(&worker.worker_id, Some("ship it")); + let send_before_resolve = registry.send_prompt(&worker.worker_id, Some("ship it"), None); assert!(send_before_resolve .expect_err("prompt delivery should be gated") .contains("not ready for prompt delivery")); @@ -905,7 +986,7 @@ mod tests { .expect("ready observe should succeed"); let running = registry - .send_prompt(&worker.worker_id, Some("Implement worker handshake")) + .send_prompt(&worker.worker_id, Some("Implement worker handshake"), None) .expect("prompt send should succeed"); assert_eq!(running.status, WorkerStatus::Running); assert_eq!(running.prompt_delivery_attempts, 1); @@ -941,6 +1022,8 @@ mod tests { prompt_preview: "Implement worker handshake".to_string(), observed_target: WorkerPromptTarget::Shell, observed_cwd: None, + observed_prompt_preview: None, + task_receipt: None, recovery_armed: false, }) ); @@ -956,12 +1039,14 @@ mod tests { prompt_preview: "Implement worker handshake".to_string(), observed_target: WorkerPromptTarget::Shell, observed_cwd: None, + observed_prompt_preview: None, + task_receipt: None, recovery_armed: true, }) ); let replayed = registry - .send_prompt(&worker.worker_id, None) + .send_prompt(&worker.worker_id, None, None) .expect("replay send should succeed"); assert_eq!(replayed.status, WorkerStatus::Running); assert!(replayed.replay_prompt.is_none()); @@ -976,7 +1061,11 @@ mod tests { .observe(&worker.worker_id, "Ready for input\n>") .expect("ready observe should succeed"); registry - .send_prompt(&worker.worker_id, Some("Run the worker bootstrap tests")) + .send_prompt( + &worker.worker_id, + Some("Run the worker bootstrap tests"), + None, + ) .expect("prompt send should succeed"); let recovered = registry @@ -1007,6 +1096,8 @@ mod tests { prompt_preview: "Run the worker bootstrap tests".to_string(), observed_target: WorkerPromptTarget::WrongTarget, observed_cwd: Some("/tmp/repo-target-b".to_string()), + observed_prompt_preview: None, + task_receipt: None, recovery_armed: false, }) ); @@ -1049,6 +1140,75 @@ mod tests { assert!(ready.last_error.is_none()); } + #[test] + fn wrong_task_receipt_mismatch_is_detected_before_execution_continues() { + let registry = WorkerRegistry::new(); + let worker = registry.create("/tmp/repo-task", &[], true); + registry + .observe(&worker.worker_id, "Ready for input\n>") + .expect("ready observe should succeed"); + registry + .send_prompt( + &worker.worker_id, + Some("Implement worker handshake"), + Some(WorkerTaskReceipt { + repo: "claw-code".to_string(), + task_kind: "repo_code".to_string(), + source_surface: "omx_team".to_string(), + expected_artifacts: vec!["patch".to_string(), "tests".to_string()], + objective_preview: "Implement worker handshake".to_string(), + }), + ) + .expect("prompt send should succeed"); + + let recovered = registry + .observe( + &worker.worker_id, + "› Explain this KakaoTalk screenshot for a friend\nI can help analyze the screenshot…", + ) + .expect("mismatch observe should succeed"); + + assert_eq!(recovered.status, WorkerStatus::ReadyForPrompt); + assert_eq!( + recovered + .last_error + .expect("mismatch error should exist") + .kind, + WorkerFailureKind::PromptDelivery + ); + let mismatch = recovered + .events + .iter() + .find(|event| event.kind == WorkerEventKind::PromptMisdelivery) + .expect("wrong-task event should exist"); + assert_eq!(mismatch.status, WorkerStatus::Failed); + assert_eq!( + mismatch.payload, + Some(WorkerEventPayload::PromptDelivery { + prompt_preview: "Implement worker handshake".to_string(), + observed_target: WorkerPromptTarget::WrongTask, + observed_cwd: None, + observed_prompt_preview: Some( + "Explain this KakaoTalk screenshot for a friend".to_string() + ), + task_receipt: Some(WorkerTaskReceipt { + repo: "claw-code".to_string(), + task_kind: "repo_code".to_string(), + source_surface: "omx_team".to_string(), + expected_artifacts: vec!["patch".to_string(), "tests".to_string()], + objective_preview: "Implement worker handshake".to_string(), + }), + recovery_armed: false, + }) + ); + let replay = recovered + .events + .iter() + .find(|event| event.kind == WorkerEventKind::PromptReplayArmed) + .expect("replay event should exist"); + assert_eq!(replay.status, WorkerStatus::ReadyForPrompt); + } + #[test] fn restart_and_terminate_reset_or_finish_worker() { let registry = WorkerRegistry::new(); @@ -1057,7 +1217,7 @@ mod tests { .observe(&worker.worker_id, "Ready for input\n>") .expect("ready observe should succeed"); registry - .send_prompt(&worker.worker_id, Some("Run tests")) + .send_prompt(&worker.worker_id, Some("Run tests"), None) .expect("prompt send should succeed"); let restarted = registry @@ -1086,7 +1246,7 @@ mod tests { .observe(&worker.worker_id, "Ready for input\n>") .expect("ready observe should succeed"); registry - .send_prompt(&worker.worker_id, Some("Run tests")) + .send_prompt(&worker.worker_id, Some("Run tests"), None) .expect("prompt send should succeed"); let failed = registry @@ -1163,7 +1323,7 @@ mod tests { .observe(&worker.worker_id, "Ready for input\n>") .expect("ready observe should succeed"); registry - .send_prompt(&worker.worker_id, Some("Run tests")) + .send_prompt(&worker.worker_id, Some("Run tests"), None) .expect("prompt send should succeed"); let finished = registry diff --git a/rust/crates/runtime/tests/integration_tests.rs b/rust/crates/runtime/tests/integration_tests.rs index 49c6636..cc7bd9c 100644 --- a/rust/crates/runtime/tests/integration_tests.rs +++ b/rust/crates/runtime/tests/integration_tests.rs @@ -304,7 +304,7 @@ fn worker_provider_failure_flows_through_recovery_to_policy() { .observe(&worker.worker_id, "Ready for your input\n>") .expect("ready observe should succeed"); registry - .send_prompt(&worker.worker_id, Some("Run analysis")) + .send_prompt(&worker.worker_id, Some("Run analysis"), None) .expect("prompt send should succeed"); // Session completes with provider failure (finish="unknown", tokens=0) diff --git a/rust/crates/tools/src/lib.rs b/rust/crates/tools/src/lib.rs index d843dcc..9155afe 100644 --- a/rust/crates/tools/src/lib.rs +++ b/rust/crates/tools/src/lib.rs @@ -20,7 +20,7 @@ use runtime::{ summary_compression::compress_summary_text, task_registry::TaskRegistry, team_cron_registry::{CronRegistry, TeamRegistry}, - worker_boot::{WorkerReadySnapshot, WorkerRegistry}, + worker_boot::{WorkerReadySnapshot, WorkerRegistry, WorkerTaskReceipt}, write_file, ApiClient, ApiRequest, AssistantEvent, BashCommandInput, BashCommandOutput, BranchFreshness, ConfigLoader, ContentBlock, ConversationMessage, ConversationRuntime, GrepSearchInput, LaneCommitProvenance, LaneEvent, LaneEventBlocker, LaneEventName, @@ -930,7 +930,22 @@ pub fn mvp_tool_specs() -> Vec { "type": "object", "properties": { "worker_id": { "type": "string" }, - "prompt": { "type": "string" } + "prompt": { "type": "string" }, + "task_receipt": { + "type": "object", + "properties": { + "repo": { "type": "string" }, + "task_kind": { "type": "string" }, + "source_surface": { "type": "string" }, + "expected_artifacts": { + "type": "array", + "items": { "type": "string" } + }, + "objective_preview": { "type": "string" } + }, + "required": ["repo", "task_kind", "source_surface", "objective_preview"], + "additionalProperties": false + } }, "required": ["worker_id"], "additionalProperties": false @@ -1522,7 +1537,11 @@ fn run_worker_await_ready(input: WorkerIdInput) -> Result { #[allow(clippy::needless_pass_by_value)] fn run_worker_send_prompt(input: WorkerSendPromptInput) -> Result { - let worker = global_worker_registry().send_prompt(&input.worker_id, input.prompt.as_deref())?; + let worker = global_worker_registry().send_prompt( + &input.worker_id, + input.prompt.as_deref(), + input.task_receipt, + )?; to_pretty_json(worker) } @@ -2439,6 +2458,8 @@ struct WorkerSendPromptInput { worker_id: String, #[serde(default)] prompt: Option, + #[serde(default)] + task_receipt: Option, } const fn default_auto_recover_prompt_misdelivery() -> bool {