diff --git a/rust/crates/runtime/src/lib.rs b/rust/crates/runtime/src/lib.rs index fb75d98..aae0979 100644 --- a/rust/crates/runtime/src/lib.rs +++ b/rust/crates/runtime/src/lib.rs @@ -17,6 +17,8 @@ pub mod mcp_tool_bridge; mod oauth; pub mod permission_enforcer; mod policy_engine; +||||||| f76311f +pub mod recovery_recipes; mod permissions; pub mod plugin_lifecycle; mod prompt; @@ -102,6 +104,11 @@ pub use plugin_lifecycle::{ DegradedMode, DiscoveryResult, PluginHealthcheck, PluginLifecycle, PluginLifecycleEvent, PluginState, ResourceInfo, ServerHealth, ServerStatus, ToolInfo, }; +||||||| f76311f +pub use recovery_recipes::{ + attempt_recovery, recipe_for, EscalationPolicy, FailureScenario, RecoveryContext, + RecoveryEvent, RecoveryRecipe, RecoveryResult, RecoveryStep, +}; pub use prompt::{ load_system_prompt, prepend_bullets, ContextFile, ProjectContext, PromptBuildError, SystemPromptBuilder, FRONTIER_MODEL_NAME, SYSTEM_PROMPT_DYNAMIC_BOUNDARY, diff --git a/rust/crates/runtime/src/recovery_recipes.rs b/rust/crates/runtime/src/recovery_recipes.rs new file mode 100644 index 0000000..0afee62 --- /dev/null +++ b/rust/crates/runtime/src/recovery_recipes.rs @@ -0,0 +1,554 @@ +//! Recovery recipes for common failure scenarios. +//! +//! Encodes known automatic recoveries for the six failure scenarios +//! listed in ROADMAP item 8, and enforces one automatic recovery +//! attempt before escalation. Each attempt is emitted as a structured +//! recovery event. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +/// The six failure scenarios that have known recovery recipes. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum FailureScenario { + TrustPromptUnresolved, + PromptMisdelivery, + StaleBranch, + CompileRedCrossCrate, + McpHandshakeFailure, + PartialPluginStartup, +} + +impl FailureScenario { + /// Returns all known failure scenarios. + #[must_use] + pub fn all() -> &'static [FailureScenario] { + &[ + Self::TrustPromptUnresolved, + Self::PromptMisdelivery, + Self::StaleBranch, + Self::CompileRedCrossCrate, + Self::McpHandshakeFailure, + Self::PartialPluginStartup, + ] + } +} + +impl std::fmt::Display for FailureScenario { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::TrustPromptUnresolved => write!(f, "trust_prompt_unresolved"), + Self::PromptMisdelivery => write!(f, "prompt_misdelivery"), + Self::StaleBranch => write!(f, "stale_branch"), + Self::CompileRedCrossCrate => write!(f, "compile_red_cross_crate"), + Self::McpHandshakeFailure => write!(f, "mcp_handshake_failure"), + Self::PartialPluginStartup => write!(f, "partial_plugin_startup"), + } + } +} + +/// Individual step that can be executed as part of a recovery recipe. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RecoveryStep { + AcceptTrustPrompt, + RedirectPromptToAgent, + RebaseBranch, + CleanBuild, + RetryMcpHandshake { timeout: u64 }, + RestartPlugin { name: String }, + EscalateToHuman { reason: String }, +} + +/// Policy governing what happens when automatic recovery is exhausted. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum EscalationPolicy { + AlertHuman, + LogAndContinue, + Abort, +} + +/// A recovery recipe encodes the sequence of steps to attempt for a +/// given failure scenario, along with the maximum number of automatic +/// attempts and the escalation policy. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct RecoveryRecipe { + pub scenario: FailureScenario, + pub steps: Vec, + pub max_attempts: u32, + pub escalation_policy: EscalationPolicy, +} + +/// Outcome of a recovery attempt. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RecoveryResult { + Recovered { + steps_taken: u32, + }, + PartialRecovery { + recovered: Vec, + remaining: Vec, + }, + EscalationRequired { + reason: String, + }, +} + +/// Structured event emitted during recovery. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RecoveryEvent { + RecoveryAttempted { + scenario: FailureScenario, + recipe: RecoveryRecipe, + result: RecoveryResult, + }, + RecoverySucceeded, + RecoveryFailed, + Escalated, +} + +/// Minimal context for tracking recovery state and emitting events. +/// +/// Holds per-scenario attempt counts, a structured event log, and an +/// optional simulation knob for controlling step outcomes during tests. +#[derive(Debug, Clone, Default)] +pub struct RecoveryContext { + attempts: HashMap, + events: Vec, + /// Optional step index at which simulated execution fails. + /// `None` means all steps succeed. + fail_at_step: Option, +} + +impl RecoveryContext { + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Configure a step index at which simulated execution will fail. + #[must_use] + pub fn with_fail_at_step(mut self, index: usize) -> Self { + self.fail_at_step = Some(index); + self + } + + /// Returns the structured event log populated during recovery. + #[must_use] + pub fn events(&self) -> &[RecoveryEvent] { + &self.events + } + + /// Returns the number of recovery attempts made for a scenario. + #[must_use] + pub fn attempt_count(&self, scenario: &FailureScenario) -> u32 { + self.attempts.get(scenario).copied().unwrap_or(0) + } +} + +/// Returns the known recovery recipe for the given failure scenario. +#[must_use] +pub fn recipe_for(scenario: &FailureScenario) -> RecoveryRecipe { + match scenario { + FailureScenario::TrustPromptUnresolved => RecoveryRecipe { + scenario: *scenario, + steps: vec![RecoveryStep::AcceptTrustPrompt], + max_attempts: 1, + escalation_policy: EscalationPolicy::AlertHuman, + }, + FailureScenario::PromptMisdelivery => RecoveryRecipe { + scenario: *scenario, + steps: vec![RecoveryStep::RedirectPromptToAgent], + max_attempts: 1, + escalation_policy: EscalationPolicy::AlertHuman, + }, + FailureScenario::StaleBranch => RecoveryRecipe { + scenario: *scenario, + steps: vec![RecoveryStep::RebaseBranch, RecoveryStep::CleanBuild], + max_attempts: 1, + escalation_policy: EscalationPolicy::AlertHuman, + }, + FailureScenario::CompileRedCrossCrate => RecoveryRecipe { + scenario: *scenario, + steps: vec![RecoveryStep::CleanBuild], + max_attempts: 1, + escalation_policy: EscalationPolicy::AlertHuman, + }, + FailureScenario::McpHandshakeFailure => RecoveryRecipe { + scenario: *scenario, + steps: vec![RecoveryStep::RetryMcpHandshake { timeout: 5000 }], + max_attempts: 1, + escalation_policy: EscalationPolicy::Abort, + }, + FailureScenario::PartialPluginStartup => RecoveryRecipe { + scenario: *scenario, + steps: vec![ + RecoveryStep::RestartPlugin { + name: "stalled".to_string(), + }, + RecoveryStep::RetryMcpHandshake { timeout: 3000 }, + ], + max_attempts: 1, + escalation_policy: EscalationPolicy::LogAndContinue, + }, + } +} + +/// Attempts automatic recovery for the given failure scenario. +/// +/// Looks up the recipe, enforces the one-attempt-before-escalation +/// policy, simulates step execution (controlled by the context), and +/// emits structured [`RecoveryEvent`]s for every attempt. +pub fn attempt_recovery(scenario: &FailureScenario, ctx: &mut RecoveryContext) -> RecoveryResult { + let recipe = recipe_for(scenario); + let attempt_count = ctx.attempts.entry(*scenario).or_insert(0); + + // Enforce one automatic recovery attempt before escalation. + if *attempt_count >= recipe.max_attempts { + let result = RecoveryResult::EscalationRequired { + reason: format!( + "max recovery attempts ({}) exceeded for {}", + recipe.max_attempts, scenario + ), + }; + ctx.events.push(RecoveryEvent::RecoveryAttempted { + scenario: *scenario, + recipe, + result: result.clone(), + }); + ctx.events.push(RecoveryEvent::Escalated); + return result; + } + + *attempt_count += 1; + + // Execute steps, honoring the optional fail_at_step simulation. + let fail_index = ctx.fail_at_step; + let mut executed = Vec::new(); + let mut failed = false; + + for (i, step) in recipe.steps.iter().enumerate() { + if fail_index == Some(i) { + failed = true; + break; + } + executed.push(step.clone()); + } + + let result = if failed { + let remaining: Vec = recipe.steps[executed.len()..].to_vec(); + if executed.is_empty() { + RecoveryResult::EscalationRequired { + reason: format!("recovery failed at first step for {}", scenario), + } + } else { + RecoveryResult::PartialRecovery { + recovered: executed, + remaining, + } + } + } else { + RecoveryResult::Recovered { + steps_taken: recipe.steps.len() as u32, + } + }; + + // Emit the attempt as structured event data. + ctx.events.push(RecoveryEvent::RecoveryAttempted { + scenario: *scenario, + recipe, + result: result.clone(), + }); + + match &result { + RecoveryResult::Recovered { .. } => { + ctx.events.push(RecoveryEvent::RecoverySucceeded); + } + RecoveryResult::PartialRecovery { .. } => { + ctx.events.push(RecoveryEvent::RecoveryFailed); + } + RecoveryResult::EscalationRequired { .. } => { + ctx.events.push(RecoveryEvent::Escalated); + } + } + + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn each_scenario_has_a_matching_recipe() { + // given + let scenarios = FailureScenario::all(); + + // when / then + for scenario in scenarios { + let recipe = recipe_for(scenario); + assert_eq!( + recipe.scenario, *scenario, + "recipe scenario should match requested scenario" + ); + assert!( + !recipe.steps.is_empty(), + "recipe for {} should have at least one step", + scenario + ); + assert!( + recipe.max_attempts >= 1, + "recipe for {} should allow at least one attempt", + scenario + ); + } + } + + #[test] + fn successful_recovery_returns_recovered_and_emits_events() { + // given + let mut ctx = RecoveryContext::new(); + let scenario = FailureScenario::TrustPromptUnresolved; + + // when + let result = attempt_recovery(&scenario, &mut ctx); + + // then + assert_eq!(result, RecoveryResult::Recovered { steps_taken: 1 }); + assert_eq!(ctx.events().len(), 2); + assert!(matches!( + &ctx.events()[0], + RecoveryEvent::RecoveryAttempted { + scenario: s, + result: r, + .. + } if *s == FailureScenario::TrustPromptUnresolved + && matches!(r, RecoveryResult::Recovered { steps_taken: 1 }) + )); + assert_eq!(ctx.events()[1], RecoveryEvent::RecoverySucceeded); + } + + #[test] + fn escalation_after_max_attempts_exceeded() { + // given + let mut ctx = RecoveryContext::new(); + let scenario = FailureScenario::PromptMisdelivery; + + // when — first attempt succeeds + let first = attempt_recovery(&scenario, &mut ctx); + assert!(matches!(first, RecoveryResult::Recovered { .. })); + + // when — second attempt should escalate + let second = attempt_recovery(&scenario, &mut ctx); + + // then + assert!( + matches!( + &second, + RecoveryResult::EscalationRequired { reason } + if reason.contains("max recovery attempts") + ), + "second attempt should require escalation, got: {second:?}" + ); + assert_eq!(ctx.attempt_count(&scenario), 1); + assert!(ctx + .events() + .iter() + .any(|e| matches!(e, RecoveryEvent::Escalated))); + } + + #[test] + fn partial_recovery_when_step_fails_midway() { + // given — PartialPluginStartup has two steps; fail at step index 1 + let mut ctx = RecoveryContext::new().with_fail_at_step(1); + let scenario = FailureScenario::PartialPluginStartup; + + // when + let result = attempt_recovery(&scenario, &mut ctx); + + // then + match &result { + RecoveryResult::PartialRecovery { + recovered, + remaining, + } => { + assert_eq!(recovered.len(), 1, "one step should have succeeded"); + assert_eq!(remaining.len(), 1, "one step should remain"); + assert!(matches!(recovered[0], RecoveryStep::RestartPlugin { .. })); + assert!(matches!( + remaining[0], + RecoveryStep::RetryMcpHandshake { .. } + )); + } + other => panic!("expected PartialRecovery, got {other:?}"), + } + assert!(ctx + .events() + .iter() + .any(|e| matches!(e, RecoveryEvent::RecoveryFailed))); + } + + #[test] + fn first_step_failure_escalates_immediately() { + // given — fail at step index 0 + let mut ctx = RecoveryContext::new().with_fail_at_step(0); + let scenario = FailureScenario::CompileRedCrossCrate; + + // when + let result = attempt_recovery(&scenario, &mut ctx); + + // then + assert!( + matches!( + &result, + RecoveryResult::EscalationRequired { reason } + if reason.contains("failed at first step") + ), + "zero-step failure should escalate, got: {result:?}" + ); + assert!(ctx + .events() + .iter() + .any(|e| matches!(e, RecoveryEvent::Escalated))); + } + + #[test] + fn emitted_events_include_structured_attempt_data() { + // given + let mut ctx = RecoveryContext::new(); + let scenario = FailureScenario::McpHandshakeFailure; + + // when + let _ = attempt_recovery(&scenario, &mut ctx); + + // then — verify the RecoveryAttempted event carries full context + let attempted = ctx + .events() + .iter() + .find(|e| matches!(e, RecoveryEvent::RecoveryAttempted { .. })) + .expect("should have emitted RecoveryAttempted event"); + + match attempted { + RecoveryEvent::RecoveryAttempted { + scenario: s, + recipe, + result, + } => { + assert_eq!(*s, scenario); + assert_eq!(recipe.scenario, scenario); + assert!(!recipe.steps.is_empty()); + assert!(matches!(result, RecoveryResult::Recovered { .. })); + } + _ => unreachable!(), + } + + // Verify the event is serializable as structured JSON + let json = serde_json::to_string(&ctx.events()[0]) + .expect("recovery event should be serializable to JSON"); + assert!( + json.contains("mcp_handshake_failure"), + "serialized event should contain scenario name" + ); + } + + #[test] + fn recovery_context_tracks_attempts_per_scenario() { + // given + let mut ctx = RecoveryContext::new(); + + // when + assert_eq!(ctx.attempt_count(&FailureScenario::StaleBranch), 0); + attempt_recovery(&FailureScenario::StaleBranch, &mut ctx); + + // then + assert_eq!(ctx.attempt_count(&FailureScenario::StaleBranch), 1); + assert_eq!(ctx.attempt_count(&FailureScenario::PromptMisdelivery), 0); + } + + #[test] + fn stale_branch_recipe_has_rebase_then_clean_build() { + // given + let recipe = recipe_for(&FailureScenario::StaleBranch); + + // then + assert_eq!(recipe.steps.len(), 2); + assert_eq!(recipe.steps[0], RecoveryStep::RebaseBranch); + assert_eq!(recipe.steps[1], RecoveryStep::CleanBuild); + } + + #[test] + fn partial_plugin_startup_recipe_has_restart_then_handshake() { + // given + let recipe = recipe_for(&FailureScenario::PartialPluginStartup); + + // then + assert_eq!(recipe.steps.len(), 2); + assert!(matches!( + recipe.steps[0], + RecoveryStep::RestartPlugin { .. } + )); + assert!(matches!( + recipe.steps[1], + RecoveryStep::RetryMcpHandshake { timeout: 3000 } + )); + assert_eq!(recipe.escalation_policy, EscalationPolicy::LogAndContinue); + } + + #[test] + fn failure_scenario_display_all_variants() { + // given + let cases = [ + ( + FailureScenario::TrustPromptUnresolved, + "trust_prompt_unresolved", + ), + (FailureScenario::PromptMisdelivery, "prompt_misdelivery"), + (FailureScenario::StaleBranch, "stale_branch"), + ( + FailureScenario::CompileRedCrossCrate, + "compile_red_cross_crate", + ), + ( + FailureScenario::McpHandshakeFailure, + "mcp_handshake_failure", + ), + ( + FailureScenario::PartialPluginStartup, + "partial_plugin_startup", + ), + ]; + + // when / then + for (scenario, expected) in &cases { + assert_eq!(scenario.to_string(), *expected); + } + } + + #[test] + fn multi_step_success_reports_correct_steps_taken() { + // given — StaleBranch has 2 steps, no simulated failure + let mut ctx = RecoveryContext::new(); + let scenario = FailureScenario::StaleBranch; + + // when + let result = attempt_recovery(&scenario, &mut ctx); + + // then + assert_eq!(result, RecoveryResult::Recovered { steps_taken: 2 }); + } + + #[test] + fn mcp_handshake_recipe_uses_abort_escalation_policy() { + // given + let recipe = recipe_for(&FailureScenario::McpHandshakeFailure); + + // then + assert_eq!(recipe.escalation_policy, EscalationPolicy::Abort); + assert_eq!(recipe.max_attempts, 1); + } +}