omx(team): auto-checkpoint worker-3 [unknown]

This commit is contained in:
bellman
2026-05-14 17:40:29 +09:00
parent 6df60a4683
commit a6ee51baab

View File

@@ -122,13 +122,37 @@ pub enum StartupFailureClassification {
Unknown, Unknown,
} }
/// Health observation for a single subsystem, recorded as part of startup
/// timeout evidence.
///
/// Pairs the boolean health flag with a string summary. The summary is
/// currently a generated placeholder rather than the output of a real probe.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct StartupHealthSummary {
    /// `true` when the subsystem looked healthy at the moment of the timeout.
    pub healthy: bool,
    /// Placeholder/source string; kept stable until deeper transport and MCP
    /// probes are wired in.
    pub summary: String,
}

impl StartupHealthSummary {
    /// Builds a summary for the subsystem called `name` from an observed
    /// health flag.
    ///
    /// The resulting `summary` has the form `<name>_healthy_placeholder` or
    /// `<name>_unhealthy_placeholder`, depending on `healthy`.
    fn observed(name: &str, healthy: bool) -> Self {
        let status = match healthy {
            true => "healthy",
            false => "unhealthy",
        };
        let summary = format!("{name}_{status}_placeholder");
        Self { healthy, summary }
    }
}
/// Evidence bundle collected when worker startup times out without clear evidence. /// Evidence bundle collected when worker startup times out without clear evidence.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct StartupEvidenceBundle { pub struct StartupEvidenceBundle {
/// Last known worker lifecycle state before timeout /// Last known worker lifecycle state before timeout
pub last_lifecycle_state: WorkerStatus, pub last_lifecycle_state: WorkerStatus,
/// Timestamp of the last lifecycle state transition, unix epoch seconds
pub last_lifecycle_at: u64,
/// The pane/command that was being executed /// The pane/command that was being executed
pub pane_command: String, pub pane_command: String,
/// Timestamp when the pane/command snapshot was observed, unix epoch seconds
pub pane_observed_at: u64,
/// Timestamp when the worker command was started, unix epoch seconds
pub command_started_at: u64,
/// Timestamp when prompt was sent (if any), unix epoch seconds /// Timestamp when prompt was sent (if any), unix epoch seconds
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub prompt_sent_at: Option<u64>, pub prompt_sent_at: Option<u64>,
@@ -146,8 +170,12 @@ pub struct StartupEvidenceBundle {
pub tool_permission_allow_scope: Option<ToolPermissionAllowScope>, pub tool_permission_allow_scope: Option<ToolPermissionAllowScope>,
/// Transport health summary (true = healthy/responsive) /// Transport health summary (true = healthy/responsive)
pub transport_healthy: bool, pub transport_healthy: bool,
/// Typed transport health placeholder for future concrete probes
pub transport_health: StartupHealthSummary,
/// MCP health summary (true = all servers healthy) /// MCP health summary (true = all servers healthy)
pub mcp_healthy: bool, pub mcp_healthy: bool,
/// Typed MCP health placeholder for future concrete probes
pub mcp_health: StartupHealthSummary,
/// Seconds since worker creation /// Seconds since worker creation
pub elapsed_seconds: u64, pub elapsed_seconds: u64,
} }
@@ -225,6 +253,7 @@ pub struct Worker {
pub auto_recover_prompt_misdelivery: bool, pub auto_recover_prompt_misdelivery: bool,
pub prompt_delivery_attempts: u32, pub prompt_delivery_attempts: u32,
pub prompt_in_flight: bool, pub prompt_in_flight: bool,
pub prompt_sent_at: Option<u64>,
pub last_prompt: Option<String>, pub last_prompt: Option<String>,
pub expected_receipt: Option<WorkerTaskReceipt>, pub expected_receipt: Option<WorkerTaskReceipt>,
pub replay_prompt: Option<String>, pub replay_prompt: Option<String>,
@@ -274,6 +303,7 @@ impl WorkerRegistry {
auto_recover_prompt_misdelivery, auto_recover_prompt_misdelivery,
prompt_delivery_attempts: 0, prompt_delivery_attempts: 0,
prompt_in_flight: false, prompt_in_flight: false,
prompt_sent_at: None,
last_prompt: None, last_prompt: None,
expected_receipt: None, expected_receipt: None,
replay_prompt: None, replay_prompt: None,
@@ -528,6 +558,7 @@ impl WorkerRegistry {
worker.prompt_delivery_attempts += 1; worker.prompt_delivery_attempts += 1;
worker.prompt_in_flight = true; worker.prompt_in_flight = true;
worker.prompt_sent_at = Some(now_secs());
worker.last_prompt = Some(next_prompt.clone()); worker.last_prompt = Some(next_prompt.clone());
worker.expected_receipt = task_receipt; worker.expected_receipt = task_receipt;
worker.replay_prompt = None; worker.replay_prompt = None;
@@ -579,6 +610,7 @@ impl WorkerRegistry {
worker.last_error = None; worker.last_error = None;
worker.prompt_delivery_attempts = 0; worker.prompt_delivery_attempts = 0;
worker.prompt_in_flight = false; worker.prompt_in_flight = false;
worker.prompt_sent_at = None;
push_event( push_event(
worker, worker,
WorkerEventKind::Restarted, WorkerEventKind::Restarted,
@@ -696,12 +728,11 @@ impl WorkerRegistry {
// Build evidence bundle // Build evidence bundle
let evidence = StartupEvidenceBundle { let evidence = StartupEvidenceBundle {
last_lifecycle_state: worker.status, last_lifecycle_state: worker.status,
last_lifecycle_at: worker.updated_at,
pane_command: pane_command.to_string(), pane_command: pane_command.to_string(),
prompt_sent_at: if worker.prompt_delivery_attempts > 0 { pane_observed_at: now,
Some(worker.updated_at) command_started_at: worker.created_at,
} else { prompt_sent_at: worker.prompt_sent_at,
None
},
prompt_acceptance_state: worker.status == WorkerStatus::Running prompt_acceptance_state: worker.status == WorkerStatus::Running
&& !worker.prompt_in_flight, && !worker.prompt_in_flight,
trust_prompt_detected: worker trust_prompt_detected: worker
@@ -716,7 +747,9 @@ impl WorkerRegistry {
.map(|event| now.saturating_sub(event.timestamp)), .map(|event| now.saturating_sub(event.timestamp)),
tool_permission_allow_scope, tool_permission_allow_scope,
transport_healthy, transport_healthy,
transport_health: StartupHealthSummary::observed("transport", transport_healthy),
mcp_healthy, mcp_healthy,
mcp_health: StartupHealthSummary::observed("mcp", mcp_healthy),
elapsed_seconds: elapsed, elapsed_seconds: elapsed,
}; };