mirror of
https://github.com/instructkr/claw-code.git
synced 2026-04-17 19:15:09 +08:00
All 23 stories (US-001 through US-023) are now complete. Updated status from "in_progress" to "completed". Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
342 lines
19 KiB
JSON
{
|
|
"version": "1.0",
|
|
"description": "Clawable Coding Harness - Clear roadmap stories and commit each",
|
|
"stories": [
|
|
{
|
|
"id": "US-001",
|
|
"title": "Phase 1.6 - startup-no-evidence evidence bundle + classifier",
|
|
"description": "When startup times out, emit a typed worker.startup_no_evidence event with an evidence bundle including the last known worker lifecycle state, pane command, prompt-send timestamp, prompt-acceptance state, trust-prompt detection result, and transport/MCP health summary. The classifier should then attempt to narrow the generic timeout down to a specific failure class, falling back to unknown.",
|
|
"acceptanceCriteria": [
|
|
"worker.startup_no_evidence event emitted on startup timeout with evidence bundle",
|
|
"Evidence bundle includes: last lifecycle state, pane command, prompt-send timestamp, prompt-acceptance state, trust-prompt detection, transport/MCP health",
|
|
"Classifier attempts to categorize into: trust_required, prompt_misdelivery, prompt_acceptance_timeout, transport_dead, worker_crashed, or unknown",
|
|
"Tests verify evidence bundle structure and classifier behavior"
|
|
],
|
|
"passes": true,
|
|
"priority": "P0"
|
|
},
|
|
{
|
|
"id": "US-002",
|
|
"title": "Phase 2 - Canonical lane event schema (4.x series)",
|
|
"description": "Define typed events for lane lifecycle: lane.started, lane.ready, lane.prompt_misdelivery, lane.blocked, lane.red, lane.green, lane.commit.created, lane.pr.opened, lane.merge.ready, lane.finished, lane.failed, branch.stale_against_main. Also implement event ordering, reconciliation, provenance, deduplication, and projection contracts.",
|
|
"acceptanceCriteria": [
|
|
"LaneEvent enum with all required variants defined",
|
|
"Event ordering with monotonic sequence metadata attached",
|
|
"Event provenance labels (live_lane, test, healthcheck, replay, transport)",
|
|
"Session identity completeness at creation (title, workspace, purpose)",
|
|
"Duplicate terminal-event suppression with fingerprinting",
|
|
"Lane ownership/scope binding in events",
|
|
"Nudge acknowledgment with dedupe contract",
|
|
"clawhip consumes typed lane events instead of pane scraping"
|
|
],
|
|
"passes": true,
|
|
"priority": "P0"
|
|
},
|
|
{
|
|
"id": "US-003",
|
|
"title": "Phase 3 - Stale-branch detection before broad verification",
|
|
"description": "Before broad test runs, compare current branch to main and detect if known fixes are missing. Emit branch.stale_against_main event and suggest/auto-run rebase/merge-forward.",
|
|
"acceptanceCriteria": [
|
|
"Branch freshness comparison against main implemented",
|
|
"branch.stale_against_main event emitted when behind",
|
|
"Auto-rebase/merge-forward policy integration",
|
|
"Avoid misclassifying stale-branch failures as new regressions"
|
|
],
|
|
"passes": true,
|
|
"priority": "P1"
|
|
},
|
|
{
|
|
"id": "US-004",
|
|
"title": "Phase 3 - Recovery recipes with ledger",
|
|
"description": "Encode automatic recoveries for common failures (trust prompt, prompt misdelivery, stale branch, compile red, MCP startup). Expose recovery attempt ledger with recipe id, attempt count, state, timestamps, failure summary.",
|
|
"acceptanceCriteria": [
|
|
"Recovery recipes defined for: trust_prompt_unresolved, prompt_delivered_to_shell, stale_branch, compile_red_after_refactor, MCP_handshake_failure, partial_plugin_startup",
|
|
"Recovery attempt ledger with: recipe id, attempt count, state, timestamps, failure summary, escalation reason",
|
|
"One automatic recovery attempt before escalation",
|
|
"Ledger emitted as structured event data"
|
|
],
|
|
"passes": true,
|
|
"priority": "P1"
|
|
},
|
|
{
|
|
"id": "US-005",
|
|
"title": "Phase 4 - Typed task packet format",
|
|
"description": "Define structured task packet with fields: objective, scope, repo/worktree, branch policy, acceptance tests, commit policy, reporting contract, escalation policy.",
|
|
"acceptanceCriteria": [
|
|
"TaskPacket struct with all required fields",
|
|
"TaskScope resolution (workspace/module/single-file/custom)",
|
|
"Validation and serialization support",
|
|
"Integration into tools/src/lib.rs"
|
|
],
|
|
"passes": true,
|
|
"priority": "P1"
|
|
},
|
|
{
|
|
"id": "US-006",
|
|
"title": "Phase 4 - Policy engine for autonomous coding",
|
|
"description": "Encode automation rules: if green + scoped diff + review passed -> merge to dev; if stale branch -> merge-forward before broad tests; if startup blocked -> recover once, then escalate; if lane completed -> emit closeout and cleanup session.",
|
|
"acceptanceCriteria": [
|
|
"Policy rules engine implemented",
|
|
"Rules: green + scoped diff + review -> merge",
|
|
"Rules: stale branch -> merge-forward before tests",
|
|
"Rules: startup blocked -> recover once, then escalate",
|
|
"Rules: lane completed -> closeout and cleanup"
|
|
],
|
|
"passes": true,
|
|
"priority": "P2"
|
|
},
|
|
{
|
|
"id": "US-007",
|
|
"title": "Phase 5 - Plugin/MCP lifecycle maturity",
|
|
"description": "First-class plugin/MCP lifecycle contract: config validation, startup healthcheck, discovery result, degraded-mode behavior, shutdown/cleanup. Close gaps in end-to-end lifecycle.",
|
|
"acceptanceCriteria": [
|
|
"Plugin/MCP config validation contract",
|
|
"Startup healthcheck with structured results",
|
|
"Discovery result reporting",
|
|
"Degraded-mode behavior documented and implemented",
|
|
"Shutdown/cleanup contract",
|
|
"Partial startup and per-server failures reported structurally"
|
|
],
|
|
"passes": true,
|
|
"priority": "P2"
|
|
},
|
|
{
|
|
"id": "US-008",
|
|
"title": "Fix kimi-k2.5 model API compatibility",
|
|
"description": "The kimi-k2.5 model (and other kimi models) reject API requests containing the is_error field in tool result messages. The OpenAI-compatible provider currently always includes is_error for all models. Need to make this field conditional based on model support.",
|
|
"acceptanceCriteria": [
|
|
"translate_message function accepts model parameter",
|
|
"is_error field excluded for kimi models (kimi-k2.5, kimi-k1.5, etc.)",
|
|
"is_error field included for models that support it (openai, grok, xai, etc.)",
|
|
"build_chat_completion_request passes model to translate_message",
|
|
"Tests verify is_error presence/absence based on model",
|
|
"cargo test passes",
|
|
"cargo clippy passes",
|
|
"cargo fmt passes"
|
|
],
|
|
"passes": true,
|
|
"priority": "P0"
|
|
},
|
|
{
|
|
"id": "US-009",
|
|
"title": "Add unit tests for kimi model compatibility fix",
|
|
"description": "During dogfooding we discovered the existing test coverage for model-specific is_error handling is insufficient. Need to add dedicated tests for model_rejects_is_error_field function and translate_message behavior with different models.",
|
|
"acceptanceCriteria": [
|
|
"Test model_rejects_is_error_field identifies kimi-k2.5, kimi-k1.5, dashscope/kimi-k2.5",
|
|
"Test translate_message includes is_error for gpt-4, grok-3, claude models",
|
|
"Test translate_message excludes is_error for kimi models",
|
|
"Test build_chat_completion_request produces correct payload for kimi vs non-kimi",
|
|
"All new tests pass",
|
|
"cargo test --package api passes"
|
|
],
|
|
"passes": true,
|
|
"priority": "P1"
|
|
},
|
|
{
|
|
"id": "US-010",
|
|
"title": "Add model compatibility documentation",
|
|
"description": "Document which models require special handling (is_error exclusion, reasoning model tuning param stripping, etc.) in a MODEL_COMPATIBILITY.md file for operators and contributors.",
|
|
"acceptanceCriteria": [
|
|
"MODEL_COMPATIBILITY.md created in docs/ or repo root",
|
|
"Document kimi models is_error exclusion",
|
|
"Document reasoning models (o1, o3, grok-3-mini) tuning param stripping",
|
|
"Document gpt-5 max_completion_tokens requirement",
|
|
"Document qwen model routing through dashscope",
|
|
"Cross-reference with existing code comments"
|
|
],
|
|
"passes": true,
|
|
"priority": "P2"
|
|
},
|
|
{
|
|
"id": "US-011",
|
|
"title": "Performance optimization: reduce API request serialization overhead",
|
|
"description": "The translate_message function creates intermediate JSON Value objects that could be optimized. Profile and optimize the hot path for API request building, especially for conversations with many tool results.",
|
|
"acceptanceCriteria": [
|
|
"Profile current request building with criterion or similar",
|
|
"Identify bottlenecks in translate_message and build_chat_completion_request",
|
|
"Implement optimizations (Vec pre-allocation, reduced cloning, etc.)",
|
|
"Benchmark before/after showing improvement",
|
|
"No functional changes or API breakage"
|
|
],
|
|
"passes": true,
|
|
"priority": "P2"
|
|
},
|
|
{
|
|
"id": "US-012",
|
|
"title": "Trust prompt resolver with allowlist auto-trust",
|
|
"description": "Add allowlisted auto-trust behavior for known repos/worktrees. Trust prompts currently block TUI startup and require manual intervention. Implement automatic trust resolution for pre-approved repositories.",
|
|
"acceptanceCriteria": [
|
|
"TrustAllowlist config structure with repo patterns",
|
|
"Auto-trust behavior for allowlisted repos/worktrees",
|
|
"trust_required event emitted when trust prompt detected",
|
|
"trust_resolved event emitted when trust is granted",
|
|
"Non-allowlisted repos remain gated (manual trust required)",
|
|
"Integration with worker boot lifecycle",
|
|
"Tests for allowlist matching and event emission"
|
|
],
|
|
"passes": true,
|
|
"priority": "P1"
|
|
},
|
|
{
|
|
"id": "US-013",
|
|
"title": "Phase 2 - Session event ordering + terminal-state reconciliation",
|
|
"description": "When the same session emits contradictory lifecycle events (idle, error, completed, transport/server-down) in close succession, expose deterministic final truth. Attach monotonic sequence/causal ordering metadata, classify terminal vs advisory events, reconcile duplicate/out-of-order terminal events into one canonical lane outcome.",
|
|
"acceptanceCriteria": [
|
|
"Monotonic sequence / causal ordering metadata attached to session lifecycle events",
|
|
"Terminal vs advisory event classification implemented",
|
|
"Reconcile duplicate or out-of-order terminal events into one canonical outcome",
|
|
"Distinguish 'session terminal state unknown because transport died' from real 'completed'",
|
|
"Tests verify reconciliation behavior with out-of-order event bursts"
|
|
],
|
|
"passes": true,
|
|
"priority": "P1"
|
|
},
|
|
{
|
|
"id": "US-014",
|
|
"title": "Phase 2 - Event provenance / environment labeling",
|
|
"description": "Every emitted event should declare its source (live_lane, test, healthcheck, replay, transport) so claws do not mistake test noise for production truth. Include environment/channel label, emitter identity, and confidence/trust level.",
|
|
"acceptanceCriteria": [
|
|
"EventProvenance enum with live_lane, test, healthcheck, replay, transport variants",
|
|
"Environment/channel label attached to all events",
|
|
"Emitter identity field on events",
|
|
"Confidence/trust level field for downstream automation",
|
|
"Tests verify provenance labeling and filtering"
|
|
],
|
|
"passes": true,
|
|
"priority": "P1"
|
|
},
|
|
{
|
|
"id": "US-015",
|
|
"title": "Phase 2 - Session identity completeness at creation time",
|
|
"description": "A newly created session should emit a stable title, workspace/worktree path, and lane/session purpose at creation time. If any field is not yet known, emit an explicit typed placeholder reason rather than a bare 'unknown' string.",
|
|
"acceptanceCriteria": [
|
|
"Session creation emits stable title, workspace/worktree path, purpose immediately",
|
|
"Explicit typed placeholder when fields unknown (not bare 'unknown' strings)",
|
|
"Later-enriched metadata reconciles onto same session identity without ambiguity",
|
|
"Tests verify session identity completeness and placeholder handling"
|
|
],
|
|
"passes": true,
|
|
"priority": "P1"
|
|
},
|
|
{
|
|
"id": "US-016",
|
|
"title": "Phase 2 - Duplicate terminal-event suppression",
|
|
"description": "When the same session emits repeated completed/failed/terminal notifications, collapse duplicates before they trigger repeated downstream reactions. Attach canonical terminal-event fingerprint per lane/session outcome.",
|
|
"acceptanceCriteria": [
|
|
"Canonical terminal-event fingerprint attached per lane/session outcome",
|
|
"Suppress/coalesce repeated terminal notifications within reconciliation window",
|
|
"Preserve raw event history for audit while exposing one actionable outcome downstream",
|
|
"Surface when later duplicate materially differs from original terminal payload",
|
|
"Tests verify deduplication and material difference detection"
|
|
],
|
|
"passes": true,
|
|
"priority": "P2"
|
|
},
|
|
{
|
|
"id": "US-017",
|
|
"title": "Phase 2 - Lane ownership / scope binding",
|
|
"description": "Each session and lane event should declare who owns it and what workflow scope it belongs to. Attach owner/assignee identity, workflow scope (claw-code-dogfood, external-git-maintenance, infra-health, manual-operator), and mark whether watcher is expected to act, observe only, or ignore.",
|
|
"acceptanceCriteria": [
|
|
"Owner/assignee identity attached to sessions and lane events",
|
|
"Workflow scope field (claw-code-dogfood, external-git-maintenance, etc.)",
|
|
"Watcher action expectation field (act, observe-only, ignore)",
|
|
"Preserve scope through session restarts, resumes, and late terminal events",
|
|
"Tests verify ownership and scope binding"
|
|
],
|
|
"passes": true,
|
|
"priority": "P2"
|
|
},
|
|
{
|
|
"id": "US-018",
|
|
"title": "Phase 2 - Nudge acknowledgment / dedupe contract",
|
|
"description": "Periodic clawhip nudges should carry nudge id/cycle id and delivery timestamp. Expose whether claw has already acknowledged or responded for that cycle. Distinguish new nudge, retry nudge, and stale duplicate.",
|
|
"acceptanceCriteria": [
|
|
"Nudge id / cycle id and delivery timestamp attached",
|
|
"Acknowledgment state exposed (already acknowledged or not)",
|
|
"Distinguish new nudge vs retry nudge vs stale duplicate",
|
|
"Allow downstream summaries to bind reported pinpoint back to triggering nudge id",
|
|
"Tests verify nudge deduplication and acknowledgment tracking"
|
|
],
|
|
"passes": true,
|
|
"priority": "P2"
|
|
},
|
|
{
|
|
"id": "US-019",
|
|
"title": "Phase 2 - Stable roadmap-id assignment for newly filed pinpoints",
|
|
"description": "When a claw records a new pinpoint/follow-up, assign or expose a stable tracking id immediately. Expose that id in structured event/report payload and preserve across edits, reorderings, and summary compression.",
|
|
"acceptanceCriteria": [
|
|
"Canonical roadmap id assigned at filing time",
|
|
"Roadmap id exposed in structured event/report payload",
|
|
"Same id preserved across edits, reorderings, summary compression",
|
|
"Distinguish 'new roadmap filing' from 'update to existing roadmap item'",
|
|
"Tests verify stable id assignment and update detection"
|
|
],
|
|
"passes": true,
|
|
"priority": "P2"
|
|
},
|
|
{
|
|
"id": "US-020",
|
|
"title": "Phase 2 - Roadmap item lifecycle state contract",
|
|
"description": "Each roadmap pinpoint should carry machine-readable lifecycle state (filed, acknowledged, in_progress, blocked, done, superseded). Attach last state-change timestamp and preserve lineage when one pinpoint supersedes or merges into another.",
|
|
"acceptanceCriteria": [
|
|
"Lifecycle state enum with filed, acknowledged, in_progress, blocked, done, superseded",
|
|
"Last state-change timestamp attached",
|
|
"New report can declare first filing, status update, or closure",
|
|
"Preserve lineage when one pinpoint supersedes or merges into another",
|
|
"Tests verify lifecycle state transitions"
|
|
],
|
|
"passes": true,
|
|
"priority": "P2"
|
|
},
|
|
{
|
|
"id": "US-021",
|
|
"title": "Request body size pre-flight check for OpenAI-compatible provider",
|
|
"description": "Implement pre-flight request body size estimation to prevent 400 Bad Request errors from API gateways with size limits. Dogfood testing with kimi-k2.5 found that the DashScope API has a 6MB request body limit, which large system prompts exceeded.",
|
|
"acceptanceCriteria": [
|
|
"Pre-flight size estimation before sending requests to OpenAI-compatible providers",
|
|
"Clear error message when request exceeds provider-specific size limit",
|
|
"Configuration for different provider limits (6MB DashScope, 100MB OpenAI, etc.)",
|
|
"Unit tests for size estimation and limit checking",
|
|
"Integration with existing error handling for actionable user messages"
|
|
],
|
|
"passes": true,
|
|
"priority": "P1"
|
|
},
|
|
{
|
|
"id": "US-022",
|
|
"title": "Enhanced error context for API failures",
|
|
"description": "Add structured error context to API failures including request ID tracking across retries, provider-specific error code mapping, and suggested user actions based on error type (e.g., 'Reduce prompt size' for 413, 'Check API key' for 401).",
|
|
"acceptanceCriteria": [
|
|
"Request ID tracking across retries with full context in error messages",
|
|
"Provider-specific error code mapping with actionable suggestions",
|
|
"Suggested user actions for common error types (401, 403, 413, 429, 500, 502-504)",
|
|
"Unit tests for error context extraction",
|
|
"All existing tests pass and clippy is clean"
|
|
],
|
|
"passes": true,
|
|
"priority": "P1"
|
|
},
|
|
{
|
|
"id": "US-023",
|
|
"title": "Add automatic routing for kimi models to DashScope",
|
|
"description": "Dogfood testing with kimi-k2.5 showed that users must currently type the fully prefixed name dashscope/kimi-k2.5 instead of just kimi-k2.5. Add automatic routing for kimi/ and kimi- prefixed models to DashScope (similar to qwen models), and add a 'kimi' alias to the model registry.",
|
|
"acceptanceCriteria": [
|
|
"kimi/ and kimi- prefix routing to DashScope in metadata_for_model()",
|
|
"'kimi' alias in MODEL_REGISTRY that resolves to 'kimi-k2.5'",
|
|
"resolve_model_alias() handles the kimi alias correctly",
|
|
"Unit tests for kimi routing (similar to qwen routing tests)",
|
|
"All tests pass and clippy is clean"
|
|
],
|
|
"passes": true,
|
|
"priority": "P1"
|
|
}
|
|
],
|
|
"metadata": {
|
|
"lastUpdated": "2026-04-16",
|
|
"completedStories": ["US-001", "US-002", "US-003", "US-004", "US-005", "US-006", "US-007", "US-008", "US-009", "US-010", "US-011", "US-012", "US-013", "US-014", "US-015", "US-016", "US-017", "US-018", "US-019", "US-020", "US-021", "US-022", "US-023"],
|
|
"inProgressStories": [],
|
|
"totalStories": 23,
|
|
"status": "completed"
|
|
}
|
|
}
|