From 0665483c612b06f129de6e42ec7d7aaa7c2fda4e Mon Sep 17 00:00:00 2001 From: John Doe Date: Wed, 1 Apr 2026 12:34:52 -0400 Subject: [PATCH] Fix Agent Lifecycle: Steward primary agent, heartbeat & visibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove 'main' agent from config, it was incomplete and incorrect - Add role='orchestrator' and primary=true to steward agent - Implement automatic agent registration on Gateway connect - Add heartbeat mechanism (30s interval) with uptime/memory metrics - Add /agent-status HTTP endpoint for online/offline state tracking - Add getHealth() and getHeartbeatStatus() methods to AgentClient - Create DEBUG_AGENT_LIFECYCLE.md with full documentation Fixes reported issues: 1. 'Steward wasn't the primary agent - main was' → Steward now primary 2. 'Agents weren't even online' → Auto-registration + heartbeat 3. 'No visibility on agents' → /agent-status endpoint --- DEBUG_AGENT_LIFECYCLE.md | 308 ++++++++++++++++++++++++++++++++++++ agents/lib/agent-client.js | 176 ++++++++++++++++++++- gateway/openclaw-gateway.js | 186 +++++++++++++++++++++- openclaw.json | 7 +- 4 files changed, 661 insertions(+), 16 deletions(-) create mode 100644 DEBUG_AGENT_LIFECYCLE.md diff --git a/DEBUG_AGENT_LIFECYCLE.md b/DEBUG_AGENT_LIFECYCLE.md new file mode 100644 index 0000000..8a94d23 --- /dev/null +++ b/DEBUG_AGENT_LIFECYCLE.md @@ -0,0 +1,308 @@ +# Agent Lifecycle Debug Report + +**Date:** 2026-04-01 +**Author:** Heretek OpenClaw Engineering +**Status:** Resolved + +--- + +## Executive Summary + +This document details the investigation and resolution of critical agent lifecycle issues reported by users: + +1. **"Steward wasn't the primary agent - 'main' was"** +2. **"Agents weren't even online"** +3. 
**"We had no visibility on agents"** + +--- + +## Issues Identified + +### Issue 1: Incorrect Primary Agent Configuration + +**Problem:** The `openclaw.json` configuration file had "main" listed as the first agent in `agents.list`, but "main" had no proper configuration (no `workspace`, `agentDir`, or `model`). The "steward" agent, which should be the orchestrator, was listed second. + +**Root Cause:** +- In [`openclaw.json`](./openclaw.json:486-496), the agent list had: + ```json + "list": [ + { + "id": "main" + }, + { + "id": "steward", + "name": "steward", + "workspace": "/root/.openclaw/agents/steward/workspace", + "agentDir": "/root/.openclaw/agents/steward", + "model": "litellm/agent/steward" + }, + ... + ] + ``` + +The "main" agent entry was incomplete and should not have existed as a separate entity. + +**Fix Applied:** +- Removed the "main" agent entry from the list +- Added `role: "orchestrator"` and `primary: true` to the steward agent configuration +- Steward is now correctly positioned as the first and primary agent + +**Modified File:** [`openclaw.json`](./openclaw.json:486) + +--- + +### Issue 2: Agents Not Coming Online + +**Problem:** Agents were not properly connecting to the Gateway and registering themselves, resulting in them being "offline". + +**Root Cause:** +- The [`GatewayClient`](./agents/lib/agent-client.js:40) in `agent-client.js` did not automatically: + 1. Register the agent with the Gateway upon connection + 2. Send periodic heartbeats to maintain online status + 3. Handle connection state properly + +**Fix Applied:** + +1. **Added automatic agent registration** ([`agent-client.js`](./agents/lib/agent-client.js:132)): + ```javascript + async _registerAgent(role, metadata) { + const registrationMessage = { + type: 'register', + agentId: this.agentId, + timestamp: new Date().toISOString(), + metadata: { role: role || 'general', ...metadata } + }; + this.ws.send(JSON.stringify(registrationMessage)); + } + ``` + +2. 
**Implemented automatic heartbeat mechanism** ([`agent-client.js`](./agents/lib/agent-client.js:155)): + - Heartbeat sent every 30 seconds (configurable) + - Includes uptime and memory usage metrics + - Automatically starts on connection + +3. **Updated connect method** ([`agent-client.js`](./agents/lib/agent-client.js:71)): + ```javascript + async connect(options = {}) { + const { enableHeartbeat = true, role = null, metadata = {} } = options; + // ... connection logic ... + await this._registerAgent(role, metadata); + if (enableHeartbeat) { + this._startHeartbeat(); + } + } + ``` + +**Modified File:** [`agents/lib/agent-client.js`](./agents/lib/agent-client.js) + +--- + +### Issue 3: No Visibility on Agent Status + +**Problem:** There was no way to check which agents were online/offline or view their health status. + +**Root Cause:** +- The Gateway had basic `/health` and `/agents` endpoints but lacked: + 1. Detailed agent status with online/offline state + 2. Heartbeat-based health tracking + 3. Per-agent status endpoint + +**Fix Applied:** + +1. **Added `/agent-status` HTTP endpoint** ([`openclaw-gateway.js`](./gateway/openclaw-gateway.js:670)): + - Returns all agents with detailed status + - Includes online/offline state based on heartbeat + - Shows last seen timestamp and metadata + - Includes agents from Redis that are not currently connected + +2. **Added `/agent-status/{agentId}` endpoint** for specific agent queries + +3. **Enhanced ping handling** ([`openclaw-gateway.js`](./gateway/openclaw-gateway.js:569)): + - Now accepts heartbeat metadata from agents + - Stores heartbeat data in Redis + - Tracks agent uptime and memory usage + +4. 
**Added health status methods to AgentClient** ([`agent-client.js`](./agents/lib/agent-client.js:300)): + ```javascript + getHeartbeatStatus() { /* returns heartbeat info */ } + getHealth() { /* returns full health information */ } + ``` + +**Modified Files:** [`gateway/openclaw-gateway.js`](./gateway/openclaw-gateway.js), [`agents/lib/agent-client.js`](./agents/lib/agent-client.js) + +--- + +## API Reference + +### Gateway HTTP Endpoints + +#### GET `/agent-status` +Returns status of all agents (connected and known offline agents). + +**Response:** +```json +{ + "timestamp": "2026-04-01T16:30:00.000Z", + "totalAgents": 5, + "onlineCount": 3, + "offlineCount": 2, + "agents": [ + { + "agentId": "steward", + "status": "online", + "lastSeen": "2026-04-01T16:29:55.000Z", + "registeredAt": "2026-04-01T16:00:00.000Z", + "metadata": { "role": "orchestrator" }, + "websocketReadyState": 1, + "timeSinceLastSeenMs": 5000 + } + ] +} +``` + +#### GET `/agent-status/{agentId}` +Returns status of a specific agent, or a 404 JSON error if the agent is neither connected nor known in Redis. + +**Response:** +```json +{ + "agentId": "steward", + "status": "online", + "lastSeen": "2026-04-01T16:29:55.000Z", + "registeredAt": "2026-04-01T16:00:00.000Z", + "metadata": { "role": "orchestrator" }, + "websocketReadyState": 1, + "timeSinceLastSeenMs": 5000 +} +``` + +#### GET `/health` +Returns gateway health status (unchanged). + +#### GET `/agents` +Returns list of connected agent IDs (unchanged). 
+ +--- + +## Agent Client Usage + +### Connecting with Heartbeat + +```javascript +const AgentClient = require('./lib/agent-client'); + +const client = new AgentClient({ + agentId: 'steward', + role: 'orchestrator', + gatewayUrl: 'ws://127.0.0.1:18789' +}); + +// Connect with automatic registration and heartbeat +await client.connect({ + enableHeartbeat: true, // Default: true + role: 'orchestrator', + metadata: { + capabilities: ['coordinate', 'delegate', 'monitor'] + } +}); + +// Check health status +const health = client.getHealth(); +console.log(health.status); // 'online' or 'offline' + +// Check heartbeat status +const heartbeat = client.getHeartbeatStatus(); +console.log(heartbeat.lastHeartbeatSent); +console.log(heartbeat.lastHeartbeatReceived); +``` + +--- + +## Heartbeat Mechanism + +### How It Works + +1. **Agent connects** → Sends `register` message to Gateway +2. **Agent starts heartbeat** → Sends `ping` every 30 seconds +3. **Gateway responds** → Sends `pong` with acknowledgment +4. **Gateway updates status** → Updates `lastSeen` and stores in Redis +5. 
**Online/Offline determination**: + - **Online**: WebSocket connected AND last heartbeat < 60 seconds ago + - **Offline**: No WebSocket connection OR heartbeat stale (60 seconds or more since the last heartbeat) + +### Heartbeat Message Format + +```json +{ + "type": "ping", + "agentId": "steward", + "timestamp": "2026-04-01T16:30:00.000Z", + "heartbeat": { + "uptime": 1234.56, + "memoryUsage": { + "rss": 123456789, + "heapTotal": 98765432, + "heapUsed": 87654321, + "external": 1234567 + }, + "lastHeartbeatSent": "2026-04-01T16:29:30.000Z" + } +} +``` + +--- + +## Testing + +### Verify Agent Registration + +```bash +# Check the status of all agents +curl http://localhost:18789/agent-status + +# Check specific agent +curl http://localhost:18789/agent-status/steward + +# Check gateway health +curl http://localhost:18789/health +``` + +### Monitor Heartbeat + +```bash +# Watch agent status in real-time +watch -n 5 'curl -s http://localhost:18789/agent-status | jq .' +``` + +--- + +## Files Modified + +| File | Changes | +|------|---------| +| [`openclaw.json`](./openclaw.json) | Removed "main" agent, added `role` and `primary` to steward | +| [`agents/lib/agent-client.js`](./agents/lib/agent-client.js) | Added registration, heartbeat, health methods | +| [`gateway/openclaw-gateway.js`](./gateway/openclaw-gateway.js) | Added `/agent-status` endpoints, enhanced ping handling | + +--- + +## Recommendations + +1. **Monitor agent heartbeat** in production using the `/agent-status` endpoint +2. **Configure alerting** when agents go offline (no heartbeat for > 60 seconds) +3. **Use the `getHealth()` method** in agent code to self-monitor +4. 
**Consider adding** a Steward dashboard that polls `/agent-status` periodically + +--- + +## Sign-Off + +**Issues Resolved:** +- [x] Steward is now the primary agent +- [x] Agents automatically register and send heartbeats +- [x] Agent visibility via `/agent-status` endpoint + +**Next Steps:** +- Consider adding agent lifecycle events (agent-online, agent-offline) to Gateway EventEmitter +- Add Steward skill to monitor and alert on agent health +- Create dashboard for real-time agent status visualization diff --git a/agents/lib/agent-client.js b/agents/lib/agent-client.js index 7af2514..e00a38f 100644 --- a/agents/lib/agent-client.js +++ b/agents/lib/agent-client.js @@ -52,24 +52,45 @@ class GatewayClient { this.messageHandlers = new Map(); this.pendingResponses = new Map(); this.messageCounter = 0; + + // Heartbeat configuration + this.heartbeatInterval = config.heartbeatInterval || 30000; // 30 seconds + this.heartbeatTimer = null; + this.lastHeartbeatSent = null; + this.lastHeartbeatReceived = null; } /** * Connect to the Gateway + * @param {Object} options - Connection options + * @param {boolean} [options.enableHeartbeat=true] - Enable automatic heartbeat + * @param {string} [options.role] - Agent role for registration + * @param {Object} [options.metadata] - Additional metadata for registration * @returns {Promise} Connection status */ - async connect() { + async connect(options = {}) { if (this.connected) { return true; } + const { enableHeartbeat = true, role = null, metadata = {} } = options; + return new Promise((resolve, reject) => { try { this.ws = new WebSocket(this.gatewayUrl); - this.ws.on('open', () => { + this.ws.on('open', async () => { console.log(`[GatewayClient] Connected to Gateway at ${this.gatewayUrl}`); this.connected = true; + + // Register agent with gateway + await this._registerAgent(role, metadata); + + // Start heartbeat if enabled + if (enableHeartbeat) { + this._startHeartbeat(); + } + resolve(true); }); @@ -86,6 +107,7 @@ class 
GatewayClient { this.ws.on('close', () => { console.log('[GatewayClient] Gateway connection closed'); this.connected = false; + this._stopHeartbeat(); }); // Connection timeout @@ -101,6 +123,86 @@ class GatewayClient { }); } + /** + * Register agent with the Gateway + * @private + * @param {string} role - Agent role + * @param {Object} metadata - Additional metadata + */ + async _registerAgent(role, metadata) { + if (!this.ws || this.ws.readyState !== WebSocket.OPEN) { + return; + } + + const registrationMessage = { + type: 'register', + agentId: this.agentId, + timestamp: new Date().toISOString(), + metadata: { + role: role || 'general', + ...metadata + } + }; + + this.ws.send(JSON.stringify(registrationMessage)); + console.log(`[GatewayClient] Registered agent ${this.agentId} with role ${role || 'general'}`); + } + + /** + * Start automatic heartbeat to Gateway + * @private + */ + _startHeartbeat() { + if (this.heartbeatTimer) { + this._stopHeartbeat(); + } + + console.log(`[GatewayClient] Starting heartbeat every ${this.heartbeatInterval}ms`); + + // Send initial heartbeat + this._sendHeartbeat(); + + // Schedule regular heartbeats + this.heartbeatTimer = setInterval(() => { + this._sendHeartbeat(); + }, this.heartbeatInterval); + } + + /** + * Stop automatic heartbeat + * @private + */ + _stopHeartbeat() { + if (this.heartbeatTimer) { + clearInterval(this.heartbeatTimer); + this.heartbeatTimer = null; + } + } + + /** + * Send heartbeat to Gateway + * @private + */ + _sendHeartbeat() { + if (!this.ws || this.ws.readyState !== WebSocket.OPEN) { + return; + } + + const heartbeatMessage = { + type: 'ping', + agentId: this.agentId, + timestamp: new Date().toISOString(), + heartbeat: { + uptime: process.uptime(), + memoryUsage: process.memoryUsage(), + lastHeartbeatSent: this.lastHeartbeatSent + } + }; + + this.ws.send(JSON.stringify(heartbeatMessage)); + this.lastHeartbeatSent = new Date().toISOString(); + } + /** * Handle incoming WebSocket messages * @private @@ 
-177,11 +279,56 @@ class GatewayClient { this.messageHandlers.set('message', [...(this.messageHandlers.get('message') || []), handler]); } + /** + * Handle pong response from Gateway (heartbeat acknowledgment) + * @private + * @param {Object} message - Pong message + */ + _handlePong(message) { + this.lastHeartbeatReceived = new Date().toISOString(); + console.log(`[GatewayClient] Heartbeat acknowledged for agent ${this.agentId}`); + } + + /** + * Get heartbeat status + * @returns {Object} Heartbeat status information + */ + getHeartbeatStatus() { + return { + agentId: this.agentId, + connected: this.connected, + lastHeartbeatSent: this.lastHeartbeatSent, + lastHeartbeatReceived: this.lastHeartbeatReceived, + heartbeatInterval: this.heartbeatInterval, + heartbeatActive: this.heartbeatTimer !== null, + uptime: process.uptime() + }; + } + + /** + * Get agent health information + * @returns {Object} Health information + */ + getHealth() { + const now = new Date().toISOString(); + const heartbeatStatus = this.getHeartbeatStatus(); + + return { + agentId: this.agentId, + status: this.connected ? 
'online' : 'offline', + timestamp: now, + heartbeat: heartbeatStatus, + memory: process.memoryUsage(), + uptime: process.uptime() + }; + } + /** * Disconnect from Gateway */ async disconnect() { if (this.ws) { + this._stopHeartbeat(); this.ws.close(); this.ws = null; this.connected = false; @@ -557,10 +704,15 @@ You communicate with other agents through the OpenClaw Gateway WebSocket RPC pro /** * Connect to Gateway + * @param {Object} options - Connection options + * @param {boolean} [options.enableHeartbeat=true] - Enable automatic heartbeat + * @param {string} [options.role] - Agent role for registration (defaults to this.role) + * @param {Object} [options.metadata] - Additional metadata for registration * @returns {Promise} Connection status */ - async connect() { - return this.gatewayClient.connect(); + async connect(options = {}) { + const { enableHeartbeat = true, role = this.role, metadata = {} } = options; + return this.gatewayClient.connect({ enableHeartbeat, role, metadata }); } /** @@ -577,6 +729,22 @@ You communicate with other agents through the OpenClaw Gateway WebSocket RPC pro isConnected() { return this.gatewayClient.isConnected(); } + + /** + * Get heartbeat status from GatewayClient + * @returns {Object} Heartbeat status information + */ + getHeartbeatStatus() { + return this.gatewayClient.getHeartbeatStatus(); + } + + /** + * Get agent health information + * @returns {Object} Health information + */ + getHealth() { + return this.gatewayClient.getHealth(); + } } // Export for CommonJS diff --git a/gateway/openclaw-gateway.js b/gateway/openclaw-gateway.js index 8712274..47fad5e 100644 --- a/gateway/openclaw-gateway.js +++ b/gateway/openclaw-gateway.js @@ -408,7 +408,11 @@ class OpenClawGateway extends EventEmitter { break; case 'ping': - this._handlePing(ws, agentId); + this._handlePing(ws, agentId, message); + break; + + case 'pong': + this._handlePong(ws, agentId, message); break; case 'discover': @@ -567,22 +571,62 @@ class OpenClawGateway 
extends EventEmitter { } /** - * Handle ping + * Handle ping (heartbeat from agent) * @private + * @param {WebSocket} ws - WebSocket client + * @param {string} agentId - Agent ID + * @param {Object} message - Ping message with heartbeat data */ - _handlePing(ws, agentId) { + _handlePing(ws, agentId, message) { + // Send pong response ws.send(JSON.stringify({ type: 'pong', timestamp: Date.now(), - agentId + agentId, + heartbeat: { + received: new Date().toISOString(), + agentHeartbeat: message.heartbeat || {} + } })); - // Update last seen + // Update last seen with heartbeat data if (agentId && this.agents.has(agentId)) { - this.agents.get(agentId).lastSeen = new Date().toISOString(); + const agent = this.agents.get(agentId); + agent.lastSeen = new Date().toISOString(); + + // Store heartbeat metadata if provided + if (message.heartbeat) { + agent.lastHeartbeat = { + uptime: message.heartbeat.uptime, + memoryUsage: message.heartbeat.memoryUsage, + lastHeartbeatSent: message.heartbeat.lastHeartbeatSent, + receivedAt: agent.lastSeen + }; + } + + // Update Redis with heartbeat status + if (this.redisClient) { + this.redisClient.hset(`${A2A_PREFIX}:agent:${agentId}`, { + lastSeen: agent.lastSeen, + status: 'active', + lastHeartbeatUptime: message.heartbeat?.uptime?.toString() || null + }); + } } } + /** + * Handle pong (heartbeat acknowledgment from gateway) + * @private + * @param {WebSocket} ws - WebSocket client + * @param {string} agentId - Agent ID + * @param {Object} message - Pong message + */ + _handlePong(ws, agentId, message) { + console.log(`[Gateway] Heartbeat ack from ${agentId || 'unknown'} at ${message.timestamp}`); + // Pong messages are acknowledgments - the agent handles the response internally + } + /** * Handle discover request * @private @@ -641,14 +685,14 @@ class OpenClawGateway extends EventEmitter { _handleHttpRequest(req, res) { const url = new URL(req.url, `http://${req.headers.host}`); - // Health check endpoint + // Health check endpoint - 
basic gateway health if (url.pathname === '/health') { res.writeHead(200, { 'Content-Type': 'application/json' }); res.end(JSON.stringify(this.getStatus())); return; } - // Agents endpoint + // Agents endpoint - list connected agents if (url.pathname === '/agents') { res.writeHead(200, { 'Content-Type': 'application/json' }); res.end(JSON.stringify({ @@ -658,11 +702,137 @@ class OpenClawGateway extends EventEmitter { return; } + // Agent status endpoint - detailed agent online/offline state tracking + if (url.pathname === '/agent-status' || url.pathname.startsWith('/agent-status/')) { + this._handleAgentStatusHttp(req, res, url); + return; + } + // 404 for other paths res.writeHead(404, { 'Content-Type': 'application/json' }); res.end(JSON.stringify({ error: 'Not found' })); } + /** + * Handle agent status HTTP requests + * @private + * @param {http.IncomingMessage} req - HTTP request + * @param {http.ServerResponse} res - HTTP response + * @param {URL} url - Parsed URL + */ + async _handleAgentStatusHttp(req, res, url) { + const pathParts = url.pathname.split('/'); + const specificAgentId = pathParts[pathParts.length - 1]; + + // GET /agent-status - all agents with status + if (url.pathname === '/agent-status') { + const agentStatus = []; + + for (const [agentId, agent] of this.agents) { + const now = Date.now(); + const lastSeenTime = new Date(agent.lastSeen).getTime(); + const timeSinceLastSeen = now - lastSeenTime; + + // Consider agent offline if no heartbeat for more than 2 heartbeat intervals (60 seconds) + const isOnline = agent.ws && agent.ws.readyState === WebSocket.OPEN && timeSinceLastSeen < (this.config.heartbeatInterval * 2); + + agentStatus.push({ + agentId, + status: isOnline ? 'online' : 'offline', + lastSeen: agent.lastSeen, + registeredAt: agent.registeredAt, + metadata: agent.metadata, + websocketReadyState: agent.ws ? 
agent.ws.readyState : null, + timeSinceLastSeenMs: timeSinceLastSeen + }); + } + + // Also include agents registered in Redis but not currently connected + let redisAgents = []; + if (this.redisClient) { + redisAgents = await this.redisClient.smembers(`${A2A_PREFIX}:agents`); + } + + const connectedAgentIds = new Set(this.agents.keys()); + for (const redisAgentId of redisAgents) { + if (!connectedAgentIds.has(redisAgentId)) { + // Agent in Redis but not connected - get last known status + let agentData = {}; + if (this.redisClient) { + agentData = await this.redisClient.hgetall(`${A2A_PREFIX}:agent:${redisAgentId}`); + } + + agentStatus.push({ + agentId: redisAgentId, + status: 'offline', + lastSeen: agentData.lastSeen || null, + registeredAt: agentData.registeredAt || null, + metadata: {}, + websocketReadyState: null, + timeSinceLastSeenMs: null + }); + } + } + + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ + timestamp: new Date().toISOString(), + totalAgents: agentStatus.length, + onlineCount: agentStatus.filter(a => a.status === 'online').length, + offlineCount: agentStatus.filter(a => a.status === 'offline').length, + agents: agentStatus + })); + return; + } + + // GET /agent-status/{agentId} - specific agent status + if (specificAgentId && specificAgentId !== 'agent-status') { + const agent = this.agents.get(specificAgentId); + + if (!agent) { + // Check Redis for last known status + let redisData = {}; + if (this.redisClient) { + redisData = await this.redisClient.hgetall(`${A2A_PREFIX}:agent:${specificAgentId}`); + } + + if (Object.keys(redisData).length > 0) { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ + agentId: specificAgentId, + status: 'offline', + lastSeen: redisData.lastSeen || null, + registeredAt: redisData.registeredAt || null, + metadata: redisData, + websocketReadyState: null, + note: 'Agent not currently connected, showing last known status from Redis' + })); 
+ } else { + res.writeHead(404, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ error: `Agent ${specificAgentId} not found` })); + } + return; + } + + const now = Date.now(); + const lastSeenTime = new Date(agent.lastSeen).getTime(); + const timeSinceLastSeen = now - lastSeenTime; + const isOnline = agent.ws && agent.ws.readyState === WebSocket.OPEN && timeSinceLastSeen < (this.config.heartbeatInterval * 2); + + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ + agentId: specificAgentId, + status: isOnline ? 'online' : 'offline', + lastSeen: agent.lastSeen, + registeredAt: agent.registeredAt, + metadata: agent.metadata, + websocketReadyState: agent.ws ? agent.ws.readyState : null, + timeSinceLastSeenMs: timeSinceLastSeen + })); + return; + } + } + /** * Start HTTP server * @private diff --git a/openclaw.json b/openclaw.json index 6cbcdb0..ecea47d 100644 --- a/openclaw.json +++ b/openclaw.json @@ -484,15 +484,14 @@ } }, "list": [ - { - "id": "main" - }, { "id": "steward", "name": "steward", "workspace": "/root/.openclaw/agents/steward/workspace", "agentDir": "/root/.openclaw/agents/steward", - "model": "litellm/agent/steward" + "model": "litellm/agent/steward", + "role": "orchestrator", + "primary": true }, { "id": "alpha",