mirror of
https://github.com/Heretek-AI/telemetry-service.git
synced 2026-07-01 13:54:38 -04:00
Add configuring status and improve error analysis
Treat the new 'configuring' status the same as 'installing' across services and UI (filters, counts, badges, allowedStatus, stuck-install detection). Reclassify failed records with exit_code==0 as success (in the Fetch* paths and main ingestion), and remove the aggressive top-error truncation while increasing the error preview/full display limits. Expand the exit code mappings and descriptions (many curl/apt/docker/signal/timeout codes added) and enhance categorizeErrorText to detect Docker/container, resource (OOM), and signal-related errors for better error_category assignment. Misc: add HTML/CSS for the configuring badge and adjust the related dashboard/error-analysis rendering.
This commit is contained in:
+1
-1
@@ -113,7 +113,7 @@ func (c *Cleaner) findStuckInstallations(ctx context.Context) ([]StuckRecord, er
|
||||
cutoffStr := cutoff.Format("2006-01-02 15:04:05")
|
||||
|
||||
// Build filter: (status='installing' OR status='configuring') AND created < cutoff
|
||||
filter := url.QueryEscape(fmt.Sprintf("status='installing' && created<'%s'", cutoffStr))
|
||||
filter := url.QueryEscape(fmt.Sprintf("(status='installing' || status='configuring') && created<'%s'", cutoffStr))
|
||||
|
||||
var allRecords []StuckRecord
|
||||
page := 1
|
||||
|
||||
+34
-21
@@ -456,7 +456,7 @@ func aggregateRecords(records []TelemetryRecord, knownScripts map[string]ScriptI
|
||||
st.Failed++
|
||||
case "aborted":
|
||||
st.Aborted++
|
||||
case "installing":
|
||||
case "installing", "configuring":
|
||||
st.Installing++
|
||||
}
|
||||
}
|
||||
@@ -622,7 +622,7 @@ func (s *ScriptStatsStore) IncrementalUpdate(ctx context.Context, repoSource str
|
||||
st.Failed++
|
||||
case "aborted":
|
||||
st.Aborted++
|
||||
case "installing":
|
||||
case "installing", "configuring":
|
||||
st.Installing++
|
||||
}
|
||||
added++
|
||||
@@ -867,8 +867,8 @@ func (p *PBClient) FetchScriptAnalysisData(ctx context.Context, days int, repoSo
|
||||
strings.Contains(strings.ToLower(r.Error), "aborted by user")) {
|
||||
r.Status = "aborted"
|
||||
}
|
||||
// Reclassify failed+exit_code=0
|
||||
if r.Status == "failed" && r.ExitCode == 0 && (r.Error == "" || strings.ToLower(r.Error) == "success") {
|
||||
// Reclassify failed+exit_code=0 — exit_code=0 is NEVER an error
|
||||
if r.Status == "failed" && r.ExitCode == 0 {
|
||||
r.Status = "success"
|
||||
}
|
||||
|
||||
@@ -886,7 +886,7 @@ func (p *PBClient) FetchScriptAnalysisData(ctx context.Context, days int, repoSo
|
||||
a.failed++
|
||||
case "aborted":
|
||||
a.aborted++
|
||||
case "installing":
|
||||
case "installing", "configuring":
|
||||
a.installing++
|
||||
}
|
||||
|
||||
@@ -1041,12 +1041,12 @@ func (p *PBClient) FetchErrorAnalysisData(ctx context.Context, days int, repoSou
|
||||
r.Status = "aborted"
|
||||
}
|
||||
|
||||
// Reclassify: status="failed" with exit_code=0 and no error text is actually success
|
||||
if r.Status == "failed" && r.ExitCode == 0 && (r.Error == "" || strings.ToLower(r.Error) == "success") {
|
||||
// Reclassify: exit_code=0 is NEVER an error — always reclassify as success
|
||||
if r.Status == "failed" && r.ExitCode == 0 {
|
||||
r.Status = "success"
|
||||
}
|
||||
|
||||
if r.Status == "installing" {
|
||||
if r.Status == "installing" || r.Status == "configuring" {
|
||||
stuckCount++
|
||||
continue
|
||||
}
|
||||
@@ -1100,9 +1100,6 @@ func (p *PBClient) FetchErrorAnalysisData(ctx context.Context, days int, repoSou
|
||||
}
|
||||
if r.Error != "" && (appStats[key].topError == "" || len(r.Error) > len(appStats[key].topError)) {
|
||||
appStats[key].topError = r.Error
|
||||
if len(appStats[key].topError) > 150 {
|
||||
appStats[key].topError = appStats[key].topError[:150] + "..."
|
||||
}
|
||||
}
|
||||
if cat != "uncategorized" && appStats[key].topCategory == "" {
|
||||
appStats[key].topCategory = cat
|
||||
@@ -1214,6 +1211,9 @@ func (p *PBClient) FetchErrorAnalysisData(ctx context.Context, days int, repoSou
|
||||
case 124:
|
||||
desc = "Command timed out (timeout command)"
|
||||
cat = "timeout"
|
||||
case 125:
|
||||
desc = "Docker daemon error (container failed to run)"
|
||||
cat = "config"
|
||||
case 126:
|
||||
desc = "Command cannot execute (permission problem)"
|
||||
cat = "permission"
|
||||
@@ -1222,10 +1222,16 @@ func (p *PBClient) FetchErrorAnalysisData(ctx context.Context, days int, repoSou
|
||||
cat = "command_not_found"
|
||||
case 128:
|
||||
desc = "Invalid argument to exit"
|
||||
cat = "unknown"
|
||||
cat = "signal"
|
||||
case 129:
|
||||
desc = "Killed by SIGHUP (terminal closed)"
|
||||
cat = "signal"
|
||||
case 130:
|
||||
desc = "Script terminated by Ctrl+C (SIGINT)"
|
||||
cat = "user_aborted"
|
||||
case 131:
|
||||
desc = "Killed by SIGQUIT (core dump)"
|
||||
cat = "signal"
|
||||
case 134:
|
||||
desc = "Process aborted (SIGABRT)"
|
||||
cat = "signal"
|
||||
@@ -1402,8 +1408,8 @@ func (p *PBClient) FetchErrorAnalysisData(ctx context.Context, days int, repoSou
|
||||
desc = "npm/pnpm/yarn: Unknown fatal error"
|
||||
cat = "unknown"
|
||||
case 255:
|
||||
desc = "Script error (set -e / errexit triggered or SSH error)"
|
||||
cat = "unknown"
|
||||
desc = "DPKG: Fatal internal error / set -e triggered"
|
||||
cat = "apt"
|
||||
default:
|
||||
if code > 128 && code < 192 {
|
||||
sigNum := code - 128
|
||||
@@ -1684,7 +1690,7 @@ func (p *PBClient) FetchDashboardData(ctx context.Context, days int, repoSource
|
||||
}
|
||||
case "aborted":
|
||||
data.AbortedCount++
|
||||
case "installing":
|
||||
case "installing", "configuring":
|
||||
data.InstallingCount++
|
||||
}
|
||||
|
||||
@@ -2846,6 +2852,12 @@ func DashboardHTML() string {
|
||||
border-color: rgba(234, 179, 8, 0.3);
|
||||
}
|
||||
|
||||
.status-badge.configuring {
|
||||
background: rgba(59, 130, 246, 0.15);
|
||||
color: var(--accent-blue);
|
||||
border-color: rgba(59, 130, 246, 0.3);
|
||||
}
|
||||
|
||||
.status-badge.aborted {
|
||||
background: rgba(168, 85, 247, 0.15);
|
||||
color: var(--accent-purple);
|
||||
@@ -3973,6 +3985,7 @@ func DashboardHTML() string {
|
||||
<option value="failed">Failed</option>
|
||||
<option value="aborted">Aborted</option>
|
||||
<option value="installing">Installing</option>
|
||||
<option value="configuring">Configuring</option>
|
||||
<option value="unknown">Unknown</option>
|
||||
</select>
|
||||
<select id="filterOs" class="custom-select" onchange="filterTable()">
|
||||
@@ -5475,9 +5488,9 @@ func ErrorAnalysisHTML() string {
|
||||
const failRateColor = a.failure_rate > 50 ? 'var(--accent-red)' : a.failure_rate > 20 ? 'var(--accent-orange)' : 'var(--accent-yellow)';
|
||||
const topCat = a.top_category ? '<span class="category-badge ' + a.top_category + '">' + escapeHtml(a.top_category) + '</span>' : '-';
|
||||
const errorId = 'err-app-' + idx;
|
||||
const shortError = escapeHtml((a.top_error || '-').substring(0, 80));
|
||||
const shortError = escapeHtml((a.top_error || '-').substring(0, 120));
|
||||
const fullError = escapeHtml(a.top_error || '-');
|
||||
const isLong = (a.top_error || '').length > 80;
|
||||
const isLong = (a.top_error || '').length > 120;
|
||||
return '<tr>' +
|
||||
'<td><strong>' + escapeHtml(a.app) + '</strong></td>' +
|
||||
'<td><span class="type-badge ' + typeClass + '">' + (a.type || '-').toUpperCase() + '</span></td>' +
|
||||
@@ -5488,7 +5501,7 @@ func ErrorAnalysisHTML() string {
|
||||
'<td>' + (a.top_exit_code ? '<span class="exit-code err">' + a.top_exit_code + '</span>' : '-') + '</td>' +
|
||||
'<td class="error-text">' +
|
||||
'<div id="' + errorId + '-short">' + shortError + (isLong ? ' <a href="#" onclick="toggleError(\'' + errorId + '\');return false;" style="color:var(--accent-blue);font-size:11px;">show more</a>' : '') + '</div>' +
|
||||
(isLong ? '<div id="' + errorId + '-full" style="display:none;white-space:pre-wrap;word-break:break-all;max-height:300px;overflow-y:auto;">' + fullError + ' <a href="#" onclick="toggleError(\'' + errorId + '\');return false;" style="color:var(--accent-blue);font-size:11px;">show less</a></div>' : '') +
|
||||
(isLong ? '<div id="' + errorId + '-full" style="display:none;white-space:pre-wrap;word-break:break-all;max-height:600px;overflow-y:auto;">' + fullError + ' <a href="#" onclick="toggleError(\'' + errorId + '\');return false;" style="color:var(--accent-blue);font-size:11px;">show less</a></div>' : '') +
|
||||
'</td>' +
|
||||
'<td><button class="btn issue-btn" data-app="' + escapeAttr(a.app) + '" data-exit="' + (a.top_exit_code||0) + '" data-error="' + escapeAttr(a.top_error||'') + '" data-rate="' + a.failure_rate.toFixed(1) + '">🐛 Issue</button></td>' +
|
||||
'</tr>';
|
||||
@@ -5508,9 +5521,9 @@ func ErrorAnalysisHTML() string {
|
||||
const catClass = (e.error_category || 'unknown').replace(/ /g, '_');
|
||||
const os = e.os_type ? e.os_type + (e.os_version ? ' ' + e.os_version : '') : '-';
|
||||
const errorId = 'err-recent-' + idx;
|
||||
const shortError = escapeHtml((e.error || '-').substring(0, 80));
|
||||
const shortError = escapeHtml((e.error || '-').substring(0, 120));
|
||||
const fullError = escapeHtml(e.error || '-');
|
||||
const isLong = (e.error || '').length > 80;
|
||||
const isLong = (e.error || '').length > 120;
|
||||
return '<tr>' +
|
||||
'<td><span class="status-badge ' + statusClass + '">' + escapeHtml(e.status) + '</span></td>' +
|
||||
'<td><span class="type-badge ' + typeClass + '">' + (e.type || '-').toUpperCase() + '</span></td>' +
|
||||
@@ -5519,7 +5532,7 @@ func ErrorAnalysisHTML() string {
|
||||
'<td><span class="category-badge ' + catClass + '">' + escapeHtml(e.error_category || 'unknown') + '</span></td>' +
|
||||
'<td class="error-text">' +
|
||||
'<div id="' + errorId + '-short">' + shortError + (isLong ? ' <a href="#" onclick="toggleError(\'' + errorId + '\');return false;" style="color:var(--accent-blue);font-size:11px;">show more</a>' : '') + '</div>' +
|
||||
(isLong ? '<div id="' + errorId + '-full" style="display:none;white-space:pre-wrap;word-break:break-all;max-height:300px;overflow-y:auto;">' + fullError + ' <a href="#" onclick="toggleError(\'' + errorId + '\');return false;" style="color:var(--accent-blue);font-size:11px;">show less</a></div>' : '') +
|
||||
(isLong ? '<div id="' + errorId + '-full" style="display:none;white-space:pre-wrap;word-break:break-all;max-height:600px;overflow-y:auto;">' + fullError + ' <a href="#" onclick="toggleError(\'' + errorId + '\');return false;" style="color:var(--accent-blue);font-size:11px;">show less</a></div>' : '') +
|
||||
'</td>' +
|
||||
'<td>' + escapeHtml(os) + '</td>' +
|
||||
'<td style="white-space:nowrap;">' + formatTimestamp(e.created) + '</td>' +
|
||||
|
||||
+84
-12
@@ -607,7 +607,7 @@ var (
|
||||
allowedType = map[string]bool{"lxc": true, "vm": true, "tool": true, "addon": true}
|
||||
|
||||
// Allowed values for 'status' field
|
||||
allowedStatus = map[string]bool{"installing": true, "success": true, "failed": true, "aborted": true, "unknown": true}
|
||||
allowedStatus = map[string]bool{"installing": true, "configuring": true, "success": true, "failed": true, "aborted": true, "unknown": true}
|
||||
|
||||
// Allowed values for 'os_type' field
|
||||
allowedOsType = map[string]bool{
|
||||
@@ -630,21 +630,42 @@ var (
|
||||
"network": true, "storage": true, "dependency": true, "permission": true,
|
||||
"timeout": true, "config": true, "resource": true, "unknown": true, "": true,
|
||||
"user_aborted": true, "apt": true, "command_not_found": true, "signal": true,
|
||||
"service": true, "database": true, "proxmox": true,
|
||||
}
|
||||
|
||||
// exitCodeCategories maps well-known exit codes to error categories
|
||||
exitCodeCategories = map[int]string{
|
||||
1: "unknown", // General error
|
||||
2: "unknown", // Misuse of shell builtins
|
||||
100: "apt", // APT: package manager error (broken packages / dependency problems)
|
||||
126: "permission", // Command invoked cannot execute (permission problem or not executable)
|
||||
4: "network", // curl: Network/protocol error
|
||||
5: "network", // curl: Could not resolve proxy
|
||||
6: "network", // curl: Could not resolve host
|
||||
7: "network", // curl: Connection refused
|
||||
8: "network", // curl: FTP server reply error
|
||||
10: "config", // Docker / privileged mode required
|
||||
22: "network", // curl: HTTP error (404/500 etc.)
|
||||
23: "storage", // curl: Write error (disk full?)
|
||||
25: "network", // curl: Upload failed
|
||||
28: "timeout", // curl: Connection timed out
|
||||
35: "network", // SSL connect error
|
||||
56: "network", // curl: Receive error (connection reset)
|
||||
100: "apt", // APT: package manager error
|
||||
101: "apt", // APT: Unmet dependencies
|
||||
102: "apt", // APT: Lock held by another process
|
||||
124: "timeout", // Command timed out
|
||||
125: "config", // Docker daemon error / container failed to run
|
||||
126: "permission", // Command invoked cannot execute
|
||||
127: "command_not_found", // Command not found
|
||||
128: "signal", // Invalid argument to exit
|
||||
129: "signal", // Killed by SIGHUP (terminal closed)
|
||||
130: "user_aborted", // Script terminated by Ctrl+C (SIGINT)
|
||||
131: "signal", // Killed by SIGQUIT (core dump)
|
||||
134: "signal", // Process aborted (SIGABRT)
|
||||
137: "resource", // SIGKILL - often OOM killer
|
||||
139: "unknown", // SIGSEGV - segfault
|
||||
141: "unknown", // SIGPIPE
|
||||
141: "signal", // SIGPIPE
|
||||
143: "signal", // SIGTERM
|
||||
255: "apt", // DPKG: Fatal internal error
|
||||
}
|
||||
|
||||
// exitCodeDescriptions provides human-readable exit code descriptions
|
||||
@@ -652,15 +673,38 @@ var (
|
||||
0: "Success",
|
||||
1: "General error",
|
||||
2: "Misuse of shell builtins",
|
||||
4: "curl: Network/protocol error",
|
||||
5: "curl: Could not resolve proxy",
|
||||
6: "curl: DNS resolution failed",
|
||||
7: "curl: Connection refused",
|
||||
8: "curl: FTP server reply error",
|
||||
10: "Docker / privileged mode required (unsupported environment)",
|
||||
22: "curl: HTTP error (404/500 etc.)",
|
||||
23: "curl: Write error (disk full?)",
|
||||
25: "curl: Upload failed",
|
||||
28: "curl: Connection timed out",
|
||||
30: "curl: FTP port command failed",
|
||||
35: "SSL connect error",
|
||||
56: "curl: Receive error (connection reset)",
|
||||
75: "Temporary failure (retry later)",
|
||||
78: "curl: Remote file not found (404)",
|
||||
100: "APT: Package manager error (broken packages / dependency problems)",
|
||||
126: "Command invoked cannot execute (permission problem or not executable)",
|
||||
101: "APT: Unmet dependencies",
|
||||
102: "APT: Lock held by another process",
|
||||
124: "Command timed out",
|
||||
125: "Docker daemon error (container failed to run)",
|
||||
126: "Command cannot execute (permission problem)",
|
||||
127: "Command not found",
|
||||
128: "Invalid argument to exit",
|
||||
129: "Killed by SIGHUP (terminal closed)",
|
||||
130: "Script terminated by Ctrl+C (SIGINT)",
|
||||
131: "Killed by SIGQUIT (core dump)",
|
||||
134: "Process aborted (SIGABRT)",
|
||||
137: "Process killed (SIGKILL) - likely OOM",
|
||||
139: "Segmentation fault (SIGSEGV)",
|
||||
141: "Broken pipe (SIGPIPE)",
|
||||
143: "Process terminated (SIGTERM)",
|
||||
255: "DPKG: Fatal internal error",
|
||||
}
|
||||
)
|
||||
|
||||
@@ -814,6 +858,13 @@ func computeHash(out TelemetryOut) string {
|
||||
|
||||
// categorizeErrorText assigns an error_category based on error text patterns
|
||||
func categorizeErrorText(errLower string) string {
|
||||
// Docker / container errors (check early, before generic patterns)
|
||||
if strings.Contains(errLower, "docker") ||
|
||||
strings.Contains(errLower, "privileged mode") ||
|
||||
strings.Contains(errLower, "container runtime") ||
|
||||
strings.Contains(errLower, "daemon") {
|
||||
return "config"
|
||||
}
|
||||
// Network errors
|
||||
if strings.Contains(errLower, "connection refused") ||
|
||||
strings.Contains(errLower, "could not resolve") ||
|
||||
@@ -827,7 +878,7 @@ func categorizeErrorText(errLower string) string {
|
||||
strings.Contains(errLower, "certificate") {
|
||||
return "network"
|
||||
}
|
||||
// APT / package manager
|
||||
// APT / package manager (check before generic "dependency")
|
||||
if strings.Contains(errLower, "apt") ||
|
||||
strings.Contains(errLower, "dpkg") ||
|
||||
strings.Contains(errLower, "broken packages") ||
|
||||
@@ -848,6 +899,23 @@ func categorizeErrorText(errLower string) string {
|
||||
strings.Contains(errLower, "access denied") {
|
||||
return "permission"
|
||||
}
|
||||
// Resource (OOM, memory)
|
||||
if strings.Contains(errLower, "oom") ||
|
||||
strings.Contains(errLower, "out of memory") ||
|
||||
strings.Contains(errLower, "cannot allocate") ||
|
||||
strings.Contains(errLower, "killed") ||
|
||||
strings.Contains(errLower, "sigkill") {
|
||||
return "resource"
|
||||
}
|
||||
// Signal-related
|
||||
if strings.Contains(errLower, "sighup") ||
|
||||
strings.Contains(errLower, "sigquit") ||
|
||||
strings.Contains(errLower, "sigterm") ||
|
||||
strings.Contains(errLower, "sigabrt") ||
|
||||
strings.Contains(errLower, "sigpipe") ||
|
||||
strings.Contains(errLower, "core dump") {
|
||||
return "signal"
|
||||
}
|
||||
// Command not found
|
||||
if strings.Contains(errLower, "command not found") ||
|
||||
strings.Contains(errLower, "not found") {
|
||||
@@ -859,12 +927,6 @@ func categorizeErrorText(errLower string) string {
|
||||
strings.Contains(errLower, "missing") {
|
||||
return "dependency"
|
||||
}
|
||||
// Resource
|
||||
if strings.Contains(errLower, "oom") ||
|
||||
strings.Contains(errLower, "out of memory") ||
|
||||
strings.Contains(errLower, "cannot allocate") {
|
||||
return "resource"
|
||||
}
|
||||
// Config
|
||||
if strings.Contains(errLower, "config") ||
|
||||
strings.Contains(errLower, "syntax error") ||
|
||||
@@ -1643,6 +1705,16 @@ func main() {
|
||||
return
|
||||
}
|
||||
|
||||
// Auto-reclassify: exit_code=0 is NEVER an error — always reclassify as success
|
||||
if in.Status == "failed" && in.ExitCode == 0 {
|
||||
in.Status = "success"
|
||||
in.Error = ""
|
||||
in.ErrorCategory = ""
|
||||
if cfg.EnableReqLogging {
|
||||
log.Printf("auto-reclassified exit_code=0 as success: nsapp=%s", in.NSAPP)
|
||||
}
|
||||
}
|
||||
|
||||
// Auto-reclassify: clients still send status="failed" for SIGINT/Ctrl+C,
|
||||
// detect and reclassify as "aborted" server-side.
|
||||
errorLower := strings.ToLower(in.Error)
|
||||
|
||||
Reference in New Issue
Block a user