fix(discovery): raise enrich-all timeout + surface partial progress

Pain: a 1400+ row pending queue can't finish crawl-enrich inside the
old 10-minute cap (Nominatim's 1 rps means ~23m minimum). Operators
saw a scary red "Crawl-enrich fehlgeschlagen: context deadline
exceeded" banner even though the pipeline is resumable.

- Introduce enrichAllTimeout constant (45m) sized for ~2700 rows per
  press; the original 10m assumed 600 rows worst-case.
- On context.DeadlineExceeded, translate to a user-facing message
  ("Zeitlimit erreicht nach N von M Zeilen. Erneut starten, um die
  verbleibenden Zeilen zu bearbeiten.") instead of raw Go error.
- Always stash the summary in handler state, even on error, so the
  UI can show partial progress (N/M processed) alongside the message.
- Service: populate DurationMs on early-return too, so the status
  endpoint's duration reflects the partial run instead of zero.

Behavior unchanged when a run finishes cleanly; the queue remains
resumable across presses as before.
This commit is contained in:
2026-04-24 14:11:38 +02:00
parent 950d01e3d4
commit 9cbe654d55
2 changed files with 32 additions and 8 deletions

View File

@@ -2,6 +2,7 @@ package discovery
import (
"context"
"errors"
"fmt"
"log/slog"
"net/http"
@@ -16,6 +17,15 @@ import (
"marktvogt.de/backend/internal/pkg/apierror"
)
// enrichAllTimeout bounds a single RunCrawlEnrichAll goroutine. The pipeline
// is resumable — rows that don't finish stay enrichment_status='pending' and
// the next run picks them up — so hitting this cap is not a failure, just a
// signal to press the button again.
//
// Sized against Nominatim's 1 rps ceiling: 45m handles ~2700 rows per press.
// Larger queues simply need multiple presses.
//
// Used by runEnrichAsync as the deadline on its detached context.
const enrichAllTimeout = 45 * time.Minute
type Handler struct {
service *Service
@@ -354,24 +364,37 @@ func (h *Handler) RunCrawlEnrichAll(c *gin.Context) {
})
}
// runEnrichAsync runs RunCrawlEnrichAll with a detached context. See
// enrichAllTimeout for the cap rationale.
//
// It records the finish time, the (possibly partial) summary, and a
// human-readable error message into the handler's enrich* fields under
// enrichMu, and clears the enrichRunning flag on exit.
func (h *Handler) runEnrichAsync() {
	defer h.enrichRunning.Store(false)
	ctx, cancel := context.WithTimeout(context.Background(), enrichAllTimeout)
	defer cancel()

	summary, err := h.service.RunCrawlEnrichAll(ctx)

	h.enrichMu.Lock()
	defer h.enrichMu.Unlock()
	h.enrichFinishedAt = time.Now().UTC()
	// Always stash the summary — partial progress is meaningful even when the
	// run didn't complete (e.g. deadline exceeded mid-loop). The UI can then
	// show "N / Total processed" alongside any message.
	sCopy := summary
	h.enrichSummary = &sCopy
	if err == nil {
		return
	}
	if errors.Is(err, context.DeadlineExceeded) {
		// Resumable by design: remaining rows stay pending. Don't shout
		// "failed" — this is a "press again" situation.
		h.enrichError = fmt.Sprintf(
			"Zeitlimit (%s) erreicht nach %d von %d Zeilen. Erneut starten, um die verbleibenden Zeilen zu bearbeiten.",
			enrichAllTimeout, summary.Succeeded+summary.Failed, summary.Total,
		)
		slog.WarnContext(ctx, "async crawl-enrich hit timeout",
			"processed", summary.Succeeded+summary.Failed, "total", summary.Total)
		return
	}
	h.enrichError = err.Error()
	slog.ErrorContext(ctx, "async crawl-enrich failed", "error", err)
}

View File

@@ -856,6 +856,7 @@ func (s *Service) RunCrawlEnrichAll(ctx context.Context) (CrawlEnrichSummary, er
// Caller cancelled — stop cleanly. Summary reflects partial
// progress; the remaining rows stay in enrichment_status='pending'
// and will be picked up by the next run.
summary.DurationMs = time.Since(summary.StartedAt).Milliseconds()
return summary, err
}
in := enrich.Input{