fix(discovery): raise enrich-all timeout + surface partial progress

Pain: a 1400+ row pending queue can't finish crawl-enrich inside the
old 10-minute cap (Nominatim's 1 rps means ~23m minimum). Operators
saw a scary red "Crawl-enrich fehlgeschlagen: context deadline
exceeded" banner even though the pipeline is resumable.

- Introduce enrichAllTimeout constant (45m) sized for ~2700 rows per
  press; the original 10m assumed 600 rows worst-case.
- On context.DeadlineExceeded, translate to a user-facing message
  ("Zeitlimit erreicht nach N von M Zeilen. Erneut starten, um die
  verbleibenden Zeilen zu bearbeiten.") instead of raw Go error.
- Always stash the summary in handler state, even on error, so the
  UI can show partial progress (N/M processed) alongside the message.
- Service: populate DurationMs on early-return too, so the status
  endpoint's duration reflects the partial run instead of zero.

Behavior unchanged when a run finishes cleanly; the queue remains
resumable across presses as before.
This commit is contained in:
2026-04-24 14:11:38 +02:00
parent 950d01e3d4
commit 9cbe654d55
2 changed files with 32 additions and 8 deletions

View File

@@ -2,6 +2,7 @@ package discovery
import (
"context"
"errors"
"fmt"
"log/slog"
"net/http"
@@ -16,6 +17,15 @@ import (
"marktvogt.de/backend/internal/pkg/apierror"
)
// enrichAllTimeout bounds a single RunCrawlEnrichAll goroutine. The pipeline
// is resumable — rows that don't finish stay enrichment_status='pending' and
// the next run picks them up — so hitting this cap is not a failure, just a
// signal to press the button again.
//
// Sized against Nominatim's 1 rps ceiling: 45m handles ~2700 rows per press.
// Larger queues simply need multiple presses.
//
// Used by runEnrichAsync as the deadline on its detached context.
const enrichAllTimeout = 45 * time.Minute
type Handler struct {
service *Service
@@ -354,24 +364,37 @@ func (h *Handler) RunCrawlEnrichAll(c *gin.Context) {
})
}
// runEnrichAsync runs RunCrawlEnrichAll with a detached context. See
// enrichAllTimeout for the cap rationale.
//
// It records the finish time, the (possibly partial) summary, and a
// human-readable error message into the handler's enrich* fields under
// enrichMu, and clears the enrichRunning flag on exit.
func (h *Handler) runEnrichAsync() {
	defer h.enrichRunning.Store(false)
	ctx, cancel := context.WithTimeout(context.Background(), enrichAllTimeout)
	defer cancel()

	summary, err := h.service.RunCrawlEnrichAll(ctx)

	h.enrichMu.Lock()
	defer h.enrichMu.Unlock()
	h.enrichFinishedAt = time.Now().UTC()
	// Always stash the summary — partial progress is meaningful even when the
	// run didn't complete (e.g. deadline exceeded mid-loop). The UI can then
	// show "N / Total processed" alongside any message.
	sCopy := summary
	h.enrichSummary = &sCopy
	if err == nil {
		return
	}
	if errors.Is(err, context.DeadlineExceeded) {
		// Resumable by design: remaining rows stay pending. Don't shout
		// "failed" — this is a "press again" situation.
		h.enrichError = fmt.Sprintf(
			"Zeitlimit (%s) erreicht nach %d von %d Zeilen. Erneut starten, um die verbleibenden Zeilen zu bearbeiten.",
			enrichAllTimeout, summary.Succeeded+summary.Failed, summary.Total,
		)
		slog.WarnContext(ctx, "async crawl-enrich hit timeout",
			"processed", summary.Succeeded+summary.Failed, "total", summary.Total)
		return
	}
	h.enrichError = err.Error()
	slog.ErrorContext(ctx, "async crawl-enrich failed", "error", err)
}

View File

@@ -856,6 +856,7 @@ func (s *Service) RunCrawlEnrichAll(ctx context.Context) (CrawlEnrichSummary, er
// Caller cancelled — stop cleanly. Summary reflects partial
// progress; the remaining rows stay in enrichment_status='pending'
// and will be picked up by the next run.
summary.DurationMs = time.Since(summary.StartedAt).Milliseconds()
return summary, err
}
in := enrich.Input{