From 2e3141aaeb179d36f7c1e4a7c66c7481ddf90d8a Mon Sep 17 00:00:00 2001 From: vikingowl Date: Sat, 25 Apr 2026 17:50:27 +0200 Subject: [PATCH] fix(discovery): skip enrichment cache for date-less rows (year=0) Rows without start_datum all hash to year=0, causing cache collisions across unrelated markets. Gate both cache reads and writes on year!=0. --- backend/internal/domain/discovery/service.go | 32 ++++++++++------- .../internal/domain/discovery/service_test.go | 34 +++++++++++++++++++ 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/backend/internal/domain/discovery/service.go b/backend/internal/domain/discovery/service.go index 5b320f6..dcf29fe 100644 --- a/backend/internal/domain/discovery/service.go +++ b/backend/internal/domain/discovery/service.go @@ -717,20 +717,25 @@ func (s *Service) RunLLMEnrichOne(ctx context.Context, queueID uuid.UUID) (enric if row.StartDatum != nil { year = row.StartDatum.Year() } + // Rows without a start_datum all hash to year=0, which would cause cache + // collisions across unrelated markets. Skip the cache entirely for these rows. + useCache := year != 0 cacheKey := enrich.CacheKey(row.NameNormalized, row.Stadt, year) // Cache lookup — if we have a fresh LLM payload for this (name, city, // year) tuple, skip the call. The merge still runs so a newly-populated // crawl-enrich base gets its provenance preserved. 
- if cached, hit, err := s.repo.GetEnrichmentCache(ctx, cacheKey); err != nil { - slog.WarnContext(ctx, "enrichment cache get failed; continuing", - "cache_key", cacheKey, "error", err) - } else if hit { - merged := enrich.Merge(row.Enrichment, cached) - if err := s.repo.SetEnrichment(ctx, row.ID, merged, EnrichmentStatusDone); err != nil { - return enrich.Enrichment{}, fmt.Errorf("persist merged (cache hit): %w", err) + if useCache { + if cached, hit, err := s.repo.GetEnrichmentCache(ctx, cacheKey); err != nil { + slog.WarnContext(ctx, "enrichment cache get failed; continuing", + "cache_key", cacheKey, "error", err) + } else if hit { + merged := enrich.Merge(row.Enrichment, cached) + if err := s.repo.SetEnrichment(ctx, row.ID, merged, EnrichmentStatusDone); err != nil { + return enrich.Enrichment{}, fmt.Errorf("persist merged (cache hit): %w", err) + } + return merged, nil } - return merged, nil } llmReq := enrich.LLMRequest{ @@ -753,10 +758,13 @@ func (s *Service) RunLLMEnrichOne(ctx context.Context, queueID uuid.UUID) (enric // Cache the raw LLM output (not the merged result). A later re-crawl // might change crawl-enrich fields; the cached answer should layer on - // top of whatever the current base is. - if err := s.repo.SetEnrichmentCache(ctx, cacheKey, llmPayload, enrich.DefaultCacheTTL); err != nil { - slog.WarnContext(ctx, "enrichment cache set failed; continuing", - "cache_key", cacheKey, "error", err) + // top of whatever the current base is. Skip for date-less rows (year=0) + // to avoid cross-row cache collisions. 
+ if useCache { + if err := s.repo.SetEnrichmentCache(ctx, cacheKey, llmPayload, enrich.DefaultCacheTTL); err != nil { + slog.WarnContext(ctx, "enrichment cache set failed; continuing", + "cache_key", cacheKey, "error", err) + } } merged := enrich.Merge(row.Enrichment, llmPayload) diff --git a/backend/internal/domain/discovery/service_test.go b/backend/internal/domain/discovery/service_test.go index b18a422..f23a125 100644 --- a/backend/internal/domain/discovery/service_test.go +++ b/backend/internal/domain/discovery/service_test.go @@ -853,6 +853,40 @@ func TestRunLLMEnrichOne_LLMErrorMarksFailed(t *testing.T) { } } +// TestRunLLMEnrichOne_SkipsCacheWhenYearZero: a row with no start_datum (year=0) +// must bypass the enrichment cache. The row is enriched twice and the test asserts +// that the cache getter is never invoked; cache writes and LLM calls are not checked. +func TestRunLLMEnrichOne_SkipsCacheWhenYearZero(t *testing.T) { + rowID := uuid.New() + cacheHits := 0 + + repo := &mockRepo{ + getDiscoveredFn: func(_ context.Context, _ uuid.UUID) (DiscoveredMarket, error) { + return DiscoveredMarket{ + ID: rowID, + MarktName: "Testmarkt", + Stadt: "Dresden", + NameNormalized: "testmarkt", + StartDatum: nil, // no date → year = 0 + }, nil + }, + getCacheFn: func(_ string) (enrich.Enrichment, bool, error) { + cacheHits++ + return enrich.Enrichment{}, false, nil + }, + } + llm := &stubLLMEnricher{result: enrich.Enrichment{Category: catMittelaltermarkt}} + svc := NewService(repo, nil, noopLinkVerifier{}, noopMarketCreator{}, nil, llm, nil) + + _, _ = svc.RunLLMEnrichOne(context.Background(), rowID) + _, _ = svc.RunLLMEnrichOne(context.Background(), rowID) + + // cacheHits counts getCacheFn invocations, not hits; with year=0 GetEnrichmentCache must never be called. + if cacheHits != 0 { + t.Errorf("expected 0 cache hits for date-less row, got %d", cacheHits) + } +} + // TestRunCrawlEnrichAll_EmptyQueueNoOp: nothing pending, zero summary, no writes. func TestRunCrawlEnrichAll_EmptyQueueNoOp(t *testing.T) { var writes int