From f151c0865e447903c1111fa9ce1d90de36c2c2fe Mon Sep 17 00:00:00 2001 From: vikingowl Date: Sat, 25 Apr 2026 17:48:48 +0200 Subject: [PATCH] fix(discovery): use JSON schema instead of JSONMode for LLM enricher Replaces JSONMode:true with an embedded enricher_schema.json so Gemini returns structured output against a typed schema, preventing empty {} responses. Adds an all-empty warning when the LLM returns a valid but blank payload. --- .../enrich/assets/enricher_schema.json | 21 ++++++++++++++ .../domain/discovery/enrich/llm_enricher.go | 11 ++++++- .../discovery/enrich/llm_enricher_test.go | 29 +++++++++++++++++-- 3 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 backend/internal/domain/discovery/enrich/assets/enricher_schema.json diff --git a/backend/internal/domain/discovery/enrich/assets/enricher_schema.json b/backend/internal/domain/discovery/enrich/assets/enricher_schema.json new file mode 100644 index 0000000..66c7891 --- /dev/null +++ b/backend/internal/domain/discovery/enrich/assets/enricher_schema.json @@ -0,0 +1,21 @@ +{ + "type": "object", + "required": ["category", "opening_hours", "description"], + "properties": { + "category": { + "type": "string", + "description": "Market category. Use empty string if unclear.", + "enum": ["mittelaltermarkt", "weihnachtsmarkt", "ritterfest", + "handwerkermarkt", "schlossfest", "ritterturnier", + "kirchweih", ""] + }, + "opening_hours": { + "type": "string", + "description": "Brief German summary, e.g. Sa-So 10:00-18:00. Empty if unclear." + }, + "description": { + "type": "string", + "description": "1-3 sentences in German. Empty if insufficient source info." + } + } +} diff --git a/backend/internal/domain/discovery/enrich/llm_enricher.go b/backend/internal/domain/discovery/enrich/llm_enricher.go index 06abcb0..81e1378 100644 --- a/backend/internal/domain/discovery/enrich/llm_enricher.go +++ b/backend/internal/domain/discovery/enrich/llm_enricher.go @@ -2,6 +2,7 @@ package enrich import ( "context" + _ "embed" "encoding/json" "errors" "fmt" @@ -12,6 +13,9 @@ import ( "marktvogt.de/backend/internal/pkg/ai" ) +//go:embed assets/enricher_schema.json +var enricherSchemaJSON []byte + // maxScrapeURLs limits how many quellen we pull per enrichment call. // Real-world queue rows rarely have more than 2-3 sources; 5 is a safe cap // that prevents a worst-case row from burning minutes on fetches. @@ -88,7 +92,7 @@ func (e *ProviderLLMEnricher) EnrichMissing(ctx context.Context, req LLMRequest) resp, err := e.AI.Chat(ctx, &ai.ChatRequest{ SystemPrompt: systemPrompt, UserMessage: userPrompt, - JSONMode: true, + JSONSchema: enricherSchemaJSON, Grounded: true, CallType: "enrich_b", Temperature: 0.1, @@ -102,6 +106,11 @@ func (e *ProviderLLMEnricher) EnrichMissing(ctx context.Context, req LLMRequest) return Enrichment{}, fmt.Errorf("parse llm response: %w (content=%q)", err, resp.Content) } + if parsed.Category == "" && parsed.OpeningHours == "" && parsed.Description == "" { + slog.WarnContext(ctx, "llm enrich: all fields empty despite successful call", + "markt", req.MarktName, "stadt", req.Stadt) + } + // Build the Enrichment payload with only the fields the model produced. now := time.Now().UTC() out := Enrichment{ diff --git a/backend/internal/domain/discovery/enrich/llm_enricher_test.go b/backend/internal/domain/discovery/enrich/llm_enricher_test.go index 18b7d71..20a4894 100644 --- a/backend/internal/domain/discovery/enrich/llm_enricher_test.go +++ b/backend/internal/domain/discovery/enrich/llm_enricher_test.go @@ -44,8 +44,11 @@ func TestLLMEnricher_HappyPath(t *testing.T) { if err != nil { t.Fatalf("unexpected error: %v", err) } - if stub.seen.JSONMode != true { - t.Fatalf("JSONMode must be set") + if stub.seen.JSONMode { + t.Fatal("JSONMode must not be set when JSONSchema is used") + } + if len(stub.seen.JSONSchema) == 0 { + t.Fatal("JSONSchema must be set") } // Result carries LLM fields and provenance llm. @@ -148,6 +151,28 @@ func TestLLMEnricher_EmptyFieldsNoProvenance(t *testing.T) { } } +func TestLLMEnricher_SendsJSONSchema(t *testing.T) { + stub := &stubProvider{ + content: `{"category":"mittelaltermarkt","opening_hours":"Sa-So 10-18","description":"Markt."}`, + } + scraper := &stubScraper{responses: map[string]string{"https://example.com": "some content"}} + e := NewLLMEnricher(stub, scraper) + _, err := e.EnrichMissing(context.Background(), LLMRequest{ + MarktName: "Testmarkt", + Stadt: "Dresden", + Quellen: []string{"https://example.com"}, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if stub.seen.JSONMode { + t.Error("expected JSONMode to be false when schema is used") + } + if len(stub.seen.JSONSchema) == 0 { + t.Error("expected JSONSchema to be set") + } +} + func TestLLMEnricher_CapsURLsAtFive(t *testing.T) { // Supply 7 URLs; only the first 5 should be fetched. urls := []string{"u1", "u2", "u3", "u4", "u5", "u6", "u7"}