fix(discovery): use JSON schema instead of JSONMode for LLM enricher
Replaces JSONMode:true with an embedded enricher_schema.json so Gemini
returns structured output against a typed schema, preventing empty {} responses.
Also logs a warning when the LLM call succeeds but category, opening_hours, and description all come back empty.
This commit is contained in:
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"type": "object",
|
||||
"required": ["category", "opening_hours", "description"],
|
||||
"properties": {
|
||||
"category": {
|
||||
"type": "string",
|
||||
"description": "Market category. Use empty string if unclear.",
|
||||
"enum": ["mittelaltermarkt", "weihnachtsmarkt", "ritterfest",
|
||||
"handwerkermarkt", "schlossfest", "ritterturnier",
|
||||
"kirchweih", ""]
|
||||
},
|
||||
"opening_hours": {
|
||||
"type": "string",
|
||||
"description": "Brief German summary, e.g. Sa-So 10:00-18:00. Empty if unclear."
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"description": "1-3 sentences in German. Empty if insufficient source info."
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@ package enrich
|
||||
|
||||
import (
|
||||
"context"
|
||||
_ "embed"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -12,6 +13,9 @@ import (
|
||||
"marktvogt.de/backend/internal/pkg/ai"
|
||||
)
|
||||
|
||||
// enricherSchemaJSON holds the raw bytes of assets/enricher_schema.json,
// embedded into the binary at build time. EnrichMissing passes it to the
// provider as ChatRequest.JSONSchema.
//go:embed assets/enricher_schema.json
|
||||
var enricherSchemaJSON []byte
|
||||
|
||||
// maxScrapeURLs limits how many quellen we pull per enrichment call.
|
||||
// Real-world queue rows rarely have more than 2-3 sources; 5 is a safe cap
|
||||
// that prevents a worst-case row from burning minutes on fetches.
|
||||
@@ -88,7 +92,7 @@ func (e *ProviderLLMEnricher) EnrichMissing(ctx context.Context, req LLMRequest)
|
||||
resp, err := e.AI.Chat(ctx, &ai.ChatRequest{
|
||||
SystemPrompt: systemPrompt,
|
||||
UserMessage: userPrompt,
|
||||
JSONMode: true,
|
||||
JSONSchema: enricherSchemaJSON,
|
||||
Grounded: true,
|
||||
CallType: "enrich_b",
|
||||
Temperature: 0.1,
|
||||
@@ -102,6 +106,11 @@ func (e *ProviderLLMEnricher) EnrichMissing(ctx context.Context, req LLMRequest)
|
||||
return Enrichment{}, fmt.Errorf("parse llm response: %w (content=%q)", err, resp.Content)
|
||||
}
|
||||
|
||||
if parsed.Category == "" && parsed.OpeningHours == "" && parsed.Description == "" {
|
||||
slog.WarnContext(ctx, "llm enrich: all fields empty despite successful call",
|
||||
"markt", req.MarktName, "stadt", req.Stadt)
|
||||
}
|
||||
|
||||
// Build the Enrichment payload with only the fields the model produced.
|
||||
now := time.Now().UTC()
|
||||
out := Enrichment{
|
||||
|
||||
@@ -44,8 +44,11 @@ func TestLLMEnricher_HappyPath(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if stub.seen.JSONMode != true {
|
||||
t.Fatalf("JSONMode must be set")
|
||||
if stub.seen.JSONMode {
|
||||
t.Fatal("JSONMode must not be set when JSONSchema is used")
|
||||
}
|
||||
if len(stub.seen.JSONSchema) == 0 {
|
||||
t.Fatal("JSONSchema must be set")
|
||||
}
|
||||
|
||||
// Result carries LLM fields and provenance llm.
|
||||
@@ -148,6 +151,28 @@ func TestLLMEnricher_EmptyFieldsNoProvenance(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestLLMEnricher_SendsJSONSchema runs EnrichMissing against a stub provider
// and a stub scraper, then checks the request captured on the stub: JSONMode
// must be false and JSONSchema must be non-empty.
// NOTE(review): stub.seen is presumably the last ChatRequest the stub
// received — confirm against stubProvider.
func TestLLMEnricher_SendsJSONSchema(t *testing.T) {
|
||||
stub := &stubProvider{
|
||||
content: `{"category":"mittelaltermarkt","opening_hours":"Sa-So 10-18","description":"Markt."}`,
|
||||
}
|
||||
scraper := &stubScraper{responses: map[string]string{"https://example.com": "some content"}}
|
||||
e := NewLLMEnricher(stub, scraper)
|
||||
_, err := e.EnrichMissing(context.Background(), LLMRequest{
|
||||
MarktName: "Testmarkt",
|
||||
|
||||
Stadt: "Dresden",
|
||||
Quellen: []string{"https://example.com"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
// t.Error (not t.Fatal) is used below so both request checks are reported
|
||||
// in a single run.
if stub.seen.JSONMode {
|
||||
t.Error("expected JSONMode to be false when schema is used")
|
||||
}
|
||||
if len(stub.seen.JSONSchema) == 0 {
|
||||
t.Error("expected JSONSchema to be set")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLLMEnricher_CapsURLsAtFive(t *testing.T) {
|
||||
// Supply 7 URLs; only the first 5 should be fetched.
|
||||
urls := []string{"u1", "u2", "u3", "u4", "u5", "u6", "u7"}
|
||||
|
||||
Reference in New Issue
Block a user