diff --git a/backend/internal/domain/market/research/assets/researcher_schema_ollama.json b/backend/internal/domain/market/research/assets/researcher_schema_ollama.json new file mode 100644 index 0000000..6336924 --- /dev/null +++ b/backend/internal/domain/market/research/assets/researcher_schema_ollama.json @@ -0,0 +1,79 @@ +{ + "type": "object", + "required": ["markt_name", "recherche_datum", "status", "quellen_gesamt", "felder"], + "properties": { + "markt_name": {"type": "string"}, + "recherche_datum": {"type": "string"}, + "status": {"type": "string", "enum": ["bestaetigt", "unklar", "vorjahr_unbestaetigt", "abgesagt"]}, + "quellen_gesamt": {"type": "array", "items": {"type": "string"}}, + "felder": { + "type": "object", + "required": ["website", "strasse", "plz", "stadt", "bundesland", "land", "veranstalter", "start_datum", "end_datum", "oeffnungszeiten", "eintrittspreise", "bild_url"], + "properties": { + "website": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}}, + "strasse": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}}, + "plz": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}}, + "stadt": 
{"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}}, + "bundesland": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}}, + "land": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}}, + "veranstalter":{"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}}, + "bild_url": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}}, + "start_datum": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, 
"quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}}, + "end_datum": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}}, + "oeffnungszeiten": { + "type": "object", + "required": ["wert", "quellen", "extraktion", "hinweis"], + "properties": { + "wert": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "required": ["datum_von", "datum_bis", "von", "bis"], + "properties": { + "datum_von": {"anyOf": [{"type": "string"}, {"type": "null"}]}, + "datum_bis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, + "von": {"anyOf": [{"type": "string"}, {"type": "null"}]}, + "bis": {"anyOf": [{"type": "string"}, {"type": "null"}]} + } + } + }, + {"type": "null"} + ] + }, + "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, + "quellen": {"type": "array", "items": {"type": "string"}}, + "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]} + } + }, + "eintrittspreise": { + "type": "object", + "required": ["wert", "quellen", "extraktion", "hinweis"], + "properties": { + "wert": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "object", + "required": ["name", "betrag", "waehrung"], + "properties": { + "name": {"type": "string"}, + "betrag": {"type": "number"}, + "waehrung": {"type": "string", "enum": ["EUR", "CHF"]} + } + } + }, + {"type": "null"} + ] + }, + "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, + "quellen": {"type": "array", "items": {"type": "string"}}, + "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]} + } + } + } + } + } +} diff --git a/backend/internal/domain/market/research/orchestrator.go 
b/backend/internal/domain/market/research/orchestrator.go index 5237625..3bfa970 100644 --- a/backend/internal/domain/market/research/orchestrator.go +++ b/backend/internal/domain/market/research/orchestrator.go @@ -6,6 +6,7 @@ import ( "errors" "fmt" "net/url" + "strings" "time" "marktvogt.de/backend/internal/pkg/ai" @@ -75,19 +76,37 @@ func (o *Orchestrator) Run(ctx context.Context, in Input) (Output, error) { } // 4. LLM call with one retry on schema violation - resp, err := callLLM(ctx, o.AI, userPrompt, SchemaJSON) + // Providers with constrained decoding (Ollama) use a simplified schema + // without $defs, union types, or patterns — and are validated against that + // same simplified schema. Providers that embed the schema in the prompt + // (Mistral) get the full schema for both generation and validation. + constraintSchema := SchemaJSON + validationSchema := SchemaJSON + if o.AI.SupportsJSONSchema() { + constraintSchema = ConstraintSchemaJSON + validationSchema = ConstraintSchemaJSON + } + validate := func(content string) error { + normalized := normalizeNullStrings(content) + return ai.ValidateSchema(validationSchema, []byte(normalized)) + } + resp, err := callLLM(ctx, o.AI, userPrompt, constraintSchema) if err == nil { - if verr := ai.ValidateSchema(SchemaJSON, []byte(resp.Content)); verr != nil { + if verr := validate(resp.Content); verr != nil { err = &ai.ProviderError{Code: ai.ErrSchemaViolation, Retryable: true, RawOutput: resp.Content, Inner: verr} + } else { + resp.Content = normalizeNullStrings(resp.Content) } } if err != nil { var pe *ai.ProviderError if errors.As(err, &pe) && pe.Code == ai.ErrSchemaViolation { - resp, err = callLLM(ctx, o.AI, userPrompt+"\n\nYour previous response failed schema validation. Re-emit the JSON strictly matching the schema.", SchemaJSON) + resp, err = callLLM(ctx, o.AI, userPrompt+"\n\nYour previous response failed schema validation. 
// normalizeNullStrings returns content with any surrounding markdown code
// fence removed and every JSON string value that is exactly "null" replaced
// by a JSON null.
//
// If the unfenced content is not a single valid JSON document it is returned
// as-is (still unfenced) so the caller's schema validation can report the
// problem. Apart from the "null" rewrite the round-trip is kept faithful:
// numeric literals are preserved verbatim and '&', '<' and '>' are not
// HTML-escaped, so URLs with query strings survive unchanged. Object keys are
// emitted in sorted order and insignificant whitespace is dropped.
func normalizeNullStrings(content string) string {
	content = stripMarkdownFences(content)

	// json.Valid rejects trailing data, which Decoder.Decode on its own would
	// silently ignore after the first value.
	if !json.Valid([]byte(content)) {
		return content
	}

	dec := json.NewDecoder(strings.NewReader(content))
	// Keep numbers as json.Number instead of float64 so large integers and
	// literals such as 12.50 are re-emitted exactly as received.
	dec.UseNumber()
	var v any
	if err := dec.Decode(&v); err != nil {
		return content
	}

	var out strings.Builder
	enc := json.NewEncoder(&out)
	enc.SetEscapeHTML(false)
	if err := enc.Encode(replaceNullStrings(v)); err != nil {
		return content
	}
	// Encoder.Encode terminates the value with a newline; drop it.
	return strings.TrimSuffix(out.String(), "\n")
}

// replaceNullStrings walks a decoded JSON value and returns it with every
// string equal to "null" replaced by nil. Maps and slices are updated in
// place and returned; object keys are never altered. Any other value
// (numbers, booleans, nil) is returned unchanged.
func replaceNullStrings(v any) any {
	switch val := v.(type) {
	case string:
		if val == "null" {
			return nil
		}
	case map[string]any:
		// Assigning to existing keys while ranging over a map is safe in Go.
		for k, elem := range val {
			val[k] = replaceNullStrings(elem)
		}
	case []any:
		for i, elem := range val {
			val[i] = replaceNullStrings(elem)
		}
	}
	return v
}

// stripMarkdownFences trims surrounding whitespace from s and, when s starts
// with a ``` fence, removes the opening fence (including a language tag) and
// a trailing ``` marker. Input that does not start with a fence is only
// whitespace-trimmed.
func stripMarkdownFences(s string) string {
	s = strings.TrimSpace(s)
	if !strings.HasPrefix(s, "```") {
		return s
	}
	if _, rest, ok := strings.Cut(s, "\n"); ok {
		// Drop the whole opening line, which holds the marker and any tag.
		s = rest
	} else {
		// Single-line fence: there is no opening line to drop, so strip the
		// marker and an optional "json" tag directly.
		s = strings.TrimPrefix(s, "```")
		s = strings.TrimPrefix(s, "json")
	}
	s = strings.TrimSuffix(s, "```")
	return strings.TrimSpace(s)
}
map[string]struct{}{} for _, raw := range urls { diff --git a/backend/internal/domain/market/research/schema.go b/backend/internal/domain/market/research/schema.go index cf0ccd5..dffd4ae 100644 --- a/backend/internal/domain/market/research/schema.go +++ b/backend/internal/domain/market/research/schema.go @@ -4,3 +4,11 @@ import _ "embed" //go:embed assets/researcher_schema.json var SchemaJSON []byte + +// ConstraintSchemaJSON is a simplified, flat JSON Schema for providers that +// support constrained decoding but cannot handle $defs, union types, or +// pattern constraints (e.g. Ollama with llama.cpp grammar generation). +// Responses from those providers are also validated against this schema; +// SchemaJSON is used for generation and validation with all other providers. +// +//go:embed assets/researcher_schema_ollama.json +var ConstraintSchemaJSON []byte