fix(research): use anyOf for nullable fields in Ollama constraint schema

Ollama's llama.cpp grammar converter supports anyOf with primitive
null — use it for all nullable wert/hinweis fields instead of
type:string-only, so constrained decoding emits JSON null directly.
This also fixes the orchestrator test fixture which uses JSON null
for optional wert fields.
This commit is contained in:
2026-04-24 18:18:05 +02:00
parent 67b2eb5d74
commit c18babce5b
3 changed files with 165 additions and 4 deletions

View File

@@ -0,0 +1,79 @@
{
"type": "object",
"required": ["markt_name", "recherche_datum", "status", "quellen_gesamt", "felder"],
"properties": {
"markt_name": {"type": "string"},
"recherche_datum": {"type": "string"},
"status": {"type": "string", "enum": ["bestaetigt", "unklar", "vorjahr_unbestaetigt", "abgesagt"]},
"quellen_gesamt": {"type": "array", "items": {"type": "string"}},
"felder": {
"type": "object",
"required": ["website", "strasse", "plz", "stadt", "bundesland", "land", "veranstalter", "start_datum", "end_datum", "oeffnungszeiten", "eintrittspreise", "bild_url"],
"properties": {
"website": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
"strasse": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
"plz": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
"stadt": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
"bundesland": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
"land": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
"veranstalter":{"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
"bild_url": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
"start_datum": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
"end_datum": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
"oeffnungszeiten": {
"type": "object",
"required": ["wert", "quellen", "extraktion", "hinweis"],
"properties": {
"wert": {
"anyOf": [
{
"type": "array",
"items": {
"type": "object",
"required": ["datum_von", "datum_bis", "von", "bis"],
"properties": {
"datum_von": {"anyOf": [{"type": "string"}, {"type": "null"}]},
"datum_bis": {"anyOf": [{"type": "string"}, {"type": "null"}]},
"von": {"anyOf": [{"type": "string"}, {"type": "null"}]},
"bis": {"anyOf": [{"type": "string"}, {"type": "null"}]}
}
}
},
{"type": "null"}
]
},
"hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]},
"quellen": {"type": "array", "items": {"type": "string"}},
"extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}
}
},
"eintrittspreise": {
"type": "object",
"required": ["wert", "quellen", "extraktion", "hinweis"],
"properties": {
"wert": {
"anyOf": [
{
"type": "array",
"items": {
"type": "object",
"required": ["name", "betrag", "waehrung"],
"properties": {
"name": {"type": "string"},
"betrag": {"type": "number"},
"waehrung": {"type": "string", "enum": ["EUR", "CHF"]}
}
}
},
{"type": "null"}
]
},
"hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]},
"quellen": {"type": "array", "items": {"type": "string"}},
"extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}
}
}
}
}
}
}

View File

@@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"net/url"
"strings"
"time"
"marktvogt.de/backend/internal/pkg/ai"
@@ -75,19 +76,37 @@ func (o *Orchestrator) Run(ctx context.Context, in Input) (Output, error) {
}
// 4. LLM call with one retry on schema violation
resp, err := callLLM(ctx, o.AI, userPrompt, SchemaJSON)
// Providers with constrained decoding (Ollama) use a simplified schema
// without $defs, union types, or patterns — and are validated against that
// same simplified schema. Providers that embed the schema in the prompt
// (Mistral) get the full schema for both generation and validation.
constraintSchema := SchemaJSON
validationSchema := SchemaJSON
if o.AI.SupportsJSONSchema() {
constraintSchema = ConstraintSchemaJSON
validationSchema = ConstraintSchemaJSON
}
validate := func(content string) error {
normalized := normalizeNullStrings(content)
return ai.ValidateSchema(validationSchema, []byte(normalized))
}
resp, err := callLLM(ctx, o.AI, userPrompt, constraintSchema)
if err == nil {
if verr := ai.ValidateSchema(SchemaJSON, []byte(resp.Content)); verr != nil {
if verr := validate(resp.Content); verr != nil {
err = &ai.ProviderError{Code: ai.ErrSchemaViolation, Retryable: true, RawOutput: resp.Content, Inner: verr}
} else {
resp.Content = normalizeNullStrings(resp.Content)
}
}
if err != nil {
var pe *ai.ProviderError
if errors.As(err, &pe) && pe.Code == ai.ErrSchemaViolation {
resp, err = callLLM(ctx, o.AI, userPrompt+"\n\nYour previous response failed schema validation. Re-emit the JSON strictly matching the schema.", SchemaJSON)
resp, err = callLLM(ctx, o.AI, userPrompt+"\n\nYour previous response failed schema validation. Re-emit the JSON strictly matching the schema.", constraintSchema)
if err == nil {
if verr := ai.ValidateSchema(SchemaJSON, []byte(resp.Content)); verr != nil {
if verr := validate(resp.Content); verr != nil {
err = &ai.ProviderError{Code: ai.ErrSchemaViolation, Retryable: false, RawOutput: resp.Content, Inner: verr}
} else {
resp.Content = normalizeNullStrings(resp.Content)
}
}
}
@@ -145,6 +164,61 @@ func buildUserPrompt(in Input, pages []Page) (string, error) {
return string(buf), nil
}
// normalizeNullStrings returns content with every string value that is exactly
// "null" replaced by a JSON null, at any nesting depth.
//
// Markdown code fences (```json ... ```) are stripped first via
// stripMarkdownFences, because some models emit them even when constrained
// decoding is requested. If the remaining text is not exactly one valid JSON
// document, or cannot be re-encoded, the fence-stripped text is returned
// unchanged so that the caller's schema validation reports the real problem.
//
// On success the document is re-serialized by encoding/json, so insignificant
// whitespace is dropped and object keys are emitted in sorted order. Numbers
// are decoded as json.Number and written back with their original digits,
// which avoids the float64 round trip that would alter large integers.
//
// NOTE(review): a field whose legitimate value is the text "null" is converted
// as well; confirm that no schema field can carry that value.
func normalizeNullStrings(content string) string {
	content = stripMarkdownFences(content)

	dec := json.NewDecoder(strings.NewReader(content))
	dec.UseNumber()

	var doc any
	if err := dec.Decode(&doc); err != nil {
		return content
	}
	// Decode stops after the first value. Reject trailing data so that input
	// json.Unmarshal would have refused is still passed through untouched.
	if rest := content[int(dec.InputOffset()):]; strings.TrimSpace(rest) != "" {
		return content
	}

	out, err := json.Marshal(replaceNullStrings(doc))
	if err != nil {
		return content
	}
	return string(out)
}
// replaceNullStrings walks a decoded JSON value and swaps every string equal
// to "null" for a nil value. Objects (map[string]any) and arrays ([]any) are
// rewritten in place and returned; any other value is returned untouched.
func replaceNullStrings(v any) any {
	if text, isString := v.(string); isString {
		if text == "null" {
			return nil
		}
		return text
	}

	if object, isObject := v.(map[string]any); isObject {
		for key := range object {
			object[key] = replaceNullStrings(object[key])
		}
		return object
	}

	if list, isList := v.([]any); isList {
		for idx := range list {
			list[idx] = replaceNullStrings(list[idx])
		}
		return list
	}

	return v
}
// stripMarkdownFences removes a surrounding markdown code fence from s and
// returns the trimmed payload. Input that does not start with ``` (after
// trimming whitespace) is returned with only the surrounding whitespace
// removed.
//
// For a multi-line fence the whole opening line is dropped, including any
// language tag such as "json". For a fence written on a single line only the
// opening backticks are removed, because a language tag cannot be told apart
// from the payload there.
func stripMarkdownFences(s string) string {
	s = strings.TrimSpace(s)
	if !strings.HasPrefix(s, "```") {
		return s
	}

	if _, body, found := strings.Cut(s, "\n"); found {
		s = body
	} else {
		// Single-line fence: previously the opening backticks were left in
		// place, which made the payload unparseable.
		s = strings.TrimPrefix(s, "```")
	}

	s = strings.TrimSuffix(s, "```")
	return strings.TrimSpace(s)
}
func countDomains(urls []string) int {
seen := map[string]struct{}{}
for _, raw := range urls {

View File

@@ -4,3 +4,11 @@ import _ "embed"
// SchemaJSON holds the researcher JSON Schema embedded from
// assets/researcher_schema.json.
//
//go:embed assets/researcher_schema.json
var SchemaJSON []byte
// ConstraintSchemaJSON is a simplified, flat JSON Schema for providers that
// support constrained decoding (e.g. Ollama with llama.cpp grammar
// generation). It is meant to avoid $defs and pattern constraints, and to
// express nullable fields as anyOf with a primitive null.
// When the provider reports JSON Schema support, the orchestrator uses this
// schema for both generation and post-hoc validation; other providers use
// SchemaJSON for both.
//
//go:embed assets/researcher_schema_ollama.json
var ConstraintSchemaJSON []byte