fix(research): use anyOf for nullable fields in Ollama constraint schema
Ollama's llama.cpp grammar converter supports anyOf with primitive null — use it for all nullable wert/hinweis fields instead of type:string-only, so constrained decoding emits JSON null directly. This also fixes the orchestrator test fixture which uses JSON null for optional wert fields.
This commit is contained in:
@@ -0,0 +1,79 @@
|
||||
{
|
||||
"type": "object",
|
||||
"required": ["markt_name", "recherche_datum", "status", "quellen_gesamt", "felder"],
|
||||
"properties": {
|
||||
"markt_name": {"type": "string"},
|
||||
"recherche_datum": {"type": "string"},
|
||||
"status": {"type": "string", "enum": ["bestaetigt", "unklar", "vorjahr_unbestaetigt", "abgesagt"]},
|
||||
"quellen_gesamt": {"type": "array", "items": {"type": "string"}},
|
||||
"felder": {
|
||||
"type": "object",
|
||||
"required": ["website", "strasse", "plz", "stadt", "bundesland", "land", "veranstalter", "start_datum", "end_datum", "oeffnungszeiten", "eintrittspreise", "bild_url"],
|
||||
"properties": {
|
||||
"website": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
|
||||
"strasse": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
|
||||
"plz": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
|
||||
"stadt": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
|
||||
"bundesland": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
|
||||
"land": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
|
||||
"veranstalter":{"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
|
||||
"bild_url": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
|
||||
"start_datum": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
|
||||
"end_datum": {"type": "object", "required": ["wert", "quellen", "extraktion", "hinweis"], "properties": {"wert": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]}, "quellen": {"type": "array", "items": {"type": "string"}}, "extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}}},
|
||||
"oeffnungszeiten": {
|
||||
"type": "object",
|
||||
"required": ["wert", "quellen", "extraktion", "hinweis"],
|
||||
"properties": {
|
||||
"wert": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": ["datum_von", "datum_bis", "von", "bis"],
|
||||
"properties": {
|
||||
"datum_von": {"anyOf": [{"type": "string"}, {"type": "null"}]},
|
||||
"datum_bis": {"anyOf": [{"type": "string"}, {"type": "null"}]},
|
||||
"von": {"anyOf": [{"type": "string"}, {"type": "null"}]},
|
||||
"bis": {"anyOf": [{"type": "string"}, {"type": "null"}]}
|
||||
}
|
||||
}
|
||||
},
|
||||
{"type": "null"}
|
||||
]
|
||||
},
|
||||
"hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]},
|
||||
"quellen": {"type": "array", "items": {"type": "string"}},
|
||||
"extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}
|
||||
}
|
||||
},
|
||||
"eintrittspreise": {
|
||||
"type": "object",
|
||||
"required": ["wert", "quellen", "extraktion", "hinweis"],
|
||||
"properties": {
|
||||
"wert": {
|
||||
"anyOf": [
|
||||
{
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"required": ["name", "betrag", "waehrung"],
|
||||
"properties": {
|
||||
"name": {"type": "string"},
|
||||
"betrag": {"type": "number"},
|
||||
"waehrung": {"type": "string", "enum": ["EUR", "CHF"]}
|
||||
}
|
||||
}
|
||||
},
|
||||
{"type": "null"}
|
||||
]
|
||||
},
|
||||
"hinweis": {"anyOf": [{"type": "string"}, {"type": "null"}]},
|
||||
"quellen": {"type": "array", "items": {"type": "string"}},
|
||||
"extraktion": {"type": "string", "enum": ["direkt", "kombiniert"]}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"marktvogt.de/backend/internal/pkg/ai"
|
||||
@@ -75,19 +76,37 @@ func (o *Orchestrator) Run(ctx context.Context, in Input) (Output, error) {
|
||||
}
|
||||
|
||||
// 4. LLM call with one retry on schema violation
|
||||
resp, err := callLLM(ctx, o.AI, userPrompt, SchemaJSON)
|
||||
// Providers with constrained decoding (Ollama) use a simplified schema
|
||||
// without $defs, union types, or patterns — and are validated against that
|
||||
// same simplified schema. Providers that embed the schema in the prompt
|
||||
// (Mistral) get the full schema for both generation and validation.
|
||||
constraintSchema := SchemaJSON
|
||||
validationSchema := SchemaJSON
|
||||
if o.AI.SupportsJSONSchema() {
|
||||
constraintSchema = ConstraintSchemaJSON
|
||||
validationSchema = ConstraintSchemaJSON
|
||||
}
|
||||
validate := func(content string) error {
|
||||
normalized := normalizeNullStrings(content)
|
||||
return ai.ValidateSchema(validationSchema, []byte(normalized))
|
||||
}
|
||||
resp, err := callLLM(ctx, o.AI, userPrompt, constraintSchema)
|
||||
if err == nil {
|
||||
if verr := ai.ValidateSchema(SchemaJSON, []byte(resp.Content)); verr != nil {
|
||||
if verr := validate(resp.Content); verr != nil {
|
||||
err = &ai.ProviderError{Code: ai.ErrSchemaViolation, Retryable: true, RawOutput: resp.Content, Inner: verr}
|
||||
} else {
|
||||
resp.Content = normalizeNullStrings(resp.Content)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
var pe *ai.ProviderError
|
||||
if errors.As(err, &pe) && pe.Code == ai.ErrSchemaViolation {
|
||||
resp, err = callLLM(ctx, o.AI, userPrompt+"\n\nYour previous response failed schema validation. Re-emit the JSON strictly matching the schema.", SchemaJSON)
|
||||
resp, err = callLLM(ctx, o.AI, userPrompt+"\n\nYour previous response failed schema validation. Re-emit the JSON strictly matching the schema.", constraintSchema)
|
||||
if err == nil {
|
||||
if verr := ai.ValidateSchema(SchemaJSON, []byte(resp.Content)); verr != nil {
|
||||
if verr := validate(resp.Content); verr != nil {
|
||||
err = &ai.ProviderError{Code: ai.ErrSchemaViolation, Retryable: false, RawOutput: resp.Content, Inner: verr}
|
||||
} else {
|
||||
resp.Content = normalizeNullStrings(resp.Content)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -145,6 +164,61 @@ func buildUserPrompt(in Input, pages []Page) (string, error) {
|
||||
return string(buf), nil
|
||||
}
|
||||
|
||||
// normalizeNullStrings replaces the string literal "null" with JSON null
|
||||
// throughout any JSON value. Constrained-decoding providers that use a
|
||||
// simplified schema (type: string, no union types) emit "null" as a string
|
||||
// when they intend the absence of a value. The full schema expects JSON null
|
||||
// for nullable fields, so we normalise before validation.
|
||||
// It also strips markdown code fences (```json ... ```) that some models
|
||||
// emit even when constrained decoding is requested.
|
||||
func normalizeNullStrings(content string) string {
|
||||
content = stripMarkdownFences(content)
|
||||
var v any
|
||||
if err := json.Unmarshal([]byte(content), &v); err != nil {
|
||||
return content
|
||||
}
|
||||
normalized := replaceNullStrings(v)
|
||||
b, err := json.Marshal(normalized)
|
||||
if err != nil {
|
||||
return content
|
||||
}
|
||||
return string(b)
|
||||
}
|
||||
|
||||
// replaceNullStrings walks a decoded JSON value and turns every string that
// is exactly "null" into a nil (JSON null).
//
// Objects (map[string]any) and arrays ([]any) are traversed recursively and
// updated in place; the same container is returned. Any other value, including
// strings that are not exactly "null", is returned unchanged.
func replaceNullStrings(v any) any {
	switch node := v.(type) {
	case map[string]any:
		for key := range node {
			node[key] = replaceNullStrings(node[key])
		}
		return node
	case []any:
		for idx := range node {
			node[idx] = replaceNullStrings(node[idx])
		}
		return node
	case string:
		if node != "null" {
			return node
		}
		return nil
	}
	return v
}
|
||||
|
||||
// stripMarkdownFences removes a surrounding markdown code fence from s and
// returns the trimmed payload.
//
// Leading and trailing whitespace is always trimmed. If the trimmed text does
// not start with "```", it is returned as is. Otherwise:
//   - multi-line form ("```json\n{...}\n```"): the entire first line (fence
//     plus any language tag) is dropped;
//   - single-line form ("```json{...}```" or "``` {...} ```"): the opening
//     backticks and an optional case-insensitive "json" tag are dropped.
//
// In both forms a trailing "```" is removed and the result is trimmed again.
func stripMarkdownFences(s string) string {
	s = strings.TrimSpace(s)
	if !strings.HasPrefix(s, "```") {
		return s
	}

	if _, rest, found := strings.Cut(s, "\n"); found {
		s = rest
	} else {
		// No newline after the opening fence: strip the fence itself so the
		// payload is not left with a "```json" prefix that breaks JSON parsing.
		s = strings.TrimPrefix(s, "```")
		if len(s) >= 4 && strings.EqualFold(s[:4], "json") {
			s = s[4:]
		}
	}

	s = strings.TrimSuffix(s, "```")
	return strings.TrimSpace(s)
}
|
||||
|
||||
func countDomains(urls []string) int {
|
||||
seen := map[string]struct{}{}
|
||||
for _, raw := range urls {
|
||||
|
||||
@@ -4,3 +4,11 @@ import _ "embed"
|
||||
|
||||
//go:embed assets/researcher_schema.json
|
||||
var SchemaJSON []byte
|
||||
|
||||
// ConstraintSchemaJSON is a simplified, flat JSON Schema for providers that
|
||||
// support constrained decoding but cannot handle $defs, union types, or
|
||||
// pattern constraints (e.g. Ollama with llama.cpp grammar generation).
|
||||
// Responses from such providers are also validated against this schema
// (see Orchestrator.Run); all other providers use SchemaJSON for both.
|
||||
//
|
||||
//go:embed assets/researcher_schema_ollama.json
|
||||
var ConstraintSchemaJSON []byte
|
||||
|
||||
Reference in New Issue
Block a user