fix(dedup): wire merge advisor JSON schema + flexible field_merges parser

Gemini returned field_merges as an array without structure constraint,
causing json.Unmarshal to fail with "cannot unmarshal array into Go struct
field of type map[string]mergeFieldDecision".

- Pass merge_advisor_schema.json via JSONSchema instead of bare JSONMode
- Add parseFieldMerges() that accepts both object and array LLM formats
- Validate target_id is one of the two input market IDs after parsing
- Fix schemaFromMap: minimum/maximum are supported by genai.Schema v1.54
This commit is contained in:
2026-04-25 21:33:58 +02:00
parent 3d922e50bf
commit e6445b5db8
3 changed files with 99 additions and 15 deletions

View File

@@ -17,6 +17,9 @@ import (
//go:embed assets/merge_advisor_prompt.de.md
var mergeAdvisorPromptDE string
//go:embed assets/merge_advisor_schema.json
var mergeAdvisorSchemaJSON []byte
// ErrNotDuplicate is returned by MergeAdvisor.Propose when the similarity
// verdict confidently says the two markets are not the same.
var ErrNotDuplicate = errors.New("markets are not duplicates")
@@ -41,12 +44,12 @@ type mergeFieldDecision struct {
// mergeAdvisorResponse is the raw JSON shape the LLM returns.
type mergeAdvisorResponse struct {
TargetID string `json:"target_id"`
TargetReason string `json:"target_reason"`
FieldMerges map[string]mergeFieldDecision `json:"field_merges"`
Flags []string `json:"flags"`
Confidence float64 `json:"confidence"`
Summary string `json:"summary"`
TargetID string `json:"target_id"`
TargetReason string `json:"target_reason"`
FieldMerges json.RawMessage `json:"field_merges"`
Flags []string `json:"flags"`
Confidence float64 `json:"confidence"`
Summary string `json:"summary"`
}
// MergeAdvisor calls the LLM to propose how to merge two market editions.
@@ -74,7 +77,7 @@ func (a *MergeAdvisor) Propose(ctx context.Context, mA, mB Market, verdict enric
resp, err := a.ai.Chat(ctx, &ai.ChatRequest{
SystemPrompt: mergeAdvisorPromptDE,
UserMessage: userMsg,
JSONMode: true,
JSONSchema: json.RawMessage(mergeAdvisorSchemaJSON),
Grounded: false,
Temperature: 0.1,
CallType: "merge_advisor",
@@ -93,6 +96,14 @@ func (a *MergeAdvisor) Propose(ctx context.Context, mA, mB Market, verdict enric
if err != nil {
return MarketMergeProposal{}, fmt.Errorf("invalid target_id %q: %w", raw.TargetID, err)
}
if targetID != mA.ID && targetID != mB.ID {
return MarketMergeProposal{}, fmt.Errorf("target_id %q is not one of the two input markets", raw.TargetID)
}
fieldMerges, err := parseFieldMerges(raw.FieldMerges)
if err != nil {
return MarketMergeProposal{}, fmt.Errorf("parse field_merges: %w", err)
}
conf := raw.Confidence
if conf < 0 {
@@ -110,7 +121,7 @@ func (a *MergeAdvisor) Propose(ctx context.Context, mA, mB Market, verdict enric
return MarketMergeProposal{
TargetID: targetID,
TargetReason: raw.TargetReason,
FieldMerges: raw.FieldMerges,
FieldMerges: fieldMerges,
Flags: flags,
Confidence: conf,
Summary: raw.Summary,
@@ -118,6 +129,40 @@ func (a *MergeAdvisor) Propose(ctx context.Context, mA, mB Market, verdict enric
}, nil
}
// parseFieldMerges handles two LLM output formats for field_merges:
// - object: {"name": {"source":"a","value":"...","reason":"..."}, ...}
// - array: [{"field":"name","source":"a","value":"...","reason":"..."}, ...]
func parseFieldMerges(raw json.RawMessage) (map[string]mergeFieldDecision, error) {
if len(raw) == 0 {
return map[string]mergeFieldDecision{}, nil
}
var m map[string]mergeFieldDecision
if err := json.Unmarshal(raw, &m); err == nil {
return m, nil
}
var arr []struct {
Field string `json:"field"`
Source string `json:"source"`
Value json.RawMessage `json:"value"`
Reason string `json:"reason"`
}
if err := json.Unmarshal(raw, &arr); err != nil {
return nil, fmt.Errorf("cannot parse as object or array: %w", err)
}
m = make(map[string]mergeFieldDecision, len(arr))
for _, e := range arr {
if e.Field == "" {
continue
}
var val any
if len(e.Value) > 0 {
_ = json.Unmarshal(e.Value, &val)
}
m[e.Field] = mergeFieldDecision{Source: e.Source, Value: val, Reason: e.Reason}
}
return m, nil
}
type advisorMarketInput struct {
ID string `json:"id"`
Name string `json:"name"`

View File

@@ -29,15 +29,16 @@ func (p *stubAIProvider) Chat(_ context.Context, _ *ai.ChatRequest) (*ai.ChatRes
}
func validAdvisorResponse(targetID string) string {
fm, _ := json.Marshal(map[string]mergeFieldDecision{
"name": {Source: "a", Value: "Testmarkt Berlin", Reason: "established name"},
})
resp := mergeAdvisorResponse{
TargetID: targetID,
TargetReason: "Market A has more complete data",
FieldMerges: map[string]mergeFieldDecision{
"name": {Source: "a", Value: "Testmarkt Berlin", Reason: "established name"},
},
Flags: []string{},
Confidence: 0.9,
Summary: "Beide Eintraege bezeichnen denselben Markt. Daten von A werden uebernommen.",
FieldMerges: json.RawMessage(fm),
Flags: []string{},
Confidence: 0.9,
Summary: "Beide Eintraege bezeichnen denselben Markt. Daten von A werden uebernommen.",
}
b, _ := json.Marshal(resp)
return string(b)
@@ -118,6 +119,38 @@ func TestMergeAdvisor_ProviderErrorPropagates(t *testing.T) {
}
}
func TestMergeAdvisor_ParsesArrayFieldMerges(t *testing.T) {
a := Market{Name: "Markt A", City: "Berlin"}
b := Market{Name: "Markt B", City: "Berlin"}
// LLM returns field_merges as array (Gemini's unguided default format)
response := `{
"target_id": "` + a.ID.String() + `",
"target_reason": "More complete",
"field_merges": [
{"field": "name", "source": "a", "value": "Testmarkt", "reason": "shorter"},
{"field": "stadt", "source": "b", "value": "Hamburg", "reason": "correct"}
],
"flags": [],
"confidence": 0.85,
"summary": "Same market."
}`
p := &stubAIProvider{response: response}
v := enrich.Verdict{Same: true, Confidence: 0.9}
advisor := NewMergeAdvisor(p)
proposal, err := advisor.Propose(context.Background(), a, b, v)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(proposal.FieldMerges) != 2 {
t.Errorf("expected 2 field merges, got %d", len(proposal.FieldMerges))
}
if proposal.FieldMerges["name"].Source != "a" {
t.Errorf("expected name.source=a, got %q", proposal.FieldMerges["name"].Source)
}
}
func TestMergeAdvisor_InvalidJSONErrors(t *testing.T) {
p := &stubAIProvider{response: "not json at all"}
a := Market{Name: "Markt A"}

View File

@@ -346,7 +346,7 @@ func promptHashShort(system, user string) string {
func schemaFromMap(m map[string]any) *genai.Schema {
// Keys that genai.Schema does not support — log a warning so the workaround
// (researcher_schema_simple.json) stays visible rather than silently disappearing.
unsupported := []string{"pattern", "minLength", "maxLength", "minimum", "maximum",
unsupported := []string{"pattern", "minLength", "maxLength",
"additionalProperties", "$ref", "$defs", "if", "then", "else"}
var dropped []string
for _, k := range unsupported {
@@ -398,5 +398,11 @@ func schemaFromMap(m map[string]any) *genai.Schema {
}
}
}
if minVal, ok := m["minimum"].(float64); ok {
s.Minimum = &minVal
}
if maxVal, ok := m["maximum"].(float64); ok {
s.Maximum = &maxVal
}
return s
}