From 67b2eb5d743c3139d2b2f6a7537cdf7c76bcecc8 Mon Sep 17 00:00:00 2001 From: vikingowl Date: Fri, 24 Apr 2026 17:06:04 +0200 Subject: [PATCH] feat(market): in-backend research orchestrator with SearxNG + schema-validated LLM Adds pkg/search (SearxNG impl), domain/market/research (orchestrator + embedded German prompt and JSON schema), and reinstates POST /markets/:id/research on top of the new pipeline. Seeds URLs from crawler provenance; falls back to search when fewer than two distinct seed domains are known. --- backend/deploy/helm/templates/ci-secrets.yaml | 3 - backend/deploy/helm/values.yaml | 1 - backend/internal/domain/discovery/handler.go | 2 +- backend/internal/domain/market/research.go | 76 ++++++++- .../research/assets/researcher_prompt.de.md | 86 ++++++++++ .../research/assets/researcher_schema.json | 110 ++++++++++++ .../domain/market/research/candidates.go | 40 +++++ .../domain/market/research/candidates_test.go | 45 +++++ .../internal/domain/market/research/fetch.go | 50 ++++++ .../domain/market/research/fetch_test.go | 47 ++++++ .../market/research/integration_test.go | 159 ++++++++++++++++++ .../domain/market/research/orchestrator.go | 158 +++++++++++++++++ .../market/research/orchestrator_test.go | 119 +++++++++++++ .../internal/domain/market/research/prompt.go | 6 + .../internal/domain/market/research/schema.go | 6 + .../domain/market/research/schema_test.go | 19 +++ backend/internal/pkg/search/client.go | 20 +++ backend/internal/pkg/search/client_test.go | 19 +++ backend/internal/pkg/search/searxng.go | 77 +++++++++ backend/internal/pkg/search/searxng_test.go | 49 ++++++ backend/internal/server/routes.go | 6 +- 21 files changed, 1088 insertions(+), 10 deletions(-) create mode 100644 backend/internal/domain/market/research/assets/researcher_prompt.de.md create mode 100644 backend/internal/domain/market/research/assets/researcher_schema.json create mode 100644 backend/internal/domain/market/research/candidates.go create mode 100644 
backend/internal/domain/market/research/candidates_test.go create mode 100644 backend/internal/domain/market/research/fetch.go create mode 100644 backend/internal/domain/market/research/fetch_test.go create mode 100644 backend/internal/domain/market/research/integration_test.go create mode 100644 backend/internal/domain/market/research/orchestrator.go create mode 100644 backend/internal/domain/market/research/orchestrator_test.go create mode 100644 backend/internal/domain/market/research/prompt.go create mode 100644 backend/internal/domain/market/research/schema.go create mode 100644 backend/internal/domain/market/research/schema_test.go create mode 100644 backend/internal/pkg/search/client.go create mode 100644 backend/internal/pkg/search/client_test.go create mode 100644 backend/internal/pkg/search/searxng.go create mode 100644 backend/internal/pkg/search/searxng_test.go diff --git a/backend/deploy/helm/templates/ci-secrets.yaml b/backend/deploy/helm/templates/ci-secrets.yaml index ea4e1bc..b08e8e9 100644 --- a/backend/deploy/helm/templates/ci-secrets.yaml +++ b/backend/deploy/helm/templates/ci-secrets.yaml @@ -11,9 +11,6 @@ stringData: {{- if .Values.ai.apiKey }} AI_API_KEY: {{ .Values.ai.apiKey | quote }} {{- end }} - {{- if .Values.ai.agentSimple }} - AI_AGENT_SIMPLE: {{ .Values.ai.agentSimple | quote }} - {{- end }} {{- if .Values.turnstile.secretKey }} TURNSTILE_SECRET_KEY: {{ .Values.turnstile.secretKey | quote }} {{- end }} diff --git a/backend/deploy/helm/values.yaml b/backend/deploy/helm/values.yaml index 62f0cc3..6e9b30a 100644 --- a/backend/deploy/helm/values.yaml +++ b/backend/deploy/helm/values.yaml @@ -98,7 +98,6 @@ smtp: # AI research credentials — passed via Woodpecker secrets during deploy. 
ai: apiKey: "" - agentSimple: "" agentDiscovery: "" # set via CI secret in production rateLimitRps: 1 diff --git a/backend/internal/domain/discovery/handler.go b/backend/internal/domain/discovery/handler.go index 5bff84d..41a1241 100644 --- a/backend/internal/domain/discovery/handler.go +++ b/backend/internal/domain/discovery/handler.go @@ -416,7 +416,7 @@ func (h *Handler) CrawlEnrichStatus(c *gin.Context) { } // EnrichLLM runs per-row LLM enrichment synchronously. 30s deadline is -// enough for scraping 5 URLs + one Mistral Pass2 call in typical conditions. +// enough for scraping 5 URLs + one LLM enrichment call in typical conditions. // Operator clicks the button, waits, sees the result — no polling. func (h *Handler) EnrichLLM(c *gin.Context) { id, err := uuid.Parse(c.Param("id")) diff --git a/backend/internal/domain/market/research.go b/backend/internal/domain/market/research.go index 42c0de3..c9b95df 100644 --- a/backend/internal/domain/market/research.go +++ b/backend/internal/domain/market/research.go @@ -1,19 +1,89 @@ package market import ( + "errors" + "log/slog" "net/http" "github.com/gin-gonic/gin" + "github.com/google/uuid" + + "marktvogt.de/backend/internal/domain/market/research" + "marktvogt.de/backend/internal/pkg/ai" + "marktvogt.de/backend/internal/pkg/scrape" + "marktvogt.de/backend/internal/pkg/search" ) type ResearchHandler struct { service *Service + orch *research.Orchestrator } -func NewResearchHandler(service *Service, _ any) *ResearchHandler { - return &ResearchHandler{service: service} +func NewResearchHandler(service *Service, provider ai.Provider, searchClient search.Client, scraper *scrape.Client) *ResearchHandler { + return &ResearchHandler{ + service: service, + orch: &research.Orchestrator{ + AI: provider, + Search: searchClient, + Scraper: scraper, + MaxPages: 6, + Concurrency: 4, + }, + } } func (h *ResearchHandler) Research(c *gin.Context) { - c.JSON(http.StatusNotImplemented, gin.H{"error": "research temporarily disabled 
during AI provider migration"}) + ctx := c.Request.Context() + + id, err := uuid.Parse(c.Param("id")) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "invalid market ID"}) + return + } + + m, err := h.service.GetByID(ctx, id) + if err != nil { + if errors.Is(err, ErrMarketNotFound) { + c.JSON(http.StatusNotFound, gin.H{"error": "market not found"}) + } else { + slog.ErrorContext(ctx, "research: get market failed", "market_id", id, "err", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "internal error"}) + } + return + } + + out, err := h.orch.Run(ctx, research.Input{ + MarktName: m.Name, + Stadt: m.City, + StartDatumHint: m.StartDate.Format("2006-01-02"), + WebsiteHint: m.Website, + SeedURLs: nil, // seed URLs from crawler provenance: future enhancement + RechercheDatum: m.StartDate, + }) + if err != nil { + var pe *ai.ProviderError + if errors.As(err, &pe) { + switch pe.Code { + case ai.ErrRateLimited: + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "rate limited"}) + return + case ai.ErrSchemaViolation: + slog.ErrorContext(ctx, "research schema violation", "market_id", id, "raw", pe.RawOutput) + c.JSON(http.StatusBadGateway, gin.H{"error": "model returned invalid JSON"}) + return + case ai.ErrInternal, ai.ErrQuotaExceeded, ai.ErrTimeout, ai.ErrInvalidRequest, ai.ErrUnavailable: + // fall through to generic 500 + } + } + slog.ErrorContext(ctx, "research failed", "market_id", id, "err", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "research failed"}) + return + } + + c.JSON(http.StatusOK, gin.H{ + "result": out.Raw, + "provider": out.Provider, + "tokens": out.TokensUsed, + "latency_ms": out.Latency.Milliseconds(), + }) } diff --git a/backend/internal/domain/market/research/assets/researcher_prompt.de.md b/backend/internal/domain/market/research/assets/researcher_prompt.de.md new file mode 100644 index 0000000..c630485 --- /dev/null +++ b/backend/internal/domain/market/research/assets/researcher_prompt.de.md @@ 
-0,0 +1,86 @@ +Du bist ein Daten-Extraktions-Agent fuer einen Mittelaltermarkt im DACH-Raum. +Ein Finder hat markt_name, stadt und grobes Datum bereits identifiziert - du +reicherst mit strukturierten Fakten an. Keine Beschreibungstexte, nur Fakten. + +## Input (im User-Prompt) +- markt_name, stadt, start_datum_hinweis, website_hinweis (optional), + recherche_datum + +## Vorgehen + +1. **Primaerquelle**: website_hinweis oeffnen, sonst "{markt_name} {stadt}" + suchen und Veranstalter-Website oeffnen. Fallback: Facebook-Event oder + Kalender (mittelalterkalender.info, marktkalendarium.de, mittelalterfeste.com, + mittelalter-termine.de). +2. **Zweitquelle pflicht**: verifiziere Datum + Ort gegen mindestens eine + weitere Quelle. Schuetzt vor veralteten Daten auf schlecht gepflegten Seiten. +3. **Felder extrahieren** (siehe unten). +4. **status** auf Top-Level setzen: + - "bestaetigt": ALLE Felder fuer Recherchejahr bestaetigt + - "unklar": Quellen widerspruechlich ODER einzelne Felder aus Vorjahr + - "vorjahr_unbestaetigt": ueberwiegend Vorjahresdaten + - "abgesagt": explizite Absage/Pausierung + +## Felder + +- **website**: URL, die dem Veranstalter in `veranstalter` gehoert, oder + dedizierte Markt-Domain. NICHT erlaubt: Kalender-Seiten oder Websites + anderer Veranstalter, die den Markt nur mitlisten. FB-Event-URL ok falls + keine eigene Seite. Unklar -> `null` + hinweis. +- **strasse**: Strasse + Hausnummer. Bei Burgen ohne Adresse: Anlagenname ok. +- **plz**: Postleitzahl. +- **stadt**: Muss mit Input-Hinweis konsistent sein, sonst hinweis. +- **bundesland**: Bundesland (DE/AT) oder "Kanton X" (CH). Aus Quellen, nicht + aus Wissen. +- **land**: "Deutschland" | "Oesterreich" | "Schweiz". +- **veranstalter**: Verein, Firma oder Person. Impressum ist gute Quelle. +- **start_datum** / **end_datum**: YYYY-MM-DD, im Recherchejahr. Eintages- + Markt: beide gleich. +- **oeffnungszeiten**: Array von Zeitfenstern {datum_von, datum_bis, von, bis}. 
+ Nimm NUR explizit genannte Zeiten. Keine Zeiten fuer Tage ohne Angabe + erfinden. Bei Muster ueber mehrere Wochenenden (z.B. "Fr 17-02, Sa 16-00:30 + an allen Wochenenden"): Muster anwenden, keine widersprechenden Eintraege + erzeugen. Vor Abgabe: KEINE Duplikate (gleiches Datum mehrfach). Format 24h + "HH:MM", nach Mitternacht "00:30"/"02:00". + Kompakt: identische Zeiten ueber mehrere Tage -> ein Eintrag mit Datumsbereich. +- **eintrittspreise**: Array {name, betrag, waehrung}. ALLE Kategorien + extrahieren wenn mehrere gelistet (Erwachsene, Kinder, Ermaessigt, + Familie, Gewandete, Abendkasse etc.), nicht nur eine. + Ticketportale (Eventim, Ticketmaster, ticketmachine): Preise enthalten + Gebuehren und sind NICHT der Eintrittspreis. Veranstalter-Website + bevorzugen. Nur Portal verfuegbar: extrahieren + hinweis "inkl. + Servicegebuehr". Eintritt frei: ein Eintrag name="Eintritt frei", betrag=0. +- **bild_url**: Offizielles Plakat/Banner/Header, kein Stockfoto, kein + Sponsor-Logo. Social-Media-Vorschaubilder ok. Nur URLs die du tatsaechlich + als src/og:image gesehen hast. Nichts findbar -> `null`. + +## Per-Feld-Metadaten + +- **quellen**: URLs die du besucht hast und die genau diesen Wert belegen. + Mehrere Felder aus derselben Quelle: URL bei jedem wiederholen. +- **extraktion**: + - "direkt": Wert steht auf EINER Seite und alle weiteren Quellen bestaetigen + exakt denselben Wert. Trivial-Normalisierung (Datum, Zeit, Zahl) bleibt + direkt. + - "kombiniert": aus mehreren Stellen zusammengesetzt, aus Fliesstext + interpretiert, aus Vorjahr uebernommen, ODER Quellen liefern + unterschiedliche Werte. +- **hinweis**: Freitext bei Unsicherheit/Widerspruch/Vorjahr (mit + Ursprungsjahr). Sonst `null`. + +## Harte Regeln + +- Feld nicht findbar: `wert: null`, `quellen: []`, `extraktion: "direkt"`, + `hinweis` mit knapper Begruendung. +- NICHTS erfinden. Halluzinationen sind der teuerste Fehler. 
+- Widerspruch zwischen Quellen: Veranstalter-Website > Kalender > Social Media + > Presse. Widerspruch IMMER im hinweis dokumentieren, auch wenn die + offiziellste Quelle klar gewinnt. Format: + "Quelle A: X, Quelle B: Y, uebernommen: Z (Begruendung)". +- Vorjahresdaten: extrahieren, IMMER mit Ursprungsjahr im hinweis. +- Status-Konsistenz: auch nur EIN Feld mit Vorjahr-hinweis -> Top-Level-status + darf NICHT "bestaetigt" sein. +- Waehrung pflichtig: CH -> CHF, DE/AT -> EUR. +- Nur URLs zurueckgeben die du tatsaechlich aufgerufen hast. +- Antwort MUSS dem JSON-Schema entsprechen. Keine zusaetzlichen Felder, keine + Erklaerungen ausserhalb des JSON. diff --git a/backend/internal/domain/market/research/assets/researcher_schema.json b/backend/internal/domain/market/research/assets/researcher_schema.json new file mode 100644 index 0000000..f8659ed --- /dev/null +++ b/backend/internal/domain/market/research/assets/researcher_schema.json @@ -0,0 +1,110 @@ +{ + "type": "object", + "$defs": { + "datumFeld": { + "type": "object", + "required": ["wert", "quellen", "extraktion", "hinweis"], + "properties": { + "wert": {"type": ["string", "null"], "pattern": "^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$"}, + "hinweis": {"type": ["string", "null"]}, + "quellen": {"type": "array", "items": {"type": "string", "pattern": "^https?://"}}, + "extraktion": {"enum": ["direkt", "kombiniert"], "type": "string"} + }, + "additionalProperties": false + }, + "stringFeld": { + "type": "object", + "required": ["wert", "quellen", "extraktion", "hinweis"], + "properties": { + "wert": {"type": ["string", "null"]}, + "hinweis": {"type": ["string", "null"]}, + "quellen": {"type": "array", "items": {"type": "string", "pattern": "^https?://"}}, + "extraktion": {"enum": ["direkt", "kombiniert"], "type": "string"} + }, + "additionalProperties": false + } + }, + "required": ["markt_name", "recherche_datum", "status", "quellen_gesamt", "felder"], + "properties": { + "felder": { + "type": 
"object", + "required": ["website", "strasse", "plz", "stadt", "bundesland", "land", "veranstalter", "start_datum", "end_datum", "oeffnungszeiten", "eintrittspreise", "bild_url"], + "properties": { + "plz": {"$ref": "#/$defs/stringFeld"}, + "land": {"$ref": "#/$defs/stringFeld"}, + "stadt": {"$ref": "#/$defs/stringFeld"}, + "strasse": {"$ref": "#/$defs/stringFeld"}, + "website": {"$ref": "#/$defs/stringFeld"}, + "bild_url": {"$ref": "#/$defs/stringFeld"}, + "end_datum": {"$ref": "#/$defs/datumFeld"}, + "bundesland": {"$ref": "#/$defs/stringFeld"}, + "start_datum": {"$ref": "#/$defs/datumFeld"}, + "veranstalter": {"$ref": "#/$defs/stringFeld"}, + "eintrittspreise": { + "type": "object", + "required": ["wert", "quellen", "extraktion", "hinweis"], + "properties": { + "wert": { + "type": ["array", "null"], + "items": { + "type": "object", + "required": ["name", "betrag", "waehrung"], + "properties": { + "name": {"type": "string", "minLength": 2, "description": "Freie Kategoriebezeichnung, z.B. 
Erwachsene, Gewandete, Kinder unter 12, Familienticket"}, + "betrag": {"type": "number", "minimum": 0}, + "waehrung": {"enum": ["EUR", "CHF"], "type": "string"} + }, + "additionalProperties": false + } + }, + "hinweis": {"type": ["string", "null"]}, + "quellen": {"type": "array", "items": {"type": "string", "pattern": "^https?://"}}, + "extraktion": {"enum": ["direkt", "kombiniert"], "type": "string"} + }, + "additionalProperties": false + }, + "oeffnungszeiten": { + "type": "object", + "required": ["wert", "quellen", "extraktion", "hinweis"], + "properties": { + "wert": { + "type": ["array", "null"], + "items": { + "type": "object", + "required": ["datum_von", "datum_bis", "von", "bis"], + "properties": { + "bis": {"type": "string", "pattern": "^([01][0-9]|2[0-3]):[0-5][0-9]$"}, + "von": {"type": "string", "pattern": "^([01][0-9]|2[0-3]):[0-5][0-9]$"}, + "datum_bis": {"type": "string", "pattern": "^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$"}, + "datum_von": {"type": "string", "pattern": "^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$"} + }, + "additionalProperties": false + } + }, + "hinweis": {"type": ["string", "null"]}, + "quellen": {"type": "array", "items": {"type": "string", "pattern": "^https?://"}}, + "extraktion": {"enum": ["direkt", "kombiniert"], "type": "string"} + }, + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "status": { + "enum": ["bestaetigt", "unklar", "vorjahr_unbestaetigt", "abgesagt"], + "type": "string", + "description": "Gesamtstatus der Veranstaltung im Recherchejahr" + }, + "markt_name": {"type": "string", "minLength": 3}, + "quellen_gesamt": { + "type": "array", + "items": {"type": "string", "pattern": "^https?://"}, + "minItems": 1 + }, + "recherche_datum": { + "type": "string", + "pattern": "^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$" + } + }, + "additionalProperties": false +} diff --git a/backend/internal/domain/market/research/candidates.go 
b/backend/internal/domain/market/research/candidates.go new file mode 100644 index 0000000..5a271c7 --- /dev/null +++ b/backend/internal/domain/market/research/candidates.go @@ -0,0 +1,40 @@ +package research + +import "net/url" + +// BuildCandidates returns an ordered, domain-deduplicated URL list: +// seeds first (in order), then websiteHint (if new domain), then searchURLs (new domains only). +// The list is capped at maxPages. +func BuildCandidates(seeds []string, websiteHint string, searchURLs []string, maxPages int) []string { + seen := map[string]struct{}{} + out := make([]string, 0, maxPages) + add := func(raw string) bool { + u, err := url.Parse(raw) + if err != nil || u.Host == "" { + return false + } + if _, dup := seen[u.Host]; dup { + return false + } + seen[u.Host] = struct{}{} + out = append(out, raw) + return len(out) >= maxPages + } + + for _, s := range seeds { + if add(s) { + return out + } + } + if websiteHint != "" { + if add(websiteHint) { + return out + } + } + for _, s := range searchURLs { + if add(s) { + return out + } + } + return out +} diff --git a/backend/internal/domain/market/research/candidates_test.go b/backend/internal/domain/market/research/candidates_test.go new file mode 100644 index 0000000..879814b --- /dev/null +++ b/backend/internal/domain/market/research/candidates_test.go @@ -0,0 +1,45 @@ +package research + +import ( + "reflect" + "testing" +) + +func TestBuildCandidates_DeduplicatesByDomain(t *testing.T) { + seeds := []string{ + "https://example.org/markt", + "https://example.org/markt?utm_source=x", + "https://other.example/", + } + got := BuildCandidates(seeds, "", nil, 10) + want := []string{"https://example.org/markt", "https://other.example/"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("got %v want %v", got, want) + } +} + +func TestBuildCandidates_IncludesWebsiteHint(t *testing.T) { + got := BuildCandidates(nil, "https://hint.example", nil, 10) + if len(got) != 1 || got[0] != "https://hint.example" { + 
t.Fatalf("got %v", got) + } +} + +func TestBuildCandidates_AppendsSearchResults(t *testing.T) { + seeds := []string{"https://a.example"} + search := []string{"https://b.example", "https://a.example/other"} + got := BuildCandidates(seeds, "", search, 10) + // a.example already represented by seed -> search URL on same domain is dropped + want := []string{"https://a.example", "https://b.example"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("got %v want %v", got, want) + } +} + +func TestBuildCandidates_CapsAtMaxPages(t *testing.T) { + seeds := []string{"https://a.example", "https://b.example", "https://c.example"} + got := BuildCandidates(seeds, "", nil, 2) + if len(got) != 2 { + t.Fatalf("len=%d want 2", len(got)) + } +} diff --git a/backend/internal/domain/market/research/fetch.go b/backend/internal/domain/market/research/fetch.go new file mode 100644 index 0000000..7276b90 --- /dev/null +++ b/backend/internal/domain/market/research/fetch.go @@ -0,0 +1,50 @@ +package research + +import ( + "context" + "errors" + "log/slog" + "sync" + + "golang.org/x/sync/errgroup" +) + +type Scraper interface { + Fetch(ctx context.Context, url string) (string, error) +} + +type Page struct { + URL string + Title string + Text string +} + +func FetchAll(ctx context.Context, sc Scraper, urls []string, concurrency int) ([]Page, error) { + if concurrency <= 0 { + concurrency = 4 + } + g, gctx := errgroup.WithContext(ctx) + g.SetLimit(concurrency) + var mu sync.Mutex + pages := make([]Page, 0, len(urls)) + + for _, u := range urls { + g.Go(func() error { + text, err := sc.Fetch(gctx, u) + if err != nil { + slog.Warn("research fetch failed", "url", u, "err", err) + return nil // individual failure does not abort the group + } + mu.Lock() + pages = append(pages, Page{URL: u, Text: text}) + mu.Unlock() + return nil + }) + } + _ = g.Wait() + + if len(pages) == 0 { + return nil, errors.New("all candidate URLs failed to fetch") + } + return pages, nil +} diff --git 
a/backend/internal/domain/market/research/fetch_test.go b/backend/internal/domain/market/research/fetch_test.go new file mode 100644 index 0000000..91a39d6 --- /dev/null +++ b/backend/internal/domain/market/research/fetch_test.go @@ -0,0 +1,47 @@ +package research + +import ( + "context" + "errors" + "testing" +) + +type fakeScraper struct { + byURL map[string]string + errs map[string]error +} + +func (f *fakeScraper) Fetch(ctx context.Context, url string) (string, error) { + if err, ok := f.errs[url]; ok { + return "", err + } + return f.byURL[url], nil +} + +func TestFetchAll_ReturnsOnlySuccessfulPages(t *testing.T) { + s := &fakeScraper{ + byURL: map[string]string{ + "https://a.example": "content A", + "https://c.example": "content C", + }, + errs: map[string]error{"https://b.example": errors.New("timeout")}, + } + urls := []string{"https://a.example", "https://b.example", "https://c.example"} + pages, err := FetchAll(context.Background(), s, urls, 4) + if err != nil { + t.Fatalf("FetchAll: %v", err) + } + if len(pages) != 2 { + t.Fatalf("pages len=%d: %+v", len(pages), pages) + } +} + +func TestFetchAll_FailsWhenZeroSucceed(t *testing.T) { + s := &fakeScraper{errs: map[string]error{ + "https://a.example": errors.New("x"), + "https://b.example": errors.New("y"), + }} + if _, err := FetchAll(context.Background(), s, []string{"https://a.example", "https://b.example"}, 4); err == nil { + t.Fatal("want error when zero pages succeeded") + } +} diff --git a/backend/internal/domain/market/research/integration_test.go b/backend/internal/domain/market/research/integration_test.go new file mode 100644 index 0000000..7725eb9 --- /dev/null +++ b/backend/internal/domain/market/research/integration_test.go @@ -0,0 +1,159 @@ +package research_test + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + "marktvogt.de/backend/internal/domain/market/research" + "marktvogt.de/backend/internal/pkg/ai" + 
"marktvogt.de/backend/internal/pkg/scrape" + "marktvogt.de/backend/internal/pkg/search" +) + +// validResearchJSON is a minimal JSON document that satisfies researcher_schema.json. +// All stringFeld values use the required sub-structure; datumFeld values include valid date strings. +const validResearchJSON = `{ + "markt_name": "Testmarkt Berlin", + "recherche_datum": "2026-04-24", + "status": "bestaetigt", + "quellen_gesamt": ["http://example.com/testmarkt"], + "felder": { + "website": {"wert": "http://example.com/testmarkt", "quellen": ["http://example.com/testmarkt"], "extraktion": "direkt", "hinweis": null}, + "strasse": {"wert": "Hauptstraße 1", "quellen": ["http://example.com/testmarkt"], "extraktion": "direkt", "hinweis": null}, + "plz": {"wert": "10115", "quellen": ["http://example.com/testmarkt"], "extraktion": "direkt", "hinweis": null}, + "stadt": {"wert": "Berlin", "quellen": ["http://example.com/testmarkt"], "extraktion": "direkt", "hinweis": null}, + "bundesland": {"wert": "Berlin", "quellen": ["http://example.com/testmarkt"], "extraktion": "direkt", "hinweis": null}, + "land": {"wert": "Deutschland", "quellen": ["http://example.com/testmarkt"], "extraktion": "direkt", "hinweis": null}, + "veranstalter": {"wert": "Musterveranstalter GmbH", "quellen": ["http://example.com/testmarkt"], "extraktion": "direkt", "hinweis": null}, + "start_datum": {"wert": "2026-06-01", "quellen": ["http://example.com/testmarkt"], "extraktion": "direkt", "hinweis": null}, + "end_datum": {"wert": "2026-06-03", "quellen": ["http://example.com/testmarkt"], "extraktion": "direkt", "hinweis": null}, + "oeffnungszeiten": { + "wert": [{"datum_von": "2026-06-01", "datum_bis": "2026-06-01", "von": "10:00", "bis": "20:00"}], + "quellen": ["http://example.com/testmarkt"], + "extraktion": "direkt", + "hinweis": null + }, + "eintrittspreise": { + "wert": [{"name": "Erwachsene", "betrag": 8.00, "waehrung": "EUR"}], + "quellen": ["http://example.com/testmarkt"], + "extraktion": "direkt", 
+ "hinweis": null + }, + "bild_url": {"wert": null, "quellen": [], "extraktion": "direkt", "hinweis": null} + } +}` + +// fakeOllamaHandler returns a valid Ollama non-streaming chat response whose +// content field contains validResearchJSON. +func fakeOllamaHandler(t *testing.T) http.Handler { + t.Helper() + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost || r.URL.Path != "/api/chat" { + http.NotFound(w, r) + return + } + resp := map[string]any{ + "model": "test", + "created_at": "2026-04-24T00:00:00Z", + "message": map[string]string{ + "role": "assistant", + "content": validResearchJSON, + }, + "done": true, + "prompt_eval_count": 10, + "eval_count": 20, + } + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(resp); err != nil { + t.Errorf("fakeOllamaHandler: encode response: %v", err) + } + }) +} + +// fakeSearxngHandler returns a SearxNG JSON response whose result URLs point +// at the provided page server. +func fakeSearxngHandler(t *testing.T, pageBaseURL string) http.Handler { + t.Helper() + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/search" { + http.NotFound(w, r) + return + } + resp := map[string]any{ + "results": []map[string]any{ + {"url": pageBaseURL + "/page1", "title": "Testmarkt Berlin – Seite 1", "content": "Mittelalterfest im Juni"}, + {"url": pageBaseURL + "/page2", "title": "Testmarkt Berlin – Seite 2", "content": "Eintritt und Öffnungszeiten"}, + }, + } + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(resp); err != nil { + t.Errorf("fakeSearxngHandler: encode response: %v", err) + } + }) +} + +// fakePageHandler serves minimal HTML at /page1 and /page2 and counts hits. 
+func fakePageHandler(t *testing.T, hits *atomic.Int64) http.Handler { + t.Helper() + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/page1", "/page2": + hits.Add(1) + w.Header().Set("Content-Type", "text/html; charset=utf-8") + _, _ = w.Write([]byte(`
<html><body>
<p>Testmarkt Berlin findet statt.</p>
</body></html>
`)) + default: + http.NotFound(w, r) + } + }) +} + +func TestIntegrationOrchestratorFullPipeline(t *testing.T) { + var pageHits atomic.Int64 + + // Start fake page server first so we know its URL for SearxNG results. + fakePage := httptest.NewServer(fakePageHandler(t, &pageHits)) + defer fakePage.Close() + + fakeOllama := httptest.NewServer(fakeOllamaHandler(t)) + defer fakeOllama.Close() + + fakeSearxng := httptest.NewServer(fakeSearxngHandler(t, fakePage.URL)) + defer fakeSearxng.Close() + + orch := &research.Orchestrator{ + AI: ai.NewOllamaProvider(ai.OllamaConfig{BaseURL: fakeOllama.URL, Model: "test"}), + Search: search.NewSearxng(search.SearxngConfig{BaseURL: fakeSearxng.URL}), + Scraper: scrape.New("test-agent/1.0"), + MaxPages: 4, + Concurrency: 2, + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + out, err := orch.Run(ctx, research.Input{ + MarktName: "Testmarkt", + Stadt: "Berlin", + RechercheDatum: time.Date(2026, 6, 1, 0, 0, 0, 0, time.UTC), + }) + if err != nil { + t.Fatalf("Orchestrator.Run returned error: %v", err) + } + + if !json.Valid(out.Raw) { + t.Fatalf("out.Raw is not valid JSON: %s", string(out.Raw)) + } + + if err := ai.ValidateSchema(research.SchemaJSON, []byte(out.Raw)); err != nil { + t.Fatalf("out.Raw does not satisfy researcher schema: %v", err) + } + + if pageHits.Load() == 0 { + t.Fatal("expected at least one page server hit, got zero") + } +} diff --git a/backend/internal/domain/market/research/orchestrator.go b/backend/internal/domain/market/research/orchestrator.go new file mode 100644 index 0000000..5237625 --- /dev/null +++ b/backend/internal/domain/market/research/orchestrator.go @@ -0,0 +1,158 @@ +package research + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/url" + "time" + + "marktvogt.de/backend/internal/pkg/ai" + "marktvogt.de/backend/internal/pkg/search" +) + +type Input struct { + MarktName string + Stadt string + StartDatumHint string + WebsiteHint 
string + SeedURLs []string + RechercheDatum time.Time +} + +type Output struct { + Raw json.RawMessage + Provider string + TokensUsed int + Latency time.Duration +} + +type Orchestrator struct { + AI ai.Provider + Search search.Client + Scraper Scraper + MaxPages int + Concurrency int +} + +func (o *Orchestrator) Run(ctx context.Context, in Input) (Output, error) { + start := time.Now() + + // 1. Candidate URLs + candidates := BuildCandidates(in.SeedURLs, in.WebsiteHint, nil, o.MaxPages) + if countDomains(candidates) < 2 { + queries := []string{ + fmt.Sprintf("%s %s", in.MarktName, in.Stadt), + fmt.Sprintf("%s %s %d", in.MarktName, in.Stadt, in.RechercheDatum.Year()), + } + var extra []string + for _, q := range queries { + res, err := o.Search.Search(ctx, q, search.Options{Language: "de", Count: 10}) + if err != nil { + break + } + for _, r := range res { + extra = append(extra, r.URL) + } + if len(extra) > 0 { + break + } + } + candidates = BuildCandidates(in.SeedURLs, in.WebsiteHint, extra, o.MaxPages) + } + + // 2. Parallel fetch + pages, err := FetchAll(ctx, o.Scraper, candidates, o.Concurrency) + if err != nil { + return Output{}, err + } + + // 3. User prompt assembly + userPrompt, err := buildUserPrompt(in, pages) + if err != nil { + return Output{}, err + } + + // 4. LLM call with one retry on schema violation + resp, err := callLLM(ctx, o.AI, userPrompt, SchemaJSON) + if err == nil { + if verr := ai.ValidateSchema(SchemaJSON, []byte(resp.Content)); verr != nil { + err = &ai.ProviderError{Code: ai.ErrSchemaViolation, Retryable: true, RawOutput: resp.Content, Inner: verr} + } + } + if err != nil { + var pe *ai.ProviderError + if errors.As(err, &pe) && pe.Code == ai.ErrSchemaViolation { + resp, err = callLLM(ctx, o.AI, userPrompt+"\n\nYour previous response failed schema validation. 
Re-emit the JSON strictly matching the schema.", SchemaJSON) + if err == nil { + if verr := ai.ValidateSchema(SchemaJSON, []byte(resp.Content)); verr != nil { + err = &ai.ProviderError{Code: ai.ErrSchemaViolation, Retryable: false, RawOutput: resp.Content, Inner: verr} + } + } + } + if err != nil { + return Output{}, err + } + } + + return Output{ + Raw: json.RawMessage(resp.Content), + Provider: o.AI.Name(), + TokensUsed: resp.TotalTokens, + Latency: time.Since(start), + }, nil +} + +func callLLM(ctx context.Context, p ai.Provider, userPrompt string, schema []byte) (*ai.ChatResponse, error) { + return p.Chat(ctx, &ai.ChatRequest{ + SystemPrompt: SystemPrompt, + UserMessage: userPrompt, + JSONSchema: schema, + }) +} + +type userPromptPayload struct { + MarktName string `json:"markt_name"` + Stadt string `json:"stadt"` + StartDatumHint string `json:"start_datum_hinweis"` + WebsiteHint string `json:"website_hinweis"` + RechercheDatum string `json:"recherche_datum"` + Quellen []quellePage `json:"quellen"` +} + +type quellePage struct { + URL string `json:"url"` + Title string `json:"titel"` + Text string `json:"text"` +} + +func buildUserPrompt(in Input, pages []Page) (string, error) { + p := userPromptPayload{ + MarktName: in.MarktName, + Stadt: in.Stadt, + StartDatumHint: in.StartDatumHint, + WebsiteHint: in.WebsiteHint, + RechercheDatum: in.RechercheDatum.Format("2006-01-02"), + } + for _, pg := range pages { + p.Quellen = append(p.Quellen, quellePage(pg)) + } + buf, err := json.Marshal(p) + if err != nil { + return "", fmt.Errorf("marshal user prompt: %w", err) + } + return string(buf), nil +} + +func countDomains(urls []string) int { + seen := map[string]struct{}{} + for _, raw := range urls { + u, err := url.Parse(raw) + if err != nil || u.Host == "" { + continue + } + seen[u.Host] = struct{}{} + } + return len(seen) +} diff --git a/backend/internal/domain/market/research/orchestrator_test.go b/backend/internal/domain/market/research/orchestrator_test.go new 
file mode 100644 index 0000000..9b1b99e --- /dev/null +++ b/backend/internal/domain/market/research/orchestrator_test.go @@ -0,0 +1,119 @@ +package research + +import ( + "context" + "errors" + "testing" + "time" + + "marktvogt.de/backend/internal/pkg/ai" + "marktvogt.de/backend/internal/pkg/search" +) + +type fakeProvider struct { + responses []string + errs []error + calls int +} + +func (f *fakeProvider) Name() string { return "fake" } +func (f *fakeProvider) SupportsJSONMode() bool { return true } +func (f *fakeProvider) SupportsJSONSchema() bool { return true } +func (f *fakeProvider) Chat(ctx context.Context, req *ai.ChatRequest) (*ai.ChatResponse, error) { + i := f.calls + f.calls++ + if i < len(f.errs) && f.errs[i] != nil { + return nil, f.errs[i] + } + return &ai.ChatResponse{Content: f.responses[i]}, nil +} + +type fakeSearch struct{ res []search.Result } + +func (f *fakeSearch) Search(ctx context.Context, q string, o search.Options) ([]search.Result, error) { + return f.res, nil +} + +func validJSON(t *testing.T) string { + t.Helper() + return `{ + "markt_name":"Test", + "recherche_datum":"2026-04-24", + "status":"bestaetigt", + "quellen_gesamt":["https://a.example"], + "felder":{ + "website":{"wert":null,"quellen":[],"extraktion":"direkt","hinweis":null}, + "strasse":{"wert":null,"quellen":[],"extraktion":"direkt","hinweis":null}, + "plz":{"wert":null,"quellen":[],"extraktion":"direkt","hinweis":null}, + "stadt":{"wert":"Test","quellen":["https://a.example"],"extraktion":"direkt","hinweis":null}, + "bundesland":{"wert":null,"quellen":[],"extraktion":"direkt","hinweis":null}, + "land":{"wert":"Deutschland","quellen":["https://a.example"],"extraktion":"direkt","hinweis":null}, + "veranstalter":{"wert":null,"quellen":[],"extraktion":"direkt","hinweis":null}, + "start_datum":{"wert":"2026-06-14","quellen":["https://a.example"],"extraktion":"direkt","hinweis":null}, + 
"end_datum":{"wert":"2026-06-14","quellen":["https://a.example"],"extraktion":"direkt","hinweis":null}, + "oeffnungszeiten":{"wert":null,"quellen":[],"extraktion":"direkt","hinweis":null}, + "eintrittspreise":{"wert":null,"quellen":[],"extraktion":"direkt","hinweis":null}, + "bild_url":{"wert":null,"quellen":[],"extraktion":"direkt","hinweis":null} + } +}` +} + +func TestOrchestrator_Run_HappyPath(t *testing.T) { + prov := &fakeProvider{responses: []string{validJSON(t)}} + o := &Orchestrator{ + AI: prov, + Search: &fakeSearch{}, + Scraper: &fakeScraper{byURL: map[string]string{"https://a.example": "content"}}, + MaxPages: 6, + Concurrency: 2, + } + out, err := o.Run(context.Background(), Input{ + MarktName: "Test", + Stadt: "Teststadt", + SeedURLs: []string{"https://a.example"}, + RechercheDatum: time.Now(), + }) + if err != nil { + t.Fatalf("Run: %v", err) + } + if out.Provider != "fake" || len(out.Raw) == 0 { + t.Fatalf("unexpected: %+v", out) + } +} + +func TestOrchestrator_Run_SchemaViolationRetriesOnce(t *testing.T) { + prov := &fakeProvider{responses: []string{`{"garbage":1}`, validJSON(t)}} + o := &Orchestrator{ + AI: prov, + Search: &fakeSearch{}, + Scraper: &fakeScraper{byURL: map[string]string{"https://a.example": "content"}}, + MaxPages: 6, + Concurrency: 2, + } + _, err := o.Run(context.Background(), Input{MarktName: "Test", Stadt: "X", SeedURLs: []string{"https://a.example"}, RechercheDatum: time.Now()}) + if err != nil { + t.Fatalf("Run: %v", err) + } + if prov.calls != 2 { + t.Fatalf("expected 2 calls, got %d", prov.calls) + } +} + +func TestOrchestrator_Run_SchemaViolationFailsAfterRetry(t *testing.T) { + prov := &fakeProvider{responses: []string{`{"garbage":1}`, `{"still":"bad"}`}} + o := &Orchestrator{ + AI: prov, + Search: &fakeSearch{}, + Scraper: &fakeScraper{byURL: map[string]string{"https://a.example": "content"}}, + MaxPages: 6, + Concurrency: 2, + } + _, err := o.Run(context.Background(), Input{MarktName: "Test", Stadt: "X", SeedURLs: 
[]string{"https://a.example"}, RechercheDatum: time.Now()}) + if err == nil { + t.Fatal("want schema violation error after retry") + } + var pe *ai.ProviderError + if !errors.As(err, &pe) || pe.Code != ai.ErrSchemaViolation || pe.RawOutput == "" { + t.Fatalf("want SchemaViolation with RawOutput, got %v", err) + } +} diff --git a/backend/internal/domain/market/research/prompt.go b/backend/internal/domain/market/research/prompt.go new file mode 100644 index 0000000..719918c --- /dev/null +++ b/backend/internal/domain/market/research/prompt.go @@ -0,0 +1,6 @@ +package research + +import _ "embed" + +//go:embed assets/researcher_prompt.de.md +var SystemPrompt string diff --git a/backend/internal/domain/market/research/schema.go b/backend/internal/domain/market/research/schema.go new file mode 100644 index 0000000..cf0ccd5 --- /dev/null +++ b/backend/internal/domain/market/research/schema.go @@ -0,0 +1,6 @@ +package research + +import _ "embed" + +//go:embed assets/researcher_schema.json +var SchemaJSON []byte diff --git a/backend/internal/domain/market/research/schema_test.go b/backend/internal/domain/market/research/schema_test.go new file mode 100644 index 0000000..fe9a947 --- /dev/null +++ b/backend/internal/domain/market/research/schema_test.go @@ -0,0 +1,19 @@ +package research + +import ( + "encoding/json" + "testing" +) + +func TestSchemaJSON_IsValidJSON(t *testing.T) { + var v any + if err := json.Unmarshal(SchemaJSON, &v); err != nil { + t.Fatalf("schema not valid JSON: %v", err) + } +} + +func TestSystemPrompt_NonEmpty(t *testing.T) { + if len(SystemPrompt) < 500 { + t.Fatalf("system prompt seems truncated: %d bytes", len(SystemPrompt)) + } +} diff --git a/backend/internal/pkg/search/client.go b/backend/internal/pkg/search/client.go new file mode 100644 index 0000000..bd01931 --- /dev/null +++ b/backend/internal/pkg/search/client.go @@ -0,0 +1,20 @@ +package search + +import "context" + +type Client interface { + Search(ctx context.Context, query string, opts 
Options) ([]Result, error) +} + +type Options struct { + Language string + Count int + Region string +} + +type Result struct { + URL string + Title string + Snippet string + Score float64 +} diff --git a/backend/internal/pkg/search/client_test.go b/backend/internal/pkg/search/client_test.go new file mode 100644 index 0000000..b481494 --- /dev/null +++ b/backend/internal/pkg/search/client_test.go @@ -0,0 +1,19 @@ +package search + +import ( + "context" + "testing" +) + +type stubClient struct { + results []Result + err error +} + +func (s *stubClient) Search(ctx context.Context, query string, opts Options) ([]Result, error) { + return s.results, s.err +} + +func TestStubSatisfiesInterface(t *testing.T) { + var _ Client = (*stubClient)(nil) +} diff --git a/backend/internal/pkg/search/searxng.go b/backend/internal/pkg/search/searxng.go new file mode 100644 index 0000000..9b6bf02 --- /dev/null +++ b/backend/internal/pkg/search/searxng.go @@ -0,0 +1,77 @@ +package search + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "time" +) + +type SearxngConfig struct { + BaseURL string + Timeout time.Duration +} + +type Searxng struct { + cfg SearxngConfig + client *http.Client +} + +func NewSearxng(cfg SearxngConfig) *Searxng { + if cfg.Timeout == 0 { + cfg.Timeout = 15 * time.Second + } + return &Searxng{cfg: cfg, client: &http.Client{Timeout: cfg.Timeout}} +} + +type searxngResponse struct { + Results []struct { + URL string `json:"url"` + Title string `json:"title"` + Content string `json:"content"` + Score float64 `json:"score"` + } `json:"results"` +} + +func (s *Searxng) Search(ctx context.Context, query string, opts Options) ([]Result, error) { + q := url.Values{} + q.Set("q", query) + q.Set("format", "json") + if opts.Language != "" { + q.Set("language", opts.Language) + } + if opts.Count > 0 { + q.Set("pageno", "1") + q.Set("safesearch", "0") + } + + u := s.cfg.BaseURL + "/search?" 
+ q.Encode() + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return nil, fmt.Errorf("searxng new request: %w", err) + } + req.Header.Set("Accept", "application/json") + + resp, err := s.client.Do(req) + if err != nil { + return nil, fmt.Errorf("searxng do: %w", err) + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode >= 400 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("searxng status %d: %s", resp.StatusCode, string(body)) + } + + var out searxngResponse + if err := json.NewDecoder(resp.Body).Decode(&out); err != nil { + return nil, fmt.Errorf("searxng decode: %w", err) + } + results := make([]Result, 0, len(out.Results)) + for _, r := range out.Results { + results = append(results, Result{URL: r.URL, Title: r.Title, Snippet: r.Content, Score: r.Score}) + } + return results, nil +} diff --git a/backend/internal/pkg/search/searxng_test.go b/backend/internal/pkg/search/searxng_test.go new file mode 100644 index 0000000..695cb57 --- /dev/null +++ b/backend/internal/pkg/search/searxng_test.go @@ -0,0 +1,49 @@ +package search + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestSearxng_Search_ParsesResults(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/search" { + t.Errorf("path: %s", r.URL.Path) + } + if r.URL.Query().Get("q") != "mittelaltermarkt esslingen" { + t.Errorf("q: %q", r.URL.Query().Get("q")) + } + if r.URL.Query().Get("format") != "json" { + t.Errorf("format must be json: %q", r.URL.Query().Get("format")) + } + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"results":[ + {"url":"https://a.example","title":"A","content":"snippet A","score":0.9}, + {"url":"https://b.example","title":"B","content":"snippet B","score":0.5} + ]}`)) + })) + defer srv.Close() + + c := NewSearxng(SearxngConfig{BaseURL: srv.URL, Timeout: 5 * time.Second}) + 
res, err := c.Search(context.Background(), "mittelaltermarkt esslingen", Options{Language: "de", Count: 5}) + if err != nil { + t.Fatalf("Search: %v", err) + } + if len(res) != 2 || res[0].URL != "https://a.example" || res[0].Score != 0.9 { + t.Fatalf("unexpected: %+v", res) + } +} + +func TestSearxng_Search_PropagatesHTTPError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "upstream exploded", http.StatusBadGateway) + })) + defer srv.Close() + c := NewSearxng(SearxngConfig{BaseURL: srv.URL, Timeout: time.Second}) + if _, err := c.Search(context.Background(), "x", Options{}); err == nil { + t.Fatal("want error") + } +} diff --git a/backend/internal/server/routes.go b/backend/internal/server/routes.go index 411af06..a8e34f0 100644 --- a/backend/internal/server/routes.go +++ b/backend/internal/server/routes.go @@ -17,6 +17,7 @@ import ( "marktvogt.de/backend/internal/pkg/email" "marktvogt.de/backend/internal/pkg/geocode" "marktvogt.de/backend/internal/pkg/scrape" + "marktvogt.de/backend/internal/pkg/search" "marktvogt.de/backend/internal/pkg/turnstile" ) @@ -73,14 +74,15 @@ func (s *Server) registerRoutes() { if err != nil { panic(fmt.Errorf("init ai provider: %w", err)) } - researchHandler := market.NewResearchHandler(marketSvc, aiProvider) + scraper := scrape.New(s.cfg.Discovery.CrawlerUserAgent) + searchClient := search.NewSearxng(search.SearxngConfig{BaseURL: s.cfg.Search.SearxngURL}) + researchHandler := market.NewResearchHandler(marketSvc, aiProvider, searchClient, scraper) requireAdmin := middleware.RequireRole("admin") market.RegisterAdminRoutes(v1, adminMarketHandler, researchHandler, requireAuth, requireAdmin) // Discovery routes discoveryRepo := discovery.NewRepository(s.db) crawlerInstance := crawler.NewCrawler(s.cfg.Discovery.CrawlerUserAgent, crawler.DefaultSourceConfigs()) - scraper := scrape.New(s.cfg.Discovery.CrawlerUserAgent) llmEnricher := 
enrich.NewLLMEnricher(aiProvider, scraper) simClassifier := enrich.NewSimilarityClassifier(aiProvider) discoveryService := discovery.NewService(discoveryRepo, crawlerInstance, discovery.NewLinkChecker(), marketSvc, geocoder, llmEnricher, simClassifier)