fix(ai): per-model cost calc + thinking toggle and token tracking
estimateCost ignored the model name and billed every Gemini call at hardcoded
flash-lite rates ($0.10 / $0.40 per 1M tokens), under-counting Pro calls by
roughly 12-25x. Switch to priceFor(model) and prefer resp.ModelVersion so that
aliases like gemini-pro-latest resolve to their concrete model family.

Capture ThoughtsTokenCount as a separate ThinkingTokens column on ai_usage
(migration 000030) and bill it at the output rate.

Add a global thinking on/off toggle that mirrors the grounding pattern: the
provider holds an in-memory cache (read at startup from settings.Store), the
handler keeps it in sync, and Chat() applies ThinkingConfig.ThinkingBudget=0
only when thinking is disabled. The default of true preserves SDK behavior.
Grounding and thinking get/set helpers are folded into shared getBool/setBool
to keep goconst happy.

Web admin settings: new "Modell-Reasoning" toggle card; usage panel sums now
include thinking tokens. The new fields are optional with `?? 0` defaults so a
brief web-before-backend rollout window cannot render NaN.
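priceFor and the geminiPricing table it reads are referenced by this change but
not included in the diff. A minimal sketch of the shape the new tests assume
(per-model USD rates per 1M input/output tokens, longest-prefix matching on the
concrete model name, (0, 0) for unknown names); the map layout and the matching
rule are assumptions about the existing helper, not part of this commit:

    // Sketch only: the committed priceFor/geminiPricing are not shown in this diff.
    package ai

    import "strings"

    // geminiPricing maps a model-name prefix to {input, output} USD per 1M tokens.
    // Rates mirror the expectations in TestEstimateCost_UsesPriceForModel.
    var geminiPricing = map[string][2]float64{
        "gemini-3.1-pro":        {2.00, 12.00},
        "gemini-2.5-pro":        {1.25, 10.00},
        "gemini-2.5-flash-lite": {0.10, 0.40},
        "gemini-2.5-flash":      {0.30, 2.50},
    }

    // priceFor returns (inputUSDPer1M, outputUSDPer1M) for a concrete model name.
    // It picks the longest matching prefix so a flash-lite revision is not priced
    // as flash; unknown names (including unresolved aliases such as
    // "gemini-pro-latest") return (0, 0).
    func priceFor(model string) (inUSDPerM, outUSDPerM float64) {
        best := ""
        for prefix := range geminiPricing {
            if strings.HasPrefix(model, prefix) && len(prefix) > len(best) {
                best = prefix
            }
        }
        if best == "" {
            return 0, 0
        }
        return geminiPricing[best][0], geminiPricing[best][1]
    }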
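Worked example under the rates asserted by the new tests: a gemini-2.5-pro call
with 1,000 prompt tokens, 500 candidate tokens and 200 thought tokens is
estimated at 1000 * $1.25/1M + (500 + 200) * $10.00/1M = $0.00125 + $0.00700 =
$0.00825, i.e. thoughts are billed at the output rate, which is exactly what
TestBuildUsageEvent_BillsThoughtsTokens checks.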
@@ -19,6 +19,7 @@ type AIStatus struct {
 	APIKeyFingerprint string `json:"api_key_fingerprint,omitempty"`
 	GroundingEnabled bool `json:"grounding_enabled"`
 	GroundingQuota int `json:"grounding_quota"`
+	ThinkingEnabled bool `json:"thinking_enabled"`
 	Usage UsageSummary `json:"usage"`
 }

@@ -55,6 +56,7 @@ func (h *Handler) GetAI(c *gin.Context) {
 	}

 	grounding, _ := h.store.GetGroundingEnabled(ctx)
+	thinking, _ := h.store.GetThinkingEnabled(ctx)

 	today, _ := h.usageRepo.Today(ctx)
 	month, _ := h.usageRepo.Month(ctx)
@@ -68,6 +70,7 @@ func (h *Handler) GetAI(c *gin.Context) {
 		APIKeyFingerprint: fingerprint,
 		GroundingEnabled: grounding,
 		GroundingQuota: 1500,
+		ThinkingEnabled: thinking,
 		Usage: UsageSummary{
 			Today: today,
 			Month: month,
@@ -150,6 +153,24 @@ func (h *Handler) SetGrounding(c *gin.Context) {
 	c.JSON(http.StatusOK, gin.H{"data": gin.H{"grounding_enabled": req.Enabled}})
 }

+func (h *Handler) SetThinking(c *gin.Context) {
+	ctx := c.Request.Context()
+	var req struct {
+		Enabled bool `json:"enabled"`
+	}
+	if err := c.ShouldBindJSON(&req); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "enabled is required"})
+		return
+	}
+	userID := callerID(c)
+	if err := h.store.SetThinkingEnabled(ctx, req.Enabled, userID); err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save thinking setting"})
+		return
+	}
+	h.provider.SetThinkingEnabled(req.Enabled)
+	c.JSON(http.StatusOK, gin.H{"data": gin.H{"thinking_enabled": req.Enabled}})
+}
+
 func (h *Handler) GetUsage(c *gin.Context) {
 	ctx := c.Request.Context()
 	limit := 50
@@ -8,5 +8,6 @@ func RegisterRoutes(rg *gin.RouterGroup, h *Handler, requireAuth, requireAdmin g
 	admin.POST("/settings/ai/model", h.SetModel)
 	admin.POST("/settings/ai/key", h.SetAPIKey)
 	admin.POST("/settings/ai/grounding", h.SetGrounding)
+	admin.POST("/settings/ai/thinking", h.SetThinking)
 	admin.GET("/settings/ai/usage", h.GetUsage)
 }
@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"strconv"

 	"github.com/google/uuid"
 	"github.com/jackc/pgx/v5"
@@ -16,6 +17,7 @@ const (
 	keyAPIKey = "gemini.api_key"
 	keyModel = "gemini.model"
 	keyGroundingEnabled = "gemini.grounding_enabled"
+	keyThinkingEnabled = "gemini.thinking_enabled"
 )

 // Store persists AI provider configuration in system_settings.
@@ -71,19 +73,36 @@ func (s *Store) SetModel(ctx context.Context, model string, updatedBy uuid.UUID)
 }

 func (s *Store) GetGroundingEnabled(ctx context.Context) (bool, error) {
-	v, err := s.getText(ctx, keyGroundingEnabled, "true")
-	if err != nil {
-		return true, err
-	}
-	return v != "false", nil
+	return s.getBool(ctx, keyGroundingEnabled, true)
 }

 func (s *Store) SetGroundingEnabled(ctx context.Context, enabled bool, updatedBy uuid.UUID) error {
-	v := "false"
-	if enabled {
-		v = "true"
-	}
-	return s.setText(ctx, keyGroundingEnabled, v, updatedBy)
+	return s.setBool(ctx, keyGroundingEnabled, enabled, updatedBy)
 }

+// GetThinkingEnabled controls whether the provider sends ThinkingConfig.ThinkingBudget=0
+// to disable model reasoning. Default is true (preserves SDK default of dynamic thinking).
+func (s *Store) GetThinkingEnabled(ctx context.Context) (bool, error) {
+	return s.getBool(ctx, keyThinkingEnabled, true)
+}
+
+func (s *Store) SetThinkingEnabled(ctx context.Context, enabled bool, updatedBy uuid.UUID) error {
+	return s.setBool(ctx, keyThinkingEnabled, enabled, updatedBy)
+}
+
+func (s *Store) getBool(ctx context.Context, key string, fallback bool) (bool, error) {
+	v, err := s.getText(ctx, key, strconv.FormatBool(fallback))
+	if err != nil {
+		return fallback, err
+	}
+	if b, parseErr := strconv.ParseBool(v); parseErr == nil {
+		return b, nil
+	}
+	return fallback, nil
+}
+
+func (s *Store) setBool(ctx context.Context, key string, enabled bool, updatedBy uuid.UUID) error {
+	return s.setText(ctx, key, strconv.FormatBool(enabled), updatedBy)
+}
+
 func (s *Store) getText(ctx context.Context, key, fallback string) (string, error) {
@@ -31,10 +31,10 @@ func (r *UsageRepo) Record(ctx context.Context, e ai.UsageEvent) error {
 	}
 	_, err := r.db.Exec(ctx, `
 		INSERT INTO ai_usage
-			(provider, model, call_type, input_tokens, output_tokens,
+			(provider, model, call_type, input_tokens, output_tokens, thinking_tokens,
 			 grounded, duration_ms, estimated_cost_usd, error, prompt_version)
-		VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)
-	`, e.Provider, e.Model, e.CallType, e.InputTokens, e.OutputTokens,
+		VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)
+	`, e.Provider, e.Model, e.CallType, e.InputTokens, e.OutputTokens, e.ThinkingTokens,
 		e.Grounded, e.DurationMs, e.EstimatedCostUSD, errStr, promptVersion)
 	if err != nil {
 		return fmt.Errorf("usage: record: %w", err)
@@ -47,6 +47,7 @@ type UsageStats struct {
 	Calls int `json:"calls"`
 	InputTokens int `json:"input_tokens"`
 	OutputTokens int `json:"output_tokens"`
+	ThinkingTokens int `json:"thinking_tokens"`
 	GroundingCalls int `json:"grounding_calls"`
 	EstimatedCostUSD float64 `json:"estimated_cost_usd"`
 }
@@ -71,16 +72,17 @@ func (r *UsageRepo) GroundingToday(ctx context.Context) (int, error) {
 func (r *UsageRepo) statsWindow(ctx context.Context, interval string) (UsageStats, error) {
 	row := r.db.QueryRow(ctx, fmt.Sprintf(`
 		SELECT
-			COUNT(*) AS calls,
-			COALESCE(SUM(input_tokens),0) AS input_tokens,
-			COALESCE(SUM(output_tokens),0) AS output_tokens,
+			COUNT(*) AS calls,
+			COALESCE(SUM(input_tokens),0) AS input_tokens,
+			COALESCE(SUM(output_tokens),0) AS output_tokens,
+			COALESCE(SUM(thinking_tokens),0) AS thinking_tokens,
 			COALESCE(SUM(CASE WHEN grounded THEN 1 ELSE 0 END),0) AS grounding_calls,
 			COALESCE(SUM(estimated_cost_usd),0) AS cost
 		FROM ai_usage
 		WHERE created_at >= now() - INTERVAL '%s'
 	`, interval))
 	var s UsageStats
-	if err := row.Scan(&s.Calls, &s.InputTokens, &s.OutputTokens, &s.GroundingCalls, &s.EstimatedCostUSD); err != nil {
+	if err := row.Scan(&s.Calls, &s.InputTokens, &s.OutputTokens, &s.ThinkingTokens, &s.GroundingCalls, &s.EstimatedCostUSD); err != nil {
 		return s, fmt.Errorf("usage: stats(%s): %w", interval, err)
 	}
 	return s, nil
@@ -95,6 +97,7 @@ type UsageEvent struct {
 	CallType string `json:"call_type"`
 	InputTokens int `json:"input_tokens"`
 	OutputTokens int `json:"output_tokens"`
+	ThinkingTokens int `json:"thinking_tokens"`
 	Grounded bool `json:"grounded"`
 	DurationMs int `json:"duration_ms"`
 	EstimatedCostUSD float64 `json:"estimated_cost_usd"`
@@ -105,7 +108,7 @@ type UsageEvent struct {
 func (r *UsageRepo) Recent(ctx context.Context, limit int) ([]UsageEvent, error) {
 	rows, err := r.db.Query(ctx, `
 		SELECT id, created_at, provider, model, call_type,
-			input_tokens, output_tokens, grounded, duration_ms,
+			input_tokens, output_tokens, thinking_tokens, grounded, duration_ms,
 			estimated_cost_usd, error, prompt_version
 		FROM ai_usage
 		ORDER BY created_at DESC
@@ -120,7 +123,7 @@ func (r *UsageRepo) Recent(ctx context.Context, limit int) ([]UsageEvent, error)
 	for rows.Next() {
 		var e UsageEvent
 		if err := rows.Scan(&e.ID, &e.CreatedAt, &e.Provider, &e.Model, &e.CallType,
-			&e.InputTokens, &e.OutputTokens, &e.Grounded, &e.DurationMs,
+			&e.InputTokens, &e.OutputTokens, &e.ThinkingTokens, &e.Grounded, &e.DurationMs,
 			&e.EstimatedCostUSD, &e.Error, &e.PromptVersion); err != nil {
 			return nil, fmt.Errorf("usage: scan: %w", err)
 		}
@@ -11,6 +11,7 @@ import (
 type KeySource interface {
 	GetGeminiAPIKey(ctx context.Context) (string, error)
 	GetModel(ctx context.Context) (string, error)
+	GetThinkingEnabled(ctx context.Context) (bool, error)
 }

 // NewFromConfig creates a GeminiProvider. It reads the API key from store first;
@@ -31,11 +32,22 @@ func NewFromConfig(ctx context.Context, cfg config.AIConfig, store KeySource, re
 		model = "gemini-2.5-flash-lite"
 	}

-	if apiKey == "" {
-		// No key available. Return an unconfigured provider that will fail on use,
-		// but allows the server to start so the operator can configure the key via UI.
-		return newUnconfiguredGeminiProvider(model, recorder), nil
+	thinking, terr := store.GetThinkingEnabled(ctx)
+	if terr != nil {
+		slog.Warn("ai: could not read thinking setting; defaulting to enabled", "error", terr)
+		thinking = true
 	}

-	return NewGeminiProvider(ctx, apiKey, model, recorder)
+	if apiKey == "" {
+		p := newUnconfiguredGeminiProvider(model, recorder)
+		p.SetThinkingEnabled(thinking)
+		return p, nil
+	}
+
+	p, err := NewGeminiProvider(ctx, apiKey, model, recorder)
+	if err != nil {
+		return nil, err
+	}
+	p.SetThinkingEnabled(thinking)
+	return p, nil
 }
@@ -101,13 +101,11 @@ func filterCompatibleModels(items []*genai.Model) []ModelInfo {
 	return out
 }

-// Gemini API pricing (as of 2026-04). Refresh constants when pricing changes.
+// Gemini grounding pricing (as of 2026-04). Per-model token rates live in geminiPricing.
 // https://ai.google.dev/gemini-api/docs/pricing
 const (
-	geminiInputCostPerToken = 0.10 / 1_000_000 // $0.10 / 1M tokens
-	geminiOutputCostPerToken = 0.40 / 1_000_000 // $0.40 / 1M tokens
-	geminiGroundingCostPer1k = 35.0 / 1_000 // $35 / 1k grounded prompts (above free tier)
-	geminiGroundingFreeDaily = 1_500 // daily free grounding requests
+	geminiGroundingCostPer1k = 35.0 / 1_000 // $35 / 1k grounded prompts (above free tier)
+	geminiGroundingFreeDaily = 1_500 // daily free grounding requests
 )

 type GeminiProvider struct {
@@ -116,6 +114,11 @@ type GeminiProvider struct {
 	model string
 	recorder UsageRecorder

+	// thinkingEnabled mirrors the persisted setting. When false, Chat() sets
+	// ThinkingConfig.ThinkingBudget=0 to disable reasoning on capable models.
+	// Default true preserves the SDK default of dynamic thinking.
+	thinkingEnabled bool
+
 	// groundingCallsToday is an in-process counter used for cost estimation only.
 	// It is not persisted and resets on restart. The authoritative count lives in ai_usage.
 	groundingCallsToday int
@@ -126,9 +129,10 @@ type GeminiProvider struct {
 // All Chat calls return ErrInternal until Reinitialize is called.
 func newUnconfiguredGeminiProvider(model string, recorder UsageRecorder) *GeminiProvider {
 	return &GeminiProvider{
-		model: model,
-		recorder: recorder,
-		groundingDate: time.Now().UTC().Truncate(24 * time.Hour),
+		model: model,
+		recorder: recorder,
+		thinkingEnabled: true,
+		groundingDate: time.Now().UTC().Truncate(24 * time.Hour),
 	}
 }

@@ -157,10 +161,11 @@ func NewGeminiProvider(ctx context.Context, apiKey, model string, recorder Usage
 		return nil, fmt.Errorf("gemini: new client: %w", err)
 	}
 	return &GeminiProvider{
-		client: client,
-		model: model,
-		recorder: recorder,
-		groundingDate: time.Now().UTC().Truncate(24 * time.Hour),
+		client: client,
+		model: model,
+		recorder: recorder,
+		thinkingEnabled: true,
+		groundingDate: time.Now().UTC().Truncate(24 * time.Hour),
 	}, nil
 }

@@ -181,6 +186,18 @@ func (p *GeminiProvider) SetModel(model string) {
 	p.model = model
 }

+func (p *GeminiProvider) ThinkingEnabled() bool {
+	p.mu.RLock()
+	defer p.mu.RUnlock()
+	return p.thinkingEnabled
+}
+
+func (p *GeminiProvider) SetThinkingEnabled(enabled bool) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	p.thinkingEnabled = enabled
+}
+
 func (p *GeminiProvider) ListModels(ctx context.Context) ([]ModelInfo, error) {
 	p.mu.RLock()
 	client := p.client
@@ -244,6 +261,13 @@ func (p *GeminiProvider) Chat(ctx context.Context, req *ChatRequest) (*ChatRespo
 		}
 	}

+	// Disable thinking for thinking-capable models when the operator has opted out.
+	// SDK default (no ThinkingConfig) keeps dynamic thinking on.
+	if !p.ThinkingEnabled() {
+		zero := int32(0)
+		cfg.ThinkingConfig = &genai.ThinkingConfig{ThinkingBudget: &zero}
+	}
+
 	resp, err := client.Models.GenerateContent(ctx, model,
 		genai.Text(req.UserMessage), cfg)

@@ -303,14 +327,28 @@ func (p *GeminiProvider) buildUsageEvent(model string, req *ChatRequest, resp *g
 	if resp != nil && resp.UsageMetadata != nil {
 		e.InputTokens = int(resp.UsageMetadata.PromptTokenCount)
 		e.OutputTokens = int(resp.UsageMetadata.CandidatesTokenCount)
+		e.ThinkingTokens = int(resp.UsageMetadata.ThoughtsTokenCount)
 	}
-	e.EstimatedCostUSD = p.estimateCost(e.InputTokens, e.OutputTokens, req.Grounded)
+	// Aliases like "gemini-pro-latest" don't match priceFor; the resolved name from
+	// the response (e.g. "gemini-2.5-pro-002") does. Prefer it when present.
+	pricingModel := model
+	if resp != nil && resp.ModelVersion != "" {
+		pricingModel = resp.ModelVersion
+	}
+	e.EstimatedCostUSD = p.estimateCost(pricingModel, e.InputTokens, e.OutputTokens+e.ThinkingTokens, req.Grounded)
 	return e
 }

-func (p *GeminiProvider) estimateCost(inputTokens, outputTokens int, grounded bool) float64 {
-	cost := float64(inputTokens)*geminiInputCostPerToken +
-		float64(outputTokens)*geminiOutputCostPerToken
+// estimateCost returns USD for the given token counts at the model's published rate.
+// outputTokens should already include any thinking tokens (Gemini bills thoughts at the output rate).
+// TODO: handle the >200K input tier for 2.5-pro / 3.1-pro if prompts ever exceed that.
+func (p *GeminiProvider) estimateCost(model string, inputTokens, outputTokens int, grounded bool) float64 {
+	inUSDPerM, outUSDPerM := priceFor(model)
+	cost := float64(inputTokens)*inUSDPerM/1_000_000 +
+		float64(outputTokens)*outUSDPerM/1_000_000
+	if inUSDPerM == 0 && outUSDPerM == 0 && (inputTokens > 0 || outputTokens > 0) && model != "" {
+		slog.Warn("ai: unknown model for pricing — estimated cost is 0", "model", model)
+	}
 	if grounded {
 		p.mu.Lock()
 		today := time.Now().UTC().Truncate(24 * time.Hour)
@@ -156,6 +156,96 @@ func TestPriceFor_UnknownReturnsZero(t *testing.T) {
 	}
 }

+func TestThinkingEnabled_DefaultsTrueAndIsTogglable(t *testing.T) {
+	p := newUnconfiguredGeminiProvider("gemini-2.5-pro", nil)
+	if !p.ThinkingEnabled() {
+		t.Errorf("default ThinkingEnabled = false; want true (preserves SDK default)")
+	}
+	p.SetThinkingEnabled(false)
+	if p.ThinkingEnabled() {
+		t.Errorf("after SetThinkingEnabled(false), still true")
+	}
+	p.SetThinkingEnabled(true)
+	if !p.ThinkingEnabled() {
+		t.Errorf("after SetThinkingEnabled(true), still false")
+	}
+}
+
+func TestEstimateCost_UsesPriceForModel(t *testing.T) {
+	p := &GeminiProvider{}
+	cases := []struct {
+		name string
+		model string
+		inputTokens int
+		outputTokens int
+		wantUSD float64
+	}{
+		{"2.5-pro 1M+1M", "gemini-2.5-pro", 1_000_000, 1_000_000, 1.25 + 10.00},
+		{"2.5-flash 1M+1M", "gemini-2.5-flash", 1_000_000, 1_000_000, 0.30 + 2.50},
+		{"2.5-flash-lite 1M+1M", "gemini-2.5-flash-lite", 1_000_000, 1_000_000, 0.10 + 0.40},
+		{"3.1-pro 1M+1M", "gemini-3.1-pro", 1_000_000, 1_000_000, 2.00 + 12.00},
+		{"resolved alias 2.5-pro-002", "gemini-2.5-pro-002", 1000, 500, 1000*1.25/1_000_000 + 500*10.0/1_000_000},
+		{"unknown returns zero", "gemini-pro-latest", 1000, 1000, 0},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := p.estimateCost(tc.model, tc.inputTokens, tc.outputTokens, false)
+			if got != tc.wantUSD {
+				t.Errorf("estimateCost(%q, %d, %d) = %v; want %v",
+					tc.model, tc.inputTokens, tc.outputTokens, got, tc.wantUSD)
+			}
+		})
+	}
+}
+
+func TestBuildUsageEvent_PrefersResolvedModelVersion(t *testing.T) {
+	p := &GeminiProvider{}
+	resp := &genai.GenerateContentResponse{
+		ModelVersion: "gemini-2.5-pro-002",
+		UsageMetadata: &genai.GenerateContentResponseUsageMetadata{
+			PromptTokenCount: 1000,
+			CandidatesTokenCount: 500,
+		},
+	}
+	req := &ChatRequest{CallType: "research"}
+
+	// Caller passes the alias; resolved name from response should drive pricing.
+	e := p.buildUsageEvent("gemini-pro-latest", req, resp, nil, 100)
+
+	wantCost := float64(1000)*1.25/1_000_000 + float64(500)*10.0/1_000_000
+	if e.EstimatedCostUSD != wantCost {
+		t.Errorf("EstimatedCostUSD = %v; want %v (resolved model should price as 2.5-pro)",
+			e.EstimatedCostUSD, wantCost)
+	}
+}
+
+func TestBuildUsageEvent_BillsThoughtsTokens(t *testing.T) {
+	p := &GeminiProvider{}
+	resp := &genai.GenerateContentResponse{
+		ModelVersion: "gemini-2.5-pro",
+		UsageMetadata: &genai.GenerateContentResponseUsageMetadata{
+			PromptTokenCount: 1000,
+			CandidatesTokenCount: 500,
+			ThoughtsTokenCount: 200,
+		},
+	}
+	req := &ChatRequest{CallType: "research"}
+	e := p.buildUsageEvent("gemini-2.5-pro", req, resp, nil, 100)
+
+	if e.ThinkingTokens != 200 {
+		t.Errorf("ThinkingTokens = %d; want 200", e.ThinkingTokens)
+	}
+	if e.OutputTokens != 500 {
+		t.Errorf("OutputTokens = %d; want 500 (candidates only, thoughts tracked separately)", e.OutputTokens)
+	}
+	// Cost: input @ 1.25/1M, (output + thoughts) @ 10/1M
+	wantCost := float64(1000)*1.25/1_000_000 + float64(500+200)*10.0/1_000_000
+	if e.EstimatedCostUSD != wantCost {
+		t.Errorf("EstimatedCostUSD = %v; want %v (thoughts billed at output rate)",
+			e.EstimatedCostUSD, wantCost)
+	}
+}
+
 func modelNames(ms []ModelInfo) []string {
 	names := make([]string, len(ms))
 	for i, m := range ms {
@@ -3,12 +3,17 @@ package ai
 import "context"

 // UsageEvent holds per-call telemetry recorded after each LLM call.
+//
+// OutputTokens holds visible response tokens (CandidatesTokenCount).
+// ThinkingTokens holds reasoning tokens (ThoughtsTokenCount), tracked separately
+// for visibility but billed at the output rate by Gemini.
 type UsageEvent struct {
 	Provider string
 	Model string
 	CallType string
 	InputTokens int
 	OutputTokens int
+	ThinkingTokens int
 	Grounded bool
 	DurationMs int
 	EstimatedCostUSD float64
@@ -0,0 +1 @@
+ALTER TABLE ai_usage DROP COLUMN thinking_tokens;
@@ -0,0 +1,2 @@
+ALTER TABLE ai_usage
+	ADD COLUMN thinking_tokens INT NOT NULL DEFAULT 0;
@@ -204,6 +204,7 @@ export interface AIUsageStats {
 	calls: number;
 	input_tokens: number;
 	output_tokens: number;
+	thinking_tokens?: number;
 	grounding_calls: number;
 	estimated_cost_usd: number;
 }
@@ -216,6 +217,7 @@ export interface AIUsageEvent {
 	call_type: string;
 	input_tokens: number;
 	output_tokens: number;
+	thinking_tokens?: number;
 	grounded: boolean;
 	duration_ms: number;
 	estimated_cost_usd: number;
@@ -240,6 +242,7 @@ export interface AIStatus {
 	api_key_fingerprint?: string;
 	grounding_enabled: boolean;
 	grounding_quota: number;
+	thinking_enabled: boolean;
 	usage: {
 		today: AIUsageStats;
 		month: AIUsageStats;
@@ -67,5 +67,20 @@ export const actions: Actions = {
 		} catch (err) {
 			return fail(500, { error: err instanceof Error ? err.message : 'Fehler beim Speichern.' });
 		}
 	},
+
+	setThinking: async ({ cookies, fetch, request }) => {
+		const data = await request.formData();
+		const enabled = data.get('enabled') === 'true';
+		try {
+			await serverFetch('/admin/settings/ai/thinking', cookies, {
+				method: 'POST',
+				body: JSON.stringify({ enabled }),
+				fetch
+			});
+			return { success: true, action: 'thinking', enabled };
+		} catch (err) {
+			return fail(500, { error: err instanceof Error ? err.message : 'Fehler beim Speichern.' });
+		}
+	}
 };
@@ -14,6 +14,7 @@
 	let saving = $state(false);
 	let showKeyInput = $state(untrack(() => !data.ai?.api_key_fingerprint));
 	let groundingEnabled = $state(untrack(() => data.ai?.grounding_enabled ?? false));
+	let thinkingEnabled = $state(untrack(() => data.ai?.thinking_enabled ?? true));

 	let activeModel = $derived(
 		form?.success && form.action === 'model' && form.model ? form.model : (data.ai?.model ?? '')
@@ -266,6 +267,59 @@
 		</div>
 	</div>

+	<!-- Card 3b: Thinking (Reasoning) -->
+	<div
+		class="rounded-lg border border-stone-200 bg-white dark:border-stone-700 dark:bg-stone-900"
+	>
+		<div class="border-b border-stone-200 px-6 py-4 dark:border-stone-700">
+			<h2 class="text-base font-semibold text-stone-900 dark:text-stone-100">
+				Modell-Reasoning (Thinking)
+			</h2>
+		</div>
+		<div class="px-6 py-4">
+			<div class="flex items-center justify-between">
+				<div>
+					<p class="text-sm text-stone-700 dark:text-stone-300">
+						Thinking-Token bei kompatiblen Modellen erlauben
+					</p>
+					<p class="mt-0.5 text-xs text-stone-400">
+						Aus = günstiger, schneller. An = höhere Antwortqualität bei komplexen Aufgaben.
+					</p>
+				</div>
+				<form
+					method="POST"
+					action="?/setThinking"
+					use:enhance={() => {
+						return async ({ update }) => {
+							await update();
+						};
+					}}
+				>
+					<input type="hidden" name="enabled" value={thinkingEnabled ? 'false' : 'true'} />
+					<button
+						type="submit"
+						onclick={() => (thinkingEnabled = !thinkingEnabled)}
+						aria-label={thinkingEnabled ? 'Thinking deaktivieren' : 'Thinking aktivieren'}
+						class="relative inline-flex h-6 w-11 items-center rounded-full transition-colors {thinkingEnabled
+							? 'bg-primary-600'
+							: 'bg-stone-300 dark:bg-stone-600'}"
+					>
+						<span
+							class="inline-block h-4 w-4 transform rounded-full bg-white shadow transition-transform {thinkingEnabled
+								? 'translate-x-6'
+								: 'translate-x-1'}"
+						></span>
+					</button>
+				</form>
+			</div>
+			{#if form?.success && form.action === 'thinking'}
+				<p class="mt-2 text-xs text-green-600 dark:text-green-400">
+					Thinking {form.enabled ? 'aktiviert' : 'deaktiviert'}.
+				</p>
+			{/if}
+		</div>
+	</div>
+
 	<!-- Card 4: Usage -->
 	<div
 		class="rounded-lg border border-stone-200 bg-white dark:border-stone-700 dark:bg-stone-900"
@@ -276,7 +330,7 @@
 	<div class="space-y-4 px-6 py-4">
 		<!-- Rollup stats -->
 		<div class="grid grid-cols-2 gap-4 sm:grid-cols-4">
-			{#each [{ label: 'Anfragen heute', value: data.ai.usage.today.calls.toString() }, { label: 'Tokens heute', value: (data.ai.usage.today.input_tokens + data.ai.usage.today.output_tokens).toLocaleString('de-DE') }, { label: 'Kosten heute', value: formatCost(data.ai.usage.today.estimated_cost_usd) }, { label: 'Kosten (30 Tage)', value: formatCost(data.ai.usage.month.estimated_cost_usd) }] as stat}
+			{#each [{ label: 'Anfragen heute', value: data.ai.usage.today.calls.toString() }, { label: 'Tokens heute', value: (data.ai.usage.today.input_tokens + data.ai.usage.today.output_tokens + (data.ai.usage.today.thinking_tokens ?? 0)).toLocaleString('de-DE') }, { label: 'Kosten heute', value: formatCost(data.ai.usage.today.estimated_cost_usd) }, { label: 'Kosten (30 Tage)', value: formatCost(data.ai.usage.month.estimated_cost_usd) }] as stat}
 				<div class="rounded-md bg-stone-50 px-3 py-2 dark:bg-stone-800">
 					<p class="text-xs text-stone-400">{stat.label}</p>
 					<p class="mt-0.5 text-sm font-semibold text-stone-800 dark:text-stone-200">
@@ -311,7 +365,11 @@
 	<td class="py-1.5 pr-4">{event.call_type}</td>
 	<td class="max-w-32 truncate py-1.5 pr-4 font-mono">{event.model}</td>
 	<td class="py-1.5 pr-4"
-		>{(event.input_tokens + event.output_tokens).toLocaleString('de-DE')}</td
+		>{(
+			event.input_tokens +
+			event.output_tokens +
+			(event.thinking_tokens ?? 0)
+		).toLocaleString('de-DE')}</td
 	>
 	<td class="py-1.5 pr-4">{event.grounded ? '✓' : '—'}</td>
 	<td class="py-1.5">{formatCost(event.estimated_cost_usd)}</td>