fix(ai): per-model cost calc + thinking toggle and token tracking
estimateCost ignored the model name and billed every Gemini call at hardcoded
flash-lite rates ($0.10 / $0.40 per 1M tokens), under-counting Pro calls by
roughly 12-25x. Switch to priceFor(model) and prefer resp.ModelVersion so that
aliases like gemini-pro-latest resolve to their concrete model family.

Capture ThoughtsTokenCount as a separate ThinkingTokens column on ai_usage
(migration 000030) and bill it at the output rate.

Add a global thinking on/off toggle that mirrors the grounding pattern: the
provider holds an in-memory cache (read at startup from settings.Store), the
handler keeps it in sync, and Chat() applies ThinkingConfig.ThinkingBudget=0
only when thinking is disabled. The default of true preserves SDK behavior.
Grounding and thinking get/set helpers are folded into shared getBool/setBool
to keep goconst happy.

Web admin settings: new "Modell-Reasoning" toggle card; usage panel sums now
include thinking tokens. The new fields are optional with `?? 0` defaults so a
brief web-before-backend rollout window cannot render NaN.
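priceFor and the geminiPricing table it reads are referenced by this change but
not included in the diff. A minimal sketch of the shape the new tests assume
(per-model USD rates per 1M input/output tokens, longest-prefix matching on the
concrete model name, (0, 0) for unknown names); the map layout and the matching
rule are assumptions about the existing helper, not part of this commit:

    // Sketch only: the committed priceFor/geminiPricing are not shown in this diff.
    package ai

    import "strings"

    // geminiPricing maps a model-name prefix to {input, output} USD per 1M tokens.
    // Rates mirror the expectations in TestEstimateCost_UsesPriceForModel.
    var geminiPricing = map[string][2]float64{
        "gemini-3.1-pro":        {2.00, 12.00},
        "gemini-2.5-pro":        {1.25, 10.00},
        "gemini-2.5-flash-lite": {0.10, 0.40},
        "gemini-2.5-flash":      {0.30, 2.50},
    }

    // priceFor returns (inputUSDPer1M, outputUSDPer1M) for a concrete model name.
    // It picks the longest matching prefix so a flash-lite revision is not priced
    // as flash; unknown names (including unresolved aliases such as
    // "gemini-pro-latest") return (0, 0).
    func priceFor(model string) (inUSDPerM, outUSDPerM float64) {
        best := ""
        for prefix := range geminiPricing {
            if strings.HasPrefix(model, prefix) && len(prefix) > len(best) {
                best = prefix
            }
        }
        if best == "" {
            return 0, 0
        }
        return geminiPricing[best][0], geminiPricing[best][1]
    }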
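Worked example under the rates asserted by the new tests: a gemini-2.5-pro call
with 1,000 prompt tokens, 500 candidate tokens and 200 thought tokens is
estimated at 1000 * $1.25/1M + (500 + 200) * $10.00/1M = $0.00125 + $0.00700 =
$0.00825, i.e. thoughts are billed at the output rate, which is exactly what
TestBuildUsageEvent_BillsThoughtsTokens checks.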
@@ -19,6 +19,7 @@ type AIStatus struct {
 	APIKeyFingerprint string `json:"api_key_fingerprint,omitempty"`
 	GroundingEnabled bool `json:"grounding_enabled"`
 	GroundingQuota int `json:"grounding_quota"`
+	ThinkingEnabled bool `json:"thinking_enabled"`
 	Usage UsageSummary `json:"usage"`
 }

@@ -55,6 +56,7 @@ func (h *Handler) GetAI(c *gin.Context) {
 	}

 	grounding, _ := h.store.GetGroundingEnabled(ctx)
+	thinking, _ := h.store.GetThinkingEnabled(ctx)

 	today, _ := h.usageRepo.Today(ctx)
 	month, _ := h.usageRepo.Month(ctx)
@@ -68,6 +70,7 @@ func (h *Handler) GetAI(c *gin.Context) {
 		APIKeyFingerprint: fingerprint,
 		GroundingEnabled: grounding,
 		GroundingQuota: 1500,
+		ThinkingEnabled: thinking,
 		Usage: UsageSummary{
 			Today: today,
 			Month: month,
@@ -150,6 +153,24 @@ func (h *Handler) SetGrounding(c *gin.Context) {
 	c.JSON(http.StatusOK, gin.H{"data": gin.H{"grounding_enabled": req.Enabled}})
 }

+func (h *Handler) SetThinking(c *gin.Context) {
+	ctx := c.Request.Context()
+	var req struct {
+		Enabled bool `json:"enabled"`
+	}
+	if err := c.ShouldBindJSON(&req); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "enabled is required"})
+		return
+	}
+	userID := callerID(c)
+	if err := h.store.SetThinkingEnabled(ctx, req.Enabled, userID); err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save thinking setting"})
+		return
+	}
+	h.provider.SetThinkingEnabled(req.Enabled)
+	c.JSON(http.StatusOK, gin.H{"data": gin.H{"thinking_enabled": req.Enabled}})
+}
+
 func (h *Handler) GetUsage(c *gin.Context) {
 	ctx := c.Request.Context()
 	limit := 50
@@ -8,5 +8,6 @@ func RegisterRoutes(rg *gin.RouterGroup, h *Handler, requireAuth, requireAdmin g
 	admin.POST("/settings/ai/model", h.SetModel)
 	admin.POST("/settings/ai/key", h.SetAPIKey)
 	admin.POST("/settings/ai/grounding", h.SetGrounding)
+	admin.POST("/settings/ai/thinking", h.SetThinking)
 	admin.GET("/settings/ai/usage", h.GetUsage)
 }
@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"strconv"

 	"github.com/google/uuid"
 	"github.com/jackc/pgx/v5"
@@ -16,6 +17,7 @@ const (
 	keyAPIKey = "gemini.api_key"
 	keyModel = "gemini.model"
 	keyGroundingEnabled = "gemini.grounding_enabled"
+	keyThinkingEnabled = "gemini.thinking_enabled"
 )

 // Store persists AI provider configuration in system_settings.
@@ -71,19 +73,36 @@ func (s *Store) SetModel(ctx context.Context, model string, updatedBy uuid.UUID)
 }

 func (s *Store) GetGroundingEnabled(ctx context.Context) (bool, error) {
-	v, err := s.getText(ctx, keyGroundingEnabled, "true")
-	if err != nil {
-		return true, err
-	}
-	return v != "false", nil
+	return s.getBool(ctx, keyGroundingEnabled, true)
 }

 func (s *Store) SetGroundingEnabled(ctx context.Context, enabled bool, updatedBy uuid.UUID) error {
-	v := "false"
-	if enabled {
-		v = "true"
-	}
-	return s.setText(ctx, keyGroundingEnabled, v, updatedBy)
+	return s.setBool(ctx, keyGroundingEnabled, enabled, updatedBy)
 }

+// GetThinkingEnabled controls whether the provider sends ThinkingConfig.ThinkingBudget=0
+// to disable model reasoning. Default is true (preserves SDK default of dynamic thinking).
+func (s *Store) GetThinkingEnabled(ctx context.Context) (bool, error) {
+	return s.getBool(ctx, keyThinkingEnabled, true)
+}
+
+func (s *Store) SetThinkingEnabled(ctx context.Context, enabled bool, updatedBy uuid.UUID) error {
+	return s.setBool(ctx, keyThinkingEnabled, enabled, updatedBy)
+}
+
+func (s *Store) getBool(ctx context.Context, key string, fallback bool) (bool, error) {
+	v, err := s.getText(ctx, key, strconv.FormatBool(fallback))
+	if err != nil {
+		return fallback, err
+	}
+	if b, parseErr := strconv.ParseBool(v); parseErr == nil {
+		return b, nil
+	}
+	return fallback, nil
+}
+
+func (s *Store) setBool(ctx context.Context, key string, enabled bool, updatedBy uuid.UUID) error {
+	return s.setText(ctx, key, strconv.FormatBool(enabled), updatedBy)
+}
+
 func (s *Store) getText(ctx context.Context, key, fallback string) (string, error) {
@@ -31,10 +31,10 @@ func (r *UsageRepo) Record(ctx context.Context, e ai.UsageEvent) error {
 	}
 	_, err := r.db.Exec(ctx, `
 		INSERT INTO ai_usage
-			(provider, model, call_type, input_tokens, output_tokens,
+			(provider, model, call_type, input_tokens, output_tokens, thinking_tokens,
 			 grounded, duration_ms, estimated_cost_usd, error, prompt_version)
-		VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10)
-	`, e.Provider, e.Model, e.CallType, e.InputTokens, e.OutputTokens,
+		VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)
+	`, e.Provider, e.Model, e.CallType, e.InputTokens, e.OutputTokens, e.ThinkingTokens,
 		e.Grounded, e.DurationMs, e.EstimatedCostUSD, errStr, promptVersion)
 	if err != nil {
 		return fmt.Errorf("usage: record: %w", err)
@@ -47,6 +47,7 @@ type UsageStats struct {
 	Calls int `json:"calls"`
 	InputTokens int `json:"input_tokens"`
 	OutputTokens int `json:"output_tokens"`
+	ThinkingTokens int `json:"thinking_tokens"`
 	GroundingCalls int `json:"grounding_calls"`
 	EstimatedCostUSD float64 `json:"estimated_cost_usd"`
 }
@@ -71,16 +72,17 @@ func (r *UsageRepo) GroundingToday(ctx context.Context) (int, error) {
 func (r *UsageRepo) statsWindow(ctx context.Context, interval string) (UsageStats, error) {
 	row := r.db.QueryRow(ctx, fmt.Sprintf(`
 		SELECT
-			COUNT(*) AS calls,
-			COALESCE(SUM(input_tokens),0) AS input_tokens,
-			COALESCE(SUM(output_tokens),0) AS output_tokens,
+			COUNT(*) AS calls,
+			COALESCE(SUM(input_tokens),0) AS input_tokens,
+			COALESCE(SUM(output_tokens),0) AS output_tokens,
+			COALESCE(SUM(thinking_tokens),0) AS thinking_tokens,
 			COALESCE(SUM(CASE WHEN grounded THEN 1 ELSE 0 END),0) AS grounding_calls,
 			COALESCE(SUM(estimated_cost_usd),0) AS cost
 		FROM ai_usage
 		WHERE created_at >= now() - INTERVAL '%s'
 	`, interval))
 	var s UsageStats
-	if err := row.Scan(&s.Calls, &s.InputTokens, &s.OutputTokens, &s.GroundingCalls, &s.EstimatedCostUSD); err != nil {
+	if err := row.Scan(&s.Calls, &s.InputTokens, &s.OutputTokens, &s.ThinkingTokens, &s.GroundingCalls, &s.EstimatedCostUSD); err != nil {
 		return s, fmt.Errorf("usage: stats(%s): %w", interval, err)
 	}
 	return s, nil
@@ -95,6 +97,7 @@ type UsageEvent struct {
 	CallType string `json:"call_type"`
 	InputTokens int `json:"input_tokens"`
 	OutputTokens int `json:"output_tokens"`
+	ThinkingTokens int `json:"thinking_tokens"`
 	Grounded bool `json:"grounded"`
 	DurationMs int `json:"duration_ms"`
 	EstimatedCostUSD float64 `json:"estimated_cost_usd"`
@@ -105,7 +108,7 @@ type UsageEvent struct {
 func (r *UsageRepo) Recent(ctx context.Context, limit int) ([]UsageEvent, error) {
 	rows, err := r.db.Query(ctx, `
 		SELECT id, created_at, provider, model, call_type,
-			input_tokens, output_tokens, grounded, duration_ms,
+			input_tokens, output_tokens, thinking_tokens, grounded, duration_ms,
 			estimated_cost_usd, error, prompt_version
 		FROM ai_usage
 		ORDER BY created_at DESC
@@ -120,7 +123,7 @@ func (r *UsageRepo) Recent(ctx context.Context, limit int) ([]UsageEvent, error)
 	for rows.Next() {
 		var e UsageEvent
 		if err := rows.Scan(&e.ID, &e.CreatedAt, &e.Provider, &e.Model, &e.CallType,
-			&e.InputTokens, &e.OutputTokens, &e.Grounded, &e.DurationMs,
+			&e.InputTokens, &e.OutputTokens, &e.ThinkingTokens, &e.Grounded, &e.DurationMs,
 			&e.EstimatedCostUSD, &e.Error, &e.PromptVersion); err != nil {
 			return nil, fmt.Errorf("usage: scan: %w", err)
 		}
@@ -11,6 +11,7 @@ import (
 type KeySource interface {
 	GetGeminiAPIKey(ctx context.Context) (string, error)
 	GetModel(ctx context.Context) (string, error)
+	GetThinkingEnabled(ctx context.Context) (bool, error)
 }

 // NewFromConfig creates a GeminiProvider. It reads the API key from store first;
@@ -31,11 +32,22 @@ func NewFromConfig(ctx context.Context, cfg config.AIConfig, store KeySource, re
 		model = "gemini-2.5-flash-lite"
 	}

-	if apiKey == "" {
-		// No key available. Return an unconfigured provider that will fail on use,
-		// but allows the server to start so the operator can configure the key via UI.
-		return newUnconfiguredGeminiProvider(model, recorder), nil
+	thinking, terr := store.GetThinkingEnabled(ctx)
+	if terr != nil {
+		slog.Warn("ai: could not read thinking setting; defaulting to enabled", "error", terr)
+		thinking = true
 	}

-	return NewGeminiProvider(ctx, apiKey, model, recorder)
+	if apiKey == "" {
+		p := newUnconfiguredGeminiProvider(model, recorder)
+		p.SetThinkingEnabled(thinking)
+		return p, nil
+	}
+
+	p, err := NewGeminiProvider(ctx, apiKey, model, recorder)
+	if err != nil {
+		return nil, err
+	}
+	p.SetThinkingEnabled(thinking)
+	return p, nil
 }
@@ -101,13 +101,11 @@ func filterCompatibleModels(items []*genai.Model) []ModelInfo {
 	return out
 }

-// Gemini API pricing (as of 2026-04). Refresh constants when pricing changes.
+// Gemini grounding pricing (as of 2026-04). Per-model token rates live in geminiPricing.
 // https://ai.google.dev/gemini-api/docs/pricing
 const (
-	geminiInputCostPerToken = 0.10 / 1_000_000 // $0.10 / 1M tokens
-	geminiOutputCostPerToken = 0.40 / 1_000_000 // $0.40 / 1M tokens
-	geminiGroundingCostPer1k = 35.0 / 1_000 // $35 / 1k grounded prompts (above free tier)
-	geminiGroundingFreeDaily = 1_500 // daily free grounding requests
+	geminiGroundingCostPer1k = 35.0 / 1_000 // $35 / 1k grounded prompts (above free tier)
+	geminiGroundingFreeDaily = 1_500 // daily free grounding requests
 )

 type GeminiProvider struct {
@@ -116,6 +114,11 @@ type GeminiProvider struct {
 	model string
 	recorder UsageRecorder

+	// thinkingEnabled mirrors the persisted setting. When false, Chat() sets
+	// ThinkingConfig.ThinkingBudget=0 to disable reasoning on capable models.
+	// Default true preserves the SDK default of dynamic thinking.
+	thinkingEnabled bool
+
 	// groundingCallsToday is an in-process counter used for cost estimation only.
 	// It is not persisted and resets on restart. The authoritative count lives in ai_usage.
 	groundingCallsToday int
@@ -126,9 +129,10 @@ type GeminiProvider struct {
 // All Chat calls return ErrInternal until Reinitialize is called.
 func newUnconfiguredGeminiProvider(model string, recorder UsageRecorder) *GeminiProvider {
 	return &GeminiProvider{
-		model: model,
-		recorder: recorder,
-		groundingDate: time.Now().UTC().Truncate(24 * time.Hour),
+		model: model,
+		recorder: recorder,
+		thinkingEnabled: true,
+		groundingDate: time.Now().UTC().Truncate(24 * time.Hour),
 	}
 }

@@ -157,10 +161,11 @@ func NewGeminiProvider(ctx context.Context, apiKey, model string, recorder Usage
 		return nil, fmt.Errorf("gemini: new client: %w", err)
 	}
 	return &GeminiProvider{
-		client: client,
-		model: model,
-		recorder: recorder,
-		groundingDate: time.Now().UTC().Truncate(24 * time.Hour),
+		client: client,
+		model: model,
+		recorder: recorder,
+		thinkingEnabled: true,
+		groundingDate: time.Now().UTC().Truncate(24 * time.Hour),
 	}, nil
 }

@@ -181,6 +186,18 @@ func (p *GeminiProvider) SetModel(model string) {
 	p.model = model
 }

+func (p *GeminiProvider) ThinkingEnabled() bool {
+	p.mu.RLock()
+	defer p.mu.RUnlock()
+	return p.thinkingEnabled
+}
+
+func (p *GeminiProvider) SetThinkingEnabled(enabled bool) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	p.thinkingEnabled = enabled
+}
+
 func (p *GeminiProvider) ListModels(ctx context.Context) ([]ModelInfo, error) {
 	p.mu.RLock()
 	client := p.client
@@ -244,6 +261,13 @@ func (p *GeminiProvider) Chat(ctx context.Context, req *ChatRequest) (*ChatRespo
 		}
 	}

+	// Disable thinking for thinking-capable models when the operator has opted out.
+	// SDK default (no ThinkingConfig) keeps dynamic thinking on.
+	if !p.ThinkingEnabled() {
+		zero := int32(0)
+		cfg.ThinkingConfig = &genai.ThinkingConfig{ThinkingBudget: &zero}
+	}
+
 	resp, err := client.Models.GenerateContent(ctx, model,
 		genai.Text(req.UserMessage), cfg)

@@ -303,14 +327,28 @@ func (p *GeminiProvider) buildUsageEvent(model string, req *ChatRequest, resp *g
 	if resp != nil && resp.UsageMetadata != nil {
 		e.InputTokens = int(resp.UsageMetadata.PromptTokenCount)
 		e.OutputTokens = int(resp.UsageMetadata.CandidatesTokenCount)
+		e.ThinkingTokens = int(resp.UsageMetadata.ThoughtsTokenCount)
 	}
-	e.EstimatedCostUSD = p.estimateCost(e.InputTokens, e.OutputTokens, req.Grounded)
+	// Aliases like "gemini-pro-latest" don't match priceFor; the resolved name from
+	// the response (e.g. "gemini-2.5-pro-002") does. Prefer it when present.
+	pricingModel := model
+	if resp != nil && resp.ModelVersion != "" {
+		pricingModel = resp.ModelVersion
+	}
+	e.EstimatedCostUSD = p.estimateCost(pricingModel, e.InputTokens, e.OutputTokens+e.ThinkingTokens, req.Grounded)
 	return e
 }

-func (p *GeminiProvider) estimateCost(inputTokens, outputTokens int, grounded bool) float64 {
-	cost := float64(inputTokens)*geminiInputCostPerToken +
-		float64(outputTokens)*geminiOutputCostPerToken
+// estimateCost returns USD for the given token counts at the model's published rate.
+// outputTokens should already include any thinking tokens (Gemini bills thoughts at the output rate).
+// TODO: handle the >200K input tier for 2.5-pro / 3.1-pro if prompts ever exceed that.
+func (p *GeminiProvider) estimateCost(model string, inputTokens, outputTokens int, grounded bool) float64 {
+	inUSDPerM, outUSDPerM := priceFor(model)
+	cost := float64(inputTokens)*inUSDPerM/1_000_000 +
+		float64(outputTokens)*outUSDPerM/1_000_000
+	if inUSDPerM == 0 && outUSDPerM == 0 && (inputTokens > 0 || outputTokens > 0) && model != "" {
+		slog.Warn("ai: unknown model for pricing — estimated cost is 0", "model", model)
+	}
 	if grounded {
 		p.mu.Lock()
 		today := time.Now().UTC().Truncate(24 * time.Hour)
@@ -156,6 +156,96 @@ func TestPriceFor_UnknownReturnsZero(t *testing.T) {
 	}
 }

+func TestThinkingEnabled_DefaultsTrueAndIsTogglable(t *testing.T) {
+	p := newUnconfiguredGeminiProvider("gemini-2.5-pro", nil)
+	if !p.ThinkingEnabled() {
+		t.Errorf("default ThinkingEnabled = false; want true (preserves SDK default)")
+	}
+	p.SetThinkingEnabled(false)
+	if p.ThinkingEnabled() {
+		t.Errorf("after SetThinkingEnabled(false), still true")
+	}
+	p.SetThinkingEnabled(true)
+	if !p.ThinkingEnabled() {
+		t.Errorf("after SetThinkingEnabled(true), still false")
+	}
+}
+
+func TestEstimateCost_UsesPriceForModel(t *testing.T) {
+	p := &GeminiProvider{}
+	cases := []struct {
+		name string
+		model string
+		inputTokens int
+		outputTokens int
+		wantUSD float64
+	}{
+		{"2.5-pro 1M+1M", "gemini-2.5-pro", 1_000_000, 1_000_000, 1.25 + 10.00},
+		{"2.5-flash 1M+1M", "gemini-2.5-flash", 1_000_000, 1_000_000, 0.30 + 2.50},
+		{"2.5-flash-lite 1M+1M", "gemini-2.5-flash-lite", 1_000_000, 1_000_000, 0.10 + 0.40},
+		{"3.1-pro 1M+1M", "gemini-3.1-pro", 1_000_000, 1_000_000, 2.00 + 12.00},
+		{"resolved alias 2.5-pro-002", "gemini-2.5-pro-002", 1000, 500, 1000*1.25/1_000_000 + 500*10.0/1_000_000},
+		{"unknown returns zero", "gemini-pro-latest", 1000, 1000, 0},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := p.estimateCost(tc.model, tc.inputTokens, tc.outputTokens, false)
+			if got != tc.wantUSD {
+				t.Errorf("estimateCost(%q, %d, %d) = %v; want %v",
+					tc.model, tc.inputTokens, tc.outputTokens, got, tc.wantUSD)
+			}
+		})
+	}
+}
+
+func TestBuildUsageEvent_PrefersResolvedModelVersion(t *testing.T) {
+	p := &GeminiProvider{}
+	resp := &genai.GenerateContentResponse{
+		ModelVersion: "gemini-2.5-pro-002",
+		UsageMetadata: &genai.GenerateContentResponseUsageMetadata{
+			PromptTokenCount: 1000,
+			CandidatesTokenCount: 500,
+		},
+	}
+	req := &ChatRequest{CallType: "research"}
+
+	// Caller passes the alias; resolved name from response should drive pricing.
+	e := p.buildUsageEvent("gemini-pro-latest", req, resp, nil, 100)
+
+	wantCost := float64(1000)*1.25/1_000_000 + float64(500)*10.0/1_000_000
+	if e.EstimatedCostUSD != wantCost {
+		t.Errorf("EstimatedCostUSD = %v; want %v (resolved model should price as 2.5-pro)",
+			e.EstimatedCostUSD, wantCost)
+	}
+}
+
+func TestBuildUsageEvent_BillsThoughtsTokens(t *testing.T) {
+	p := &GeminiProvider{}
+	resp := &genai.GenerateContentResponse{
+		ModelVersion: "gemini-2.5-pro",
+		UsageMetadata: &genai.GenerateContentResponseUsageMetadata{
+			PromptTokenCount: 1000,
+			CandidatesTokenCount: 500,
+			ThoughtsTokenCount: 200,
+		},
+	}
+	req := &ChatRequest{CallType: "research"}
+	e := p.buildUsageEvent("gemini-2.5-pro", req, resp, nil, 100)
+
+	if e.ThinkingTokens != 200 {
+		t.Errorf("ThinkingTokens = %d; want 200", e.ThinkingTokens)
+	}
+	if e.OutputTokens != 500 {
+		t.Errorf("OutputTokens = %d; want 500 (candidates only, thoughts tracked separately)", e.OutputTokens)
+	}
+	// Cost: input @ 1.25/1M, (output + thoughts) @ 10/1M
+	wantCost := float64(1000)*1.25/1_000_000 + float64(500+200)*10.0/1_000_000
+	if e.EstimatedCostUSD != wantCost {
+		t.Errorf("EstimatedCostUSD = %v; want %v (thoughts billed at output rate)",
+			e.EstimatedCostUSD, wantCost)
+	}
+}
+
 func modelNames(ms []ModelInfo) []string {
 	names := make([]string, len(ms))
 	for i, m := range ms {
@@ -3,12 +3,17 @@ package ai
 import "context"

 // UsageEvent holds per-call telemetry recorded after each LLM call.
+//
+// OutputTokens holds visible response tokens (CandidatesTokenCount).
+// ThinkingTokens holds reasoning tokens (ThoughtsTokenCount), tracked separately
+// for visibility but billed at the output rate by Gemini.
 type UsageEvent struct {
 	Provider string
 	Model string
 	CallType string
 	InputTokens int
 	OutputTokens int
+	ThinkingTokens int
 	Grounded bool
 	DurationMs int
 	EstimatedCostUSD float64
@@ -0,0 +1 @@
+ALTER TABLE ai_usage DROP COLUMN thinking_tokens;
@@ -0,0 +1,2 @@
+ALTER TABLE ai_usage
+	ADD COLUMN thinking_tokens INT NOT NULL DEFAULT 0;
@@ -204,6 +204,7 @@ export interface AIUsageStats {
 	calls: number;
 	input_tokens: number;
 	output_tokens: number;
+	thinking_tokens?: number;
 	grounding_calls: number;
 	estimated_cost_usd: number;
 }
@@ -216,6 +217,7 @@ export interface AIUsageEvent {
 	call_type: string;
 	input_tokens: number;
 	output_tokens: number;
+	thinking_tokens?: number;
 	grounded: boolean;
 	duration_ms: number;
 	estimated_cost_usd: number;
@@ -240,6 +242,7 @@ export interface AIStatus {
 	api_key_fingerprint?: string;
 	grounding_enabled: boolean;
 	grounding_quota: number;
+	thinking_enabled: boolean;
 	usage: {
 		today: AIUsageStats;
 		month: AIUsageStats;
@@ -67,5 +67,20 @@ export const actions: Actions = {
 		} catch (err) {
 			return fail(500, { error: err instanceof Error ? err.message : 'Fehler beim Speichern.' });
 		}
 	},
+
+	setThinking: async ({ cookies, fetch, request }) => {
+		const data = await request.formData();
+		const enabled = data.get('enabled') === 'true';
+		try {
+			await serverFetch('/admin/settings/ai/thinking', cookies, {
+				method: 'POST',
+				body: JSON.stringify({ enabled }),
+				fetch
+			});
+			return { success: true, action: 'thinking', enabled };
+		} catch (err) {
+			return fail(500, { error: err instanceof Error ? err.message : 'Fehler beim Speichern.' });
+		}
+	}
 };
@@ -14,6 +14,7 @@
 	let saving = $state(false);
 	let showKeyInput = $state(untrack(() => !data.ai?.api_key_fingerprint));
 	let groundingEnabled = $state(untrack(() => data.ai?.grounding_enabled ?? false));
+	let thinkingEnabled = $state(untrack(() => data.ai?.thinking_enabled ?? true));

 	let activeModel = $derived(
 		form?.success && form.action === 'model' && form.model ? form.model : (data.ai?.model ?? '')
@@ -266,6 +267,59 @@
 		</div>
 	</div>

+	<!-- Card 3b: Thinking (Reasoning) -->
+	<div
+		class="rounded-lg border border-stone-200 bg-white dark:border-stone-700 dark:bg-stone-900"
+	>
+		<div class="border-b border-stone-200 px-6 py-4 dark:border-stone-700">
+			<h2 class="text-base font-semibold text-stone-900 dark:text-stone-100">
+				Modell-Reasoning (Thinking)
+			</h2>
+		</div>
+		<div class="px-6 py-4">
+			<div class="flex items-center justify-between">
+				<div>
+					<p class="text-sm text-stone-700 dark:text-stone-300">
+						Thinking-Token bei kompatiblen Modellen erlauben
+					</p>
+					<p class="mt-0.5 text-xs text-stone-400">
+						Aus = günstiger, schneller. An = höhere Antwortqualität bei komplexen Aufgaben.
+					</p>
+				</div>
+				<form
+					method="POST"
+					action="?/setThinking"
+					use:enhance={() => {
+						return async ({ update }) => {
+							await update();
+						};
+					}}
+				>
+					<input type="hidden" name="enabled" value={thinkingEnabled ? 'false' : 'true'} />
+					<button
+						type="submit"
+						onclick={() => (thinkingEnabled = !thinkingEnabled)}
+						aria-label={thinkingEnabled ? 'Thinking deaktivieren' : 'Thinking aktivieren'}
+						class="relative inline-flex h-6 w-11 items-center rounded-full transition-colors {thinkingEnabled
+							? 'bg-primary-600'
+							: 'bg-stone-300 dark:bg-stone-600'}"
+					>
+						<span
+							class="inline-block h-4 w-4 transform rounded-full bg-white shadow transition-transform {thinkingEnabled
+								? 'translate-x-6'
+								: 'translate-x-1'}"
+						></span>
+					</button>
+				</form>
+			</div>
+			{#if form?.success && form.action === 'thinking'}
+				<p class="mt-2 text-xs text-green-600 dark:text-green-400">
+					Thinking {form.enabled ? 'aktiviert' : 'deaktiviert'}.
+				</p>
+			{/if}
+		</div>
+	</div>
+
 	<!-- Card 4: Usage -->
 	<div
 		class="rounded-lg border border-stone-200 bg-white dark:border-stone-700 dark:bg-stone-900"
@@ -276,7 +330,7 @@
 	<div class="space-y-4 px-6 py-4">
 		<!-- Rollup stats -->
 		<div class="grid grid-cols-2 gap-4 sm:grid-cols-4">
-			{#each [{ label: 'Anfragen heute', value: data.ai.usage.today.calls.toString() }, { label: 'Tokens heute', value: (data.ai.usage.today.input_tokens + data.ai.usage.today.output_tokens).toLocaleString('de-DE') }, { label: 'Kosten heute', value: formatCost(data.ai.usage.today.estimated_cost_usd) }, { label: 'Kosten (30 Tage)', value: formatCost(data.ai.usage.month.estimated_cost_usd) }] as stat}
+			{#each [{ label: 'Anfragen heute', value: data.ai.usage.today.calls.toString() }, { label: 'Tokens heute', value: (data.ai.usage.today.input_tokens + data.ai.usage.today.output_tokens + (data.ai.usage.today.thinking_tokens ?? 0)).toLocaleString('de-DE') }, { label: 'Kosten heute', value: formatCost(data.ai.usage.today.estimated_cost_usd) }, { label: 'Kosten (30 Tage)', value: formatCost(data.ai.usage.month.estimated_cost_usd) }] as stat}
 				<div class="rounded-md bg-stone-50 px-3 py-2 dark:bg-stone-800">
 					<p class="text-xs text-stone-400">{stat.label}</p>
 					<p class="mt-0.5 text-sm font-semibold text-stone-800 dark:text-stone-200">
@@ -311,7 +365,11 @@
 	<td class="py-1.5 pr-4">{event.call_type}</td>
 	<td class="max-w-32 truncate py-1.5 pr-4 font-mono">{event.model}</td>
 	<td class="py-1.5 pr-4"
-		>{(event.input_tokens + event.output_tokens).toLocaleString('de-DE')}</td
+		>{(
+			event.input_tokens +
+			event.output_tokens +
+			(event.thinking_tokens ?? 0)
+		).toLocaleString('de-DE')}</td
 	>
 	<td class="py-1.5 pr-4">{event.grounded ? '✓' : '—'}</td>
 	<td class="py-1.5">{formatCost(event.estimated_cost_usd)}</td>