From c5cc98ed8a70d5b4c39028d3b3145c81b9f0238e Mon Sep 17 00:00:00 2001 From: vikingowl Date: Fri, 22 May 2026 11:50:55 +0200 Subject: [PATCH] feat(provider/openai): translate user image content to image_url parts When the user message has at least one ImageContent block, build a ChatCompletionContentPartUnionParam array with text + image_url parts instead of the string content path. Image bytes are inlined as a base64 data URL (data:<media-type>;base64,...). Adjacent text blocks are merged into a single TextContentPart. Pure-text user messages stay on the existing string fast path. This covers OpenAI direct + every openaicompat backend (Ollama, llama.cpp, llamafile) since they all share the same provider. Tests: pure text uses OfString; image present emits 2 content parts (text + image_url with the expected base64 payload); nil-Image blocks are dropped and adjacent text merges correctly. --- internal/provider/openai/translate.go | 39 +++++++++++ internal/provider/openai/translate_test.go | 81 ++++++++++++++++++++++ 2 files changed, 120 insertions(+) diff --git a/internal/provider/openai/translate.go b/internal/provider/openai/translate.go index b448f70..90853cd 100644 --- a/internal/provider/openai/translate.go +++ b/internal/provider/openai/translate.go @@ -1,7 +1,9 @@ package openai import ( + "encoding/base64" "encoding/json" + "fmt" "strings" "somegit.dev/Owlibou/gnoma/internal/message" @@ -39,6 +41,37 @@ func unsanitizeToolName(name string) string { return name } +// buildUserContentParts converts a heterogeneous user-content slice into +// OpenAI content-parts. Adjacent text blocks are concatenated. Each Image +// block is emitted as an image_url part carrying a base64 data URL. 
+func buildUserContentParts(blocks []message.Content) []oai.ChatCompletionContentPartUnionParam { + parts := make([]oai.ChatCompletionContentPartUnionParam, 0, len(blocks)) + var textBuf strings.Builder + flushText := func() { + if textBuf.Len() > 0 { + parts = append(parts, oai.TextContentPart(textBuf.String())) + textBuf.Reset() + } + } + for _, c := range blocks { + switch c.Type { + case message.ContentText: + textBuf.WriteString(c.Text) + case message.ContentImage: + if c.Image == nil || len(c.Image.Data) == 0 { + continue + } + flushText() + dataURL := fmt.Sprintf("data:%s;base64,%s", c.Image.MediaType, base64.StdEncoding.EncodeToString(c.Image.Data)) + parts = append(parts, oai.ImageContentPart(oai.ChatCompletionContentPartImageImageURLParam{ + URL: dataURL, + })) + } + } + flushText() + return parts +} + // --- gnoma → OpenAI --- func translateMessages(msgs []message.Message) []oai.ChatCompletionMessageParamUnion { @@ -67,6 +100,12 @@ func translateMessage(m message.Message) []oai.ChatCompletionMessageParamUnion { } return msgs } + // Inline images → content parts array; pure text → plain string. 
+ if m.HasImages() { + return []oai.ChatCompletionMessageParamUnion{ + oai.UserMessage(buildUserContentParts(m.Content)), + } + } return []oai.ChatCompletionMessageParamUnion{ oai.UserMessage(m.TextContent()), } diff --git a/internal/provider/openai/translate_test.go b/internal/provider/openai/translate_test.go index a0321de..a29b15b 100644 --- a/internal/provider/openai/translate_test.go +++ b/internal/provider/openai/translate_test.go @@ -1,7 +1,9 @@ package openai import ( + "encoding/base64" "encoding/json" + "strings" "testing" "somegit.dev/Owlibou/gnoma/internal/message" @@ -10,6 +12,85 @@ import ( "github.com/openai/openai-go/packages/param" ) +func TestTranslateMessage_UserTextOnly_UsesStringContent(t *testing.T) { + m := message.NewUserText("hello") + out := translateMessage(m) + if len(out) != 1 { + t.Fatalf("got %d messages, want 1", len(out)) + } + user := out[0].OfUser + if user == nil { + t.Fatal("expected OfUser to be set") + } + if user.Content.OfString.Value != "hello" { + t.Errorf("OfString = %q, want %q", user.Content.OfString.Value, "hello") + } + if len(user.Content.OfArrayOfContentParts) != 0 { + t.Errorf("OfArrayOfContentParts should be empty when no image, got %d parts", len(user.Content.OfArrayOfContentParts)) + } +} + +func TestTranslateMessage_UserWithImage_EmitsContentParts(t *testing.T) { + pngBytes := []byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A} + m := message.Message{ + Role: message.RoleUser, + Content: []message.Content{ + message.NewTextContent("what is this?"), + message.NewImageContent(message.Image{ + Data: pngBytes, + MediaType: "image/png", + Path: "/tmp/x.png", + }), + }, + } + out := translateMessage(m) + if len(out) != 1 { + t.Fatalf("got %d messages, want 1", len(out)) + } + user := out[0].OfUser + if user == nil { + t.Fatal("expected OfUser to be set") + } + parts := user.Content.OfArrayOfContentParts + if len(parts) != 2 { + t.Fatalf("got %d content parts, want 2 (text + image)", len(parts)) + } + gotText := 
parts[0].GetText() + if gotText == nil || *gotText != "what is this?" { + t.Errorf("first part should be text %q, got %v", "what is this?", gotText) + } + gotImg := parts[1].GetImageURL() + if gotImg == nil { + t.Fatal("second part should be image") + } + wantPrefix := "data:image/png;base64," + if !strings.HasPrefix(gotImg.URL, wantPrefix) { + t.Errorf("image URL %q should start with %q", gotImg.URL, wantPrefix) + } + decoded, err := base64.StdEncoding.DecodeString(strings.TrimPrefix(gotImg.URL, wantPrefix)) + if err != nil { + t.Fatalf("base64 decode: %v", err) + } + if string(decoded) != string(pngBytes) { + t.Error("decoded image bytes do not match original") + } +} + +func TestBuildUserContentParts_DropsEmptyImage(t *testing.T) { + blocks := []message.Content{ + message.NewTextContent("a"), + {Type: message.ContentImage, Image: nil}, + message.NewTextContent("b"), + } + parts := buildUserContentParts(blocks) + if len(parts) != 1 { + t.Fatalf("got %d parts, want 1 (adjacent text concatenated, nil image dropped)", len(parts)) + } + if got := parts[0].GetText(); got == nil || *got != "ab" { + t.Errorf("merged text = %v, want %q", got, "ab") + } +} + func TestTranslateMessage_AssistantToolCallNames_Sanitized(t *testing.T) { msg := message.Message{ Role: message.RoleAssistant,