feat(provider/openai): translate user image content to image_url parts

When the user message has at least one ImageContent block, build a
ChatCompletionContentPartUnionParam array with text + image_url
parts instead of the string content path. Image bytes are inlined
as a base64 data URL (data:<media-type>;base64,...). Adjacent text
blocks are merged into a single TextContentPart. Pure-text user
messages stay on the existing string fast path.

This covers OpenAI direct + every openaicompat backend (Ollama,
llama.cpp, llamafile) since they all share the same provider.

Tests: pure text uses OfString; image present emits 2 content parts
(text + image_url with the expected base64 payload); nil-Image
blocks are dropped and adjacent text merges correctly.
This commit is contained in:
2026-05-22 11:50:55 +02:00
parent bc137182d4
commit c5cc98ed8a
2 changed files with 120 additions and 0 deletions
+39
View File
@@ -1,7 +1,9 @@
package openai
import (
"encoding/base64"
"encoding/json"
"fmt"
"strings"
"somegit.dev/Owlibou/gnoma/internal/message"
@@ -39,6 +41,37 @@ func unsanitizeToolName(name string) string {
return name
}
// buildUserContentParts turns the content blocks of a user message into the
// OpenAI content-parts representation.
//
// Text blocks are accumulated and emitted as one text part when a usable
// image (or the end of the input) is reached, so consecutive text blocks
// collapse into a single part with nothing inserted between them. Each image
// block becomes an image_url part whose URL is a
// "data:<media-type>;base64,<payload>" data URL. Image blocks with a nil
// Image or empty Data are skipped without flushing the pending text, and
// block types other than text and image are ignored.
func buildUserContentParts(blocks []message.Content) []oai.ChatCompletionContentPartUnionParam {
	out := make([]oai.ChatCompletionContentPartUnionParam, 0, len(blocks))
	var pending strings.Builder

	for i := range blocks {
		blk := &blocks[i]

		if blk.Type == message.ContentText {
			pending.WriteString(blk.Text)
			continue
		}
		if blk.Type != message.ContentImage {
			continue
		}

		img := blk.Image
		if img == nil || len(img.Data) == 0 {
			// Unusable image: drop it and keep accumulating text.
			continue
		}

		// Emit any text gathered so far so part order mirrors block order.
		if pending.Len() > 0 {
			out = append(out, oai.TextContentPart(pending.String()))
			pending.Reset()
		}

		payload := base64.StdEncoding.EncodeToString(img.Data)
		out = append(out, oai.ImageContentPart(oai.ChatCompletionContentPartImageImageURLParam{
			URL: fmt.Sprintf("data:%s;base64,%s", img.MediaType, payload),
		}))
	}

	// Trailing text after the last image (or a text-only input).
	if pending.Len() > 0 {
		out = append(out, oai.TextContentPart(pending.String()))
	}
	return out
}
// --- gnoma → OpenAI ---
func translateMessages(msgs []message.Message) []oai.ChatCompletionMessageParamUnion {
@@ -67,6 +100,12 @@ func translateMessage(m message.Message) []oai.ChatCompletionMessageParamUnion {
}
return msgs
}
// Inline images → content parts array; pure text → plain string.
if m.HasImages() {
return []oai.ChatCompletionMessageParamUnion{
oai.UserMessage(buildUserContentParts(m.Content)),
}
}
return []oai.ChatCompletionMessageParamUnion{
oai.UserMessage(m.TextContent()),
}
@@ -1,7 +1,9 @@
package openai
import (
"encoding/base64"
"encoding/json"
"strings"
"testing"
"somegit.dev/Owlibou/gnoma/internal/message"
@@ -10,6 +12,85 @@ import (
"github.com/openai/openai-go/packages/param"
)
func TestTranslateMessage_UserTextOnly_UsesStringContent(t *testing.T) {
m := message.NewUserText("hello")
out := translateMessage(m)
if len(out) != 1 {
t.Fatalf("got %d messages, want 1", len(out))
}
user := out[0].OfUser
if user == nil {
t.Fatal("expected OfUser to be set")
}
if user.Content.OfString.Value != "hello" {
t.Errorf("OfString = %q, want %q", user.Content.OfString.Value, "hello")
}
if len(user.Content.OfArrayOfContentParts) != 0 {
t.Errorf("OfArrayOfContentParts should be empty when no image, got %d parts", len(user.Content.OfArrayOfContentParts))
}
}
// TestTranslateMessage_UserWithImage_EmitsContentParts verifies that a user
// message holding a text block followed by an image block is translated to a
// single user message with two content parts: the text, then an image_url
// part whose URL is a base64 data URL that decodes to the original bytes.
func TestTranslateMessage_UserWithImage_EmitsContentParts(t *testing.T) {
	const wantText = "what is this?"

	// The 8-byte PNG file signature; enough to check the payload round-trips.
	pngBytes := []byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A}
	m := message.Message{
		Role: message.RoleUser,
		Content: []message.Content{
			message.NewTextContent(wantText),
			message.NewImageContent(message.Image{
				Data:      pngBytes,
				MediaType: "image/png",
				Path:      "/tmp/x.png",
			}),
		},
	}

	out := translateMessage(m)
	if len(out) != 1 {
		t.Fatalf("got %d messages, want 1", len(out))
	}
	user := out[0].OfUser
	if user == nil {
		t.Fatal("expected OfUser to be set")
	}

	parts := user.Content.OfArrayOfContentParts
	if len(parts) != 2 {
		t.Fatalf("got %d content parts, want 2 (text + image)", len(parts))
	}

	// GetText returns a pointer: dereference it in the failure message so the
	// log shows the actual text rather than an address.
	switch gotText := parts[0].GetText(); {
	case gotText == nil:
		t.Errorf("first part should be text %q, got a non-text part", wantText)
	case *gotText != wantText:
		t.Errorf("first part text = %q, want %q", *gotText, wantText)
	}

	gotImg := parts[1].GetImageURL()
	if gotImg == nil {
		t.Fatal("second part should be image")
	}

	wantPrefix := "data:image/png;base64,"
	if !strings.HasPrefix(gotImg.URL, wantPrefix) {
		t.Errorf("image URL %q should start with %q", gotImg.URL, wantPrefix)
	}
	decoded, err := base64.StdEncoding.DecodeString(strings.TrimPrefix(gotImg.URL, wantPrefix))
	if err != nil {
		t.Fatalf("base64 decode: %v", err)
	}
	if string(decoded) != string(pngBytes) {
		t.Errorf("decoded image bytes = %x, want %x", decoded, pngBytes)
	}
}
// TestBuildUserContentParts_DropsEmptyImage verifies that an image block with
// a nil Image is dropped and that the text blocks on either side of it are
// merged into a single text part.
func TestBuildUserContentParts_DropsEmptyImage(t *testing.T) {
	const want = "ab"

	blocks := []message.Content{
		message.NewTextContent("a"),
		{Type: message.ContentImage, Image: nil},
		message.NewTextContent("b"),
	}

	parts := buildUserContentParts(blocks)
	if len(parts) != 1 {
		t.Fatalf("got %d parts, want 1 (adjacent text concatenated, nil image dropped)", len(parts))
	}

	// GetText returns a pointer: dereference it in the failure message so the
	// log shows the actual text rather than an address.
	switch got := parts[0].GetText(); {
	case got == nil:
		t.Errorf("merged part is not a text part, want text %q", want)
	case *got != want:
		t.Errorf("merged text = %q, want %q", *got, want)
	}
}
func TestTranslateMessage_AssistantToolCallNames_Sanitized(t *testing.T) {
msg := message.Message{
Role: message.RoleAssistant,