feat(provider/openai): translate user image content to image_url parts
When the user message has at least one ImageContent block, build a ChatCompletionContentPartUnionParam array with text + image_url parts instead of the string content path. Image bytes are inlined as a base64 data URL (data:<media-type>;base64,...). Adjacent text blocks are merged into a single TextContentPart. Pure-text user messages stay on the existing string fast path. This covers OpenAI direct + every openaicompat backend (Ollama, llama.cpp, llamafile) since they all share the same provider. Tests: pure text uses OfString; image present emits 2 content parts (text + image_url with the expected base64 payload); nil-Image blocks are dropped and adjacent text merges correctly.
This commit is contained in:
@@ -1,7 +1,9 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"somegit.dev/Owlibou/gnoma/internal/message"
|
||||
@@ -39,6 +41,37 @@ func unsanitizeToolName(name string) string {
|
||||
return name
|
||||
}
|
||||
|
||||
// buildUserContentParts converts a heterogeneous user-content slice into
|
||||
// OpenAI content-parts. Adjacent text blocks are concatenated. Each Image
|
||||
// block is emitted as an image_url part carrying a base64 data URL.
|
||||
func buildUserContentParts(blocks []message.Content) []oai.ChatCompletionContentPartUnionParam {
|
||||
parts := make([]oai.ChatCompletionContentPartUnionParam, 0, len(blocks))
|
||||
var textBuf strings.Builder
|
||||
flushText := func() {
|
||||
if textBuf.Len() > 0 {
|
||||
parts = append(parts, oai.TextContentPart(textBuf.String()))
|
||||
textBuf.Reset()
|
||||
}
|
||||
}
|
||||
for _, c := range blocks {
|
||||
switch c.Type {
|
||||
case message.ContentText:
|
||||
textBuf.WriteString(c.Text)
|
||||
case message.ContentImage:
|
||||
if c.Image == nil || len(c.Image.Data) == 0 {
|
||||
continue
|
||||
}
|
||||
flushText()
|
||||
dataURL := fmt.Sprintf("data:%s;base64,%s", c.Image.MediaType, base64.StdEncoding.EncodeToString(c.Image.Data))
|
||||
parts = append(parts, oai.ImageContentPart(oai.ChatCompletionContentPartImageImageURLParam{
|
||||
URL: dataURL,
|
||||
}))
|
||||
}
|
||||
}
|
||||
flushText()
|
||||
return parts
|
||||
}
|
||||
|
||||
// --- gnoma → OpenAI ---
|
||||
|
||||
func translateMessages(msgs []message.Message) []oai.ChatCompletionMessageParamUnion {
|
||||
@@ -67,6 +100,12 @@ func translateMessage(m message.Message) []oai.ChatCompletionMessageParamUnion {
|
||||
}
|
||||
return msgs
|
||||
}
|
||||
// Inline images → content parts array; pure text → plain string.
|
||||
if m.HasImages() {
|
||||
return []oai.ChatCompletionMessageParamUnion{
|
||||
oai.UserMessage(buildUserContentParts(m.Content)),
|
||||
}
|
||||
}
|
||||
return []oai.ChatCompletionMessageParamUnion{
|
||||
oai.UserMessage(m.TextContent()),
|
||||
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"somegit.dev/Owlibou/gnoma/internal/message"
|
||||
@@ -10,6 +12,85 @@ import (
|
||||
"github.com/openai/openai-go/packages/param"
|
||||
)
|
||||
|
||||
func TestTranslateMessage_UserTextOnly_UsesStringContent(t *testing.T) {
|
||||
m := message.NewUserText("hello")
|
||||
out := translateMessage(m)
|
||||
if len(out) != 1 {
|
||||
t.Fatalf("got %d messages, want 1", len(out))
|
||||
}
|
||||
user := out[0].OfUser
|
||||
if user == nil {
|
||||
t.Fatal("expected OfUser to be set")
|
||||
}
|
||||
if user.Content.OfString.Value != "hello" {
|
||||
t.Errorf("OfString = %q, want %q", user.Content.OfString.Value, "hello")
|
||||
}
|
||||
if len(user.Content.OfArrayOfContentParts) != 0 {
|
||||
t.Errorf("OfArrayOfContentParts should be empty when no image, got %d parts", len(user.Content.OfArrayOfContentParts))
|
||||
}
|
||||
}
|
||||
|
||||
func TestTranslateMessage_UserWithImage_EmitsContentParts(t *testing.T) {
|
||||
pngBytes := []byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A}
|
||||
m := message.Message{
|
||||
Role: message.RoleUser,
|
||||
Content: []message.Content{
|
||||
message.NewTextContent("what is this?"),
|
||||
message.NewImageContent(message.Image{
|
||||
Data: pngBytes,
|
||||
MediaType: "image/png",
|
||||
Path: "/tmp/x.png",
|
||||
}),
|
||||
},
|
||||
}
|
||||
out := translateMessage(m)
|
||||
if len(out) != 1 {
|
||||
t.Fatalf("got %d messages, want 1", len(out))
|
||||
}
|
||||
user := out[0].OfUser
|
||||
if user == nil {
|
||||
t.Fatal("expected OfUser to be set")
|
||||
}
|
||||
parts := user.Content.OfArrayOfContentParts
|
||||
if len(parts) != 2 {
|
||||
t.Fatalf("got %d content parts, want 2 (text + image)", len(parts))
|
||||
}
|
||||
gotText := parts[0].GetText()
|
||||
if gotText == nil || *gotText != "what is this?" {
|
||||
t.Errorf("first part should be text %q, got %v", "what is this?", gotText)
|
||||
}
|
||||
gotImg := parts[1].GetImageURL()
|
||||
if gotImg == nil {
|
||||
t.Fatal("second part should be image")
|
||||
}
|
||||
wantPrefix := "data:image/png;base64,"
|
||||
if !strings.HasPrefix(gotImg.URL, wantPrefix) {
|
||||
t.Errorf("image URL %q should start with %q", gotImg.URL, wantPrefix)
|
||||
}
|
||||
decoded, err := base64.StdEncoding.DecodeString(strings.TrimPrefix(gotImg.URL, wantPrefix))
|
||||
if err != nil {
|
||||
t.Fatalf("base64 decode: %v", err)
|
||||
}
|
||||
if string(decoded) != string(pngBytes) {
|
||||
t.Error("decoded image bytes do not match original")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildUserContentParts_DropsEmptyImage(t *testing.T) {
|
||||
blocks := []message.Content{
|
||||
message.NewTextContent("a"),
|
||||
{Type: message.ContentImage, Image: nil},
|
||||
message.NewTextContent("b"),
|
||||
}
|
||||
parts := buildUserContentParts(blocks)
|
||||
if len(parts) != 1 {
|
||||
t.Fatalf("got %d parts, want 1 (adjacent text concatenated, nil image dropped)", len(parts))
|
||||
}
|
||||
if got := parts[0].GetText(); got == nil || *got != "ab" {
|
||||
t.Errorf("merged text = %v, want %q", got, "ab")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTranslateMessage_AssistantToolCallNames_Sanitized(t *testing.T) {
|
||||
msg := message.Message{
|
||||
Role: message.RoleAssistant,
|
||||
|
||||
Reference in New Issue
Block a user