OCR: document processing with pages, images, tables, dimensions. Audio: transcription with file upload/URL, streaming events (language, text delta, segment, done). Fine-tuning: job CRUD, cancel, start, model archive/unarchive. Batch: job CRUD with cancel. Added doMultipartStream for streaming multipart endpoints.
72 lines
2.7 KiB
Go
72 lines
2.7 KiB
Go
package ocr
|
|
|
|
import "encoding/json"
|
|
|
|
// Request represents an OCR request.
//
// Fields tagged omitempty are left out of the encoded JSON when they hold
// their zero value (nil pointer, empty slice, empty string, or false).
type Request struct {
	// Model is always encoded; a nil pointer is sent as JSON null because the
	// tag carries no omitempty.
	Model *string `json:"model"`
	// ID is omitted from the payload when empty.
	ID string `json:"id,omitempty"`
	// Document is pre-encoded JSON embedded verbatim in the payload; a nil
	// value is encoded as null.
	Document json.RawMessage `json:"document"`
	// Pages is omitted when empty. Presumably a list of page indices to
	// process; confirm the index base against the API reference.
	Pages []int `json:"pages,omitempty"`
	// IncludeImageBase64 is a pointer so that an explicit false can be sent;
	// nil omits the key.
	IncludeImageBase64 *bool `json:"include_image_base64,omitempty"`
	// ImageLimit is omitted when nil.
	ImageLimit *int `json:"image_limit,omitempty"`
	// ImageMinSize is omitted when nil.
	ImageMinSize *int `json:"image_min_size,omitempty"`
	// BboxAnnotationFormat is pre-encoded JSON, omitted when empty.
	BboxAnnotationFormat json.RawMessage `json:"bbox_annotation_format,omitempty"`
	// DocumentAnnotationFormat is pre-encoded JSON, omitted when empty.
	DocumentAnnotationFormat json.RawMessage `json:"document_annotation_format,omitempty"`
	// DocumentAnnotationPrompt is omitted when nil.
	DocumentAnnotationPrompt *string `json:"document_annotation_prompt,omitempty"`
	// TableFormat is omitted when nil.
	TableFormat *string `json:"table_format,omitempty"`
	// ExtractHeader is only encoded when true; false cannot be sent
	// explicitly because of omitempty.
	ExtractHeader bool `json:"extract_header,omitempty"`
	// ExtractFooter is only encoded when true; false cannot be sent
	// explicitly because of omitempty.
	ExtractFooter bool `json:"extract_footer,omitempty"`
}
|
|
|
|
// Response is the OCR result.
|
|
type Response struct {
|
|
Pages []Page `json:"pages"`
|
|
Model string `json:"model"`
|
|
DocumentAnnotation *string `json:"document_annotation,omitempty"`
|
|
UsageInfo UsageInfo `json:"usage_info"`
|
|
}
|
|
|
|
// Page represents a single page's OCR results.
|
|
type Page struct {
|
|
Index int `json:"index"`
|
|
Markdown string `json:"markdown"`
|
|
Images []Image `json:"images"`
|
|
Tables []Table `json:"tables,omitempty"`
|
|
Hyperlinks []string `json:"hyperlinks,omitempty"`
|
|
Header *string `json:"header,omitempty"`
|
|
Footer *string `json:"footer,omitempty"`
|
|
Dimensions *PageDimensions `json:"dimensions"`
|
|
}
|
|
|
|
// Image represents an extracted image from a page.
//
// The four corner coordinates are pointers without omitempty: a JSON null
// decodes to nil, and a nil pointer encodes back to null.
type Image struct {
	// ID is the value of the "id" key.
	ID string `json:"id"`
	// TopLeftX is the X coordinate of the top-left corner, or nil.
	TopLeftX *int `json:"top_left_x"`
	// TopLeftY is the Y coordinate of the top-left corner, or nil.
	TopLeftY *int `json:"top_left_y"`
	// BottomRightX is the X coordinate of the bottom-right corner, or nil.
	BottomRightX *int `json:"bottom_right_x"`
	// BottomRightY is the Y coordinate of the bottom-right corner, or nil.
	BottomRightY *int `json:"bottom_right_y"`
	// ImageBase64 stays nil when "image_base64" is absent or null.
	ImageBase64 *string `json:"image_base64,omitempty"`
	// ImageAnnotation stays nil when "image_annotation" is absent or null.
	ImageAnnotation *string `json:"image_annotation,omitempty"`
}
|
|
|
|
// Table represents an extracted table from a page.
type Table struct {
	// ID is the value of the "id" key.
	ID string `json:"id"`
	// Content is the table body as a string, in the encoding named by Format.
	Content string `json:"content"`
	// Format is the value of the "format" key. NOTE(review): the set of valid
	// format names is not visible here; confirm against the API reference.
	Format string `json:"format"`
}
|
|
|
|
// PageDimensions holds the dimensions of a page image.
type PageDimensions struct {
	// DPI is the value of the "dpi" key.
	DPI int `json:"dpi"`
	// Height is the value of the "height" key. NOTE(review): presumably
	// pixels; the unit is not visible here.
	Height int `json:"height"`
	// Width is the value of the "width" key. NOTE(review): presumably
	// pixels; the unit is not visible here.
	Width int `json:"width"`
}
|
|
|
|
// UsageInfo holds OCR usage statistics.
type UsageInfo struct {
	// PagesProcessed is the value of the "pages_processed" key.
	PagesProcessed int `json:"pages_processed"`
	// DocSizeBytes stays nil when "doc_size_bytes" is absent or null, and is
	// omitted from encoded output when nil.
	DocSizeBytes *int `json:"doc_size_bytes,omitempty"`
}
|