Multimodal Processing
Work with images and text together in dspy-go: vision Q&A, detailed image analysis, multimodal chat, and streaming.
dspy-go has native multimodal support from day one. Process images, build vision Q&A systems, and create multimodal chat applications with seamless integration.
Modern LLM applications need to work with more than just text:
| Provider | Image Support | Streaming | Models |
|---|---|---|---|
| Google Gemini | ✅ Yes | ✅ Yes | gemini-pro-vision, gemini-1.5-pro |
| Anthropic Claude | ✅ Yes | ✅ Yes | claude-3-opus, claude-3-sonnet |
| OpenAI | ✅ Yes | ✅ Yes | gpt-4-vision-preview, gpt-4o |
Analyze images with natural language questions.
package main
import (
"context"
"fmt"
"log"
"github.com/XiaoConstantine/dspy-go/pkg/core"
"github.com/XiaoConstantine/dspy-go/pkg/llms"
"github.com/XiaoConstantine/dspy-go/pkg/modules"
)
func main() {
// Configure Gemini (has multimodal support)
llm, err := llms.NewGeminiLLM("your-api-key", core.ModelGoogleGeminiPro)
if err != nil {
log.Fatal(err)
}
core.SetDefaultLLM(llm)
// Define signature for image analysis
signature := core.NewSignature(
[]core.InputField{
{Field: core.NewField("image",
core.WithDescription("The image to analyze"))},
{Field: core.NewField("question",
core.WithDescription("Question about the image"))},
},
[]core.OutputField{
{Field: core.NewField("answer",
core.WithDescription("Answer based on the image"))},
},
)
// Create Predict module
predictor := modules.NewPredict(signature)
// Load image
imageData, err := os.ReadFile("path/to/image.jpg")
if err != nil {
log.Fatal(err)
}
// Analyze image
ctx := context.Background()
result, err := predictor.Process(ctx, map[string]interface{}{
"image": core.NewImageContent(imageData, "image/jpeg"),
"question": "What objects are in this image?",
})
fmt.Printf("Answer: %s\n", result["answer"])
}Structured analysis of visual content.
package main
import (
"context"
"github.com/XiaoConstantine/dspy-go/pkg/core"
"github.com/XiaoConstantine/dspy-go/pkg/modules"
)
func main() {
// Configure LLM with vision support
llm, err := llms.NewGeminiLLM("", core.ModelGoogleGeminiPro) // Will use GEMINI_API_KEY
if err != nil {
log.Fatal(err)
}
core.SetDefaultLLM(llm)
// Define a comprehensive vision analysis signature
signature := core.NewSignature(
[]core.InputField{
{Field: core.NewImageField("image",
core.WithDescription("The image to analyze in detail"))},
{Field: core.NewField("focus",
core.WithDescription("Specific aspect to focus on"))},
},
[]core.OutputField{
{Field: core.NewField("description",
core.WithDescription("Detailed description of the image"))},
{Field: core.NewField("objects",
core.WithDescription("List of objects identified"))},
{Field: core.NewField("colors",
core.WithDescription("Dominant colors in the image"))},
{Field: core.NewField("mood",
core.WithDescription("Overall mood or atmosphere"))},
},
).WithInstruction("Analyze the image thoroughly and provide detailed observations.")
// Create ChainOfThought for detailed analysis
analyzer := modules.NewChainOfThought(signature)
// Load image
imageData, _ := os.ReadFile("photo.jpg")
// Analyze
ctx := context.Background()
result, err := analyzer.Process(ctx, map[string]interface{}{
"image": core.NewImageContent(imageData, "image/jpeg"),
"focus": "architectural details and lighting",
})
// Print detailed analysis
fmt.Printf("Description: %s\n", result["description"])
fmt.Printf("Objects: %s\n", result["objects"])
fmt.Printf("Colors: %s\n", result["colors"])
fmt.Printf("Mood: %s\n", result["mood"])
fmt.Printf("Reasoning: %s\n", result["rationale"]) // From ChainOfThought
}Interactive conversations with images.
package main
import (
"context"
"fmt"
"github.com/XiaoConstantine/dspy-go/pkg/core"
"github.com/XiaoConstantine/dspy-go/pkg/agents/memory"
)
type MultimodalChat struct {
predictor modules.Module
memory memory.Memory
}
func NewMultimodalChat() *MultimodalChat {
signature := core.NewSignature(
[]core.InputField{
{Field: core.NewField("history")},
{Field: core.NewField("user_message")},
{Field: core.NewField("image")}, // Optional
},
[]core.OutputField{
{Field: core.NewField("response")},
},
)
return &MultimodalChat{
predictor: modules.NewPredict(signature),
memory: memory.NewBufferMemory(20),
}
}
func (c *MultimodalChat) SendText(ctx context.Context, message string) (string, error) {
history, _ := c.memory.Get(ctx)
result, err := c.predictor.Process(ctx, map[string]interface{}{
"history": formatHistory(history),
"user_message": message,
})
if err != nil {
return "", err
}
response := result["response"].(string)
c.memory.Add(ctx, "user", message)
c.memory.Add(ctx, "assistant", response)
return response, nil
}
func (c *MultimodalChat) SendImage(ctx context.Context, message string, imageData []byte) (string, error) {
history, _ := c.memory.Get(ctx)
result, err := c.predictor.Process(ctx, map[string]interface{}{
"history": formatHistory(history),
"user_message": message,
"image": core.NewImageContent(imageData, "image/jpeg"),
})
if err != nil {
return "", err
}
response := result["response"].(string)
c.memory.Add(ctx, "user", fmt.Sprintf("%s [image]", message))
c.memory.Add(ctx, "assistant", response)
return response, nil
}
func main() {
chat := NewMultimodalChat()
ctx := context.Background()
// Text conversation
response, _ := chat.SendText(ctx, "Hello! I'm going to show you a photo.")
fmt.Println("Assistant:", response)
// Send image
imageData, _ := os.ReadFile("vacation.jpg")
response, _ = chat.SendImage(ctx, "Where was this photo taken?", imageData)
fmt.Println("Assistant:", response)
// Follow-up question (using conversation memory)
response, _ = chat.SendText(ctx, "What's the weather like there?")
fmt.Println("Assistant:", response)
}Real-time processing of multimodal content.
package main
import (
"context"
"fmt"
"github.com/XiaoConstantine/dspy-go/pkg/core"
"github.com/XiaoConstantine/dspy-go/pkg/modules"
)
func main() {
// Configure LLM
llm, _ := llms.NewGeminiLLM("", core.ModelGoogleGeminiPro)
core.SetDefaultLLM(llm)
// Create signature
signature := core.NewSignature(
[]core.InputField{
{Field: core.NewField("image")},
{Field: core.NewField("prompt")},
},
[]core.OutputField{
{Field: core.NewField("description")},
},
)
// Create module
predictor := modules.NewPredict(signature)
// Set streaming handler
predictor.SetStreamingHandler(func(chunk string) {
fmt.Print(chunk) // Print each chunk as it arrives
})
// Load image
imageData, _ := os.ReadFile("scene.jpg")
// Process with streaming
ctx := context.Background()
result, err := predictor.Process(ctx, map[string]interface{}{
"image": core.NewImageContent(imageData, "image/jpeg"),
"prompt": "Describe this scene in vivid detail",
})
fmt.Printf("\n\nFinal: %s\n", result["description"])
}Compare and analyze multiple images simultaneously.
package main
import (
"context"
"github.com/XiaoConstantine/dspy-go/pkg/core"
"github.com/XiaoConstantine/dspy-go/pkg/modules"
)
func main() {
// Configure LLM
llm, _ := llms.NewGeminiLLM("", core.ModelGoogleGeminiPro)
core.SetDefaultLLM(llm)
// Define signature for comparing images
signature := core.NewSignature(
[]core.InputField{
{Field: core.NewField("image1")},
{Field: core.NewField("image2")},
{Field: core.NewField("question")},
},
[]core.OutputField{
{Field: core.NewField("comparison")},
{Field: core.NewField("differences")},
{Field: core.NewField("similarities")},
},
)
// Create module
comparator := modules.NewChainOfThought(signature)
// Load images
image1, _ := os.ReadFile("before.jpg")
image2, _ := os.ReadFile("after.jpg")
// Compare
ctx := context.Background()
result, err := comparator.Process(ctx, map[string]interface{}{
"image1": core.NewImageContent(image1, "image/jpeg"),
"image2": core.NewImageContent(image2, "image/jpeg"),
"question": "What changed between these two images?",
})
fmt.Printf("Comparison: %s\n", result["comparison"])
fmt.Printf("Differences: %s\n", result["differences"])
fmt.Printf("Similarities: %s\n", result["similarities"])
}Flexible handling of mixed content types.
package main
import (
"github.com/XiaoConstantine/dspy-go/pkg/core"
)
func main() {
// Create mixed content
content := []core.ContentBlock{
core.NewTextContent("Please analyze this image:"),
core.NewImageContent(imageData1, "image/jpeg"),
core.NewTextContent("And compare it to this one:"),
core.NewImageContent(imageData2, "image/jpeg"),
core.NewTextContent("What are the key differences?"),
}
// Use in module
result, err := predictor.Process(ctx, map[string]interface{}{
"content": content,
})
}// Analyze scanned documents
// One scanned-document image in; extracted text, a document-type label,
// and key fields out.
signature := core.NewSignature(
	[]core.InputField{
		{Field: core.NewField("document_image")},
	},
	[]core.OutputField{
		{Field: core.NewField("text_content")},
		{Field: core.NewField("document_type")},
		{Field: core.NewField("key_information")},
	},
)

// ChainOfThought lets the model reason before extracting.
extractor := modules.NewChainOfThought(signature)

result, _ := extractor.Process(ctx, map[string]interface{}{
	"document_image": core.NewImageContent(scanData, "image/jpeg"),
})

// Extract data from charts and graphs
// One chart image in; chart classification, raw data points, trends, and
// higher-level insights out.
signature := core.NewSignature(
	[]core.InputField{
		{Field: core.NewField("chart_image")},
	},
	[]core.OutputField{
		{Field: core.NewField("chart_type")},
		{Field: core.NewField("data_points")},
		{Field: core.NewField("trends")},
		{Field: core.NewField("insights")},
	},
)

analyzer := modules.NewPredict(signature)

// Find similar products
signature := core.NewSignature(
[]core.InputField{
{Field: core.NewField("query_image")},
{Field: core.NewField("description")},
},
[]core.OutputField{
{Field: core.NewField("product_name")},
{Field: core.NewField("category")},
{Field: core.NewField("attributes")},
},
)
searcher := modules.NewPredict(signature)β DO:
❌ DON'T:
// Resize images before sending: smaller payloads reduce upload time and
// provider-side processing cost.
func resizeImage(data []byte, maxWidth, maxHeight int) []byte {
	// Your resize implementation goes here. Returning the input
	// unchanged keeps this placeholder compilable (the original returned
	// an undefined `resizedData`).
	return data
}

// Use appropriate compression
imageData := resizeImage(originalData, 1024, 1024)

# Set API key
export GEMINI_API_KEY="your-api-key"
# Run multimodal example
cd examples/multimodal && go run main.go