Multimodal
Chat with images and audio files using our unified Multimodal Model.
The Multimodal capability extends Addis-፩-አሌፍ beyond text. You can upload Images and Audio files alongside your prompts to perform complex reasoning tasks.
- Vision: Visual Q&A, Scene Description, and Object Detection.
- Audio Analysis: Summarize voice notes, analyze sentiment, or extract action items from meetings (distinct from simple transcription).
- OCR: Extract text from scanned Ethiopian documents.
Endpoint
Important: Data Format
Unlike standard text generation, this request must be sent as multipart/form-data.
The configuration (prompt, model, parameters) must be passed as a stringified JSON object in a field named request_data.
Usage Guide
Chat with Images (Vision)
Upload an image and ask questions about it. Ideal for explaining diagrams, identifying objects, or analyzing screenshots.
curl --location 'https://api.addisassistant.com/api/v1/chat_generate' \
--header 'X-API-Key: sk_YOUR_KEY' \
--form 'image=@"/path/to/photo.jpg"' \
--form 'request_data="{ \"prompt\": \"ይህ ምስል ምን ያሳያል?\", \"target_language\": \"am\" }"'// Example: Analyze an image
const API_KEY = "sk_YOUR_KEY";
async function analyzeImage() {
const formData = new FormData();
const fileInput = document.querySelector('input[type="file"]'); // Assuming an input element
if (!fileInput.files[0]) return;
// 1. Attach the image file
formData.append("image", fileInput.files[0]);
// 2. Attach configuration as a JSON string
// Note: We use the same parameters as the text endpoint, wrapped in 'request_data'
formData.append("request_data", JSON.stringify({
prompt: "ይህ ምስል ምን ያሳያል? በዝርዝር አስረዳኝ።", // "What does this image show? Explain in detail."
target_language: "am",
generation_config: {
temperature: 0.5
}
}));
try {
const response = await fetch("https://api.addisassistant.com/api/v1/chat_generate", {
method: "POST",
headers: {
"X-API-Key": API_KEY
// Do NOT set Content-Type header here. The browser sets it automatically with the boundary.
},
body: formData
});
const data = await response.json();
console.log("Analysis:", data.response_text);
} catch (error) {
console.error("Error:", error);
}
}import requests
import json

def analyze_image():
    """Upload an image plus a JSON config and print the model's analysis.

    The endpoint expects multipart/form-data: the binary goes in the
    'image' field and the text parameters go in 'request_data' as a
    stringified JSON object.
    """
    url = "https://api.addisassistant.com/api/v1/chat_generate"
    headers = {"X-API-Key": "sk_YOUR_KEY"}

    # 1. Prepare Metadata
    metadata = {
        "prompt": "ይህ ምስል ምን ያሳያል? በዝርዝር አስረዳኝ።",
        "target_language": "am",
        "generation_config": {
            "temperature": 0.5
        }
    }

    # 2. Prepare Files -- the 'image' key maps to the file binary.
    # Use a context manager so the handle is closed even if the request fails.
    with open('photo.jpg', 'rb') as image_file:
        files = [
            ('image', ('photo.jpg', image_file, 'image/jpeg'))
        ]

        # 3. Send Request
        # Note: metadata is passed as a JSON string in 'request_data'
        response = requests.post(
            url,
            headers=headers,
            data={'request_data': json.dumps(metadata)},
            files=files
        )

    # The endpoint returns an envelope: {"status": ..., "data": {"response_text": ...}}.
    print("Response:", response.json()['data']['response_text'])
analyze_image()Chat with Audio
Upload an audio file (e.g., a voice note or meeting recording) and ask the model to perform reasoning on it.
Difference from STT: Speech-to-Text simply transcribes words. This endpoint listens to the audio and answers questions about it (e.g., "Summarize", "What was the tone?", "Extract action items").
curl --location 'https://api.addisassistant.com/api/v1/chat_generate' \
--header 'X-API-Key: sk_YOUR_KEY' \
--form 'audio=@"/path/to/recording.mp3"' \
--form 'request_data="{ \"prompt\": \"Summarize this audio\", \"target_language\": \"am\" }"'// Example: Summarize a voice note
const API_KEY = "sk_YOUR_KEY";
async function analyzeAudio(audioFile) {
const formData = new FormData();
// 1. Attach Audio
formData.append("audio", audioFile);
// 2. Ask the model to summarize
formData.append("request_data", JSON.stringify({
prompt: "በዚህ ድምፅ ውስጥ የተነሱትን ዋና ዋና ነጥቦች አብራራ።", // "Explain the main points raised in this audio."
target_language: "am"
}));
try {
const response = await fetch("https://api.addisassistant.com/api/v1/chat_generate", {
method: "POST",
headers: { "X-API-Key": API_KEY },
body: formData
});
const data = await response.json();
console.log("Summary:", data.response_text);
} catch (error) {
console.error("Analysis Failed:", error);
}
}import requests
import json

def analyze_audio():
    """Upload an audio file and print the model's summary of it.

    Unlike plain speech-to-text, the model reasons about the audio
    (summaries, tone, action items) rather than just transcribing it.
    """
    url = "https://api.addisassistant.com/api/v1/chat_generate"
    headers = {"X-API-Key": "sk_YOUR_KEY"}

    metadata = {
        "prompt": "በዚህ ድምፅ ውስጥ የተነሱትን ዋና ዋና ነጥቦች አብራራ።",
        "target_language": "am"
    }

    # Open the audio file with a context manager so the handle is
    # released even if the request fails.
    with open('recording.mp3', 'rb') as audio_file:
        files = [
            ('audio', ('recording.mp3', audio_file, 'audio/mpeg'))
        ]
        response = requests.post(
            url,
            headers=headers,
            data={'request_data': json.dumps(metadata)},
            files=files
        )

    # The endpoint returns an envelope: {"status": ..., "data": {"response_text": ...}}.
    print(response.json()['data']['response_text'])
analyze_audio()Document OCR (Text Extraction)
Use the vision model to extract text from scanned Ethiopian documents, ID cards, or handwritten notes.
Dedicated Endpoint
We are finalizing a dedicated OCR endpoint (/api/v1/ocr) for bulk processing. For now, you can use the chat endpoint with extraction prompts.
curl --location 'https://api.addisassistant.com/api/v1/chat_generate' \
--header 'X-API-Key: sk_YOUR_KEY' \
--form 'image=@"/path/to/doc.png"' \
--form 'request_data="{ \"prompt\": \"በምስሉ ላይ ያለውን ጽሑፍ ብቻ ወደ ቴክስት ቀይር።\", \"target_language\": \"am\" }"'// Example: OCR Extraction
const formData = new FormData();
formData.append("image", fileInput.files[0]);
formData.append("request_data", JSON.stringify({
// Specific prompt for extraction
prompt: "በምስሉ ላይ ያለውን ጽሑፍ ብቻ ወደ ቴክስት ቀይር።", // "Convert only the text in the image to text."
target_language: "am"
}));
const response = await fetch("https://api.addisassistant.com/api/v1/chat_generate", {
method: "POST",
headers: { "X-API-Key": "sk_YOUR_KEY" },
body: formData
});
const data = await response.json();
console.log("Extracted Text:", data.response_text);import requests
import json
metadata = {
"prompt": "በምስሉ ላይ ያለውን ጽሑፍ ብቻ ወደ ቴክስት ቀይር።",
"target_language": "am"
}
files = [('image', ('doc.png', open('doc.png', 'rb'), 'image/png'))]
response = requests.post("https://api.addisassistant.com/api/v1/chat_generate",
headers={"X-API-Key": "sk_KEY"},
data={'request_data': json.dumps(metadata)},
files=files)
print(response.json()['response_text'])Conversation History with Attachments
The API is stateless, meaning it treats every request independently. To continue a conversation involving attachments, you must provide the relevant context in the conversation_history array.
Note
Do not re-upload files. Instead, switch from multipart/form-data to a standard JSON request and reference the previously uploaded file using its fileUri inside the parts array.
This approach maintains continuity while minimizing latency and bandwidth.
curl --location 'https://api.addisassistant.com/api/v1/chat_generate' \
--header 'Content-Type: application/json' \
--header 'X-API-Key: sk_YOUR_KEY' \
--data '{
"prompt": "What was the document talking about in detail?",
"target_language": "am",
"conversation_history": [
{
"role": "user",
"parts": [
{
"fileData": {
"fileUri": "YOUR_FILE_URI_HERE",
"mimeType": "application/pdf"
}
},
{ "text": "Describe these attachment" }
]
},
{
"role": "assistant",
"parts": [
{ "text": "ይህ ሰነድ በታህሳስ 18 ቀን 2025 በተካሄደው..." }
]
}
]
}'const API_KEY = "sk_YOUR_KEY";
/**
 * Continue a conversation that previously involved a file upload.
 * Instead of re-uploading, the earlier file is referenced by its fileUri
 * inside the conversation_history parts, and the request is plain JSON.
 */
async function followUpChat() {
  // 1. Define History with File URI
  const conversationHistory = [
    {
      role: "user",
      parts: [
        {
          fileData: {
            fileUri: "YOUR_FILE_URI_HERE", // e.g. https://.../files/12345
            mimeType: "application/pdf",
          },
        },
        { text: "Describe these attachment" },
      ],
    },
    {
      role: "assistant",
      parts: [{ text: "ይህ ሰነድ በታህሳስ 18 ቀን 2025 በተካሄደው..." }],
    },
  ];

  // 2. Send a standard JSON request (no multipart needed for follow-ups).
  const payload = {
    prompt: "What was the document talking about in detail?",
    target_language: "am",
    conversation_history: conversationHistory,
  };
  const response = await fetch("https://api.addisassistant.com/api/v1/chat_generate", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "X-API-Key": API_KEY,
    },
    body: JSON.stringify(payload),
  });
  const data = await response.json();
  console.log(data.data.response_text);
}
followUpChat();import requests
url = "https://api.addisassistant.com/api/v1/chat_generate"
headers = {
    "Content-Type": "application/json",
    "X-API-Key": "sk_YOUR_KEY"
}

# Reference the previously uploaded file by its URI instead of re-uploading it;
# follow-up turns travel as an ordinary JSON body, not multipart/form-data.
prior_turns = [
    {
        "role": "user",
        "parts": [
            {
                "fileData": {
                    "fileUri": "YOUR_FILE_URI_HERE",
                    "mimeType": "application/pdf"
                }
            },
            {"text": "Describe these attachment"}
        ]
    },
    {
        "role": "assistant",
        "parts": [{"text": "ይህ ሰነድ በታህሳስ 18 ቀን 2025 በተካሄደው..."}]
    }
]

# Follow-up question plus the prior context.
payload = {
    "prompt": "What was the document talking about in detail?",
    "target_language": "am",
    "conversation_history": prior_turns
}
response = requests.post(url, headers=headers, json=payload)
print(response.json()['data']['response_text'])Conversation History Object Schema
The conversation_history array consists of message objects defined below.
Prop
Type
Part Object Structure:
Each item in the parts array must be one of the following:
Prop
Type
FileData Object Structure:
Prop
Type
API Reference
Form Data Parameters
These parameters are sent in the multipart/form-data body.
Prop
Type
Request Data Object
These parameters go inside the request_data stringified JSON.
Prop
Type
Basic Multimodal Response Schema
When files are uploaded, the response includes an uploaded_attachments array confirming the upload.
{
"status": "success",
"data": {
"response_text": "ይህ ሰነድ በ12ኛው ክፍለ ዘመን በንጉሥ ላሊበላ ስለተገነቡት የላሊበላ ውቅር አብያተ ክርስቲያናት ያብራራል። እነዚህ 11 አብያተ ክርስቲያናት ከወጥ ድንጋይ የተወቀሩ ሲሆኑ፣ በዓለም ቅርስነት የተመዘገቡ ድንቅ የኢትዮጵያ ስልጣኔ ማሳያዎች ናቸው። ሰነዱ በተለይ ስለ ቤተ ጊዮርጊስ የመስቀል ቅርጽ ግንባታ እና ስለ ውስብስብ የውሃ ማስወገጃ ስርዓታቸው ዝርዝር ትንታኔ ይሰጣል።",
"finish_reason": "STOP",
"usage_metadata": {
"prompt_token_count": 2427,
"candidates_token_count": 269,
"total_token_count": 2696
},
"modelVersion": "Addis-፩-አሌፍ",
"uploaded_attachments": [
{
"fileUri": "YOUR_FILE_URI_VARIABLE",
"mimeType": "application/pdf"
}
]
}
}Response Schema With Conversation History
{
"status": "success",
"data": {
"response_text": "በቀረበው የውይይት ታሪክ መሠረት፣ ፋሲል ግቢ (የጎንደር ቤተመንግስት) በ17ኛው ክፍለ ዘመን በአፄ ፋሲለደስ የተገነባ ድንቅ የኪነ-ሕንጻ ውጤት ነው። ግቢው በውስጡ ስድስት ዋና ዋና ቤተ-መንግስቶችን፣ ቤተ-መጻሕፍትን እና አብያተ ክርስቲያናትን ይዟል። ሰነዱ በተለይ ስለ ግንባታው የህንድ እና የአረብ ኪነ-ሕንጻ ተፅእኖ እንዲሁም ስለ ህንፃው የመሬት መንቀጥቀጥ መቋቋም ችሎታ ዝርዝር ትንታኔ ይሰጣል።",
"finish_reason": "STOP",
"usage_metadata": {
"prompt_token_count": 3500,
"candidates_token_count": 145,
"total_token_count": 3645
},
"modelVersion": "Addis-፩-አሌፍ"
}
}Best Practices
Optimize your multimodal integration with these architectural patterns.
Context Management
Don't Re-upload: Never re-upload the same file in a multi-turn chat. Upload it once, get the fileUri from the response, and reference that URI in future JSON requests.
Latency: Passing a URI is milliseconds; re-uploading a 10MB PDF takes seconds.
Document Prep
PDF Format: Native PDFs (text-selectable) process faster and more accurately than Scanned PDFs (images inside PDF).
Orientation: Ensure scanned documents are upright. Rotated text significantly degrades Amharic OCR performance.
Visual Prompting
Be Specific: Instead of "What's in this image?", ask "Extract the date and total amount from this receipt."
Language: For best results on Ethiopian documents, prompt in Amharic (e.g., "በምስሉ ላይ ያለውን ጽሑፍ አውጣ").
Constraints
File Size Limit: Uploaded files must not exceed 10 MB. Token Cost: Images and PDFs consume significantly more tokens than text. Monitor your usage_metadata to avoid hitting rate limits.