Addis AI

Realtime API

Low-latency, bidirectional voice conversations via WebSockets.

The Realtime API enables fluid, natural voice conversations. It uses WebSockets to stream audio bi-directionally, achieving response times under 300ms.

It is powered by አሌፍ-1.2-realtime-audio, allowing for natural interruptions and back-channeling.

Endpoint

WSS `wss://relay.addisassistant.com/ws`

Authentication

For browser and mobile clients, pass your API key as a query parameter:

wss://relay.addisassistant.com/ws?apiKey=YOUR_API_KEY

You can also use a short-lived JWT token in environments that issue user sessions:

wss://relay.addisassistant.com/ws?jwt=YOUR_JWT

Industry Use Cases

Our Realtime model is already deployed across key African sectors. You can test these specific Agents in our Playground.

Telecom Assistant

Automated customer support for balance checks, package purchasing, and troubleshooting in Amharic.

Ride Hailing

Voice-first driver negotiation and location finding. "Where are you?" "I'm at Bole near the bank."

Farmer's Assistant

Agricultural advisory for weather updates, crop disease diagnosis, and market prices in local dialects.


Audio Format Requirements

Critical: Audio Encoding

The API expects 16-bit Signed Integer PCM (PCM16) audio.

Developer Warning: The standard browser AudioContext returns 32-bit Float data (-1.0 to 1.0) by default. If you send this directly, it will sound like static noise. You must convert Float32 to Int16 before sending it over the WebSocket.

| Property            | Value                            |
| ------------------- | -------------------------------- |
| Encoding            | PCM 16-bit (Little Endian)       |
| Client Upload       | audio/pcm;rate=16000 (Mono)      |
| Server Audio Output | PCM16 Mono, typically 24,000 Hz  |

Integration Guide

Browser Implementation (JavaScript)

This example establishes a connection, captures microphone audio, converts it from Float32 to PCM16, and handles the audio stream.

// --- Connection configuration ---
const API_KEY = "sk_YOUR_KEY"; // Replace with your Addis AI API key
const WS_URL = `wss://relay.addisassistant.com/ws?apiKey=${encodeURIComponent(API_KEY)}`;
const INPUT_RATE = 16000;  // Mic capture rate; must match audio/pcm;rate=16000 upload format
const OUTPUT_RATE = 24000; // Typical sample rate of server PCM16 audio responses

// --- Session state (module-level, managed by startRealtime/stopRealtime) ---
let socket;            // Active WebSocket connection
let inputContext;      // AudioContext for microphone capture at INPUT_RATE
let outputContext;     // AudioContext for AI playback at OUTPUT_RATE
let stream;            // MediaStream returned by getUserMedia
let source;            // MediaStreamAudioSourceNode wrapping the mic stream
let processor;         // ScriptProcessorNode delivering raw Float32 frames
let canStreamAudio = false; // Gate: stream mic audio only after the server readiness event
let nextStartTime = 0;      // Playback cursor for gapless scheduling of AI audio chunks

/**
 * Open a Realtime session: create playback and capture AudioContexts,
 * start the microphone, wire the processing graph, and connect the
 * WebSocket. Mic frames are dropped until the server's readiness event
 * flips `canStreamAudio`.
 *
 * NOTE(review): most browsers require this to be triggered by a user
 * gesture so the AudioContexts are allowed to resume — confirm call site.
 */
async function startRealtime() {
  // Output context for AI playback
  outputContext = new (window.AudioContext || window.webkitAudioContext)({
    sampleRate: OUTPUT_RATE,
  });
  await outputContext.resume();
  // Initialize the scheduling cursor to "now" so the first AI chunk plays immediately.
  nextStartTime = outputContext.currentTime;

  // Input context for mic capture
  inputContext = new (window.AudioContext || window.webkitAudioContext)({
    sampleRate: INPUT_RATE,
  });
  await inputContext.resume();

  stream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });
  source = inputContext.createMediaStreamSource(stream);
  // NOTE(review): ScriptProcessorNode is deprecated in favor of AudioWorklet,
  // but it is still widely supported and the simplest way to get raw frames.
  processor = inputContext.createScriptProcessor(2048, 1, 1);

  // Connect processor through a muted gain so callback keeps firing without mic echo
  const mute = inputContext.createGain();
  mute.gain.value = 0;
  source.connect(processor);
  processor.connect(mute);
  mute.connect(inputContext.destination);

  processor.onaudioprocess = (event) => {
    // Drop frames until the handshake completes and whenever the socket is down.
    if (!canStreamAudio || socket?.readyState !== WebSocket.OPEN) return;

    // Browser gives Float32 (-1..1), convert to PCM16 and send as base64 JSON envelope
    const float32 = event.inputBuffer.getChannelData(0);
    const int16 = floatTo16BitPCM(float32);

    socket.send(
      JSON.stringify({
        data: arrayBufferToBase64(int16.buffer),
        mimeType: "audio/pcm;rate=16000",
      }),
    );
  };

  socket = new WebSocket(WS_URL);

  socket.onopen = () => {
    console.log("Connected. Waiting for setupComplete...");
  };

  socket.onmessage = async (event) => {
    const message = JSON.parse(event.data);
    console.log("Event:", message);

    // Readiness handshake: either {setupComplete: true} or a "ready" status message.
    if (message.setupComplete || (message.type === "status" && /ready/i.test(message.message || ""))) {
      canStreamAudio = true;
      return;
    }

    // AI audio arrives base64-encoded at serverContent.modelTurn.parts[0].inlineData.data.
    const b64 = message?.serverContent?.modelTurn?.parts?.[0]?.inlineData?.data;
    if (typeof b64 === "string" && b64.length) {
      await playPcm16Base64(b64, OUTPUT_RATE);
    }

    if (message?.error) {
      console.error("Realtime error:", message.error);
    }
  };

  socket.onerror = (event) => {
    console.error("WebSocket error:", event);
  };

  socket.onclose = (event) => {
    console.log(`WebSocket closed code=${event.code} reason=${event.reason || "n/a"}`);
  };
}

/**
 * Tear down the Realtime session: stop streaming, close the socket,
 * release the microphone, and close both AudioContexts.
 *
 * Safe to call more than once: already-closed contexts are skipped and
 * close() rejections are swallowed (the original version left the
 * onaudioprocess handler attached and could surface an unhandled
 * promise rejection on a second call).
 */
function stopRealtime() {
  // Gate off first so any in-flight audio callback becomes a no-op.
  canStreamAudio = false;

  if (socket && (socket.readyState === WebSocket.OPEN || socket.readyState === WebSocket.CONNECTING)) {
    socket.close(1000, "client-stop");
  }

  // Detach the mic callback so it cannot fire against a closing socket.
  if (processor) {
    processor.onaudioprocess = null;
    processor.disconnect();
  }
  if (source) source.disconnect();
  if (stream) stream.getTracks().forEach((t) => t.stop());

  // AudioContext.close() returns a promise that rejects if the context is
  // already closed; teardown is best-effort, so ignore those rejections.
  if (inputContext && inputContext.state !== "closed") {
    inputContext.close().catch(() => {});
  }
  if (outputContext && outputContext.state !== "closed") {
    outputContext.close().catch(() => {});
  }
}

/**
 * Convert Float32 audio samples (-1.0..1.0) to 16-bit signed PCM.
 *
 * Values outside [-1, 1] are clamped. The negative range maps down to
 * -32768 (×0x8000) and the positive range tops out at 32767 (×0x7FFF).
 *
 * @param {Float32Array} float32Array - Raw samples from the audio graph.
 * @returns {Int16Array} PCM16 samples, same length as the input.
 */
function floatTo16BitPCM(float32Array) {
  const pcm = new Int16Array(float32Array.length);
  for (const [i, sample] of float32Array.entries()) {
    const clamped = Math.min(1, Math.max(-1, sample));
    pcm[i] = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
  }
  return pcm;
}

/**
 * Encode an ArrayBuffer as a base64 string for the JSON audio envelope.
 *
 * Builds the intermediate binary string in fixed-size chunks so
 * String.fromCharCode never receives an oversized argument list.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw bytes (e.g. an Int16Array's buffer).
 * @returns {string} Base64-encoded contents of the buffer.
 */
function arrayBufferToBase64(arrayBuffer) {
  const bytes = new Uint8Array(arrayBuffer);
  const STRIDE = 0x8000;
  const pieces = [];
  for (let offset = 0; offset < bytes.length; offset += STRIDE) {
    pieces.push(String.fromCharCode(...bytes.subarray(offset, offset + STRIDE)));
  }
  return btoa(pieces.join(""));
}

/**
 * Decode a base64 string into raw bytes.
 *
 * @param {string} base64 - Base64-encoded data (e.g. a server audio chunk).
 * @returns {Uint8Array} The decoded bytes.
 */
function base64ToUint8Array(base64) {
  // atob yields a binary string; map each code unit back to its byte value.
  return Uint8Array.from(atob(base64), (ch) => ch.charCodeAt(0));
}

/**
 * Decode a base64 PCM16 chunk from the server and schedule it for gapless
 * playback on the shared output AudioContext.
 *
 * Fix: the original passed the whole buffer to the Int16Array constructor,
 * which throws a RangeError when a chunk arrives with an odd byte count.
 * We now truncate to whole 16-bit samples and skip empty chunks.
 *
 * @param {string} base64Audio - Base64-encoded little-endian PCM16 mono audio.
 * @param {number} [sampleRate=24000] - Sample rate of the incoming chunk.
 */
async function playPcm16Base64(base64Audio, sampleRate = 24000) {
  const bytes = base64ToUint8Array(base64Audio);

  // Guard against odd-length payloads: keep only complete 16-bit samples.
  const sampleCount = Math.floor(bytes.byteLength / 2);
  if (sampleCount === 0) return;

  const pcm16 = new Int16Array(bytes.buffer, 0, sampleCount);

  // Normalize signed 16-bit integers to the Float32 -1.0..1.0 range.
  const float32 = new Float32Array(sampleCount);
  for (let i = 0; i < sampleCount; i++) {
    float32[i] = pcm16[i] / 32768;
  }

  const buffer = outputContext.createBuffer(1, float32.length, sampleRate);
  buffer.copyToChannel(float32, 0);

  const src = outputContext.createBufferSource();
  src.buffer = buffer;
  src.connect(outputContext.destination);

  // Queue chunks back-to-back: never start before the previous chunk ends.
  const startAt = Math.max(outputContext.currentTime, nextStartTime);
  src.start(startAt);
  nextStartTime = startAt + buffer.duration;
}
import asyncio
import base64
import json
import pyaudio
import websockets

# Audio Config
FORMAT = pyaudio.paInt16  # 16-bit signed PCM, as required by the API
CHANNELS = 1              # Mono capture
RATE = 16000              # Client upload rate (audio/pcm;rate=16000)
CHUNK = 1024  # ~64ms    (1024 frames / 16000 Hz = 64 ms per buffer)

async def realtime_chat():
    """Stream microphone audio to the Realtime API and print server events.

    Connects over WebSocket, waits for the readiness handshake
    (``setupComplete`` or a "ready" status message), then streams base64
    PCM16 chunks as JSON envelopes until interrupted.

    Fixes over the original:
    - ``stream.read`` is blocking; calling it directly in the send loop
      starved the event loop so ``receive_messages`` could never run.
      It now runs in a worker thread via ``asyncio.to_thread``.
    - ``receive_task`` is created before the ``try`` so ``finally``
      cannot hit an unbound name, and the cancelled task is awaited so
      its ``CancelledError`` is retrieved.
    """
    api_key = "sk_YOUR_KEY"
    uri = f"wss://relay.addisassistant.com/ws?apiKey={api_key}"
    ready_event = asyncio.Event()

    async with websockets.connect(uri) as websocket:
        # 1. Setup Microphone
        p = pyaudio.PyAudio()
        stream = p.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK,
        )

        async def receive_messages():
            # Consume server events; flip ready_event on either readiness shape.
            async for raw in websocket:
                message = json.loads(raw)
                print("<", message)

                if message.get("setupComplete"):
                    ready_event.set()
                    continue

                if (
                    message.get("type") == "status"
                    and "ready" in message.get("message", "").lower()
                ):
                    ready_event.set()
                    continue

        print("Listening... (Press Ctrl+C to stop)")

        receive_task = asyncio.create_task(receive_messages())
        try:
            # Do not stream mic audio until the server signals readiness.
            await ready_event.wait()

            while True:
                # Blocking PCM16 read runs off-loop so receives keep flowing.
                data = await asyncio.to_thread(stream.read, CHUNK)
                await websocket.send(
                    json.dumps(
                        {
                            "data": base64.b64encode(data).decode("utf-8"),
                            "mimeType": "audio/pcm;rate=16000",
                        }
                    )
                )

        except KeyboardInterrupt:
            pass
        finally:
            receive_task.cancel()
            # Await the cancelled task so its CancelledError is retrieved.
            try:
                await receive_task
            except asyncio.CancelledError:
                pass
            stream.stop_stream()
            stream.close()
            p.terminate()

asyncio.run(realtime_chat())

Protocol & Events

The WebSocket handles bidirectional traffic.

Connect + Setup Handshake

After connecting, wait for a readiness event before streaming microphone frames.

{
  "setupComplete": true
}

Some integrations may also emit:

{
  "type": "status",
  "message": "Ready to start conversation"
}

Client to Server Audio

Send JSON envelopes with base64 PCM16 chunks.

{
  "data": "BASE64_ENCODED_PCM16_CHUNK",
  "mimeType": "audio/pcm;rate=16000"
}

Server to Client

Server events are JSON messages.

AI Audio Response

The model audio is returned as base64 PCM16 under inlineData.data.

{
  "serverContent": {
    "modelTurn": {
      "parts": [
        {
          "inlineData": {
            "data": "BASE64_ENCODED_PCM16_AUDIO"
          }
        }
      ]
    }
  }
}

Turn Complete & Usage

When the AI finishes speaking, the server sends a completion event with billing details.

{
  "serverContent": {
    "turnComplete": true
  },
  "usageMetadata": {
    "totalBilledAudioDurationSeconds": 5.2
  }
}

Warning Messages

Sent if there are non-critical issues, such as billing thresholds.

{
  "type": "warning",
  "message": "Your wallet balance is low. Please top up to avoid service interruption."
}

Error Messages

Sent if a critical failure occurs.

{
  "error": {
    "message": "AI service error",
    "status": 500,
    "timestamp": "2025-07-15T10:00:00.000Z"
  }
}

Connection Troubleshooting

  • If the socket closes with 1006, verify:
    • You used the correct endpoint.
    • Auth query is valid (apiKey or jwt).
    • Client audio is sent as JSON (data + mimeType), not raw binary.

Capabilities Roadmap

We are rapidly expanding the Realtime engine.

VAD (Voice Activity Detection)

The model currently uses server-side VAD to determine when the user has stopped speaking.

Knowledge Base

Coming Soon

Soon you will be able to attach PDFs or Text documents to the Realtime session. This will allow the voice assistant to answer questions specifically based on your uploaded data (RAG).

On this page