Skip to content

Voice Agent Walkthrough

Build a real-time voice assistant agent.

Overview

This walkthrough demonstrates building a voice agent that:

  • Listens to user speech
  • Processes natural language
  • Responds with synthesized speech
  • Maintains conversation context

Prerequisites

  • GateFlow account
  • Audio input/output capability
  • Admin API key

Step 1: Create the Agent

bash
# Create the voice agent via the admin API.
# The admin key is read from the environment ($GATEFLOW_ADMIN_KEY) —
# never paste a live production key into docs, scripts, or shell history.
curl -X POST https://api.gateflow.ai/v1/mcp/agents \
  -H "Authorization: Bearer $GATEFLOW_ADMIN_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "name": "Voice Assistant",
    "description": "Real-time voice interaction agent",
    "permissions": {
      "tools": [
        "voice/transcribe",
        "voice/synthesize",
        "voice/pipeline",
        "voice/voices",
        "llm/chat",
        "retrieval/search"
      ],
      "models": [
        "whisper-1",
        "voxtral-mini-latest",
        "gpt-5-mini",
        "gpt-5.2",
        "eleven_turbo_v2_5",
        "eleven_multilingual_v2"
      ],
      "pipelines": [
        "voice-agent-fast",
        "voice-agent-premium"
      ]
    },
    "limits": {
      "audio_minutes_daily": 120,
      "cost_daily": 50.00
    }
  }'

Step 2: Set Up the Client

python
import asyncio
import base64
import os

from gateflow_mcp import MCPClient

# Never hard-code credentials in source; read the agent key from the
# environment so the snippet is safe to copy into real projects.
client = MCPClient(
    agent_id="agent_voice",
    api_key=os.environ["GATEFLOW_AGENT_KEY"],
)

# Verify setup: ask the gateway who this agent is and what it may use.
whoami = client.call_tool("self_inspect/whoami", {})
print(f"Agent: {whoami['name']}")
print(f"Pipelines: {whoami['permissions']['pipelines']}")

Step 3: Basic Voice Interaction

python
def voice_interaction(audio_file: str, context: "str | None" = None):
    """Run one voice turn through the gateway pipeline.

    Args:
        audio_file: Path to an audio file containing the user's speech.
        context: Optional system prompt; a generic assistant persona is
            used when omitted (or empty).

    Returns:
        Dict with the transcription (``user_said``), the text reply
        (``assistant_said``), the decoded reply audio bytes (``audio``),
        and the end-to-end latency in milliseconds (``latency_ms``).
    """
    with open(audio_file, "rb") as f:
        audio_b64 = base64.b64encode(f.read()).decode()

    result = client.call_tool(
        name="voice/pipeline",
        arguments={
            "audio": audio_b64,
            "template": "voice-agent-fast",
            "context": context or "You are a helpful voice assistant.",
        },
    )

    return {
        "user_said": result["transcription"],
        "assistant_said": result["response"],
        "audio": base64.b64decode(result["audio"]),
        "latency_ms": result["latency"]["total_ms"],
    }

# Smoke-test one round trip through the pipeline.
reply = voice_interaction("user_question.mp3")
print(f"User: {reply['user_said']}")
print(f"Assistant: {reply['assistant_said']}")
print(f"Latency: {reply['latency_ms']}ms")

# Persist the synthesized reply so it can be played back.
with open("response.mp3", "wb") as f:
    f.write(reply["audio"])

Step 4: Build Conversational Voice Agent

python
class VoiceAgent:
    """Conversational voice assistant backed by the gateway voice pipeline.

    Maintains a rolling conversation history so each new turn is answered
    with recent context, and supports resetting to start a fresh session.
    """

    # How many past turns are folded into the prompt context per request.
    _HISTORY_WINDOW = 5

    def __init__(self, client: "MCPClient", persona: "str | None" = None):
        """Create the agent.

        Args:
            client: Gateway MCP client used for all tool calls.
            persona: System-prompt persona; a default assistant persona is
                used when omitted (or empty).
        """
        self.client = client
        self.conversation_history = []  # [{"user": ..., "assistant": ...}, ...]
        self.persona = persona or "You are a helpful voice assistant named Alex."

    def process_audio(self, audio_bytes: bytes, template: str = "voice-agent-fast"):
        """Run one voice turn: transcribe, respond, and synthesize speech.

        Args:
            audio_bytes: Raw audio of the user's utterance.
            template: Gateway pipeline template to execute.

        Returns:
            Dict with the transcription, response text, decoded response
            audio bytes, and the pipeline latency breakdown.
        """
        audio_b64 = base64.b64encode(audio_bytes).decode()

        # Build the LLM context: persona plus a short window of history.
        context = self.persona
        if self.conversation_history:
            history = "\n".join(
                f"User: {turn['user']}\nAssistant: {turn['assistant']}"
                for turn in self.conversation_history[-self._HISTORY_WINDOW:]
            )
            context += f"\n\nPrevious conversation:\n{history}"

        result = self.client.call_tool(
            name="voice/pipeline",
            arguments={
                "audio": audio_b64,
                "template": template,
                "context": context,
            },
        )

        # Record the finished turn so later turns see it as context.
        self.conversation_history.append({
            "user": result["transcription"],
            "assistant": result["response"],
        })

        return {
            "transcription": result["transcription"],
            "response": result["response"],
            "audio": base64.b64decode(result["audio"]),
            "latency": result["latency"],
        }

    def reset_conversation(self):
        """Discard all accumulated conversation history."""
        self.conversation_history = []

Step 5: Streaming Voice Response

python
async def stream_voice_response(agent: VoiceAgent, audio_bytes: bytes):
    """Stream voice response for lower latency.

    Consumes pipeline events as they arrive so partial transcription, LLM
    tokens, and audio chunks can be surfaced immediately instead of waiting
    for the full round trip. Appends the finished turn to
    ``agent.conversation_history`` and returns a dict with the final
    transcription, the response text, and the concatenated response audio.

    NOTE(review): calls a ``play_audio_chunk`` helper that is not defined in
    this walkthrough — the host application must provide it.
    """

    audio_b64 = base64.b64encode(audio_bytes).decode()

    # Seed the LLM context with the persona plus a short history window.
    context = agent.persona
    if agent.conversation_history:
        history = "\n".join([
            f"User: {h['user']}\nAssistant: {h['assistant']}"
            for h in agent.conversation_history[-3:]
        ])
        context += f"\n\nRecent conversation:\n{history}"

    transcription = ""
    response_text = ""
    audio_chunks = []

    async for event in agent.client.stream_tool(
        name="voice/pipeline",
        arguments={
            "audio": audio_b64,
            "template": "voice-agent-fast",
            "context": context,
            "stream": True
        }
    ):
        if event["type"] == "transcription.partial":
            # Overwrite the same console line while transcription refines.
            print(f"\rHearing: {event['text']}", end="")

        elif event["type"] == "transcription.final":
            transcription = event["text"]
            print(f"\nUser: {transcription}")

        elif event["type"] == "llm.token":
            # Accumulate the reply token-by-token while echoing it live.
            response_text += event["token"]
            print(event["token"], end="", flush=True)

        elif event["type"] == "audio.chunk":
            chunk = base64.b64decode(event["data"])
            audio_chunks.append(chunk)
            # Play chunk immediately for low latency
            play_audio_chunk(chunk)

        elif event["type"] == "done":
            print(f"\n[Latency: {event['latency']['total_ms']}ms]")

    # Update history so subsequent turns see this exchange as context.
    agent.conversation_history.append({
        "user": transcription,
        "assistant": response_text
    })

    return {
        "transcription": transcription,
        "response": response_text,
        "audio": b"".join(audio_chunks)
    }

Step 6: Voice Agent with Knowledge Base

python
class KnowledgeVoiceAgent(VoiceAgent):
    """Voice agent that grounds every answer in a retrieval collection."""

    def __init__(self, client, persona, knowledge_collection):
        """Create the agent bound to one knowledge collection."""
        super().__init__(client, persona)
        self.knowledge_collection = knowledge_collection

    def process_audio(self, audio_bytes: bytes):
        """Transcribe, retrieve supporting passages, answer, and speak."""
        encoded_audio = base64.b64encode(audio_bytes).decode()

        # 1) Speech-to-text.
        stt = self.client.call_tool(
            name="voice/transcribe",
            arguments={
                "audio": encoded_audio,
                "model": "voxtral-mini-latest",
            },
        )
        question = stt["text"]

        # 2) Retrieve candidate passages for grounding.
        retrieval = self.client.call_tool(
            name="retrieval/search",
            arguments={
                "query": question,
                "collection": self.knowledge_collection,
                "limit": 5,
            },
        )

        # 3) Fold the top passages (truncated) into the system prompt.
        snippets = "\n".join(
            f"- {hit['content'][:500]}" for hit in retrieval["results"]
        )
        context = f"""{self.persona}

Use this information to answer:
{snippets}

If the information isn't in the knowledge base, say so politely."""

        # 4) Generate the grounded text answer.
        answer = self.client.call_tool(
            name="llm/chat",
            arguments={
                "model": "gpt-5-mini",
                "messages": [
                    {"role": "system", "content": context},
                    {"role": "user", "content": question},
                ],
                "temperature": 0.7,
                "max_tokens": 150,
            },
        )

        # 5) Text-to-speech on the final answer.
        speech = self.client.call_tool(
            name="voice/synthesize",
            arguments={
                "text": answer["content"],
                "model": "eleven_turbo_v2_5",
                "voice": "friendly",
            },
        )

        # Record the turn for conversational context in later turns.
        self.conversation_history.append(
            {"user": question, "assistant": answer["content"]}
        )

        return {
            "transcription": question,
            "response": answer["content"],
            "audio": base64.b64decode(speech["audio"]),
            "sources": retrieval["results"][:3],
        }

Step 7: Real-Time Voice Loop

python
import sounddevice as sd
import numpy as np
from queue import Queue

class RealTimeVoiceAgent:
    """Microphone-driven loop around a VoiceAgent: listen, respond, play.

    NOTE(review): relies on the module-level ``sounddevice`` (sd), ``numpy``
    (np), and (inside ``play_audio``) ``pydub`` imports of this walkthrough.
    """

    def __init__(self, agent: "VoiceAgent"):
        self.agent = agent
        self.audio_queue = Queue()   # raw frames pushed by the input callback
        self.is_listening = False
        self.sample_rate = 16000     # Hz capture rate used for silence timing

    def start_listening(self):
        """Run the capture -> process -> playback loop until 'stop' is heard."""
        self.is_listening = True

        def audio_callback(indata, frames, time, status):
            # sounddevice callback signature; copy `indata` because the
            # library reuses the buffer between callbacks.
            if self.is_listening:
                self.audio_queue.put(indata.copy())

        with sd.InputStream(callback=audio_callback, samplerate=self.sample_rate):
            print("Listening... (say 'stop' to end)")

            while self.is_listening:
                # Collect audio until silence is detected.
                audio_data = self.collect_utterance()

                if audio_data is not None:
                    # Process the utterance through the voice agent.
                    response = self.agent.process_audio(audio_data)

                    print(f"You: {response['transcription']}")
                    print(f"Agent: {response['response']}")

                    # Check for stop command before playing the reply.
                    if "stop" in response["transcription"].lower():
                        self.is_listening = False
                        break

                    # Play response.
                    self.play_audio(response["audio"])

    def collect_utterance(self):
        """Block until one utterance (speech then ~1s of silence) is captured.

        Returns:
            The collected frames concatenated as raw bytes, or ``None`` if
            no frames were collected.
        """
        frames = []
        silence_threshold = 0.01   # mean |amplitude| below this is "silence"
        silence_duration = 0.0
        max_silence = 1.0          # seconds of trailing silence ending a turn

        while True:
            # Blocking get() instead of busy-polling empty(): identical
            # behavior (waits for the next frame), but the thread sleeps
            # instead of spinning at 100% CPU while the queue is empty.
            frame = self.audio_queue.get()
            frames.append(frame)

            volume = np.abs(frame).mean()
            if volume < silence_threshold:
                silence_duration += len(frame) / self.sample_rate
                # Require a minimum frame count so a leading pause doesn't
                # end the utterance before any speech has arrived.
                if silence_duration > max_silence and len(frames) > 10:
                    break
            else:
                silence_duration = 0.0

        if frames:
            audio = np.concatenate(frames)
            return audio.tobytes()
        return None

    def play_audio(self, audio_bytes: bytes):
        """Decode MP3 bytes and play them synchronously."""
        import io
        from pydub import AudioSegment

        audio = AudioSegment.from_mp3(io.BytesIO(audio_bytes))
        samples = np.array(audio.get_array_of_samples())
        sd.play(samples, audio.frame_rate)
        sd.wait()

Complete Example

python
from gateflow_mcp import MCPClient
import base64

def main():
    """Interactive demo: record a turn, play the reply, exit on 'goodbye'."""
    import os

    # Initialize the gateway client. Read the key from the environment —
    # never hard-code credentials in source.
    client = MCPClient(
        agent_id="agent_voice",
        api_key=os.environ["GATEFLOW_AGENT_KEY"],
    )

    agent = VoiceAgent(
        client=client,
        persona="""You are Alex, a friendly voice assistant.
Keep responses concise (1-2 sentences).
Be helpful and conversational.""",
    )

    print("Voice Agent Ready!")
    print("Press Enter to record, Enter again to stop.")

    while True:
        input("Press Enter to speak...")

        # NOTE(review): record_audio() and play_audio() are assumed to be
        # provided by the host application; they are not defined in this
        # walkthrough.
        audio_bytes = record_audio()

        if audio_bytes:
            response = agent.process_audio(audio_bytes)

            print(f"\nYou: {response['transcription']}")
            print(f"Alex: {response['response']}")
            print(f"[Latency: {response['latency']['total_ms']}ms]")

            # Play the synthesized reply.
            play_audio(response["audio"])

            # Exit once the user says goodbye.
            if "goodbye" in response["transcription"].lower():
                print("Goodbye!")
                break

if __name__ == "__main__":
    main()

Performance Tips

  1. Use voice-agent-fast - For lowest latency
  2. Stream responses - Play audio as it generates
  3. Keep responses short - Lower TTS latency
  4. Use VAD - Voice Activity Detection for natural turn-taking
  5. Preload models - Warm up connections

Latency Optimization

| Configuration          | TTFB       | Best For         |
| ---------------------- | ---------- | ---------------- |
| voice-agent-fast       | 400-750ms  | Real-time chat   |
| voice-agent-premium    | 850-1650ms | Quality-critical |
| Custom (streaming all) | 300-500ms  | Maximum speed    |

Next Steps

Built with reliability in mind.