Appearance
Voice Agent Walkthrough
Build a real-time voice assistant agent.
Overview
This walkthrough demonstrates building a voice agent that:
- Listens to user speech
- Processes natural language
- Responds with synthesized speech
- Maintains conversation context
Prerequisites
- GateFlow account
- Audio input/output capability
- Admin API key
Step 1: Create the Agent
bash
# Register the voice agent with GateFlow's MCP admin API.
# The payload scopes the agent to voice/LLM/retrieval tools, specific models,
# two pipeline templates, and daily audio-minute / cost limits.
# NOTE(review): "gw_prod_admin_key" is a literal placeholder -- readers must
# substitute their own admin key; confirm this matches the docs convention.
curl -X POST https://api.gateflow.ai/v1/mcp/agents \
-H "Authorization: Bearer gw_prod_admin_key" \
-H "Content-Type: application/json" \
-d '{
"name": "Voice Assistant",
"description": "Real-time voice interaction agent",
"permissions": {
"tools": [
"voice/transcribe",
"voice/synthesize",
"voice/pipeline",
"voice/voices",
"llm/chat",
"retrieval/search"
],
"models": [
"whisper-1",
"voxtral-mini-latest",
"gpt-5-mini",
"gpt-5.2",
"eleven_turbo_v2_5",
"eleven_multilingual_v2"
],
"pipelines": [
"voice-agent-fast",
"voice-agent-premium"
]
},
"limits": {
"audio_minutes_daily": 120,
"cost_daily": 50.00
}
}'

Step 2: Set Up the Client
python
from gateflow_mcp import MCPClient
import base64
import asyncio  # NOTE(review): not used in this snippet; presumably needed for the streaming step later -- confirm
# Connect as the agent created in Step 1; the key is a placeholder credential.
client = MCPClient(
agent_id="agent_voice",
api_key="gf-agent-xyz789..."
)
# Verify setup: ask the gateway who this agent is and what it may call.
whoami = client.call_tool("self_inspect/whoami", {})
print(f"Agent: {whoami['name']}")
print(f"Pipelines: {whoami['permissions']['pipelines']}")

Step 3: Basic Voice Interaction
python
def voice_interaction(audio_file: str, context: str = None):
    """Run one round trip through the voice pipeline for a recorded audio file.

    Reads the file, base64-encodes it, invokes the ``voice/pipeline`` tool via
    the module-level ``client``, and returns the transcription, text reply,
    decoded reply audio, and end-to-end latency.
    """
    with open(audio_file, "rb") as audio_fp:
        encoded_audio = base64.b64encode(audio_fp.read()).decode()

    # Fall back to a generic assistant persona when no context is supplied.
    system_context = context or "You are a helpful voice assistant."

    result = client.call_tool(
        name="voice/pipeline",
        arguments={
            "audio": encoded_audio,
            "template": "voice-agent-fast",
            "context": system_context,
        },
    )

    return {
        "user_said": result["transcription"],
        "assistant_said": result["response"],
        "audio": base64.b64decode(result["audio"]),
        "latency_ms": result["latency"]["total_ms"],
    }
# Test interaction: one file in, transcription + spoken reply out.
response = voice_interaction("user_question.mp3")
print(f"User: {response['user_said']}")
print(f"Assistant: {response['assistant_said']}")
print(f"Latency: {response['latency_ms']}ms")
# Play response audio
# Persist the synthesized reply so an external player can render it.
with open("response.mp3", "wb") as f:
f.write(response["audio"])

Step 4: Build Conversational Voice Agent
python
class VoiceAgent:
"""Stateful wrapper around the voice/pipeline tool that carries chat history
between turns by replaying recent exchanges into the prompt context."""
def __init__(self, client: MCPClient, persona: str = None):
self.client = client
# List of {"user": ..., "assistant": ...} dicts, one per completed turn.
self.conversation_history = []
self.persona = persona or "You are a helpful voice assistant named Alex."
def process_audio(self, audio_bytes: bytes, template: str = "voice-agent-fast"):
"""Process audio input and generate voice response.

Encodes the raw audio, runs the pipeline with the persona plus recent
history as context, records the turn, and returns transcription,
text response, decoded reply audio, and latency info.
"""
audio_b64 = base64.b64encode(audio_bytes).decode()
# Build context with history
context = self.persona
if self.conversation_history:
# Only the last 5 turns are replayed, bounding prompt size.
history = "\n".join([
f"User: {h['user']}\nAssistant: {h['assistant']}"
for h in self.conversation_history[-5:]
])
context += f"\n\nPrevious conversation:\n{history}"
result = self.client.call_tool(
name="voice/pipeline",
arguments={
"audio": audio_b64,
"template": template,
"context": context
}
)
# Update history
self.conversation_history.append({
"user": result["transcription"],
"assistant": result["response"]
})
return {
"transcription": result["transcription"],
"response": result["response"],
"audio": base64.b64decode(result["audio"]),
"latency": result["latency"]
}
def reset_conversation(self):
"""Reset conversation history."""
self.conversation_history = []

Step 5: Streaming Voice Response
python
async def stream_voice_response(agent: VoiceAgent, audio_bytes: bytes):
"""Stream voice response for lower latency.

Consumes pipeline events as they arrive: partial/final transcription,
per-token LLM output, audio chunks (played immediately), and a final
"done" event with latency. Updates the agent's history and returns the
full transcription, response text, and concatenated reply audio.
"""
audio_b64 = base64.b64encode(audio_bytes).decode()
context = agent.persona
if agent.conversation_history:
# Streaming path replays only the last 3 turns (vs 5 in process_audio).
history = "\n".join([
f"User: {h['user']}\nAssistant: {h['assistant']}"
for h in agent.conversation_history[-3:]
])
context += f"\n\nRecent conversation:\n{history}"
transcription = ""
response_text = ""
audio_chunks = []
async for event in agent.client.stream_tool(
name="voice/pipeline",
arguments={
"audio": audio_b64,
"template": "voice-agent-fast",
"context": context,
"stream": True
}
):
if event["type"] == "transcription.partial":
# \r overwrites the line so partial hypotheses update in place.
print(f"\rHearing: {event['text']}", end="")
elif event["type"] == "transcription.final":
transcription = event["text"]
print(f"\nUser: {transcription}")
elif event["type"] == "llm.token":
response_text += event["token"]
print(event["token"], end="", flush=True)
elif event["type"] == "audio.chunk":
chunk = base64.b64decode(event["data"])
audio_chunks.append(chunk)
# Play chunk immediately for low latency
# NOTE(review): play_audio_chunk is not defined in this walkthrough --
# presumably a reader-supplied playback helper; confirm.
play_audio_chunk(chunk)
elif event["type"] == "done":
print(f"\n[Latency: {event['latency']['total_ms']}ms]")
# Update history
agent.conversation_history.append({
"user": transcription,
"assistant": response_text
})
return {
"transcription": transcription,
"response": response_text,
"audio": b"".join(audio_chunks)
}

Step 6: Voice Agent with Knowledge Base
python
class KnowledgeVoiceAgent(VoiceAgent):
"""Voice agent that grounds replies in a retrieval collection by running
transcribe -> search -> chat -> synthesize as four explicit tool calls
instead of the single voice/pipeline call used by VoiceAgent."""
def __init__(self, client, persona, knowledge_collection):
super().__init__(client, persona)
# Name of the retrieval collection searched on every turn.
self.knowledge_collection = knowledge_collection
def process_audio(self, audio_bytes: bytes):
"""Process audio with knowledge retrieval.

NOTE(review): this override drops the parent's ``template`` parameter,
so callers passing template= positionally/by name will break -- confirm
whether the narrowing is intentional.
"""
audio_b64 = base64.b64encode(audio_bytes).decode()
# Step 1: Transcribe
transcript = self.client.call_tool(
name="voice/transcribe",
arguments={
"audio": audio_b64,
"model": "voxtral-mini-latest"
}
)
user_query = transcript["text"]
# Step 2: Search knowledge base
search_results = self.client.call_tool(
name="retrieval/search",
arguments={
"query": user_query,
"collection": self.knowledge_collection,
"limit": 5
}
)
# Step 3: Build context
# Each snippet is truncated to 500 chars to keep the prompt bounded.
knowledge_context = "\n".join([
f"- {r['content'][:500]}"
for r in search_results["results"]
])
context = f"""{self.persona}
Use this information to answer:
{knowledge_context}
If the information isn't in the knowledge base, say so politely."""
# Step 4: Generate response
# max_tokens is kept small so the spoken reply stays short (lower TTS latency).
llm_response = self.client.call_tool(
name="llm/chat",
arguments={
"model": "gpt-5-mini",
"messages": [
{"role": "system", "content": context},
{"role": "user", "content": user_query}
],
"temperature": 0.7,
"max_tokens": 150
}
)
# Step 5: Synthesize speech
audio_response = self.client.call_tool(
name="voice/synthesize",
arguments={
"text": llm_response["content"],
"model": "eleven_turbo_v2_5",
"voice": "friendly"
}
)
# Update history
self.conversation_history.append({
"user": user_query,
"assistant": llm_response["content"]
})
return {
"transcription": user_query,
"response": llm_response["content"],
"audio": base64.b64decode(audio_response["audio"]),
"sources": search_results["results"][:3]
}

Step 7: Real-Time Voice Loop
python
import sounddevice as sd
import numpy as np
from queue import Queue
class RealTimeVoiceAgent:
"""Microphone capture loop: records utterances until silence, sends each to
the wrapped VoiceAgent, and plays the spoken reply."""
def __init__(self, agent: VoiceAgent):
self.agent = agent
# Frames pushed by the sounddevice callback thread, drained by collect_utterance.
self.audio_queue = Queue()
self.is_listening = False
self.sample_rate = 16000
def start_listening(self):
"""Start the voice interaction loop."""
self.is_listening = True
# sounddevice invokes this on its own audio thread for every block captured.
def audio_callback(indata, frames, time, status):
if self.is_listening:
self.audio_queue.put(indata.copy())
with sd.InputStream(callback=audio_callback, samplerate=self.sample_rate):
print("Listening... (say 'stop' to end)")
while self.is_listening:
# Collect audio until silence detected
audio_data = self.collect_utterance()
if audio_data is not None:
# Process the utterance
response = self.agent.process_audio(audio_data)
print(f"You: {response['transcription']}")
print(f"Agent: {response['response']}")
# Check for stop command
if "stop" in response["transcription"].lower():
self.is_listening = False
break
# Play response
self.play_audio(response["audio"])
def collect_utterance(self):
"""Collect audio until silence is detected.

Returns raw bytes of the concatenated frames, or None if nothing was
captured. NOTE(review): the loop spins without sleeping while the
queue is empty -- likely a deliberate simplification for the docs,
but worth confirming for real use.
"""
frames = []
# Mean-absolute amplitude below this counts as silence; assumes float
# samples in [-1, 1] -- TODO confirm against the stream's dtype.
silence_threshold = 0.01
silence_duration = 0
max_silence = 1.0 # 1 second of silence
while True:
if not self.audio_queue.empty():
frame = self.audio_queue.get()
frames.append(frame)
# Check for silence
volume = np.abs(frame).mean()
if volume < silence_threshold:
silence_duration += len(frame) / self.sample_rate
# Require some speech first (>10 frames) before silence ends the turn.
if silence_duration > max_silence and len(frames) > 10:
break
else:
silence_duration = 0
if frames:
audio = np.concatenate(frames)
return audio.tobytes()
return None
def play_audio(self, audio_bytes: bytes):
"""Play audio response."""
# Convert and play audio
import io
from pydub import AudioSegment
# Input is MP3 (as produced by the synthesis step); decode then play blocking.
audio = AudioSegment.from_mp3(io.BytesIO(audio_bytes))
samples = np.array(audio.get_array_of_samples())
sd.play(samples, audio.frame_rate)
sd.wait()

Complete Example
python
from gateflow_mcp import MCPClient
import base64
def main():
"""Interactive push-to-talk demo: record, send to the agent, play the reply."""
# Initialize
client = MCPClient(
agent_id="agent_voice",
api_key="gf-agent-xyz789..."
)
agent = VoiceAgent(
client=client,
persona="""You are Alex, a friendly voice assistant.
Keep responses concise (1-2 sentences).
Be helpful and conversational."""
)
print("Voice Agent Ready!")
print("Press Enter to record, Enter again to stop.")
while True:
input("Press Enter to speak...")
# Record audio (simplified)
# NOTE(review): record_audio and play_audio are not defined in this
# walkthrough -- presumably reader-supplied helpers; confirm.
audio_bytes = record_audio()
if audio_bytes:
response = agent.process_audio(audio_bytes)
print(f"\nYou: {response['transcription']}")
print(f"Alex: {response['response']}")
print(f"[Latency: {response['latency']['total_ms']}ms]")
# Play response
play_audio(response["audio"])
# Check for exit: saying "goodbye" ends the session.
if "goodbye" in response["transcription"].lower():
print("Goodbye!")
break
if __name__ == "__main__":
main()

Performance Tips
- Use voice-agent-fast - For lowest latency
- Stream responses - Play audio as it generates
- Keep responses short - Lower TTS latency
- Use VAD - Voice Activity Detection for natural turn-taking
- Preload models - Warm up connections
Latency Optimization
| Configuration | TTFB | Best For |
|---|---|---|
| voice-agent-fast | 400-750ms | Real-time chat |
| voice-agent-premium | 850-1650ms | Quality-critical |
| Custom (streaming all) | 300-500ms | Maximum speed |
Next Steps
- Healthcare Scribe - Medical transcription
- Voice Agent Fast Template - Template details
- Streaming Speech - Streaming guide