Skip to content

RAG Agent Walkthrough

Build a Retrieval-Augmented Generation agent for knowledge-based Q&A.

Overview

This walkthrough demonstrates building a RAG agent that:

  • Ingests and indexes documents
  • Retrieves relevant context for queries
  • Generates accurate, grounded answers
  • Cites sources in responses

Prerequisites

  • GateFlow account
  • Documents to index
  • Admin API key

Step 1: Create the Agent

bash
# Create the RAG agent via the GateFlow admin API.
# Never paste a real admin key into a command; export it first:
#   export GATEFLOW_ADMIN_KEY="gw_prod_..."
curl -X POST https://api.gateflow.ai/v1/mcp/agents \
  -H "Authorization: Bearer $GATEFLOW_ADMIN_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "name": "Knowledge Assistant",
    "description": "RAG-powered Q&A agent",
    "permissions": {
      "tools": [
        "llm/chat",
        "llm/embed",
        "retrieval/search",
        "retrieval/rerank",
        "retrieval/search_and_rerank",
        "document/process"
      ],
      "models": [
        "gpt-5.2",
        "text-embedding-3-large",
        "rerank-english-v3.0"
      ],
      "collections": [
        "knowledge-base"
      ]
    },
    "limits": {
      "requests_per_minute": 60,
      "cost_daily": 100.00
    }
  }'

Step 2: Set Up the Client

python
from gateflow_mcp import MCPClient
import base64  # used by the ingestion snippet in Step 3

# Authenticate as the agent created in Step 1 (agent key, not the admin key).
# NOTE(review): in real code load the key from an env var or secrets manager.
client = MCPClient(
    agent_id="agent_knowledge",
    api_key="gf-agent-xyz789..."
)

# Verify setup: confirms the key works and shows this agent's identity
# and the collections it is permitted to read.
whoami = client.call_tool("self_inspect/whoami", {})
print(f"Agent: {whoami['name']}")
print(f"Collections: {whoami['permissions']['collections']}")

Step 3: Ingest Documents

python
def ingest_documents(file_paths: list, collection: str = "knowledge-base"):
    """Ingest documents into the knowledge base.

    Args:
        file_paths: Paths of the files to upload (PDF, Markdown, etc.).
        collection: Target collection; the agent must have permission for it.

    Returns:
        A list of dicts with the source file, its document_id, and chunk count.
    """
    # Local imports keep this snippet self-contained when pasted into a script
    # (the Step 2 snippet only imports base64).
    import os
    from datetime import datetime, timezone

    results = []
    for path in file_paths:
        # Files travel base64-encoded inside the JSON tool-call payload.
        with open(path, "rb") as f:
            file_b64 = base64.b64encode(f.read()).decode()

        result = client.call_tool(
            name="document/process",
            arguments={
                "file": file_b64,
                "filename": os.path.basename(path),
                "collection": collection,
                "chunk_size": 1000,    # ~1000 chars per chunk
                "chunk_overlap": 200,  # 20% overlap preserves cross-chunk context
                "metadata": {
                    "source": path,
                    # Timezone-aware UTC timestamp avoids ambiguous local times.
                    "ingested_at": datetime.now(timezone.utc).isoformat()
                }
            }
        )

        results.append({
            "file": path,
            "document_id": result["document_id"],
            "chunks": result["chunks"]
        })
        print(f"Ingested: {path} ({result['chunks']} chunks)")

    return results

# Ingest your documents
# Paths are relative to the working directory; each file is chunked and
# indexed into the "knowledge-base" collection.
docs = ingest_documents([
    "docs/product_guide.pdf",
    "docs/faq.pdf",
    "docs/troubleshooting.md"
])

Step 4: Build the RAG Pipeline

python
class RAGAgent:
    """Minimal retrieval-augmented generation pipeline over one collection."""

    def __init__(self, client: MCPClient, collection: str = "knowledge-base"):
        self.client = client
        self.collection = collection

    def search(self, query: str, top_k: int = 10):
        """Search for relevant documents."""
        reranked = self.client.call_tool(
            name="retrieval/search_and_rerank",
            arguments={
                "query": query,
                "collection": self.collection,
                "limit": top_k,
                "initial_limit": 30,
            },
        )
        return reranked["results"]

    def build_context(self, results: list, max_tokens: int = 4000):
        """Build context from search results."""
        budget = max_tokens * 4  # rough chars-per-token estimate
        used = 0
        parts = []
        for idx, hit in enumerate(results, start=1):
            piece = f"[Source {idx}: {hit.get('title', 'Unknown')}]\n{hit['content']}\n"
            # Stop at the first chunk that would overflow the budget.
            if used + len(piece) > budget:
                break
            parts.append(piece)
            used += len(piece)
        return "\n".join(parts)

    def generate_answer(self, query: str, context: str, results: list):
        """Generate answer with citations."""
        system_prompt = """You are a helpful assistant that answers questions based on the provided context.

Rules:
1. Only use information from the provided context
2. If the context doesn't contain the answer, say so
3. Cite sources using [Source N] notation
4. Be concise but complete"""
        user_prompt = f"""Context:
{context}

Question: {query}

Answer the question using only the context above. Include citations."""

        reply = self.client.call_tool(
            name="llm/chat",
            arguments={
                "model": "gpt-5.2",
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                # Low temperature keeps factual answers grounded in context.
                "temperature": 0.3,
            },
        )

        # Source list mirrors the [Source N] numbering used in the context.
        sources = []
        for pos, hit in enumerate(results, start=1):
            sources.append({
                "index": pos,
                "title": hit.get("title", "Unknown"),
                "score": hit.get("rerank_score", hit.get("score")),
                "document_id": hit.get("document_id"),
            })

        return {
            "answer": reply["content"],
            "sources": sources,
            "usage": reply.get("usage", {}),
        }

    def ask(self, query: str):
        """Complete RAG pipeline: search, assemble context, generate."""
        hits = self.search(query)
        if not hits:
            return {
                "answer": "I couldn't find any relevant information to answer your question.",
                "sources": []
            }
        return self.generate_answer(query, self.build_context(hits), hits)

Step 5: Use the RAG Agent

python
# Initialize
rag = RAGAgent(client)

# Ask questions
response = rag.ask("How do I reset my password?")

print("Answer:")
print(response["answer"])
print("\nSources:")
# Show the top three sources with their relevance scores.
# NOTE(review): ':.3f' assumes 'score' is numeric — a result with no
# rerank/search score would arrive as None and raise; confirm with gateway.
for source in response["sources"][:3]:
    print(f"  [{source['index']}] {source['title']} (score: {source['score']:.3f})")

Example Output:

Answer:
To reset your password, follow these steps [Source 1]:

1. Go to the login page and click "Forgot Password"
2. Enter your email address
3. Check your email for the reset link
4. Click the link and enter your new password
5. Password must be at least 12 characters with a mix of letters, numbers, and symbols [Source 2]

The reset link expires after 24 hours. If you don't receive the email, check your spam folder [Source 1].

Sources:
  [1] Password Reset Guide (score: 0.952)
  [2] Security FAQ (score: 0.891)
  [3] Account Settings Help (score: 0.834)

Step 6: Advanced Features

Conversation History

python
class ConversationalRAG(RAGAgent):
    """RAG agent that carries short-term conversation history.

    History is used two ways:
      * the last 3 user queries are folded into the search string so
        retrieval sees context for follow-up questions ("what about X?"),
      * the last 5 Q/A pairs are prepended to the prompt so the model can
        resolve pronouns and references.
    """

    def __init__(self, client, collection: str = "knowledge-base"):
        # Default mirrors RAGAgent so both classes construct the same way.
        super().__init__(client, collection)
        self.history = []

    def ask(self, query: str):
        """Answer `query`, using and updating the conversation history."""
        # Include history in search: append recent queries to improve recall.
        search_query = query
        if self.history:
            recent = " ".join([h["query"] for h in self.history[-3:]])
            search_query = f"{query} {recent}"

        results = self.search(search_query)
        context = self.build_context(results)

        # Include up to 5 prior turns, truncating each answer to 200 chars
        # to keep the prompt small.
        history_text = ""
        for h in self.history[-5:]:
            history_text += f"User: {h['query']}\nAssistant: {h['answer'][:200]}...\n\n"

        response = self.client.call_tool(
            name="llm/chat",
            arguments={
                "model": "gpt-5.2",
                "messages": [
                    {"role": "system", "content": "Answer based on context. Cite sources."},
                    {"role": "user", "content": f"""Previous conversation:
{history_text}

Context:
{context}

Current question: {query}"""}
                ]
            }
        )

        # Update history; cap its length so a long session doesn't grow
        # memory without bound (only the last 5 turns are ever read).
        self.history.append({
            "query": query,
            "answer": response["content"]
        })
        self.history = self.history[-20:]

        return {"answer": response["content"], "sources": results}
Hybrid Search

python
def hybrid_search(self, query: str, keywords: list | None = None):
    """Combine semantic and keyword search.

    Args:
        query: Natural-language search query.
        keywords: Optional exact terms; when given, results are restricted
            to chunks containing at least one of them.

    Returns:
        The gateway's ranked result list.
    """
    # Weights sum to 1.0: 70% dense-vector similarity, 30% keyword match.
    response = self.client.call_tool(
        name="retrieval/search",
        arguments={
            "query": query,
            "collection": self.collection,
            "search_type": "hybrid",
            "semantic_weight": 0.7,
            "keyword_weight": 0.3,
            # '$or' of substring filters; omitted when no keywords are given.
            "filters": {
                "$or": [{"content": {"$contains": kw}} for kw in keywords]
            } if keywords else None
        }
    )
    return response["results"]

Query Expansion

python
def expand_query(self, query: str):
    """Expand query with related terms.

    Asks a small model for alternative phrasings, then returns the original
    query followed by up to three non-empty alternatives.
    """
    expansion = self.client.call_tool(
        name="llm/chat",
        arguments={
            "model": "gpt-5-mini",
            "messages": [{
                "role": "user",
                "content": f"""Generate 3 alternative phrasings for this search query:
Query: {query}

Return only the alternatives, one per line."""
            }],
            # Some creativity is desirable: varied phrasings widen recall.
            "temperature": 0.7
        }
    )

    # Strip whitespace and drop blank lines the model may emit.
    lines = [line.strip() for line in expansion["content"].strip().split("\n")]
    alternatives = [line for line in lines if line]
    return [query] + alternatives[:3]

Step 7: Monitoring and Evaluation

python
def evaluate_response(query: str, response: dict, ground_truth: str = None):
    """Evaluate RAG response quality.

    Args:
        query: The user's question.
        response: Output of RAGAgent.ask() — {'answer', 'sources', ...}.
        ground_truth: Optional reference answer; when given, an LLM grades
            accuracy on a 1-5 scale.

    Returns:
        A dict of quality metrics (plus 'accuracy_score' when graded).
    """
    top = response["sources"][0] if response["sources"] else None
    metrics = {
        # Heuristic: a substantive answer should exceed ~50 chars.
        "has_answer": len(response["answer"]) > 50,
        "has_citations": "[Source" in response["answer"],
        "source_count": len(response["sources"]),
        # .get() tolerates results that carry no score field.
        "top_source_score": top.get("score") if top else 0
    }

    if ground_truth:
        # Use LLM to evaluate accuracy
        eval_result = client.call_tool(
            name="llm/chat",
            arguments={
                "model": "gpt-5-mini",
                "messages": [{
                    "role": "user",
                    "content": f"""Rate this answer's accuracy from 1-5:

Question: {query}
Expected: {ground_truth}
Actual: {response['answer']}

Return only the number."""
                }]
            }
        )
        raw = eval_result["content"].strip()
        try:
            metrics["accuracy_score"] = int(raw)
        except ValueError:
            # Models sometimes wrap the number in prose; salvage the first
            # digit run, or record None if there isn't one.
            import re
            match = re.search(r"\d+", raw)
            metrics["accuracy_score"] = int(match.group()) if match else None

    return metrics

Complete Example

python
from gateflow_mcp import MCPClient

def main():
    """Run an interactive Q&A session against the knowledge base."""
    # Authenticate as the agent created in Step 1.
    client = MCPClient(
        agent_id="agent_knowledge",
        api_key="gf-agent-xyz789..."
    )
    rag = RAGAgent(client, collection="knowledge-base")

    print("Knowledge Assistant Ready! (type 'quit' to exit)")

    # Read-eval-print loop; typing 'quit' (any case) exits.
    while (query := input("\nYou: ").strip()).lower() != "quit":
        reply = rag.ask(query)
        print(f"\nAssistant: {reply['answer']}")

        # Show up to three supporting sources.
        if reply["sources"]:
            print("\nSources:")
            for src in reply["sources"][:3]:
                print(f"  • {src['title']}")

if __name__ == "__main__":
    main()

Best Practices

  1. Chunk wisely - 500-1500 chars with 10-20% overlap
  2. Use reranking - Significantly improves relevance
  3. Set temperature low - 0.2-0.4 for factual answers
  4. Cite sources - Build trust with citations
  5. Handle no-results - Graceful "I don't know" responses
  6. Monitor quality - Track relevance metrics

Next Steps

Extend this agent with streaming responses, multi-collection retrieval, or
production monitoring — see the other GateFlow walkthroughs for each.

Built with reliability in mind.