Appearance
Table Extraction
Extract structured tables from documents with high accuracy.
Overview
GateFlow's table extraction uses advanced AI to detect, structure, and extract tabular data from documents, including complex layouts, merged cells, and borderless tables.
Basic Table Extraction
Extract All Tables
python
from openai import OpenAI
client = OpenAI(
base_url="https://api.gateflow.ai/v1",
api_key="gw_prod_..."
)
# Extract tables from a document
response = client.post(
"/data/ocr",
files={"file": open("financial_report.pdf", "rb")},
data={
"extract_tables": True,
"table_format": "json"
}
)
print(f"Found {len(response['tables'])} tables")
for i, table in enumerate(response["tables"]):
print(f"\nTable {i + 1}:")
print(f" Page: {table['page']}")
print(f" Rows: {table['row_count']}")
print(f" Columns: {table['column_count']}")
print(f" Has header: {table['has_header']}")
cURL Example
bash
curl -X POST https://api.gateflow.ai/v1/data/ocr \
-H "Authorization: Bearer gw_prod_..." \
-F "file=@report.pdf" \
-F "extract_tables=true" \
-F "table_format=csv"
Output Formats
JSON Format
python
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"table_format": "json"
}
)
table = response["tables"][0]
# Access structured data
for row in table["rows"]:
for cell in row["cells"]:
print(f" {cell['text']}", end="\t")
print()
JSON Output Structure:
json
{
"tables": [
{
"table_id": "table_001",
"page": 1,
"bbox": [50, 100, 550, 400],
"row_count": 5,
"column_count": 4,
"has_header": true,
"header_row": 0,
"rows": [
{
"row_index": 0,
"is_header": true,
"cells": [
{
"text": "Product",
"column_index": 0,
"row_span": 1,
"col_span": 1,
"confidence": 0.98
},
{
"text": "Q1",
"column_index": 1,
"row_span": 1,
"col_span": 1,
"confidence": 0.99
}
]
}
]
}
]
}
CSV Format
python
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"table_format": "csv"
}
)
# Each table as CSV string
for i, table in enumerate(response["tables"]):
with open(f"table_{i}.csv", "w") as f:
f.write(table["csv"])
DataFrame Format
python
import pandas as pd
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"table_format": "json"
}
)
# Convert to pandas DataFrame
for table in response["tables"]:
df = pd.DataFrame(table["data"])
if table["has_header"]:
df.columns = df.iloc[0]
df = df[1:]
print(df)
Excel Format
python
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"table_format": "xlsx",
"combine_tables": True # All tables in one workbook
}
)
# Download Excel file
with open("tables.xlsx", "wb") as f:
f.write(response["excel_data"])
Table Detection Options
Detection Modes
python
# Automatic detection (default)
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"table_detection": "auto"
}
)
# Bordered tables only
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"table_detection": "bordered"
}
)
# Include borderless/whitespace tables
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"table_detection": "all",
"min_columns": 2,
"min_rows": 2
}
)
Region-Based Extraction
python
# Extract table from specific region
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"regions": [
{
"page": 1,
"bbox": [50, 100, 550, 400] # x1, y1, x2, y2
}
]
}
)
Complex Table Handling
Merged Cells
python
response = client.post(
"/data/ocr",
files={"file": open("complex_table.pdf", "rb")},
data={
"extract_tables": True,
"handle_merged_cells": True
}
)
table = response["tables"][0]
for row in table["rows"]:
for cell in row["cells"]:
if cell["row_span"] > 1 or cell["col_span"] > 1:
print(f"Merged cell: {cell['text']}")
print(f" Spans: {cell['row_span']} rows, {cell['col_span']} cols")
Multi-Row Headers
python
response = client.post(
"/data/ocr",
files={"file": open("multi_header.pdf", "rb")},
data={
"extract_tables": True,
"multi_row_header": True,
"header_rows": 2
}
)
table = response["tables"][0]
# First two rows are header
headers = table["rows"][:2]
data = table["rows"][2:]
Nested Tables
python
response = client.post(
"/data/ocr",
files={"file": open("nested_tables.pdf", "rb")},
data={
"extract_tables": True,
"detect_nested": True
}
)
for table in response["tables"]:
if table.get("parent_table"):
print(f"Nested table in table {table['parent_table']}")
else:
print(f"Top-level table: {table['table_id']}")
Tables Spanning Pages
python
response = client.post(
"/data/ocr",
files={"file": open("multi_page_table.pdf", "rb")},
data={
"extract_tables": True,
"merge_split_tables": True,
"split_detection": "header_repeat" # or "continuation"
}
)
# Tables spanning pages are merged
for table in response["tables"]:
if table.get("spans_pages"):
print(f"Table spans pages: {table['page_range']}")
Data Type Detection
Automatic Type Inference
python
response = client.post(
"/data/ocr",
files={"file": open("data_table.pdf", "rb")},
data={
"extract_tables": True,
"infer_types": True
}
)
table = response["tables"][0]
for col in table["columns"]:
print(f"Column: {col['name']}")
print(f" Type: {col['inferred_type']}") # string, number, date, currency, etc.
print(f" Format: {col['format']}")
Supported Data Types
| Type | Examples | Format Detection |
|---|---|---|
| string | "Product A", "Notes" | Default |
| integer | 42, 1000 | Whole numbers |
| decimal | 3.14, 99.99 | Floating point |
| currency | $100.00, €50 | Currency symbols |
| percentage | 25%, 0.25 | Percent signs |
| date | 2026-02-17, Feb 17 | Date patterns |
| time | 14:30, 2:30 PM | Time patterns |
| boolean | Yes/No, True/False | Boolean patterns |
Custom Type Mapping
python
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"type_mapping": {
"column_patterns": {
"Amount|Total|Price": "currency",
"Date|Created|Modified": "date",
"Qty|Quantity|Count": "integer"
}
}
}
)
Validation and Quality
Confidence Scores
python
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"include_confidence": True
}
)
table = response["tables"][0]
# Check overall table confidence
print(f"Table confidence: {table['confidence']}")
# Check cell-level confidence
low_confidence_cells = []
for row in table["rows"]:
for cell in row["cells"]:
if cell["confidence"] < 0.8:
low_confidence_cells.append({
"position": f"({cell['row_index']}, {cell['column_index']})",
"text": cell["text"],
"confidence": cell["confidence"]
})
if low_confidence_cells:
print("Low confidence cells:")
for cell in low_confidence_cells:
print(f" {cell['position']}: '{cell['text']}' ({cell['confidence']:.2f})")
Validation Rules
python
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"validation": {
"rules": [
{
"column": "Amount",
"type": "numeric",
"min": 0
},
{
"column": "Date",
"type": "date",
"format": "YYYY-MM-DD"
},
{
"column": "Email",
"type": "pattern",
"pattern": r"^[\w.-]+@[\w.-]+\.\w+$"
}
]
}
}
)
# Check validation results
for table in response["tables"]:
if table.get("validation_errors"):
print(f"Validation errors in table {table['table_id']}:")
for error in table["validation_errors"]:
print(f" Row {error['row']}, Col {error['column']}: {error['message']}")
Post-Processing
Clean Extracted Data
python
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"post_processing": {
"trim_whitespace": True,
"normalize_numbers": True, # 1,000.00 -> 1000.00
"normalize_dates": "ISO", # -> YYYY-MM-DD
"remove_currency_symbols": True,
"convert_percentages": True # 25% -> 0.25
}
}
)
Header Normalization
python
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"normalize_headers": {
"lowercase": True,
"replace_spaces": "_",
"remove_special_chars": True
}
}
)
# Headers transformed: "Product Name" -> "product_name"
Batch Processing
Multiple Documents
python
# Process multiple documents
files = ["report1.pdf", "report2.pdf", "report3.pdf"]
all_tables = []
for file_path in files:
response = client.post(
"/data/ocr",
files={"file": open(file_path, "rb")},
data={
"extract_tables": True,
"table_format": "json"
}
)
for table in response["tables"]:
table["source_file"] = file_path
all_tables.append(table)
print(f"Total tables extracted: {len(all_tables)}")
Async Batch Processing
python
# Submit batch job
batch = client.post(
"/data/ocr/batch",
json={
"documents": [
{"url": "https://example.com/doc1.pdf"},
{"url": "https://example.com/doc2.pdf"},
{"url": "https://example.com/doc3.pdf"}
],
"options": {
"extract_tables": True,
"table_format": "json"
}
}
)
batch_id = batch["batch_id"]
# Check status
status = client.get(f"/data/ocr/batch/{batch_id}")
print(f"Progress: {status['completed']}/{status['total']}")
Integration Examples
Export to Database
python
import sqlite3
response = client.post(
"/data/ocr",
files={"file": open("data.pdf", "rb")},
data={
"extract_tables": True,
"table_format": "json",
"infer_types": True
}
)
# Create SQLite table from extracted data
conn = sqlite3.connect("extracted_data.db")
cursor = conn.cursor()
for table in response["tables"]:
# Create table schema
columns = [f"{col['name']} {col['sql_type']}" for col in table["columns"]]
create_sql = f"CREATE TABLE {table['table_id']} ({', '.join(columns)})"
cursor.execute(create_sql)
# Insert data
for row in table["rows"][1:]: # Skip header
values = [cell["text"] for cell in row["cells"]]
placeholders = ", ".join(["?" for _ in values])
cursor.execute(f"INSERT INTO {table['table_id']} VALUES ({placeholders})", values)
conn.commit()
Export to Spreadsheet
python
from openpyxl import Workbook
response = client.post(
"/data/ocr",
files={"file": open("document.pdf", "rb")},
data={
"extract_tables": True,
"table_format": "json"
}
)
wb = Workbook()
for i, table in enumerate(response["tables"]):
ws = wb.create_sheet(title=f"Table_{i+1}")
for row_idx, row in enumerate(table["rows"], 1):
for col_idx, cell in enumerate(row["cells"], 1):
ws.cell(row=row_idx, column=col_idx, value=cell["text"])
wb.remove(wb["Sheet"]) # Remove default sheet
wb.save("extracted_tables.xlsx")
Best Practices
- Use bordered tables when possible - Higher accuracy
- Specify expected structure - Helps with ambiguous layouts
- Enable type inference - Get properly typed data
- Check confidence scores - Flag uncertain extractions
- Handle merged cells explicitly - Don't assume simple grids
- Use validation rules - Catch extraction errors early
- Test with representative samples - Validate extraction quality
Next Steps
- Supported Formats - File format reference
- Mistral Document AI - OCR engine
- Document Ingestion - Full pipeline