Skip to content

Table Extraction

Extract structured tables from documents with high accuracy.

Overview

GateFlow's table extraction uses advanced AI to detect, structure, and extract tabular data from documents, including complex layouts, merged cells, and borderless tables.

Basic Table Extraction

Extract All Tables

python
from openai import OpenAI

# GateFlow exposes an OpenAI-compatible API: point the client at its base URL.
client = OpenAI(
    base_url="https://api.gateflow.ai/v1",
    api_key="gw_prod_..."
)

# Extract tables from a document.
# Use a context manager so the file handle is closed even if the request fails.
with open("financial_report.pdf", "rb") as f:
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "table_format": "json"
        }
    )

print(f"Found {len(response['tables'])} tables")

# Summarize each detected table (page, dimensions, header flag).
for i, table in enumerate(response["tables"]):
    print(f"\nTable {i + 1}:")
    print(f"  Page: {table['page']}")
    print(f"  Rows: {table['row_count']}")
    print(f"  Columns: {table['column_count']}")
    print(f"  Has header: {table['has_header']}")

cURL Example

bash
# Upload the PDF as multipart form data and request tables in CSV format.
curl -X POST https://api.gateflow.ai/v1/data/ocr \
  -H "Authorization: Bearer gw_prod_..." \
  -F "file=@report.pdf" \
  -F "extract_tables=true" \
  -F "table_format=csv"

Output Formats

JSON Format

python
# Request tables as structured JSON (rows of cells with text + metadata).
# Context manager closes the upload handle after the request completes.
with open("document.pdf", "rb") as f:
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "table_format": "json"
        }
    )

table = response["tables"][0]

# Access structured data: walk rows, print cells tab-separated per row.
for row in table["rows"]:
    for cell in row["cells"]:
        print(f"  {cell['text']}", end="\t")
    print()

JSON Output Structure:

json
{
  "tables": [
    {
      "table_id": "table_001",
      "page": 1,
      "bbox": [50, 100, 550, 400],
      "row_count": 5,
      "column_count": 4,
      "has_header": true,
      "header_row": 0,
      "rows": [
        {
          "row_index": 0,
          "is_header": true,
          "cells": [
            {
              "text": "Product",
              "column_index": 0,
              "row_span": 1,
              "col_span": 1,
              "confidence": 0.98
            },
            {
              "text": "Q1",
              "column_index": 1,
              "row_span": 1,
              "col_span": 1,
              "confidence": 0.99
            }
          ]
        }
      ]
    }
  ]
}

CSV Format

python
# Request tables pre-rendered as CSV strings.
with open("document.pdf", "rb") as f:  # closed automatically after the request
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "table_format": "csv"
        }
    )

# Each table arrives as a CSV string; write one file per table.
for i, table in enumerate(response["tables"]):
    with open(f"table_{i}.csv", "w") as out:
        out.write(table["csv"])

DataFrame Format

python
import pandas as pd

# Request JSON so each table carries a 2-D "data" array pandas can consume.
with open("document.pdf", "rb") as f:
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "table_format": "json"
        }
    )

# Convert each table to a pandas DataFrame.
for table in response["tables"]:
    df = pd.DataFrame(table["data"])
    if table["has_header"]:
        # Promote the first row to column labels, drop it from the data,
        # and renumber rows so the frame's index starts at 0 again.
        df.columns = df.iloc[0]
        df = df[1:].reset_index(drop=True)
    print(df)

Excel Format

python
# Request all tables rendered into a single Excel workbook.
with open("document.pdf", "rb") as f:
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "table_format": "xlsx",
            "combine_tables": True  # All tables in one workbook
        }
    )

# Download Excel file.
# NOTE(review): assumes "excel_data" is raw bytes — confirm it is not base64-encoded.
with open("tables.xlsx", "wb") as out:
    out.write(response["excel_data"])

Table Detection Options

Detection Modes

python
# Automatic detection (default)
response = client.post(
    "/data/ocr",
    files={"file": open("document.pdf", "rb")},
    data={
        "extract_tables": True,
        "table_detection": "auto"
    }
)

# Bordered tables only
response = client.post(
    "/data/ocr",
    files={"file": open("document.pdf", "rb")},
    data={
        "extract_tables": True,
        "table_detection": "bordered"
    }
)

# Include borderless/whitespace tables
response = client.post(
    "/data/ocr",
    files={"file": open("document.pdf", "rb")},
    data={
        "extract_tables": True,
        "table_detection": "all",
        "min_columns": 2,
        "min_rows": 2
    }
)

Region-Based Extraction

python
# Extract table from specific region
response = client.post(
    "/data/ocr",
    files={"file": open("document.pdf", "rb")},
    data={
        "extract_tables": True,
        "regions": [
            {
                "page": 1,
                "bbox": [50, 100, 550, 400]  # x1, y1, x2, y2
            }
        ]
    }
)

Complex Table Handling

Merged Cells

python
# Ask the API to resolve merged cells instead of duplicating their text.
with open("complex_table.pdf", "rb") as f:
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "handle_merged_cells": True
        }
    )

table = response["tables"][0]
# A span greater than 1 in either direction marks a merged cell.
for row in table["rows"]:
    for cell in row["cells"]:
        if cell["row_span"] > 1 or cell["col_span"] > 1:
            print(f"Merged cell: {cell['text']}")
            print(f"  Spans: {cell['row_span']} rows, {cell['col_span']} cols")

Multi-Row Headers

python
# Single source of truth: keep the request option and the slicing in sync.
HEADER_ROWS = 2

with open("multi_header.pdf", "rb") as f:  # closed after the request
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "multi_row_header": True,
            "header_rows": HEADER_ROWS
        }
    )

table = response["tables"][0]
# The first HEADER_ROWS rows are the header; everything after is data.
headers = table["rows"][:HEADER_ROWS]
data = table["rows"][HEADER_ROWS:]

Nested Tables

python
# Detect tables embedded inside other tables' cells.
with open("nested_tables.pdf", "rb") as f:  # closed after the request
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "detect_nested": True
        }
    )

# Nested tables reference their container via "parent_table".
for table in response["tables"]:
    if table.get("parent_table"):
        print(f"Nested table in table {table['parent_table']}")
    else:
        print(f"Top-level table: {table['table_id']}")

Tables Spanning Pages

python
# Merge fragments of a table that continues across page breaks.
with open("multi_page_table.pdf", "rb") as f:  # closed after the request
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "merge_split_tables": True,
            "split_detection": "header_repeat"  # or "continuation"
        }
    )

# Tables spanning pages are merged; "page_range" reports the covered pages.
for table in response["tables"]:
    if table.get("spans_pages"):
        print(f"Table spans pages: {table['page_range']}")

Data Type Detection

Automatic Type Inference

python
# Enable per-column data type inference on the extracted tables.
with open("data_table.pdf", "rb") as f:  # closed after the request
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "infer_types": True
        }
    )

table = response["tables"][0]
# Each column reports its inferred type and detected format.
for col in table["columns"]:
    print(f"Column: {col['name']}")
    print(f"  Type: {col['inferred_type']}")  # string, number, date, currency, etc.
    print(f"  Format: {col['format']}")

Supported Data Types

| Type       | Examples             | Format Detection |
| ---------- | -------------------- | ---------------- |
| string     | "Product A", "Notes" | Default          |
| integer    | 42, 1000             | Whole numbers    |
| decimal    | 3.14, 99.99          | Floating point   |
| currency   | $100.00, €50         | Currency symbols |
| percentage | 25%, 0.25            | Percent signs    |
| date       | 2026-02-17, Feb 17   | Date patterns    |
| time       | 14:30, 2:30 PM       | Time patterns    |
| boolean    | Yes/No, True/False   | Boolean patterns |

Custom Type Mapping

python
# Override inference with explicit column-name-pattern -> type rules.
with open("document.pdf", "rb") as f:  # closed after the request
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "type_mapping": {
                "column_patterns": {
                    "Amount|Total|Price": "currency",
                    "Date|Created|Modified": "date",
                    "Qty|Quantity|Count": "integer"
                }
            }
        }
    )

Validation and Quality

Confidence Scores

python
# Request per-table and per-cell OCR confidence scores.
with open("document.pdf", "rb") as f:  # closed after the request
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "include_confidence": True
        }
    )

table = response["tables"][0]

# Check overall table confidence
print(f"Table confidence: {table['confidence']}")

# Collect cells below the 0.8 confidence threshold for manual review.
low_confidence_cells = [
    {
        "position": f"({cell['row_index']}, {cell['column_index']})",
        "text": cell["text"],
        "confidence": cell["confidence"]
    }
    for row in table["rows"]
    for cell in row["cells"]
    if cell["confidence"] < 0.8
]

if low_confidence_cells:
    print("Low confidence cells:")
    for cell in low_confidence_cells:
        print(f"  {cell['position']}: '{cell['text']}' ({cell['confidence']:.2f})")

Validation Rules

python
# Attach server-side validation rules; violations come back per table.
with open("document.pdf", "rb") as f:  # closed after the request
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "validation": {
                "rules": [
                    {
                        "column": "Amount",
                        "type": "numeric",
                        "min": 0
                    },
                    {
                        "column": "Date",
                        "type": "date",
                        "format": "YYYY-MM-DD"
                    },
                    {
                        "column": "Email",
                        "type": "pattern",
                        "pattern": r"^[\w.-]+@[\w.-]+\.\w+$"
                    }
                ]
            }
        }
    )

# Check validation results: each error pinpoints a row/column pair.
for table in response["tables"]:
    if table.get("validation_errors"):
        print(f"Validation errors in table {table['table_id']}:")
        for error in table["validation_errors"]:
            print(f"  Row {error['row']}, Col {error['column']}: {error['message']}")

Post-Processing

Clean Extracted Data

python
# Apply server-side cleanup to the extracted cell values.
with open("document.pdf", "rb") as f:  # closed after the request
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "post_processing": {
                "trim_whitespace": True,
                "normalize_numbers": True,      # 1,000.00 -> 1000.00
                "normalize_dates": "ISO",       # -> YYYY-MM-DD
                "remove_currency_symbols": True,
                "convert_percentages": True     # 25% -> 0.25
            }
        }
    )

Header Normalization

python
# Normalize header text into machine-friendly identifiers.
with open("document.pdf", "rb") as f:  # closed after the request
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "normalize_headers": {
                "lowercase": True,
                "replace_spaces": "_",
                "remove_special_chars": True
            }
        }
    )

# Headers transformed: "Product Name" -> "product_name"

Batch Processing

Multiple Documents

python
# Process multiple documents
files = ["report1.pdf", "report2.pdf", "report3.pdf"]
all_tables = []

for file_path in files:
    response = client.post(
        "/data/ocr",
        files={"file": open(file_path, "rb")},
        data={
            "extract_tables": True,
            "table_format": "json"
        }
    )

    for table in response["tables"]:
        table["source_file"] = file_path
        all_tables.append(table)

print(f"Total tables extracted: {len(all_tables)}")

Async Batch Processing

python
# Submit batch job
batch = client.post(
    "/data/ocr/batch",
    json={
        "documents": [
            {"url": "https://example.com/doc1.pdf"},
            {"url": "https://example.com/doc2.pdf"},
            {"url": "https://example.com/doc3.pdf"}
        ],
        "options": {
            "extract_tables": True,
            "table_format": "json"
        }
    }
)

batch_id = batch["batch_id"]

# Check status
status = client.get(f"/data/ocr/batch/{batch_id}")
print(f"Progress: {status['completed']}/{status['total']}")

Integration Examples

Export to Database

python
import sqlite3

# Extract typed tables so each column carries a ready-made SQL type.
with open("data.pdf", "rb") as f:  # closed after the request
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "table_format": "json",
            "infer_types": True
        }
    )

# Create SQLite tables from the extracted data. The connection is closed in
# a finally block so it is released even if an insert fails.
conn = sqlite3.connect("extracted_data.db")
try:
    cursor = conn.cursor()

    for table in response["tables"]:
        # Table/column names come from external document content, so quote
        # them as identifiers; values still go through "?" placeholders.
        columns = ", ".join(
            f'"{col["name"]}" {col["sql_type"]}' for col in table["columns"]
        )
        cursor.execute(f'CREATE TABLE "{table["table_id"]}" ({columns})')

        # Insert all data rows at once (row 0 is the header).
        rows = [
            [cell["text"] for cell in row["cells"]]
            for row in table["rows"][1:]
        ]
        if rows:
            placeholders = ", ".join("?" for _ in rows[0])
            cursor.executemany(
                f'INSERT INTO "{table["table_id"]}" VALUES ({placeholders})',
                rows,
            )

    conn.commit()
finally:
    conn.close()

Export to Spreadsheet

python
from openpyxl import Workbook

# Fetch tables as JSON, then render them into an Excel workbook locally.
with open("document.pdf", "rb") as f:  # closed after the request
    response = client.post(
        "/data/ocr",
        files={"file": f},
        data={
            "extract_tables": True,
            "table_format": "json"
        }
    )

wb = Workbook()

# One worksheet per extracted table; openpyxl cells are 1-indexed.
for i, table in enumerate(response["tables"]):
    ws = wb.create_sheet(title=f"Table_{i+1}")

    for row_idx, row in enumerate(table["rows"], 1):
        for col_idx, cell in enumerate(row["cells"], 1):
            ws.cell(row=row_idx, column=col_idx, value=cell["text"])

# Drop the default sheet only if at least one table sheet was added —
# saving a workbook with zero sheets raises an error.
if response["tables"]:
    wb.remove(wb["Sheet"])
wb.save("extracted_tables.xlsx")

Best Practices

  1. Use bordered tables when possible - Higher accuracy
  2. Specify expected structure - Helps with ambiguous layouts
  3. Enable type inference - Get properly typed data
  4. Check confidence scores - Flag uncertain extractions
  5. Handle merged cells explicitly - Don't assume simple grids
  6. Use validation rules - Catch extraction errors early
  7. Test with representative samples - Validate extraction quality

Next Steps

Built with reliability in mind.