OCR Bench Harness ← bench essay · read the book

The complete bench harness.

Every script we used to OCR a 443-page scanned physics book through four engines, diff the outputs, and pick a winner. Copy what you need.

Workspace.

projects/ocr-bench/
├── input/
│   ├── book.pdf              # 101 MB original (not redistributed)
│   ├── sample.pdf            # 10-page subset
│   └── pages/                # 443 × p-NNN.png at 200dpi (via pdftoppm -r 200)
├── sample/                   # 10 × benchmark PNGs
├── out/
│   ├── gemini-3-1-pro/       # 10-page sample, Gemini 3.1 Pro Preview
│   ├── gemini-3-1-pro-full/  # 443-page full book
│   ├── book-gemini.md        # raw assembled markdown
│   ├── book-gemini-clean.md  # cleaned, with page anchors (1 MB)
│   ├── m4max-olmocr/         # 10-page MLX sample
│   ├── hyperion-olmocr/      # 10-page Hyperion vLLM sample
│   └── book-convert/         # 10-page Tesseract sample
├── scripts/
│   ├── ocr_gemini.py         # REST OCR per page (Gemini)
│   ├── ocr_mlx_batch.py      # MLX batch OCR (model loaded once)
│   ├── ocr_hyperion.py       # vLLM REST OCR (Hyperion 5090)
│   ├── run_gemini_book.sh    # 8-way parallel full book
│   ├── clean_book.py         # strip Gemini code-fence wrappers, add anchors
│   └── bench.py              # 4-way diff/aggregate
└── report/
    ├── BENCH.md              # analytical summary
    └── BENCH-raw.md          # per-page diff tables

Environment.

Mac (host machine for Gemini + MLX + bench)

cd projects/ocr-bench
uv venv .venv --python 3.12
source .venv/bin/activate
uv pip install mlx-vlm torch torchvision pypdf
brew install poppler   # for pdftoppm

Hyperion (CUDA 12.8 on RTX 5090, WSL2 Ubuntu)

cd ~/ocr-bench
uv venv --python 3.11 .venv
VIRTUAL_ENV=$PWD/.venv uv pip install "olmocr[gpu]" \
    --extra-index-url https://download.pytorch.org/whl/cu128 \
    --index-strategy unsafe-best-match

Gemini API key

export GEMINI_API_KEY=AIza...   # https://aistudio.google.com/apikey

Rasterize the PDF.

mkdir -p input/pages
pdftoppm -r 200 -png input/book.pdf input/pages/p

200 dpi is the sweet spot. Higher resolution (300 dpi) doesn't materially help VLM OCR quality and inflates per-page tokens. Lower (150 dpi) starts to lose subscript fidelity.

OCR one page via REST.

scripts/ocr_gemini.py
#!/usr/bin/env python3
"""OCR a single page PNG via Gemini 3.1 Pro REST API. Outputs markdown to stdout."""
import base64, json, os, sys, time
from pathlib import Path
import urllib.request, urllib.error

# Model id; override per run with the OCR_GEMINI_MODEL environment variable.
MODEL = os.environ.get("OCR_GEMINI_MODEL", "gemini-3.1-pro-preview")
# API key is taken from the environment only.
KEY = os.environ.get("GEMINI_API_KEY")
# Exit at import time with a readable message when the key is absent,
# before any request is built.
if not KEY: sys.exit("GEMINI_API_KEY not set")

# Instruction text sent as the text part of every OCR request. It is runtime
# data: changing a character changes what the model is asked to do.
PROMPT = """You are a precision OCR engine for an academic physics monograph.
Transcribe this single book page to clean Markdown with LaTeX math.

Rules:
- Preserve reading order. Body text as prose paragraphs.
- All math (inline and display) as LaTeX: $...$ inline, $$...$$ display.
- Equation numbers like (5.59) appear as plain trailing text on the display line.
- Section headers as `##` / `###` matching apparent level.
- Page-edge watermark must be omitted.
- Page number at top corners: record as `<!-- page N -->` at top of output.
- Figures: emit `[FIGURE: brief caption]` placeholder.
- Tables: GitHub-flavored Markdown table.
- Do not add commentary. Just transcribe.
- If illegible, use `⟨?⟩`."""

def ocr_one(img):
    """OCR one page image through the Gemini ``generateContent`` REST endpoint.

    Args:
        img: ``pathlib.Path`` (anything with ``read_bytes()``) of a PNG page.

    Returns:
        A dict with ``text`` (the transcription), ``elapsed`` (seconds spent
        on the HTTP round trip) and ``usage`` (the response's
        ``usageMetadata``, or ``{}`` when absent). If the first candidate
        finished with ``RECITATION``, the result of
        ``ocr_one_alt(img, "gemini-2.5-pro")`` is returned instead.

    Raises:
        urllib.error.URLError: transport or HTTP failure (``HTTPError`` is a
            subclass); a timeout may surface as ``TimeoutError``.
        KeyError, IndexError: the response holds no usable candidate text.
    """
    body = {
        "contents": [{"parts": [
            {"text": PROMPT},
            {"inline_data": {"mime_type": "image/png",
                             "data": base64.b64encode(img.read_bytes()).decode()}},
        ]}],
        "generationConfig": {"temperature": 0.0, "maxOutputTokens": 8192},
        "safetySettings": [{"category": c, "threshold": "BLOCK_NONE"} for c in [
            "HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_CIVIC_INTEGRITY"]],
    }
    # The key goes in the x-goog-api-key header, not the query string, so it
    # cannot leak through URLs recorded by proxies, tracebacks or logs.
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent"
    req = urllib.request.Request(url, json.dumps(body).encode(),
        headers={"Content-Type": "application/json", "x-goog-api-key": KEY},
        method="POST")
    # perf_counter is monotonic, so a wall-clock adjustment during the request
    # cannot distort the measured duration.
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=180) as r:
        data = json.loads(r.read())
    elapsed = time.perf_counter() - t0
    if "candidates" not in data:
        # Still a KeyError, but it now carries the API's stated reason.
        raise KeyError(
            f"candidates missing from Gemini response; "
            f"promptFeedback={data.get('promptFeedback')!r}")
    cand = data["candidates"][0]
    if cand.get("finishReason") == "RECITATION":
        # Fallback to gemini-2.5-pro for copyright-filtered pages.
        # NOTE(review): ocr_one_alt is not defined in this excerpt - confirm
        # it exists elsewhere in the script.
        return ocr_one_alt(img, "gemini-2.5-pro")
    parts = cand["content"]["parts"]
    # The first part must carry text (same failure mode as before); any further
    # text parts are appended so a multi-part answer is not truncated.
    text = parts[0]["text"] + "".join(p.get("text", "") for p in parts[1:])
    return {"text": text, "elapsed": elapsed,
            "usage": data.get("usageMetadata", {})}

# usage: ocr_gemini.py page.png out_dir/

~3 of 443 pages hit Google's RECITATION copyright filter. The fallback to gemini-2.5-pro recovers all of them. Without the fallback, you'd have holes.

Local batch on Apple Silicon.

scripts/ocr_mlx_batch.py
#!/usr/bin/env python3
"""Batch OCR via olmOCR-2 on MLX. Loads model once, iterates over images."""
import json, os, sys, time
from pathlib import Path

# MLX checkpoint passed to mlx_vlm.load(); loaded once for the whole batch.
MODEL = "mlx-community/olmOCR-2-7B-1025-mlx-8bit"
# Instruction text given to the model for every page (runtime data).
PROMPT = """Below is an image of a single book page from an academic physics monograph.
Transcribe ALL natural text content of the page to clean Markdown.
- Math as LaTeX: $...$ inline, $$...$$ display.
- Equation numbers as trailing text.
- Strip page-edge watermarks.
- Record top-corner page number as `<!-- page N -->`.
- No commentary."""

from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

# usage: ocr_mlx_batch.py OUT_DIR PAGE.png [PAGE.png ...]
if len(sys.argv) < 2:
    sys.exit("usage: ocr_mlx_batch.py OUT_DIR PAGE.png [PAGE.png ...]")

outdir = Path(sys.argv[1])
images = [Path(p) for p in sys.argv[2:]]
outdir.mkdir(parents=True, exist_ok=True)

# A page counts as done once its markdown exists; this is what makes an
# interrupted batch resumable. If nothing is pending, skip the model load.
if all((outdir / f"{img.stem}.md").exists() for img in images):
    sys.exit(0)

model, processor = load(MODEL)
cfg = load_config(MODEL)
# The chat-formatted prompt is identical for every page, so build it once.
formatted = apply_chat_template(processor, cfg, PROMPT, num_images=1)

for img in images:
    stem = img.stem
    # Re-checked per page so a stem listed twice is only generated once.
    if (outdir / f"{stem}.md").exists():
        continue
    t0 = time.time()
    text = generate(model, processor, formatted, image=[str(img)],
                    max_tokens=4096, temperature=0.0, verbose=False)
    elapsed = time.time() - t0
    # generate() may return a plain string or an object exposing `.text`.
    markdown = text if isinstance(text, str) else text.text
    # Explicit UTF-8: the transcription can contain non-ASCII symbols and must
    # not depend on the locale's default encoding.
    (outdir / f"{stem}.md").write_text(markdown, encoding="utf-8")
    (outdir / f"{stem}.meta.json").write_text(json.dumps({"elapsed": elapsed}),
                                              encoding="utf-8")
    print(f"OK {stem}  {elapsed:.1f}s")

One model load (~1.1 s warm cache, ~30 s cold) amortised over the batch. 17.5 s/page on M4 Max.

Self-hosted on a 5090 (or 4090).

# On Hyperion:
nohup ./.venv/bin/vllm serve allenai/olmOCR-2-7B-1025-FP8 \
    --max-model-len 16384 \
    --gpu-memory-utilization 0.6 \
    --port 30024 \
    > logs/vllm.log 2>&1 &

# On client:
ssh -p 2222 -N -L 30024:localhost:30024 user@hyperion-1 &
VLLM_BASE=http://localhost:30024/v1 python scripts/ocr_hyperion.py out/dir page.png
scripts/ocr_hyperion.py
"""Call Hyperion vLLM directly over OpenAI-compatible chat completions."""
import base64, json, os, sys, time, urllib.request
from pathlib import Path

# Root of the OpenAI-compatible API; the VLLM_BASE environment variable
# overrides the local default.
VLLM_BASE = os.getenv("VLLM_BASE", "http://localhost:30024/v1")
# Model name sent in each chat-completions request.
MODEL = "allenai/olmOCR-2-7B-1025-FP8"

def ocr_one(img):
    """Transcribe one page image via the vLLM chat-completions endpoint.

    ``img`` must provide ``read_bytes()`` (e.g. a ``pathlib.Path``). The image
    is sent inline as a base64 ``data:`` URL next to the text prompt.

    Returns a dict with ``text`` (first choice's message content), ``elapsed``
    (seconds since just before the request was sent) and ``usage`` (the
    response's ``usage`` object, or ``{}`` if missing).
    """
    text_part = {"type": "text", "text": PROMPT}
    encoded = base64.b64encode(img.read_bytes()).decode()
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded}"},
    }
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": [text_part, image_part]}],
        "max_tokens": 4096,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{VLLM_BASE}/chat/completions",
        json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    started = time.time()
    with urllib.request.urlopen(request, timeout=300) as response:
        reply = json.loads(response.read())
    answer = reply["choices"][0]["message"]["content"]
    elapsed = time.time() - started
    return {"text": answer, "elapsed": elapsed, "usage": reply.get("usage", {})}

vLLM binds to localhost only by default. An SSH tunnel is the easy fix; passing --host 0.0.0.0 to vllm serve is the other.

Run the winner across all 443 pages.

scripts/run_gemini_book.sh
#!/usr/bin/env bash
# 8-way parallel Gemini OCR over all pages.
#
# Runs scripts/ocr_gemini.py once per input/pages/p-*.png, MAX_PARALLEL at a
# time (default 8). Pages whose markdown already exists in the output
# directory are skipped, so the script can be re-run to fill gaps.
# Per-page stdout/stderr is appended to out/gemini-3-1-pro-full/log/<stem>.log.
set -u
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
OUT="$ROOT/out/gemini-3-1-pro-full"
mkdir -p "$OUT/log"
MAX_PARALLEL=${MAX_PARALLEL:-8}
cd "$ROOT" || exit 1

# OCR a single page unless its markdown is already present.
run_page() {
    local png="$1"
    local stem; stem=$(basename "$png" .png)
    [[ -f "$OUT/$stem.md" ]] && { echo "SKIP $stem"; return 0; }
    python3 scripts/ocr_gemini.py "$png" "$OUT/" >> "$OUT/log/$stem.log" 2>&1
}
# xargs starts fresh bash processes; they need the function and OUT exported.
export -f run_page; export OUT

# Expand the glob in the shell instead of parsing `ls`; nullglob makes an
# empty directory yield an empty array rather than the literal pattern.
shopt -s nullglob
pages=(input/pages/p-*.png)
shopt -u nullglob
if (( ${#pages[@]} == 0 )); then
    echo "no page images found in $ROOT/input/pages" >&2
    exit 1
fi

# NUL-delimited paths, one per invocation. -I is dropped because xargs treats
# it as mutually exclusive with -n; the path reaches run_page as "$1".
printf '%s\0' "${pages[@]}" \
    | xargs -0 -n 1 -P "$MAX_PARALLEL" bash -c 'run_page "$1"' _

# Report pages that still have no markdown so failures are not hidden behind
# the final DONE; the details are in the per-page logs.
missing=0
for png in "${pages[@]}"; do
    [[ -f "$OUT/$(basename "$png" .png).md" ]] || missing=$((missing + 1))
done
if (( missing > 0 )); then
    echo "WARN $missing page(s) have no output; see $OUT/log/" >&2
fi
echo "DONE"

Bench script.

scripts/bench.py
#!/usr/bin/env python3
"""4-way OCR diff/bench report. Counts equations across both notations
(markdown `$..$` `$$..$$` and LaTeX `\\(..\\)` `\\[..\\]`), watermark leaks,
OCR artifacts (letter-digit-letter), and per-page wall time."""
import json, re, difflib
from pathlib import Path

# Project root is the directory above the one holding this script.
ROOT = Path(__file__).resolve().parent.parent
# All engine outputs live under <root>/out/<engine-dir>/.
OUT = ROOT.joinpath("out")
# Stems of the benchmark page images, in sorted order.
SAMPLES = [png.stem for png in ROOT.joinpath("sample").glob("*.png")]
SAMPLES.sort()

# Output directory name -> human-readable engine label.
ENGINES = dict([
    ("gemini-3-1-pro", "Gemini 3.1 Pro Preview (cloud)"),
    ("m4max-olmocr", "olmOCR-2-7B mlx-8bit (M4 Max)"),
    ("hyperion-olmocr", "olmOCR-2-7B FP8 (Hyperion 5090)"),
    ("book-convert", "book-convert/Tesseract (M4 Max)"),
])

# Patterns are compiled once at import. They are raw strings with single
# escapes: a doubled backslash in a raw string is a literal backslash to the
# regex engine, which would stop `\$`, `\.` and `\d` from meaning what they say.
_INLINE_DOLLAR = re.compile(r"(?<!\$)\$[^\$\n]+\$(?!\$)")   # $...$ but not $$...$$
_INLINE_PAREN = re.compile(r"\\\([^)]+\\\)")                # \( ... \)
_DISPLAY_DOLLAR = re.compile(r"\$\$.+?\$\$", re.DOTALL)     # $$ ... $$ (may span lines)
_DISPLAY_BRACKET = re.compile(r"\\\[.+?\\\]", re.DOTALL)    # \[ ... \] (may span lines)
_WATERMARK = re.compile(r"Downloaded from https://academic\.oup\.com")
# Presumably garbled OCR renderings of the page-edge watermark - confirm
# against the scans before extending this list.
_WATERMARK_GIBBERISH = re.compile(r"(Aseiqr|Agjeyseg|Asesqry|sasn|sdijy|Asesqi[lr])")
_ARTIFACT = re.compile(r"[a-zA-Z]\d[a-zA-Z]")               # letter-digit-letter


def stats(text):
    """Count quality signals in one page of OCR output.

    Args:
        text: The page's transcription; ``None`` or ``""`` yields all zeros.

    Returns:
        A dict with:
        ``len`` - number of characters;
        ``eqs_inline`` - inline equations in ``$..$`` or ``\\(..\\)`` form;
        ``eqs_display`` - display equations in ``$$..$$`` or ``\\[..\\]`` form;
        ``watermark`` - verbatim occurrences of the download watermark;
        ``wm_gib`` - occurrences of the known garbled watermark tokens;
        ``artifacts`` - letter-digit-letter sequences.
    """
    if not text:
        return {"len": 0, "eqs_inline": 0, "eqs_display": 0,
                "watermark": 0, "wm_gib": 0, "artifacts": 0}
    return {
        "len": len(text),
        "eqs_inline": (len(_INLINE_DOLLAR.findall(text))
                       + len(_INLINE_PAREN.findall(text))),
        "eqs_display": (len(_DISPLAY_DOLLAR.findall(text))
                        + len(_DISPLAY_BRACKET.findall(text))),
        "watermark": len(_WATERMARK.findall(text)),
        "wm_gib": len(_WATERMARK_GIBBERISH.findall(text)),
        "artifacts": len(_ARTIFACT.findall(text)),
    }
# ... see /gifts/ocr-bench/scripts/bench.py for full source

What we got.

Bench run 2026-05-26 on an Apple M4 Max (128 GB), an RTX 5090 (32 GB) in a WSL2 host with an i9-13900K, and the free tier of Google's Gemini API (AI Studio key).