Every script we used to OCR a 443-page scanned physics book through four engines, diff the outputs, and pick a winner. Copy what you need.
projects/ocr-bench/
├── input/
│ ├── book.pdf # 101 MB original (not redistributed)
│ ├── sample.pdf # 10-page subset
│ └── pages/ # 443 × p-NNN.png at 200dpi (via pdftoppm -r 200)
├── sample/ # 10 × benchmark PNGs
├── out/
│ ├── gemini-3-1-pro/ # 10-page sample, Gemini 3.1 Pro Preview
│ ├── gemini-3-1-pro-full/ # 443-page full book
│ ├── book-gemini.md # raw assembled markdown
│ ├── book-gemini-clean.md # cleaned, with page anchors (1 MB)
│ ├── m4max-olmocr/ # 10-page MLX sample
│ ├── hyperion-olmocr/ # 10-page Hyperion vLLM sample
│ └── book-convert/ # 10-page Tesseract sample
├── scripts/
│ ├── ocr_gemini.py # REST OCR per page (Gemini)
│ ├── ocr_mlx_batch.py # MLX batch OCR (model loaded once)
│ ├── ocr_hyperion.py # vLLM REST OCR (Hyperion 5090)
│ ├── run_gemini_book.sh # 8-way parallel full book
│ ├── clean_book.py # strip Gemini code-fence wrappers, add anchors
│ └── bench.py # 4-way diff/aggregate
└── report/
    ├── BENCH.md # analytical summary
    └── BENCH-raw.md # per-page diff tables
# Environment 1: Python 3.12 venv with mlx-vlm; poppler comes from Homebrew,
# so this is presumably the macOS machine — confirm.
cd projects/ocr-bench
uv venv .venv --python 3.12
source .venv/bin/activate
uv pip install mlx-vlm torch torchvision pypdf
brew install poppler # for pdftoppm
# Environment 2: separate Python 3.11 venv under ~/ocr-bench for olmocr[gpu],
# pulling PyTorch wheels from the cu128 index (presumably run on the GPU host,
# not on the machine above — confirm).
cd ~/ocr-bench
uv venv --python 3.11 .venv
VIRTUAL_ENV=$PWD/.venv uv pip install "olmocr[gpu]" \
--extra-index-url https://download.pytorch.org/whl/cu128 \
--index-strategy unsafe-best-match
# API key read by the Gemini OCR script.
export GEMINI_API_KEY=AIza... # https://aistudio.google.com/apikey
# Rasterise the book into one PNG per page at 200 dpi, named with the prefix "p".
mkdir -p input/pages
pdftoppm -r 200 -png input/book.pdf input/pages/p
200 dpi is the sweet spot. Higher resolution (300 dpi) doesn't materially help VLM OCR quality and inflates per-page tokens. Lower (150 dpi) starts to lose subscript fidelity.
#!/usr/bin/env python3
"""OCR a single page PNG via Gemini 3.1 Pro REST API. Outputs markdown to stdout."""
import base64, json, os, sys, time
from pathlib import Path
import urllib.request, urllib.error
# Model id defaults to the 3.1 Pro preview; OCR_GEMINI_MODEL overrides it.
MODEL = os.getenv("OCR_GEMINI_MODEL", "gemini-3.1-pro-preview")

# The API key is mandatory: stop immediately when it is missing.
KEY = os.getenv("GEMINI_API_KEY")
if not KEY:
    sys.exit("GEMINI_API_KEY not set")

# Transcription instructions sent together with every page image.
PROMPT = """You are a precision OCR engine for an academic physics monograph.
Transcribe this single book page to clean Markdown with LaTeX math.
Rules:
- Preserve reading order. Body text as prose paragraphs.
- All math (inline and display) as LaTeX: $...$ inline, $$...$$ display.
- Equation numbers like (5.59) appear as plain trailing text on the display line.
- Section headers as `##` / `###` matching apparent level.
- Page-edge watermark must be omitted.
- Page number at top corners: record as `<!-- page N -->` at top of output.
- Figures: emit `[FIGURE: brief caption]` placeholder.
- Tables: GitHub-flavored Markdown table.
- Do not add commentary. Just transcribe.
- If illegible, use `⟨?⟩`."""
def ocr_one(img):
    """OCR one page image with Gemini and return the transcription.

    Args:
        img: path object of the page PNG (anything with ``read_bytes()``);
            the bytes are sent inline as base64.

    Returns:
        dict with ``text`` (the transcription), ``elapsed`` (seconds spent on
        the HTTP round trip) and ``usage`` (the response's ``usageMetadata``,
        ``{}`` when absent). When the first candidate finishes with
        ``RECITATION``, the result of ``ocr_one_alt(img, "gemini-2.5-pro")``
        is returned instead.

    Raises:
        urllib.error.URLError: on transport or HTTP errors.
        KeyError, IndexError: when the response holds no usable candidate
            or no text part.
    """
    image_b64 = base64.b64encode(img.read_bytes()).decode()
    body = {
        "contents": [{"parts": [
            {"text": PROMPT},
            {"inline_data": {"mime_type": "image/png", "data": image_b64}},
        ]}],
        "generationConfig": {"temperature": 0.0, "maxOutputTokens": 8192},
        "safetySettings": [{"category": c, "threshold": "BLOCK_NONE"} for c in [
            "HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_CIVIC_INTEGRITY"]],
    }
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent"
    req = urllib.request.Request(
        url,
        json.dumps(body).encode(),
        # The key travels in a header rather than the query string so it does
        # not end up in URLs that get logged or printed in tracebacks.
        headers={"Content-Type": "application/json", "x-goog-api-key": KEY},
        method="POST",
    )
    t0 = time.time()
    with urllib.request.urlopen(req, timeout=180) as r:
        data = json.loads(r.read())
    elapsed = time.time() - t0

    cand = data["candidates"][0]
    if cand.get("finishReason") == "RECITATION":
        # Fallback to gemini-2.5-pro for copyright-filtered pages.
        # ocr_one_alt is expected to be defined elsewhere in this script.
        return ocr_one_alt(img, "gemini-2.5-pro")

    # A reply may be split over several parts; keep every text part (skipping
    # parts flagged as model "thought") instead of only the first one.
    texts = [p["text"] for p in cand["content"]["parts"]
             if "text" in p and not p.get("thought")]
    if not texts:
        # Fail loudly: an empty page written to disk would be skipped on resume.
        raise KeyError(
            f"no text part in candidate (finishReason={cand.get('finishReason')!r})")
    return {"text": "".join(texts), "elapsed": elapsed,
            "usage": data.get("usageMetadata", {})}
# usage: ocr_gemini.py page.png out_dir/
~3 of 443 pages hit Google's RECITATION copyright filter. The fallback to gemini-2.5-pro recovers all of them. Without the fallback, you'd have holes.
#!/usr/bin/env python3
"""Batch OCR via olmOCR-2 on MLX. Loads model once, iterates over images."""
import json, os, sys, time
from pathlib import Path
# Model identifier handed to mlx_vlm's load() and load_config().
MODEL = "mlx-community/olmOCR-2-7B-1025-mlx-8bit"
# Instruction text fed through the chat template and reused for every page.
PROMPT = """Below is an image of a single book page from an academic physics monograph.
Transcribe ALL natural text content of the page to clean Markdown.
- Math as LaTeX: $...$ inline, $$...$$ display.
- Equation numbers as trailing text.
- Strip page-edge watermarks.
- Record top-corner page number as `<!-- page N -->`.
- No commentary."""
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config
# CLI: ocr_mlx_batch.py OUT_DIR PAGE.png [PAGE.png ...]
if len(sys.argv) < 2:
    sys.exit("usage: ocr_mlx_batch.py OUT_DIR PAGE.png [PAGE.png ...]")
outdir = Path(sys.argv[1])
images = [Path(p) for p in sys.argv[2:]]
outdir.mkdir(parents=True, exist_ok=True)

# Load the model and build the prompt once; both are reused for every page.
model, processor = load(MODEL)
cfg = load_config(MODEL)
formatted = apply_chat_template(processor, cfg, PROMPT, num_images=1)

for img in images:
    stem = img.stem
    md_path = outdir / f"{stem}.md"
    # Resume support: a page that already has output is not run again.
    if md_path.exists():
        continue
    t0 = time.time()
    text = generate(model, processor, formatted, image=[str(img)],
                    max_tokens=4096, temperature=0.0, verbose=False)
    elapsed = time.time() - t0
    # generate() may return a plain string or an object exposing .text.
    markdown = text if isinstance(text, str) else text.text
    # Explicit UTF-8: the transcription contains non-ASCII symbols and must not
    # depend on the locale's default encoding.
    md_path.write_text(markdown, encoding="utf-8")
    (outdir / f"{stem}.meta.json").write_text(
        json.dumps({"elapsed": elapsed}), encoding="utf-8")
    print(f"OK {stem} {elapsed:.1f}s")
One model load (~1.1 s warm cache, ~30 s cold) amortised over the batch. 17.5 s/page on M4 Max.
# On Hyperion:
# The redirect below fails (and vllm never starts) when logs/ is missing.
mkdir -p logs
nohup ./.venv/bin/vllm serve allenai/olmOCR-2-7B-1025-FP8 \
    --max-model-len 16384 \
    --gpu-memory-utilization 0.6 \
    --port 30024 \
    > logs/vllm.log 2>&1 &
# On client:
# Forward local port 30024 to the vLLM port on Hyperion (-N: no remote command).
ssh -p 2222 -N -L 30024:localhost:30024 user@hyperion-1 &
# Point the client script at the tunnelled endpoint.
VLLM_BASE=http://localhost:30024/v1 python scripts/ocr_hyperion.py out/dir page.png
"""Call Hyperion vLLM directly over OpenAI-compatible chat completions."""
import base64, json, os, sys, time, urllib.request
from pathlib import Path
# Root of the OpenAI-compatible endpoint; the VLLM_BASE variable overrides it.
VLLM_BASE = os.getenv("VLLM_BASE", "http://localhost:30024/v1")
# Model name sent with every chat-completions request.
MODEL = "allenai/olmOCR-2-7B-1025-FP8"
def ocr_one(img):
    """Transcribe one page image through the vLLM chat-completions endpoint.

    ``img`` is a path object whose bytes are embedded as a base64 PNG data URL
    next to PROMPT. Returns a dict with the reply ``text``, the ``elapsed``
    seconds counted from just before the request was sent, and the
    server-reported ``usage`` (``{}`` when missing).
    """
    encoded = base64.b64encode(img.read_bytes()).decode()
    user_content = [
        {"type": "text", "text": PROMPT},
        {"type": "image_url",
         "image_url": {"url": f"data:image/png;base64,{encoded}"}},
    ]
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": user_content}],
        "max_tokens": 4096,
        "temperature": 0.0,
    }
    request = urllib.request.Request(
        f"{VLLM_BASE}/chat/completions",
        json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    started = time.time()
    with urllib.request.urlopen(request, timeout=300) as resp:
        reply = json.loads(resp.read())

    # Keys are filled in the same order the fields are reported.
    result = {"text": reply["choices"][0]["message"]["content"]}
    result["elapsed"] = time.time() - started
    result["usage"] = reply.get("usage", {})
    return result
vLLM binds to localhost only by default. An SSH tunnel is the easy fix; starting the server with --host 0.0.0.0 is the other.
#!/usr/bin/env bash
# 8-way parallel Gemini OCR over all pages.
set -u
# Resolve the project root (parent of this script's directory). There is no
# `set -e`, so a failed cd would leave ROOT empty and send output to /out/...;
# stop right away instead.
ROOT="$(cd "$(dirname "$0")/.." && pwd)" || exit 1
OUT="$ROOT/out/gemini-3-1-pro-full"
mkdir -p "$OUT/log" || exit 1
# Number of concurrent OCR workers; override through the environment.
MAX_PARALLEL=${MAX_PARALLEL:-8}
# The page glob and the script path used below are relative to the root.
cd "$ROOT" || exit 1
# OCR a single page PNG unless its markdown already exists in $OUT.
# $1: path to the page PNG. Returns 0 on skip, else the OCR script's status.
run_page() {
    local page="$1"
    local name
    name=$(basename "$page" .png)
    if [[ -f "$OUT/$name.md" ]]; then
        echo "SKIP $name"
        return 0
    fi
    # The per-page log collects both stdout and stderr of the OCR script.
    python3 scripts/ocr_gemini.py "$page" "$OUT/" >> "$OUT/log/$name.log" 2>&1
}
# Child shells spawned by xargs need both the function and its output dir.
export -f run_page
export OUT

# Expand the glob in the shell instead of parsing `ls` output; nullglob turns
# "no pages" into an empty list rather than a literal pattern.
shopt -s nullglob
pages=(input/pages/p-*.png)
if [[ ${#pages[@]} -eq 0 ]]; then
    echo "no page PNGs found under input/pages" >&2
    exit 1
fi

# NUL-delimited hand-off keeps every path intact. -I already runs one command
# per item, so it is not combined with the conflicting -n 1.
printf '%s\0' "${pages[@]}" |
    xargs -0 -P "$MAX_PARALLEL" -I{} bash -c 'run_page "$1"' _ {}
echo "DONE"
#!/usr/bin/env python3
"""4-way OCR diff/bench report. Counts equations across both notations
(markdown `$..$` `$$..$$` and LaTeX `\\(..\\)` `\\[..\\]`), watermark leaks,
OCR artifacts (letter-digit-letter), and per-page wall time."""
import json, re, difflib
from pathlib import Path
# Directory holding this script; the project root is its parent.
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPT_DIR.parent
OUT = ROOT.joinpath("out")

# Stems of the benchmark page PNGs found in sample/, in sorted order.
SAMPLES = sorted(png.stem for png in ROOT.joinpath("sample").glob("*.png"))

# Engine key -> human-readable label used in the report.
ENGINES = {
    "gemini-3-1-pro":  "Gemini 3.1 Pro Preview (cloud)",
    "m4max-olmocr":    "olmOCR-2-7B mlx-8bit (M4 Max)",
    "hyperion-olmocr": "olmOCR-2-7B FP8 (Hyperion 5090)",
    "book-convert":    "book-convert/Tesseract (M4 Max)",
}
# Patterns are raw strings with single escapes: in a raw string a doubled
# backslash means "literal backslash", which would turn `\$` into
# "backslash followed by end-of-input" and `\d` into "backslash then d".

# Inline math, Markdown notation: one single-dollar pair on a single line.
_INLINE_DOLLAR = re.compile(r"(?<!\$)\$[^\$\n]+\$(?!\$)")
# Inline math, LaTeX notation: \( ... \). Lazy match so that parentheses
# inside the formula, e.g. \(f(x)\), do not break the count.
_INLINE_PAREN = re.compile(r"\\\(.+?\\\)", re.DOTALL)
# Display math: $$ ... $$ and \[ ... \], possibly spanning several lines.
_DISPLAY_DOLLAR = re.compile(r"\$\$.+?\$\$", re.DOTALL)
_DISPLAY_BRACKET = re.compile(r"\\\[.+?\\\]", re.DOTALL)
# Page-edge watermark, verbatim and as the garbled strings OCR makes of it.
_WATERMARK = re.compile(r"Downloaded from https://academic\.oup\.com")
_WATERMARK_GIBBERISH = re.compile(
    r"(Aseiqr|Agjeyseg|Asesqry|sasn|sdijy|Asesqi[lr])")
# Letter-digit-letter runs such as "f1eld", a typical OCR misread.
_ARTIFACT = re.compile(r"[a-zA-Z]\d[a-zA-Z]")


def _count(pattern, text):
    """Return the number of non-overlapping matches of *pattern* in *text*."""
    return len(pattern.findall(text))


def stats(text):
    """Compute per-page quality counters for one OCR output.

    Args:
        text: the transcription; ``None`` or ``""`` yields all-zero counters.

    Returns:
        dict with ``len`` (character count), ``eqs_inline`` and
        ``eqs_display`` (equations in ``$``/``$$`` plus ``\\(``/``\\[``
        notation), ``watermark`` (verbatim watermark hits), ``wm_gib``
        (garbled watermark hits) and ``artifacts`` (letter-digit-letter runs).
    """
    if not text:
        return {"len": 0, "eqs_inline": 0, "eqs_display": 0,
                "watermark": 0, "wm_gib": 0, "artifacts": 0}
    return {
        "len": len(text),
        "eqs_inline": _count(_INLINE_DOLLAR, text) + _count(_INLINE_PAREN, text),
        "eqs_display": _count(_DISPLAY_DOLLAR, text) + _count(_DISPLAY_BRACKET, text),
        "watermark": _count(_WATERMARK, text),
        "wm_gib": _count(_WATERMARK_GIBBERISH, text),
        "artifacts": _count(_ARTIFACT, text),
    }
# ... see /gifts/ocr-bench/scripts/bench.py for full source
Bench run 2026-05-26 on Apple M4 Max 128 GB, RTX 5090 32 GB on WSL2 13900K, and Google Vertex free tier.