Add Mistral AI OCR script with test data and documentation

- ocr.php: two-step pipeline (mistral-ocr-latest + mistral-small-latest)
  extracts Serial Number, Model Number, and Date from part label photos
- input/: 5 test images of industrial part labels
- output/: corresponding YAML results
- README.md: full usage, setup, and troubleshooting docs
- .gitignore: excludes .env only
- .env.example: API key template

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Elmar Sönser 2026-03-04 18:29:07 +01:00
commit 5bf9e065e4
14 changed files with 682 additions and 0 deletions

398
ocr.php Normal file
View file

@ -0,0 +1,398 @@
<?php
declare(strict_types=1);
/**
* ckOCR – Part Identification Label OCR via Mistral AI
*
* Scans input/ for images, extracts Serial Number, Model Number and Date
* using Mistral's OCR API + structured extraction, writes results as YAML to output/.
*
* PHP 8.1 – 8.5 compatible
*
* Usage:
* php ocr.php [--force] [--verbose]
*
* --force Re-process images that already have an output file
* --verbose Print OCR text and API details
*/
// ── Configuration ─────────────────────────────────────────────────────────────
// Directory scanned for label photos; resolved relative to this script.
const INPUT_DIR = __DIR__ . '/input';
// Directory that receives one .yaml result file per processed image.
const OUTPUT_DIR = __DIR__ . '/output';
// Common prefix of every Mistral REST endpoint; mistral_post() appends the endpoint path.
const MISTRAL_BASE_URL = 'https://api.mistral.ai/v1';
// Model used for step 1 (image → markdown text).
const OCR_MODEL = 'mistral-ocr-latest';
// Model used for step 2 (OCR text → structured fields).
const CHAT_MODEL = 'mistral-small-latest';
// File extensions (lower-case, without the dot) accepted when collecting images.
const SUPPORTED_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif'];
// Files larger than this are rejected by process_image() before any API call is made.
const MAX_IMAGE_BYTES = 5 * 1024 * 1024; // 5 MB — Mistral API limit
// ── CLI arguments ─────────────────────────────────────────────────────────────
// Only long options are declared and none of them takes a value, so each flag
// is detected purely by its presence in the getopt() result.
$opts = getopt('', ['force', 'verbose', 'help']);
$force = isset($opts['force']);
$verbose = isset($opts['verbose']);
// --help prints the usage text and exits with status 0 before the API key is
// checked or any image is touched.
if (isset($opts['help'])) {
echo <<<HELP
ckOCR Part Label OCR using Mistral AI
Usage: php ocr.php [options]
Options:
--force Re-process already completed images
--verbose Show OCR text and request details
--help Show this help message
Environment / .env:
MISTRAL_API_KEY Your Mistral AI API key (required)
HELP;
exit(0);
}
// ── API key ───────────────────────────────────────────────────────────────────
// Nothing further can run without credentials, so bail out with exit status 1
// when no usable key is available.
$apiKey = load_api_key();
$hasApiKey = (bool) $apiKey;
if ($hasApiKey === false) {
    $hints = [
        "MISTRAL_API_KEY not set.",
        "Set it as an environment variable or add it to a .env file in the project root.",
    ];
    foreach ($hints as $hint) {
        stderr($hint);
    }
    exit(1);
}
// ── Helpers ───────────────────────────────────────────────────────────────────
/**
 * Resolve the Mistral API key.
 *
 * Lookup order:
 *   1. the MISTRAL_API_KEY environment variable;
 *   2. the first MISTRAL_API_KEY=... line of a .env file located next to this script.
 *
 * Inside .env, blank lines and lines starting with '#' are skipped, and any
 * leading/trailing single or double quotes are stripped from the value.
 *
 * @return string The key, or '' when none is configured or .env cannot be read.
 */
function load_api_key(): string
{
    $key = (string) getenv('MISTRAL_API_KEY');
    if ($key !== '') {
        return $key;
    }

    $envFile = __DIR__ . '/.env';
    // is_file() also rules out a directory that happens to be called ".env".
    if (!is_file($envFile) || !is_readable($envFile)) {
        return '';
    }

    // file() returns false on failure; iterating over that would raise a warning.
    $lines = file($envFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        return '';
    }

    foreach ($lines as $line) {
        $line = trim($line);
        if ($line === '' || str_starts_with($line, '#')) {
            continue;
        }
        // The "+ ['', '']" supplies an empty value for lines that contain no '='.
        [$envKey, $envVal] = array_map('trim', explode('=', $line, 2)) + ['', ''];
        if ($envKey === 'MISTRAL_API_KEY') {
            return trim($envVal, '"\'');
        }
    }

    return '';
}
/**
 * Print a single message to standard output, terminated by a newline.
 */
function stdout(string $msg): void
{
    print "{$msg}\n";
}
/**
 * Write a message to the STDERR stream, prefixed with "ERROR: " and
 * terminated by a newline.
 */
function stderr(string $msg): void
{
    $line = 'ERROR: ' . $msg . "\n";
    fwrite(STDERR, $line);
}
/**
 * Print a diagnostic line tagged with "[v]" to standard output.
 *
 * @param string $msg     Text to print.
 * @param bool   $verbose Nothing is printed unless this is true.
 */
function verbose(string $msg, bool $verbose): void
{
    if (!$verbose) {
        return;
    }

    echo ' [v] ', $msg, "\n";
}
/**
 * Map a file path to an image MIME type based on its extension.
 *
 * The extension is compared case-insensitively. Unknown or missing
 * extensions fall back to 'image/jpeg'.
 */
function mime_for(string $path): string
{
    $byExtension = [
        'jpg'  => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'png'  => 'image/png',
        'webp' => 'image/webp',
        'gif'  => 'image/gif',
    ];

    $extension = strtolower(pathinfo($path, PATHINFO_EXTENSION));

    return $byExtension[$extension] ?? 'image/jpeg';
}
/**
 * Render one single-line string as a YAML scalar.
 *
 * The value is emitted plain only when a YAML parser would read it back as the
 * same string. Everything else is double-quoted, which covers:
 *   - the empty string, indicator characters, leading/trailing whitespace;
 *   - values starting with a digit, sign, dot or another indicator ('?', '*',
 *     '~', '=', '<'), which parsers may resolve to numbers, dates, aliases,
 *     null or merge keys (e.g. "0012345", "1E3", "2024-01-15", "*X1");
 *   - the words YAML resolves to null or booleans (null, true, yes, off, ...);
 *   - control characters, which are written as \xNN escapes.
 */
function yaml_scalar(string $str): string
{
    $needsQuotes = $str === ''
        || preg_match('/[:#\[\]{}|>&!\'"%@`,]|^\s|\s$/', $str) === 1
        || preg_match('/^[-+.?*~=<\d]/', $str) === 1
        || preg_match('/^(?:null|true|false|yes|no|on|off|y|n)$/i', $str) === 1
        || preg_match('/[\x00-\x1F\x7F]/', $str) === 1;

    if (!$needsQuotes) {
        return $str;
    }

    $escaped = str_replace(['\\', '"'], ['\\\\', '\\"'], $str);
    $escaped = (string) preg_replace_callback(
        '/[\x00-\x1F\x7F]/',
        static fn (array $m): string => sprintf('\\x%02X', ord($m[0])),
        $escaped,
    );

    return "\"{$escaped}\"";
}

/**
 * Minimal YAML serialiser — handles the small mapping structure this script produces.
 *
 * Supports: null, bool, int, float, nested arrays (emitted as nested mappings),
 * single-line strings and multi-line strings (literal block scalar).
 * Any other value is cast to string first. Keys are written verbatim.
 *
 * @param array $data  Mapping to serialise.
 * @param int   $depth Current nesting level; one space of indentation per level.
 *
 * @return string YAML text, one "key: value" entry per line, without a document marker.
 */
function to_yaml(array $data, int $depth = 0): string
{
    $out = '';
    $pad = str_repeat(' ', $depth);

    foreach ($data as $key => $value) {
        if ($value === null) {
            $out .= "{$pad}{$key}: null\n";
            continue;
        }
        if (is_bool($value)) {
            $out .= "{$pad}{$key}: " . ($value ? 'true' : 'false') . "\n";
            continue;
        }
        if (is_int($value) || is_float($value)) {
            $out .= "{$pad}{$key}: {$value}\n";
            continue;
        }
        if (is_array($value)) {
            $out .= "{$pad}{$key}:\n" . to_yaml($value, $depth + 1);
            continue;
        }

        $str = (string) $value;

        // Multi-line → YAML literal block scalar; trailing whitespace is dropped first.
        if (str_contains($str, "\n")) {
            $childPad = str_repeat(' ', $depth + 1);
            $indented = $childPad . implode("\n{$childPad}", explode("\n", rtrim($str)));
            $out .= "{$pad}{$key}: |\n{$indented}\n";
            continue;
        }

        // Single-line — quoted whenever a parser could read it as something other
        // than the original string (see yaml_scalar()).
        $out .= "{$pad}{$key}: " . yaml_scalar($str) . "\n";
    }

    return $out;
}
// ── Mistral API ───────────────────────────────────────────────────────────────
/**
 * Generic JSON POST to the Mistral REST API.
 *
 * @param string $endpoint Path appended to MISTRAL_BASE_URL, e.g. '/ocr'.
 * @param array  $payload  Request body; JSON-encoded before sending.
 * @param string $apiKey   Sent in the Authorization header as a Bearer token.
 * @param bool   $verbose  When true, the target URL is logged through verbose().
 *
 * @return array The decoded JSON response body.
 *
 * @throws RuntimeException when the payload cannot be JSON-encoded, on a
 *                          transport error, on a non-200 status, or when the
 *                          response body is not a JSON array/object
 */
function mistral_post(string $endpoint, array $payload, string $apiKey, bool $verbose): array
{
    $url = MISTRAL_BASE_URL . $endpoint;

    // Callers only catch RuntimeException, so an encoding failure is re-thrown
    // as one instead of escaping as a JsonException.
    try {
        $body = json_encode($payload, JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR);
    } catch (JsonException $e) {
        throw new RuntimeException("Could not encode request payload: {$e->getMessage()}", 0, $e);
    }

    verbose("POST {$url}", $verbose);

    $ch = curl_init($url);
    if ($ch === false) {
        throw new RuntimeException("cURL error: could not initialise a handle for {$url}");
    }

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $body,
        CURLOPT_HTTPHEADER     => [
            'Authorization: Bearer ' . $apiKey,
            'Content-Type: application/json',
            'Accept: application/json',
        ],
        CURLOPT_TIMEOUT        => 120,
        CURLOPT_CONNECTTIMEOUT => 15,
    ]);

    $response  = curl_exec($ch);
    $httpCode  = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $curlError = curl_error($ch);
    // curl_close() has been a no-op since PHP 8.0 and is deprecated in PHP 8.5;
    // dropping the last reference releases the handle.
    unset($ch);

    if ($response === false || $curlError !== '') {
        $reason = $curlError !== '' ? $curlError : 'request failed without an error message';
        throw new RuntimeException("cURL error: {$reason}");
    }

    $decoded = json_decode((string) $response, true);

    if ($httpCode !== 200) {
        $msg = is_array($decoded)
            ? ($decoded['message'] ?? $decoded['error']['message'] ?? null)
            : null;
        // Structured (non-string) messages cannot be interpolated; show the raw body instead.
        if (!is_string($msg)) {
            $msg = (string) $response;
        }
        throw new RuntimeException("Mistral API {$httpCode}: {$msg}");
    }

    if (!is_array($decoded)) {
        throw new RuntimeException("Non-JSON response from Mistral API");
    }

    return $decoded;
}
/**
 * Step 1 — Send the image to the OCR model and get markdown text back.
 *
 * The file is embedded in the request as a base64 data URI whose MIME type is
 * derived from the file extension by mime_for().
 *
 * @param string $imagePath Path of the image to upload.
 * @param string $apiKey    Mistral API key.
 * @param bool   $verbose   Enables diagnostic output.
 *
 * @return string The markdown of all returned pages, joined by newlines and
 *                trimmed; '' when the response contains no text.
 *
 * @throws RuntimeException when the image cannot be read or is empty, or when
 *                          the API call fails
 */
function ocr_image(string $imagePath, string $apiKey, bool $verbose): string
{
    // Fail before any API call: an unreadable file must not be uploaded as an
    // empty image.
    if (!is_file($imagePath) || !is_readable($imagePath)) {
        throw new RuntimeException("Cannot read image file: {$imagePath}");
    }
    $bytes = file_get_contents($imagePath);
    if ($bytes === false || $bytes === '') {
        throw new RuntimeException("Image file is empty or unreadable: {$imagePath}");
    }

    $mime      = mime_for($imagePath);
    $imageData = base64_encode($bytes);

    verbose("OCR model: " . OCR_MODEL, $verbose);

    $result = mistral_post('/ocr', [
        'model'    => OCR_MODEL,
        'document' => [
            'type'      => 'image_url',
            'image_url' => "data:{$mime};base64,{$imageData}",
        ],
    ], $apiKey, $verbose);

    $text = '';
    foreach ($result['pages'] ?? [] as $page) {
        $text .= ($page['markdown'] ?? '') . "\n";
    }

    return trim($text);
}
/**
 * Step 2 — Extract Serial Number, Model Number and Date from raw OCR text
 * using a chat model in JSON response mode.
 *
 * @param string $ocrText Text produced by the OCR step.
 * @param string $apiKey  Mistral API key.
 * @param bool   $verbose Enables diagnostic output.
 *
 * @return array{serial_number: ?string, model_number: ?string, date: ?string}
 *         Each field is null when the model did not supply a usable value.
 *
 * @throws RuntimeException when the API call fails
 */
function extract_fields(string $ocrText, string $apiKey, bool $verbose): array
{
    verbose("Extraction model: " . CHAT_MODEL, $verbose);

    $system = 'You are a precision industrial part-label parser. '
        . 'Extract structured fields from OCR text. '
        . 'Return ONLY valid JSON — no explanation, no markdown fences.';

    $user = <<<PROMPT
    Extract these fields from the OCR text below.
    Return a JSON object with exactly these keys (use null when not found):
    {
    "serial_number": "Serial Number (labelled S/N, SN, Serial No., etc.)",
    "model_number": "Model or Part Number (labelled M/N, Model, Part No., P/N, MPN, etc.)",
    "date": "Any date present (manufacturing, MFG, expiry, DOM, etc.) — keep original format"
    }
    OCR text:
    {$ocrText}
    PROMPT;

    $result = mistral_post('/chat/completions', [
        'model'           => CHAT_MODEL,
        'messages'        => [
            ['role' => 'system', 'content' => $system],
            ['role' => 'user', 'content' => $user],
        ],
        'response_format' => ['type' => 'json_object'],
        'temperature'     => 0.0,
    ], $apiKey, $verbose);

    $content = $result['choices'][0]['message']['content'] ?? '{}';

    // Under strict_types a non-string content would make json_decode() throw a
    // TypeError, which no caller catches; treat it as an unparsable response.
    $fields = is_string($content) ? json_decode($content, true) : null;
    if (!is_array($fields)) {
        $shown = is_string($content) ? $content : (string) json_encode($content);
        stderr("Could not parse extraction response: {$shown}");
        $fields = [];
    }

    // Scalars are cast to string. A list of scalars is joined with ", " so that
    // it is not rendered as the literal text "Array". Anything else becomes null.
    $normalise = static function (mixed $value): ?string {
        if (is_array($value)) {
            $parts = array_filter($value, 'is_scalar');
            return $parts === [] ? null : implode(', ', array_map('strval', $parts));
        }
        return is_scalar($value) ? (string) $value : null;
    };

    return [
        'serial_number' => $normalise($fields['serial_number'] ?? null),
        'model_number'  => $normalise($fields['model_number'] ?? null),
        'date'          => $normalise($fields['date'] ?? null),
    ];
}
// ── Image processing ──────────────────────────────────────────────────────────
/**
 * Run the two-step pipeline for one image and write the result as YAML.
 *
 * @param string $imagePath  Image to process.
 * @param string $outputPath Destination of the YAML result.
 * @param string $apiKey     Mistral API key.
 * @param bool   $verbose    Enables diagnostic output.
 *
 * @return bool true when the YAML file was written completely; false when the
 *              image was rejected (unreadable or larger than MAX_IMAGE_BYTES)
 *              or the output file could not be written.
 *
 * @throws RuntimeException when one of the API calls fails
 */
function process_image(string $imagePath, string $outputPath, string $apiKey, bool $verbose): bool
{
    $filename = basename($imagePath);
    $size     = filesize($imagePath);
    if ($size === false || $size > MAX_IMAGE_BYTES) {
        stderr("File too large or unreadable ({$size} bytes): {$filename}");
        return false;
    }

    // Step 1: OCR
    $ocrText = ocr_image($imagePath, $apiKey, $verbose);
    if ($ocrText === '') {
        stderr("No text found in: {$filename}");
        // Still write output so we don't retry repeatedly
    }
    verbose("--- OCR text ---\n{$ocrText}\n---", $verbose);

    // Step 2: Structured extraction (skip if nothing to parse)
    $fields = ['serial_number' => null, 'model_number' => null, 'date' => null];
    if ($ocrText !== '') {
        $fields = extract_fields($ocrText, $apiKey, $verbose);
    }

    // Build and write YAML
    $output = [
        'serial_number' => $fields['serial_number'],
        'model_number'  => $fields['model_number'],
        'date'          => $fields['date'],
        'source_file'   => $filename,
        'processed_at'  => date('Y-m-d H:i:s'),
        'raw_ocr'       => $ocrText !== '' ? $ocrText : null,
    ];
    $yaml = "---\n" . to_yaml($output);

    // A failed or short write must not be reported as success: the caller
    // counts the image as processed and later runs would skip it.
    $written = file_put_contents($outputPath, $yaml);
    if ($written === false || $written !== strlen($yaml)) {
        stderr("Could not write output file: {$outputPath}");
        return false;
    }

    return true;
}
// ── Main ──────────────────────────────────────────────────────────────────────
// Make sure the output directory exists; without it no result can be written.
if (!is_dir(OUTPUT_DIR) && !mkdir(OUTPUT_DIR, 0755, true) && !is_dir(OUTPUT_DIR)) {
    stderr("Could not create output directory: " . OUTPUT_DIR);
    exit(1);
}

// Collect images. Extensions are compared case-insensitively so that camera
// files such as IMG_0001.JPG are picked up, and no GLOB_BRACE support is needed.
$images  = [];
$entries = is_dir(INPUT_DIR) ? scandir(INPUT_DIR) : false;
foreach ($entries ?: [] as $entry) {
    // Hidden entries (and '.' / '..') are ignored.
    if (str_starts_with($entry, '.')) {
        continue;
    }
    $extension = strtolower(pathinfo($entry, PATHINFO_EXTENSION));
    $candidate = INPUT_DIR . '/' . $entry;
    if (in_array($extension, SUPPORTED_EXTENSIONS, true) && is_file($candidate)) {
        $images[] = $candidate;
    }
}

if ($images === []) {
    stdout("No supported images found in " . INPUT_DIR);
    exit(0);
}

stdout(sprintf("Found %d image(s). Starting OCR…\n", count($images)));

$processed = 0;
$skipped   = 0;
$failed    = 0;

foreach ($images as $imagePath) {
    $filename   = basename($imagePath);
    $stem       = pathinfo($filename, PATHINFO_FILENAME);
    $outputPath = OUTPUT_DIR . '/' . $stem . '.yaml';

    // An existing result file marks the image as done unless --force was given.
    if (!$force && file_exists($outputPath)) {
        stdout("SKIP {$filename} (output exists, use --force to re-run)");
        $skipped++;
        continue;
    }

    stdout("PROCESS {$filename}");
    try {
        $ok = process_image($imagePath, $outputPath, $apiKey, $verbose);
        if ($ok) {
            stdout(" → output/{$stem}.yaml");
            $processed++;
        } else {
            $failed++;
        }
    } catch (RuntimeException | JsonException $e) {
        // JsonException is not a RuntimeException; catching it as well keeps a
        // single bad payload from aborting the whole batch.
        stderr($e->getMessage());
        $failed++;
    }
}

stdout(sprintf(
    "\nDone — processed: %d skipped: %d failed: %d",
    $processed,
    $skipped,
    $failed
));