ckOCR/ocr.php
Karamelmar 5bf9e065e4 Add Mistral AI OCR script with test data and documentation
- ocr.php: two-step pipeline (mistral-ocr-latest + mistral-small-latest)
  extracts Serial Number, Model Number, and Date from part label photos
- input/: 5 test images of industrial part labels
- output/: corresponding YAML results
- README.md: full usage, setup, and troubleshooting docs
- .gitignore: excludes .env only
- .env.example: API key template

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-04 18:29:07 +01:00

398 lines
12 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
declare(strict_types=1);
/**
* ckOCR — Part Identification Label OCR via Mistral AI
*
* Scans input/ for images, extracts Serial Number, Model Number and Date
* using Mistral's OCR API + structured extraction, writes results as YAML to output/.
*
* PHP 8.1 - 8.5 compatible
*
* Usage:
* php ocr.php [--force] [--verbose]
*
* --force Re-process images that already have an output file
* --verbose Print OCR text and API details
*/
// ── Configuration ─────────────────────────────────────────────────────────────
// Filesystem layout, resolved relative to the directory holding this script.
const INPUT_DIR = __DIR__ . '/input',
      OUTPUT_DIR = __DIR__ . '/output';

// Image constraints: accepted (lower-case) file extensions and the size cap.
const SUPPORTED_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif'],
      MAX_IMAGE_BYTES = 5 * 1024 * 1024; // 5 MB — Mistral API limit

// Mistral REST endpoint root and the model used by each pipeline step.
const MISTRAL_BASE_URL = 'https://api.mistral.ai/v1',
      OCR_MODEL = 'mistral-ocr-latest',
      CHAT_MODEL = 'mistral-small-latest';
// ── CLI arguments ─────────────────────────────────────────────────────────────
// Only long options are recognised; none of them takes a value.
$opts = getopt('', ['force', 'verbose', 'help']);

// --help short-circuits everything else: print the usage text and stop.
if (isset($opts['help'])) {
    echo <<<HELP
    ckOCR — Part Label OCR using Mistral AI
    Usage: php ocr.php [options]
    Options:
    --force Re-process already completed images
    --verbose Show OCR text and request details
    --help Show this help message
    Environment / .env:
    MISTRAL_API_KEY Your Mistral AI API key (required)
    HELP;
    exit(0);
}

// A flag counts as "on" when it is present on the command line.
$verbose = isset($opts['verbose']);
$force = isset($opts['force']);
// ── API key ───────────────────────────────────────────────────────────────────
// Resolve the key once up front; nothing below can run without it.
$apiKey = load_api_key();
if (!$apiKey) {
    $complaints = [
        "MISTRAL_API_KEY not set.",
        "Set it as an environment variable or add it to a .env file in the project root.",
    ];
    foreach ($complaints as $complaint) {
        stderr($complaint);
    }
    exit(1);
}
// ── Helpers ───────────────────────────────────────────────────────────────────
/**
 * Resolve the Mistral API key.
 *
 * Lookup order:
 *   1. the MISTRAL_API_KEY environment variable;
 *   2. a MISTRAL_API_KEY entry in the .env file located next to this script.
 *
 * The .env parser is intentionally small: blank lines and lines starting with
 * "#" are ignored, an optional leading "export " is accepted, the line is split
 * on the first "=", and quote characters surrounding the value are stripped.
 *
 * @return string The key, or '' when none is configured or .env cannot be read.
 */
function load_api_key(): string
{
    // getenv() yields false when the variable is unset; the cast turns that into ''.
    $key = (string) getenv('MISTRAL_API_KEY');
    if ($key !== '') {
        return $key;
    }

    $envFile = __DIR__ . '/.env';
    if (!is_file($envFile) || !is_readable($envFile)) {
        return '';
    }

    // file() returns false on failure; never hand that to foreach.
    $lines = file($envFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        return '';
    }

    foreach ($lines as $line) {
        $line = trim($line);
        if ($line === '' || str_starts_with($line, '#')) {
            continue;
        }

        // Tolerate shell-style "export KEY=value" lines.
        if (str_starts_with($line, 'export ')) {
            $line = ltrim(substr($line, 7));
        }

        // The "+ ['', '']" supplies an empty value for lines that have no "=".
        [$envKey, $envVal] = array_map('trim', explode('=', $line, 2)) + ['', ''];
        if ($envKey === 'MISTRAL_API_KEY') {
            return trim($envVal, '"\'');
        }
    }

    return '';
}
/**
 * Print one line of regular program output, terminated by "\n".
 */
function stdout(string $msg): void
{
    print "{$msg}\n";
}
/**
 * Write one "ERROR: "-prefixed line to the STDERR stream.
 */
function stderr(string $msg): void
{
    fputs(STDERR, 'ERROR: ' . $msg . "\n");
}
/**
 * Print a diagnostic line, tagged " [v] ", only when verbose mode is on.
 *
 * @param string $msg     Text to print.
 * @param bool   $verbose Whether verbose output is enabled; nothing is printed when false.
 */
function verbose(string $msg, bool $verbose): void
{
    if (!$verbose) {
        return;
    }

    print ' [v] ' . $msg . "\n";
}
/**
 * Map a file path to an image MIME type based on its extension.
 *
 * The extension is compared case-insensitively. Unknown or missing
 * extensions fall back to "image/jpeg".
 */
function mime_for(string $path): string
{
    $mimeByExtension = [
        'jpg'  => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'png'  => 'image/png',
        'webp' => 'image/webp',
        'gif'  => 'image/gif',
    ];

    $extension = strtolower(pathinfo($path, PATHINFO_EXTENSION));

    return $mimeByExtension[$extension] ?? 'image/jpeg';
}
/**
 * Minimal YAML serialiser — handles the flat structure we produce.
 *
 * Supports: null, bool, int, float, nested arrays (always emitted as
 * mappings), single-line strings and multi-line strings (literal block).
 * Keys are written verbatim, so callers must pass plain, identifier-like keys.
 *
 * Single-line strings are double-quoted whenever a YAML reader could
 * otherwise resolve them to something other than the same string — e.g.
 * "00123", "1e5", "null", "true", "2024-01-05" — so serial numbers, model
 * numbers and dates round-trip unchanged.
 *
 * @param array<int|string, mixed> $data  Values to serialise.
 * @param int                      $depth Nesting level; one space of indentation per level.
 *
 * @return string YAML text, one "\n"-terminated entry per key.
 */
function to_yaml(array $data, int $depth = 0): string
{
    $out = '';
    $pad = str_repeat(' ', $depth);

    foreach ($data as $key => $value) {
        if ($value === null) {
            $out .= "{$pad}{$key}: null\n";
            continue;
        }
        if (is_bool($value)) {
            $out .= "{$pad}{$key}: " . ($value ? 'true' : 'false') . "\n";
            continue;
        }
        if (is_float($value) && !is_finite($value)) {
            // PHP would print "NAN"/"INF", which YAML reads back as plain strings.
            $special = is_nan($value) ? '.nan' : ($value > 0 ? '.inf' : '-.inf');
            $out .= "{$pad}{$key}: {$special}\n";
            continue;
        }
        if (is_int($value) || is_float($value)) {
            $out .= "{$pad}{$key}: {$value}\n";
            continue;
        }
        if (is_array($value)) {
            // A bare "key:" with no children parses as null, so spell out the empty mapping.
            $out .= $value === []
                ? "{$pad}{$key}: {}\n"
                : "{$pad}{$key}:\n" . to_yaml($value, $depth + 1);
            continue;
        }

        $str = (string) $value;

        // Multi-line → YAML literal block scalar
        if (str_contains($str, "\n")) {
            $childPad = str_repeat(' ', $depth + 1);
            $body = rtrim($str);
            // When the first non-blank line starts with a space, indentation
            // auto-detection would swallow it; state the indentation explicitly.
            $indicator = preg_match('/^(?: *\n)* +[^ \n]/', $body) === 1
                ? (string) (strlen($childPad) - strlen($pad))
                : '';
            $indented = $childPad . implode("\n{$childPad}", explode("\n", $body));
            $out .= "{$pad}{$key}: |{$indicator}\n{$indented}\n";
            continue;
        }

        // Single-line — quote whenever a plain scalar would be ambiguous
        $out .= yaml_needs_quotes($str)
            ? "{$pad}{$key}: " . yaml_quote($str) . "\n"
            : "{$pad}{$key}: {$str}\n";
    }

    return $out;
}

/**
 * Decide whether a single-line string must be quoted to survive a YAML round trip.
 */
function yaml_needs_quotes(string $str): bool
{
    if ($str === '') {
        return true;
    }
    // YAML special characters anywhere, or leading/trailing whitespace.
    if (preg_match('/[:#\[\]{}|>&!\'"%@`,]|^\s|\s$/', $str) === 1) {
        return true;
    }
    // Control characters (tab, carriage return, …) need escape sequences.
    if (preg_match('/[\x00-\x1F\x7F]/', $str) === 1) {
        return true;
    }
    // Leading "- " / "? " start a sequence entry / complex key; "*" starts an alias.
    if (preg_match('/^(?:[-?](?: |$)|\*)/', $str) === 1) {
        return true;
    }
    // Words that YAML 1.1/1.2 readers resolve to null or booleans.
    $reserved = ['null', 'true', 'false', 'yes', 'no', 'on', 'off', 'y', 'n', '~'];
    if (in_array(strtolower($str), $reserved, true)) {
        return true;
    }
    // Anything a reader would resolve to a number (leading zeros would be lost).
    $numberLike = '/^[-+]?(?:0x[0-9a-f_]+|0o[0-7]+|\d[\d_]*(?:\.[\d_]*)?(?:e[-+]?\d+)?|\.(?:inf|nan))$/i';
    if (is_numeric($str) || preg_match($numberLike, $str) === 1) {
        return true;
    }
    // ISO-like dates, which YAML 1.1 readers resolve to timestamps.
    return preg_match('/^\d{4}-\d\d?-\d\d?(?:$|[Tt \t])/', $str) === 1;
}

/**
 * Render a single-line string as a YAML double-quoted scalar.
 */
function yaml_quote(string $str): string
{
    $escaped = str_replace(
        ['\\', '"', "\r", "\t"],
        ['\\\\', '\\"', '\\r', '\\t'],
        $str,
    );
    // Any remaining control character becomes a \xNN escape.
    $escaped = preg_replace_callback(
        '/[\x00-\x1F\x7F]/',
        static fn (array $m): string => sprintf('\\x%02X', ord($m[0])),
        $escaped,
    ) ?? $escaped;

    return "\"{$escaped}\"";
}
// ── Mistral API ───────────────────────────────────────────────────────────────
/**
 * Generic JSON POST to the Mistral REST API.
 *
 * @param string               $endpoint Path appended to MISTRAL_BASE_URL, e.g. "/ocr".
 * @param array<string, mixed> $payload  Request body; encoded as JSON.
 * @param string               $apiKey   Bearer token sent in the Authorization header.
 * @param bool                 $verbose  Whether to log the request URL.
 *
 * @return array<string, mixed> The decoded JSON response body.
 *
 * @throws RuntimeException on encoding failure, network error, non-200 response
 *                          or a response body that is not a JSON array/object
 */
function mistral_post(string $endpoint, array $payload, string $apiKey, bool $verbose): array
{
    $url = MISTRAL_BASE_URL . $endpoint;

    // JsonException does not extend RuntimeException; convert it so callers
    // only have to handle the documented exception type.
    try {
        $body = json_encode($payload, JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR);
    } catch (JsonException $e) {
        throw new RuntimeException(
            "Could not encode request for {$endpoint}: {$e->getMessage()}",
            0,
            $e,
        );
    }

    verbose("POST {$url}", $verbose);

    $ch = curl_init($url);
    if ($ch === false) {
        throw new RuntimeException("cURL error: could not initialise a handle for {$url}");
    }
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $body,
        CURLOPT_HTTPHEADER => [
            'Authorization: Bearer ' . $apiKey,
            'Content-Type: application/json',
            'Accept: application/json',
        ],
        CURLOPT_TIMEOUT => 120,
        CURLOPT_CONNECTTIMEOUT => 15,
    ]);

    $response = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $curlError = curl_error($ch);
    // The handle is an object and is freed automatically; curl_close() has been
    // a no-op since PHP 8.0 and is deprecated in 8.5, so just drop the reference.
    unset($ch);

    if ($response === false || $curlError !== '') {
        throw new RuntimeException('cURL error: ' . ($curlError !== '' ? $curlError : 'request failed'));
    }

    $decoded = json_decode((string) $response, true);

    if ($httpCode !== 200) {
        $msg = is_array($decoded)
            ? ($decoded['message'] ?? $decoded['error']['message'] ?? $decoded['detail'] ?? null)
            : null;

        if ($msg === null) {
            // No recognisable error field: fall back to the raw body.
            $msg = (string) $response;
        } elseif (!is_string($msg)) {
            // Structured error details would otherwise be rendered as "Array".
            $encoded = json_encode($msg, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
            $msg = $encoded === false ? (string) $response : $encoded;
        }

        throw new RuntimeException("Mistral API {$httpCode}: {$msg}");
    }

    if (!is_array($decoded)) {
        throw new RuntimeException("Non-JSON response from Mistral API");
    }

    return $decoded;
}
/**
 * Step 1 — Send the image to the OCR model and get markdown text back.
 *
 * The file is embedded in the request as a base64 data URL. The "markdown"
 * of every returned page is concatenated, one page per line group.
 *
 * @param string $imagePath Path of the image to read.
 * @param string $apiKey    Mistral API key.
 * @param bool   $verbose   Whether to log diagnostic details.
 *
 * @return string The trimmed OCR text; '' when the API returned no text.
 *
 * @throws RuntimeException when the image cannot be read or the API call fails
 */
function ocr_image(string $imagePath, string $apiKey, bool $verbose): string
{
    $mime = mime_for($imagePath);

    // Never send an empty payload: a failed read must surface as an error
    // instead of being silently cast to ''.
    $raw = is_readable($imagePath) ? file_get_contents($imagePath) : false;
    if ($raw === false || $raw === '') {
        throw new RuntimeException("Could not read image: {$imagePath}");
    }
    $imageData = base64_encode($raw);

    verbose("OCR model: " . OCR_MODEL, $verbose);

    $result = mistral_post('/ocr', [
        'model' => OCR_MODEL,
        'document' => [
            'type' => 'image_url',
            'image_url' => "data:{$mime};base64,{$imageData}",
        ],
    ], $apiKey, $verbose);

    // Be tolerant of an unexpected response shape: skip anything that is not
    // a page array carrying a string "markdown" entry.
    $pages = $result['pages'] ?? [];
    $text = '';
    foreach (is_array($pages) ? $pages : [] as $page) {
        $markdown = is_array($page) ? ($page['markdown'] ?? '') : '';
        $text .= (is_string($markdown) ? $markdown : '') . "\n";
    }

    return trim($text);
}
/**
 * Step 2 — Extract Serial Number, Model Number, Date from raw OCR text
 * using a chat model with JSON response mode.
 *
 * @param string $ocrText Text produced by the OCR step.
 * @param string $apiKey  Mistral API key.
 * @param bool   $verbose Whether to log diagnostic details.
 *
 * @return array{serial_number: ?string, model_number: ?string, date: ?string}
 *         Each field is null when the model did not return a scalar value for it.
 *
 * @throws RuntimeException when the API call fails
 */
function extract_fields(string $ocrText, string $apiKey, bool $verbose): array
{
    verbose("Extraction model: " . CHAT_MODEL, $verbose);

    $system = 'You are a precision industrial part-label parser. '
        . 'Extract structured fields from OCR text. '
        . 'Return ONLY valid JSON — no explanation, no markdown fences.';

    $user = <<<PROMPT
    Extract these fields from the OCR text below.
    Return a JSON object with exactly these keys (use null when not found):
    {
    "serial_number": "Serial Number (labelled S/N, SN, Serial No., etc.)",
    "model_number": "Model or Part Number (labelled M/N, Model, Part No., P/N, MPN, etc.)",
    "date": "Any date present (manufacturing, MFG, expiry, DOM, etc.) — keep original format"
    }
    OCR text:
    {$ocrText}
    PROMPT;

    $result = mistral_post('/chat/completions', [
        'model' => CHAT_MODEL,
        'messages' => [
            ['role' => 'system', 'content' => $system],
            ['role' => 'user', 'content' => $user],
        ],
        'response_format' => ['type' => 'json_object'],
        'temperature' => 0.0,
    ], $apiKey, $verbose);

    $content = $result['choices'][0]['message']['content'] ?? '{}';

    // Under strict_types json_decode() throws a TypeError for a non-string
    // argument, so only decode when the content really is a string.
    $fields = is_string($content) ? json_decode($content, true) : null;
    if (!is_array($fields)) {
        $shown = is_string($content) ? $content : (string) json_encode($content);
        stderr("Could not parse extraction response: {$shown}");
        $fields = [];
    }

    // Only scalar values are meaningful here; casting an array or object
    // would yield the literal text "Array" or raise an error.
    $pick = static function (string $name) use ($fields): ?string {
        $value = $fields[$name] ?? null;

        return is_scalar($value) ? (string) $value : null;
    };

    return [
        'serial_number' => $pick('serial_number'),
        'model_number' => $pick('model_number'),
        'date' => $pick('date'),
    ];
}
// ── Image processing ──────────────────────────────────────────────────────────
/**
 * Run the full pipeline for one image: size check, OCR, field extraction,
 * and writing the YAML result file.
 *
 * An output file is written even when the OCR step found no text, so the
 * image is not retried on every run.
 *
 * @param string $imagePath  Image to process.
 * @param string $outputPath Destination of the YAML result.
 * @param string $apiKey     Mistral API key.
 * @param bool   $verbose    Whether to log diagnostic details.
 *
 * @return bool true when the result file was written; false when the image
 *              was rejected or the result could not be written (the reason
 *              is reported via stderr()).
 *
 * @throws RuntimeException propagated from the API helpers
 */
function process_image(string $imagePath, string $outputPath, string $apiKey, bool $verbose): bool
{
    $filename = basename($imagePath);
    $size = filesize($imagePath);

    if ($size === false || $size > MAX_IMAGE_BYTES) {
        // filesize() returns false on failure, which would print as "( bytes)".
        $shown = $size === false ? 'unknown' : (string) $size;
        stderr("File too large or unreadable ({$shown} bytes): {$filename}");
        return false;
    }

    // Step 1: OCR
    $ocrText = ocr_image($imagePath, $apiKey, $verbose);
    if ($ocrText === '') {
        stderr("No text found in: {$filename}");
        // Still write output so we don't retry repeatedly
    }
    verbose("--- OCR text ---\n{$ocrText}\n---", $verbose);

    // Step 2: Structured extraction (skip if nothing to parse)
    $fields = ['serial_number' => null, 'model_number' => null, 'date' => null];
    if ($ocrText !== '') {
        $fields = extract_fields($ocrText, $apiKey, $verbose);
    }

    // Build and write YAML
    $output = [
        'serial_number' => $fields['serial_number'],
        'model_number' => $fields['model_number'],
        'date' => $fields['date'],
        'source_file' => $filename,
        'processed_at' => date('Y-m-d H:i:s'),
        'raw_ocr' => $ocrText !== '' ? $ocrText : null,
    ];
    $yaml = "---\n" . to_yaml($output);

    // A failed write must not be reported as success: the caller counts the
    // image as processed and prints the output path on true.
    if (file_put_contents($outputPath, $yaml) === false) {
        stderr("Could not write output file: {$outputPath}");
        return false;
    }

    return true;
}
// ── Main ──────────────────────────────────────────────────────────────────────
// Make sure the output directory exists; without it no result can be written.
// The trailing is_dir() tolerates another process creating it concurrently.
if (!is_dir(OUTPUT_DIR) && !mkdir(OUTPUT_DIR, 0755, true) && !is_dir(OUTPUT_DIR)) {
    stderr('Could not create output directory: ' . OUTPUT_DIR);
    exit(1);
}

// Collect images.
// Extensions are matched case-insensitively so camera files such as
// "IMG_0001.JPG" are picked up, and the scan does not rely on GLOB_BRACE,
// which is unavailable on some platforms. Hidden files (leading dot) and
// non-regular files are skipped. scandir() returns names in ascending order.
$entries = is_dir(INPUT_DIR) ? scandir(INPUT_DIR) : false;
if ($entries === false) {
    $entries = [];
}

$images = [];
foreach ($entries as $entry) {
    if (str_starts_with($entry, '.')) {
        continue;
    }
    $extension = strtolower(pathinfo($entry, PATHINFO_EXTENSION));
    $candidate = INPUT_DIR . '/' . $entry;
    if (in_array($extension, SUPPORTED_EXTENSIONS, true) && is_file($candidate)) {
        $images[] = $candidate;
    }
}

if ($images === []) {
    stdout("No supported images found in " . INPUT_DIR);
    exit(0);
}

stdout(sprintf("Found %d image(s). Starting OCR…\n", count($images)));

$processed = 0;
$skipped = 0;
$failed = 0;

foreach ($images as $imagePath) {
    $filename = basename($imagePath);
    $stem = pathinfo($filename, PATHINFO_FILENAME);
    $outputPath = OUTPUT_DIR . '/' . $stem . '.yaml';

    // An existing result means the image was already handled on an earlier run.
    if (!$force && file_exists($outputPath)) {
        stdout("SKIP {$filename} (output exists, use --force to re-run)");
        $skipped++;
        continue;
    }

    stdout("PROCESS {$filename}");
    try {
        $ok = process_image($imagePath, $outputPath, $apiKey, $verbose);
        if ($ok) {
            stdout(" → output/{$stem}.yaml");
            $processed++;
        } else {
            $failed++;
        }
    } catch (RuntimeException | JsonException $e) {
        // One bad image must not abort the whole batch. JsonException is
        // listed separately because it does not extend RuntimeException.
        stderr($e->getMessage());
        $failed++;
    }
}

stdout(sprintf(
    "\nDone — processed: %d skipped: %d failed: %d",
    $processed,
    $skipped,
    $failed
));