- ocr.php: two-step pipeline (mistral-ocr-latest + mistral-small-latest) extracts Serial Number, Model Number, and Date from part label photos - input/: 5 test images of industrial part labels - output/: corresponding YAML results - README.md: full usage, setup, and troubleshooting docs - .gitignore: excludes .env only - .env.example: API key template Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
398 lines
12 KiB
PHP
398 lines
12 KiB
PHP
<?php
|
||
|
||
declare(strict_types=1);
|
||
|
||
/**
 * ckOCR — Part Identification Label OCR via Mistral AI
 *
 * Scans input/ for images, extracts Serial Number, Model Number and Date
 * using Mistral's OCR API + structured extraction, and writes the results as YAML to output/.
 *
 * PHP 8.1 – 8.5 compatible
 *
 * Usage:
 *   php ocr.php [--force] [--verbose] [--help]
 *
 *   --force    Re-process images that already have an output file
 *   --verbose  Print OCR text and API details
 *   --help     Show the usage summary and exit
 */
|
||
|
||
// ── Configuration ─────────────────────────────────────────────────────────────
|
||
|
||
// Directories, both resolved relative to this script.
const INPUT_DIR = __DIR__ . '/input';    // scanned for label photos
const OUTPUT_DIR = __DIR__ . '/output';  // receives one .yaml file per image

// Mistral REST endpoint root and the two models used by the pipeline.
const MISTRAL_BASE_URL = 'https://api.mistral.ai/v1';
const OCR_MODEL = 'mistral-ocr-latest';     // step 1: image -> markdown text
const CHAT_MODEL = 'mistral-small-latest';  // step 2: text -> structured fields

// Lower-case file extensions that are picked up from INPUT_DIR.
const SUPPORTED_EXTENSIONS = [
    'jpg',
    'jpeg',
    'png',
    'webp',
    'gif',
];

// Upper bound on the image size in bytes (5 MB — Mistral API limit).
const MAX_IMAGE_BYTES = 5 * 1024 ** 2;
|
||
|
||
// ── CLI arguments ─────────────────────────────────────────────────────────────
|
||
|
||
// Only long flags are recognised and none of them takes a value.
$opts = getopt('', ['force', 'verbose', 'help']);

// --help short-circuits everything else: print the usage text and stop.
if (isset($opts['help'])) {
    echo <<<'HELP'
    ckOCR — Part Label OCR using Mistral AI

    Usage: php ocr.php [options]

    Options:
    --force Re-process already completed images
    --verbose Show OCR text and request details
    --help Show this help message

    Environment / .env:
    MISTRAL_API_KEY Your Mistral AI API key (required)

    HELP;
    exit(0);
}

// A flag counts as "on" simply by being present on the command line.
$verbose = isset($opts['verbose']);
$force = isset($opts['force']);
|
||
|
||
// ── API key ───────────────────────────────────────────────────────────────────
|
||
|
||
// Resolve the key once, before any image is touched, so a missing key fails fast.
$apiKey = load_api_key();

// load_api_key() signals "not configured" with an empty string. Compare against
// that explicitly: a truthiness test would also reject the literal key "0".
if ($apiKey === '') {
    stderr("MISTRAL_API_KEY not set.");
    stderr("Set it as an environment variable or add it to a .env file in the project root.");
    exit(1);
}
|
||
|
||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Resolve the Mistral API key.
 *
 * The MISTRAL_API_KEY environment variable takes precedence. When it is unset
 * or empty, the first MISTRAL_API_KEY entry of the .env file located next to
 * this script is used instead. Blank lines and lines starting with '#' are
 * ignored, and surrounding single/double quotes are stripped from the value.
 *
 * @return string The key, or '' when it could not be found or the .env file
 *                is missing or unreadable.
 */
function load_api_key(): string
{
    $key = (string) getenv('MISTRAL_API_KEY');
    if ($key !== '') {
        return $key;
    }

    $envFile = __DIR__ . '/.env';
    if (!is_file($envFile) || !is_readable($envFile)) {
        return '';
    }

    // file() returns false on failure; never feed that into foreach.
    $lines = file($envFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        return '';
    }

    foreach ($lines as $line) {
        $line = trim($line);
        if ($line === '' || str_starts_with($line, '#')) {
            continue;
        }

        // Split on the first '=' only; a line without '=' yields an empty value.
        [$envKey, $envVal] = array_map(trim(...), explode('=', $line, 2)) + ['', ''];
        if ($envKey === 'MISTRAL_API_KEY') {
            return trim($envVal, '"\'');
        }
    }

    return '';
}
|
||
|
||
/**
 * Print a message to standard output, terminated by a newline.
 */
function stdout(string $msg): void
{
    echo "{$msg}\n";
}
|
||
|
||
/**
 * Write an "ERROR: "-prefixed line to the STDERR stream.
 */
function stderr(string $msg): void
{
    $line = 'ERROR: ' . $msg . "\n";
    fwrite(STDERR, $line);
}
|
||
|
||
/**
 * Print a diagnostic line, but only when verbose mode is switched on.
 *
 * @param string $msg     Text to print after the "[v]" marker.
 * @param bool   $verbose Whether verbose output is enabled.
 */
function verbose(string $msg, bool $verbose): void
{
    if (!$verbose) {
        return;
    }

    echo ' [v] ' . $msg . "\n";
}
|
||
|
||
/**
 * Map a file path to an image MIME type based on its extension.
 *
 * The extension is compared case-insensitively. Anything that is not a known
 * image extension falls back to 'image/jpeg'.
 */
function mime_for(string $path): string
{
    $byExtension = [
        'jpg'  => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'png'  => 'image/png',
        'webp' => 'image/webp',
        'gif'  => 'image/gif',
    ];

    $extension = strtolower(pathinfo($path, PATHINFO_EXTENSION));

    return $byExtension[$extension] ?? 'image/jpeg';
}
|
||
|
||
/**
 * Minimal YAML serialiser — handles the flat structure we produce.
 *
 * Supports: null, bool, int, float, nested arrays, single-line strings and
 * multi-line strings (literal block). Keys are written verbatim.
 *
 * Strings are always emitted so that they read back as strings: values that a
 * YAML parser would otherwise resolve to another type (numbers, booleans,
 * null) or treat as syntax are double-quoted.
 *
 * @param array $data  Map of key => value to serialise.
 * @param int   $depth Nesting level; each level indents by one space.
 *
 * @return string YAML text, one "key: value" entry per line (newline-terminated).
 */
function to_yaml(array $data, int $depth = 0): string
{
    $indentUnit = ' ';
    $pad        = str_repeat($indentUnit, $depth);
    $childPad   = $pad . $indentUnit;
    $out        = '';

    foreach ($data as $key => $value) {
        $prefix = "{$pad}{$key}:";

        if ($value === null) {
            $out .= "{$prefix} null\n";
            continue;
        }

        if (is_bool($value)) {
            $out .= "{$prefix} " . ($value ? 'true' : 'false') . "\n";
            continue;
        }

        if (is_int($value) || is_float($value)) {
            $out .= "{$prefix} {$value}\n";
            continue;
        }

        if (is_array($value)) {
            // A bare "key:" would read back as null, so spell out the empty collection.
            $out .= $value === []
                ? "{$prefix} []\n"
                : "{$prefix}\n" . to_yaml($value, $depth + 1);
            continue;
        }

        $str = (string) $value;

        // Multi-line → YAML literal block scalar
        if (str_contains($str, "\n")) {
            $body  = rtrim($str);
            $lines = array_map(
                // Leave blank lines truly empty instead of emitting trailing spaces.
                static fn (string $line): string => $line === '' ? '' : $childPad . $line,
                explode("\n", $body),
            );

            // When the first non-empty line starts with a space the parser cannot
            // auto-detect the block indentation; state it explicitly.
            $indicator = preg_match('/^\n* /', $body) === 1 ? (string) strlen($indentUnit) : '';

            $out .= "{$prefix} |{$indicator}\n" . implode("\n", $lines) . "\n";
            continue;
        }

        $out .= "{$prefix} " . yaml_scalar($str) . "\n";
    }

    return $out;
}

/**
 * Render a single-line string as a YAML scalar.
 *
 * The value is double-quoted (with backslash, quote and control characters
 * escaped) whenever a plain scalar would be ambiguous; otherwise it is
 * returned unchanged.
 */
function yaml_scalar(string $str): string
{
    // Words that YAML parsers commonly resolve to null/bool rather than string.
    $reserved = ['null', 'true', 'false', 'yes', 'no', 'on', 'off', 'y', 'n'];

    $needsQuotes = $str === ''
        // YAML indicator characters, control characters, leading/trailing whitespace
        || preg_match('/[:#\[\]{}|>&!\'"%@`,\x00-\x1f\x7f]|^\s|\s$/', $str) === 1
        // Leading indicator (- ? * = ~) or anything that may look numeric (00123, 1e5, .5, +1)
        || preg_match('/^[-+.~?*=\d]/', $str) === 1
        || in_array(strtolower($str), $reserved, true);

    if (!$needsQuotes) {
        return $str;
    }

    $escaped = preg_replace_callback(
        '/[\\\\"\x00-\x1f\x7f]/',
        static fn (array $m): string => match ($m[0]) {
            '\\'    => '\\\\',
            '"'     => '\\"',
            "\t"    => '\\t',
            "\r"    => '\\r',
            default => sprintf('\\x%02X', ord($m[0])),
        },
        $str,
    );

    // preg_replace_callback() only returns null on a PCRE failure; fall back to
    // the basic escaping in that case rather than emitting an empty value.
    $escaped ??= str_replace(['\\', '"'], ['\\\\', '\\"'], $str);

    return "\"{$escaped}\"";
}
|
||
|
||
// ── Mistral API ───────────────────────────────────────────────────────────────
|
||
|
||
/**
 * Generic JSON POST to the Mistral REST API.
 *
 * @param string $endpoint Path appended to MISTRAL_BASE_URL (e.g. '/ocr').
 * @param array  $payload  Request body; JSON-encoded before sending.
 * @param string $apiKey   Bearer token sent in the Authorization header.
 * @param bool   $verbose  Whether to print the request URL.
 *
 * @return array The decoded JSON response body.
 *
 * @throws RuntimeException when the payload cannot be encoded, on network
 *                          error, on a non-200 response, or when the response
 *                          body is not a JSON object/array
 */
function mistral_post(string $endpoint, array $payload, string $apiKey, bool $verbose): array
{
    $url = MISTRAL_BASE_URL . $endpoint;

    // Keep the documented contract: callers only catch RuntimeException, so an
    // encoding failure (e.g. invalid UTF-8 in the payload) must not leak out
    // as a JsonException.
    try {
        $body = json_encode($payload, JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR);
    } catch (JsonException $e) {
        throw new RuntimeException('Could not encode request payload: ' . $e->getMessage(), 0, $e);
    }

    verbose("POST {$url}", $verbose);

    $ch = curl_init($url);
    if ($ch === false) {
        throw new RuntimeException("cURL error: could not initialise a handle for {$url}");
    }

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $body,
        CURLOPT_HTTPHEADER     => [
            'Authorization: Bearer ' . $apiKey,
            'Content-Type: application/json',
            'Accept: application/json',
        ],
        CURLOPT_TIMEOUT        => 120,
        CURLOPT_CONNECTTIMEOUT => 15,
    ]);

    $response  = curl_exec($ch);
    $httpCode  = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $curlError = curl_error($ch);

    // No curl_close(): it has been a no-op since PHP 8.0 and is deprecated in
    // PHP 8.5. Dropping the last reference releases the handle.
    unset($ch);

    if ($curlError !== '') {
        throw new RuntimeException("cURL error: {$curlError}");
    }
    if ($response === false) {
        throw new RuntimeException('cURL error: request failed without an error message');
    }

    $decoded = json_decode((string) $response, true);

    if ($httpCode !== 200) {
        $detail = is_array($decoded)
            ? ($decoded['message'] ?? $decoded['error']['message'] ?? null)
            : null;

        // The error detail is not guaranteed to be a plain string; serialise
        // structured details instead of printing "Array".
        $msg = match (true) {
            is_string($detail) => $detail,
            $detail === null   => (string) $response,
            default            => (string) json_encode($detail, JSON_UNESCAPED_UNICODE),
        };

        throw new RuntimeException("Mistral API {$httpCode}: {$msg}");
    }

    if (!is_array($decoded)) {
        throw new RuntimeException("Non-JSON response from Mistral API");
    }

    return $decoded;
}
|
||
|
||
/**
 * Step 1 — Send the image to mistral-ocr-latest and get markdown text back.
 *
 * The file is embedded in the request as a base64 data URL. The markdown of
 * all returned pages is joined with newlines and trimmed.
 *
 * @param string $imagePath Path of the image file to upload.
 * @param string $apiKey    Mistral API key.
 * @param bool   $verbose   Whether to print diagnostic output.
 *
 * @return string The recognised text; '' when the response contains none.
 *
 * @throws RuntimeException when the image cannot be read or the API call fails
 */
function ocr_image(string $imagePath, string $apiKey, bool $verbose): string
{
    // Fail before any network traffic: uploading an empty payload for an
    // unreadable file would only produce a confusing API error.
    if (!is_file($imagePath) || !is_readable($imagePath)) {
        throw new RuntimeException("Cannot read image file: {$imagePath}");
    }

    $bytes = file_get_contents($imagePath);
    if ($bytes === false) {
        throw new RuntimeException("Cannot read image file: {$imagePath}");
    }

    $mime      = mime_for($imagePath);
    $imageData = base64_encode($bytes);

    verbose("OCR model: " . OCR_MODEL, $verbose);

    $result = mistral_post('/ocr', [
        'model'    => OCR_MODEL,
        'document' => [
            'type'      => 'image_url',
            'image_url' => "data:{$mime};base64,{$imageData}",
        ],
    ], $apiKey, $verbose);

    $pages = $result['pages'] ?? [];
    if (!is_array($pages)) {
        return '';
    }

    $text = '';
    foreach ($pages as $page) {
        $markdown = is_array($page) ? ($page['markdown'] ?? '') : '';
        $text .= (is_string($markdown) ? $markdown : '') . "\n";
    }

    return trim($text);
}
|
||
|
||
/**
 * Step 2 — Extract Serial Number, Model Number, Date from raw OCR text
 * using a chat model with JSON response mode.
 *
 * @param string $ocrText Text produced by the OCR step.
 * @param string $apiKey  Mistral API key.
 * @param bool   $verbose Whether to print diagnostic output.
 *
 * @return array{serial_number: ?string, model_number: ?string, date: ?string}
 *               Each field is a trimmed non-empty string, or null when the
 *               model did not return a usable value for it.
 *
 * @throws RuntimeException when the API call fails
 */
function extract_fields(string $ocrText, string $apiKey, bool $verbose): array
{
    verbose("Extraction model: " . CHAT_MODEL, $verbose);

    $system = 'You are a precision industrial part-label parser. '
        . 'Extract structured fields from OCR text. '
        . 'Return ONLY valid JSON — no explanation, no markdown fences.';

    $user = <<<PROMPT
    Extract these fields from the OCR text below.
    Return a JSON object with exactly these keys (use null when not found):

    {
    "serial_number": "Serial Number (labelled S/N, SN, Serial No., etc.)",
    "model_number": "Model or Part Number (labelled M/N, Model, Part No., P/N, MPN, etc.)",
    "date": "Any date present (manufacturing, MFG, expiry, DOM, etc.) — keep original format"
    }

    OCR text:
    {$ocrText}
    PROMPT;

    $result = mistral_post('/chat/completions', [
        'model'           => CHAT_MODEL,
        'messages'        => [
            ['role' => 'system', 'content' => $system],
            ['role' => 'user', 'content' => $user],
        ],
        'response_format' => ['type' => 'json_object'],
        'temperature'     => 0.0,
    ], $apiKey, $verbose);

    $content = $result['choices'][0]['message']['content'] ?? '{}';

    // Under strict_types json_decode() throws a TypeError for a non-string
    // argument, which the caller does not catch; treat such a reply as
    // unparsable instead.
    $fields = is_string($content) ? json_decode($content, true) : null;

    if (!is_array($fields)) {
        $shown = is_string($content) ? $content : (string) json_encode($content);
        stderr("Could not parse extraction response: {$shown}");
        $fields = [];
    }

    // Accept only plain string/number values; nested structures, booleans and
    // blank strings all mean "not found".
    $pick = static function (string $name) use ($fields): ?string {
        $value = $fields[$name] ?? null;
        if (!is_string($value) && !is_int($value) && !is_float($value)) {
            return null;
        }

        $text = trim((string) $value);

        return $text === '' ? null : $text;
    };

    return [
        'serial_number' => $pick('serial_number'),
        'model_number'  => $pick('model_number'),
        'date'          => $pick('date'),
    ];
}
|
||
|
||
// ── Image processing ──────────────────────────────────────────────────────────
|
||
|
||
/**
 * Run the full pipeline for one image: size check, OCR, field extraction and
 * writing the YAML result file.
 *
 * An output file is written even when no text was recognised, so the image is
 * not retried on every run.
 *
 * @param string $imagePath  Image to process.
 * @param string $outputPath Destination of the YAML result.
 * @param string $apiKey     Mistral API key.
 * @param bool   $verbose    Whether to print diagnostic output.
 *
 * @return bool true when the result file was written; false when the image
 *              was rejected or the result could not be written (the reason is
 *              reported on STDERR).
 *
 * @throws RuntimeException when an API call fails
 */
function process_image(string $imagePath, string $outputPath, string $apiKey, bool $verbose): bool
{
    $filename = basename($imagePath);
    $size     = is_file($imagePath) ? filesize($imagePath) : false;

    // Report the two rejection reasons separately; interpolating a failed
    // filesize() would print a meaningless "( bytes)".
    if ($size === false) {
        stderr("File unreadable: {$filename}");
        return false;
    }
    if ($size > MAX_IMAGE_BYTES) {
        stderr("File too large ({$size} bytes): {$filename}");
        return false;
    }

    // Step 1: OCR
    $ocrText = ocr_image($imagePath, $apiKey, $verbose);

    if ($ocrText === '') {
        stderr("No text found in: {$filename}");
        // Still write output so we don't retry repeatedly
    }

    verbose("--- OCR text ---\n{$ocrText}\n---", $verbose);

    // Step 2: Structured extraction (skip if nothing to parse)
    $fields = ['serial_number' => null, 'model_number' => null, 'date' => null];
    if ($ocrText !== '') {
        $fields = extract_fields($ocrText, $apiKey, $verbose);
    }

    // Build and write YAML
    $output = [
        'serial_number' => $fields['serial_number'],
        'model_number'  => $fields['model_number'],
        'date'          => $fields['date'],
        'source_file'   => $filename,
        'processed_at'  => date('Y-m-d H:i:s'),
        'raw_ocr'       => $ocrText !== '' ? $ocrText : null,
    ];

    $yaml = "---\n" . to_yaml($output);

    // A failed write must not be counted as a processed image.
    if (file_put_contents($outputPath, $yaml) === false) {
        stderr("Could not write output file: {$outputPath}");
        return false;
    }

    return true;
}
|
||
|
||
// ── Main ──────────────────────────────────────────────────────────────────────
|
||
|
||
// Make sure the output directory exists; the second is_dir() covers the case
// where another process created it between the check and mkdir().
if (!is_dir(OUTPUT_DIR) && !mkdir(OUTPUT_DIR, 0755, true) && !is_dir(OUTPUT_DIR)) {
    stderr("Could not create output directory: " . OUTPUT_DIR);
    exit(1);
}

// Collect images.
// scandir() + extension filter is used instead of glob(..., GLOB_BRACE):
// GLOB_BRACE is not defined on every platform, and the glob pattern is
// case-sensitive, so files such as "LABEL.JPG" would be skipped.
$images  = [];
$entries = is_dir(INPUT_DIR) ? scandir(INPUT_DIR) : false;

foreach ($entries ?: [] as $entry) {
    // Like the "*" glob, ignore dot-files (this also skips "." and "..").
    if (str_starts_with($entry, '.')) {
        continue;
    }

    $path      = INPUT_DIR . '/' . $entry;
    $extension = strtolower(pathinfo($entry, PATHINFO_EXTENSION));

    if (is_file($path) && in_array($extension, SUPPORTED_EXTENSIONS, true)) {
        $images[] = $path;
    }
}

if ($images === []) {
    stdout("No supported images found in " . INPUT_DIR);
    exit(0);
}

stdout(sprintf("Found %d image(s). Starting OCR…\n", count($images)));

$processed = 0;
$skipped   = 0;
$failed    = 0;

foreach ($images as $imagePath) {
    $filename   = basename($imagePath);
    $stem       = pathinfo($filename, PATHINFO_FILENAME);
    // NOTE: images that differ only by extension share one output file.
    $outputPath = OUTPUT_DIR . '/' . $stem . '.yaml';

    if (!$force && file_exists($outputPath)) {
        stdout("SKIP {$filename} (output exists, use --force to re-run)");
        $skipped++;
        continue;
    }

    stdout("PROCESS {$filename}");

    try {
        $ok = process_image($imagePath, $outputPath, $apiKey, $verbose);
        if ($ok) {
            stdout(" → output/{$stem}.yaml");
            $processed++;
        } else {
            $failed++;
        }
    } catch (RuntimeException|JsonException $e) {
        // JsonException is not a RuntimeException; catch it too so one bad
        // payload fails a single image instead of aborting the whole batch.
        stderr($e->getMessage());
        $failed++;
    }
}

stdout(sprintf(
    "\nDone — processed: %d skipped: %d failed: %d",
    $processed,
    $skipped,
    $failed
));
|