Add Mistral AI OCR script with test data and documentation
- ocr.php: two-step pipeline (mistral-ocr-latest + mistral-small-latest) extracts Serial Number, Model Number, and Date from part label photos - input/: 5 test images of industrial part labels - output/: corresponding YAML results - README.md: full usage, setup, and troubleshooting docs - .gitignore: excludes .env only - .env.example: API key template Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
3219ea6916
commit
5bf9e065e4
14 changed files with 682 additions and 0 deletions
398
ocr.php
Normal file
398
ocr.php
Normal file
|
|
@ -0,0 +1,398 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* ckOCR — Part Identification Label OCR via Mistral AI
|
||||
*
|
||||
* Scans input/ for images, extracts Serial Number, Model Number and Date
|
||||
* using Mistral's OCR API + structured extraction, writes results as YAML to output/.
|
||||
*
|
||||
* PHP 8.1 – 8.5 compatible
|
||||
*
|
||||
* Usage:
|
||||
* php ocr.php [--force] [--verbose]
|
||||
*
|
||||
* --force Re-process images that already have an output file
|
||||
* --verbose Print OCR text and API details
|
||||
*/
|
||||
|
||||
// ── Configuration ─────────────────────────────────────────────────────────────
|
||||
|
||||
// Working directories, resolved relative to this script rather than the
// caller's current working directory.
const INPUT_DIR  = __DIR__ . '/input';
const OUTPUT_DIR = __DIR__ . '/output';

// Mistral REST API root and the model identifiers used for OCR and chat.
const MISTRAL_BASE_URL = 'https://api.mistral.ai/v1';
const OCR_MODEL        = 'mistral-ocr-latest';
const CHAT_MODEL       = 'mistral-small-latest';

// Image file extensions (lower-case) this script accepts.
const SUPPORTED_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif'];

// Largest accepted image: 5 MB — Mistral API limit.
const MAX_IMAGE_BYTES = 5 * 1024 ** 2;
|
||||
|
||||
// ── CLI arguments ─────────────────────────────────────────────────────────────
|
||||
|
||||
// Only long options are recognised, and none of them takes a value.
$opts = getopt('', ['force', 'verbose', 'help']);

// A flag counts as "given" when getopt() reported it at all.
$flagGiven = static fn (string $name): bool => isset($opts[$name]);

$force   = $flagGiven('force');
$verbose = $flagGiven('verbose');

// --help prints the usage text and ends the run successfully.
if ($flagGiven('help')) {
    print <<<'HELP'
        ckOCR — Part Label OCR using Mistral AI

        Usage: php ocr.php [options]

        Options:
        --force Re-process already completed images
        --verbose Show OCR text and request details
        --help Show this help message

        Environment / .env:
        MISTRAL_API_KEY Your Mistral AI API key (required)

        HELP;
    exit(0);
}
|
||||
|
||||
// ── API key ───────────────────────────────────────────────────────────────────
|
||||
|
||||
// Resolve the key once up front; nothing below can work without it.
$apiKey = load_api_key();

if (!$apiKey) {
    $hints = [
        "MISTRAL_API_KEY not set.",
        "Set it as an environment variable or add it to a .env file in the project root.",
    ];
    foreach ($hints as $hint) {
        stderr($hint);
    }
    exit(1);
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Resolve the Mistral API key.
 *
 * The MISTRAL_API_KEY environment variable takes precedence. When it is unset
 * or empty, the .env file next to this script is scanned for the first
 * `MISTRAL_API_KEY=...` line; blank lines and lines starting with '#' are
 * skipped and surrounding single/double quotes are stripped from the value.
 *
 * @return string The key, or '' when none is configured or .env cannot be read.
 */
function load_api_key(): string
{
    $key = (string) getenv('MISTRAL_API_KEY');
    if ($key !== '') {
        return $key;
    }

    $envFile = __DIR__ . '/.env';
    if (!is_file($envFile) || !is_readable($envFile)) {
        return '';
    }

    // file() returns false on failure; iterating that would raise a warning
    // instead of cleanly reporting "no key".
    $lines = file($envFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        return '';
    }

    foreach ($lines as $index => $line) {
        // A UTF-8 byte-order mark would otherwise become part of the first
        // key name and stop it from matching.
        if ($index === 0 && str_starts_with($line, "\xEF\xBB\xBF")) {
            $line = substr($line, 3);
        }

        $line = trim($line);
        if ($line === '' || str_starts_with($line, '#')) {
            continue;
        }

        // "+ ['', '']" supplies an empty value for lines without '='.
        [$envKey, $envVal] = array_map(trim(...), explode('=', $line, 2)) + ['', ''];
        if ($envKey === 'MISTRAL_API_KEY') {
            return trim($envVal, '"\'');
        }
    }

    return '';
}
|
||||
|
||||
/**
 * Print one line of regular program output, terminated by a newline.
 */
function stdout(string $msg): void
{
    print "{$msg}\n";
}
|
||||
|
||||
/**
 * Write one "ERROR: ..." line to the standard error stream.
 */
function stderr(string $msg): void
{
    fprintf(STDERR, "ERROR: %s\n", $msg);
}
|
||||
|
||||
/**
 * Print a diagnostic line tagged "[v]", but only when verbose mode is on.
 *
 * @param string $msg     Text to print.
 * @param bool   $verbose Whether verbose output is enabled; nothing is printed when false.
 */
function verbose(string $msg, bool $verbose): void
{
    if (!$verbose) {
        return;
    }

    print " [v] {$msg}\n";
}
|
||||
|
||||
/**
 * Map a file path to an image MIME type based on its extension.
 *
 * The extension is compared case-insensitively; anything unrecognised
 * (including a missing extension) falls back to image/jpeg.
 */
function mime_for(string $path): string
{
    $byExtension = [
        'jpg'  => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'png'  => 'image/png',
        'webp' => 'image/webp',
        'gif'  => 'image/gif',
    ];

    $extension = strtolower(pathinfo($path, PATHINFO_EXTENSION));

    return $byExtension[$extension] ?? 'image/jpeg';
}
|
||||
|
||||
/**
 * Minimal YAML serialiser — handles the flat structure we produce.
 *
 * Supports: null, bool, int, float, nested arrays (emitted as mappings),
 * single-line strings and multi-line strings (literal block scalar).
 *
 * Single-line strings are double-quoted whenever the plain form could be read
 * back as something other than the same string: values containing YAML
 * special characters, values with leading/trailing whitespace, values that
 * start with a digit, sign, dot or indicator character (numbers, dates,
 * "- item", "*alias", "~"), and the reserved words null/true/false/yes/no/
 * on/off/y/n. This keeps serial numbers such as 0012345 and dates such as
 * 2024-01-15 typed as strings.
 *
 * @param array $data  Key/value pairs to serialise; keys are written verbatim.
 * @param int   $depth Nesting level; each level indents by one space.
 *
 * @return string YAML text, one entry per key, without a document marker.
 */
function to_yaml(array $data, int $depth = 0): string
{
    $out      = '';
    $pad      = str_repeat(' ', $depth);
    $childPad = str_repeat(' ', $depth + 1);

    foreach ($data as $key => $value) {
        if ($value === null) {
            $out .= "{$pad}{$key}: null\n";
            continue;
        }
        if (is_bool($value)) {
            $out .= "{$pad}{$key}: " . ($value ? 'true' : 'false') . "\n";
            continue;
        }
        if (is_int($value) || is_float($value)) {
            $out .= "{$pad}{$key}: {$value}\n";
            continue;
        }
        if (is_array($value)) {
            // A bare "key:" with nothing below it parses as null, so an empty
            // array is written as an explicit empty flow sequence.
            $out .= $value === []
                ? "{$pad}{$key}: []\n"
                : "{$pad}{$key}:\n" . to_yaml($value, $depth + 1);
            continue;
        }

        $str = (string) $value;

        // Multi-line → YAML literal block scalar
        if (str_contains($str, "\n")) {
            // Block indentation is normally inferred from the first non-empty
            // line. If that line itself begins with a space the inference is
            // wrong, so state the indentation (one space past the key) explicitly.
            $header   = preg_match('/\A\n*[ ]/', $str) === 1 ? '|1' : '|';
            $indented = $childPad . implode("\n{$childPad}", explode("\n", rtrim($str)));
            $out .= "{$pad}{$key}: {$header}\n{$indented}\n";
            continue;
        }

        // Single-line — quote whenever the plain form would be ambiguous.
        $needsQuotes = $str === ''
            // YAML special characters anywhere, or leading/trailing whitespace.
            || preg_match('/[:#\[\]{}|>&!\'"%@`,]|^\s|\s$/', $str) === 1
            // Leading digit, sign, dot or indicator: numbers, dates, "-", "?", "*", "~", "<<", "=".
            || preg_match('/\A[-?*~=<+.0-9]/', $str) === 1
            // Words YAML 1.1/1.2 parsers resolve to null or booleans.
            || preg_match('/\A(?:null|true|false|yes|no|on|off|y|n)\z/i', $str) === 1;

        if ($needsQuotes) {
            $escaped = str_replace(['\\', '"'], ['\\\\', '\\"'], $str);
            $out .= "{$pad}{$key}: \"{$escaped}\"\n";
            continue;
        }

        $out .= "{$pad}{$key}: {$str}\n";
    }

    return $out;
}
|
||||
|
||||
// ── Mistral API ───────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Generic JSON POST to the Mistral REST API.
 *
 * @param string $endpoint Path appended to MISTRAL_BASE_URL, e.g. '/ocr'.
 * @param array  $payload  Request body; encoded as JSON.
 * @param string $apiKey   Sent as a Bearer token.
 * @param bool   $verbose  When true, the request URL is printed.
 *
 * @return array The decoded JSON response body.
 *
 * @throws RuntimeException when the payload cannot be encoded, on network
 *                          error, on a non-200 response, or when the response
 *                          body is not a JSON object/array
 */
function mistral_post(string $endpoint, array $payload, string $apiKey, bool $verbose): array
{
    $url = MISTRAL_BASE_URL . $endpoint;

    // Convert encoding failures so callers only have to handle RuntimeException.
    try {
        $body = json_encode($payload, JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR);
    } catch (JsonException $e) {
        throw new RuntimeException("Could not encode request payload: {$e->getMessage()}", 0, $e);
    }

    verbose("POST {$url}", $verbose);

    $ch = curl_init($url);
    if ($ch === false) {
        throw new RuntimeException("cURL error: could not initialise a request for {$url}");
    }

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $body,
        CURLOPT_HTTPHEADER     => [
            'Authorization: Bearer ' . $apiKey,
            'Content-Type: application/json',
            'Accept: application/json',
        ],
        CURLOPT_TIMEOUT        => 120,
        CURLOPT_CONNECTTIMEOUT => 15,
    ]);

    $response  = curl_exec($ch);
    $httpCode  = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $curlError = curl_error($ch);
    // curl_close() has had no effect since PHP 8.0 and is deprecated in 8.5;
    // dropping the handle is enough to release it.
    unset($ch);

    if ($curlError !== '') {
        throw new RuntimeException("cURL error: {$curlError}");
    }
    if ($response === false) {
        throw new RuntimeException('cURL error: request failed without an error message');
    }

    $decoded = json_decode((string) $response, true);

    if ($httpCode !== 200) {
        $msg = is_array($decoded)
            ? ($decoded['message'] ?? $decoded['error']['message'] ?? null)
            : null;

        // The error detail may be structured rather than a plain string;
        // render it as JSON instead of letting PHP print "Array".
        if ($msg !== null && !is_string($msg)) {
            $msg = json_encode($msg, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES) ?: null;
        }

        $msg ??= (string) $response;

        throw new RuntimeException("Mistral API {$httpCode}: {$msg}");
    }

    if (!is_array($decoded)) {
        throw new RuntimeException("Non-JSON response from Mistral API");
    }

    return $decoded;
}
|
||||
|
||||
/**
 * Step 1 — Send the image to mistral-ocr-latest and get markdown text back.
 *
 * The image is embedded in the request as a base64 data URL. The markdown of
 * every returned page is joined with newlines and trimmed.
 *
 * @param string $imagePath Path of the image file to read.
 * @param string $apiKey    Mistral API key.
 * @param bool   $verbose   Enables diagnostic output.
 *
 * @return string The recognised text, or '' when the response has no page text.
 *
 * @throws RuntimeException when the image cannot be read or is empty, or when
 *                          the API call fails
 */
function ocr_image(string $imagePath, string $apiKey, bool $verbose): string
{
    // A failed read used to be cast to '' and sent to the API as an empty
    // image; fail here instead so the caller sees the real cause.
    $bytes = is_readable($imagePath) ? file_get_contents($imagePath) : false;
    if ($bytes === false || $bytes === '') {
        throw new RuntimeException("Could not read image data: {$imagePath}");
    }

    $mime      = mime_for($imagePath);
    $imageData = base64_encode($bytes);

    verbose("OCR model: " . OCR_MODEL, $verbose);

    $result = mistral_post('/ocr', [
        'model'    => OCR_MODEL,
        'document' => [
            'type'      => 'image_url',
            'image_url' => "data:{$mime};base64,{$imageData}",
        ],
    ], $apiKey, $verbose);

    $pages = [];
    foreach ($result['pages'] ?? [] as $page) {
        $pages[] = (string) ($page['markdown'] ?? '');
    }

    return trim(implode("\n", $pages));
}
|
||||
|
||||
/**
 * Step 2 — Extract Serial Number, Model Number, Date from raw OCR text
 * using a chat model with JSON response mode.
 *
 * @param string $ocrText Text produced by the OCR step.
 * @param string $apiKey  Mistral API key.
 * @param bool   $verbose Enables diagnostic output.
 *
 * @return array{serial_number: ?string, model_number: ?string, date: ?string}
 *               Each field is a trimmed non-empty string, or null when the
 *               model did not return a usable value for it.
 *
 * @throws RuntimeException when the API call fails
 */
function extract_fields(string $ocrText, string $apiKey, bool $verbose): array
{
    verbose("Extraction model: " . CHAT_MODEL, $verbose);

    $system = 'You are a precision industrial part-label parser. '
        . 'Extract structured fields from OCR text. '
        . 'Return ONLY valid JSON — no explanation, no markdown fences.';

    $user = <<<PROMPT
        Extract these fields from the OCR text below.
        Return a JSON object with exactly these keys (use null when not found):

        {
        "serial_number": "Serial Number (labelled S/N, SN, Serial No., etc.)",
        "model_number": "Model or Part Number (labelled M/N, Model, Part No., P/N, MPN, etc.)",
        "date": "Any date present (manufacturing, MFG, expiry, DOM, etc.) — keep original format"
        }

        OCR text:
        {$ocrText}
        PROMPT;

    $result = mistral_post('/chat/completions', [
        'model'           => CHAT_MODEL,
        'messages'        => [
            ['role' => 'system', 'content' => $system],
            ['role' => 'user', 'content' => $user],
        ],
        'response_format' => ['type' => 'json_object'],
        'temperature'     => 0.0,
    ], $apiKey, $verbose);

    $content = $result['choices'][0]['message']['content'] ?? '{}';

    // NOTE(review): assumes that when content is not a plain string it is a
    // list of chunks carrying their text under a 'text' key — confirm against
    // the API reference.
    if (is_array($content)) {
        $content = implode('', array_map(
            static fn (mixed $chunk): string => is_array($chunk) && is_string($chunk['text'] ?? null)
                ? $chunk['text']
                : '',
            $content,
        ));
    }

    // Under strict_types json_decode() throws a TypeError for non-strings,
    // which the caller's RuntimeException handler would not catch.
    $fields = is_string($content) ? json_decode($content, true) : null;

    if (!is_array($fields)) {
        $shown = is_string($content) ? $content : get_debug_type($content);
        stderr("Could not parse extraction response: {$shown}");
        $fields = [];
    }

    // Only strings and numbers are meaningful label values; arrays, booleans
    // and blank strings are treated as "not found".
    $clean = static function (mixed $value): ?string {
        if (!is_string($value) && !is_int($value) && !is_float($value)) {
            return null;
        }
        $text = trim((string) $value);

        return $text === '' ? null : $text;
    };

    return [
        'serial_number' => $clean($fields['serial_number'] ?? null),
        'model_number'  => $clean($fields['model_number'] ?? null),
        'date'          => $clean($fields['date'] ?? null),
    ];
}
|
||||
|
||||
// ── Image processing ──────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run the two-step pipeline (OCR, then field extraction) for one image and
 * write the result to a YAML file.
 *
 * When OCR yields no text, extraction is skipped and a YAML file with null
 * fields is still written so the image is not retried on every run.
 *
 * @param string $imagePath  Image to process.
 * @param string $outputPath Destination YAML file.
 * @param string $apiKey     Mistral API key.
 * @param bool   $verbose    Enables diagnostic output.
 *
 * @return bool true when the YAML file was written; false when the image was
 *              rejected before any API call (unreadable, empty, or larger
 *              than MAX_IMAGE_BYTES).
 *
 * @throws RuntimeException when an API call fails or the output file cannot
 *                          be written
 */
function process_image(string $imagePath, string $outputPath, string $apiKey, bool $verbose): bool
{
    $filename = basename($imagePath);

    // filesize() warns on a missing path, so rule that out first.
    $size = is_file($imagePath) ? filesize($imagePath) : false;

    if ($size === false) {
        stderr("File unreadable: {$filename}");
        return false;
    }
    if ($size === 0) {
        stderr("File is empty: {$filename}");
        return false;
    }
    if ($size > MAX_IMAGE_BYTES) {
        stderr("File too large ({$size} bytes): {$filename}");
        return false;
    }

    // Step 1: OCR
    $ocrText = ocr_image($imagePath, $apiKey, $verbose);

    if ($ocrText === '') {
        stderr("No text found in: {$filename}");
        // Still write output so we don't retry repeatedly
    }

    verbose("--- OCR text ---\n{$ocrText}\n---", $verbose);

    // Step 2: Structured extraction (skip if nothing to parse)
    $fields = ['serial_number' => null, 'model_number' => null, 'date' => null];
    if ($ocrText !== '') {
        $fields = extract_fields($ocrText, $apiKey, $verbose);
    }

    // Build and write YAML
    $output = [
        'serial_number' => $fields['serial_number'],
        'model_number'  => $fields['model_number'],
        'date'          => $fields['date'],
        'source_file'   => $filename,
        'processed_at'  => date('Y-m-d H:i:s'),
        'raw_ocr'       => $ocrText !== '' ? $ocrText : null,
    ];

    $yaml = "---\n" . to_yaml($output);

    // A failed write must not be reported as success: the caller would count
    // the image as processed although no output file exists.
    if (file_put_contents($outputPath, $yaml) === false) {
        throw new RuntimeException("Could not write output file: {$outputPath}");
    }

    return true;
}
|
||||
|
||||
// ── Main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
// Make sure the output directory exists before spending API calls; the second
// is_dir() covers the case where it appeared between the check and mkdir().
if (!is_dir(OUTPUT_DIR) && !mkdir(OUTPUT_DIR, 0755, true) && !is_dir(OUTPUT_DIR)) {
    stderr("Could not create output directory: " . OUTPUT_DIR);
    exit(1);
}

// Collect images. Extensions are compared case-insensitively so files such as
// "IMG_0001.JPG" are picked up too, and no GLOB_BRACE support is required.
// Dot-files are skipped, as a "*" glob pattern would skip them.
$images = [];
$entries = is_dir(INPUT_DIR) ? (scandir(INPUT_DIR) ?: []) : [];

foreach ($entries as $entry) {
    if (str_starts_with($entry, '.')) {
        continue;
    }

    $candidate = INPUT_DIR . '/' . $entry;
    $extension = strtolower(pathinfo($entry, PATHINFO_EXTENSION));

    if (is_file($candidate) && in_array($extension, SUPPORTED_EXTENSIONS, true)) {
        $images[] = $candidate;
    }
}

if ($images === []) {
    stdout("No supported images found in " . INPUT_DIR);
    exit(0);
}

stdout(sprintf("Found %d image(s). Starting OCR…\n", count($images)));

$processed = 0;
$skipped   = 0;
$failed    = 0;

foreach ($images as $imagePath) {
    $filename   = basename($imagePath);
    $stem       = pathinfo($filename, PATHINFO_FILENAME);
    $outputPath = OUTPUT_DIR . '/' . $stem . '.yaml';

    // An existing output file marks the image as done unless --force is given.
    if (!$force && file_exists($outputPath)) {
        stdout("SKIP {$filename} (output exists, use --force to re-run)");
        $skipped++;
        continue;
    }

    stdout("PROCESS {$filename}");

    try {
        $ok = process_image($imagePath, $outputPath, $apiKey, $verbose);
        if ($ok) {
            stdout(" → output/{$stem}.yaml");
            $processed++;
        } else {
            $failed++;
        }
    } catch (RuntimeException | JsonException $e) {
        // One bad image (API error, unencodable payload) must not abort the batch.
        stderr($e->getMessage());
        $failed++;
    }
}

stdout(sprintf(
    "\nDone — processed: %d skipped: %d failed: %d",
    $processed,
    $skipped,
    $failed
));
|
||||
Loading…
Add table
Add a link
Reference in a new issue