'image/jpeg', 'png' => 'image/png', 'webp' => 'image/webp', 'gif' => 'image/gif', default => 'image/jpeg', }; }

/**
 * Minimal YAML serialiser — handles the flat structure we produce.
 *
 * Supports: null, bool, int, float, nested arrays (emitted as mappings, one
 * indent step per level), single-line strings, and multi-line strings
 * (literal block scalar). Keys are written verbatim and are NOT quoted or
 * escaped, so callers must pass keys that are already valid plain YAML keys.
 *
 * Single-line strings are double-quoted whenever a YAML parser could read the
 * bare text as something other than the same string: text containing
 * indicator characters, text with leading/trailing whitespace, text starting
 * with "-", "?" or "*", and text that looks like null, a boolean, a number or
 * a date (e.g. a serial number "00123" or a date "2023-01-15").
 *
 * @param array $data  Key/value pairs to serialise.
 * @param int   $depth Current nesting level; controls the indentation prefix.
 *
 * @return string YAML text, one "key: value" entry per line, newline-terminated.
 */
function to_yaml(array $data, int $depth = 0): string
{
    // Plain scalars that YAML parsers resolve to a non-string type.
    $retypedPattern = '/\A(?:'
        . '~|null|true|false|yes|no|on|off|y|n'                       // null and boolean words
        . '|[-+]?(?:0x[0-9a-f_]+|0o[0-7_]+|0b[01_]+)'                 // hex / octal / binary integers
        . '|[-+]?(?:\d[\d_]*(?:\.[\d_]*)?|\.\d[\d_]*)(?:e[-+]?\d+)?'  // decimal integers and floats
        . '|[-+]?\.(?:inf|nan)'                                       // non-finite floats
        . '|\d{4}-\d{1,2}-\d{1,2}'                                    // bare ISO dates
        . ')\z/i';

    $out = '';
    $pad = str_repeat(' ', $depth);

    foreach ($data as $key => $value) {
        if ($value === null) {
            $out .= "{$pad}{$key}: null\n";
            continue;
        }

        if (is_bool($value)) {
            $out .= "{$pad}{$key}: " . ($value ? 'true' : 'false') . "\n";
            continue;
        }

        if (is_int($value) || is_float($value)) {
            $out .= "{$pad}{$key}: {$value}\n";
            continue;
        }

        if (is_array($value)) {
            // A bare "key:" with nothing beneath it parses as null, so an
            // empty array is written as an explicit empty collection.
            $out .= $value === []
                ? "{$pad}{$key}: []\n"
                : "{$pad}{$key}:\n" . to_yaml($value, $depth + 1);
            continue;
        }

        $str = (string) $value;

        // Multi-line → YAML literal block scalar
        if (str_contains($str, "\n")) {
            $childPad = str_repeat(' ', $depth + 1);
            $indented = $childPad . implode("\n{$childPad}", explode("\n", rtrim($str)));
            $out .= "{$pad}{$key}: |\n{$indented}\n";
            continue;
        }

        // Single-line — quote whenever the bare text would be ambiguous.
        $mustQuote = $str === ''
            || preg_match('/[:#\[\]{}|>&!\'"%@`,]|^\s|\s$/', $str) === 1
            || preg_match('/\A[-?*]/', $str) === 1
            || preg_match($retypedPattern, $str) === 1;

        if ($mustQuote) {
            $escaped = str_replace(['\\', '"'], ['\\\\', '\\"'], $str);
            $out .= "{$pad}{$key}: \"{$escaped}\"\n";
            continue;
        }

        $out .= "{$pad}{$key}: {$str}\n";
    }

    return $out;
}

// ── Mistral API ───────────────────────────────────────────────────────────────

/**
 * Generic JSON POST to the Mistral REST API.
 *
 * @param string $endpoint Path appended to MISTRAL_BASE_URL.
 * @param array  $payload  Request body; JSON-encoded before sending.
 * @param string $apiKey   Bearer token sent in the Authorization header.
 * @param bool   $verbose  Forwarded to verbose() for request logging.
 *
 * @return array The decoded JSON response body.
 *
 * @throws RuntimeException on encoding failure, network error, non-200
 *                          response, or a response body that is not a JSON
 *                          array/object
 */
function mistral_post(string $endpoint, array $payload, string $apiKey, bool $verbose): array
{
    $url = MISTRAL_BASE_URL . $endpoint;

    try {
        $body = json_encode($payload, JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR);
    } catch (JsonException $e) {
        // JsonException is not a RuntimeException; convert it so the
        // documented contract (and callers catching RuntimeException) holds.
        throw new RuntimeException("Could not encode request payload: {$e->getMessage()}", 0, $e);
    }

    verbose("POST {$url}", $verbose);

    $ch = curl_init($url);
    if ($ch === false) {
        throw new RuntimeException("cURL error: could not initialise a handle for {$url}");
    }

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $body,
        CURLOPT_HTTPHEADER     => [
            'Authorization: Bearer ' . $apiKey,
            'Content-Type: application/json',
            'Accept: application/json',
        ],
        CURLOPT_TIMEOUT        => 120,
        CURLOPT_CONNECTTIMEOUT => 15,
    ]);

    $response  = curl_exec($ch);
    $httpCode  = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $curlError = curl_error($ch);
    // curl_close() has been a no-op since PHP 8.0; dropping the reference
    // is what actually releases the handle.
    unset($ch);

    if ($response === false || $curlError !== '') {
        $reason = $curlError !== '' ? $curlError : 'request failed without an error message';
        throw new RuntimeException("cURL error: {$reason}");
    }

    if ($httpCode !== 200) {
        $decoded = json_decode((string) $response, true);
        $msg     = $decoded['message'] ?? $decoded['error']['message'] ?? (string) $response;

        // The error "message" may itself be structured (array/object);
        // render it as JSON instead of letting PHP print "Array".
        if (!is_string($msg)) {
            $encoded = json_encode($msg, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
            $msg     = $encoded !== false ? $encoded : (string) $response;
        }

        throw new RuntimeException("Mistral API {$httpCode}: {$msg}");
    }

    $decoded = json_decode((string) $response, true);
    if (!is_array($decoded)) {
        throw new RuntimeException("Non-JSON response from Mistral API");
    }

    return $decoded;
}

/**
 * Step 1 — Send the image to mistral-ocr-latest and get markdown text back.
 *
 * The file is read, base64-encoded, and posted to the /ocr endpoint as a
 * data: URL; the "markdown" of every returned page is joined with newlines.
 *
 * @param string $imagePath Path of the image file to upload.
 * @param string $apiKey    Mistral API key.
 * @param bool   $verbose   Forwarded to verbose() and mistral_post().
 *
 * @return string Trimmed OCR text; '' when the response carries no page text.
 *
 * @throws RuntimeException when the file cannot be read or the API call fails
 */
function ocr_image(string $imagePath, string $apiKey, bool $verbose): string
{
    $mime = mime_for($imagePath);

    // Fail loudly on an unreadable file rather than uploading an empty image.
    $raw = file_get_contents($imagePath);
    if ($raw === false) {
        throw new RuntimeException("Could not read image file: {$imagePath}");
    }
    $imageData = base64_encode($raw);

    verbose("OCR model: " . OCR_MODEL, $verbose);

    $result = mistral_post('/ocr', [
        'model'    => OCR_MODEL,
        'document' => [
            'type'      => 'image_url',
            'image_url' => "data:{$mime};base64,{$imageData}",
        ],
    ], $apiKey, $verbose);

    $pages = $result['pages'] ?? [];
    if (!is_array($pages)) {
        $pages = [];
    }

    $text = '';
    foreach ($pages as $page) {
        $markdown = is_array($page) ? ($page['markdown'] ?? '') : '';
        $text    .= (is_string($markdown) ? $markdown : '') . "\n";
    }

    return trim($text);
}

/**
 * Step 2 — Extract Serial Number, Model Number, Date from raw OCR text
 * using a chat model with JSON response mode.
 */
function extract_fields(string $ocrText, string $apiKey, bool $verbose): array
{
    verbose("Extraction model: " . CHAT_MODEL, $verbose);

    $system = 'You are a precision industrial part-label parser. ' .
        'Extract structured fields from OCR text. ' .
'Return ONLY valid JSON — no explanation, no markdown fences.';

    // NOTE(review): the statement below is damaged in this copy of the file.
    // The heredoc body assigned to $user and the opening of the call that
    // assigns $result appear to be missing between "<<" and "CHAT_MODEL".
    // It is left exactly as found — restore it from version control.
    $user = << CHAT_MODEL,
        'messages' => [
            ['role' => 'system', 'content' => $system],
            ['role' => 'user', 'content' => $user],
        ],
        'response_format' => ['type' => 'json_object'],
        'temperature' => 0.0,
    ], $apiKey, $verbose);

    $content = $result['choices'][0]['message']['content'] ?? '{}';
    $fields = json_decode($content, true);
    if (!is_array($fields)) {
        stderr("Could not parse extraction response: {$content}");
        $fields = [];
    }

    return [
        'serial_number' => isset($fields['serial_number']) ? (string) $fields['serial_number'] : null,
        'model_number' => isset($fields['model_number']) ? (string) $fields['model_number'] : null,
        'date' => isset($fields['date']) ? (string) $fields['date'] : null,
    ];
}

// ── Image processing ──────────────────────────────────────────────────────────

/**
 * Run OCR and field extraction for one image and write the result as YAML.
 *
 * @param string $imagePath  Image to process.
 * @param string $outputPath Destination of the YAML document.
 * @param string $apiKey     Mistral API key.
 * @param bool   $verbose    Enables diagnostic output via verbose().
 *
 * @return bool true when the YAML file was written; false when the image was
 *              rejected (unreadable / over MAX_IMAGE_BYTES) or the output
 *              file could not be written. The reason is reported via stderr().
 *
 * @throws RuntimeException propagated from ocr_image() / extract_fields()
 */
function process_image(string $imagePath, string $outputPath, string $apiKey, bool $verbose): bool
{
    $filename = basename($imagePath);

    $size = filesize($imagePath);
    if ($size === false) {
        // filesize() failed, so there is no byte count to report.
        stderr("File too large or unreadable (unknown size): {$filename}");
        return false;
    }
    if ($size > MAX_IMAGE_BYTES) {
        stderr("File too large or unreadable ({$size} bytes): {$filename}");
        return false;
    }

    // Step 1: OCR
    $ocrText = ocr_image($imagePath, $apiKey, $verbose);
    if ($ocrText === '') {
        stderr("No text found in: {$filename}");
        // Still write output so we don't retry repeatedly
    }

    verbose("--- OCR text ---\n{$ocrText}\n---", $verbose);

    // Step 2: Structured extraction (skip if nothing to parse)
    $fields = ['serial_number' => null, 'model_number' => null, 'date' => null];
    if ($ocrText !== '') {
        $fields = extract_fields($ocrText, $apiKey, $verbose);
    }

    // Build and write YAML
    $output = [
        'serial_number' => $fields['serial_number'],
        'model_number'  => $fields['model_number'],
        'date'          => $fields['date'],
        'source_file'   => $filename,
        'processed_at'  => date('Y-m-d H:i:s'),
        'raw_ocr'       => $ocrText !== '' ? $ocrText : null,
    ];

    // A failed write must not be reported as success: the caller would count
    // the image as processed although no output file exists.
    if (file_put_contents($outputPath, "---\n" . to_yaml($output)) === false) {
        stderr("Could not write output file: {$outputPath}");
        return false;
    }

    return true;
}

// ── Main ──────────────────────────────────────────────────────────────────────

// Create the output directory; the trailing is_dir() covers the case where
// another process created it between the first check and mkdir().
if (!is_dir(OUTPUT_DIR) && !mkdir(OUTPUT_DIR, 0755, true) && !is_dir(OUTPUT_DIR)) {
    stderr("Could not create output directory: " . OUTPUT_DIR);
    exit(1);
}

// Collect images
$pattern = INPUT_DIR . '/*.{' . implode(',', SUPPORTED_EXTENSIONS) . '}';
$images  = glob($pattern, GLOB_BRACE) ?: [];

if ($images === []) {
    stdout("No supported images found in " . INPUT_DIR);
    exit(0);
}

stdout(sprintf("Found %d image(s). Starting OCR…\n", count($images)));

$processed = 0;
$skipped   = 0;
$failed    = 0;

foreach ($images as $imagePath) {
    $filename = basename($imagePath);
    // The output name is derived from the stem only, so two inputs that
    // differ only by extension map to the same YAML file.
    $stem       = pathinfo($filename, PATHINFO_FILENAME);
    $outputPath = OUTPUT_DIR . '/' . $stem . '.yaml';

    if (!$force && file_exists($outputPath)) {
        stdout("SKIP {$filename} (output exists, use --force to re-run)");
        $skipped++;
        continue;
    }

    stdout("PROCESS {$filename}");

    try {
        if (process_image($imagePath, $outputPath, $apiKey, $verbose)) {
            stdout(" → output/{$stem}.yaml");
            $processed++;
        } else {
            $failed++;
        }
    } catch (RuntimeException | JsonException $e) {
        // JsonException (from JSON_THROW_ON_ERROR encoding) is not a
        // RuntimeException; catch it too so one bad image cannot abort the batch.
        stderr($e->getMessage());
        $failed++;
    }
}

stdout(sprintf(
    "\nDone — processed: %d skipped: %d failed: %d",
    $processed,
    $skipped,
    $failed
));