Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
73.08% |
19 / 26 |
|
0.00% |
0 / 2 |
CRAP | |
0.00% |
0 / 1 |
| TesseractOcr | |
73.08% |
19 / 26 |
|
0.00% |
0 / 2 |
9.25 | |
0.00% |
0 / 1 |
| setBin | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| parseImage | |
82.61% |
19 / 23 |
|
0.00% |
0 / 1 |
6.19 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * Jingga |
| 4 | * |
| 5 | * PHP Version 8.1 |
| 6 | * |
| 7 | * @package phpOMS\Ai\Ocr\Tesseract |
| 8 | * @copyright Dennis Eichhorn |
| 9 | * @license OMS License 2.0 |
| 10 | * @version 1.0.0 |
| 11 | * @link https://jingga.app |
| 12 | */ |
| 13 | declare(strict_types=1); |
| 14 | |
| 15 | namespace phpOMS\Ai\Ocr\Tesseract; |
| 16 | |
| 17 | use phpOMS\System\File\PathException; |
| 18 | use phpOMS\System\SystemUtils; |
| 19 | |
/**
 * Tesseract api
 *
 * Thin wrapper around the tesseract command line binary: it runs the binary on an image,
 * lets it write its result to a temporary file and returns the recognized text.
 *
 * @package phpOMS\Ai\Ocr\Tesseract
 * @license OMS License 2.0
 * @link    https://jingga.app
 * @since   1.0.0
 */
final class TesseractOcr
{
    /**
     * Tesseract path.
     *
     * @var string
     * @since 1.0.0
     */
    protected static string $bin = '/usr/bin/tesseract';

    /**
     * Set tesseract binary.
     *
     * The path is resolved with realpath() and the resolved (canonical) path is stored.
     *
     * @param string $path tesseract path
     *
     * @return void
     *
     * @throws PathException This exception is thrown if the binary path doesn't exist
     *
     * @since 1.0.0
     */
    public static function setBin(string $path) : void
    {
        // Resolve once and reuse the result instead of hitting the file system twice.
        $resolved = \realpath($path);
        if ($resolved === false) {
            throw new PathException($path);
        }

        self::$bin = $resolved;
    }

    /**
     * Parse image
     *
     * Runs tesseract on the image and returns the trimmed text it produced. An empty string is
     * returned if the temporary file cannot be created, if running the binary throws or if the
     * result cannot be read. All temporary files created for the run are removed before returning.
     *
     * @param string   $image     Image path
     * @param string[] $languages Languages to use (joined with '+'; if empty, no -l option is passed)
     * @param int      $psm       Page segmentation mode (0 - 13)
     *                            0  Orientation and script detection (OSD) only.
     *                            1  Automatic page segmentation with OSD.
     *                            2  Automatic page segmentation, but no OSD, or OCR.
     *                            3  Fully automatic page segmentation, but no OSD. (Default)
     *                            4  Assume a single column of text of variable sizes.
     *                            5  Assume a single uniform block of vertically aligned text.
     *                            6  Assume a single uniform block of text.
     *                            7  Treat the image as a single text line.
     *                            8  Treat the image as a single word.
     *                            9  Treat the image as a single word in a circle.
     *                            10 Treat the image as a single character.
     *                            11 Sparse text. Find as much text as possible in no particular order.
     *                            12 Sparse text with OSD.
     *                            13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
     * @param int      $oem       OCR engine modes
     *                            0 Legacy engine only.
     *                            1 Neural nets LSTM engine only.
     *                            2 Legacy + LSTM engines.
     *                            3 Default, based on what is available
     *
     * @return string
     *
     * @since 1.0.0
     */
    public function parseImage(string $image, array $languages = ['eng'], int $psm = 3, int $oem = 3) : string
    {
        $temp = \tempnam(\sys_get_temp_dir(), 'oms_ocr_');
        if ($temp === false) {
            return '';
        }

        // The original code looks for the result at "<output base>.txt" first, so track that path as well.
        $output = $temp . '.txt';

        // The arguments are handed over as one command string, therefore every caller supplied
        // value is quoted so that paths containing spaces or shell meta characters stay one argument.
        // NOTE(review): assumes SystemUtils::runProc passes this string to a shell - confirm.
        $arguments = \escapeshellarg($image) . ' '
            . \escapeshellarg($temp)
            . ' -c preserve_interword_spaces=1'
            . ' --psm ' . $psm
            . ' --oem ' . $oem;

        // An empty language list would otherwise leave a dangling "-l" without a value.
        if ($languages !== []) {
            $arguments .= ' -l ' . \escapeshellarg(\implode('+', $languages));
        }

        try {
            SystemUtils::runProc(self::$bin, $arguments);
        } catch (\Throwable $_) {
            // tempnam() already created the placeholder file, don't leave it behind on failure.
            self::removeFiles($output, $temp);

            return '';
        }

        $filepath = \is_file($output) ? $output : $temp;
        $parsed   = \is_file($filepath) ? \file_get_contents($filepath) : false;

        // Remove both candidates exactly once, no matter which one was read or whether reading worked.
        self::removeFiles($output, $temp);

        return $parsed === false ? '' : \trim($parsed);
    }

    /**
     * Remove the given files if they exist.
     *
     * Paths which are not existing regular files are skipped so that no warning is raised for them.
     *
     * @param string ...$files File paths to remove
     *
     * @return void
     *
     * @since 1.0.0
     */
    private static function removeFiles(string ...$files) : void
    {
        foreach ($files as $file) {
            if (\is_file($file)) {
                \unlink($file);
            }
        }
    }
}