Code Coverage

| | Lines (%) | Lines (count) | Functions and Methods (%) | Functions and Methods (count) | CRAP | Classes and Traits (%) | Classes and Traits (count) |
|---|---|---|---|---|---|---|---|
| Total | 73.08% | 19 / 26 | 0.00% | 0 / 2 | | 0.00% | 0 / 1 |
| TesseractOcr | 73.08% | 19 / 26 | 0.00% | 0 / 2 | 9.25 | 0.00% | 0 / 1 |
| setBin | 0.00% | 0 / 3 | 0.00% | 0 / 1 | 6 | | |
| parseImage | 82.61% | 19 / 23 | 0.00% | 0 / 1 | 6.19 | | |
1 | <?php |
2 | /** |
3 | * Jingga |
4 | * |
5 | * PHP Version 8.1 |
6 | * |
7 | * @package phpOMS\Ai\Ocr\Tesseract |
8 | * @copyright Dennis Eichhorn |
9 | * @license OMS License 2.0 |
10 | * @version 1.0.0 |
11 | * @link https://jingga.app |
12 | */ |
13 | declare(strict_types=1); |
14 | |
15 | namespace phpOMS\Ai\Ocr\Tesseract; |
16 | |
17 | use phpOMS\System\File\PathException; |
18 | use phpOMS\System\SystemUtils; |
19 | |
/**
 * Thin wrapper around the Tesseract OCR command line binary.
 *
 * The binary location is shared by all instances (static) and can be
 * changed with setBin(). Images are parsed by running the binary and
 * reading back the text file it writes.
 *
 * @package phpOMS\Ai\Ocr\Tesseract
 * @license OMS License 2.0
 * @link    https://jingga.app
 * @since   1.0.0
 */
final class TesseractOcr
{
    /**
     * Tesseract path.
     *
     * @var string
     * @since 1.0.0
     */
    protected static string $bin = '/usr/bin/tesseract';

    /**
     * Set tesseract binary.
     *
     * The path is resolved with realpath() and the resolved path is stored.
     *
     * @param string $path tesseract path
     *
     * @return void
     *
     * @throws PathException This exception is thrown if the binary path doesn't exist
     *
     * @since 1.0.0
     */
    public static function setBin(string $path) : void
    {
        // Resolve only once: a second realpath() call could return false if the
        // file vanished in between, which must never reach the typed property.
        $resolved = \realpath($path);
        if ($resolved === false) {
            throw new PathException($path);
        }

        self::$bin = $resolved;
    }

    /**
     * Parse image
     *
     * Runs the tesseract binary on the image and returns the recognized text.
     * All temporary files created for the run are removed before returning,
     * regardless of whether the run succeeded.
     *
     * @param string   $image     Image path
     * @param string[] $languages Languages to use (joined with '+' for the -l option)
     * @param int      $psm       Page segmentation mode (0 - 13)
     *                            0 Orientation and script detection (OSD) only.
     *                            1 Automatic page segmentation with OSD.
     *                            2 Automatic page segmentation, but no OSD, or OCR.
     *                            3 Fully automatic page segmentation, but no OSD. (Default)
     *                            4 Assume a single column of text of variable sizes.
     *                            5 Assume a single uniform block of vertically aligned text.
     *                            6 Assume a single uniform block of text.
     *                            7 Treat the image as a single text line.
     *                            8 Treat the image as a single word.
     *                            9 Treat the image as a single word in a circle.
     *                            10 Treat the image as a single character.
     *                            11 Sparse text. Find as much text as possible in no particular order.
     *                            12 Sparse text with OSD.
     *                            13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
     * @param int      $oem       OCR engine modes
     *                            0 Legacy engine only.
     *                            1 Neural nets LSTM engine only.
     *                            2 Legacy + LSTM engines.
     *                            3 Default, based on what is available
     *
     * @return string Trimmed text, or an empty string if the temporary file could
     *                not be created, the process failed or no output could be read
     *
     * @since 1.0.0
     */
    public function parseImage(string $image, array $languages = ['eng'], int $psm = 3, int $oem = 3) : string
    {
        $temp = \tempnam(\sys_get_temp_dir(), 'oms_ocr_');
        if ($temp === false) {
            return '';
        }

        // The result is looked up at "<output base>.txt" first and at the
        // output base itself as fallback.
        $output = $temp . '.txt';

        try {
            // NOTE(review): assumes runProc() hands the argument string to a
            // shell, hence every caller supplied value is shell-escaped - confirm.
            SystemUtils::runProc(
                self::$bin,
                \escapeshellarg($image) . ' '
                . \escapeshellarg($temp)
                . ' -c preserve_interword_spaces=1'
                . ' --psm ' . $psm
                . ' --oem ' . $oem
                . ' -l ' . \escapeshellarg(\implode('+', $languages))
            );
        } catch (\Throwable $_) {
            // Best effort: a failed run yields no text, but must not leak files.
            self::removeFiles($temp, $output);

            return '';
        }

        try {
            $filepath = \is_file($output) ? $output : $temp;
            $parsed   = \is_file($filepath) ? \file_get_contents($filepath) : false;
        } finally {
            self::removeFiles($temp, $output);
        }

        return $parsed === false ? '' : \trim($parsed);
    }

    /**
     * Remove the given files if they exist.
     *
     * Paths which are not regular files are skipped so that no warning is
     * raised for files that were never created or are already removed.
     *
     * @param string ...$paths Files to remove
     *
     * @return void
     *
     * @since 1.0.0
     */
    private static function removeFiles(string ...$paths) : void
    {
        foreach ($paths as $path) {
            if (\is_file($path)) {
                \unlink($path);
            }
        }
    }
}