Code Coverage for /var/www/html/dev/src/phpOMS/Ai/Ocr/Tesseract/TesseractOcr.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	73.08% covered (warning)	73.08%	19 / 26	0.00% covered (danger)	0.00%	0 / 2	CRAP	0.00% covered (danger)	0.00%	0 / 1
TesseractOcr	73.08% covered (warning)	73.08%	19 / 26	0.00% covered (danger)	0.00%	0 / 2	9.25	0.00% covered (danger)	0.00%	0 / 1
setBin	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	6
parseImage	82.61% covered (warning)	82.61%	19 / 23	0.00% covered (danger)	0.00%	0 / 1	6.19

1	<?php
2	/**
3	* Jingga
4	*
5	* PHP Version 8.1
6	*
7	* @package phpOMS\Ai\Ocr\Tesseract
8	* @copyright Dennis Eichhorn
9	* @license OMS License 2.0
10	* @version 1.0.0
11	* @link https://jingga.app
12	*/
13	declare(strict_types=1);
14
15	namespace phpOMS\Ai\Ocr\Tesseract;
16
17	use phpOMS\System\File\PathException;
18	use phpOMS\System\SystemUtils;
19
20	/**
21	* Tesseract api
22	*
23	* @package phpOMS\Ai\Ocr\Tesseract
24	* @license OMS License 2.0
25	* @link https://jingga.app
26	* @since 1.0.0
27	*/
final class TesseractOcr
{
    /**
     * Tesseract path.
     *
     * @var string
     * @since 1.0.0
     */
    protected static string $bin = '/usr/bin/tesseract';

    /**
     * Set tesseract binary.
     *
     * The path is canonicalized with realpath() and the canonical form is stored.
     *
     * @param string $path tesseract path
     *
     * @return void
     *
     * @throws PathException This exception is thrown if the binary path doesn't exist
     *
     * @since 1.0.0
     */
    public static function setBin(string $path) : void
    {
        // Resolve once: a second realpath() call could fail after the first one succeeded
        // and would then try to assign false to the string property.
        $resolved = \realpath($path);
        if ($resolved === false) {
            throw new PathException($path);
        }

        self::$bin = $resolved;
    }

    /**
     * Parse image
     *
     * Runs the configured tesseract binary on the image, reads the text it wrote to a
     * temporary output file and returns that text trimmed. All temporary files are removed
     * before returning, on success and on failure.
     *
     * @param string   $image     Image path
     * @param string[] $languages Languages to use (joined with '+'); if empty no -l flag is passed
     * @param int      $psm       Page segmentation mode (0 - 13)
     *                            0 Orientation and script detection (OSD) only.
     *                            1 Automatic page segmentation with OSD.
     *                            2 Automatic page segmentation, but no OSD, or OCR.
     *                            3 Fully automatic page segmentation, but no OSD. (Default)
     *                            4 Assume a single column of text of variable sizes.
     *                            5 Assume a single uniform block of vertically aligned text.
     *                            6 Assume a single uniform block of text.
     *                            7 Treat the image as a single text line.
     *                            8 Treat the image as a single word.
     *                            9 Treat the image as a single word in a circle.
     *                            10 Treat the image as a single character.
     *                            11 Sparse text. Find as much text as possible in no particular order.
     *                            12 Sparse text with OSD.
     *                            13 Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
     * @param int      $oem       OCR engine modes
     *                            0 Legacy engine only.
     *                            1 Neural nets LSTM engine only.
     *                            2 Legacy + LSTM engines.
     *                            3 Default, based on what is available
     *
     * @return string Trimmed recognized text, or an empty string if any step failed
     *
     * @since 1.0.0
     */
    public function parseImage(string $image, array $languages = ['eng'], int $psm = 3, int $oem = 3) : string
    {
        // tempnam() creates an empty placeholder file which must be removed on every exit path.
        $temp = \tempnam(\sys_get_temp_dir(), 'oms_ocr_');
        if ($temp === false) {
            return '';
        }

        // The result is looked up at "<base>.txt" first, with the base path itself as fallback.
        $output = $temp . '.txt';

        try {
            // The arguments are handed over as one string, so every caller supplied value is
            // quoted; otherwise paths containing spaces or shell metacharacters break the call.
            // NOTE(review): assumes runProc passes this string to a shell unmodified - confirm.
            $arguments = \escapeshellarg($image) . ' '
                . \escapeshellarg($temp)
                . ' -c preserve_interword_spaces=1'
                . ' --psm ' . $psm
                . ' --oem ' . $oem;

            // Avoid a dangling "-l" flag when no language is specified.
            if ($languages !== []) {
                $arguments .= ' -l ' . \escapeshellarg(\implode('+', $languages));
            }

            try {
                SystemUtils::runProc(self::$bin, $arguments);
            } catch (\Throwable $_) {
                // Best effort: a failed OCR run is reported as "no text found".
                return '';
            }

            $filepath = \is_file($output) ? $output : $temp;

            if (!\is_file($filepath)) {
                // @codeCoverageIgnoreStart
                return '';
                // @codeCoverageIgnoreEnd
            }

            $parsed = \file_get_contents($filepath);
            if ($parsed === false) {
                // @codeCoverageIgnoreStart
                return '';
                // @codeCoverageIgnoreEnd
            }

            return \trim($parsed);
        } finally {
            // Remove only files that still exist, so unlink() never warns about a missing
            // or already deleted file.
            foreach ([$output, $temp] as $file) {
                if (\is_file($file)) {
                    \unlink($file);
                }
            }
        }
    }
}