Code Coverage for /var/www/html/dev/src/phpOMS/Localization/LanguageDetection/NgramParser.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	96.43% covered (success)	96.43%	27 / 28	50.00% covered (danger)	50.00%	1 / 2	CRAP	0.00% covered (danger)	0.00%	0 / 1
NgramParser	96.43% covered (success)	96.43%	27 / 28	50.00% covered (danger)	50.00%	1 / 2	12	0.00% covered (danger)	0.00%	0 / 1
tokenize	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
getNgrams	96.00% covered (success)	96.00%	24 / 25	0.00% covered (danger)	0.00%	0 / 1	10

1	<?php
2	/**
3	* Jingga
4	*
5	* PHP Version 8.1
6	*
7	* @package phpOMS\Localization\LanguageDetection
8	* @author Patrick Schur <patrick_schur@outlook.de>
9	* @copyright Patrick Schur
10	* @license https://opensource.org/licenses/mit-license.html MIT
11	* @link https://github.com/patrickschur/language-detection
12	*/
13	declare(strict_types=1);
14
15	namespace phpOMS\Localization\LanguageDetection;
16
17	use phpOMS\Localization\LanguageDetection\Tokenizer\WhitespaceTokenizer;
18
19	/**
20	* Ngram parser class
21	*
22	* @package phpOMS\Localization\LanguageDetection
23	* @license https://opensource.org/licenses/mit-license.html MIT
24	* @link https://github.com/patrickschur/language-detection
25	* @since 1.0.0
26	*/
abstract class NgramParser
{
    /**
     * Minimum n-gram length (in characters).
     *
     * @var int
     * @since 1.0.0
     */
    public int $minLength = 1;

    /**
     * Maximum n-gram length (in characters).
     *
     * @var int
     * @since 1.0.0
     */
    public int $maxLength = 3;

    /**
     * Maximum amount of n-grams returned by getNgrams().
     *
     * @var int
     * @since 1.0.0
     */
    public int $maxNgrams = 310;

    /**
     * Tokenizer to use. Created lazily on first use if left null.
     *
     * @var null|WhitespaceTokenizer
     * @since 1.0.0
     */
    public ?WhitespaceTokenizer $tokenizer = null;

    /**
     * Split a string into words using the configured tokenizer.
     *
     * Falls back to a new WhitespaceTokenizer when no tokenizer has been set.
     *
     * @param string $str String to tokenize
     *
     * @return array Words as returned by the tokenizer
     *
     * @since 1.0.0
     */
    private function tokenize(string $str) : array
    {
        $this->tokenizer ??= new WhitespaceTokenizer();

        return $this->tokenizer->tokenize($str);
    }

    /**
     * Get the most frequent n-grams of a string.
     *
     * Every word of the tokenized string is cut into all n-grams with a length
     * between $minLength and $maxLength. The occurrences are counted per length,
     * converted to relative frequencies within that length, merged, and sorted
     * by descending frequency. The single-character n-gram '_' is discarded.
     *
     * @param string $str String to parse
     *
     * @return array N-grams ordered by descending relative frequency, at most $maxNgrams entries
     *
     * @since 1.0.0
     */
    protected function getNgrams(string $str) : array
    {
        $tokens = [];
        $words  = $this->tokenize($str);

        foreach ($words as $word) {
            $length = \mb_strlen($word);

            for ($i = $this->minLength; $i <= $this->maxLength; ++$i) {
                for ($j = 0; ($i + $j - 1) < $length; ++$j) {
                    $ngram = \mb_substr($word, $j, $i);

                    // Count directly instead of through a reference variable: a reference
                    // that outlives a word would let a later plain assignment overwrite
                    // the counter of the last n-gram of the previous word.
                    $tokens[$i][$ngram] = ($tokens[$i][$ngram] ?? 0) + 1;
                }
            }
        }

        if (empty($tokens)) {
            return [];
        }

        // Normalize the counts to relative frequencies per n-gram length.
        // Every stored count is >= 1, hence the sum is never 0.
        foreach ($tokens as $i => $token) {
            $sum = \array_sum($token);

            foreach ($token as $ngram => $count) {
                $tokens[$i][$ngram] = $count / $sum;
            }
        }

        $tokens = \array_merge(...$tokens);

        // The lone '_' n-gram is dropped from the result; unset() is a no-op if it is absent.
        unset($tokens['_']);

        \arsort($tokens, \SORT_NUMERIC);

        return \array_slice(
            \array_keys($tokens),
            0,
            $this->maxNgrams
        );
    }
}