Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
96.43% |
27 / 28 |
|
50.00% |
1 / 2 |
CRAP | |
0.00% |
0 / 1 |
NgramParser | |
96.43% |
27 / 28 |
|
50.00% |
1 / 2 |
12 | |
0.00% |
0 / 1 |
tokenize | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getNgrams | |
96.00% |
24 / 25 |
|
0.00% |
0 / 1 |
10 |
1 | <?php |
2 | /** |
3 | * Jingga |
4 | * |
5 | * PHP Version 8.1 |
6 | * |
7 | * @package phpOMS\Localization\LanguageDetection |
8 | * @author Patrick Schur <patrick_schur@outlook.de> |
9 | * @copyright Patrick Schur |
10 | * @license https://opensource.org/licenses/mit-license.html MIT |
11 | * @link https://github.com/patrickschur/language-detection |
12 | */ |
13 | declare(strict_types=1); |
14 | |
15 | namespace phpOMS\Localization\LanguageDetection; |
16 | |
17 | use phpOMS\Localization\LanguageDetection\Tokenizer\WhitespaceTokenizer; |
18 | |
/**
 * Ngram parser class
 *
 * Splits a string into tokens and ranks the character n-grams found in those tokens
 * by their relative frequency.
 *
 * @package phpOMS\Localization\LanguageDetection
 * @license https://opensource.org/licenses/mit-license.html MIT
 * @link    https://github.com/patrickschur/language-detection
 * @since   1.0.0
 */
abstract class NgramParser
{
    /**
     * Minimum length
     *
     * Shortest n-gram size (in characters) that is extracted.
     *
     * @var int
     * @since 1.0.0
     */
    public int $minLength = 1;

    /**
     * Maximum length
     *
     * Longest n-gram size (in characters) that is extracted.
     *
     * @var int
     * @since 1.0.0
     */
    public int $maxLength = 3;

    /**
     * Maximum amount of ngrams
     *
     * Upper bound for the number of n-grams returned by getNgrams().
     *
     * @var int
     * @since 1.0.0
     */
    public int $maxNgrams = 310;

    /**
     * Tokenizer to use
     *
     * Created lazily on the first call to tokenize() when left as null.
     *
     * @var null|WhitespaceTokenizer
     * @since 1.0.0
     */
    public ?WhitespaceTokenizer $tokenizer = null;

    /**
     * Tokenize string
     *
     * Falls back to a default WhitespaceTokenizer if no tokenizer has been assigned.
     *
     * @param string $str String to tokenize
     *
     * @return array Tokens as produced by the tokenizer
     *
     * @since 1.0.0
     */
    private function tokenize(string $str) : array
    {
        $this->tokenizer ??= new WhitespaceTokenizer();

        return $this->tokenizer->tokenize($str);
    }

    /**
     * Get ngrams
     *
     * Counts every n-gram of length minLength..maxLength in each token, converts the
     * counts into relative frequencies per n-gram length and returns the n-grams
     * ordered by descending frequency, limited to maxNgrams entries.
     *
     * @param string $str String to parse
     *
     * @return array N-grams ordered from most to least frequent (empty if nothing could be extracted)
     *
     * @since 1.0.0
     */
    protected function getNgrams(string $str) : array
    {
        $tokens = [];

        foreach ($this->tokenize($str) as $word) {
            $length = \mb_strlen($word);

            for ($n = $this->minLength; $n <= $this->maxLength; ++$n) {
                for ($offset = 0; ($n + $offset - 1) < $length; ++$offset) {
                    $ngram = \mb_substr($word, $offset, $n);

                    // Count by value. The previous implementation incremented through a
                    // reference that stayed bound across words, so re-initialising it for
                    // the next word silently reset the last counted n-gram to 0.
                    $tokens[$n][$ngram] = ($tokens[$n][$ngram] ?? 0) + 1;
                }
            }
        }

        if (empty($tokens)) {
            return [];
        }

        // Normalize the counts per n-gram length so that the values become relative frequencies
        foreach ($tokens as $n => $counts) {
            $sum = \array_sum($counts);

            foreach ($counts as $ngram => $count) {
                /** @phpstan-ignore-next-line */
                $tokens[$n][$ngram] = $sum === 0 ? 0 : $count / $sum;
            }
        }

        // Flatten the per-length buckets into a single ngram => frequency map
        $tokens = \array_merge(...$tokens);

        // The bare '_' unigram is never reported
        // NOTE(review): presumably '_' is the word boundary marker added by the tokenizer - confirm
        unset($tokens['_']);

        \arsort($tokens, \SORT_NUMERIC);

        return \array_slice(
            \array_keys($tokens),
            0,
            $this->maxNgrams
        );
    }
}