Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
96.43% |
27 / 28 |
|
50.00% |
1 / 2 |
CRAP | |
0.00% |
0 / 1 |
| NgramParser | |
96.43% |
27 / 28 |
|
50.00% |
1 / 2 |
12 | |
0.00% |
0 / 1 |
| tokenize | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
| getNgrams | |
96.00% |
24 / 25 |
|
0.00% |
0 / 1 |
10 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * Jingga |
| 4 | * |
| 5 | * PHP Version 8.1 |
| 6 | * |
| 7 | * @package phpOMS\Localization\LanguageDetection |
| 8 | * @author Patrick Schur <patrick_schur@outlook.de> |
| 9 | * @copyright Patrick Schur |
| 10 | * @license https://opensource.org/licenses/mit-license.html MIT |
| 11 | * @link https://github.com/patrickschur/language-detection |
| 12 | */ |
| 13 | declare(strict_types=1); |
| 14 | |
| 15 | namespace phpOMS\Localization\LanguageDetection; |
| 16 | |
| 17 | use phpOMS\Localization\LanguageDetection\Tokenizer\WhitespaceTokenizer; |
| 18 | |
| 19 | /** |
| 20 | * Ngram parser class |
| 21 | * |
| 22 | * @package phpOMS\Localization\LanguageDetection |
| 23 | * @license https://opensource.org/licenses/mit-license.html MIT |
| 24 | * @link https://github.com/patrickschur/language-detection |
| 25 | * @since 1.0.0 |
| 26 | */ |
abstract class NgramParser
{
    /**
     * Minimum n-gram length (in characters)
     *
     * @var int
     * @since 1.0.0
     */
    public int $minLength = 1;

    /**
     * Maximum n-gram length (in characters)
     *
     * @var int
     * @since 1.0.0
     */
    public int $maxLength = 3;

    /**
     * Maximum amount of ngrams returned by getNgrams()
     *
     * @var int
     * @since 1.0.0
     */
    public int $maxNgrams = 310;

    /**
     * Tokenizer to use
     *
     * When left null a WhitespaceTokenizer is created on first use.
     *
     * @var null|WhitespaceTokenizer
     * @since 1.0.0
     */
    public ?WhitespaceTokenizer $tokenizer = null;

    /**
     * Tokenize string
     *
     * Lazily creates the default tokenizer if none has been assigned and
     * delegates the splitting to it.
     *
     * @param string $str String to tokenize
     *
     * @return array Words as returned by the tokenizer
     *
     * @since 1.0.0
     */
    private function tokenize(string $str) : array
    {
        $this->tokenizer ??= new WhitespaceTokenizer();

        return $this->tokenizer->tokenize($str);
    }

    /**
     * Get ngrams
     *
     * Counts every n-gram of length minLength..maxLength in each tokenized word,
     * converts the counts to relative frequencies per n-gram length, and returns
     * the n-grams ordered by descending frequency, limited to maxNgrams entries.
     *
     * @param string $str String to parse
     *
     * @return array N-grams ordered from most to least frequent
     *
     * @since 1.0.0
     */
    protected function getNgrams(string $str) : array
    {
        // $tokens[<ngram length>][<ngram>] = number of occurrences
        $tokens = [];

        foreach ($this->tokenize($str) as $word) {
            $length = \mb_strlen($word);

            for ($size = $this->minLength; $size <= $this->maxLength; ++$size) {
                for ($offset = 0; ($size + $offset - 1) < $length; ++$offset) {
                    $ngram = \mb_substr($word, $offset, $size);

                    // Plain increment on purpose: counting through a reference variable
                    // that outlives the word loop would let a later re-initialisation
                    // write through the reference and zero an already counted n-gram.
                    $tokens[$size][$ngram] = ($tokens[$size][$ngram] ?? 0) + 1;
                }
            }
        }

        if ($tokens === []) {
            return [];
        }

        // Normalise per n-gram length so lengths with many occurrences don't dominate.
        // A bucket only exists once something was counted in it, so $sum is never 0.
        foreach ($tokens as $size => $counts) {
            $sum = \array_sum($counts);

            foreach ($counts as $ngram => $count) {
                $tokens[$size][$ngram] = $count / $sum;
            }
        }

        // Flatten the per-length buckets into one ngram => frequency map
        $tokens = \array_merge(...$tokens);

        // A lone "_" is not a useful n-gram (presumably the tokenizer's word
        // boundary marker - confirm against WhitespaceTokenizer)
        unset($tokens['_']);

        \arsort($tokens, \SORT_NUMERIC);

        return \array_slice(
            \array_keys($tokens),
            0,
            $this->maxNgrams
        );
    }
}