Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
96.43% covered (success)
96.43%
27 / 28
50.00% covered (danger)
50.00%
1 / 2
CRAP
0.00% covered (danger)
0.00%
0 / 1
NgramParser
96.43% covered (success)
96.43%
27 / 28
50.00% covered (danger)
50.00%
1 / 2
12
0.00% covered (danger)
0.00%
0 / 1
 tokenize
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 getNgrams
96.00% covered (success)
96.00%
24 / 25
0.00% covered (danger)
0.00%
0 / 1
10
1<?php
2/**
3 * Jingga
4 *
5 * PHP Version 8.1
6 *
7 * @package   phpOMS\Localization\LanguageDetection
8 * @author    Patrick Schur <patrick_schur@outlook.de>
9 * @copyright Patrick Schur
10 * @license   https://opensource.org/licenses/mit-license.html MIT
11 * @link      https://github.com/patrickschur/language-detection
12 */
13declare(strict_types=1);
14
15namespace phpOMS\Localization\LanguageDetection;
16
17use phpOMS\Localization\LanguageDetection\Tokenizer\WhitespaceTokenizer;
18
/**
 * Ngram parser class
 *
 * Splits a string into tokens and turns those tokens into a list of character
 * n-grams ranked by their relative frequency.
 *
 * @package phpOMS\Localization\LanguageDetection
 * @license https://opensource.org/licenses/mit-license.html MIT
 * @link    https://github.com/patrickschur/language-detection
 * @since   1.0.0
 */
abstract class NgramParser
{
    /**
     * Minimum length
     *
     * Smallest n-gram size (in characters) that is extracted from a token.
     *
     * @var int
     * @since 1.0.0
     */
    public int $minLength = 1;

    /**
     * Maximum length
     *
     * Largest n-gram size (in characters) that is extracted from a token.
     *
     * @var int
     * @since 1.0.0
     */
    public int $maxLength = 3;

    /**
     * Maximum amount of ngrams
     *
     * Upper bound for the number of n-grams returned by getNgrams().
     *
     * @var int
     * @since 1.0.0
     */
    public int $maxNgrams = 310;

    /**
     * Tokenizer to use
     *
     * If left null a WhitespaceTokenizer is created on first use.
     *
     * @var null|WhitespaceTokenizer
     * @since 1.0.0
     */
    public ?WhitespaceTokenizer $tokenizer = null;

    /**
     * Tokenize string
     *
     * Lazily creates the default tokenizer when none has been assigned and
     * delegates the splitting to it.
     *
     * @param string $str String to tokenize
     *
     * @return array Tokens as returned by the tokenizer
     *
     * @since 1.0.0
     */
    private function tokenize(string $str) : array
    {
        $this->tokenizer ??= new WhitespaceTokenizer();

        return $this->tokenizer->tokenize($str);
    }

    /**
     * Get ngrams
     *
     * Counts every n-gram with a size between $minLength and $maxLength in
     * every token of the string, converts the counts into relative frequencies
     * per n-gram size and returns the n-grams ordered by descending frequency.
     *
     * @param string $str String to parse
     *
     * @return array Ngrams ordered by descending relative frequency (at most $maxNgrams elements)
     *
     * @since 1.0.0
     */
    protected function getNgrams(string $str) : array
    {
        // Occurrences grouped by n-gram size: [size => [ngram => count]]
        $counts = [];

        foreach ($this->tokenize($str) as $word) {
            $length = \mb_strlen($word);

            for ($size = $this->minLength; $size <= $this->maxLength; ++$size) {
                for ($offset = 0; $offset + $size <= $length; ++$offset) {
                    $ngram = \mb_substr($word, $offset, $size);

                    // Plain increment instead of a reference variable: a reference that
                    // outlives the word would be overwritten when it is reset for the
                    // next word and thereby wipe the count of this word's last n-gram.
                    $counts[$size][$ngram] = ($counts[$size][$ngram] ?? 0) + 1;
                }
            }
        }

        if ($counts === []) {
            return [];
        }

        // Normalize per size. Every stored count is >= 1, hence the sum is never 0.
        foreach ($counts as $size => $bucket) {
            $total = \array_sum($bucket);

            foreach ($bucket as $ngram => $count) {
                $counts[$size][$ngram] = $count / $total;
            }
        }

        $frequencies = \array_merge(...$counts);

        // A lone underscore is not a meaningful n-gram
        // NOTE(review): presumably the tokenizer's word boundary marker - confirm against WhitespaceTokenizer
        unset($frequencies['_']);

        \arsort($frequencies, \SORT_NUMERIC);

        return \array_slice(
            \array_keys($frequencies),
            0,
            $this->maxNgrams
        );
    }
}