Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
97.56% |
80 / 82 |
|
33.33% |
1 / 3 |
CRAP | |
0.00% |
0 / 1 |
NaiveBayesClassifier | |
97.56% |
80 / 82 |
|
33.33% |
1 / 3 |
27 | |
0.00% |
0 / 1 |
train | |
100.00% |
34 / 34 |
|
100.00% |
1 / 1 |
11 | |||
matchCriteria | |
95.45% |
21 / 22 |
|
0.00% |
0 / 1 |
7 | |||
preCalculateProbabilities | |
96.15% |
25 / 26 |
|
0.00% |
0 / 1 |
9 |
1 | <?php |
2 | /** |
3 | * Jingga |
4 | * |
5 | * PHP Version 8.1 |
6 | * |
7 | * @package phpOMS\Math\Stochastic |
8 | * @copyright Dennis Eichhorn |
9 | * @license OMS License 2.0 |
10 | * @version 1.0.0 |
11 | * @link https://jingga.app |
12 | */ |
13 | declare(strict_types=1); |
14 | |
15 | namespace phpOMS\Math\Stochastic; |
16 | |
17 | use phpOMS\Math\Statistic\Average; |
18 | use phpOMS\Math\Statistic\MeasureOfDispersion; |
19 | |
/**
 * Naive Bayes classifier.
 *
 * Data sets are trained per criteria (class). Every attribute of a data set is
 * either a list of words (array value, evaluated by relative word frequency) or
 * a single numeric value (scalar value, evaluated with a Gaussian density built
 * from the mean and sample variance of the trained values).
 *
 * @package phpOMS\Math\Stochastic
 * @license OMS License 2.0
 * @link    https://jingga.app
 * @since   1.0.0
 */
final class NaiveBayesClassifier
{
    /**
     * Attribute holds word lists (array values).
     *
     * @var int
     * @since 1.0.0
     */
    private const TYPE_WORDS = 1;

    /**
     * Attribute holds single numeric values (scalar values).
     *
     * @var int
     * @since 1.0.0
     */
    private const TYPE_NUMERIC = 2;

    /**
     * Dictionary of the trained data per criteria and attribute.
     *
     * For word attributes `data` maps word => occurrences, for numeric
     * attributes `data` is the list of trained values.
     *
     * @var array<string, array<int|string, array{type:int, data:array}>>
     * @since 1.0.0
     */
    private array $dict = [];

    /**
     * Cached probabilities.
     *
     * `count` is the total number of trained data sets, `criteria` holds the
     * per criteria counters and numeric statistics, `attr` holds the per
     * attribute counters and the evidence calculated for the last match.
     *
     * @var array
     * @since 1.0.0
     */
    private array $probabilities = [
        'count'    => 0,
        'criteria' => [],
        'attr'     => [],
    ];

    /**
     * Train matches.
     *
     * The kind of an attribute (word list or numeric) is defined by the first
     * value trained for it within a criteria. Later values of the other kind
     * are ignored because they cannot be evaluated by the same model.
     *
     * @param string $criteria Criteria to match against
     * @param array  $matched  Data sets (attribute => word list or numeric value) belonging to the criteria
     *
     * @return void
     *
     * @since 1.0.0
     */
    public function train(string $criteria, array $matched) : void
    {
        if (!isset($this->probabilities['criteria'][$criteria])) {
            $this->probabilities['criteria'][$criteria] = [
                'count' => 0,
                'attr'  => [],
            ];
        }

        foreach ($matched as $dataset) {
            foreach ($dataset as $attr => $value) {
                $isWordList = \is_array($value);

                if (!isset($this->dict[$criteria][$attr])) {
                    $this->dict[$criteria][$attr] = [
                        'type' => $isWordList ? self::TYPE_WORDS : self::TYPE_NUMERIC,
                        'data' => [],
                    ];
                }

                if (!isset($this->probabilities['attr'][$attr])) {
                    $this->probabilities['attr'][$attr] = [
                        'count'   => 0,
                        'data'    => [],
                        'density' => 0.0,
                    ];
                }

                if (!isset($this->probabilities['criteria'][$criteria]['attr'][$attr])) {
                    $this->probabilities['criteria'][$criteria]['attr'][$attr] = [
                        'count'    => 0,
                        'mean'     => 0,
                        'variance' => 0,
                    ];
                }

                // Mixing word counters and numeric samples in the same data array would corrupt both models
                if ($isWordList !== ($this->dict[$criteria][$attr]['type'] === self::TYPE_WORDS)) {
                    continue;
                }

                if ($isWordList) {
                    foreach ($value as $word) {
                        $this->dict[$criteria][$attr]['data'][$word] = ($this->dict[$criteria][$attr]['data'][$word] ?? 0) + 1;

                        ++$this->probabilities['attr'][$attr]['count'];
                    }
                } else {
                    $this->dict[$criteria][$attr]['data'][] = $value;

                    ++$this->probabilities['attr'][$attr]['count'];
                    ++$this->probabilities['criteria'][$criteria]['attr'][$attr]['count'];
                }
            }

            ++$this->probabilities['criteria'][$criteria]['count'];
            ++$this->probabilities['count'];
        }
    }

    /**
     * Check against matches.
     *
     * Attributes which cannot be evaluated are ignored: attributes unknown to
     * the criteria, values whose kind differs from the trained kind, words
     * trained less often than $minimum and numeric attributes without a
     * positive variance. If nothing can be evaluated the result is 0.5.
     *
     * @param string $criteria Criteria to match against
     * @param array  $toMatch  Values to match (attribute => word list or numeric value)
     * @param int    $minimum  Minimum amount of occurrences of a word for consideration
     *
     * @return float Probability of $toMatch belonging to $criteria
     *
     * @since 1.0.0
     */
    public function matchCriteria(string $criteria, array $toMatch, int $minimum = 3) : float
    {
        if (!isset($this->dict[$criteria])) {
            // Nothing trained for this criteria, no attribute can contribute
            return 0.5;
        }

        $this->preCalculateProbabilities($toMatch);

        // A criteria in the dictionary has at least one trained data set, hence the total count is > 0
        $prior = $this->probabilities['criteria'][$criteria]['count'] / $this->probabilities['count'];

        $n = 0.0;
        foreach ($toMatch as $attr => $value) {
            if (!isset($this->dict[$criteria][$attr])) {
                continue;
            }

            $trained = $this->dict[$criteria][$attr];

            if (\is_array($value)) {
                if ($trained['type'] !== self::TYPE_WORDS) {
                    continue;
                }

                $total = \array_sum($trained['data']);

                /**
                 * @var string $word
                 */
                foreach ($value as $word) {
                    if (!isset($trained['data'][$word])
                        || $trained['data'][$word] < $minimum
                    ) {
                        continue;
                    }

                    // The evidence includes the (positive) share of this criteria and is therefore never 0
                    $p = ($trained['data'][$word] / $total)
                        * $prior
                        / $this->probabilities['attr'][$attr]['data'][$word];

                    $n += \log(1 - $p) - \log($p);
                }
            } else {
                if ($trained['type'] !== self::TYPE_NUMERIC || !\is_numeric($value)) {
                    continue;
                }

                $stats    = $this->probabilities['criteria'][$criteria]['attr'][$attr];
                $evidence = $this->probabilities['attr'][$attr]['density'] ?? 0.0;

                // Without spread or evidence the density is undefined (division by zero)
                if ($stats['variance'] <= 0.0 || $evidence <= 0.0) {
                    continue;
                }

                $p = self::gaussianDensity((float) $value, (float) $stats['mean'], (float) $stats['variance'])
                    * $prior
                    / $evidence;

                $n += \log(1 - $p) - \log($p);
            }
        }

        return 1 / (1 + \exp($n));
    }

    /**
     * Pre-calculate some probabilities used for the matching process
     *
     * Updates mean and variance of all numeric attributes and calculates the
     * evidence (probability over all criteria) of every trained word and of
     * the numeric values in $toMatch. The counters created during the training
     * are preserved so that further training after a match remains possible.
     *
     * @param array $toMatch Data to match. Some probabilities depend on the passed values.
     *
     * @return void
     *
     * @since 1.0.0
     */
    private function preCalculateProbabilities(array $toMatch) : void
    {
        $wordEvidence    = [];
        $densityEvidence = [];

        foreach ($this->dict as $criteria => $subDict) {
            $prior = $this->probabilities['criteria'][$criteria]['count'] / $this->probabilities['count'];

            foreach ($subDict as $attr => $valueArray) {
                if ($valueArray['type'] === self::TYPE_NUMERIC) {
                    $this->probabilities['criteria'][$criteria]['attr'][$attr]['mean']     = 0;
                    $this->probabilities['criteria'][$criteria]['attr'][$attr]['variance'] = 0;

                    // The sample variance (n - 1 denominator) is undefined for less than two samples
                    if (\count($valueArray['data']) < 2) {
                        continue;
                    }

                    $mean     = Average::arithmeticMean($valueArray['data']);
                    $variance = MeasureOfDispersion::sampleVariance($valueArray['data'], $mean);

                    $this->probabilities['criteria'][$criteria]['attr'][$attr]['mean']     = $mean;
                    $this->probabilities['criteria'][$criteria]['attr'][$attr]['variance'] = $variance;

                    // Only attributes which are part of the request and have a spread provide a density
                    if ($variance <= 0.0
                        || !isset($toMatch[$attr])
                        || !\is_numeric($toMatch[$attr])
                    ) {
                        continue;
                    }

                    $densityEvidence[$attr] = ($densityEvidence[$attr] ?? 0.0)
                        + self::gaussianDensity((float) $toMatch[$attr], (float) $mean, (float) $variance) * $prior;
                } else {
                    $total = \array_sum($valueArray['data']);

                    foreach ($valueArray['data'] as $word => $occurrences) {
                        $wordEvidence[$attr][$word] = ($wordEvidence[$attr][$word] ?? 0.0)
                            + ($occurrences / $total) * $prior;
                    }
                }
            }
        }

        // Replace only the evidence of the previous match, the training counters stay untouched
        foreach ($this->probabilities['attr'] as $attr => $_) {
            $this->probabilities['attr'][$attr]['data']    = $wordEvidence[$attr] ?? [];
            $this->probabilities['attr'][$attr]['density'] = $densityEvidence[$attr] ?? 0.0;
        }
    }

    /**
     * Probability density of the normal distribution.
     *
     * @param float $value    Value to evaluate
     * @param float $mean     Mean of the distribution
     * @param float $variance Variance of the distribution (must be > 0)
     *
     * @return float
     *
     * @since 1.0.0
     */
    private static function gaussianDensity(float $value, float $mean, float $variance) : float
    {
        return 1 / \sqrt(2 * \M_PI * $variance)
            * \exp(-($value - $mean) ** 2 / (2 * $variance));
    }
}