Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
97.56% covered (success)
97.56%
80 / 82
33.33% covered (danger)
33.33%
1 / 3
CRAP
0.00% covered (danger)
0.00%
0 / 1
NaiveBayesClassifier
97.56% covered (success)
97.56%
80 / 82
33.33% covered (danger)
33.33%
1 / 3
27
0.00% covered (danger)
0.00%
0 / 1
 train
100.00% covered (success)
100.00%
34 / 34
100.00% covered (success)
100.00%
1 / 1
11
 matchCriteria
95.45% covered (success)
95.45%
21 / 22
0.00% covered (danger)
0.00%
0 / 1
7
 preCalculateProbabilities
96.15% covered (success)
96.15%
25 / 26
0.00% covered (danger)
0.00%
0 / 1
9
1<?php
2/**
3 * Jingga
4 *
5 * PHP Version 8.1
6 *
7 * @package   phpOMS\Math\Stochastic
8 * @copyright Dennis Eichhorn
9 * @license   OMS License 2.0
10 * @version   1.0.0
11 * @link      https://jingga.app
12 */
13declare(strict_types=1);
14
15namespace phpOMS\Math\Stochastic;
16
17use phpOMS\Math\Statistic\Average;
18use phpOMS\Math\Statistic\MeasureOfDispersion;
19
20/**
21 * Naive bayes matching.
22 *
23 * @package phpOMS\Math\Stochastic
24 * @license OMS License 2.0
25 * @link    https://jingga.app
26 * @since   1.0.0
27 */
final class NaiveBayesClassifier
{
    /**
     * Attribute type marker: the attribute holds a list of words (categorical data).
     *
     * @var int
     * @since 1.0.0
     */
    private const TYPE_WORDS = 1;

    /**
     * Attribute type marker: the attribute holds a single scalar value (continuous data).
     *
     * @var int
     * @since 1.0.0
     */
    private const TYPE_NUMERIC = 2;

    /**
     * Training data grouped by criteria and attribute.
     *
     * Word attributes store "word => occurrences", scalar attributes store the list of trained values.
     *
     * @var array
     * @since 1.0.0
     */
    private array $dict = [];

    /**
     * Cached probabilities.
     *
     * 'count' is the total number of trained datasets, 'criteria' holds the per-criteria
     * counters and statistics, 'attr' holds the per-attribute counters and the evidence
     * calculated for the last match.
     *
     * @var array
     * @since 1.0.0
     */
    private array $probabilities = [
        'count'    => 0,
        'criteria' => [],
        'attr'     => [],
    ];

    /**
     * Train matches.
     *
     * The kind of an attribute (word list or scalar) is fixed by the first value seen for
     * that attribute within the criteria.
     *
     * @param string $criteria Criteria to match against
     * @param array  $matched  List of datasets, each one mapping an attribute name to either
     *                         an array of words or a single scalar value
     *
     * @return void
     *
     * @since 1.0.0
     */
    public function train(string $criteria, array $matched) : void
    {
        if (!isset($this->probabilities['criteria'][$criteria])) {
            $this->probabilities['criteria'][$criteria] = [
                'count' => 0,
                'attr'  => [],
            ];
        }

        foreach ($matched as $dataset) {
            foreach ($dataset as $attr => $value) {
                if (!isset($this->dict[$criteria][$attr])) {
                    $this->dict[$criteria][$attr] = [
                        'type' => \is_array($value) ? self::TYPE_WORDS : self::TYPE_NUMERIC,
                        'data' => [],
                    ];
                }

                // The attribute entry may already exist without a counter, so the counter
                // is initialized on its own instead of together with the whole entry.
                $this->probabilities['attr'][$attr]['count'] ??= 0;

                if (!isset($this->probabilities['criteria'][$criteria]['attr'][$attr])) {
                    $this->probabilities['criteria'][$criteria]['attr'][$attr] = [
                        'count'    => 0,
                        'mean'     => 0,
                        'variance' => 0,
                    ];
                }

                if (\is_array($value)) {
                    foreach ($value as $word) {
                        $this->dict[$criteria][$attr]['data'][$word] ??= 0;

                        ++$this->dict[$criteria][$attr]['data'][$word];
                        ++$this->probabilities['attr'][$attr]['count'];
                    }
                } else {
                    $this->dict[$criteria][$attr]['data'][] = $value;

                    ++$this->probabilities['attr'][$attr]['count'];
                    ++$this->probabilities['criteria'][$criteria]['attr'][$attr]['count'];
                }
            }

            ++$this->probabilities['criteria'][$criteria]['count'];
            ++$this->probabilities['count'];
        }
    }

    /**
     * Check against matches.
     *
     * Attributes which carry no usable information are ignored: attributes unknown to the
     * criteria, values whose kind (word list / scalar) differs from the trained kind and
     * scalar attributes without a positive variance or evidence.
     *
     * @param string $criteria Criteria to match against
     * @param array  $toMatch  Values to match (attribute name => array of words or scalar value)
     * @param int    $minimum  Minimum amount of occurrences of a word for consideration
     *
     * @return float Returns 0.5 if no attribute could be considered
     *
     * @since 1.0.0
     */
    public function matchCriteria(string $criteria, array $toMatch, int $minimum = 3) : float
    {
        $this->preCalculateProbabilities($toMatch);

        if (!isset($this->dict[$criteria])) {
            // Nothing trained for this criteria, same result as an empty sum: 1 / (1 + exp(0))
            return 0.5;
        }

        $prior = $this->probabilities['criteria'][$criteria]['count'] / $this->probabilities['count'];

        $n = 0.0;
        foreach ($toMatch as $attr => $value) {
            if (!isset($this->dict[$criteria][$attr])) {
                continue;
            }

            $known    = $this->dict[$criteria][$attr];
            $evidence = $this->probabilities['attr'][$attr]['data'] ?? null;

            if (\is_array($value)) {
                if ($known['type'] !== self::TYPE_WORDS || !\is_array($evidence)) {
                    continue;
                }

                // Invariant for all words of this attribute
                $total = \array_sum($known['data']);

                /**
                 * @var string $word
                 */
                foreach ($value as $word) {
                    if (!isset($known['data'][$word], $evidence[$word])
                        || $known['data'][$word] < $minimum
                    ) {
                        continue;
                    }

                    $p = ($known['data'][$word] / $total) * $prior / $evidence[$word];

                    $n += \log(1 - $p) - \log($p);
                }

                continue;
            }

            $stats = $this->probabilities['criteria'][$criteria]['attr'][$attr];

            // Negated comparisons so that NAN is rejected as well
            if ($known['type'] !== self::TYPE_NUMERIC
                || !\is_numeric($value)
                || !\is_float($evidence)
                || !($evidence > 0.0)
                || !($stats['variance'] > 0.0)
            ) {
                continue;
            }

            $p = self::gaussianDensity((float) $value, (float) $stats['mean'], (float) $stats['variance'])
                * $prior
                / $evidence;

            $n += \log(1 - $p) - \log($p);
        }

        return 1 / (1 + \exp($n));
    }

    /**
     * Pre-calculate some probabilities used for the matching process
     *
     * Updates mean and variance of all scalar attributes and rebuilds the evidence of every
     * attribute (stored in the 'data' element) while keeping the trained attribute counters.
     *
     * @param array $toMatch Data to match. Some probabilities depend on the passed values.
     *
     * @return void
     *
     * @since 1.0.0
     */
    private function preCalculateProbabilities(array $toMatch) : void
    {
        $evidence = [];

        foreach ($this->dict as $criteria => $subDict) {
            $prior = $this->probabilities['criteria'][$criteria]['count'] / $this->probabilities['count'];

            foreach ($subDict as $attr => $valueArray) {
                if ($valueArray['type'] === self::TYPE_NUMERIC) {
                    $samples = $valueArray['data'];
                    $mean    = Average::arithmeticMean($samples);

                    // The variance helper is only called with at least two samples
                    $variance = \count($samples) > 1
                        ? MeasureOfDispersion::sampleVariance($samples, $mean)
                        : 0.0;

                    $this->probabilities['criteria'][$criteria]['attr'][$attr]['mean']     = $mean;
                    $this->probabilities['criteria'][$criteria]['attr'][$attr]['variance'] = $variance;

                    // No density can be calculated without a numeric value and a positive variance
                    if (!isset($toMatch[$attr]) || !\is_numeric($toMatch[$attr]) || !($variance > 0.0)) {
                        continue;
                    }

                    $evidence[$attr] ??= 0.0;
                    if (!\is_float($evidence[$attr])) {
                        // Attribute is a word attribute in a different criteria
                        continue;
                    }

                    $evidence[$attr] += self::gaussianDensity((float) $toMatch[$attr], (float) $mean, (float) $variance)
                        * $prior;

                    continue;
                }

                $evidence[$attr] ??= [];
                if (!\is_array($evidence[$attr])) {
                    // Attribute is a scalar attribute in a different criteria
                    continue;
                }

                // Invariant for all words of this attribute
                $total = \array_sum($valueArray['data']);

                foreach ($valueArray['data'] as $word => $occurrences) {
                    $evidence[$attr][$word] = ($evidence[$attr][$word] ?? 0.0) + ($occurrences / $total) * $prior;
                }
            }
        }

        // Only the evidence is replaced, the counters maintained by train() are kept
        $attributes = [];
        foreach ($this->probabilities['attr'] as $attr => $stats) {
            $attributes[$attr] = ['count' => $stats['count'] ?? 0];
        }

        foreach ($evidence as $attr => $value) {
            $attributes[$attr]['data'] = $value;
        }

        $this->probabilities['attr'] = $attributes;
    }

    /**
     * Value of the normal distribution density function.
     *
     * @param float $value    Value to evaluate
     * @param float $mean     Mean of the distribution
     * @param float $variance Variance of the distribution (must be > 0)
     *
     * @return float
     *
     * @since 1.0.0
     */
    private static function gaussianDensity(float $value, float $mean, float $variance) : float
    {
        return 1 / \sqrt(2 * \M_PI * $variance)
            * \exp(-($value - $mean) ** 2 / (2 * $variance));
    }
}