���ѧۧݧ�ӧ�� �ާ֧ߧ֧էا֧� - ���֧էѧܧ�ڧ��ӧѧ�� - /home3/cpr76684/public_html/Phpml.tar
���ѧ٧ѧ�
DimensionReduction/PCA.php 0000644 00000007526 15152065050 0011473 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\DimensionReduction; use Phpml\Exception\InvalidArgumentException; use Phpml\Exception\InvalidOperationException; use Phpml\Math\Statistic\Covariance; use Phpml\Math\Statistic\Mean; class PCA extends EigenTransformerBase { /** * Temporary storage for mean values for each dimension in given data * * @var array */ protected $means = []; /** * @var bool */ protected $fit = false; /** * PCA (Principal Component Analysis) used to explain given * data with lower number of dimensions. This analysis transforms the * data to a lower dimensional version of it by conserving a proportion of total variance * within the data. It is a lossy data compression technique.<br> * * @param float $totalVariance Total explained variance to be preserved * @param int $numFeatures Number of features to be preserved * * @throws InvalidArgumentException */ public function __construct(?float $totalVariance = null, ?int $numFeatures = null) { if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) { throw new InvalidArgumentException('Total variance can be a value between 0.1 and 0.99'); } if ($numFeatures !== null && $numFeatures <= 0) { throw new InvalidArgumentException('Number of features to be preserved should be greater than 0'); } if (($totalVariance !== null) === ($numFeatures !== null)) { throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm'); } if ($numFeatures !== null) { $this->numFeatures = $numFeatures; } if ($totalVariance !== null) { $this->totalVariance = $totalVariance; } } /** * Takes a data and returns a lower dimensional version * of this data while preserving $totalVariance or $numFeatures. <br> * $data is an n-by-m matrix and returned array is * n-by-k matrix where k <= m */ public function fit(array $data): array { $n = count($data[0]); $data = $this->normalize($data, $n); $covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0)); $this->eigenDecomposition($covMatrix); $this->fit = true; return $this->reduce($data); } /** * Transforms the given sample to a lower dimensional vector by using * the eigenVectors obtained in the last run of <code>fit</code>. * * @throws InvalidOperationException */ public function transform(array $sample): array { if (!$this->fit) { throw new InvalidOperationException('PCA has not been fitted with respect to original dataset, please run PCA::fit() first'); } if (!is_array($sample[0])) { $sample = [$sample]; } $sample = $this->normalize($sample, count($sample[0])); return $this->reduce($sample); } protected function calculateMeans(array $data, int $n): void { // Calculate means for each dimension $this->means = []; for ($i = 0; $i < $n; ++$i) { $column = array_column($data, $i); $this->means[] = Mean::arithmetic($column); } } /** * Normalization of the data includes subtracting mean from * each dimension therefore dimensions will be centered to zero */ protected function normalize(array $data, int $n): array { if (count($this->means) === 0) { $this->calculateMeans($data, $n); } // Normalize data foreach (array_keys($data) as $i) { for ($k = 0; $k < $n; ++$k) { $data[$i][$k] -= $this->means[$k]; } } return $data; } } DimensionReduction/KernelPCA.php 0000644 00000016010 15152065050 0012620 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\DimensionReduction; use Closure; use Phpml\Exception\InvalidArgumentException; use Phpml\Exception\InvalidOperationException; use Phpml\Math\Distance\Euclidean; use Phpml\Math\Distance\Manhattan; use Phpml\Math\Matrix; class KernelPCA extends PCA { public const KERNEL_RBF = 1; public const KERNEL_SIGMOID = 2; public const KERNEL_LAPLACIAN = 3; public const KERNEL_LINEAR = 4; /** * Selected kernel function * * @var int */ protected $kernel; /** * Gamma value used by the kernel * * @var float|null */ protected $gamma; /** * Original dataset used to fit KernelPCA * * @var array */ protected $data = []; /** * Kernel principal component analysis (KernelPCA) is an extension of PCA using * techniques of kernel methods. It is more suitable for data that involves * vectors that are not linearly separable<br><br> * Example: <b>$kpca = new KernelPCA(KernelPCA::KERNEL_RBF, null, 2, 15.0);</b> * will initialize the algorithm with an RBF kernel having the gamma parameter as 15,0. <br> * This transformation will return the same number of rows with only <i>2</i> columns. * * @param float $totalVariance Total variance to be preserved if numFeatures is not given * @param int $numFeatures Number of columns to be returned * @param float $gamma Gamma parameter is used with RBF and Sigmoid kernels * * @throws InvalidArgumentException */ public function __construct(int $kernel = self::KERNEL_RBF, ?float $totalVariance = null, ?int $numFeatures = null, ?float $gamma = null) { if (!in_array($kernel, [self::KERNEL_RBF, self::KERNEL_SIGMOID, self::KERNEL_LAPLACIAN, self::KERNEL_LINEAR], true)) { throw new InvalidArgumentException('KernelPCA can be initialized with the following kernels only: Linear, RBF, Sigmoid and Laplacian'); } parent::__construct($totalVariance, $numFeatures); $this->kernel = $kernel; $this->gamma = $gamma; } /** * Takes a data and returns a lower dimensional version * of this data while preserving $totalVariance or $numFeatures. <br> * $data is an n-by-m matrix and returned array is * n-by-k matrix where k <= m */ public function fit(array $data): array { $numRows = count($data); $this->data = $data; if ($this->gamma === null) { $this->gamma = 1.0 / $numRows; } $matrix = $this->calculateKernelMatrix($this->data, $numRows); $matrix = $this->centerMatrix($matrix, $numRows); $this->eigenDecomposition($matrix); $this->fit = true; return Matrix::transposeArray($this->eigVectors); } /** * Transforms the given sample to a lower dimensional vector by using * the variables obtained during the last run of <code>fit</code>. * * @throws InvalidArgumentException * @throws InvalidOperationException */ public function transform(array $sample): array { if (!$this->fit) { throw new InvalidOperationException('KernelPCA has not been fitted with respect to original dataset, please run KernelPCA::fit() first'); } if (is_array($sample[0])) { throw new InvalidArgumentException('KernelPCA::transform() accepts only one-dimensional arrays'); } $pairs = $this->getDistancePairs($sample); return $this->projectSample($pairs); } /** * Calculates similarity matrix by use of selected kernel function<br> * An n-by-m matrix is given and an n-by-n matrix is returned */ protected function calculateKernelMatrix(array $data, int $numRows): array { $kernelFunc = $this->getKernel(); $matrix = []; for ($i = 0; $i < $numRows; ++$i) { for ($k = 0; $k < $numRows; ++$k) { if ($i <= $k) { $matrix[$i][$k] = $kernelFunc($data[$i], $data[$k]); } else { $matrix[$i][$k] = $matrix[$k][$i]; } } } return $matrix; } /** * Kernel matrix is centered in its original space by using the following * conversion: * * K′ = K − N.K − K.N + N.K.N where N is n-by-n matrix filled with 1/n */ protected function centerMatrix(array $matrix, int $n): array { $N = array_fill(0, $n, array_fill(0, $n, 1.0 / $n)); $N = new Matrix($N, false); $K = new Matrix($matrix, false); // K.N (This term is repeated so we cache it once) $K_N = $K->multiply($N); // N.K $N_K = $N->multiply($K); // N.K.N $N_K_N = $N->multiply($K_N); return $K->subtract($N_K) ->subtract($K_N) ->add($N_K_N) ->toArray(); } /** * Returns the callable kernel function * * @throws \Exception */ protected function getKernel(): Closure { switch ($this->kernel) { case self::KERNEL_LINEAR: // k(x,y) = xT.y return function ($x, $y) { return Matrix::dot($x, $y)[0]; }; case self::KERNEL_RBF: // k(x,y)=exp(-γ.|x-y|) where |..| is Euclidean distance $dist = new Euclidean(); return function ($x, $y) use ($dist): float { return exp(-$this->gamma * $dist->sqDistance($x, $y)); }; case self::KERNEL_SIGMOID: // k(x,y)=tanh(γ.xT.y+c0) where c0=1 return function ($x, $y): float { $res = Matrix::dot($x, $y)[0] + 1.0; return tanh((float) $this->gamma * $res); }; case self::KERNEL_LAPLACIAN: // k(x,y)=exp(-γ.|x-y|) where |..| is Manhattan distance $dist = new Manhattan(); return function ($x, $y) use ($dist): float { return exp(-$this->gamma * $dist->distance($x, $y)); }; default: // Not reached throw new InvalidArgumentException(sprintf('KernelPCA initialized with invalid kernel: %d', $this->kernel)); } } protected function getDistancePairs(array $sample): array { $kernel = $this->getKernel(); $pairs = []; foreach ($this->data as $row) { $pairs[] = $kernel($row, $sample); } return $pairs; } protected function projectSample(array $pairs): array { // Normalize eigenvectors by eig = eigVectors / eigValues $func = function ($eigVal, $eigVect) { $m = new Matrix($eigVect, false); $a = $m->divideByScalar($eigVal)->toArray(); return $a[0]; }; $eig = array_map($func, $this->eigValues, $this->eigVectors); // return k.dot(eig) return Matrix::dot($pairs, $eig); } } DimensionReduction/LDA.php 0000644 00000014712 15152065050 0011463 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\DimensionReduction; use Phpml\Exception\InvalidArgumentException; use Phpml\Exception\InvalidOperationException; use Phpml\Math\Matrix; class LDA extends EigenTransformerBase { /** * @var bool */ public $fit = false; /** * @var array */ public $labels = []; /** * @var array */ public $means = []; /** * @var array */ public $counts = []; /** * @var float[] */ public $overallMean = []; /** * Linear Discriminant Analysis (LDA) is used to reduce the dimensionality * of the data. Unlike Principal Component Analysis (PCA), it is a supervised * technique that requires the class labels in order to fit the data to a * lower dimensional space. <br><br> * The algorithm can be initialized by speciyfing * either with the totalVariance(a value between 0.1 and 0.99) * or numFeatures (number of features in the dataset) to be preserved. * * @param float|null $totalVariance Total explained variance to be preserved * @param int|null $numFeatures Number of features to be preserved * * @throws InvalidArgumentException */ public function __construct(?float $totalVariance = null, ?int $numFeatures = null) { if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) { throw new InvalidArgumentException('Total variance can be a value between 0.1 and 0.99'); } if ($numFeatures !== null && $numFeatures <= 0) { throw new InvalidArgumentException('Number of features to be preserved should be greater than 0'); } if (($totalVariance !== null) === ($numFeatures !== null)) { throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm'); } if ($numFeatures !== null) { $this->numFeatures = $numFeatures; } if ($totalVariance !== null) { $this->totalVariance = $totalVariance; } } /** * Trains the algorithm to transform the given data to a lower dimensional space. */ public function fit(array $data, array $classes): array { $this->labels = $this->getLabels($classes); $this->means = $this->calculateMeans($data, $classes); $sW = $this->calculateClassVar($data, $classes); $sB = $this->calculateClassCov(); $S = $sW->inverse()->multiply($sB); $this->eigenDecomposition($S->toArray()); $this->fit = true; return $this->reduce($data); } /** * Transforms the given sample to a lower dimensional vector by using * the eigenVectors obtained in the last run of <code>fit</code>. * * @throws InvalidOperationException */ public function transform(array $sample): array { if (!$this->fit) { throw new InvalidOperationException('LDA has not been fitted with respect to original dataset, please run LDA::fit() first'); } if (!is_array($sample[0])) { $sample = [$sample]; } return $this->reduce($sample); } /** * Returns unique labels in the dataset */ protected function getLabels(array $classes): array { $counts = array_count_values($classes); return array_keys($counts); } /** * Calculates mean of each column for each class and returns * n by m matrix where n is number of labels and m is number of columns */ protected function calculateMeans(array $data, array $classes): array { $means = []; $counts = []; $overallMean = array_fill(0, count($data[0]), 0.0); foreach ($data as $index => $row) { $label = array_search($classes[$index], $this->labels, true); foreach ($row as $col => $val) { if (!isset($means[$label][$col])) { $means[$label][$col] = 0.0; } $means[$label][$col] += $val; $overallMean[$col] += $val; } if (!isset($counts[$label])) { $counts[$label] = 0; } ++$counts[$label]; } foreach ($means as $index => $row) { foreach ($row as $col => $sum) { $means[$index][$col] = $sum / $counts[$index]; } } // Calculate overall mean of the dataset for each column $numElements = array_sum($counts); $map = function ($el) use ($numElements) { return $el / $numElements; }; $this->overallMean = array_map($map, $overallMean); $this->counts = $counts; return $means; } /** * Returns in-class scatter matrix for each class, which * is a n by m matrix where n is number of classes and * m is number of columns */ protected function calculateClassVar(array $data, array $classes): Matrix { // s is an n (number of classes) by m (number of column) matrix $s = array_fill(0, count($data[0]), array_fill(0, count($data[0]), 0)); $sW = new Matrix($s, false); foreach ($data as $index => $row) { $label = array_search($classes[$index], $this->labels, true); $means = $this->means[$label]; $row = $this->calculateVar($row, $means); $sW = $sW->add($row); } return $sW; } /** * Returns between-class scatter matrix for each class, which * is an n by m matrix where n is number of classes and * m is number of columns */ protected function calculateClassCov(): Matrix { // s is an n (number of classes) by m (number of column) matrix $s = array_fill(0, count($this->overallMean), array_fill(0, count($this->overallMean), 0)); $sB = new Matrix($s, false); foreach ($this->means as $index => $classMeans) { $row = $this->calculateVar($classMeans, $this->overallMean); $N = $this->counts[$index]; $sB = $sB->add($row->multiplyByScalar($N)); } return $sB; } /** * Returns the result of the calculation (x - m)T.(x - m) */ protected function calculateVar(array $row, array $means): Matrix { $x = new Matrix($row, false); $m = new Matrix($means, false); $diff = $x->subtract($m); return $diff->transpose()->multiply($diff); } } DimensionReduction/EigenTransformerBase.php 0000644 00000004420 15152065050 0015123 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\DimensionReduction; use Phpml\Math\LinearAlgebra\EigenvalueDecomposition; use Phpml\Math\Matrix; /** * Class to compute eigen pairs (values & vectors) of a given matrix * with the consideration of numFeatures or totalVariance to be preserved * * @author hp */ abstract class EigenTransformerBase { /** * Total variance to be conserved after the reduction * * @var float */ public $totalVariance = 0.9; /** * Number of features to be preserved after the reduction * * @var int */ public $numFeatures = null; /** * Top eigenvectors of the matrix * * @var array */ protected $eigVectors = []; /** * Top eigenValues of the matrix * * @var array */ protected $eigValues = []; /** * Calculates eigenValues and eigenVectors of the given matrix. Returns * top eigenVectors along with the largest eigenValues. The total explained variance * of these eigenVectors will be no less than desired $totalVariance value */ protected function eigenDecomposition(array $matrix): void { $eig = new EigenvalueDecomposition($matrix); $eigVals = $eig->getRealEigenvalues(); $eigVects = $eig->getEigenvectors(); $totalEigVal = array_sum($eigVals); // Sort eigenvalues in descending order arsort($eigVals); $explainedVar = 0.0; $vectors = []; $values = []; foreach ($eigVals as $i => $eigVal) { $explainedVar += $eigVal / $totalEigVal; $vectors[] = $eigVects[$i]; $values[] = $eigVal; if ($this->numFeatures !== null) { if (count($vectors) == $this->numFeatures) { break; } } else { if ($explainedVar >= $this->totalVariance) { break; } } } $this->eigValues = $values; $this->eigVectors = $vectors; } /** * Returns the reduced data */ protected function reduce(array $data): array { $m1 = new Matrix($data); $m2 = new Matrix($this->eigVectors); return $m1->multiply($m2->transpose())->toArray(); } } FeatureSelection/VarianceThreshold.php 0000644 00000002504 15152065050 0014123 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\FeatureSelection; use Phpml\Exception\InvalidArgumentException; use Phpml\Math\Matrix; use Phpml\Math\Statistic\Variance; use Phpml\Transformer; final class VarianceThreshold implements Transformer { /** * @var float */ private $threshold; /** * @var array */ private $variances = []; /** * @var array */ private $keepColumns = []; public function __construct(float $threshold = 0.0) { if ($threshold < 0) { throw new InvalidArgumentException('Threshold can\'t be lower than zero'); } $this->threshold = $threshold; } public function fit(array $samples, ?array $targets = null): void { $this->variances = array_map(static function (array $column): float { return Variance::population($column); }, Matrix::transposeArray($samples)); foreach ($this->variances as $column => $variance) { if ($variance > $this->threshold) { $this->keepColumns[$column] = true; } } } public function transform(array &$samples, ?array &$targets = null): void { foreach ($samples as &$sample) { $sample = array_values(array_intersect_key($sample, $this->keepColumns)); } } } FeatureSelection/ScoringFunction/UnivariateLinearRegression.php 0000644 00000004717 15152065050 0021143 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\FeatureSelection\ScoringFunction; use Phpml\FeatureSelection\ScoringFunction; use Phpml\Math\Matrix; use Phpml\Math\Statistic\Mean; /** * Quick linear model for testing the effect of a single regressor, * sequentially for many regressors. * * This is done in 2 steps: * * 1. The cross correlation between each regressor and the target is computed, * that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *std(y)). * 2. It is converted to an F score. * * Ported from scikit-learn f_regression function (http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html#sklearn.feature_selection.f_regression) */ final class UnivariateLinearRegression implements ScoringFunction { /** * @var bool */ private $center; /** * @param bool $center - if true samples and targets will be centered */ public function __construct(bool $center = true) { $this->center = $center; } public function score(array $samples, array $targets): array { if ($this->center) { $this->centerTargets($targets); $this->centerSamples($samples); } $correlations = []; foreach (array_keys($samples[0]) as $index) { $featureColumn = array_column($samples, $index); $correlations[$index] = Matrix::dot($targets, $featureColumn)[0] / (new Matrix($featureColumn, false))->transpose()->frobeniusNorm() / (new Matrix($targets, false))->frobeniusNorm(); } $degreesOfFreedom = count($targets) - ($this->center ? 2 : 1); return array_map(function (float $correlation) use ($degreesOfFreedom): float { return $correlation ** 2 / (1 - $correlation ** 2) * $degreesOfFreedom; }, $correlations); } private function centerTargets(array &$targets): void { $mean = Mean::arithmetic($targets); array_walk($targets, function (&$target) use ($mean): void { $target -= $mean; }); } private function centerSamples(array &$samples): void { $means = []; foreach ($samples[0] as $index => $feature) { $means[$index] = Mean::arithmetic(array_column($samples, $index)); } foreach ($samples as &$sample) { foreach ($sample as $index => &$feature) { $feature -= $means[$index]; } } } } FeatureSelection/ScoringFunction/ANOVAFValue.php 0000644 00000000744 15152065050 0015603 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\FeatureSelection\ScoringFunction; use Phpml\FeatureSelection\ScoringFunction; use Phpml\Math\Statistic\ANOVA; final class ANOVAFValue implements ScoringFunction { public function score(array $samples, array $targets): array { $grouped = []; foreach ($samples as $index => $sample) { $grouped[$targets[$index]][] = $sample; } return ANOVA::oneWayF(array_values($grouped)); } } FeatureSelection/ScoringFunction.php 0000644 00000000244 15152065050 0013627 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\FeatureSelection; interface ScoringFunction { public function score(array $samples, array $targets): array; } FeatureSelection/SelectKBest.php 0000644 00000003465 15152065050 0012675 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\FeatureSelection; use Phpml\Exception\InvalidArgumentException; use Phpml\Exception\InvalidOperationException; use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue; use Phpml\Transformer; final class SelectKBest implements Transformer { /** * @var ScoringFunction */ private $scoringFunction; /** * @var int */ private $k; /** * @var array|null */ private $scores = null; /** * @var array|null */ private $keepColumns = null; public function __construct(int $k = 10, ?ScoringFunction $scoringFunction = null) { if ($scoringFunction === null) { $scoringFunction = new ANOVAFValue(); } $this->scoringFunction = $scoringFunction; $this->k = $k; } public function fit(array $samples, ?array $targets = null): void { if ($targets === null || count($targets) === 0) { throw new InvalidArgumentException('The array has zero elements'); } $this->scores = $sorted = $this->scoringFunction->score($samples, $targets); if ($this->k >= count($sorted)) { return; } arsort($sorted); $this->keepColumns = array_slice($sorted, 0, $this->k, true); } public function transform(array &$samples, ?array &$targets = null): void { if ($this->keepColumns === null) { return; } foreach ($samples as &$sample) { $sample = array_values(array_intersect_key($sample, $this->keepColumns)); } } public function scores(): array { if ($this->scores === null) { throw new InvalidOperationException('SelectKBest require to fit first to get scores'); } return $this->scores; } } SupportVectorMachine/SupportVectorMachine.php 0000644 00000021047 15152065050 0015530 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\SupportVectorMachine; use Phpml\Exception\InvalidArgumentException; use Phpml\Exception\InvalidOperationException; use Phpml\Exception\LibsvmCommandException; use Phpml\Helper\Trainable; class SupportVectorMachine { use Trainable; /** * @var int */ private $type; /** * @var int */ private $kernel; /** * @var float */ private $cost; /** * @var float */ private $nu; /** * @var int */ private $degree; /** * @var float|null */ private $gamma; /** * @var float */ private $coef0; /** * @var float */ private $epsilon; /** * @var float */ private $tolerance; /** * @var int */ private $cacheSize; /** * @var bool */ private $shrinking; /** * @var bool */ private $probabilityEstimates; /** * @var string */ private $binPath; /** * @var string */ private $varPath; /** * @var string */ private $model; /** * @var array */ private $targets = []; public function __construct( int $type, int $kernel, float $cost = 1.0, float $nu = 0.5, int $degree = 3, ?float $gamma = null, float $coef0 = 0.0, float $epsilon = 0.1, float $tolerance = 0.001, int $cacheSize = 100, bool $shrinking = true, bool $probabilityEstimates = false ) { $this->type = $type; $this->kernel = $kernel; $this->cost = $cost; $this->nu = $nu; $this->degree = $degree; $this->gamma = $gamma; $this->coef0 = $coef0; $this->epsilon = $epsilon; $this->tolerance = $tolerance; $this->cacheSize = $cacheSize; $this->shrinking = $shrinking; $this->probabilityEstimates = $probabilityEstimates; $rootPath = realpath(implode(DIRECTORY_SEPARATOR, [__DIR__, '..', '..'])).DIRECTORY_SEPARATOR; $this->binPath = $rootPath.'bin'.DIRECTORY_SEPARATOR.'libsvm'.DIRECTORY_SEPARATOR; $this->varPath = $rootPath.'var'.DIRECTORY_SEPARATOR; } public function setBinPath(string $binPath): void { $this->ensureDirectorySeparator($binPath); $this->verifyBinPath($binPath); $this->binPath = $binPath; } public function setVarPath(string $varPath): void { if (!is_writable($varPath)) { throw new InvalidArgumentException(sprintf('The specified path "%s" is not writable', $varPath)); } $this->ensureDirectorySeparator($varPath); $this->varPath = $varPath; } public function train(array $samples, array $targets): void { $this->samples = array_merge($this->samples, $samples); $this->targets = array_merge($this->targets, $targets); $trainingSet = DataTransformer::trainingSet($this->samples, $this->targets, in_array($this->type, [Type::EPSILON_SVR, Type::NU_SVR], true)); file_put_contents($trainingSetFileName = $this->varPath.uniqid('phpml', true), $trainingSet); $modelFileName = $trainingSetFileName.'-model'; $command = $this->buildTrainCommand($trainingSetFileName, $modelFileName); $output = []; exec(escapeshellcmd($command).' 2>&1', $output, $return); unlink($trainingSetFileName); if ($return !== 0) { throw new LibsvmCommandException( sprintf('Failed running libsvm command: "%s" with reason: "%s"', $command, array_pop($output)) ); } $this->model = (string) file_get_contents($modelFileName); unlink($modelFileName); } public function getModel(): string { return $this->model; } /** * @return array|string * * @throws LibsvmCommandException */ public function predict(array $samples) { $predictions = $this->runSvmPredict($samples, false); if (in_array($this->type, [Type::C_SVC, Type::NU_SVC], true)) { $predictions = DataTransformer::predictions($predictions, $this->targets); } else { $predictions = explode(PHP_EOL, trim($predictions)); } if (!is_array($samples[0])) { return $predictions[0]; } return $predictions; } /** * @return array|string * * @throws LibsvmCommandException */ public function predictProbability(array $samples) { if (!$this->probabilityEstimates) { throw new InvalidOperationException('Model does not support probabiliy estimates'); } $predictions = $this->runSvmPredict($samples, true); if (in_array($this->type, [Type::C_SVC, Type::NU_SVC], true)) { $predictions = DataTransformer::probabilities($predictions, $this->targets); } else { $predictions = explode(PHP_EOL, trim($predictions)); } if (!is_array($samples[0])) { return $predictions[0]; } return $predictions; } private function runSvmPredict(array $samples, bool $probabilityEstimates): string { $testSet = DataTransformer::testSet($samples); file_put_contents($testSetFileName = $this->varPath.uniqid('phpml', true), $testSet); file_put_contents($modelFileName = $testSetFileName.'-model', $this->model); $outputFileName = $testSetFileName.'-output'; $command = $this->buildPredictCommand( $testSetFileName, $modelFileName, $outputFileName, $probabilityEstimates ); $output = []; exec(escapeshellcmd($command).' 2>&1', $output, $return); unlink($testSetFileName); unlink($modelFileName); $predictions = (string) file_get_contents($outputFileName); unlink($outputFileName); if ($return !== 0) { throw new LibsvmCommandException( sprintf('Failed running libsvm command: "%s" with reason: "%s"', $command, array_pop($output)) ); } return $predictions; } private function getOSExtension(): string { $os = strtoupper(substr(PHP_OS, 0, 3)); if ($os === 'WIN') { return '.exe'; } elseif ($os === 'DAR') { return '-osx'; } return ''; } private function buildTrainCommand(string $trainingSetFileName, string $modelFileName): string { return sprintf( '%ssvm-train%s -s %s -t %s -c %s -n %F -d %s%s -r %s -p %F -m %F -e %F -h %d -b %d %s %s', $this->binPath, $this->getOSExtension(), $this->type, $this->kernel, $this->cost, $this->nu, $this->degree, $this->gamma !== null ? ' -g '.$this->gamma : '', $this->coef0, $this->epsilon, $this->cacheSize, $this->tolerance, $this->shrinking, $this->probabilityEstimates, escapeshellarg($trainingSetFileName), escapeshellarg($modelFileName) ); } private function buildPredictCommand( string $testSetFileName, string $modelFileName, string $outputFileName, bool $probabilityEstimates ): string { return sprintf( '%ssvm-predict%s -b %d %s %s %s', $this->binPath, $this->getOSExtension(), $probabilityEstimates ? 1 : 0, escapeshellarg($testSetFileName), escapeshellarg($modelFileName), escapeshellarg($outputFileName) ); } private function ensureDirectorySeparator(string &$path): void { if (substr($path, -1) !== DIRECTORY_SEPARATOR) { $path .= DIRECTORY_SEPARATOR; } } private function verifyBinPath(string $path): void { if (!is_dir($path)) { throw new InvalidArgumentException(sprintf('The specified path "%s" does not exist', $path)); } $osExtension = $this->getOSExtension(); foreach (['svm-predict', 'svm-scale', 'svm-train'] as $filename) { $filePath = $path.$filename.$osExtension; if (!file_exists($filePath)) { throw new InvalidArgumentException(sprintf('File "%s" not found', $filePath)); } if (!is_executable($filePath)) { throw new InvalidArgumentException(sprintf('File "%s" is not executable', $filePath)); } } } } SupportVectorMachine/Kernel.php 0000644 00000000611 15152065050 0012616 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\SupportVectorMachine; abstract class Kernel { /** * u'*v. */ public const LINEAR = 0; /** * (gamma*u'*v + coef0)^degree. */ public const POLYNOMIAL = 1; /** * exp(-gamma*|u-v|^2). */ public const RBF = 2; /** * tanh(gamma*u'*v + coef0). */ public const SIGMOID = 3; } SupportVectorMachine/Type.php 0000644 00000000704 15152065050 0012322 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\SupportVectorMachine; abstract class Type { /** * classification. */ public const C_SVC = 0; /** * classification. */ public const NU_SVC = 1; /** * distribution estimation. */ public const ONE_CLASS_SVM = 2; /** * regression. */ public const EPSILON_SVR = 3; /** * regression. */ public const NU_SVR = 4; } SupportVectorMachine/DataTransformer.php 0000644 00000005652 15152065050 0014504 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\SupportVectorMachine; use Phpml\Exception\InvalidArgumentException; class DataTransformer { public static function trainingSet(array $samples, array $labels, bool $targets = false): string { $set = ''; $numericLabels = []; if (!$targets) { $numericLabels = self::numericLabels($labels); } foreach ($labels as $index => $label) { $set .= sprintf('%s %s %s', ($targets ? $label : $numericLabels[$label]), self::sampleRow($samples[$index]), PHP_EOL); } return $set; } public static function testSet(array $samples): string { if (count($samples) === 0) { throw new InvalidArgumentException('The array has zero elements'); } if (!is_array($samples[0])) { $samples = [$samples]; } $set = ''; foreach ($samples as $sample) { $set .= sprintf('0 %s %s', self::sampleRow($sample), PHP_EOL); } return $set; } public static function predictions(string $rawPredictions, array $labels): array { $numericLabels = self::numericLabels($labels); $results = []; foreach (explode(PHP_EOL, $rawPredictions) as $result) { if (isset($result[0])) { $results[] = array_search((int) $result, $numericLabels, true); } } return $results; } public static function probabilities(string $rawPredictions, array $labels): array { $numericLabels = self::numericLabels($labels); $predictions = explode(PHP_EOL, trim($rawPredictions)); $header = array_shift($predictions); $headerColumns = explode(' ', (string) $header); array_shift($headerColumns); $columnLabels = []; foreach ($headerColumns as $numericLabel) { $columnLabels[] = array_search((int) $numericLabel, $numericLabels, true); } $results = []; foreach ($predictions as $rawResult) { $probabilities = explode(' ', $rawResult); array_shift($probabilities); $result = []; foreach ($probabilities as $i => $prob) { $result[$columnLabels[$i]] = (float) $prob; } $results[] = $result; } return $results; } public static function numericLabels(array $labels): array { $numericLabels = []; foreach ($labels as $label) { if (isset($numericLabels[$label])) { continue; } $numericLabels[$label] = count($numericLabels); } return $numericLabels; } private static function sampleRow(array $sample): string { $row = []; foreach ($sample as $index => $feature) { $row[] = sprintf('%s:%F', $index + 1, $feature); } return implode(' ', $row); } } Math/Matrix.php 0000644 00000017513 15152065050 0007420 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math; use Phpml\Exception\InvalidArgumentException; use Phpml\Exception\MatrixException; use Phpml\Math\LinearAlgebra\LUDecomposition; class Matrix { /** * @var array */ private $matrix = []; /** * @var int */ private $rows; /** * @var int */ private $columns; /** * @var float */ private $determinant; /** * @throws InvalidArgumentException */ public function __construct(array $matrix, bool $validate = true) { // When a row vector is given if (!is_array($matrix[0])) { $this->rows = 1; $this->columns = count($matrix); $matrix = [$matrix]; } else { $this->rows = count($matrix); $this->columns = count($matrix[0]); } if ($validate) { for ($i = 0; $i < $this->rows; ++$i) { if (count($matrix[$i]) !== $this->columns) { throw new InvalidArgumentException('Matrix dimensions did not match'); } } } $this->matrix = $matrix; } public static function fromFlatArray(array $array): self { $matrix = []; foreach ($array as $value) { $matrix[] = [$value]; } return new self($matrix); } public function toArray(): array { return $this->matrix; } public function toScalar(): float { return $this->matrix[0][0]; } public function getRows(): int { return $this->rows; } public function getColumns(): int { return $this->columns; } /** * @throws MatrixException */ public function getColumnValues(int $column): array { if ($column >= $this->columns) { throw new MatrixException('Column out of range'); } return array_column($this->matrix, $column); } /** * @return float|int * * @throws MatrixException */ public function getDeterminant() { if ($this->determinant !== null) { return $this->determinant; } if (!$this->isSquare()) { throw new MatrixException('Matrix is not square matrix'); } $lu = new LUDecomposition($this); return $this->determinant = $lu->det(); } public function isSquare(): bool { return $this->columns === $this->rows; } public function transpose(): self { if ($this->rows === 1) { $matrix = array_map(static function ($el): array { return [$el]; }, $this->matrix[0]); } else { $matrix = array_map(null, ...$this->matrix); } return new self($matrix, false); } public function multiply(self $matrix): self { if ($this->columns !== $matrix->getRows()) { throw new InvalidArgumentException('Inconsistent matrix supplied'); } $array1 = $this->toArray(); $array2 = $matrix->toArray(); $colCount = $matrix->columns; /* - To speed-up multiplication, we need to avoid use of array index operator [ ] as much as possible( See #255 for details) - A combination of "foreach" and "array_column" works much faster then accessing the array via index operator */ $product = []; foreach ($array1 as $row => $rowData) { for ($col = 0; $col < $colCount; ++$col) { $columnData = array_column($array2, $col); $sum = 0; foreach ($rowData as $key => $valueData) { $sum += $valueData * $columnData[$key]; } $product[$row][$col] = $sum; } } return new self($product, false); } /** * @param float|int $value */ public function divideByScalar($value): self { $newMatrix = []; for ($i = 0; $i < $this->rows; ++$i) { for ($j = 0; $j < $this->columns; ++$j) { $newMatrix[$i][$j] = $this->matrix[$i][$j] / $value; } } return new self($newMatrix, false); } /** * @param float|int $value */ public function multiplyByScalar($value): self { $newMatrix = []; for ($i = 0; $i < $this->rows; ++$i) { for ($j = 0; $j < $this->columns; ++$j) { $newMatrix[$i][$j] = $this->matrix[$i][$j] * $value; } } return new self($newMatrix, false); } /** * Element-wise addition of the matrix with another one */ public function add(self $other): self { return $this->sum($other); } /** * Element-wise subtracting of another matrix from this one */ public function subtract(self $other): self { return $this->sum($other, -1); } public function inverse(): self { if (!$this->isSquare()) { throw new MatrixException('Matrix is not square matrix'); } $LU = new LUDecomposition($this); $identity = $this->getIdentity(); $inverse = $LU->solve($identity); return new self($inverse, false); } public function crossOut(int $row, int $column): self { $newMatrix = []; $r = 0; for ($i = 0; $i < $this->rows; ++$i) { $c = 0; if ($row != $i) { for ($j = 0; $j < $this->columns; ++$j) { if ($column != $j) { $newMatrix[$r][$c] = $this->matrix[$i][$j]; ++$c; } } ++$r; } } return new self($newMatrix, false); } public function isSingular(): bool { return $this->getDeterminant() == 0; } /** * Frobenius norm (Hilbert–Schmidt norm, Euclidean norm) (‖A‖F) * Square root of the sum of the square of all elements. * * https://en.wikipedia.org/wiki/Matrix_norm#Frobenius_norm * * _____________ * /ᵐ ⁿ * ‖A‖F = √ Σ Σ |aᵢⱼ|² * ᵢ₌₁ ᵢ₌₁ */ public function frobeniusNorm(): float { $squareSum = 0; for ($i = 0; $i < $this->rows; ++$i) { for ($j = 0; $j < $this->columns; ++$j) { $squareSum += $this->matrix[$i][$j] ** 2; } } return $squareSum ** .5; } /** * Returns the transpose of given array */ public static function transposeArray(array $array): array { return (new self($array, false))->transpose()->toArray(); } /** * Returns the dot product of two arrays<br> * Matrix::dot(x, y) ==> x.y' */ public static function dot(array $array1, array $array2): array { $m1 = new self($array1, false); $m2 = new self($array2, false); return $m1->multiply($m2->transpose())->toArray()[0]; } /** * Element-wise addition or substraction depending on the given sign parameter */ private function sum(self $other, int $sign = 1): self { $a1 = $this->toArray(); $a2 = $other->toArray(); $newMatrix = []; for ($i = 0; $i < $this->rows; ++$i) { for ($k = 0; $k < $this->columns; ++$k) { $newMatrix[$i][$k] = $a1[$i][$k] + $sign * $a2[$i][$k]; } } return new self($newMatrix, false); } /** * Returns diagonal identity matrix of the same size of this matrix */ private function getIdentity(): self { $array = array_fill(0, $this->rows, array_fill(0, $this->columns, 0)); for ($i = 0; $i < $this->rows; ++$i) { $array[$i][$i] = 1; } return new self($array, false); } } Math/Set.php 0000644 00000007264 15152065050 0006711 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math; use ArrayIterator; use IteratorAggregate; class Set implements IteratorAggregate { /** * @var string[]|int[]|float[]|bool[] */ private $elements = []; /** * @param string[]|int[]|float[]|bool[] $elements */ public function __construct(array $elements = []) { $this->elements = self::sanitize($elements); } /** * Creates the union of A and B. */ public static function union(self $a, self $b): self { return new self(array_merge($a->toArray(), $b->toArray())); } /** * Creates the intersection of A and B. */ public static function intersection(self $a, self $b): self { return new self(array_intersect($a->toArray(), $b->toArray())); } /** * Creates the difference of A and B. */ public static function difference(self $a, self $b): self { return new self(array_diff($a->toArray(), $b->toArray())); } /** * Creates the Cartesian product of A and B. * * @return Set[] */ public static function cartesian(self $a, self $b): array { $cartesian = []; foreach ($a as $multiplier) { foreach ($b as $multiplicand) { $cartesian[] = new self(array_merge([$multiplicand], [$multiplier])); } } return $cartesian; } /** * Creates the power set of A. * * @return Set[] */ public static function power(self $a): array { $power = [new self()]; foreach ($a as $multiplicand) { foreach ($power as $multiplier) { $power[] = new self(array_merge([$multiplicand], $multiplier->toArray())); } } return $power; } /** * @param string|int|float|bool $element */ public function add($element): self { return $this->addAll([$element]); } /** * @param string[]|int[]|float[]|bool[] $elements */ public function addAll(array $elements): self { $this->elements = self::sanitize(array_merge($this->elements, $elements)); return $this; } /** * @param string|int|float $element */ public function remove($element): self { return $this->removeAll([$element]); } /** * @param string[]|int[]|float[] $elements */ public function removeAll(array $elements): self { $this->elements = self::sanitize(array_diff($this->elements, $elements)); return $this; } /** * @param string|int|float $element */ public function contains($element): bool { return $this->containsAll([$element]); } /** * @param string[]|int[]|float[] $elements */ public function containsAll(array $elements): bool { return count(array_diff($elements, $this->elements)) === 0; } /** * @return string[]|int[]|float[]|bool[] */ public function toArray(): array { return $this->elements; } public function getIterator(): ArrayIterator { return new ArrayIterator($this->elements); } public function isEmpty(): bool { return $this->cardinality() === 0; } public function cardinality(): int { return count($this->elements); } /** * Removes duplicates and rewrites index. * * @param string[]|int[]|float[]|bool[] $elements * * @return string[]|int[]|float[]|bool[] */ private static function sanitize(array $elements): array { sort($elements, SORT_ASC); return array_values(array_unique($elements, SORT_ASC)); } } Math/Product.php 0000644 00000000645 15152065050 0007572 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math; class Product { /** * @return mixed */ public static function scalar(array $a, array $b) { $product = 0; foreach ($a as $index => $value) { if (is_numeric($value) && is_numeric($b[$index])) { $product += (float) $value * (float) $b[$index]; } } return $product; } } Math/Kernel.php 0000644 00000000336 15152065050 0007367 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math; interface Kernel { /** * @param float|array $a * @param float|array $b * * @return float|array */ public function compute($a, $b); } Math/LinearAlgebra/LUDecomposition.php 0000644 00000016325 15152065050 0013721 0 ustar 00 <?php declare(strict_types=1); /** * @package JAMA * * For an m-by-n matrix A with m >= n, the LU decomposition is an m-by-n * unit lower triangular matrix L, an n-by-n upper triangular matrix U, * and a permutation vector piv of length m so that A(piv,:) = L*U. * If m < n, then L is m-by-m and U is m-by-n. * * The LU decompostion with pivoting always exists, even if the matrix is * singular, so the constructor will never fail. The primary use of the * LU decomposition is in the solution of square systems of simultaneous * linear equations. This will fail if isNonsingular() returns false. * * @author Paul Meagher * @author Bartosz Matosiuk * @author Michael Bommarito * * @version 1.1 * * @license PHP v3.0 * * Slightly changed to adapt the original code to PHP-ML library * @date 2017/04/24 * * @author Mustafa Karabulut */ namespace Phpml\Math\LinearAlgebra; use Phpml\Exception\MatrixException; use Phpml\Math\Matrix; class LUDecomposition { /** * Decomposition storage * * @var array */ private $LU = []; /** * Row dimension. * * @var int */ private $m; /** * Column dimension. * * @var int */ private $n; /** * Pivot sign. * * @var int */ private $pivsign; /** * Internal storage of pivot vector. * * @var array */ private $piv = []; /** * Constructs Structure to access L, U and piv. * * @param Matrix $A Rectangular matrix * * @throws MatrixException */ public function __construct(Matrix $A) { if ($A->getRows() !== $A->getColumns()) { throw new MatrixException('Matrix is not square matrix'); } // Use a "left-looking", dot-product, Crout/Doolittle algorithm. $this->LU = $A->toArray(); $this->m = $A->getRows(); $this->n = $A->getColumns(); for ($i = 0; $i < $this->m; ++$i) { $this->piv[$i] = $i; } $this->pivsign = 1; $LUcolj = []; // Outer loop. for ($j = 0; $j < $this->n; ++$j) { // Make a copy of the j-th column to localize references. for ($i = 0; $i < $this->m; ++$i) { $LUcolj[$i] = &$this->LU[$i][$j]; } // Apply previous transformations. for ($i = 0; $i < $this->m; ++$i) { $LUrowi = $this->LU[$i]; // Most of the time is spent in the following dot product. $kmax = min($i, $j); $s = 0.0; for ($k = 0; $k < $kmax; ++$k) { $s += $LUrowi[$k] * $LUcolj[$k]; } $LUrowi[$j] = $LUcolj[$i] -= $s; } // Find pivot and exchange if necessary. $p = $j; for ($i = $j + 1; $i < $this->m; ++$i) { if (abs($LUcolj[$i] ?? 0) > abs($LUcolj[$p] ?? 0)) { $p = $i; } } if ($p != $j) { for ($k = 0; $k < $this->n; ++$k) { $t = $this->LU[$p][$k]; $this->LU[$p][$k] = $this->LU[$j][$k]; $this->LU[$j][$k] = $t; } $k = $this->piv[$p]; $this->piv[$p] = $this->piv[$j]; $this->piv[$j] = $k; $this->pivsign *= -1; } // Compute multipliers. if (($j < $this->m) && ($this->LU[$j][$j] != 0.0)) { for ($i = $j + 1; $i < $this->m; ++$i) { $this->LU[$i][$j] /= $this->LU[$j][$j]; } } } } /** * Get lower triangular factor. * * @return Matrix Lower triangular factor */ public function getL(): Matrix { $L = []; for ($i = 0; $i < $this->m; ++$i) { for ($j = 0; $j < $this->n; ++$j) { if ($i > $j) { $L[$i][$j] = $this->LU[$i][$j]; } elseif ($i == $j) { $L[$i][$j] = 1.0; } else { $L[$i][$j] = 0.0; } } } return new Matrix($L); } /** * Get upper triangular factor. * * @return Matrix Upper triangular factor */ public function getU(): Matrix { $U = []; for ($i = 0; $i < $this->n; ++$i) { for ($j = 0; $j < $this->n; ++$j) { if ($i <= $j) { $U[$i][$j] = $this->LU[$i][$j]; } else { $U[$i][$j] = 0.0; } } } return new Matrix($U); } /** * Return pivot permutation vector. * * @return array Pivot vector */ public function getPivot(): array { return $this->piv; } /** * Alias for getPivot * * @see getPivot */ public function getDoublePivot(): array { return $this->getPivot(); } /** * Is the matrix nonsingular? * * @return bool true if U, and hence A, is nonsingular. */ public function isNonsingular(): bool { for ($j = 0; $j < $this->n; ++$j) { if ($this->LU[$j][$j] == 0) { return false; } } return true; } public function det(): float { $d = $this->pivsign; for ($j = 0; $j < $this->n; ++$j) { $d *= $this->LU[$j][$j]; } return (float) $d; } /** * Solve A*X = B * * @param Matrix $B A Matrix with as many rows as A and any number of columns. * * @return array X so that L*U*X = B(piv,:) * * @throws MatrixException */ public function solve(Matrix $B): array { if ($B->getRows() != $this->m) { throw new MatrixException('Matrix is not square matrix'); } if (!$this->isNonsingular()) { throw new MatrixException('Matrix is singular'); } // Copy right hand side with pivoting $nx = $B->getColumns(); $X = $this->getSubMatrix($B->toArray(), $this->piv, 0, $nx - 1); // Solve L*Y = B(piv,:) for ($k = 0; $k < $this->n; ++$k) { for ($i = $k + 1; $i < $this->n; ++$i) { for ($j = 0; $j < $nx; ++$j) { $X[$i][$j] -= $X[$k][$j] * $this->LU[$i][$k]; } } } // Solve U*X = Y; for ($k = $this->n - 1; $k >= 0; --$k) { for ($j = 0; $j < $nx; ++$j) { $X[$k][$j] /= $this->LU[$k][$k]; } for ($i = 0; $i < $k; ++$i) { for ($j = 0; $j < $nx; ++$j) { $X[$i][$j] -= $X[$k][$j] * $this->LU[$i][$k]; } } } return $X; } protected function getSubMatrix(array $matrix, array $RL, int $j0, int $jF): array { $m = count($RL); $n = $jF - $j0; $R = array_fill(0, $m, array_fill(0, $n + 1, 0.0)); for ($i = 0; $i < $m; ++$i) { for ($j = $j0; $j <= $jF; ++$j) { $R[$i][$j - $j0] = $matrix[$RL[$i]][$j]; } } return $R; } } Math/LinearAlgebra/EigenvalueDecomposition.php 0000644 00000077566 15152065050 0015503 0 ustar 00 <?php declare(strict_types=1); /** * Class to obtain eigenvalues and eigenvectors of a real matrix. * * If A is symmetric, then A = V*D*V' where the eigenvalue matrix D * is diagonal and the eigenvector matrix V is orthogonal (i.e. * A = V.times(D.times(V.transpose())) and V.times(V.transpose()) * equals the identity matrix). * * If A is not symmetric, then the eigenvalue matrix D is block diagonal * with the real eigenvalues in 1-by-1 blocks and any complex eigenvalues, * lambda + i*mu, in 2-by-2 blocks, [lambda, mu; -mu, lambda]. The * columns of V represent the eigenvectors in the sense that A*V = V*D, * i.e. A.times(V) equals V.times(D). The matrix V may be badly * conditioned, or even singular, so the validity of the equation * A = V*D*inverse(V) depends upon V.cond(). * * @author Paul Meagher * @license PHP v3.0 * * @version 1.1 * * Slightly changed to adapt the original code to PHP-ML library * @date 2017/04/11 * * @author Mustafa Karabulut */ namespace Phpml\Math\LinearAlgebra; use Phpml\Math\Matrix; class EigenvalueDecomposition { /** * Row and column dimension (square matrix). * * @var int */ private $n; /** * Arrays for internal storage of eigenvalues. * * @var array */ private $d = []; /** * @var array */ private $e = []; /** * Array for internal storage of eigenvectors. * * @var array */ private $V = []; /** * Array for internal storage of nonsymmetric Hessenberg form. * * @var array */ private $H = []; /** * Working storage for nonsymmetric algorithm. * * @var array */ private $ort = []; /** * Used for complex scalar division. * * @var float */ private $cdivr; /** * @var float */ private $cdivi; /** * Constructor: Check for symmetry, then construct the eigenvalue decomposition */ public function __construct(array $arg) { $this->n = count($arg[0]); $symmetric = true; for ($j = 0; ($j < $this->n) & $symmetric; ++$j) { for ($i = 0; ($i < $this->n) & $symmetric; ++$i) { $symmetric = $arg[$i][$j] == $arg[$j][$i]; } } if ($symmetric) { $this->V = $arg; // Tridiagonalize. $this->tred2(); // Diagonalize. $this->tql2(); } else { $this->H = $arg; $this->ort = []; // Reduce to Hessenberg form. $this->orthes(); // Reduce Hessenberg to real Schur form. $this->hqr2(); } } /** * Return the eigenvector matrix */ public function getEigenvectors(): array { $vectors = $this->V; // Always return the eigenvectors of length 1.0 $vectors = new Matrix($vectors); $vectors = array_map(function ($vect) { $sum = 0; $count = count($vect); for ($i = 0; $i < $count; ++$i) { $sum += $vect[$i] ** 2; } $sum **= .5; for ($i = 0; $i < $count; ++$i) { $vect[$i] /= $sum; } return $vect; }, $vectors->transpose()->toArray()); return $vectors; } /** * Return the real parts of the eigenvalues<br> * d = real(diag(D)); */ public function getRealEigenvalues(): array { return $this->d; } /** * Return the imaginary parts of the eigenvalues <br> * d = imag(diag(D)) */ public function getImagEigenvalues(): array { return $this->e; } /** * Return the block diagonal eigenvalue matrix */ public function getDiagonalEigenvalues(): array { $D = []; for ($i = 0; $i < $this->n; ++$i) { $D[$i] = array_fill(0, $this->n, 0.0); $D[$i][$i] = $this->d[$i]; if ($this->e[$i] == 0) { continue; } $o = $this->e[$i] > 0 ? $i + 1 : $i - 1; $D[$i][$o] = $this->e[$i]; } return $D; } /** * Symmetric Householder reduction to tridiagonal form. */ private function tred2(): void { // This is derived from the Algol procedures tred2 by // Bowdler, Martin, Reinsch, and Wilkinson, Handbook for // Auto. Comp., Vol.ii-Linear Algebra, and the corresponding // Fortran subroutine in EISPACK. $this->d = $this->V[$this->n - 1]; // Householder reduction to tridiagonal form. for ($i = $this->n - 1; $i > 0; --$i) { $i_ = $i - 1; // Scale to avoid under/overflow. $h = $scale = 0.0; $scale += array_sum(array_map('abs', $this->d)); if ($scale == 0.0) { $this->e[$i] = $this->d[$i_]; $this->d = array_slice($this->V[$i_], 0, $this->n - 1); for ($j = 0; $j < $i; ++$j) { $this->V[$j][$i] = $this->V[$i][$j] = 0.0; } } else { // Generate Householder vector. for ($k = 0; $k < $i; ++$k) { $this->d[$k] /= $scale; $h += $this->d[$k] ** 2; } $f = $this->d[$i_]; $g = $h ** .5; if ($f > 0) { $g = -$g; } $this->e[$i] = $scale * $g; $h -= $f * $g; $this->d[$i_] = $f - $g; for ($j = 0; $j < $i; ++$j) { $this->e[$j] = 0.0; } // Apply similarity transformation to remaining columns. for ($j = 0; $j < $i; ++$j) { $f = $this->d[$j]; $this->V[$j][$i] = $f; $g = $this->e[$j] + $this->V[$j][$j] * $f; for ($k = $j + 1; $k <= $i_; ++$k) { $g += $this->V[$k][$j] * $this->d[$k]; $this->e[$k] += $this->V[$k][$j] * $f; } $this->e[$j] = $g; } $f = 0.0; if ($h == 0.0) { $h = 1e-32; } for ($j = 0; $j < $i; ++$j) { $this->e[$j] /= $h; $f += $this->e[$j] * $this->d[$j]; } $hh = $f / (2 * $h); for ($j = 0; $j < $i; ++$j) { $this->e[$j] -= $hh * $this->d[$j]; } for ($j = 0; $j < $i; ++$j) { $f = $this->d[$j]; $g = $this->e[$j]; for ($k = $j; $k <= $i_; ++$k) { $this->V[$k][$j] -= ($f * $this->e[$k] + $g * $this->d[$k]); } $this->d[$j] = $this->V[$i - 1][$j]; $this->V[$i][$j] = 0.0; } } $this->d[$i] = $h; } // Accumulate transformations. for ($i = 0; $i < $this->n - 1; ++$i) { $this->V[$this->n - 1][$i] = $this->V[$i][$i]; $this->V[$i][$i] = 1.0; $h = $this->d[$i + 1]; if ($h != 0.0) { for ($k = 0; $k <= $i; ++$k) { $this->d[$k] = $this->V[$k][$i + 1] / $h; } for ($j = 0; $j <= $i; ++$j) { $g = 0.0; for ($k = 0; $k <= $i; ++$k) { $g += $this->V[$k][$i + 1] * $this->V[$k][$j]; } for ($k = 0; $k <= $i; ++$k) { $this->V[$k][$j] -= $g * $this->d[$k]; } } } for ($k = 0; $k <= $i; ++$k) { $this->V[$k][$i + 1] = 0.0; } } $this->d = $this->V[$this->n - 1]; $this->V[$this->n - 1] = array_fill(0, $this->n, 0.0); $this->V[$this->n - 1][$this->n - 1] = 1.0; $this->e[0] = 0.0; } /** * Symmetric tridiagonal QL algorithm. * * This is derived from the Algol procedures tql2, by * Bowdler, Martin, Reinsch, and Wilkinson, Handbook for * Auto. Comp., Vol.ii-Linear Algebra, and the corresponding * Fortran subroutine in EISPACK. */ private function tql2(): void { for ($i = 1; $i < $this->n; ++$i) { $this->e[$i - 1] = $this->e[$i]; } $this->e[$this->n - 1] = 0.0; $f = 0.0; $tst1 = 0.0; $eps = 2.0 ** -52.0; for ($l = 0; $l < $this->n; ++$l) { // Find small subdiagonal element $tst1 = max($tst1, abs($this->d[$l]) + abs($this->e[$l])); $m = $l; while ($m < $this->n) { if (abs($this->e[$m]) <= $eps * $tst1) { break; } ++$m; } // If m == l, $this->d[l] is an eigenvalue, // otherwise, iterate. if ($m > $l) { do { // Compute implicit shift $g = $this->d[$l]; $p = ($this->d[$l + 1] - $g) / (2.0 * $this->e[$l]); $r = hypot($p, 1.0); if ($p < 0) { $r *= -1; } $this->d[$l] = $this->e[$l] / ($p + $r); $this->d[$l + 1] = $this->e[$l] * ($p + $r); $dl1 = $this->d[$l + 1]; $h = $g - $this->d[$l]; for ($i = $l + 2; $i < $this->n; ++$i) { $this->d[$i] -= $h; } $f += $h; // Implicit QL transformation. $p = $this->d[$m]; $c = 1.0; $c2 = $c3 = $c; $el1 = $this->e[$l + 1]; $s = $s2 = 0.0; for ($i = $m - 1; $i >= $l; --$i) { $c3 = $c2; $c2 = $c; $s2 = $s; $g = $c * $this->e[$i]; $h = $c * $p; $r = hypot($p, $this->e[$i]); $this->e[$i + 1] = $s * $r; $s = $this->e[$i] / $r; $c = $p / $r; $p = $c * $this->d[$i] - $s * $g; $this->d[$i + 1] = $h + $s * ($c * $g + $s * $this->d[$i]); // Accumulate transformation. for ($k = 0; $k < $this->n; ++$k) { $h = $this->V[$k][$i + 1]; $this->V[$k][$i + 1] = $s * $this->V[$k][$i] + $c * $h; $this->V[$k][$i] = $c * $this->V[$k][$i] - $s * $h; } } $p = -$s * $s2 * $c3 * $el1 * $this->e[$l] / $dl1; $this->e[$l] = $s * $p; $this->d[$l] = $c * $p; // Check for convergence. } while (abs($this->e[$l]) > $eps * $tst1); } $this->d[$l] += $f; $this->e[$l] = 0.0; } // Sort eigenvalues and corresponding vectors. for ($i = 0; $i < $this->n - 1; ++$i) { $k = $i; $p = $this->d[$i]; for ($j = $i + 1; $j < $this->n; ++$j) { if ($this->d[$j] < $p) { $k = $j; $p = $this->d[$j]; } } if ($k != $i) { $this->d[$k] = $this->d[$i]; $this->d[$i] = $p; for ($j = 0; $j < $this->n; ++$j) { $p = $this->V[$j][$i]; $this->V[$j][$i] = $this->V[$j][$k]; $this->V[$j][$k] = $p; } } } } /** * Nonsymmetric reduction to Hessenberg form. * * This is derived from the Algol procedures orthes and ortran, * by Martin and Wilkinson, Handbook for Auto. Comp., * Vol.ii-Linear Algebra, and the corresponding * Fortran subroutines in EISPACK. */ private function orthes(): void { $low = 0; $high = $this->n - 1; for ($m = $low + 1; $m <= $high - 1; ++$m) { // Scale column. $scale = 0.0; for ($i = $m; $i <= $high; ++$i) { $scale += abs($this->H[$i][$m - 1]); } if ($scale != 0.0) { // Compute Householder transformation. $h = 0.0; for ($i = $high; $i >= $m; --$i) { $this->ort[$i] = $this->H[$i][$m - 1] / $scale; $h += $this->ort[$i] * $this->ort[$i]; } $g = $h ** .5; if ($this->ort[$m] > 0) { $g *= -1; } $h -= $this->ort[$m] * $g; $this->ort[$m] -= $g; // Apply Householder similarity transformation // H = (I -u * u' / h) * H * (I -u * u') / h) for ($j = $m; $j < $this->n; ++$j) { $f = 0.0; for ($i = $high; $i >= $m; --$i) { $f += $this->ort[$i] * $this->H[$i][$j]; } $f /= $h; for ($i = $m; $i <= $high; ++$i) { $this->H[$i][$j] -= $f * $this->ort[$i]; } } for ($i = 0; $i <= $high; ++$i) { $f = 0.0; for ($j = $high; $j >= $m; --$j) { $f += $this->ort[$j] * $this->H[$i][$j]; } $f /= $h; for ($j = $m; $j <= $high; ++$j) { $this->H[$i][$j] -= $f * $this->ort[$j]; } } $this->ort[$m] = $scale * $this->ort[$m]; $this->H[$m][$m - 1] = $scale * $g; } } // Accumulate transformations (Algol's ortran). for ($i = 0; $i < $this->n; ++$i) { for ($j = 0; $j < $this->n; ++$j) { $this->V[$i][$j] = ($i == $j ? 1.0 : 0.0); } } for ($m = $high - 1; $m >= $low + 1; --$m) { if ($this->H[$m][$m - 1] != 0.0) { for ($i = $m + 1; $i <= $high; ++$i) { $this->ort[$i] = $this->H[$i][$m - 1]; } for ($j = $m; $j <= $high; ++$j) { $g = 0.0; for ($i = $m; $i <= $high; ++$i) { $g += $this->ort[$i] * $this->V[$i][$j]; } // Double division avoids possible underflow $g /= $this->ort[$m]; $g /= $this->H[$m][$m - 1]; for ($i = $m; $i <= $high; ++$i) { $this->V[$i][$j] += $g * $this->ort[$i]; } } } } } /** * Performs complex division. * * @param int|float $xr * @param int|float $xi * @param int|float $yr * @param int|float $yi */ private function cdiv($xr, $xi, $yr, $yi): void { if (abs($yr) > abs($yi)) { $r = $yi / $yr; $d = $yr + $r * $yi; $this->cdivr = ($xr + $r * $xi) / $d; $this->cdivi = ($xi - $r * $xr) / $d; } else { $r = $yr / $yi; $d = $yi + $r * $yr; $this->cdivr = ($r * $xr + $xi) / $d; $this->cdivi = ($r * $xi - $xr) / $d; } } /** * Nonsymmetric reduction from Hessenberg to real Schur form. * * Code is derived from the Algol procedure hqr2, * by Martin and Wilkinson, Handbook for Auto. Comp., * Vol.ii-Linear Algebra, and the corresponding * Fortran subroutine in EISPACK. */ private function hqr2(): void { // Initialize $nn = $this->n; $n = $nn - 1; $low = 0; $high = $nn - 1; $eps = 2.0 ** -52.0; $exshift = 0.0; $p = $q = $r = $s = $z = 0; // Store roots isolated by balanc and compute matrix norm $norm = 0.0; for ($i = 0; $i < $nn; ++$i) { if (($i < $low) or ($i > $high)) { $this->d[$i] = $this->H[$i][$i]; $this->e[$i] = 0.0; } for ($j = max($i - 1, 0); $j < $nn; ++$j) { $norm += abs($this->H[$i][$j]); } } // Outer loop over eigenvalue index $iter = 0; while ($n >= $low) { // Look for single small sub-diagonal element $l = $n; while ($l > $low) { $s = abs($this->H[$l - 1][$l - 1]) + abs($this->H[$l][$l]); if ($s == 0.0) { $s = $norm; } if (abs($this->H[$l][$l - 1]) < $eps * $s) { break; } --$l; } // Check for convergence // One root found if ($l == $n) { $this->H[$n][$n] += $exshift; $this->d[$n] = $this->H[$n][$n]; $this->e[$n] = 0.0; --$n; $iter = 0; // Two roots found } elseif ($l == $n - 1) { $w = $this->H[$n][$n - 1] * $this->H[$n - 1][$n]; $p = ($this->H[$n - 1][$n - 1] - $this->H[$n][$n]) / 2.0; $q = $p * $p + $w; $z = abs($q) ** .5; $this->H[$n][$n] += $exshift; $this->H[$n - 1][$n - 1] += $exshift; $x = $this->H[$n][$n]; // Real pair if ($q >= 0) { if ($p >= 0) { $z = $p + $z; } else { $z = $p - $z; } $this->d[$n - 1] = $x + $z; $this->d[$n] = $this->d[$n - 1]; if ($z != 0.0) { $this->d[$n] = $x - $w / $z; } $this->e[$n - 1] = 0.0; $this->e[$n] = 0.0; $x = $this->H[$n][$n - 1]; $s = abs($x) + abs($z); $p = $x / $s; $q = $z / $s; $r = ($p * $p + $q * $q) ** .5; $p /= $r; $q /= $r; // Row modification for ($j = $n - 1; $j < $nn; ++$j) { $z = $this->H[$n - 1][$j]; $this->H[$n - 1][$j] = $q * $z + $p * $this->H[$n][$j]; $this->H[$n][$j] = $q * $this->H[$n][$j] - $p * $z; } // Column modification for ($i = 0; $i <= $n; ++$i) { $z = $this->H[$i][$n - 1]; $this->H[$i][$n - 1] = $q * $z + $p * $this->H[$i][$n]; $this->H[$i][$n] = $q * $this->H[$i][$n] - $p * $z; } // Accumulate transformations for ($i = $low; $i <= $high; ++$i) { $z = $this->V[$i][$n - 1]; $this->V[$i][$n - 1] = $q * $z + $p * $this->V[$i][$n]; $this->V[$i][$n] = $q * $this->V[$i][$n] - $p * $z; } // Complex pair } else { $this->d[$n - 1] = $x + $p; $this->d[$n] = $x + $p; $this->e[$n - 1] = $z; $this->e[$n] = -$z; } $n -= 2; $iter = 0; // No convergence yet } else { // Form shift $x = $this->H[$n][$n]; $y = 0.0; $w = 0.0; if ($l < $n) { $y = $this->H[$n - 1][$n - 1]; $w = $this->H[$n][$n - 1] * $this->H[$n - 1][$n]; } // Wilkinson's original ad hoc shift if ($iter == 10) { $exshift += $x; for ($i = $low; $i <= $n; ++$i) { $this->H[$i][$i] -= $x; } $s = abs($this->H[$n][$n - 1]) + abs($this->H[$n - 1][$n - 2]); $x = $y = 0.75 * $s; $w = -0.4375 * $s * $s; } // MATLAB's new ad hoc shift if ($iter == 30) { $s = ($y - $x) / 2.0; $s *= $s + $w; if ($s > 0) { $s **= .5; if ($y < $x) { $s = -$s; } $s = $x - $w / (($y - $x) / 2.0 + $s); for ($i = $low; $i <= $n; ++$i) { $this->H[$i][$i] -= $s; } $exshift += $s; $x = $y = $w = 0.964; } } // Could check iteration count here. ++$iter; // Look for two consecutive small sub-diagonal elements $m = $n - 2; while ($m >= $l) { $z = $this->H[$m][$m]; $r = $x - $z; $s = $y - $z; $p = ($r * $s - $w) / $this->H[$m + 1][$m] + $this->H[$m][$m + 1]; $q = $this->H[$m + 1][$m + 1] - $z - $r - $s; $r = $this->H[$m + 2][$m + 1]; $s = abs($p) + abs($q) + abs($r); $p /= $s; $q /= $s; $r /= $s; if ($m == $l) { break; } if (abs($this->H[$m][$m - 1]) * (abs($q) + abs($r)) < $eps * (abs($p) * (abs($this->H[$m - 1][$m - 1]) + abs($z) + abs($this->H[$m + 1][$m + 1])))) { break; } --$m; } for ($i = $m + 2; $i <= $n; ++$i) { $this->H[$i][$i - 2] = 0.0; if ($i > $m + 2) { $this->H[$i][$i - 3] = 0.0; } } // Double QR step involving rows l:n and columns m:n for ($k = $m; $k <= $n - 1; ++$k) { $notlast = $k != $n - 1; if ($k != $m) { $p = $this->H[$k][$k - 1]; $q = $this->H[$k + 1][$k - 1]; $r = ($notlast ? $this->H[$k + 2][$k - 1] : 0.0); $x = abs($p) + abs($q) + abs($r); if ($x != 0.0) { $p /= $x; $q /= $x; $r /= $x; } } if ($x == 0.0) { break; } $s = ($p * $p + $q * $q + $r * $r) ** .5; if ($p < 0) { $s = -$s; } if ($s != 0) { if ($k != $m) { $this->H[$k][$k - 1] = -$s * $x; } elseif ($l != $m) { $this->H[$k][$k - 1] = -$this->H[$k][$k - 1]; } $p += $s; $x = $p / $s; $y = $q / $s; $z = $r / $s; $q /= $p; $r /= $p; // Row modification for ($j = $k; $j < $nn; ++$j) { $p = $this->H[$k][$j] + $q * $this->H[$k + 1][$j]; if ($notlast) { $p += $r * $this->H[$k + 2][$j]; $this->H[$k + 2][$j] -= $p * $z; } $this->H[$k][$j] -= $p * $x; $this->H[$k + 1][$j] -= $p * $y; } // Column modification for ($i = 0; $i <= min($n, $k + 3); ++$i) { $p = $x * $this->H[$i][$k] + $y * $this->H[$i][$k + 1]; if ($notlast) { $p += $z * $this->H[$i][$k + 2]; $this->H[$i][$k + 2] -= $p * $r; } $this->H[$i][$k] -= $p; $this->H[$i][$k + 1] -= $p * $q; } // Accumulate transformations for ($i = $low; $i <= $high; ++$i) { $p = $x * $this->V[$i][$k] + $y * $this->V[$i][$k + 1]; if ($notlast) { $p += $z * $this->V[$i][$k + 2]; $this->V[$i][$k + 2] -= $p * $r; } $this->V[$i][$k] -= $p; $this->V[$i][$k + 1] -= $p * $q; } } // ($s != 0) } // k loop } // check convergence } // while ($n >= $low) // Backsubstitute to find vectors of upper triangular form if ($norm == 0.0) { return; } for ($n = $nn - 1; $n >= 0; --$n) { $p = $this->d[$n]; $q = $this->e[$n]; // Real vector if ($q == 0) { $l = $n; $this->H[$n][$n] = 1.0; for ($i = $n - 1; $i >= 0; --$i) { $w = $this->H[$i][$i] - $p; $r = 0.0; for ($j = $l; $j <= $n; ++$j) { $r += $this->H[$i][$j] * $this->H[$j][$n]; } if ($this->e[$i] < 0.0) { $z = $w; $s = $r; } else { $l = $i; if ($this->e[$i] == 0.0) { if ($w != 0.0) { $this->H[$i][$n] = -$r / $w; } else { $this->H[$i][$n] = -$r / ($eps * $norm); } // Solve real equations } else { $x = $this->H[$i][$i + 1]; $y = $this->H[$i + 1][$i]; $q = ($this->d[$i] - $p) * ($this->d[$i] - $p) + $this->e[$i] * $this->e[$i]; $t = ($x * $s - $z * $r) / $q; $this->H[$i][$n] = $t; if (abs($x) > abs($z)) { $this->H[$i + 1][$n] = (-$r - $w * $t) / $x; } else { $this->H[$i + 1][$n] = (-$s - $y * $t) / $z; } } // Overflow control $t = abs($this->H[$i][$n]); if (($eps * $t) * $t > 1) { for ($j = $i; $j <= $n; ++$j) { $this->H[$j][$n] /= $t; } } } } // Complex vector } elseif ($q < 0) { $l = $n - 1; // Last vector component imaginary so matrix is triangular if (abs($this->H[$n][$n - 1]) > abs($this->H[$n - 1][$n])) { $this->H[$n - 1][$n - 1] = $q / $this->H[$n][$n - 1]; $this->H[$n - 1][$n] = -($this->H[$n][$n] - $p) / $this->H[$n][$n - 1]; } else { $this->cdiv(0.0, -$this->H[$n - 1][$n], $this->H[$n - 1][$n - 1] - $p, $q); $this->H[$n - 1][$n - 1] = $this->cdivr; $this->H[$n - 1][$n] = $this->cdivi; } $this->H[$n][$n - 1] = 0.0; $this->H[$n][$n] = 1.0; for ($i = $n - 2; $i >= 0; --$i) { // double ra,sa,vr,vi; $ra = 0.0; $sa = 0.0; for ($j = $l; $j <= $n; ++$j) { $ra += $this->H[$i][$j] * $this->H[$j][$n - 1]; $sa += $this->H[$i][$j] * $this->H[$j][$n]; } $w = $this->H[$i][$i] - $p; if ($this->e[$i] < 0.0) { $z = $w; $r = $ra; $s = $sa; } else { $l = $i; if ($this->e[$i] == 0) { $this->cdiv(-$ra, -$sa, $w, $q); $this->H[$i][$n - 1] = $this->cdivr; $this->H[$i][$n] = $this->cdivi; } else { // Solve complex equations $x = $this->H[$i][$i + 1]; $y = $this->H[$i + 1][$i]; $vr = ($this->d[$i] - $p) * ($this->d[$i] - $p) + $this->e[$i] * $this->e[$i] - $q * $q; $vi = ($this->d[$i] - $p) * 2.0 * $q; if ($vr == 0.0 && $vi == 0.0) { $vr = $eps * $norm * (abs($w) + abs($q) + abs($x) + abs($y) + abs($z)); } $this->cdiv($x * $r - $z * $ra + $q * $sa, $x * $s - $z * $sa - $q * $ra, $vr, $vi); $this->H[$i][$n - 1] = $this->cdivr; $this->H[$i][$n] = $this->cdivi; if (abs($x) > (abs($z) + abs($q))) { $this->H[$i + 1][$n - 1] = (-$ra - $w * $this->H[$i][$n - 1] + $q * $this->H[$i][$n]) / $x; $this->H[$i + 1][$n] = (-$sa - $w * $this->H[$i][$n] - $q * $this->H[$i][$n - 1]) / $x; } else { $this->cdiv(-$r - $y * $this->H[$i][$n - 1], -$s - $y * $this->H[$i][$n], $z, $q); $this->H[$i + 1][$n - 1] = $this->cdivr; $this->H[$i + 1][$n] = $this->cdivi; } } // Overflow control $t = max(abs($this->H[$i][$n - 1]), abs($this->H[$i][$n])); if (($eps * $t) * $t > 1) { for ($j = $i; $j <= $n; ++$j) { $this->H[$j][$n - 1] /= $t; $this->H[$j][$n] /= $t; } } } // end else } // end for } // end else for complex case } // end for // Vectors of isolated roots for ($i = 0; $i < $nn; ++$i) { if ($i < $low || $i > $high) { for ($j = $i; $j < $nn; ++$j) { $this->V[$i][$j] = $this->H[$i][$j]; } } } // Back transformation to get eigenvectors of original matrix for ($j = $nn - 1; $j >= $low; --$j) { for ($i = $low; $i <= $high; ++$i) { $z = 0.0; for ($k = $low; $k <= min($j, $high); ++$k) { $z += $this->V[$i][$k] * $this->H[$k][$j]; } $this->V[$i][$j] = $z; } } } } Math/Comparison.php 0000644 00000001713 15152065050 0010261 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math; use Phpml\Exception\InvalidArgumentException; class Comparison { /** * @param mixed $a * @param mixed $b * * @throws InvalidArgumentException */ public static function compare($a, $b, string $operator): bool { switch ($operator) { case '>': return $a > $b; case '>=': return $a >= $b; case '=': case '==': return $a == $b; case '===': return $a === $b; case '<=': return $a <= $b; case '<': return $a < $b; case '!=': case '<>': return $a != $b; case '!==': return $a !== $b; default: throw new InvalidArgumentException(sprintf('Invalid operator "%s" provided', $operator)); } } } Math/Distance.php 0000644 00000000210 15152065050 0007670 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math; interface Distance { public function distance(array $a, array $b): float; } Math/Statistic/Correlation.php 0000644 00000001560 15152065050 0012377 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Statistic; use Phpml\Exception\InvalidArgumentException; class Correlation { /** * @param int[]|float[] $x * @param int[]|float[] $y * * @throws InvalidArgumentException */ public static function pearson(array $x, array $y): float { if (count($x) !== count($y)) { throw new InvalidArgumentException('Size of given arrays does not match'); } $count = count($x); $meanX = Mean::arithmetic($x); $meanY = Mean::arithmetic($y); $axb = 0; $a2 = 0; $b2 = 0; for ($i = 0; $i < $count; ++$i) { $a = $x[$i] - $meanX; $b = $y[$i] - $meanY; $axb += ($a * $b); $a2 += $a ** 2; $b2 += $b ** 2; } return $axb / ($a2 * $b2) ** .5; } } Math/Statistic/StandardDeviation.php 0000644 00000002527 15152065050 0013525 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Statistic; use Phpml\Exception\InvalidArgumentException; class StandardDeviation { /** * @param float[]|int[] $numbers */ public static function population(array $numbers, bool $sample = true): float { $n = count($numbers); if ($n === 0) { throw new InvalidArgumentException('The array has zero elements'); } if ($sample && $n === 1) { throw new InvalidArgumentException('The array must have at least 2 elements'); } $mean = Mean::arithmetic($numbers); $carry = 0.0; foreach ($numbers as $val) { $carry += ($val - $mean) ** 2; } if ($sample) { --$n; } return ($carry / $n) ** .5; } /** * Sum of squares deviations * ∑⟮xᵢ - μ⟯² * * @param float[]|int[] $numbers */ public static function sumOfSquares(array $numbers): float { if (count($numbers) === 0) { throw new InvalidArgumentException('The array has zero elements'); } $mean = Mean::arithmetic($numbers); return array_sum(array_map( static function ($val) use ($mean): float { return ($val - $mean) ** 2; }, $numbers )); } } Math/Statistic/Covariance.php 0000644 00000010323 15152065050 0012165 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Statistic; use Phpml\Exception\InvalidArgumentException; class Covariance { /** * Calculates covariance from two given arrays, x and y, respectively * * @throws InvalidArgumentException */ public static function fromXYArrays(array $x, array $y, bool $sample = true, ?float $meanX = null, ?float $meanY = null): float { $n = count($x); if ($n === 0 || count($y) === 0) { throw new InvalidArgumentException('The array has zero elements'); } if ($sample && $n === 1) { throw new InvalidArgumentException('The array must have at least 2 elements'); } if ($meanX === null) { $meanX = Mean::arithmetic($x); } if ($meanY === null) { $meanY = Mean::arithmetic($y); } $sum = 0.0; foreach ($x as $index => $xi) { $yi = $y[$index]; $sum += ($xi - $meanX) * ($yi - $meanY); } if ($sample) { --$n; } return $sum / $n; } /** * Calculates covariance of two dimensions, i and k in the given data. * * @throws InvalidArgumentException * @throws \Exception */ public static function fromDataset(array $data, int $i, int $k, bool $sample = true, ?float $meanX = null, ?float $meanY = null): float { if (count($data) === 0) { throw new InvalidArgumentException('The array has zero elements'); } $n = count($data); if ($sample && $n === 1) { throw new InvalidArgumentException('The array must have at least 2 elements'); } if ($i < 0 || $k < 0 || $i >= $n || $k >= $n) { throw new InvalidArgumentException('Given indices i and k do not match with the dimensionality of data'); } if ($meanX === null || $meanY === null) { $x = array_column($data, $i); $y = array_column($data, $k); $meanX = Mean::arithmetic($x); $meanY = Mean::arithmetic($y); $sum = 0.0; foreach ($x as $index => $xi) { $yi = $y[$index]; $sum += ($xi - $meanX) * ($yi - $meanY); } } else { // In the case, whole dataset given along with dimension indices, i and k, // we would like to avoid getting column data with array_column and operate // over this extra copy of column data for memory efficiency purposes. // // Instead we traverse through the whole data and get what we actually need // without copying the data. This way, memory use will be reduced // with a slight cost of CPU utilization. $sum = 0.0; foreach ($data as $row) { $val = [0, 0]; foreach ($row as $index => $col) { if ($index == $i) { $val[0] = $col - $meanX; } if ($index == $k) { $val[1] = $col - $meanY; } } $sum += $val[0] * $val[1]; } } if ($sample) { --$n; } return $sum / $n; } /** * Returns the covariance matrix of n-dimensional data * * @param array|null $means */ public static function covarianceMatrix(array $data, ?array $means = null): array { $n = count($data[0]); if ($means === null) { $means = []; for ($i = 0; $i < $n; ++$i) { $means[] = Mean::arithmetic(array_column($data, $i)); } } $cov = []; for ($i = 0; $i < $n; ++$i) { for ($k = 0; $k < $n; ++$k) { if ($i > $k) { $cov[$i][$k] = $cov[$k][$i]; } else { $cov[$i][$k] = self::fromDataset( $data, $i, $k, true, $means[$i], $means[$k] ); } } } return $cov; } } Math/Statistic/Mean.php 0000644 00000002567 15152065050 0011006 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Statistic; use Phpml\Exception\InvalidArgumentException; class Mean { /** * @throws InvalidArgumentException */ public static function arithmetic(array $numbers): float { self::checkArrayLength($numbers); return array_sum($numbers) / count($numbers); } /** * @return float|mixed * * @throws InvalidArgumentException */ public static function median(array $numbers) { self::checkArrayLength($numbers); $count = count($numbers); $middleIndex = (int) floor($count / 2); sort($numbers, SORT_NUMERIC); $median = $numbers[$middleIndex]; if ($count % 2 === 0) { $median = ($median + $numbers[$middleIndex - 1]) / 2; } return $median; } /** * @return mixed * * @throws InvalidArgumentException */ public static function mode(array $numbers) { self::checkArrayLength($numbers); $values = array_count_values($numbers); return array_search(max($values), $values, true); } /** * @throws InvalidArgumentException */ private static function checkArrayLength(array $array): void { if (count($array) === 0) { throw new InvalidArgumentException('The array has zero elements'); } } } Math/Statistic/Gaussian.php 0000644 00000002117 15152065050 0011667 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Statistic; class Gaussian { /** * @var float */ protected $mean; /** * @var float */ protected $std; public function __construct(float $mean, float $std) { $this->mean = $mean; $this->std = $std; } /** * Returns probability density of the given <i>$value</i> * * @return float|int */ public function pdf(float $value) { // Calculate the probability density by use of normal/Gaussian distribution // Ref: https://en.wikipedia.org/wiki/Normal_distribution $std2 = $this->std ** 2; $mean = $this->mean; return exp(-(($value - $mean) ** 2) / (2 * $std2)) / ((2 * $std2 * M_PI) ** .5); } /** * Returns probability density value of the given <i>$value</i> based on * given standard deviation and the mean */ public static function distributionPdf(float $mean, float $std, float $value): float { $normal = new self($mean, $std); return $normal->pdf($value); } } Math/Statistic/ANOVA.php 0000644 00000010252 15152065050 0010760 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Statistic; use Phpml\Exception\InvalidArgumentException; /** * Analysis of variance * https://en.wikipedia.org/wiki/Analysis_of_variance */ final class ANOVA { /** * The one-way ANOVA tests the null hypothesis that 2 or more groups have * the same population mean. The test is applied to samples from two or * more groups, possibly with differing sizes. * * @param array[] $samples - each row is class samples * * @return float[] */ public static function oneWayF(array $samples): array { $classes = count($samples); if ($classes < 2) { throw new InvalidArgumentException('The array must have at least 2 elements'); } $samplesPerClass = array_map(static function (array $class): int { return count($class); }, $samples); $allSamples = (int) array_sum($samplesPerClass); $ssAllSamples = self::sumOfSquaresPerFeature($samples); $sumSamples = self::sumOfFeaturesPerClass($samples); $squareSumSamples = self::sumOfSquares($sumSamples); $sumSamplesSquare = self::squaresSum($sumSamples); $ssbn = self::calculateSsbn($samples, $sumSamplesSquare, $samplesPerClass, $squareSumSamples, $allSamples); $sswn = self::calculateSswn($ssbn, $ssAllSamples, $squareSumSamples, $allSamples); $dfbn = $classes - 1; $dfwn = $allSamples - $classes; $msb = array_map(static function ($s) use ($dfbn) { return $s / $dfbn; }, $ssbn); $msw = array_map(static function ($s) use ($dfwn) { if ($dfwn === 0) { return 1; } return $s / $dfwn; }, $sswn); $f = []; foreach ($msb as $index => $msbValue) { $f[$index] = $msbValue / $msw[$index]; } return $f; } private static function sumOfSquaresPerFeature(array $samples): array { $sum = array_fill(0, count($samples[0][0]), 0); foreach ($samples as $class) { foreach ($class as $sample) { foreach ($sample as $index => $feature) { $sum[$index] += $feature ** 2; } } } return $sum; } private static function sumOfFeaturesPerClass(array $samples): array { return array_map(static function (array $class): array { $sum = array_fill(0, count($class[0]), 0); foreach ($class as $sample) { foreach ($sample as $index => $feature) { $sum[$index] += $feature; } } return $sum; }, $samples); } private static function sumOfSquares(array $sums): array { $squares = array_fill(0, count($sums[0]), 0); foreach ($sums as $row) { foreach ($row as $index => $sum) { $squares[$index] += $sum; } } return array_map(static function ($sum) { return $sum ** 2; }, $squares); } private static function squaresSum(array $sums): array { foreach ($sums as &$row) { foreach ($row as &$sum) { $sum **= 2; } } return $sums; } private static function calculateSsbn(array $samples, array $sumSamplesSquare, array $samplesPerClass, array $squareSumSamples, int $allSamples): array { $ssbn = array_fill(0, count($samples[0][0]), 0); foreach ($sumSamplesSquare as $classIndex => $class) { foreach ($class as $index => $feature) { $ssbn[$index] += $feature / $samplesPerClass[$classIndex]; } } foreach ($squareSumSamples as $index => $sum) { $ssbn[$index] -= $sum / $allSamples; } return $ssbn; } private static function calculateSswn(array $ssbn, array $ssAllSamples, array $squareSumSamples, int $allSamples): array { $sswn = []; foreach ($ssAllSamples as $index => $ss) { $sswn[$index] = ($ss - $squareSumSamples[$index] / $allSamples) - $ssbn[$index]; } return $sswn; } } Math/Statistic/Variance.php 0000644 00000001453 15152065050 0011647 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Statistic; /** * In probability theory and statistics, variance is the expectation of the squared deviation of a random variable from its mean. * Informally, it measures how far a set of (random) numbers are spread out from their average value * https://en.wikipedia.org/wiki/Variance */ final class Variance { /** * Population variance * Use when all possible observations of the system are present. * If used with a subset of data (sample variance), it will be a biased variance. * * ∑⟮xᵢ - μ⟯² * σ² = ---------- * N */ public static function population(array $population): float { return StandardDeviation::sumOfSquares($population) / count($population); } } Math/Distance/Euclidean.php 0000644 00000000771 15152065050 0011575 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Distance; /** * Class Euclidean * * L^2 Metric. */ class Euclidean extends Distance { /** * Euclidean constructor. */ public function __construct() { parent::__construct(2.0); } /** * Square of Euclidean distance * * @throws \Phpml\Exception\InvalidArgumentException */ public function sqDistance(array $a, array $b): float { return $this->distance($a, $b) ** 2; } } Math/Distance/Minkowski.php 0000644 00000000223 15152065050 0011647 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Distance; /** * Class Minkowski * * L^n Metric. */ class Minkowski extends Distance { } Math/Distance/Manhattan.php 0000644 00000000421 15152065050 0011607 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Distance; /** * Class Manhattan * * L^1 Metric. */ class Manhattan extends Distance { /** * Manhattan constructor. */ public function __construct() { parent::__construct(1.0); } } Math/Distance/Chebyshev.php 0000644 00000000425 15152065050 0011620 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Distance; /** * Class Chebyshev */ class Chebyshev extends Distance { /** * {@inheritdoc} */ public function distance(array $a, array $b): float { return max($this->deltas($a, $b)); } } Math/Distance/Distance.php 0000644 00000002231 15152065050 0011427 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Distance; use Phpml\Exception\InvalidArgumentException; use Phpml\Math\Distance as DistanceInterface; /** * Class Distance */ abstract class Distance implements DistanceInterface { /** * @var float|int */ public $norm; /** * Distance constructor. */ public function __construct(float $norm = 3.0) { $this->norm = $norm; } /** * @throws InvalidArgumentException */ public function distance(array $a, array $b): float { $distance = 0; foreach ($this->deltas($a, $b) as $delta) { $distance += $delta ** $this->norm; } return $distance ** (1 / $this->norm); } /** * @throws InvalidArgumentException */ protected function deltas(array $a, array $b): array { $count = count($a); if ($count !== count($b)) { throw new InvalidArgumentException('Size of given arrays does not match'); } $deltas = []; for ($i = 0; $i < $count; $i++) { $deltas[] = abs($a[$i] - $b[$i]); } return $deltas; } } Math/Kernel/RBF.php 0000644 00000001323 15152065050 0007775 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Math\Kernel; use Phpml\Exception\InvalidArgumentException; use Phpml\Math\Kernel; use Phpml\Math\Product; class RBF implements Kernel { /** * @var float */ private $gamma; public function __construct(float $gamma) { $this->gamma = $gamma; } public function compute($a, $b): float { if (!is_array($a) || !is_array($b)) { throw new InvalidArgumentException(sprintf('Arguments of %s must be arrays', __METHOD__)); } $score = 2 * Product::scalar($a, $b); $squares = Product::scalar($a, $a) + Product::scalar($b, $b); return exp(-$this->gamma * ($squares - $score)); } } Classification/Classifier.php 0000644 00000000203 15152065050 0012266 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification; use Phpml\Estimator; interface Classifier extends Estimator { } Classification/NaiveBayes.php 0000644 00000013150 15152065050 0012235 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification; use Phpml\Exception\InvalidArgumentException; use Phpml\Helper\Predictable; use Phpml\Helper\Trainable; use Phpml\Math\Statistic\Mean; use Phpml\Math\Statistic\StandardDeviation; class NaiveBayes implements Classifier { use Trainable; use Predictable; public const CONTINUOS = 1; public const NOMINAL = 2; public const EPSILON = 1e-10; /** * @var array */ private $std = []; /** * @var array */ private $mean = []; /** * @var array */ private $discreteProb = []; /** * @var array */ private $dataType = []; /** * @var array */ private $p = []; /** * @var int */ private $sampleCount = 0; /** * @var int */ private $featureCount = 0; /** * @var array */ private $labels = []; public function train(array $samples, array $targets): void { $this->samples = array_merge($this->samples, $samples); $this->targets = array_merge($this->targets, $targets); $this->sampleCount = count($this->samples); $this->featureCount = count($this->samples[0]); $this->labels = array_map('strval', array_flip(array_flip($this->targets))); foreach ($this->labels as $label) { $samples = $this->getSamplesByLabel($label); $this->p[$label] = count($samples) / $this->sampleCount; $this->calculateStatistics($label, $samples); } } /** * @return mixed */ protected function predictSample(array $sample) { // Use NaiveBayes assumption for each label using: // P(label|features) = P(label) * P(feature0|label) * P(feature1|label) .... P(featureN|label) // Then compare probability for each class to determine which label is most likely $predictions = []; foreach ($this->labels as $label) { $p = $this->p[$label]; for ($i = 0; $i < $this->featureCount; ++$i) { $Plf = $this->sampleProbability($sample, $i, $label); $p += $Plf; } $predictions[$label] = $p; } arsort($predictions, SORT_NUMERIC); reset($predictions); return key($predictions); } /** * Calculates vital statistics for each label & feature. Stores these * values in private array in order to avoid repeated calculation */ private function calculateStatistics(string $label, array $samples): void { $this->std[$label] = array_fill(0, $this->featureCount, 0); $this->mean[$label] = array_fill(0, $this->featureCount, 0); $this->dataType[$label] = array_fill(0, $this->featureCount, self::CONTINUOS); $this->discreteProb[$label] = array_fill(0, $this->featureCount, self::CONTINUOS); for ($i = 0; $i < $this->featureCount; ++$i) { // Get the values of nth column in the samples array // Mean::arithmetic is called twice, can be optimized $values = array_column($samples, $i); $numValues = count($values); // if the values contain non-numeric data, // then it should be treated as nominal/categorical/discrete column if ($values !== array_filter($values, 'is_numeric')) { $this->dataType[$label][$i] = self::NOMINAL; $this->discreteProb[$label][$i] = array_count_values($values); $db = &$this->discreteProb[$label][$i]; $db = array_map(function ($el) use ($numValues) { return $el / $numValues; }, $db); } else { $this->mean[$label][$i] = Mean::arithmetic($values); // Add epsilon in order to avoid zero stdev $this->std[$label][$i] = 1e-10 + StandardDeviation::population($values, false); } } } /** * Calculates the probability P(label|sample_n) */ private function sampleProbability(array $sample, int $feature, string $label): float { if (!isset($sample[$feature])) { throw new InvalidArgumentException('Missing feature. All samples must have equal number of features'); } $value = $sample[$feature]; if ($this->dataType[$label][$feature] == self::NOMINAL) { if (!isset($this->discreteProb[$label][$feature][$value]) || $this->discreteProb[$label][$feature][$value] == 0) { return self::EPSILON; } return $this->discreteProb[$label][$feature][$value]; } $std = $this->std[$label][$feature]; $mean = $this->mean[$label][$feature]; // Calculate the probability density by use of normal/Gaussian distribution // Ref: https://en.wikipedia.org/wiki/Normal_distribution // // In order to avoid numerical errors because of small or zero values, // some libraries adopt taking log of calculations such as // scikit-learn did. // (See : https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/naive_bayes.py) $pdf = -0.5 * log(2.0 * M_PI * $std * $std); $pdf -= 0.5 * (($value - $mean) ** 2) / ($std * $std); return $pdf; } /** * Return samples belonging to specific label */ private function getSamplesByLabel(string $label): array { $samples = []; for ($i = 0; $i < $this->sampleCount; ++$i) { if ($this->targets[$i] == $label) { $samples[] = $this->samples[$i]; } } return $samples; } } Classification/MLPClassifier.php 0000644 00000002600 15152065050 0012642 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification; use Phpml\Exception\InvalidArgumentException; use Phpml\NeuralNetwork\Network\MultilayerPerceptron; class MLPClassifier extends MultilayerPerceptron implements Classifier { /** * @param mixed $target * * @throws InvalidArgumentException */ public function getTargetClass($target): int { if (!in_array($target, $this->classes, true)) { throw new InvalidArgumentException( sprintf('Target with value "%s" is not part of the accepted classes', $target) ); } return array_search($target, $this->classes, true); } /** * @return mixed */ protected function predictSample(array $sample) { $output = $this->setInput($sample)->getOutput(); $predictedClass = null; $max = 0; foreach ($output as $class => $value) { if ($value > $max) { $predictedClass = $class; $max = $value; } } return $predictedClass; } /** * @param mixed $target */ protected function trainSample(array $sample, $target): void { // Feed-forward. $this->setInput($sample); // Back-propagate. $this->backpropagation->backpropagate($this->getLayers(), $this->getTargetClass($target)); } } Classification/Linear/LogisticRegression.php 0000644 00000021317 15152065050 0015243 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification\Linear; use Closure; use Exception; use Phpml\Exception\InvalidArgumentException; use Phpml\Helper\Optimizer\ConjugateGradient; class LogisticRegression extends Adaline { /** * Batch training: Gradient descent algorithm (default) */ public const BATCH_TRAINING = 1; /** * Online training: Stochastic gradient descent learning */ public const ONLINE_TRAINING = 2; /** * Conjugate Batch: Conjugate Gradient algorithm */ public const CONJUGATE_GRAD_TRAINING = 3; /** * Cost function to optimize: 'log' and 'sse' are supported <br> * - 'log' : log likelihood <br> * - 'sse' : sum of squared errors <br> * * @var string */ protected $costFunction = 'log'; /** * Regularization term: only 'L2' is supported * * @var string */ protected $penalty = 'L2'; /** * Lambda (λ) parameter of regularization term. If λ is set to 0, then * regularization term is cancelled. * * @var float */ protected $lambda = 0.5; /** * Initalize a Logistic Regression classifier with maximum number of iterations * and learning rule to be applied <br> * * Maximum number of iterations can be an integer value greater than 0 <br> * If normalizeInputs is set to true, then every input given to the algorithm will be standardized * by use of standard deviation and mean calculation <br> * * Cost function can be 'log' for log-likelihood and 'sse' for sum of squared errors <br> * * Penalty (Regularization term) can be 'L2' or empty string to cancel penalty term * * @throws InvalidArgumentException */ public function __construct( int $maxIterations = 500, bool $normalizeInputs = true, int $trainingType = self::CONJUGATE_GRAD_TRAINING, string $cost = 'log', string $penalty = 'L2' ) { $trainingTypes = range(self::BATCH_TRAINING, self::CONJUGATE_GRAD_TRAINING); if (!in_array($trainingType, $trainingTypes, true)) { throw new InvalidArgumentException( 'Logistic regression can only be trained with '. 'batch (gradient descent), online (stochastic gradient descent) '. 'or conjugate batch (conjugate gradients) algorithms' ); } if (!in_array($cost, ['log', 'sse'], true)) { throw new InvalidArgumentException( "Logistic regression cost function can be one of the following: \n". "'log' for log-likelihood and 'sse' for sum of squared errors" ); } if ($penalty !== '' && strtoupper($penalty) !== 'L2') { throw new InvalidArgumentException('Logistic regression supports only \'L2\' regularization'); } $this->learningRate = 0.001; parent::__construct($this->learningRate, $maxIterations, $normalizeInputs); $this->trainingType = $trainingType; $this->costFunction = $cost; $this->penalty = $penalty; } /** * Sets the learning rate if gradient descent algorithm is * selected for training */ public function setLearningRate(float $learningRate): void { $this->learningRate = $learningRate; } /** * Lambda (λ) parameter of regularization term. If 0 is given, * then the regularization term is cancelled */ public function setLambda(float $lambda): void { $this->lambda = $lambda; } /** * Adapts the weights with respect to given samples and targets * by use of selected solver * * @throws \Exception */ protected function runTraining(array $samples, array $targets): void { $callback = $this->getCostFunction(); switch ($this->trainingType) { case self::BATCH_TRAINING: $this->runGradientDescent($samples, $targets, $callback, true); return; case self::ONLINE_TRAINING: $this->runGradientDescent($samples, $targets, $callback, false); return; case self::CONJUGATE_GRAD_TRAINING: $this->runConjugateGradient($samples, $targets, $callback); return; default: // Not reached throw new Exception(sprintf('Logistic regression has invalid training type: %d.', $this->trainingType)); } } /** * Executes Conjugate Gradient method to optimize the weights of the LogReg model */ protected function runConjugateGradient(array $samples, array $targets, Closure $gradientFunc): void { if ($this->optimizer === null) { $this->optimizer = (new ConjugateGradient($this->featureCount)) ->setMaxIterations($this->maxIterations); } $this->weights = $this->optimizer->runOptimization($samples, $targets, $gradientFunc); $this->costValues = $this->optimizer->getCostValues(); } /** * Returns the appropriate callback function for the selected cost function * * @throws \Exception */ protected function getCostFunction(): Closure { $penalty = 0; if ($this->penalty === 'L2') { $penalty = $this->lambda; } switch ($this->costFunction) { case 'log': /* * Negative of Log-likelihood cost function to be minimized: * J(x) = ∑( - y . log(h(x)) - (1 - y) . log(1 - h(x))) * * If regularization term is given, then it will be added to the cost: * for L2 : J(x) = J(x) + λ/m . w * * The gradient of the cost function to be used with gradient descent: * ∇J(x) = -(y - h(x)) = (h(x) - y) */ return function ($weights, $sample, $y) use ($penalty): array { $this->weights = $weights; $hX = $this->output($sample); // In cases where $hX = 1 or $hX = 0, the log-likelihood // value will give a NaN, so we fix these values if ($hX == 1) { $hX = 1 - 1e-10; } if ($hX == 0) { $hX = 1e-10; } $y = $y < 0 ? 0 : 1; $error = -$y * log($hX) - (1 - $y) * log(1 - $hX); $gradient = $hX - $y; return [$error, $gradient, $penalty]; }; case 'sse': /* * Sum of squared errors or least squared errors cost function: * J(x) = ∑ (y - h(x))^2 * * If regularization term is given, then it will be added to the cost: * for L2 : J(x) = J(x) + λ/m . w * * The gradient of the cost function: * ∇J(x) = -(h(x) - y) . h(x) . (1 - h(x)) */ return function ($weights, $sample, $y) use ($penalty): array { $this->weights = $weights; $hX = $this->output($sample); $y = $y < 0 ? 0 : 1; $error = (($y - $hX) ** 2); $gradient = -($y - $hX) * $hX * (1 - $hX); return [$error, $gradient, $penalty]; }; default: // Not reached throw new Exception(sprintf('Logistic regression has invalid cost function: %s.', $this->costFunction)); } } /** * Returns the output of the network, a float value between 0.0 and 1.0 */ protected function output(array $sample): float { $sum = parent::output($sample); return 1.0 / (1.0 + exp(-$sum)); } /** * Returns the class value (either -1 or 1) for the given input */ protected function outputClass(array $sample): int { $output = $this->output($sample); if ($output > 0.5) { return 1; } return -1; } /** * Returns the probability of the sample of belonging to the given label. * * The probability is simply taken as the distance of the sample * to the decision plane. * * @param mixed $label */ protected function predictProbability(array $sample, $label): float { $sample = $this->checkNormalizedSample($sample); $probability = $this->output($sample); if (array_search($label, $this->labels, true) > 0) { return $probability; } return 1 - $probability; } } Classification/Linear/DecisionStump.php 0000644 00000023534 15152065050 0014216 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification\Linear; use Phpml\Classification\DecisionTree; use Phpml\Classification\WeightedClassifier; use Phpml\Exception\InvalidArgumentException; use Phpml\Helper\OneVsRest; use Phpml\Helper\Predictable; use Phpml\Math\Comparison; class DecisionStump extends WeightedClassifier { use Predictable; use OneVsRest; public const AUTO_SELECT = -1; /** * @var int */ protected $givenColumnIndex; /** * @var array */ protected $binaryLabels = []; /** * Lowest error rate obtained while training/optimizing the model * * @var float */ protected $trainingErrorRate; /** * @var int */ protected $column; /** * @var mixed */ protected $value; /** * @var string */ protected $operator; /** * @var array */ protected $columnTypes = []; /** * @var int */ protected $featureCount; /** * @var float */ protected $numSplitCount = 100.0; /** * Distribution of samples in the leaves * * @var array */ protected $prob = []; /** * A DecisionStump classifier is a one-level deep DecisionTree. It is generally * used with ensemble algorithms as in the weak classifier role. <br> * * If columnIndex is given, then the stump tries to produce a decision node * on this column, otherwise in cases given the value of -1, the stump itself * decides which column to take for the decision (Default DecisionTree behaviour) */ public function __construct(int $columnIndex = self::AUTO_SELECT) { $this->givenColumnIndex = $columnIndex; } public function __toString(): string { return "IF ${this}->column ${this}->operator ${this}->value ". 'THEN '.$this->binaryLabels[0].' '. 'ELSE '.$this->binaryLabels[1]; } /** * While finding best split point for a numerical valued column, * DecisionStump looks for equally distanced values between minimum and maximum * values in the column. Given <i>$count</i> value determines how many split * points to be probed. The more split counts, the better performance but * worse processing time (Default value is 10.0) */ public function setNumericalSplitCount(float $count): void { $this->numSplitCount = $count; } /** * @throws InvalidArgumentException */ protected function trainBinary(array $samples, array $targets, array $labels): void { $this->binaryLabels = $labels; $this->featureCount = count($samples[0]); // If a column index is given, it should be among the existing columns if ($this->givenColumnIndex > count($samples[0]) - 1) { $this->givenColumnIndex = self::AUTO_SELECT; } // Check the size of the weights given. // If none given, then assign 1 as a weight to each sample if (count($this->weights) === 0) { $this->weights = array_fill(0, count($samples), 1); } else { $numWeights = count($this->weights); if ($numWeights !== count($samples)) { throw new InvalidArgumentException('Number of sample weights does not match with number of samples'); } } // Determine type of each column as either "continuous" or "nominal" $this->columnTypes = DecisionTree::getColumnTypes($samples); // Try to find the best split in the columns of the dataset // by calculating error rate for each split point in each column $columns = range(0, count($samples[0]) - 1); if ($this->givenColumnIndex !== self::AUTO_SELECT) { $columns = [$this->givenColumnIndex]; } $bestSplit = [ 'value' => 0, 'operator' => '', 'prob' => [], 'column' => 0, 'trainingErrorRate' => 1.0, ]; foreach ($columns as $col) { if ($this->columnTypes[$col] == DecisionTree::CONTINUOUS) { $split = $this->getBestNumericalSplit($samples, $targets, $col); } else { $split = $this->getBestNominalSplit($samples, $targets, $col); } if ($split['trainingErrorRate'] < $bestSplit['trainingErrorRate']) { $bestSplit = $split; } } // Assign determined best values to the stump foreach ($bestSplit as $name => $value) { $this->{$name} = $value; } } /** * Determines best split point for the given column */ protected function getBestNumericalSplit(array $samples, array $targets, int $col): array { $values = array_column($samples, $col); // Trying all possible points may be accomplished in two general ways: // 1- Try all values in the $samples array ($values) // 2- Artificially split the range of values into several parts and try them // We choose the second one because it is faster in larger datasets $minValue = min($values); $maxValue = max($values); $stepSize = ($maxValue - $minValue) / $this->numSplitCount; $split = []; foreach (['<=', '>'] as $operator) { // Before trying all possible split points, let's first try // the average value for the cut point $threshold = array_sum($values) / (float) count($values); [$errorRate, $prob] = $this->calculateErrorRate($targets, $threshold, $operator, $values); if (!isset($split['trainingErrorRate']) || $errorRate < $split['trainingErrorRate']) { $split = [ 'value' => $threshold, 'operator' => $operator, 'prob' => $prob, 'column' => $col, 'trainingErrorRate' => $errorRate, ]; } // Try other possible points one by one for ($step = $minValue; $step <= $maxValue; $step += $stepSize) { $threshold = (float) $step; [$errorRate, $prob] = $this->calculateErrorRate($targets, $threshold, $operator, $values); if ($errorRate < $split['trainingErrorRate']) { $split = [ 'value' => $threshold, 'operator' => $operator, 'prob' => $prob, 'column' => $col, 'trainingErrorRate' => $errorRate, ]; } }// for } return $split; } protected function getBestNominalSplit(array $samples, array $targets, int $col): array { $values = array_column($samples, $col); $valueCounts = array_count_values($values); $distinctVals = array_keys($valueCounts); $split = []; foreach (['=', '!='] as $operator) { foreach ($distinctVals as $val) { [$errorRate, $prob] = $this->calculateErrorRate($targets, $val, $operator, $values); if (!isset($split['trainingErrorRate']) || $split['trainingErrorRate'] < $errorRate) { $split = [ 'value' => $val, 'operator' => $operator, 'prob' => $prob, 'column' => $col, 'trainingErrorRate' => $errorRate, ]; } } } return $split; } /** * Calculates the ratio of wrong predictions based on the new threshold * value given as the parameter */ protected function calculateErrorRate(array $targets, float $threshold, string $operator, array $values): array { $wrong = 0.0; $prob = []; $leftLabel = $this->binaryLabels[0]; $rightLabel = $this->binaryLabels[1]; foreach ($values as $index => $value) { if (Comparison::compare($value, $threshold, $operator)) { $predicted = $leftLabel; } else { $predicted = $rightLabel; } $target = $targets[$index]; if ((string) $predicted != (string) $targets[$index]) { $wrong += $this->weights[$index]; } if (!isset($prob[$predicted][$target])) { $prob[$predicted][$target] = 0; } ++$prob[$predicted][$target]; } // Calculate probabilities: Proportion of labels in each leaf $dist = array_combine($this->binaryLabels, array_fill(0, 2, 0.0)); foreach ($prob as $leaf => $counts) { $leafTotal = (float) array_sum($prob[$leaf]); foreach ($counts as $label => $count) { if ((string) $leaf == (string) $label) { $dist[$leaf] = $count / $leafTotal; } } } return [$wrong / (float) array_sum($this->weights), $dist]; } /** * Returns the probability of the sample of belonging to the given label * * Probability of a sample is calculated as the proportion of the label * within the labels of the training samples in the decision node * * @param mixed $label */ protected function predictProbability(array $sample, $label): float { $predicted = $this->predictSampleBinary($sample); if ((string) $predicted == (string) $label) { return $this->prob[$label]; } return 0.0; } /** * @return mixed */ protected function predictSampleBinary(array $sample) { if (Comparison::compare($sample[$this->column], $this->value, $this->operator)) { return $this->binaryLabels[0]; } return $this->binaryLabels[1]; } protected function resetBinary(): void { } } Classification/Linear/Perceptron.php 0000644 00000015620 15152065050 0013546 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification\Linear; use Closure; use Phpml\Classification\Classifier; use Phpml\Exception\InvalidArgumentException; use Phpml\Helper\OneVsRest; use Phpml\Helper\Optimizer\GD; use Phpml\Helper\Optimizer\Optimizer; use Phpml\Helper\Optimizer\StochasticGD; use Phpml\Helper\Predictable; use Phpml\IncrementalEstimator; use Phpml\Preprocessing\Normalizer; class Perceptron implements Classifier, IncrementalEstimator { use Predictable; use OneVsRest; /** * @var Optimizer|GD|StochasticGD|null */ protected $optimizer; /** * @var array */ protected $labels = []; /** * @var int */ protected $featureCount = 0; /** * @var array */ protected $weights = []; /** * @var float */ protected $learningRate; /** * @var int */ protected $maxIterations; /** * @var Normalizer */ protected $normalizer; /** * @var bool */ protected $enableEarlyStop = true; /** * Initalize a perceptron classifier with given learning rate and maximum * number of iterations used while training the perceptron * * @param float $learningRate Value between 0.0(exclusive) and 1.0(inclusive) * @param int $maxIterations Must be at least 1 * * @throws InvalidArgumentException */ public function __construct(float $learningRate = 0.001, int $maxIterations = 1000, bool $normalizeInputs = true) { if ($learningRate <= 0.0 || $learningRate > 1.0) { throw new InvalidArgumentException('Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)'); } if ($maxIterations <= 0) { throw new InvalidArgumentException('Maximum number of iterations must be an integer greater than 0'); } if ($normalizeInputs) { $this->normalizer = new Normalizer(Normalizer::NORM_STD); } $this->learningRate = $learningRate; $this->maxIterations = $maxIterations; } public function partialTrain(array $samples, array $targets, array $labels = []): void { $this->trainByLabel($samples, $targets, $labels); } public function trainBinary(array $samples, array $targets, array $labels): void { if ($this->normalizer !== null) { $this->normalizer->transform($samples); } // Set all target values to either -1 or 1 $this->labels = [ 1 => $labels[0], -1 => $labels[1], ]; foreach ($targets as $key => $target) { $targets[$key] = (string) $target == (string) $this->labels[1] ? 1 : -1; } // Set samples and feature count vars $this->featureCount = count($samples[0]); $this->runTraining($samples, $targets); } /** * Normally enabling early stopping for the optimization procedure may * help saving processing time while in some cases it may result in * premature convergence.<br> * * If "false" is given, the optimization procedure will always be executed * for $maxIterations times * * @return $this */ public function setEarlyStop(bool $enable = true) { $this->enableEarlyStop = $enable; return $this; } /** * Returns the cost values obtained during the training. */ public function getCostValues(): array { return $this->costValues; } protected function resetBinary(): void { $this->labels = []; $this->optimizer = null; $this->featureCount = 0; $this->weights = []; $this->costValues = []; } /** * Trains the perceptron model with Stochastic Gradient Descent optimization * to get the correct set of weights */ protected function runTraining(array $samples, array $targets): void { // The cost function is the sum of squares $callback = function ($weights, $sample, $target): array { $this->weights = $weights; $prediction = $this->outputClass($sample); $gradient = $prediction - $target; $error = $gradient ** 2; return [$error, $gradient]; }; $this->runGradientDescent($samples, $targets, $callback); } /** * Executes a Gradient Descent algorithm for * the given cost function */ protected function runGradientDescent(array $samples, array $targets, Closure $gradientFunc, bool $isBatch = false): void { $class = $isBatch ? GD::class : StochasticGD::class; if ($this->optimizer === null) { $this->optimizer = (new $class($this->featureCount)) ->setLearningRate($this->learningRate) ->setMaxIterations($this->maxIterations) ->setChangeThreshold(1e-6) ->setEarlyStop($this->enableEarlyStop); } $this->weights = $this->optimizer->runOptimization($samples, $targets, $gradientFunc); $this->costValues = $this->optimizer->getCostValues(); } /** * Checks if the sample should be normalized and if so, returns the * normalized sample */ protected function checkNormalizedSample(array $sample): array { if ($this->normalizer !== null) { $samples = [$sample]; $this->normalizer->transform($samples); $sample = $samples[0]; } return $sample; } /** * Calculates net output of the network as a float value for the given input * * @return int|float */ protected function output(array $sample) { $sum = 0; foreach ($this->weights as $index => $w) { if ($index == 0) { $sum += $w; } else { $sum += $w * $sample[$index - 1]; } } return $sum; } /** * Returns the class value (either -1 or 1) for the given input */ protected function outputClass(array $sample): int { return $this->output($sample) > 0 ? 1 : -1; } /** * Returns the probability of the sample of belonging to the given label. * * The probability is simply taken as the distance of the sample * to the decision plane. * * @param mixed $label */ protected function predictProbability(array $sample, $label): float { $predicted = $this->predictSampleBinary($sample); if ((string) $predicted == (string) $label) { $sample = $this->checkNormalizedSample($sample); return (float) abs($this->output($sample)); } return 0.0; } /** * @return mixed */ protected function predictSampleBinary(array $sample) { $sample = $this->checkNormalizedSample($sample); $predictedClass = $this->outputClass($sample); return $this->labels[$predictedClass]; } } Classification/Linear/Adaline.php 0000644 00000004505 15152065050 0012762 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification\Linear; use Phpml\Exception\InvalidArgumentException; class Adaline extends Perceptron { /** * Batch training is the default Adaline training algorithm */ public const BATCH_TRAINING = 1; /** * Online training: Stochastic gradient descent learning */ public const ONLINE_TRAINING = 2; /** * Training type may be either 'Batch' or 'Online' learning * * @var string|int */ protected $trainingType; /** * Initalize an Adaline (ADAptive LInear NEuron) classifier with given learning rate and maximum * number of iterations used while training the classifier <br> * * Learning rate should be a float value between 0.0(exclusive) and 1.0 (inclusive) <br> * Maximum number of iterations can be an integer value greater than 0 <br> * If normalizeInputs is set to true, then every input given to the algorithm will be standardized * by use of standard deviation and mean calculation * * @throws InvalidArgumentException */ public function __construct( float $learningRate = 0.001, int $maxIterations = 1000, bool $normalizeInputs = true, int $trainingType = self::BATCH_TRAINING ) { if (!in_array($trainingType, [self::BATCH_TRAINING, self::ONLINE_TRAINING], true)) { throw new InvalidArgumentException('Adaline can only be trained with batch and online/stochastic gradient descent algorithm'); } $this->trainingType = $trainingType; parent::__construct($learningRate, $maxIterations, $normalizeInputs); } /** * Adapts the weights with respect to given samples and targets * by use of gradient descent learning rule */ protected function runTraining(array $samples, array $targets): void { // The cost function is the sum of squares $callback = function ($weights, $sample, $target): array { $this->weights = $weights; $output = $this->output($sample); $gradient = $output - $target; $error = $gradient ** 2; return [$error, $gradient]; }; $isBatch = $this->trainingType == self::BATCH_TRAINING; parent::runGradientDescent($samples, $targets, $callback, $isBatch); } } Classification/SVC.php 0000644 00000001350 15152065050 0010641 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification; use Phpml\SupportVectorMachine\Kernel; use Phpml\SupportVectorMachine\SupportVectorMachine; use Phpml\SupportVectorMachine\Type; class SVC extends SupportVectorMachine implements Classifier { public function __construct( int $kernel = Kernel::RBF, float $cost = 1.0, int $degree = 3, ?float $gamma = null, float $coef0 = 0.0, float $tolerance = 0.001, int $cacheSize = 100, bool $shrinking = true, bool $probabilityEstimates = false ) { parent::__construct(Type::C_SVC, $kernel, $cost, 0.5, $degree, $gamma, $coef0, 0.1, $tolerance, $cacheSize, $shrinking, $probabilityEstimates); } } Classification/DecisionTree.php 0000644 00000035166 15152065050 0012577 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification; use Phpml\Classification\DecisionTree\DecisionTreeLeaf; use Phpml\Exception\InvalidArgumentException; use Phpml\Helper\Predictable; use Phpml\Helper\Trainable; use Phpml\Math\Statistic\Mean; class DecisionTree implements Classifier { use Trainable; use Predictable; public const CONTINUOUS = 1; public const NOMINAL = 2; /** * @var int */ public $actualDepth = 0; /** * @var array */ protected $columnTypes = []; /** * @var DecisionTreeLeaf */ protected $tree; /** * @var int */ protected $maxDepth; /** * @var array */ private $labels = []; /** * @var int */ private $featureCount = 0; /** * @var int */ private $numUsableFeatures = 0; /** * @var array */ private $selectedFeatures = []; /** * @var array|null */ private $featureImportances; /** * @var array */ private $columnNames = []; public function __construct(int $maxDepth = 10) { $this->maxDepth = $maxDepth; } public function train(array $samples, array $targets): void { $this->samples = array_merge($this->samples, $samples); $this->targets = array_merge($this->targets, $targets); $this->featureCount = count($this->samples[0]); $this->columnTypes = self::getColumnTypes($this->samples); $this->labels = array_keys(array_count_values($this->targets)); $this->tree = $this->getSplitLeaf(range(0, count($this->samples) - 1)); // Each time the tree is trained, feature importances are reset so that // we will have to compute it again depending on the new data $this->featureImportances = null; // If column names are given or computed before, then there is no // need to init it and accidentally remove the previous given names if ($this->columnNames === []) { $this->columnNames = range(0, $this->featureCount - 1); } elseif (count($this->columnNames) > $this->featureCount) { $this->columnNames = array_slice($this->columnNames, 0, $this->featureCount); } elseif (count($this->columnNames) < $this->featureCount) { $this->columnNames = array_merge( $this->columnNames, range(count($this->columnNames), $this->featureCount - 1) ); } } public static function getColumnTypes(array $samples): array { $types = []; $featureCount = count($samples[0]); for ($i = 0; $i < $featureCount; ++$i) { $values = array_column($samples, $i); $isCategorical = self::isCategoricalColumn($values); $types[] = $isCategorical ? self::NOMINAL : self::CONTINUOUS; } return $types; } /** * @param mixed $baseValue */ public function getGiniIndex($baseValue, array $colValues, array $targets): float { $countMatrix = []; foreach ($this->labels as $label) { $countMatrix[$label] = [0, 0]; } foreach ($colValues as $index => $value) { $label = $targets[$index]; $rowIndex = $value === $baseValue ? 0 : 1; ++$countMatrix[$label][$rowIndex]; } $giniParts = [0, 0]; for ($i = 0; $i <= 1; ++$i) { $part = 0; $sum = array_sum(array_column($countMatrix, $i)); if ($sum > 0) { foreach ($this->labels as $label) { $part += ($countMatrix[$label][$i] / (float) $sum) ** 2; } } $giniParts[$i] = (1 - $part) * $sum; } return array_sum($giniParts) / count($colValues); } /** * This method is used to set number of columns to be used * when deciding a split at an internal node of the tree. <br> * If the value is given 0, then all features are used (default behaviour), * otherwise the given value will be used as a maximum for number of columns * randomly selected for each split operation. * * @return $this * * @throws InvalidArgumentException */ public function setNumFeatures(int $numFeatures) { if ($numFeatures < 0) { throw new InvalidArgumentException('Selected column count should be greater or equal to zero'); } $this->numUsableFeatures = $numFeatures; return $this; } /** * A string array to represent columns. Useful when HTML output or * column importances are desired to be inspected. * * @return $this * * @throws InvalidArgumentException */ public function setColumnNames(array $names) { if ($this->featureCount !== 0 && count($names) !== $this->featureCount) { throw new InvalidArgumentException(sprintf('Length of the given array should be equal to feature count %s', $this->featureCount)); } $this->columnNames = $names; return $this; } public function getHtml(): string { return $this->tree->getHTML($this->columnNames); } /** * This will return an array including an importance value for * each column in the given dataset. The importance values are * normalized and their total makes 1.<br/> */ public function getFeatureImportances(): array { if ($this->featureImportances !== null) { return $this->featureImportances; } $sampleCount = count($this->samples); $this->featureImportances = []; foreach ($this->columnNames as $column => $columnName) { $nodes = $this->getSplitNodesByColumn($column, $this->tree); $importance = 0; foreach ($nodes as $node) { $importance += $node->getNodeImpurityDecrease($sampleCount); } $this->featureImportances[$columnName] = $importance; } // Normalize & sort the importances $total = array_sum($this->featureImportances); if ($total > 0) { array_walk($this->featureImportances, function (&$importance) use ($total): void { $importance /= $total; }); arsort($this->featureImportances); } return $this->featureImportances; } protected function getSplitLeaf(array $records, int $depth = 0): DecisionTreeLeaf { $split = $this->getBestSplit($records); $split->level = $depth; if ($this->actualDepth < $depth) { $this->actualDepth = $depth; } // Traverse all records to see if all records belong to the same class, // otherwise group the records so that we can classify the leaf // in case maximum depth is reached $leftRecords = []; $rightRecords = []; $remainingTargets = []; $prevRecord = null; $allSame = true; foreach ($records as $recordNo) { // Check if the previous record is the same with the current one $record = $this->samples[$recordNo]; if ($prevRecord !== null && $prevRecord != $record) { $allSame = false; } $prevRecord = $record; // According to the split criteron, this record will // belong to either left or the right side in the next split if ($split->evaluate($record)) { $leftRecords[] = $recordNo; } else { $rightRecords[] = $recordNo; } // Group remaining targets $target = $this->targets[$recordNo]; if (!array_key_exists($target, $remainingTargets)) { $remainingTargets[$target] = 1; } else { ++$remainingTargets[$target]; } } if ($allSame || $depth >= $this->maxDepth || count($remainingTargets) === 1) { $split->isTerminal = true; arsort($remainingTargets); $split->classValue = (string) key($remainingTargets); } else { if (isset($leftRecords[0])) { $split->leftLeaf = $this->getSplitLeaf($leftRecords, $depth + 1); } if (isset($rightRecords[0])) { $split->rightLeaf = $this->getSplitLeaf($rightRecords, $depth + 1); } } return $split; } protected function getBestSplit(array $records): DecisionTreeLeaf { $targets = array_intersect_key($this->targets, array_flip($records)); $samples = (array) array_combine( $records, $this->preprocess(array_intersect_key($this->samples, array_flip($records))) ); $bestGiniVal = 1; $bestSplit = null; $features = $this->getSelectedFeatures(); foreach ($features as $i) { $colValues = []; foreach ($samples as $index => $row) { $colValues[$index] = $row[$i]; } $counts = array_count_values($colValues); arsort($counts); $baseValue = key($counts); if ($baseValue === null) { continue; } $gini = $this->getGiniIndex($baseValue, $colValues, $targets); if ($bestSplit === null || $bestGiniVal > $gini) { $split = new DecisionTreeLeaf(); $split->value = $baseValue; $split->giniIndex = $gini; $split->columnIndex = $i; $split->isContinuous = $this->columnTypes[$i] === self::CONTINUOUS; $split->records = $records; // If a numeric column is to be selected, then // the original numeric value and the selected operator // will also be saved into the leaf for future access if ($this->columnTypes[$i] === self::CONTINUOUS) { $matches = []; preg_match("/^([<>=]{1,2})\s*(.*)/", (string) $split->value, $matches); $split->operator = $matches[1]; $split->numericValue = (float) $matches[2]; } $bestSplit = $split; $bestGiniVal = $gini; } } return $bestSplit; } /** * Returns available features/columns to the tree for the decision making * process. <br> * * If a number is given with setNumFeatures() method, then a random selection * of features up to this number is returned. <br> * * If some features are manually selected by use of setSelectedFeatures(), * then only these features are returned <br> * * If any of above methods were not called beforehand, then all features * are returned by default. */ protected function getSelectedFeatures(): array { $allFeatures = range(0, $this->featureCount - 1); if ($this->numUsableFeatures === 0 && count($this->selectedFeatures) === 0) { return $allFeatures; } if (count($this->selectedFeatures) > 0) { return $this->selectedFeatures; } $numFeatures = $this->numUsableFeatures; if ($numFeatures > $this->featureCount) { $numFeatures = $this->featureCount; } shuffle($allFeatures); $selectedFeatures = array_slice($allFeatures, 0, $numFeatures); sort($selectedFeatures); return $selectedFeatures; } protected function preprocess(array $samples): array { // Detect and convert continuous data column values into // discrete values by using the median as a threshold value $columns = []; for ($i = 0; $i < $this->featureCount; ++$i) { $values = array_column($samples, $i); if ($this->columnTypes[$i] == self::CONTINUOUS) { $median = Mean::median($values); foreach ($values as &$value) { if ($value <= $median) { $value = "<= ${median}"; } else { $value = "> ${median}"; } } } $columns[] = $values; } // Below method is a strange yet very simple & efficient method // to get the transpose of a 2D array return array_map(null, ...$columns); } protected static function isCategoricalColumn(array $columnValues): bool { $count = count($columnValues); // There are two main indicators that *may* show whether a // column is composed of discrete set of values: // 1- Column may contain string values and non-float values // 2- Number of unique values in the column is only a small fraction of // all values in that column (Lower than or equal to %20 of all values) $numericValues = array_filter($columnValues, 'is_numeric'); $floatValues = array_filter($columnValues, 'is_float'); if (count($floatValues) > 0) { return false; } if (count($numericValues) !== $count) { return true; } $distinctValues = array_count_values($columnValues); return count($distinctValues) <= $count / 5; } /** * Used to set predefined features to consider while deciding which column to use for a split */ protected function setSelectedFeatures(array $selectedFeatures): void { $this->selectedFeatures = $selectedFeatures; } /** * Collects and returns an array of internal nodes that use the given * column as a split criterion */ protected function getSplitNodesByColumn(int $column, DecisionTreeLeaf $node): array { if ($node->isTerminal) { return []; } $nodes = []; if ($node->columnIndex === $column) { $nodes[] = $node; } $lNodes = []; $rNodes = []; if ($node->leftLeaf !== null) { $lNodes = $this->getSplitNodesByColumn($column, $node->leftLeaf); } if ($node->rightLeaf !== null) { $rNodes = $this->getSplitNodesByColumn($column, $node->rightLeaf); } return array_merge($nodes, $lNodes, $rNodes); } /** * @return mixed */ protected function predictSample(array $sample) { $node = $this->tree; do { if ($node->isTerminal) { return $node->classValue; } if ($node->evaluate($sample)) { $node = $node->leftLeaf; } else { $node = $node->rightLeaf; } } while ($node); return $this->labels[0]; } } Classification/Ensemble/Bagging.php 0000644 00000010725 15152065050 0013304 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification\Ensemble; use Phpml\Classification\Classifier; use Phpml\Classification\DecisionTree; use Phpml\Exception\InvalidArgumentException; use Phpml\Helper\Predictable; use Phpml\Helper\Trainable; use ReflectionClass; class Bagging implements Classifier { use Trainable; use Predictable; /** * @var int */ protected $numSamples; /** * @var int */ protected $featureCount = 0; /** * @var int */ protected $numClassifier; /** * @var string */ protected $classifier = DecisionTree::class; /** * @var array */ protected $classifierOptions = ['depth' => 20]; /** * @var array */ protected $classifiers = []; /** * @var float */ protected $subsetRatio = 0.7; /** * Creates an ensemble classifier with given number of base classifiers * Default number of base classifiers is 50. * The more number of base classifiers, the better performance but at the cost of procesing time */ public function __construct(int $numClassifier = 50) { $this->numClassifier = $numClassifier; } /** * This method determines the ratio of samples used to create the 'bootstrap' subset, * e.g., random samples drawn from the original dataset with replacement (allow repeats), * to train each base classifier. * * @return $this * * @throws InvalidArgumentException */ public function setSubsetRatio(float $ratio) { if ($ratio < 0.1 || $ratio > 1.0) { throw new InvalidArgumentException('Subset ratio should be between 0.1 and 1.0'); } $this->subsetRatio = $ratio; return $this; } /** * This method is used to set the base classifier. Default value is * DecisionTree::class, but any class that implements the <i>Classifier</i> * can be used. <br> * While giving the parameters of the classifier, the values should be * given in the order they are in the constructor of the classifier and parameter * names are neglected. * * @return $this */ public function setClassifer(string $classifier, array $classifierOptions = []) { $this->classifier = $classifier; $this->classifierOptions = $classifierOptions; return $this; } public function train(array $samples, array $targets): void { $this->samples = array_merge($this->samples, $samples); $this->targets = array_merge($this->targets, $targets); $this->featureCount = count($samples[0]); $this->numSamples = count($this->samples); // Init classifiers and train them with bootstrap samples $this->classifiers = $this->initClassifiers(); $index = 0; foreach ($this->classifiers as $classifier) { [$samples, $targets] = $this->getRandomSubset($index); $classifier->train($samples, $targets); ++$index; } } protected function getRandomSubset(int $index): array { $samples = []; $targets = []; srand($index); $bootstrapSize = $this->subsetRatio * $this->numSamples; for ($i = 0; $i < $bootstrapSize; ++$i) { $rand = random_int(0, $this->numSamples - 1); $samples[] = $this->samples[$rand]; $targets[] = $this->targets[$rand]; } return [$samples, $targets]; } protected function initClassifiers(): array { $classifiers = []; for ($i = 0; $i < $this->numClassifier; ++$i) { $ref = new ReflectionClass($this->classifier); /** @var Classifier $obj */ $obj = count($this->classifierOptions) === 0 ? $ref->newInstance() : $ref->newInstanceArgs($this->classifierOptions); $classifiers[] = $this->initSingleClassifier($obj); } return $classifiers; } protected function initSingleClassifier(Classifier $classifier): Classifier { return $classifier; } /** * @return mixed */ protected function predictSample(array $sample) { $predictions = []; foreach ($this->classifiers as $classifier) { /** @var Classifier $classifier */ $predictions[] = $classifier->predict($sample); } $counts = array_count_values($predictions); arsort($counts); reset($counts); return key($counts); } } Classification/Ensemble/AdaBoost.php 0000644 00000015435 15152065050 0013445 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification\Ensemble; use Phpml\Classification\Classifier; use Phpml\Classification\Linear\DecisionStump; use Phpml\Classification\WeightedClassifier; use Phpml\Exception\InvalidArgumentException; use Phpml\Helper\Predictable; use Phpml\Helper\Trainable; use Phpml\Math\Statistic\Mean; use Phpml\Math\Statistic\StandardDeviation; use ReflectionClass; class AdaBoost implements Classifier { use Predictable; use Trainable; /** * Actual labels given in the targets array * * @var array */ protected $labels = []; /** * @var int */ protected $sampleCount; /** * @var int */ protected $featureCount; /** * Number of maximum iterations to be done * * @var int */ protected $maxIterations; /** * Sample weights * * @var array */ protected $weights = []; /** * List of selected 'weak' classifiers * * @var array */ protected $classifiers = []; /** * Base classifier weights * * @var array */ protected $alpha = []; /** * @var string */ protected $baseClassifier = DecisionStump::class; /** * @var array */ protected $classifierOptions = []; /** * ADAptive BOOSTing (AdaBoost) is an ensemble algorithm to * improve classification performance of 'weak' classifiers such as * DecisionStump (default base classifier of AdaBoost). */ public function __construct(int $maxIterations = 50) { $this->maxIterations = $maxIterations; } /** * Sets the base classifier that will be used for boosting (default = DecisionStump) */ public function setBaseClassifier(string $baseClassifier = DecisionStump::class, array $classifierOptions = []): void { $this->baseClassifier = $baseClassifier; $this->classifierOptions = $classifierOptions; } /** * @throws InvalidArgumentException */ public function train(array $samples, array $targets): void { // Initialize usual variables $this->labels = array_keys(array_count_values($targets)); if (count($this->labels) !== 2) { throw new InvalidArgumentException('AdaBoost is a binary classifier and can classify between two classes only'); } // Set all target values to either -1 or 1 $this->labels = [ 1 => $this->labels[0], -1 => $this->labels[1], ]; foreach ($targets as $target) { $this->targets[] = $target == $this->labels[1] ? 1 : -1; } $this->samples = array_merge($this->samples, $samples); $this->featureCount = count($samples[0]); $this->sampleCount = count($this->samples); // Initialize AdaBoost parameters $this->weights = array_fill(0, $this->sampleCount, 1.0 / $this->sampleCount); $this->classifiers = []; $this->alpha = []; // Execute the algorithm for a maximum number of iterations $currIter = 0; while ($this->maxIterations > $currIter++) { // Determine the best 'weak' classifier based on current weights $classifier = $this->getBestClassifier(); $errorRate = $this->evaluateClassifier($classifier); // Update alpha & weight values at each iteration $alpha = $this->calculateAlpha($errorRate); $this->updateWeights($classifier, $alpha); $this->classifiers[] = $classifier; $this->alpha[] = $alpha; } } /** * @return mixed */ public function predictSample(array $sample) { $sum = 0; foreach ($this->alpha as $index => $alpha) { $h = $this->classifiers[$index]->predict($sample); $sum += $h * $alpha; } return $this->labels[$sum > 0 ? 1 : -1]; } /** * Returns the classifier with the lowest error rate with the * consideration of current sample weights */ protected function getBestClassifier(): Classifier { $ref = new ReflectionClass($this->baseClassifier); /** @var Classifier $classifier */ $classifier = count($this->classifierOptions) === 0 ? $ref->newInstance() : $ref->newInstanceArgs($this->classifierOptions); if ($classifier instanceof WeightedClassifier) { $classifier->setSampleWeights($this->weights); $classifier->train($this->samples, $this->targets); } else { [$samples, $targets] = $this->resample(); $classifier->train($samples, $targets); } return $classifier; } /** * Resamples the dataset in accordance with the weights and * returns the new dataset */ protected function resample(): array { $weights = $this->weights; $std = StandardDeviation::population($weights); $mean = Mean::arithmetic($weights); $min = min($weights); $minZ = (int) round(($min - $mean) / $std); $samples = []; $targets = []; foreach ($weights as $index => $weight) { $z = (int) round(($weight - $mean) / $std) - $minZ + 1; for ($i = 0; $i < $z; ++$i) { if (random_int(0, 1) == 0) { continue; } $samples[] = $this->samples[$index]; $targets[] = $this->targets[$index]; } } return [$samples, $targets]; } /** * Evaluates the classifier and returns the classification error rate */ protected function evaluateClassifier(Classifier $classifier): float { $total = (float) array_sum($this->weights); $wrong = 0; foreach ($this->samples as $index => $sample) { $predicted = $classifier->predict($sample); if ($predicted != $this->targets[$index]) { $wrong += $this->weights[$index]; } } return $wrong / $total; } /** * Calculates alpha of a classifier */ protected function calculateAlpha(float $errorRate): float { if ($errorRate == 0) { $errorRate = 1e-10; } return 0.5 * log((1 - $errorRate) / $errorRate); } /** * Updates the sample weights */ protected function updateWeights(Classifier $classifier, float $alpha): void { $sumOfWeights = array_sum($this->weights); $weightsT1 = []; foreach ($this->weights as $index => $weight) { $desired = $this->targets[$index]; $output = $classifier->predict($this->samples[$index]); $weight *= exp(-$alpha * $desired * $output) / $sumOfWeights; $weightsT1[] = $weight; } $this->weights = $weightsT1; } } Classification/Ensemble/RandomForest.php 0000644 00000011422 15152065050 0014344 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification\Ensemble; use Phpml\Classification\Classifier; use Phpml\Classification\DecisionTree; use Phpml\Exception\InvalidArgumentException; class RandomForest extends Bagging { /** * @var float|string */ protected $featureSubsetRatio = 'log'; /** * @var array|null */ protected $columnNames; /** * Initializes RandomForest with the given number of trees. More trees * may increase the prediction performance while it will also substantially * increase the processing time and the required memory */ public function __construct(int $numClassifier = 50) { parent::__construct($numClassifier); $this->setSubsetRatio(1.0); } /** * This method is used to determine how many of the original columns (features) * will be used to construct subsets to train base classifiers.<br> * * Allowed values: 'sqrt', 'log' or any float number between 0.1 and 1.0 <br> * * Default value for the ratio is 'log' which results in log(numFeatures, 2) + 1 * features to be taken into consideration while selecting subspace of features * * @param mixed $ratio */ public function setFeatureSubsetRatio($ratio): self { if (!is_string($ratio) && !is_float($ratio)) { throw new InvalidArgumentException('Feature subset ratio must be a string or a float'); } if (is_float($ratio) && ($ratio < 0.1 || $ratio > 1.0)) { throw new InvalidArgumentException('When a float is given, feature subset ratio should be between 0.1 and 1.0'); } if (is_string($ratio) && $ratio !== 'sqrt' && $ratio !== 'log') { throw new InvalidArgumentException("When a string is given, feature subset ratio can only be 'sqrt' or 'log'"); } $this->featureSubsetRatio = $ratio; return $this; } /** * RandomForest algorithm is usable *only* with DecisionTree * * @return $this */ public function setClassifer(string $classifier, array $classifierOptions = []) { if ($classifier !== DecisionTree::class) { throw new InvalidArgumentException('RandomForest can only use DecisionTree as base classifier'); } parent::setClassifer($classifier, $classifierOptions); return $this; } /** * This will return an array including an importance value for * each column in the given dataset. Importance values for a column * is the average importance of that column in all trees in the forest */ public function getFeatureImportances(): array { // Traverse each tree and sum importance of the columns $sum = []; foreach ($this->classifiers as $tree) { /** @var DecisionTree $tree */ $importances = $tree->getFeatureImportances(); foreach ($importances as $column => $importance) { if (array_key_exists($column, $sum)) { $sum[$column] += $importance; } else { $sum[$column] = $importance; } } } // Normalize & sort the importance values $total = array_sum($sum); array_walk($sum, function (&$importance) use ($total): void { $importance /= $total; }); arsort($sum); return $sum; } /** * A string array to represent the columns is given. They are useful * when trying to print some information about the trees such as feature importances * * @return $this */ public function setColumnNames(array $names) { $this->columnNames = $names; return $this; } /** * @return DecisionTree */ protected function initSingleClassifier(Classifier $classifier): Classifier { if (!$classifier instanceof DecisionTree) { throw new InvalidArgumentException( sprintf('Classifier %s expected, got %s', DecisionTree::class, get_class($classifier)) ); } if (is_float($this->featureSubsetRatio)) { $featureCount = (int) ($this->featureSubsetRatio * $this->featureCount); } elseif ($this->featureSubsetRatio === 'sqrt') { $featureCount = (int) ($this->featureCount ** .5) + 1; } else { $featureCount = (int) log($this->featureCount, 2) + 1; } if ($featureCount >= $this->featureCount) { $featureCount = $this->featureCount; } if ($this->columnNames === null) { $this->columnNames = range(0, $this->featureCount - 1); } return $classifier ->setColumnNames($this->columnNames) ->setNumFeatures($featureCount); } } Classification/DecisionTree/DecisionTreeLeaf.php 0000644 00000007264 15152065050 0015742 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification\DecisionTree; use Phpml\Math\Comparison; class DecisionTreeLeaf { /** * @var string|int */ public $value; /** * @var float */ public $numericValue; /** * @var string */ public $operator; /** * @var int */ public $columnIndex; /** * @var DecisionTreeLeaf|null */ public $leftLeaf; /** * @var DecisionTreeLeaf|null */ public $rightLeaf; /** * @var array */ public $records = []; /** * Class value represented by the leaf, this value is non-empty * only for terminal leaves * * @var string */ public $classValue = ''; /** * @var bool */ public $isTerminal = false; /** * @var bool */ public $isContinuous = false; /** * @var float */ public $giniIndex = 0; /** * @var int */ public $level = 0; /** * HTML representation of the tree without column names */ public function __toString(): string { return $this->getHTML(); } public function evaluate(array $record): bool { $recordField = $record[$this->columnIndex]; if ($this->isContinuous) { return Comparison::compare((string) $recordField, $this->numericValue, $this->operator); } return $recordField == $this->value; } /** * Returns Mean Decrease Impurity (MDI) in the node. * For terminal nodes, this value is equal to 0 */ public function getNodeImpurityDecrease(int $parentRecordCount): float { if ($this->isTerminal) { return 0.0; } $nodeSampleCount = (float) count($this->records); $iT = $this->giniIndex; if ($this->leftLeaf !== null) { $pL = count($this->leftLeaf->records) / $nodeSampleCount; $iT -= $pL * $this->leftLeaf->giniIndex; } if ($this->rightLeaf !== null) { $pR = count($this->rightLeaf->records) / $nodeSampleCount; $iT -= $pR * $this->rightLeaf->giniIndex; } return $iT * $nodeSampleCount / $parentRecordCount; } /** * Returns HTML representation of the node including children nodes */ public function getHTML(?array $columnNames = null): string { if ($this->isTerminal) { $value = "<b>${this}->classValue</b>"; } else { $value = $this->value; if ($columnNames !== null) { $col = $columnNames[$this->columnIndex]; } else { $col = "col_$this->columnIndex"; } if ((bool) preg_match('/^[<>=]{1,2}/', (string) $value) === false) { $value = "=${value}"; } $value = "<b>${col} ${value}</b><br>Gini: ".number_format($this->giniIndex, 2); } $str = "<table ><tr><td colspan=3 align=center style='border:1px solid;'>${value}</td></tr>"; if ($this->leftLeaf !== null || $this->rightLeaf !== null) { $str .= '<tr>'; if ($this->leftLeaf !== null) { $str .= '<td valign=top><b>| Yes</b><br>'.$this->leftLeaf->getHTML($columnNames).'</td>'; } else { $str .= '<td></td>'; } $str .= '<td> </td>'; if ($this->rightLeaf !== null) { $str .= '<td valign=top align=right><b>No |</b><br>'.$this->rightLeaf->getHTML($columnNames).'</td>'; } else { $str .= '<td></td>'; } $str .= '</tr>'; } $str .= '</table>'; return $str; } } Classification/KNearestNeighbors.php 0000644 00000003270 15152065050 0013566 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification; use Phpml\Helper\Predictable; use Phpml\Helper\Trainable; use Phpml\Math\Distance; use Phpml\Math\Distance\Euclidean; class KNearestNeighbors implements Classifier { use Trainable; use Predictable; /** * @var int */ private $k; /** * @var Distance */ private $distanceMetric; /** * @param Distance|null $distanceMetric (if null then Euclidean distance as default) */ public function __construct(int $k = 3, ?Distance $distanceMetric = null) { if ($distanceMetric === null) { $distanceMetric = new Euclidean(); } $this->k = $k; $this->samples = []; $this->targets = []; $this->distanceMetric = $distanceMetric; } /** * @return mixed */ protected function predictSample(array $sample) { $distances = $this->kNeighborsDistances($sample); $predictions = (array) array_combine(array_values($this->targets), array_fill(0, count($this->targets), 0)); foreach (array_keys($distances) as $index) { ++$predictions[$this->targets[$index]]; } arsort($predictions); reset($predictions); return key($predictions); } /** * @throws \Phpml\Exception\InvalidArgumentException */ private function kNeighborsDistances(array $sample): array { $distances = []; foreach ($this->samples as $index => $neighbor) { $distances[$index] = $this->distanceMetric->distance($sample, $neighbor); } asort($distances); return array_slice($distances, 0, $this->k, true); } } Classification/WeightedClassifier.php 0000644 00000000561 15152065050 0013756 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Classification; abstract class WeightedClassifier implements Classifier { /** * @var array */ protected $weights = []; /** * Sets the array including a weight for each sample */ public function setSampleWeights(array $weights): void { $this->weights = $weights; } } Exception/MatrixException.php 0000644 00000000171 15152065050 0012334 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Exception; use Exception; class MatrixException extends Exception { } Exception/InvalidOperationException.php 0000644 00000000203 15152065050 0014333 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Exception; use Exception; class InvalidOperationException extends Exception { } Exception/DatasetException.php 0000644 00000000172 15152065050 0012456 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Exception; use Exception; class DatasetException extends Exception { } Exception/InvalidArgumentException.php 0000644 00000000202 15152065050 0014154 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Exception; use Exception; class InvalidArgumentException extends Exception { } Exception/FileException.php 0000644 00000000167 15152065050 0011754 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Exception; use Exception; class FileException extends Exception { } Exception/LibsvmCommandException.php 0000644 00000000200 15152065050 0013614 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Exception; use Exception; class LibsvmCommandException extends Exception { } Exception/NormalizerException.php 0000644 00000000175 15152065050 0013216 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Exception; use Exception; class NormalizerException extends Exception { } Exception/SerializeException.php 0000644 00000000174 15152065050 0013022 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Exception; use Exception; class SerializeException extends Exception { } Metric/Accuracy.php 0000644 00000001402 15152065050 0010226 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Metric; use Phpml\Exception\InvalidArgumentException; class Accuracy { /** * @return float|int * * @throws InvalidArgumentException */ public static function score(array $actualLabels, array $predictedLabels, bool $normalize = true) { if (count($actualLabels) != count($predictedLabels)) { throw new InvalidArgumentException('Size of given arrays does not match'); } $score = 0; foreach ($actualLabels as $index => $label) { if ($label == $predictedLabels[$index]) { ++$score; } } if ($normalize) { $score /= count($actualLabels); } return $score; } } Metric/ConfusionMatrix.php 0000644 00000002464 15152065050 0011635 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Metric; class ConfusionMatrix { public static function compute(array $actualLabels, array $predictedLabels, array $labels = []): array { $labels = count($labels) === 0 ? self::getUniqueLabels($actualLabels) : array_flip($labels); $matrix = self::generateMatrixWithZeros($labels); foreach ($actualLabels as $index => $actual) { $predicted = $predictedLabels[$index]; if (!isset($labels[$actual], $labels[$predicted])) { continue; } if ($predicted === $actual) { $row = $column = $labels[$actual]; } else { $row = $labels[$actual]; $column = $labels[$predicted]; } ++$matrix[$row][$column]; } return $matrix; } private static function generateMatrixWithZeros(array $labels): array { $count = count($labels); $matrix = []; for ($i = 0; $i < $count; ++$i) { $matrix[$i] = array_fill(0, $count, 0); } return $matrix; } private static function getUniqueLabels(array $labels): array { $labels = array_values(array_unique($labels)); sort($labels); return array_flip($labels); } } Metric/ClassificationReport.php 0000644 00000013415 15152065050 0012632 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Metric; use Phpml\Exception\InvalidArgumentException; class ClassificationReport { public const MICRO_AVERAGE = 1; public const MACRO_AVERAGE = 2; public const WEIGHTED_AVERAGE = 3; /** * @var array */ private $truePositive = []; /** * @var array */ private $falsePositive = []; /** * @var array */ private $falseNegative = []; /** * @var array */ private $support = []; /** * @var array */ private $precision = []; /** * @var array */ private $recall = []; /** * @var array */ private $f1score = []; /** * @var array */ private $average = []; public function __construct(array $actualLabels, array $predictedLabels, int $average = self::MACRO_AVERAGE) { $averagingMethods = range(self::MICRO_AVERAGE, self::WEIGHTED_AVERAGE); if (!in_array($average, $averagingMethods, true)) { throw new InvalidArgumentException('Averaging method must be MICRO_AVERAGE, MACRO_AVERAGE or WEIGHTED_AVERAGE'); } $this->aggregateClassificationResults($actualLabels, $predictedLabels); $this->computeMetrics(); $this->computeAverage($average); } public function getPrecision(): array { return $this->precision; } public function getRecall(): array { return $this->recall; } public function getF1score(): array { return $this->f1score; } public function getSupport(): array { return $this->support; } public function getAverage(): array { return $this->average; } private function aggregateClassificationResults(array $actualLabels, array $predictedLabels): void { $truePositive = $falsePositive = $falseNegative = $support = self::getLabelIndexedArray($actualLabels, $predictedLabels); foreach ($actualLabels as $index => $actual) { $predicted = $predictedLabels[$index]; ++$support[$actual]; if ($actual === $predicted) { ++$truePositive[$actual]; } else { ++$falsePositive[$predicted]; ++$falseNegative[$actual]; } } $this->truePositive = $truePositive; $this->falsePositive = $falsePositive; $this->falseNegative = $falseNegative; $this->support = $support; } private function computeMetrics(): void { foreach ($this->truePositive as $label => $tp) { $this->precision[$label] = $this->computePrecision($tp, $this->falsePositive[$label]); $this->recall[$label] = $this->computeRecall($tp, $this->falseNegative[$label]); $this->f1score[$label] = $this->computeF1Score((float) $this->precision[$label], (float) $this->recall[$label]); } } private function computeAverage(int $average): void { switch ($average) { case self::MICRO_AVERAGE: $this->computeMicroAverage(); return; case self::MACRO_AVERAGE: $this->computeMacroAverage(); return; case self::WEIGHTED_AVERAGE: $this->computeWeightedAverage(); return; } } private function computeMicroAverage(): void { $truePositive = (int) array_sum($this->truePositive); $falsePositive = (int) array_sum($this->falsePositive); $falseNegative = (int) array_sum($this->falseNegative); $precision = $this->computePrecision($truePositive, $falsePositive); $recall = $this->computeRecall($truePositive, $falseNegative); $f1score = $this->computeF1Score($precision, $recall); $this->average = compact('precision', 'recall', 'f1score'); } private function computeMacroAverage(): void { foreach (['precision', 'recall', 'f1score'] as $metric) { $values = $this->{$metric}; if (count($values) == 0) { $this->average[$metric] = 0.0; continue; } $this->average[$metric] = array_sum($values) / count($values); } } private function computeWeightedAverage(): void { foreach (['precision', 'recall', 'f1score'] as $metric) { $values = $this->{$metric}; if (count($values) == 0) { $this->average[$metric] = 0.0; continue; } $sum = 0; foreach ($values as $i => $value) { $sum += $value * $this->support[$i]; } $this->average[$metric] = $sum / array_sum($this->support); } } private function computePrecision(int $truePositive, int $falsePositive): float { $divider = $truePositive + $falsePositive; if ($divider == 0) { return 0.0; } return $truePositive / $divider; } private function computeRecall(int $truePositive, int $falseNegative): float { $divider = $truePositive + $falseNegative; if ($divider == 0) { return 0.0; } return $truePositive / $divider; } private function computeF1Score(float $precision, float $recall): float { $divider = $precision + $recall; if ($divider == 0) { return 0.0; } return 2.0 * (($precision * $recall) / $divider); } private static function getLabelIndexedArray(array $actualLabels, array $predictedLabels): array { $labels = array_values(array_unique(array_merge($actualLabels, $predictedLabels))); sort($labels); return (array) array_combine($labels, array_fill(0, count($labels), 0)); } } Metric/Regression.php 0000644 00000004600 15152065050 0010617 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Metric; use Phpml\Exception\InvalidArgumentException; use Phpml\Math\Statistic\Correlation; use Phpml\Math\Statistic\Mean; final class Regression { public static function meanSquaredError(array $targets, array $predictions): float { self::assertCountEquals($targets, $predictions); $errors = []; foreach ($targets as $index => $target) { $errors[] = (($target - $predictions[$index]) ** 2); } return Mean::arithmetic($errors); } public static function meanSquaredLogarithmicError(array $targets, array $predictions): float { self::assertCountEquals($targets, $predictions); $errors = []; foreach ($targets as $index => $target) { $errors[] = log((1 + $target) / (1 + $predictions[$index])) ** 2; } return Mean::arithmetic($errors); } public static function meanAbsoluteError(array $targets, array $predictions): float { self::assertCountEquals($targets, $predictions); $errors = []; foreach ($targets as $index => $target) { $errors[] = abs($target - $predictions[$index]); } return Mean::arithmetic($errors); } public static function medianAbsoluteError(array $targets, array $predictions): float { self::assertCountEquals($targets, $predictions); $errors = []; foreach ($targets as $index => $target) { $errors[] = abs($target - $predictions[$index]); } return (float) Mean::median($errors); } public static function r2Score(array $targets, array $predictions): float { self::assertCountEquals($targets, $predictions); return Correlation::pearson($targets, $predictions) ** 2; } public static function maxError(array $targets, array $predictions): float { self::assertCountEquals($targets, $predictions); $errors = []; foreach ($targets as $index => $target) { $errors[] = abs($target - $predictions[$index]); } return (float) max($errors); } private static function assertCountEquals(array &$targets, array &$predictions): void { if (count($targets) !== count($predictions)) { throw new InvalidArgumentException('Targets count must be equal with predictions count'); } } } ModelManager.php 0000644 00000002467 15152065050 0007620 0 ustar 00 <?php declare(strict_types=1); namespace Phpml; use Phpml\Exception\FileException; use Phpml\Exception\SerializeException; class ModelManager { public function saveToFile(Estimator $estimator, string $filepath): void { if (!is_writable(dirname($filepath))) { throw new FileException(sprintf('File "%s" cannot be saved.', basename($filepath))); } $serialized = serialize($estimator); if (!isset($serialized[0])) { throw new SerializeException(sprintf('Class "%s" cannot be serialized.', gettype($estimator))); } $result = file_put_contents($filepath, $serialized, LOCK_EX); if ($result === false) { throw new FileException(sprintf('File "%s" cannot be saved.', basename($filepath))); } } public function restoreFromFile(string $filepath): Estimator { if (!file_exists($filepath) || !is_readable($filepath)) { throw new FileException(sprintf('File "%s" cannot be opened.', basename($filepath))); } $object = unserialize((string) file_get_contents($filepath)); if ($object === false || !$object instanceof Estimator) { throw new SerializeException(sprintf('"%s" cannot be unserialized.', basename($filepath))); } return $object; } } Tokenization/Tokenizer.php 0000644 00000000213 15152065050 0011700 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Tokenization; interface Tokenizer { public function tokenize(string $text): array; } Tokenization/WhitespaceTokenizer.php 0000644 00000000716 15152065050 0013725 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Tokenization; use Phpml\Exception\InvalidArgumentException; class WhitespaceTokenizer implements Tokenizer { public function tokenize(string $text): array { $substrings = preg_split('/[\pZ\pC]+/u', $text, -1, PREG_SPLIT_NO_EMPTY); if ($substrings === false) { throw new InvalidArgumentException('preg_split failed on: '.$text); } return $substrings; } } Tokenization/NGramWordTokenizer.php 0000644 00000002644 15152065050 0013473 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Tokenization; use Phpml\Exception\InvalidArgumentException; class NGramWordTokenizer extends WordTokenizer { /** * @var int */ private $minGram; /** * @var int */ private $maxGram; public function __construct(int $minGram = 1, int $maxGram = 2) { if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) { throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram)); } $this->minGram = $minGram; $this->maxGram = $maxGram; } /** * {@inheritdoc} */ public function tokenize(string $text): array { preg_match_all('/\w\w+/u', $text, $words); $words = $words[0]; $nGrams = []; for ($j = $this->minGram; $j <= $this->maxGram; $j++) { $nGrams = array_merge($nGrams, $this->getNgrams($words, $j)); } return $nGrams; } private function getNgrams(array $match, int $n = 2): array { $ngrams = []; $len = count($match); for ($i = 0; $i < $len; $i++) { if ($i > ($n - 2)) { $ng = ''; for ($j = $n - 1; $j >= 0; $j--) { $ng .= ' '.$match[$i - $j]; } $ngrams[] = trim($ng); } } return $ngrams; } } Tokenization/WordTokenizer.php 0000644 00000000421 15152065050 0012535 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Tokenization; class WordTokenizer implements Tokenizer { public function tokenize(string $text): array { $tokens = []; preg_match_all('/\w\w+/u', $text, $tokens); return $tokens[0]; } } Tokenization/NGramTokenizer.php 0000644 00000002454 15152065050 0012636 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Tokenization; use Phpml\Exception\InvalidArgumentException; class NGramTokenizer extends WordTokenizer { /** * @var int */ private $minGram; /** * @var int */ private $maxGram; public function __construct(int $minGram = 1, int $maxGram = 2) { if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) { throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram)); } $this->minGram = $minGram; $this->maxGram = $maxGram; } /** * {@inheritdoc} */ public function tokenize(string $text): array { $words = []; preg_match_all('/\w\w+/u', $text, $words); $nGrams = []; foreach ($words[0] as $word) { $this->generateNGrams($word, $nGrams); } return $nGrams; } private function generateNGrams(string $word, array &$nGrams): void { $length = mb_strlen($word); for ($j = 1; $j <= $this->maxGram; $j++) { for ($k = 0; $k < $length - $j + 1; $k++) { if ($j >= $this->minGram) { $nGrams[] = mb_substr($word, $k, $j); } } } } } Estimator.php 0000644 00000000337 15152065050 0007226 0 ustar 00 <?php declare(strict_types=1); namespace Phpml; interface Estimator { public function train(array $samples, array $targets): void; /** * @return mixed */ public function predict(array $samples); } NeuralNetwork/ActivationFunction.php 0000644 00000000526 15152065050 0013666 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork; interface ActivationFunction { /** * @param float|int $value */ public function compute($value): float; /** * @param float|int $value * @param float|int $computedvalue */ public function differentiate($value, $computedvalue): float; } NeuralNetwork/Node.php 0000644 00000000174 15152065050 0010743 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork; interface Node { public function getOutput(): float; } NeuralNetwork/Network.php 0000644 00000000532 15152065050 0011505 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork; interface Network { /** * @param mixed $input */ public function setInput($input): self; public function getOutput(): array; public function addLayer(Layer $layer): void; /** * @return Layer[] */ public function getLayers(): array; } NeuralNetwork/ActivationFunction/PReLU.php 0000644 00000001301 15152065050 0014605 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\ActivationFunction; use Phpml\NeuralNetwork\ActivationFunction; class PReLU implements ActivationFunction { /** * @var float */ private $beta; public function __construct(float $beta = 0.01) { $this->beta = $beta; } /** * @param float|int $value */ public function compute($value): float { return $value >= 0 ? $value : $this->beta * $value; } /** * @param float|int $value * @param float|int $computedvalue */ public function differentiate($value, $computedvalue): float { return $computedvalue >= 0 ? 1.0 : $this->beta; } } NeuralNetwork/ActivationFunction/Sigmoid.php 0000644 00000001271 15152065050 0015257 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\ActivationFunction; use Phpml\NeuralNetwork\ActivationFunction; class Sigmoid implements ActivationFunction { /** * @var float */ private $beta; public function __construct(float $beta = 1.0) { $this->beta = $beta; } /** * @param float|int $value */ public function compute($value): float { return 1 / (1 + exp(-$this->beta * $value)); } /** * @param float|int $value * @param float|int $computedvalue */ public function differentiate($value, $computedvalue): float { return $computedvalue * (1 - $computedvalue); } } NeuralNetwork/ActivationFunction/Gaussian.php 0000644 00000001022 15152065050 0015430 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\ActivationFunction; use Phpml\NeuralNetwork\ActivationFunction; class Gaussian implements ActivationFunction { /** * @param float|int $value */ public function compute($value): float { return exp(- $value ** 2); } /** * @param float|int $value * @param float|int $calculatedvalue */ public function differentiate($value, $calculatedvalue): float { return -2 * $value * $calculatedvalue; } } NeuralNetwork/ActivationFunction/HyperbolicTangent.php 0000644 00000001253 15152065050 0017305 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\ActivationFunction; use Phpml\NeuralNetwork\ActivationFunction; class HyperbolicTangent implements ActivationFunction { /** * @var float */ private $beta; public function __construct(float $beta = 1.0) { $this->beta = $beta; } /** * @param float|int $value */ public function compute($value): float { return tanh($this->beta * $value); } /** * @param float|int $value * @param float|int $computedvalue */ public function differentiate($value, $computedvalue): float { return 1 - $computedvalue ** 2; } } NeuralNetwork/ActivationFunction/ThresholdedReLU.php 0000644 00000001320 15152065050 0016654 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\ActivationFunction; use Phpml\NeuralNetwork\ActivationFunction; class ThresholdedReLU implements ActivationFunction { /** * @var float */ private $theta; public function __construct(float $theta = 0.0) { $this->theta = $theta; } /** * @param float|int $value */ public function compute($value): float { return $value > $this->theta ? $value : 0.0; } /** * @param float|int $value * @param float|int $calculatedvalue */ public function differentiate($value, $calculatedvalue): float { return $calculatedvalue >= $this->theta ? 1.0 : 0.0; } } NeuralNetwork/ActivationFunction/BinaryStep.php 0000644 00000001107 15152065050 0015742 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\ActivationFunction; use Phpml\NeuralNetwork\ActivationFunction; class BinaryStep implements ActivationFunction { /** * @param float|int $value */ public function compute($value): float { return $value >= 0 ? 1.0 : 0.0; } /** * @param float|int $value * @param float|int $computedvalue */ public function differentiate($value, $computedvalue): float { if ($value === 0 || $value === 0.0) { return 1; } return 0; } } NeuralNetwork/Layer.php 0000644 00000002304 15152065050 0011127 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork; use Phpml\Exception\InvalidArgumentException; use Phpml\NeuralNetwork\Node\Neuron; class Layer { /** * @var Node[] */ private $nodes = []; /** * @throws InvalidArgumentException */ public function __construct(int $nodesNumber = 0, string $nodeClass = Neuron::class, ?ActivationFunction $activationFunction = null) { if (!in_array(Node::class, class_implements($nodeClass), true)) { throw new InvalidArgumentException('Layer node class must implement Node interface'); } for ($i = 0; $i < $nodesNumber; ++$i) { $this->nodes[] = $this->createNode($nodeClass, $activationFunction); } } public function addNode(Node $node): void { $this->nodes[] = $node; } /** * @return Node[] */ public function getNodes(): array { return $this->nodes; } private function createNode(string $nodeClass, ?ActivationFunction $activationFunction = null): Node { if ($nodeClass === Neuron::class) { return new Neuron($activationFunction); } return new $nodeClass(); } } NeuralNetwork/Training/Backpropagation.php 0000644 00000004611 15152065050 0014735 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\Training; use Phpml\NeuralNetwork\Node\Neuron; use Phpml\NeuralNetwork\Training\Backpropagation\Sigma; class Backpropagation { /** * @var float */ private $learningRate; /** * @var array */ private $sigmas = []; /** * @var array */ private $prevSigmas = []; public function __construct(float $learningRate) { $this->setLearningRate($learningRate); } public function setLearningRate(float $learningRate): void { $this->learningRate = $learningRate; } public function getLearningRate(): float { return $this->learningRate; } /** * @param mixed $targetClass */ public function backpropagate(array $layers, $targetClass): void { $layersNumber = count($layers); // Backpropagation. for ($i = $layersNumber; $i > 1; --$i) { $this->sigmas = []; foreach ($layers[$i - 1]->getNodes() as $key => $neuron) { if ($neuron instanceof Neuron) { $sigma = $this->getSigma($neuron, $targetClass, $key, $i == $layersNumber); foreach ($neuron->getSynapses() as $synapse) { $synapse->changeWeight($this->learningRate * $sigma * $synapse->getNode()->getOutput()); } } } $this->prevSigmas = $this->sigmas; } // Clean some memory (also it helps make MLP persistency & children more maintainable). $this->sigmas = []; $this->prevSigmas = []; } private function getSigma(Neuron $neuron, int $targetClass, int $key, bool $lastLayer): float { $neuronOutput = $neuron->getOutput(); $sigma = $neuron->getDerivative(); if ($lastLayer) { $value = 0; if ($targetClass === $key) { $value = 1; } $sigma *= ($value - $neuronOutput); } else { $sigma *= $this->getPrevSigma($neuron); } $this->sigmas[] = new Sigma($neuron, $sigma); return $sigma; } private function getPrevSigma(Neuron $neuron): float { $sigma = 0.0; foreach ($this->prevSigmas as $neuronSigma) { $sigma += $neuronSigma->getSigmaForNeuron($neuron); } return $sigma; } } NeuralNetwork/Training/Backpropagation/Sigma.php 0000644 00000001562 15152065050 0015777 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\Training\Backpropagation; use Phpml\NeuralNetwork\Node\Neuron; class Sigma { /** * @var Neuron */ private $neuron; /** * @var float */ private $sigma; public function __construct(Neuron $neuron, float $sigma) { $this->neuron = $neuron; $this->sigma = $sigma; } public function getNeuron(): Neuron { return $this->neuron; } public function getSigma(): float { return $this->sigma; } public function getSigmaForNeuron(Neuron $neuron): float { $sigma = 0.0; foreach ($this->neuron->getSynapses() as $synapse) { if ($synapse->getNode() == $neuron) { $sigma += $synapse->getWeight() * $this->getSigma(); } } return $sigma; } } NeuralNetwork/Network/LayeredNetwork.php 0000644 00000003001 15152065050 0014436 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\Network; use Phpml\NeuralNetwork\Layer; use Phpml\NeuralNetwork\Network; use Phpml\NeuralNetwork\Node\Input; use Phpml\NeuralNetwork\Node\Neuron; abstract class LayeredNetwork implements Network { /** * @var Layer[] */ protected $layers = []; public function addLayer(Layer $layer): void { $this->layers[] = $layer; } /** * @return Layer[] */ public function getLayers(): array { return $this->layers; } public function removeLayers(): void { unset($this->layers); } public function getOutputLayer(): Layer { return $this->layers[count($this->layers) - 1]; } public function getOutput(): array { $result = []; foreach ($this->getOutputLayer()->getNodes() as $neuron) { $result[] = $neuron->getOutput(); } return $result; } /** * @param mixed $input */ public function setInput($input): Network { $firstLayer = $this->layers[0]; foreach ($firstLayer->getNodes() as $key => $neuron) { if ($neuron instanceof Input) { $neuron->setInput($input[$key]); } } foreach ($this->getLayers() as $layer) { foreach ($layer->getNodes() as $node) { if ($node instanceof Neuron) { $node->reset(); } } } return $this; } } NeuralNetwork/Network/MultilayerPerceptron.php 0000644 00000014523 15152065050 0015703 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\Network; use Phpml\Estimator; use Phpml\Exception\InvalidArgumentException; use Phpml\Helper\Predictable; use Phpml\IncrementalEstimator; use Phpml\NeuralNetwork\ActivationFunction; use Phpml\NeuralNetwork\ActivationFunction\Sigmoid; use Phpml\NeuralNetwork\Layer; use Phpml\NeuralNetwork\Node\Bias; use Phpml\NeuralNetwork\Node\Input; use Phpml\NeuralNetwork\Node\Neuron; use Phpml\NeuralNetwork\Node\Neuron\Synapse; use Phpml\NeuralNetwork\Training\Backpropagation; abstract class MultilayerPerceptron extends LayeredNetwork implements Estimator, IncrementalEstimator { use Predictable; /** * @var array */ protected $classes = []; /** * @var ActivationFunction|null */ protected $activationFunction; /** * @var Backpropagation */ protected $backpropagation; /** * @var int */ private $inputLayerFeatures; /** * @var array */ private $hiddenLayers = []; /** * @var float */ private $learningRate; /** * @var int */ private $iterations; /** * @throws InvalidArgumentException */ public function __construct( int $inputLayerFeatures, array $hiddenLayers, array $classes, int $iterations = 10000, ?ActivationFunction $activationFunction = null, float $learningRate = 1. ) { if (count($hiddenLayers) === 0) { throw new InvalidArgumentException('Provide at least 1 hidden layer'); } if (count($classes) < 2) { throw new InvalidArgumentException('Provide at least 2 different classes'); } if (count($classes) !== count(array_unique($classes))) { throw new InvalidArgumentException('Classes must be unique'); } $this->classes = array_values($classes); $this->iterations = $iterations; $this->inputLayerFeatures = $inputLayerFeatures; $this->hiddenLayers = $hiddenLayers; $this->activationFunction = $activationFunction; $this->learningRate = $learningRate; $this->initNetwork(); } public function train(array $samples, array $targets): void { $this->reset(); $this->initNetwork(); $this->partialTrain($samples, $targets, $this->classes); } /** * @throws InvalidArgumentException */ public function partialTrain(array $samples, array $targets, array $classes = []): void { if (count($classes) > 0 && array_values($classes) !== $this->classes) { // We require the list of classes in the constructor. throw new InvalidArgumentException( 'The provided classes don\'t match the classes provided in the constructor' ); } for ($i = 0; $i < $this->iterations; ++$i) { $this->trainSamples($samples, $targets); } } public function setLearningRate(float $learningRate): void { $this->learningRate = $learningRate; $this->backpropagation->setLearningRate($this->learningRate); } public function getOutput(): array { $result = []; foreach ($this->getOutputLayer()->getNodes() as $i => $neuron) { $result[$this->classes[$i]] = $neuron->getOutput(); } return $result; } public function getLearningRate(): float { return $this->learningRate; } public function getBackpropagation(): Backpropagation { return $this->backpropagation; } /** * @param mixed $target */ abstract protected function trainSample(array $sample, $target): void; /** * @return mixed */ abstract protected function predictSample(array $sample); protected function reset(): void { $this->removeLayers(); } private function initNetwork(): void { $this->addInputLayer($this->inputLayerFeatures); $this->addNeuronLayers($this->hiddenLayers, $this->activationFunction); // Sigmoid function for the output layer as we want a value from 0 to 1. $sigmoid = new Sigmoid(); $this->addNeuronLayers([count($this->classes)], $sigmoid); $this->addBiasNodes(); $this->generateSynapses(); $this->backpropagation = new Backpropagation($this->learningRate); } private function addInputLayer(int $nodes): void { $this->addLayer(new Layer($nodes, Input::class)); } private function addNeuronLayers(array $layers, ?ActivationFunction $defaultActivationFunction = null): void { foreach ($layers as $layer) { if (is_array($layer)) { $function = $layer[1] instanceof ActivationFunction ? $layer[1] : $defaultActivationFunction; $this->addLayer(new Layer($layer[0], Neuron::class, $function)); } elseif ($layer instanceof Layer) { $this->addLayer($layer); } else { $this->addLayer(new Layer($layer, Neuron::class, $defaultActivationFunction)); } } } private function generateSynapses(): void { $layersNumber = count($this->layers) - 1; for ($i = 0; $i < $layersNumber; ++$i) { $currentLayer = $this->layers[$i]; $nextLayer = $this->layers[$i + 1]; $this->generateLayerSynapses($nextLayer, $currentLayer); } } private function addBiasNodes(): void { $biasLayers = count($this->layers) - 1; for ($i = 0; $i < $biasLayers; ++$i) { $this->layers[$i]->addNode(new Bias()); } } private function generateLayerSynapses(Layer $nextLayer, Layer $currentLayer): void { foreach ($nextLayer->getNodes() as $nextNeuron) { if ($nextNeuron instanceof Neuron) { $this->generateNeuronSynapses($currentLayer, $nextNeuron); } } } private function generateNeuronSynapses(Layer $currentLayer, Neuron $nextNeuron): void { foreach ($currentLayer->getNodes() as $currentNeuron) { $nextNeuron->addSynapse(new Synapse($currentNeuron)); } } private function trainSamples(array $samples, array $targets): void { foreach ($targets as $key => $target) { $this->trainSample($samples[$key], $target); } } } NeuralNetwork/Node/Input.php 0000644 00000000711 15152065050 0012037 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\Node; use Phpml\NeuralNetwork\Node; class Input implements Node { /** * @var float */ private $input; public function __construct(float $input = 0.0) { $this->input = $input; } public function getOutput(): float { return $this->input; } public function setInput(float $input): void { $this->input = $input; } } NeuralNetwork/Node/Neuron/Synapse.php 0000644 00000001707 15152065050 0013636 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\Node\Neuron; use Phpml\NeuralNetwork\Node; class Synapse { /** * @var float */ protected $weight; /** * @var Node */ protected $node; /** * @param float|null $weight */ public function __construct(Node $node, ?float $weight = null) { $this->node = $node; $this->weight = $weight ?? $this->generateRandomWeight(); } public function getOutput(): float { return $this->weight * $this->node->getOutput(); } public function changeWeight(float $delta): void { $this->weight += $delta; } public function getWeight(): float { return $this->weight; } public function getNode(): Node { return $this->node; } protected function generateRandomWeight(): float { return (1 / random_int(5, 25) * random_int(0, 1)) > 0 ? -1 : 1; } } NeuralNetwork/Node/Neuron.php 0000644 00000002777 15152065050 0012224 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\Node; use Phpml\NeuralNetwork\ActivationFunction; use Phpml\NeuralNetwork\ActivationFunction\Sigmoid; use Phpml\NeuralNetwork\Node; use Phpml\NeuralNetwork\Node\Neuron\Synapse; class Neuron implements Node { /** * @var Synapse[] */ protected $synapses = []; /** * @var ActivationFunction */ protected $activationFunction; /** * @var float */ protected $output = 0.0; /** * @var float */ protected $z = 0.0; public function __construct(?ActivationFunction $activationFunction = null) { $this->activationFunction = $activationFunction ?? new Sigmoid(); } public function addSynapse(Synapse $synapse): void { $this->synapses[] = $synapse; } /** * @return Synapse[] */ public function getSynapses(): array { return $this->synapses; } public function getOutput(): float { if ($this->output === 0.0) { $this->z = 0; foreach ($this->synapses as $synapse) { $this->z += $synapse->getOutput(); } $this->output = $this->activationFunction->compute($this->z); } return $this->output; } public function getDerivative(): float { return $this->activationFunction->differentiate($this->z, $this->output); } public function reset(): void { $this->output = 0.0; $this->z = 0.0; } } NeuralNetwork/Node/Bias.php 0000644 00000000313 15152065050 0011614 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\NeuralNetwork\Node; use Phpml\NeuralNetwork\Node; class Bias implements Node { public function getOutput(): float { return 1.0; } } Preprocessing/NumberConverter.php 0000644 00000002225 15152065050 0013220 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Preprocessing; final class NumberConverter implements Preprocessor { /** * @var bool */ private $transformTargets; /** * @var mixed */ private $nonNumericPlaceholder; /** * @param mixed $nonNumericPlaceholder */ public function __construct(bool $transformTargets = false, $nonNumericPlaceholder = null) { $this->transformTargets = $transformTargets; $this->nonNumericPlaceholder = $nonNumericPlaceholder; } public function fit(array $samples, ?array $targets = null): void { //nothing to do } public function transform(array &$samples, ?array &$targets = null): void { foreach ($samples as &$sample) { foreach ($sample as &$feature) { $feature = is_numeric($feature) ? (float) $feature : $this->nonNumericPlaceholder; } } if ($this->transformTargets && is_array($targets)) { foreach ($targets as &$target) { $target = is_numeric($target) ? (float) $target : $this->nonNumericPlaceholder; } } } } Preprocessing/Preprocessor.php 0000644 00000000210 15152065050 0012556 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Preprocessing; use Phpml\Transformer; interface Preprocessor extends Transformer { } Preprocessing/LambdaTransformer.php 0000644 00000001115 15152065050 0013500 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Preprocessing; final class LambdaTransformer implements Preprocessor { /** * @var callable */ private $lambda; public function __construct(callable $lambda) { $this->lambda = $lambda; } public function fit(array $samples, ?array $targets = null): void { // nothing to do } public function transform(array &$samples, ?array &$targets = null): void { foreach ($samples as &$sample) { $sample = call_user_func($this->lambda, $sample); } } } Preprocessing/Normalizer.php 0000644 00000006001 15152065050 0012216 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Preprocessing; use Phpml\Exception\NormalizerException; use Phpml\Math\Statistic\Mean; use Phpml\Math\Statistic\StandardDeviation; class Normalizer implements Preprocessor { public const NORM_L1 = 1; public const NORM_L2 = 2; public const NORM_STD = 3; /** * @var int */ private $norm; /** * @var bool */ private $fitted = false; /** * @var array */ private $std = []; /** * @var array */ private $mean = []; /** * @throws NormalizerException */ public function __construct(int $norm = self::NORM_L2) { if (!in_array($norm, [self::NORM_L1, self::NORM_L2, self::NORM_STD], true)) { throw new NormalizerException('Unknown norm supplied.'); } $this->norm = $norm; } public function fit(array $samples, ?array $targets = null): void { if ($this->fitted) { return; } if ($this->norm === self::NORM_STD) { $features = range(0, count($samples[0]) - 1); foreach ($features as $i) { $values = array_column($samples, $i); $this->std[$i] = StandardDeviation::population($values); $this->mean[$i] = Mean::arithmetic($values); } } $this->fitted = true; } public function transform(array &$samples, ?array &$targets = null): void { $methods = [ self::NORM_L1 => 'normalizeL1', self::NORM_L2 => 'normalizeL2', self::NORM_STD => 'normalizeSTD', ]; $method = $methods[$this->norm]; $this->fit($samples); foreach ($samples as &$sample) { $this->{$method}($sample); } } private function normalizeL1(array &$sample): void { $norm1 = 0; foreach ($sample as $feature) { $norm1 += abs($feature); } if ($norm1 == 0) { $count = count($sample); $sample = array_fill(0, $count, 1.0 / $count); } else { array_walk($sample, function (&$feature) use ($norm1): void { $feature /= $norm1; }); } } private function normalizeL2(array &$sample): void { $norm2 = 0; foreach ($sample as $feature) { $norm2 += $feature * $feature; } $norm2 **= .5; if ($norm2 == 0) { $sample = array_fill(0, count($sample), 1); } else { array_walk($sample, function (&$feature) use ($norm2): void { $feature /= $norm2; }); } } private function normalizeSTD(array &$sample): void { foreach (array_keys($sample) as $i) { if ($this->std[$i] != 0) { $sample[$i] = ($sample[$i] - $this->mean[$i]) / $this->std[$i]; } else { // Same value for all samples. $sample[$i] = 0; } } } } Preprocessing/LabelEncoder.php 0000644 00000002011 15152065050 0012410 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Preprocessing; final class LabelEncoder implements Preprocessor { /** * @var int[] */ private $classes = []; public function fit(array $samples, ?array $targets = null): void { $this->classes = []; foreach ($samples as $sample) { if (!isset($this->classes[(string) $sample])) { $this->classes[(string) $sample] = count($this->classes); } } } public function transform(array &$samples, ?array &$targets = null): void { foreach ($samples as &$sample) { $sample = $this->classes[(string) $sample]; } } public function inverseTransform(array &$samples): void { $classes = array_flip($this->classes); foreach ($samples as &$sample) { $sample = $classes[$sample]; } } /** * @return string[] */ public function classes(): array { return array_keys($this->classes); } } Preprocessing/OneHotEncoder.php 0000644 00000003410 15152065050 0012571 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Preprocessing; use Phpml\Exception\InvalidArgumentException; final class OneHotEncoder implements Preprocessor { /** * @var bool */ private $ignoreUnknown; /** * @var array */ private $categories = []; public function __construct(bool $ignoreUnknown = false) { $this->ignoreUnknown = $ignoreUnknown; } public function fit(array $samples, ?array $targets = null): void { foreach (array_keys(array_values(current($samples))) as $column) { $this->fitColumn($column, array_values(array_unique(array_column($samples, $column)))); } } public function transform(array &$samples, ?array &$targets = null): void { foreach ($samples as &$sample) { $sample = $this->transformSample(array_values($sample)); } } private function fitColumn(int $column, array $values): void { $count = count($values); foreach ($values as $index => $value) { $map = array_fill(0, $count, 0); $map[$index] = 1; $this->categories[$column][$value] = $map; } } private function transformSample(array $sample): array { $encoded = []; foreach ($sample as $column => $feature) { if (!isset($this->categories[$column][$feature]) && !$this->ignoreUnknown) { throw new InvalidArgumentException(sprintf('Missing category "%s" for column %s in trained encoder', $feature, $column)); } $encoded = array_merge( $encoded, $this->categories[$column][$feature] ?? array_fill(0, count($this->categories[$column]), 0) ); } return $encoded; } } Preprocessing/Imputer.php 0000644 00000003705 15152065050 0011531 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Preprocessing; use Phpml\Exception\InvalidOperationException; use Phpml\Preprocessing\Imputer\Strategy; class Imputer implements Preprocessor { public const AXIS_COLUMN = 0; public const AXIS_ROW = 1; /** * @var mixed */ private $missingValue; /** * @var Strategy */ private $strategy; /** * @var int */ private $axis; /** * @var mixed[] */ private $samples = []; /** * @param mixed $missingValue */ public function __construct($missingValue, Strategy $strategy, int $axis = self::AXIS_COLUMN, array $samples = []) { $this->missingValue = $missingValue; $this->strategy = $strategy; $this->axis = $axis; $this->samples = $samples; } public function fit(array $samples, ?array $targets = null): void { $this->samples = $samples; } public function transform(array &$samples, ?array &$targets = null): void { if ($this->samples === []) { throw new InvalidOperationException('Missing training samples for Imputer.'); } foreach ($samples as &$sample) { $this->preprocessSample($sample); } } private function preprocessSample(array &$sample): void { foreach ($sample as $column => &$value) { if ($value === $this->missingValue) { $value = $this->strategy->replaceValue($this->getAxis($column, $sample)); } } } private function getAxis(int $column, array $currentSample): array { if ($this->axis === self::AXIS_ROW) { return array_diff($currentSample, [$this->missingValue]); } $axis = []; foreach ($this->samples as $sample) { if ($sample[$column] !== $this->missingValue) { $axis[] = $sample[$column]; } } return $axis; } } Preprocessing/ColumnFilter.php 0000644 00000002011 15152065050 0012474 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Preprocessing; final class ColumnFilter implements Preprocessor { /** * @var string[] */ private $datasetColumns = []; /** * @var string[] */ private $filterColumns = []; public function __construct(array $datasetColumns, array $filterColumns) { $this->datasetColumns = array_map(static function (string $column): string { return $column; }, $datasetColumns); $this->filterColumns = array_map(static function (string $column): string { return $column; }, $filterColumns); } public function fit(array $samples, ?array $targets = null): void { //nothing to do } public function transform(array &$samples, ?array &$targets = null): void { $keys = array_intersect($this->datasetColumns, $this->filterColumns); foreach ($samples as &$sample) { $sample = array_values(array_intersect_key($sample, $keys)); } } } Preprocessing/Imputer/Strategy/MeanStrategy.php 0000644 00000000476 15152065050 0015740 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Preprocessing\Imputer\Strategy; use Phpml\Math\Statistic\Mean; use Phpml\Preprocessing\Imputer\Strategy; class MeanStrategy implements Strategy { public function replaceValue(array $currentAxis): float { return Mean::arithmetic($currentAxis); } } Preprocessing/Imputer/Strategy/MedianStrategy.php 0000644 00000000474 15152065050 0016253 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Preprocessing\Imputer\Strategy; use Phpml\Math\Statistic\Mean; use Phpml\Preprocessing\Imputer\Strategy; class MedianStrategy implements Strategy { public function replaceValue(array $currentAxis): float { return Mean::median($currentAxis); } } Preprocessing/Imputer/Strategy/MostFrequentStrategy.php 0000644 00000000544 15152065050 0017510 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Preprocessing\Imputer\Strategy; use Phpml\Math\Statistic\Mean; use Phpml\Preprocessing\Imputer\Strategy; class MostFrequentStrategy implements Strategy { /** * @return float|mixed */ public function replaceValue(array $currentAxis) { return Mean::mode($currentAxis); } } Preprocessing/Imputer/Strategy.php 0000644 00000000273 15152065050 0013330 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Preprocessing\Imputer; interface Strategy { /** * @return mixed */ public function replaceValue(array $currentAxis); } CrossValidation/Split.php 0000644 00000002612 15152065050 0011454 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\CrossValidation; use Phpml\Dataset\Dataset; use Phpml\Exception\InvalidArgumentException; abstract class Split { /** * @var array */ protected $trainSamples = []; /** * @var array */ protected $testSamples = []; /** * @var array */ protected $trainLabels = []; /** * @var array */ protected $testLabels = []; public function __construct(Dataset $dataset, float $testSize = 0.3, ?int $seed = null) { if ($testSize <= 0 || $testSize >= 1) { throw new InvalidArgumentException('testsize must be between 0.0 and 1.0'); } $this->seedGenerator($seed); $this->splitDataset($dataset, $testSize); } public function getTrainSamples(): array { return $this->trainSamples; } public function getTestSamples(): array { return $this->testSamples; } public function getTrainLabels(): array { return $this->trainLabels; } public function getTestLabels(): array { return $this->testLabels; } abstract protected function splitDataset(Dataset $dataset, float $testSize): void; protected function seedGenerator(?int $seed = null): void { if ($seed === null) { mt_srand(); } else { mt_srand($seed); } } } CrossValidation/StratifiedRandomSplit.php 0000644 00000002412 15152065050 0014632 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\CrossValidation; use Phpml\Dataset\ArrayDataset; use Phpml\Dataset\Dataset; class StratifiedRandomSplit extends RandomSplit { protected function splitDataset(Dataset $dataset, float $testSize): void { $datasets = $this->splitByTarget($dataset); foreach ($datasets as $targetSet) { parent::splitDataset($targetSet, $testSize); } } /** * @return Dataset[] */ private function splitByTarget(Dataset $dataset): array { $targets = $dataset->getTargets(); $samples = $dataset->getSamples(); $uniqueTargets = array_unique($targets); /** @var array $split */ $split = array_combine($uniqueTargets, array_fill(0, count($uniqueTargets), [])); foreach ($samples as $key => $sample) { $split[$targets[$key]][] = $sample; } return $this->createDatasets($uniqueTargets, $split); } private function createDatasets(array $uniqueTargets, array $split): array { $datasets = []; foreach ($uniqueTargets as $target) { $datasets[$target] = new ArrayDataset($split[$target], array_fill(0, count($split[$target]), $target)); } return $datasets; } } CrossValidation/RandomSplit.php 0000644 00000001334 15152065050 0012615 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\CrossValidation; use Phpml\Dataset\Dataset; class RandomSplit extends Split { protected function splitDataset(Dataset $dataset, float $testSize): void { $samples = $dataset->getSamples(); $labels = $dataset->getTargets(); $datasetSize = count($samples); $testCount = count($this->testSamples); for ($i = $datasetSize; $i > 0; --$i) { $key = mt_rand(0, $datasetSize - 1); $setName = (count($this->testSamples) - $testCount) / $datasetSize >= $testSize ? 'train' : 'test'; $this->{$setName.'Samples'}[] = $samples[$key]; $this->{$setName.'Labels'}[] = $labels[$key]; } } } Association/Associator.php 0000644 00000000200 15152065050 0011627 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Association; use Phpml\Estimator; interface Associator extends Estimator { } Association/Apriori.php 0000644 00000020346 15152065050 0011142 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Association; use Phpml\Helper\Predictable; use Phpml\Helper\Trainable; class Apriori implements Associator { use Trainable; use Predictable; public const ARRAY_KEY_ANTECEDENT = 'antecedent'; public const ARRAY_KEY_CONFIDENCE = 'confidence'; public const ARRAY_KEY_CONSEQUENT = 'consequent'; public const ARRAY_KEY_SUPPORT = 'support'; /** * Minimum relative probability of frequent transactions. * * @var float */ private $confidence; /** * The large set contains frequent k-length item sets. * * @var mixed[][][] */ private $large = []; /** * Minimum relative frequency of transactions. * * @var float */ private $support; /** * The generated Apriori association rules. * * @var mixed[][] */ private $rules = []; /** * Apriori constructor. */ public function __construct(float $support = 0.0, float $confidence = 0.0) { $this->support = $support; $this->confidence = $confidence; } /** * Get all association rules which are generated for every k-length frequent item set. * * @return mixed[][] */ public function getRules(): array { if (count($this->large) === 0) { $this->large = $this->apriori(); } if (count($this->rules) > 0) { return $this->rules; } $this->rules = []; $this->generateAllRules(); return $this->rules; } /** * Generates frequent item sets. * * @return mixed[][][] */ public function apriori(): array { $L = []; $items = $this->frequent($this->items()); for ($k = 1; isset($items[0]); ++$k) { $L[$k] = $items; $items = $this->frequent($this->candidates($items)); } return $L; } /** * @param mixed[] $sample * * @return mixed[][] */ protected function predictSample(array $sample): array { $predicts = array_values(array_filter($this->getRules(), function ($rule) use ($sample): bool { return $this->equals($rule[self::ARRAY_KEY_ANTECEDENT], $sample); })); return array_map(static function ($rule) { return $rule[self::ARRAY_KEY_CONSEQUENT]; }, $predicts); } /** * Generate rules for each k-length frequent item set. */ private function generateAllRules(): void { for ($k = 2; isset($this->large[$k]); ++$k) { foreach ($this->large[$k] as $frequent) { $this->generateRules($frequent); } } } /** * Generate confident rules for frequent item set. * * @param mixed[] $frequent */ private function generateRules(array $frequent): void { foreach ($this->antecedents($frequent) as $antecedent) { $confidence = $this->confidence($frequent, $antecedent); if ($this->confidence <= $confidence) { $consequent = array_values(array_diff($frequent, $antecedent)); $this->rules[] = [ self::ARRAY_KEY_ANTECEDENT => $antecedent, self::ARRAY_KEY_CONSEQUENT => $consequent, self::ARRAY_KEY_SUPPORT => $this->support($frequent), self::ARRAY_KEY_CONFIDENCE => $confidence, ]; } } } /** * Generates the power set for given item set $sample. * * @param mixed[] $sample * * @return mixed[][] */ private function powerSet(array $sample): array { $results = [[]]; foreach ($sample as $item) { foreach ($results as $combination) { $results[] = array_merge([$item], $combination); } } return $results; } /** * Generates all proper subsets for given set $sample without the empty set. * * @param mixed[] $sample * * @return mixed[][] */ private function antecedents(array $sample): array { $cardinality = count($sample); $antecedents = $this->powerSet($sample); return array_filter($antecedents, static function ($antecedent) use ($cardinality): bool { return (count($antecedent) != $cardinality) && ($antecedent != []); }); } /** * Calculates frequent k = 1 item sets. * * @return mixed[][] */ private function items(): array { $items = []; foreach ($this->samples as $sample) { foreach ($sample as $item) { if (!in_array($item, $items, true)) { $items[] = $item; } } } return array_map(static function ($entry): array { return [$entry]; }, $items); } /** * Returns frequent item sets only. * * @param mixed[][] $samples * * @return mixed[][] */ private function frequent(array $samples): array { return array_values(array_filter($samples, function ($entry): bool { return $this->support($entry) >= $this->support; })); } /** * Calculates frequent k item sets, where count($samples) == $k - 1. * * @param mixed[][] $samples * * @return mixed[][] */ private function candidates(array $samples): array { $candidates = []; foreach ($samples as $p) { foreach ($samples as $q) { if (count(array_merge(array_diff($p, $q), array_diff($q, $p))) != 2) { continue; } $candidate = array_values(array_unique(array_merge($p, $q))); if ($this->contains($candidates, $candidate)) { continue; } foreach ($this->samples as $sample) { if ($this->subset($sample, $candidate)) { $candidates[] = $candidate; continue 2; } } } } return $candidates; } /** * Calculates confidence for $set. Confidence is the relative amount of sets containing $subset which also contain * $set. * * @param mixed[] $set * @param mixed[] $subset */ private function confidence(array $set, array $subset): float { return $this->support($set) / $this->support($subset); } /** * Calculates support for item set $sample. Support is the relative amount of sets containing $sample in the data * pool. * * @see \Phpml\Association\Apriori::samples * * @param mixed[] $sample */ private function support(array $sample): float { return $this->frequency($sample) / count($this->samples); } /** * Counts occurrences of $sample as subset in data pool. * * @see \Phpml\Association\Apriori::samples * * @param mixed[] $sample */ private function frequency(array $sample): int { return count(array_filter($this->samples, function ($entry) use ($sample): bool { return $this->subset($entry, $sample); })); } /** * Returns true if set is an element of system. * * @see \Phpml\Association\Apriori::equals() * * @param mixed[][] $system * @param mixed[] $set */ private function contains(array $system, array $set): bool { return (bool) array_filter($system, function ($entry) use ($set): bool { return $this->equals($entry, $set); }); } /** * Returns true if subset is a (proper) subset of set by its items string representation. * * @param mixed[] $set * @param mixed[] $subset */ private function subset(array $set, array $subset): bool { return count(array_diff($subset, array_intersect($subset, $set))) === 0; } /** * Returns true if string representation of items does not differ. * * @param mixed[] $set1 * @param mixed[] $set2 */ private function equals(array $set1, array $set2): bool { return array_diff($set1, $set2) == array_diff($set2, $set1); } } FeatureUnion.php 0000644 00000003761 15152065050 0007667 0 ustar 00 <?php declare(strict_types=1); namespace Phpml; use Phpml\Exception\InvalidArgumentException; final class FeatureUnion implements Transformer { /** * @var Pipeline[] */ private $pipelines = []; /** * @var Pipeline[] */ public function __construct(array $pipelines) { if ($pipelines === []) { throw new InvalidArgumentException('At least one pipeline is required'); } $this->pipelines = array_map(static function (Pipeline $pipeline): Pipeline { return $pipeline; }, $pipelines); } public function fit(array $samples, ?array $targets = null): void { $originSamples = $samples; foreach ($this->pipelines as $pipeline) { foreach ($pipeline->getTransformers() as $transformer) { $transformer->fit($samples, $targets); $transformer->transform($samples, $targets); } $samples = $originSamples; } } public function transform(array &$samples, ?array &$targets = null): void { $this->transformSamples($samples, $targets); } public function fitAndTransform(array &$samples, ?array &$targets = null): void { $this->transformSamples($samples, $targets, true); } private function transformSamples(array &$samples, ?array &$targets = null, bool $fit = false): void { $union = []; $originSamples = $samples; foreach ($this->pipelines as $pipeline) { foreach ($pipeline->getTransformers() as $transformer) { if ($fit) { $transformer->fit($samples, $targets); } $transformer->transform($samples, $targets); } foreach ($samples as $index => $sample) { $union[$index] = array_merge($union[$index] ?? [], is_array($sample) ? $sample : [$sample]); } $samples = $originSamples; } $samples = $union; } } Regression/LeastSquares.php 0000644 00000004210 15152065050 0012005 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Regression; use Phpml\Helper\Predictable; use Phpml\Math\Matrix; class LeastSquares implements Regression { use Predictable; /** * @var array */ private $samples = []; /** * @var array */ private $targets = []; /** * @var float */ private $intercept; /** * @var array */ private $coefficients = []; public function train(array $samples, array $targets): void { $this->samples = array_merge($this->samples, $samples); $this->targets = array_merge($this->targets, $targets); $this->computeCoefficients(); } /** * @return mixed */ public function predictSample(array $sample) { $result = $this->intercept; foreach ($this->coefficients as $index => $coefficient) { $result += $coefficient * $sample[$index]; } return $result; } public function getCoefficients(): array { return $this->coefficients; } public function getIntercept(): float { return $this->intercept; } /** * coefficient(b) = (X'X)-1X'Y. */ private function computeCoefficients(): void { $samplesMatrix = $this->getSamplesMatrix(); $targetsMatrix = $this->getTargetsMatrix(); $ts = $samplesMatrix->transpose()->multiply($samplesMatrix)->inverse(); $tf = $samplesMatrix->transpose()->multiply($targetsMatrix); $this->coefficients = $ts->multiply($tf)->getColumnValues(0); $this->intercept = array_shift($this->coefficients); } /** * Add one dimension for intercept calculation. */ private function getSamplesMatrix(): Matrix { $samples = []; foreach ($this->samples as $sample) { array_unshift($sample, 1); $samples[] = $sample; } return new Matrix($samples); } private function getTargetsMatrix(): Matrix { if (is_array($this->targets[0])) { return new Matrix($this->targets); } return Matrix::fromFlatArray($this->targets); } } Regression/Regression.php 0000644 00000000177 15152065050 0011521 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Regression; use Phpml\Estimator; interface Regression extends Estimator { } Regression/SVR.php 0000644 00000001321 15152065050 0010043 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Regression; use Phpml\SupportVectorMachine\Kernel; use Phpml\SupportVectorMachine\SupportVectorMachine; use Phpml\SupportVectorMachine\Type; class SVR extends SupportVectorMachine implements Regression { public function __construct( int $kernel = Kernel::RBF, int $degree = 3, float $epsilon = 0.1, float $cost = 1.0, ?float $gamma = null, float $coef0 = 0.0, float $tolerance = 0.001, int $cacheSize = 100, bool $shrinking = true ) { parent::__construct(Type::EPSILON_SVR, $kernel, $cost, 0.5, $degree, $gamma, $coef0, $epsilon, $tolerance, $cacheSize, $shrinking, false); } } Transformer.php 0000644 00000000544 15152065050 0007561 0 ustar 00 <?php declare(strict_types=1); namespace Phpml; interface Transformer { /** * most transformers don't require targets to train so null allow to use fit method without setting targets */ public function fit(array $samples, ?array $targets = null): void; public function transform(array &$samples, ?array &$targets = null): void; } Helper/Trainable.php 0000644 00000000626 15152065050 0010400 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Helper; trait Trainable { /** * @var array */ private $samples = []; /** * @var array */ private $targets = []; public function train(array $samples, array $targets): void { $this->samples = array_merge($this->samples, $samples); $this->targets = array_merge($this->targets, $targets); } } Helper/Predictable.php 0000644 00000001046 15152065050 0010712 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Helper; trait Predictable { /** * @return mixed */ public function predict(array $samples) { if (!is_array($samples[0])) { return $this->predictSample($samples); } $predicted = []; foreach ($samples as $index => $sample) { $predicted[$index] = $this->predictSample($sample); } return $predicted; } /** * @return mixed */ abstract protected function predictSample(array $sample); } Helper/OneVsRest.php 0000644 00000011642 15152065050 0010367 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Helper; use Phpml\Classification\Classifier; trait OneVsRest { /** * @var array */ protected $classifiers = []; /** * All provided training targets' labels. * * @var array */ protected $allLabels = []; /** * @var array */ protected $costValues = []; /** * Train a binary classifier in the OvR style */ public function train(array $samples, array $targets): void { // Clears previous stuff. $this->reset(); $this->trainByLabel($samples, $targets); } /** * Resets the classifier and the vars internally used by OneVsRest to create multiple classifiers. */ public function reset(): void { $this->classifiers = []; $this->allLabels = []; $this->costValues = []; $this->resetBinary(); } protected function trainByLabel(array $samples, array $targets, array $allLabels = []): void { // Overwrites the current value if it exist. $allLabels must be provided for each partialTrain run. $this->allLabels = count($allLabels) === 0 ? array_keys(array_count_values($targets)) : $allLabels; sort($this->allLabels, SORT_STRING); // If there are only two targets, then there is no need to perform OvR if (count($this->allLabels) === 2) { // Init classifier if required. if (count($this->classifiers) === 0) { $this->classifiers[0] = $this->getClassifierCopy(); } $this->classifiers[0]->trainBinary($samples, $targets, $this->allLabels); } else { // Train a separate classifier for each label and memorize them foreach ($this->allLabels as $label) { // Init classifier if required. if (!isset($this->classifiers[$label])) { $this->classifiers[$label] = $this->getClassifierCopy(); } [$binarizedTargets, $classifierLabels] = $this->binarizeTargets($targets, $label); $this->classifiers[$label]->trainBinary($samples, $binarizedTargets, $classifierLabels); } } // If the underlying classifier is capable of giving the cost values // during the training, then assign it to the relevant variable // Adding just the first classifier cost values to avoid complex average calculations. $classifierref = reset($this->classifiers); if (method_exists($classifierref, 'getCostValues')) { $this->costValues = $classifierref->getCostValues(); } } /** * Returns an instance of the current class after cleaning up OneVsRest stuff. */ protected function getClassifierCopy(): Classifier { // Clone the current classifier, so that // we don't mess up its variables while training // multiple instances of this classifier $classifier = clone $this; $classifier->reset(); return $classifier; } /** * @return mixed */ protected function predictSample(array $sample) { if (count($this->allLabels) === 2) { return $this->classifiers[0]->predictSampleBinary($sample); } $probs = []; foreach ($this->classifiers as $label => $predictor) { $probs[$label] = $predictor->predictProbability($sample, $label); } arsort($probs, SORT_NUMERIC); return key($probs); } /** * Each classifier should implement this method instead of train(samples, targets) */ abstract protected function trainBinary(array $samples, array $targets, array $labels); /** * To be overwritten by OneVsRest classifiers. */ abstract protected function resetBinary(): void; /** * Each classifier that make use of OvR approach should be able to * return a probability for a sample to belong to the given label. * * @return mixed */ abstract protected function predictProbability(array $sample, string $label); /** * Each classifier should implement this method instead of predictSample() * * @return mixed */ abstract protected function predictSampleBinary(array $sample); /** * Groups all targets into two groups: Targets equal to * the given label and the others * * $targets is not passed by reference nor contains objects so this method * changes will not affect the caller $targets array. * * @param mixed $label * * @return array Binarized targets and target's labels */ private function binarizeTargets(array $targets, $label): array { $notLabel = "not_${label}"; foreach ($targets as $key => $target) { $targets[$key] = $target == $label ? $label : $notLabel; } $labels = [$label, $notLabel]; return [$targets, $labels]; } } Helper/Optimizer/StochasticGD.php 0000644 00000016275 15152065050 0013007 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Helper\Optimizer; use Closure; use Phpml\Exception\InvalidArgumentException; use Phpml\Exception\InvalidOperationException; /** * Stochastic Gradient Descent optimization method * to find a solution for the equation A.ϴ = y where * A (samples) and y (targets) are known and ϴ is unknown. */ class StochasticGD extends Optimizer { /** * A (samples) * * @var array */ protected $samples = []; /** * y (targets) * * @var array */ protected $targets = []; /** * Callback function to get the gradient and cost value * for a specific set of theta (ϴ) and a pair of sample & target * * @var \Closure|null */ protected $gradientCb; /** * Maximum number of iterations used to train the model * * @var int */ protected $maxIterations = 1000; /** * Learning rate is used to control the speed of the optimization.<br> * * Larger values of lr may overshoot the optimum or even cause divergence * while small values slows down the convergence and increases the time * required for the training * * @var float */ protected $learningRate = 0.001; /** * Minimum amount of change in the weights and error values * between iterations that needs to be obtained to continue the training * * @var float */ protected $threshold = 1e-4; /** * Enable/Disable early stopping by checking the weight & cost values * to see whether they changed large enough to continue the optimization * * @var bool */ protected $enableEarlyStop = true; /** * List of values obtained by evaluating the cost function at each iteration * of the algorithm * * @var array */ protected $costValues = []; /** * Initializes the SGD optimizer for the given number of dimensions */ public function __construct(int $dimensions) { // Add one more dimension for the bias parent::__construct($dimensions + 1); $this->dimensions = $dimensions; } public function setTheta(array $theta): Optimizer { if (count($theta) !== $this->dimensions + 1) { throw new InvalidArgumentException(sprintf('Number of values in the weights array should be %s', $this->dimensions + 1)); } $this->theta = $theta; return $this; } /** * Sets minimum value for the change in the theta values * between iterations to continue the iterations.<br> * * If change in the theta is less than given value then the * algorithm will stop training * * @return $this */ public function setChangeThreshold(float $threshold = 1e-5) { $this->threshold = $threshold; return $this; } /** * Enable/Disable early stopping by checking at each iteration * whether changes in theta or cost value are not large enough * * @return $this */ public function setEarlyStop(bool $enable = true) { $this->enableEarlyStop = $enable; return $this; } /** * @return $this */ public function setLearningRate(float $learningRate) { $this->learningRate = $learningRate; return $this; } /** * @return $this */ public function setMaxIterations(int $maxIterations) { $this->maxIterations = $maxIterations; return $this; } /** * Optimization procedure finds the unknow variables for the equation A.ϴ = y * for the given samples (A) and targets (y).<br> * * The cost function to minimize and the gradient of the function are to be * handled by the callback function provided as the third parameter of the method. */ public function runOptimization(array $samples, array $targets, Closure $gradientCb): array { $this->samples = $samples; $this->targets = $targets; $this->gradientCb = $gradientCb; $currIter = 0; $bestTheta = null; $bestScore = 0.0; $this->costValues = []; while ($this->maxIterations > $currIter++) { $theta = $this->theta; // Update the guess $cost = $this->updateTheta(); // Save the best theta in the "pocket" so that // any future set of theta worse than this will be disregarded if ($bestTheta === null || $cost <= $bestScore) { $bestTheta = $theta; $bestScore = $cost; } // Add the cost value for this iteration to the list $this->costValues[] = $cost; // Check for early stop if ($this->enableEarlyStop && $this->earlyStop($theta)) { break; } } $this->clear(); // Solution in the pocket is better than or equal to the last state // so, we use this solution return $this->theta = (array) $bestTheta; } /** * Returns the list of cost values for each iteration executed in * last run of the optimization */ public function getCostValues(): array { return $this->costValues; } protected function updateTheta(): float { $jValue = 0.0; $theta = $this->theta; if ($this->gradientCb === null) { throw new InvalidOperationException('Gradient callback is not defined'); } foreach ($this->samples as $index => $sample) { $target = $this->targets[$index]; $result = ($this->gradientCb)($theta, $sample, $target); [$error, $gradient, $penalty] = array_pad($result, 3, 0); // Update bias $this->theta[0] -= $this->learningRate * $gradient; // Update other values for ($i = 1; $i <= $this->dimensions; ++$i) { $this->theta[$i] -= $this->learningRate * ($gradient * $sample[$i - 1] + $penalty * $this->theta[$i]); } // Sum error rate $jValue += $error; } return $jValue / count($this->samples); } /** * Checks if the optimization is not effective enough and can be stopped * in case large enough changes in the solution do not happen */ protected function earlyStop(array $oldTheta): bool { // Check for early stop: No change larger than threshold (default 1e-5) $diff = array_map( function ($w1, $w2) { return abs($w1 - $w2) > $this->threshold ? 1 : 0; }, $oldTheta, $this->theta ); if (array_sum($diff) == 0) { return true; } // Check if the last two cost values are almost the same $costs = array_slice($this->costValues, -2); if (count($costs) === 2 && abs($costs[1] - $costs[0]) < $this->threshold) { return true; } return false; } /** * Clears the optimizer internal vars after the optimization process. */ protected function clear(): void { $this->samples = []; $this->targets = []; $this->gradientCb = null; } } Helper/Optimizer/Optimizer.php 0000644 00000002525 15152065050 0012443 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Helper\Optimizer; use Closure; use Phpml\Exception\InvalidArgumentException; abstract class Optimizer { /** * Unknown variables to be found * * @var array */ protected $theta = []; /** * Number of dimensions * * @var int */ protected $dimensions; /** * Inits a new instance of Optimizer for the given number of dimensions */ public function __construct(int $dimensions) { $this->dimensions = $dimensions; // Inits the weights randomly $this->theta = []; for ($i = 0; $i < $this->dimensions; ++$i) { $this->theta[] = (random_int(0, PHP_INT_MAX) / PHP_INT_MAX) + 0.1; } } public function setTheta(array $theta): self { if (count($theta) !== $this->dimensions) { throw new InvalidArgumentException(sprintf('Number of values in the weights array should be %s', $this->dimensions)); } $this->theta = $theta; return $this; } public function theta(): array { return $this->theta; } /** * Executes the optimization with the given samples & targets * and returns the weights */ abstract public function runOptimization(array $samples, array $targets, Closure $gradientCb): array; } Helper/Optimizer/ConjugateGradient.php 0000644 00000020535 15152065050 0014057 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Helper\Optimizer; use Closure; /** * Conjugate Gradient method to solve a non-linear f(x) with respect to unknown x * See https://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method) * * The method applied below is explained in the below document in a practical manner * - http://web.cs.iastate.edu/~cs577/handouts/conjugate-gradient.pdf * * However it is compliant with the general Conjugate Gradient method with * Fletcher-Reeves update method. Note that, the f(x) is assumed to be one-dimensional * and one gradient is utilized for all dimensions in the given data. */ class ConjugateGradient extends GD { public function runOptimization(array $samples, array $targets, Closure $gradientCb): array { $this->samples = $samples; $this->targets = $targets; $this->gradientCb = $gradientCb; $this->sampleCount = count($samples); $this->costValues = []; $d = MP::muls($this->gradient($this->theta), -1); for ($i = 0; $i < $this->maxIterations; ++$i) { // Obtain α that minimizes f(θ + α.d) $alpha = $this->getAlpha($d); // θ(k+1) = θ(k) + α.d $thetaNew = $this->getNewTheta($alpha, $d); // β = ||∇f(x(k+1))||² ∕ ||∇f(x(k))||² $beta = $this->getBeta($thetaNew); // d(k+1) =–∇f(x(k+1)) + β(k).d(k) $d = $this->getNewDirection($thetaNew, $beta, $d); // Save values for the next iteration $oldTheta = $this->theta; $this->costValues[] = $this->cost($thetaNew); $this->theta = $thetaNew; if ($this->enableEarlyStop && $this->earlyStop($oldTheta)) { break; } } $this->clear(); return $this->theta; } /** * Executes the callback function for the problem and returns * sum of the gradient for all samples & targets. */ protected function gradient(array $theta): array { [, $updates, $penalty] = parent::gradient($theta); // Calculate gradient for each dimension $gradient = []; for ($i = 0; $i <= $this->dimensions; ++$i) { if ($i === 0) { $gradient[$i] = array_sum($updates); } else { $col = array_column($this->samples, $i - 1); $error = 0; foreach ($col as $index => $val) { $error += $val * $updates[$index]; } $gradient[$i] = $error + $penalty * $theta[$i]; } } return $gradient; } /** * Returns the value of f(x) for given solution */ protected function cost(array $theta): float { [$cost] = parent::gradient($theta); return array_sum($cost) / (int) $this->sampleCount; } /** * Calculates alpha that minimizes the function f(θ + α.d) * by performing a line search that does not rely upon the derivation. * * There are several alternatives for this function. For now, we * prefer a method inspired from the bisection method for its simplicity. * This algorithm attempts to find an optimum alpha value between 0.0001 and 0.01 * * Algorithm as follows: * a) Probe a small alpha (0.0001) and calculate cost function * b) Probe a larger alpha (0.01) and calculate cost function * b-1) If cost function decreases, continue enlarging alpha * b-2) If cost function increases, take the midpoint and try again */ protected function getAlpha(array $d): float { $small = MP::muls($d, 0.0001); $large = MP::muls($d, 0.01); // Obtain θ + α.d for two initial values, x0 and x1 $x0 = MP::add($this->theta, $small); $x1 = MP::add($this->theta, $large); $epsilon = 0.0001; $iteration = 0; do { $fx1 = $this->cost($x1); $fx0 = $this->cost($x0); // If the difference between two values is small enough // then break the loop if (abs($fx1 - $fx0) <= $epsilon) { break; } if ($fx1 < $fx0) { $x0 = $x1; $x1 = MP::adds($x1, 0.01); // Enlarge second } else { $x1 = MP::divs(MP::add($x1, $x0), 2.0); } // Get to the midpoint $error = $fx1 / $this->dimensions; } while ($error <= $epsilon || $iteration++ < 10); // Return α = θ / d // For accuracy, choose a dimension which maximize |d[i]| $imax = 0; for ($i = 1; $i <= $this->dimensions; ++$i) { if (abs($d[$i]) > abs($d[$imax])) { $imax = $i; } } if ($d[$imax] == 0) { return $x1[$imax] - $this->theta[$imax]; } return ($x1[$imax] - $this->theta[$imax]) / $d[$imax]; } /** * Calculates new set of solutions with given alpha (for each θ(k)) and * gradient direction. * * θ(k+1) = θ(k) + α.d */ protected function getNewTheta(float $alpha, array $d): array { return MP::add($this->theta, MP::muls($d, $alpha)); } /** * Calculates new beta (β) for given set of solutions by using * Fletcher–Reeves method. * * β = ||f(x(k+1))||² ∕ ||f(x(k))||² * * See: * R. Fletcher and C. M. Reeves, "Function minimization by conjugate gradients", Comput. J. 7 (1964), 149–154. */ protected function getBeta(array $newTheta): float { $gNew = $this->gradient($newTheta); $gOld = $this->gradient($this->theta); $dNew = 0; $dOld = 1e-100; for ($i = 0; $i <= $this->dimensions; ++$i) { $dNew += $gNew[$i] ** 2; $dOld += $gOld[$i] ** 2; } return $dNew / $dOld; } /** * Calculates the new conjugate direction * * d(k+1) =–∇f(x(k+1)) + β(k).d(k) */ protected function getNewDirection(array $theta, float $beta, array $d): array { $grad = $this->gradient($theta); return MP::add(MP::muls($grad, -1), MP::muls($d, $beta)); } } /** * Handles element-wise vector operations between vector-vector * and vector-scalar variables */ class MP { /** * Element-wise <b>multiplication</b> of two vectors of the same size */ public static function mul(array $m1, array $m2): array { $res = []; foreach ($m1 as $i => $val) { $res[] = $val * $m2[$i]; } return $res; } /** * Element-wise <b>division</b> of two vectors of the same size */ public static function div(array $m1, array $m2): array { $res = []; foreach ($m1 as $i => $val) { $res[] = $val / $m2[$i]; } return $res; } /** * Element-wise <b>addition</b> of two vectors of the same size */ public static function add(array $m1, array $m2, int $mag = 1): array { $res = []; foreach ($m1 as $i => $val) { $res[] = $val + $mag * $m2[$i]; } return $res; } /** * Element-wise <b>subtraction</b> of two vectors of the same size */ public static function sub(array $m1, array $m2): array { return self::add($m1, $m2, -1); } /** * Element-wise <b>multiplication</b> of a vector with a scalar */ public static function muls(array $m1, float $m2): array { $res = []; foreach ($m1 as $val) { $res[] = $val * $m2; } return $res; } /** * Element-wise <b>division</b> of a vector with a scalar */ public static function divs(array $m1, float $m2): array { $res = []; foreach ($m1 as $val) { $res[] = $val / ($m2 + 1e-32); } return $res; } /** * Element-wise <b>addition</b> of a vector with a scalar */ public static function adds(array $m1, float $m2, int $mag = 1): array { $res = []; foreach ($m1 as $val) { $res[] = $val + $mag * $m2; } return $res; } /** * Element-wise <b>subtraction</b> of a vector with a scalar */ public static function subs(array $m1, float $m2): array { return self::adds($m1, $m2, -1); } } Helper/Optimizer/GD.php 0000644 00000005737 15152065050 0010763 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Helper\Optimizer; use Closure; use Phpml\Exception\InvalidOperationException; /** * Batch version of Gradient Descent to optimize the weights * of a classifier given samples, targets and the objective function to minimize */ class GD extends StochasticGD { /** * Number of samples given * * @var int|null */ protected $sampleCount; public function runOptimization(array $samples, array $targets, Closure $gradientCb): array { $this->samples = $samples; $this->targets = $targets; $this->gradientCb = $gradientCb; $this->sampleCount = count($this->samples); // Batch learning is executed: $currIter = 0; $this->costValues = []; while ($this->maxIterations > $currIter++) { $theta = $this->theta; // Calculate update terms for each sample [$errors, $updates, $totalPenalty] = $this->gradient($theta); $this->updateWeightsWithUpdates($updates, $totalPenalty); $this->costValues[] = array_sum($errors) / (int) $this->sampleCount; if ($this->earlyStop($theta)) { break; } } $this->clear(); return $this->theta; } /** * Calculates gradient, cost function and penalty term for each sample * then returns them as an array of values */ protected function gradient(array $theta): array { $costs = []; $gradient = []; $totalPenalty = 0; if ($this->gradientCb === null) { throw new InvalidOperationException('Gradient callback is not defined'); } foreach ($this->samples as $index => $sample) { $target = $this->targets[$index]; $result = ($this->gradientCb)($theta, $sample, $target); [$cost, $grad, $penalty] = array_pad($result, 3, 0); $costs[] = $cost; $gradient[] = $grad; $totalPenalty += $penalty; } $totalPenalty /= $this->sampleCount; return [$costs, $gradient, $totalPenalty]; } protected function updateWeightsWithUpdates(array $updates, float $penalty): void { // Updates all weights at once for ($i = 0; $i <= $this->dimensions; ++$i) { if ($i === 0) { $this->theta[0] -= $this->learningRate * array_sum($updates); } else { $col = array_column($this->samples, $i - 1); $error = 0; foreach ($col as $index => $val) { $error += $val * $updates[$index]; } $this->theta[$i] -= $this->learningRate * ($error + $penalty * $this->theta[$i]); } } } /** * Clears the optimizer internal vars after the optimization process. */ protected function clear(): void { $this->sampleCount = null; parent::clear(); } } Clustering/DBSCAN.php 0000644 00000005625 15152065050 0010335 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Clustering; use Phpml\Math\Distance; use Phpml\Math\Distance\Euclidean; class DBSCAN implements Clusterer { private const NOISE = -1; /** * @var float */ private $epsilon; /** * @var int */ private $minSamples; /** * @var Distance */ private $distanceMetric; public function __construct(float $epsilon = 0.5, int $minSamples = 3, ?Distance $distanceMetric = null) { if ($distanceMetric === null) { $distanceMetric = new Euclidean(); } $this->epsilon = $epsilon; $this->minSamples = $minSamples; $this->distanceMetric = $distanceMetric; } public function cluster(array $samples): array { $labels = []; $n = 0; foreach ($samples as $index => $sample) { if (isset($labels[$index])) { continue; } $neighborIndices = $this->getIndicesInRegion($sample, $samples); if (count($neighborIndices) < $this->minSamples) { $labels[$index] = self::NOISE; continue; } $labels[$index] = $n; $this->expandCluster($samples, $neighborIndices, $labels, $n); ++$n; } return $this->groupByCluster($samples, $labels, $n); } private function expandCluster(array $samples, array $seeds, array &$labels, int $n): void { while (($index = array_pop($seeds)) !== null) { if (isset($labels[$index])) { if ($labels[$index] === self::NOISE) { $labels[$index] = $n; } continue; } $labels[$index] = $n; $sample = $samples[$index]; $neighborIndices = $this->getIndicesInRegion($sample, $samples); if (count($neighborIndices) >= $this->minSamples) { $seeds = array_unique(array_merge($seeds, $neighborIndices)); } } } private function getIndicesInRegion(array $center, array $samples): array { $indices = []; foreach ($samples as $index => $sample) { if ($this->distanceMetric->distance($center, $sample) < $this->epsilon) { $indices[] = $index; } } return $indices; } private function groupByCluster(array $samples, array $labels, int $n): array { $clusters = array_fill(0, $n, []); foreach ($samples as $index => $sample) { if ($labels[$index] !== self::NOISE) { $clusters[$labels[$index]][$index] = $sample; } } // Reindex (i.e. to 0, 1, 2, ...) integer indices for backword compatibility foreach ($clusters as $index => $cluster) { $clusters[$index] = array_merge($cluster, []); } return $clusters; } } Clustering/FuzzyCMeans.php 0000644 00000014636 15152065050 0011623 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Clustering; use Phpml\Clustering\KMeans\Cluster; use Phpml\Clustering\KMeans\Point; use Phpml\Clustering\KMeans\Space; use Phpml\Exception\InvalidArgumentException; use Phpml\Math\Distance\Euclidean; class FuzzyCMeans implements Clusterer { /** * @var int */ private $clustersNumber; /** * @var Cluster[] */ private $clusters = []; /** * @var Space */ private $space; /** * @var float[][] */ private $membership = []; /** * @var float */ private $fuzziness; /** * @var float */ private $epsilon; /** * @var int */ private $maxIterations; /** * @var int */ private $sampleCount; /** * @var array */ private $samples = []; /** * @throws InvalidArgumentException */ public function __construct(int $clustersNumber, float $fuzziness = 2.0, float $epsilon = 1e-2, int $maxIterations = 100) { if ($clustersNumber <= 0) { throw new InvalidArgumentException('Invalid clusters number'); } $this->clustersNumber = $clustersNumber; $this->fuzziness = $fuzziness; $this->epsilon = $epsilon; $this->maxIterations = $maxIterations; } public function getMembershipMatrix(): array { return $this->membership; } public function cluster(array $samples): array { // Initialize variables, clusters and membership matrix $this->sampleCount = count($samples); $this->samples = &$samples; $this->space = new Space(count($samples[0])); $this->initClusters(); // Our goal is minimizing the objective value while // executing the clustering steps at a maximum number of iterations $lastObjective = 0.0; $iterations = 0; do { // Update the membership matrix and cluster centers, respectively $this->updateMembershipMatrix(); $this->updateClusters(); // Calculate the new value of the objective function $objectiveVal = $this->getObjective(); $difference = abs($lastObjective - $objectiveVal); $lastObjective = $objectiveVal; } while ($difference > $this->epsilon && $iterations++ <= $this->maxIterations); // Attach (hard cluster) each data point to the nearest cluster for ($k = 0; $k < $this->sampleCount; ++$k) { $column = array_column($this->membership, $k); arsort($column); reset($column); $cluster = $this->clusters[key($column)]; $cluster->attach(new Point($this->samples[$k])); } // Return grouped samples $grouped = []; foreach ($this->clusters as $cluster) { $grouped[] = $cluster->getPoints(); } return $grouped; } protected function initClusters(): void { // Membership array is a matrix of cluster number by sample counts // We initilize the membership array with random values $dim = $this->space->getDimension(); $this->generateRandomMembership($dim, $this->sampleCount); $this->updateClusters(); } protected function generateRandomMembership(int $rows, int $cols): void { $this->membership = []; for ($i = 0; $i < $rows; ++$i) { $row = []; $total = 0.0; for ($k = 0; $k < $cols; ++$k) { $val = random_int(1, 5) / 10.0; $row[] = $val; $total += $val; } $this->membership[] = array_map(static function ($val) use ($total): float { return $val / $total; }, $row); } } protected function updateClusters(): void { $dim = $this->space->getDimension(); if (count($this->clusters) === 0) { for ($i = 0; $i < $this->clustersNumber; ++$i) { $this->clusters[] = new Cluster($this->space, array_fill(0, $dim, 0.0)); } } for ($i = 0; $i < $this->clustersNumber; ++$i) { $cluster = $this->clusters[$i]; $center = $cluster->getCoordinates(); for ($k = 0; $k < $dim; ++$k) { $a = $this->getMembershipRowTotal($i, $k, true); $b = $this->getMembershipRowTotal($i, $k, false); $center[$k] = $a / $b; } $cluster->setCoordinates($center); } } protected function getMembershipRowTotal(int $row, int $col, bool $multiply): float { $sum = 0.0; for ($k = 0; $k < $this->sampleCount; ++$k) { $val = $this->membership[$row][$k] ** $this->fuzziness; if ($multiply) { $val *= $this->samples[$k][$col]; } $sum += $val; } return $sum; } protected function updateMembershipMatrix(): void { for ($i = 0; $i < $this->clustersNumber; ++$i) { for ($k = 0; $k < $this->sampleCount; ++$k) { $distCalc = $this->getDistanceCalc($i, $k); $this->membership[$i][$k] = 1.0 / $distCalc; } } } protected function getDistanceCalc(int $row, int $col): float { $sum = 0.0; $distance = new Euclidean(); $dist1 = $distance->distance( $this->clusters[$row]->getCoordinates(), $this->samples[$col] ); for ($j = 0; $j < $this->clustersNumber; ++$j) { $dist2 = $distance->distance( $this->clusters[$j]->getCoordinates(), $this->samples[$col] ); $val = (($dist1 / $dist2) ** 2.0) / ($this->fuzziness - 1); $sum += $val; } return $sum; } /** * The objective is to minimize the distance between all data points * and all cluster centers. This method returns the summation of all * these distances */ protected function getObjective(): float { $sum = 0.0; $distance = new Euclidean(); for ($i = 0; $i < $this->clustersNumber; ++$i) { $clust = $this->clusters[$i]->getCoordinates(); for ($k = 0; $k < $this->sampleCount; ++$k) { $point = $this->samples[$k]; $sum += $distance->distance($clust, $point); } } return $sum; } } Clustering/KMeans/Space.php 0000644 00000014561 15152065050 0011613 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Clustering\KMeans; use InvalidArgumentException; use LogicException; use Phpml\Clustering\KMeans; use SplObjectStorage; class Space extends SplObjectStorage { /** * @var int */ protected $dimension; public function __construct(int $dimension) { if ($dimension < 1) { throw new LogicException('a space dimension cannot be null or negative'); } $this->dimension = $dimension; } public function toArray(): array { $points = []; /** @var Point $point */ foreach ($this as $point) { $points[] = $point->toArray(); } return ['points' => $points]; } /** * @param mixed $label */ public function newPoint(array $coordinates, $label = null): Point { if (count($coordinates) !== $this->dimension) { throw new LogicException('('.implode(',', $coordinates).') is not a point of this space'); } return new Point($coordinates, $label); } /** * @param mixed $label * @param mixed $data */ public function addPoint(array $coordinates, $label = null, $data = null): void { $this->attach($this->newPoint($coordinates, $label), $data); } /** * @param object $point * @param mixed $data */ public function attach($point, $data = null): void { if (!$point instanceof Point) { throw new InvalidArgumentException('can only attach points to spaces'); } parent::attach($point, $data); } public function getDimension(): int { return $this->dimension; } /** * @return array|bool */ public function getBoundaries() { if (count($this) === 0) { return false; } $min = $this->newPoint(array_fill(0, $this->dimension, null)); $max = $this->newPoint(array_fill(0, $this->dimension, null)); /** @var Point $point */ foreach ($this as $point) { for ($n = 0; $n < $this->dimension; ++$n) { if ($min[$n] === null || $min[$n] > $point[$n]) { $min[$n] = $point[$n]; } if ($max[$n] === null || $max[$n] < $point[$n]) { $max[$n] = $point[$n]; } } } return [$min, $max]; } public function getRandomPoint(Point $min, Point $max): Point { $point = $this->newPoint(array_fill(0, $this->dimension, null)); for ($n = 0; $n < $this->dimension; ++$n) { $point[$n] = random_int($min[$n], $max[$n]); } return $point; } /** * @return Cluster[] */ public function cluster(int $clustersNumber, int $initMethod = KMeans::INIT_RANDOM): array { $clusters = $this->initializeClusters($clustersNumber, $initMethod); do { } while (!$this->iterate($clusters)); return $clusters; } /** * @return Cluster[] */ protected function initializeClusters(int $clustersNumber, int $initMethod): array { switch ($initMethod) { case KMeans::INIT_RANDOM: $clusters = $this->initializeRandomClusters($clustersNumber); break; case KMeans::INIT_KMEANS_PLUS_PLUS: $clusters = $this->initializeKMPPClusters($clustersNumber); break; default: return []; } $clusters[0]->attachAll($this); return $clusters; } /** * @param Cluster[] $clusters */ protected function iterate(array $clusters): bool { $convergence = true; $attach = new SplObjectStorage(); $detach = new SplObjectStorage(); foreach ($clusters as $cluster) { foreach ($cluster as $point) { $closest = $point->getClosest($clusters); if ($closest === null) { continue; } if ($closest !== $cluster) { $attach[$closest] ?? $attach[$closest] = new SplObjectStorage(); $detach[$cluster] ?? $detach[$cluster] = new SplObjectStorage(); $attach[$closest]->attach($point); $detach[$cluster]->attach($point); $convergence = false; } } } /** @var Cluster $cluster */ foreach ($attach as $cluster) { $cluster->attachAll($attach[$cluster]); } /** @var Cluster $cluster */ foreach ($detach as $cluster) { $cluster->detachAll($detach[$cluster]); } foreach ($clusters as $cluster) { $cluster->updateCentroid(); } return $convergence; } /** * @return Cluster[] */ protected function initializeKMPPClusters(int $clustersNumber): array { $clusters = []; $this->rewind(); /** @var Point $current */ $current = $this->current(); $clusters[] = new Cluster($this, $current->getCoordinates()); $distances = new SplObjectStorage(); for ($i = 1; $i < $clustersNumber; ++$i) { $sum = 0; /** @var Point $point */ foreach ($this as $point) { $closest = $point->getClosest($clusters); if ($closest === null) { continue; } $distance = $point->getDistanceWith($closest); $sum += $distances[$point] = $distance; } $sum = random_int(0, (int) $sum); /** @var Point $point */ foreach ($this as $point) { $sum -= $distances[$point]; if ($sum > 0) { continue; } $clusters[] = new Cluster($this, $point->getCoordinates()); break; } } return $clusters; } /** * @return Cluster[] */ private function initializeRandomClusters(int $clustersNumber): array { $clusters = []; [$min, $max] = $this->getBoundaries(); for ($n = 0; $n < $clustersNumber; ++$n) { $clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates()); } return $clusters; } } Clustering/KMeans/Cluster.php 0000644 00000004762 15152065050 0012203 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Clustering\KMeans; use IteratorAggregate; use LogicException; use SplObjectStorage; class Cluster extends Point implements IteratorAggregate { /** * @var Space */ protected $space; /** * @var SplObjectStorage|Point[] */ protected $points; public function __construct(Space $space, array $coordinates) { parent::__construct($coordinates); $this->space = $space; $this->points = new SplObjectStorage(); } public function getPoints(): array { $points = []; foreach ($this->points as $point) { if ($point->label === null) { $points[] = $point->toArray(); } else { $points[$point->label] = $point->toArray(); } } return $points; } public function toArray(): array { return [ 'centroid' => parent::toArray(), 'points' => $this->getPoints(), ]; } public function attach(Point $point): Point { if ($point instanceof self) { throw new LogicException('Cannot attach a cluster to another'); } $this->points->attach($point); return $point; } public function detach(Point $point): Point { $this->points->detach($point); return $point; } public function attachAll(SplObjectStorage $points): void { $this->points->addAll($points); } public function detachAll(SplObjectStorage $points): void { $this->points->removeAll($points); } public function updateCentroid(): void { $count = count($this->points); if ($count === 0) { return; } $centroid = $this->space->newPoint(array_fill(0, $this->dimension, 0)); foreach ($this->points as $point) { for ($n = 0; $n < $this->dimension; ++$n) { $centroid->coordinates[$n] += $point->coordinates[$n]; } } for ($n = 0; $n < $this->dimension; ++$n) { $this->coordinates[$n] = $centroid->coordinates[$n] / $count; } } /** * @return Point[]|SplObjectStorage */ public function getIterator() { return $this->points; } public function count(): int { return count($this->points); } public function setCoordinates(array $newCoordinates): void { $this->coordinates = $newCoordinates; } } Clustering/KMeans/Point.php 0000644 00000004626 15152065050 0011652 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Clustering\KMeans; use ArrayAccess; class Point implements ArrayAccess, \Countable { /** * @var int */ protected $dimension; /** * @var array */ protected $coordinates = []; /** * @var mixed */ protected $label; /** * @param mixed $label */ public function __construct(array $coordinates, $label = null) { $this->dimension = count($coordinates); $this->coordinates = $coordinates; $this->label = $label; } public function toArray(): array { return $this->coordinates; } /** * @return float|int */ public function getDistanceWith(self $point, bool $precise = true) { $distance = 0; for ($n = 0; $n < $this->dimension; ++$n) { $difference = $this->coordinates[$n] - $point->coordinates[$n]; $distance += $difference * $difference; } return $precise ? $distance ** .5 : $distance; } /** * @param Point[] $points */ public function getClosest(array $points): ?self { $minPoint = null; foreach ($points as $point) { $distance = $this->getDistanceWith($point, false); if (!isset($minDistance)) { $minDistance = $distance; $minPoint = $point; continue; } if ($distance < $minDistance) { $minDistance = $distance; $minPoint = $point; } } return $minPoint; } public function getCoordinates(): array { return $this->coordinates; } /** * @param mixed $offset */ public function offsetExists($offset): bool { return isset($this->coordinates[$offset]); } /** * @param mixed $offset * * @return mixed */ public function offsetGet($offset) { return $this->coordinates[$offset]; } /** * @param mixed $offset * @param mixed $value */ public function offsetSet($offset, $value): void { $this->coordinates[$offset] = $value; } /** * @param mixed $offset */ public function offsetUnset($offset): void { unset($this->coordinates[$offset]); } public function count(): int { return count($this->coordinates); } } Clustering/KMeans.php 0000644 00000002172 15152065050 0010553 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Clustering; use Phpml\Clustering\KMeans\Space; use Phpml\Exception\InvalidArgumentException; class KMeans implements Clusterer { public const INIT_RANDOM = 1; public const INIT_KMEANS_PLUS_PLUS = 2; /** * @var int */ private $clustersNumber; /** * @var int */ private $initialization; public function __construct(int $clustersNumber, int $initialization = self::INIT_KMEANS_PLUS_PLUS) { if ($clustersNumber <= 0) { throw new InvalidArgumentException('Invalid clusters number'); } $this->clustersNumber = $clustersNumber; $this->initialization = $initialization; } public function cluster(array $samples): array { $space = new Space(count(reset($samples))); foreach ($samples as $key => $sample) { $space->addPoint($sample, $key); } $clusters = []; foreach ($space->cluster($this->clustersNumber, $this->initialization) as $cluster) { $clusters[] = $cluster->getPoints(); } return $clusters; } } Clustering/Clusterer.php 0000644 00000000212 15152065050 0011336 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Clustering; interface Clusterer { public function cluster(array $samples): array; } Pipeline.php 0000644 00000004144 15152065050 0007024 0 ustar 00 <?php declare(strict_types=1); namespace Phpml; use Phpml\Exception\InvalidOperationException; class Pipeline implements Estimator, Transformer { /** * @var Transformer[] */ private $transformers = []; /** * @var Estimator|null */ private $estimator; /** * @param Transformer[] $transformers */ public function __construct(array $transformers, ?Estimator $estimator = null) { $this->transformers = array_map(static function (Transformer $transformer): Transformer { return $transformer; }, $transformers); $this->estimator = $estimator; } /** * @return Transformer[] */ public function getTransformers(): array { return $this->transformers; } public function getEstimator(): ?Estimator { return $this->estimator; } public function train(array $samples, array $targets): void { if ($this->estimator === null) { throw new InvalidOperationException('Pipeline without estimator can\'t use train method'); } foreach ($this->transformers as $transformer) { $transformer->fit($samples, $targets); $transformer->transform($samples, $targets); } $this->estimator->train($samples, $targets); } /** * @return mixed */ public function predict(array $samples) { $this->transform($samples); if ($this->estimator === null) { throw new InvalidOperationException('Pipeline without estimator can\'t use predict method'); } return $this->estimator->predict($samples); } public function fit(array $samples, ?array $targets = null): void { foreach ($this->transformers as $transformer) { $transformer->fit($samples, $targets); $transformer->transform($samples, $targets); } } public function transform(array &$samples, ?array &$targets = null): void { foreach ($this->transformers as $transformer) { $transformer->transform($samples, $targets); } } } Dataset/CsvDataset.php 0000644 00000002521 15152065050 0010702 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Dataset; use Phpml\Exception\FileException; class CsvDataset extends ArrayDataset { /** * @var array */ protected $columnNames = []; /** * @throws FileException */ public function __construct(string $filepath, int $features, bool $headingRow = true, string $delimiter = ',', int $maxLineLength = 0) { if (!file_exists($filepath)) { throw new FileException(sprintf('File "%s" missing.', basename($filepath))); } $handle = fopen($filepath, 'rb'); if ($handle === false) { throw new FileException(sprintf('File "%s" can\'t be open.', basename($filepath))); } if ($headingRow) { $data = fgetcsv($handle, $maxLineLength, $delimiter); $this->columnNames = array_slice((array) $data, 0, $features); } else { $this->columnNames = range(0, $features - 1); } $samples = $targets = []; while ($data = fgetcsv($handle, $maxLineLength, $delimiter)) { $samples[] = array_slice($data, 0, $features); $targets[] = $data[$features]; } fclose($handle); parent::__construct($samples, $targets); } public function getColumnNames(): array { return $this->columnNames; } } Dataset/Dataset.php 0000644 00000000244 15152065050 0010226 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Dataset; interface Dataset { public function getSamples(): array; public function getTargets(): array; } Dataset/MnistDataset.php 0000644 00000005540 15152065050 0011245 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Dataset; use Phpml\Exception\InvalidArgumentException; /** * MNIST dataset: http://yann.lecun.com/exdb/mnist/ * original mnist dataset reader: https://github.com/AndrewCarterUK/mnist-neural-network-plain-php */ final class MnistDataset extends ArrayDataset { private const MAGIC_IMAGE = 0x00000803; private const MAGIC_LABEL = 0x00000801; private const IMAGE_ROWS = 28; private const IMAGE_COLS = 28; public function __construct(string $imagePath, string $labelPath) { $this->samples = $this->readImages($imagePath); $this->targets = $this->readLabels($labelPath); if (count($this->samples) !== count($this->targets)) { throw new InvalidArgumentException('Must have the same number of images and labels'); } } private function readImages(string $imagePath): array { $stream = fopen($imagePath, 'rb'); if ($stream === false) { throw new InvalidArgumentException('Could not open file: '.$imagePath); } $images = []; try { $header = fread($stream, 16); $fields = unpack('Nmagic/Nsize/Nrows/Ncols', (string) $header); if ($fields['magic'] !== self::MAGIC_IMAGE) { throw new InvalidArgumentException('Invalid magic number: '.$imagePath); } if ($fields['rows'] != self::IMAGE_ROWS) { throw new InvalidArgumentException('Invalid number of image rows: '.$imagePath); } if ($fields['cols'] != self::IMAGE_COLS) { throw new InvalidArgumentException('Invalid number of image cols: '.$imagePath); } for ($i = 0; $i < $fields['size']; $i++) { $imageBytes = fread($stream, $fields['rows'] * $fields['cols']); // Convert to float between 0 and 1 $images[] = array_map(function ($b) { return $b / 255; }, array_values(unpack('C*', (string) $imageBytes))); } } finally { fclose($stream); } return $images; } private function readLabels(string $labelPath): array { $stream = fopen($labelPath, 'rb'); if ($stream === false) { throw new InvalidArgumentException('Could not open file: '.$labelPath); } $labels = []; try { $header = fread($stream, 8); $fields = unpack('Nmagic/Nsize', (string) $header); if ($fields['magic'] !== self::MAGIC_LABEL) { throw new InvalidArgumentException('Invalid magic number: '.$labelPath); } $labels = fread($stream, $fields['size']); } finally { fclose($stream); } return array_values(unpack('C*', (string) $labels)); } } Dataset/SvmDataset.php 0000644 00000006660 15152065050 0010724 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Dataset; use Phpml\Exception\DatasetException; use Phpml\Exception\FileException; class SvmDataset extends ArrayDataset { public function __construct(string $filePath) { [$samples, $targets] = self::readProblem($filePath); parent::__construct($samples, $targets); } private static function readProblem(string $filePath): array { $handle = self::openFile($filePath); $samples = []; $targets = []; $maxIndex = 0; while (false !== $line = fgets($handle)) { [$sample, $target, $maxIndex] = self::processLine($line, $maxIndex); $samples[] = $sample; $targets[] = $target; } fclose($handle); foreach ($samples as &$sample) { $sample = array_pad($sample, $maxIndex + 1, 0); } return [$samples, $targets]; } /** * @return resource */ private static function openFile(string $filePath) { if (!file_exists($filePath)) { throw new FileException(sprintf('File "%s" missing.', basename($filePath))); } $handle = fopen($filePath, 'rb'); if ($handle === false) { throw new FileException(sprintf('File "%s" can\'t be open.', basename($filePath))); } return $handle; } private static function processLine(string $line, int $maxIndex): array { $columns = self::parseLine($line); $target = self::parseTargetColumn($columns[0]); $sample = array_fill(0, $maxIndex + 1, 0); $n = count($columns); for ($i = 1; $i < $n; ++$i) { [$index, $value] = self::parseFeatureColumn($columns[$i]); if ($index > $maxIndex) { $maxIndex = $index; $sample = array_pad($sample, $maxIndex + 1, 0); } $sample[$index] = $value; } return [$sample, $target, $maxIndex]; } private static function parseLine(string $line): array { $line = explode('#', $line, 2)[0]; $line = rtrim($line); $line = str_replace("\t", ' ', $line); return explode(' ', $line); } private static function parseTargetColumn(string $column): float { if (!is_numeric($column)) { throw new DatasetException(sprintf('Invalid target "%s".', $column)); } return (float) $column; } private static function parseFeatureColumn(string $column): array { $feature = explode(':', $column, 2); if (count($feature) !== 2) { throw new DatasetException(sprintf('Invalid value "%s".', $column)); } $index = self::parseFeatureIndex($feature[0]); $value = self::parseFeatureValue($feature[1]); return [$index, $value]; } private static function parseFeatureIndex(string $index): int { if (!is_numeric($index) || !ctype_digit($index)) { throw new DatasetException(sprintf('Invalid index "%s".', $index)); } if ((int) $index < 1) { throw new DatasetException(sprintf('Invalid index "%s".', $index)); } return (int) $index - 1; } private static function parseFeatureValue(string $value): float { if (!is_numeric($value)) { throw new DatasetException(sprintf('Invalid value "%s".', $value)); } return (float) $value; } } Dataset/ArrayDataset.php 0000644 00000002344 15152065050 0011230 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Dataset; use Phpml\Exception\InvalidArgumentException; class ArrayDataset implements Dataset { /** * @var array */ protected $samples = []; /** * @var array */ protected $targets = []; /** * @throws InvalidArgumentException */ public function __construct(array $samples, array $targets) { if (count($samples) !== count($targets)) { throw new InvalidArgumentException('Size of given arrays does not match'); } $this->samples = $samples; $this->targets = $targets; } public function getSamples(): array { return $this->samples; } public function getTargets(): array { return $this->targets; } /** * @param int[] $columns */ public function removeColumns(array $columns): void { foreach ($this->samples as &$sample) { $this->removeColumnsFromSample($sample, $columns); } } private function removeColumnsFromSample(array &$sample, array $columns): void { foreach ($columns as $index) { unset($sample[$index]); } $sample = array_values($sample); } } Dataset/Demo/GlassDataset.php 0000644 00000001103 15152065050 0012077 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Dataset\Demo; use Phpml\Dataset\CsvDataset; /** * Classes: 6 * Samples per class: * 70 float processed building windows * 17 float processed vehicle windows * 76 non-float processed building windows * 13 containers * 9 tableware * 29 headlamps * Samples total: 214 * Features per sample: 9. */ class GlassDataset extends CsvDataset { public function __construct() { $filepath = __DIR__.'/../../../data/glass.csv'; parent::__construct($filepath, 9, true); } } Dataset/Demo/WineDataset.php 0000644 00000000640 15152065050 0011735 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Dataset\Demo; use Phpml\Dataset\CsvDataset; /** * Classes: 3 * Samples per class: class 1 59; class 2 71; class 3 48 * Samples total: 178 * Features per sample: 13. */ class WineDataset extends CsvDataset { public function __construct() { $filepath = __DIR__.'/../../../data/wine.csv'; parent::__construct($filepath, 13, true); } } Dataset/Demo/IrisDataset.php 0000644 00000000576 15152065050 0011751 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Dataset\Demo; use Phpml\Dataset\CsvDataset; /** * Classes: 3 * Samples per class: 50 * Samples total: 150 * Features per sample: 4. */ class IrisDataset extends CsvDataset { public function __construct() { $filepath = __DIR__.'/../../../data/iris.csv'; parent::__construct($filepath, 4, true); } } Dataset/FilesDataset.php 0000644 00000002161 15152065050 0011211 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\Dataset; use Phpml\Exception\DatasetException; class FilesDataset extends ArrayDataset { public function __construct(string $rootPath) { if (!is_dir($rootPath)) { throw new DatasetException(sprintf('Dataset root folder "%s" missing.', $rootPath)); } $this->scanRootPath($rootPath); } private function scanRootPath(string $rootPath): void { $dirs = glob($rootPath.DIRECTORY_SEPARATOR.'*', GLOB_ONLYDIR); if ($dirs === false) { throw new DatasetException(sprintf('An error occurred during directory "%s" scan', $rootPath)); } foreach ($dirs as $dir) { $this->scanDir($dir); } } private function scanDir(string $dir): void { $target = basename($dir); $files = glob($dir.DIRECTORY_SEPARATOR.'*'); if ($files === false) { return; } foreach (array_filter($files, 'is_file') as $file) { $this->samples[] = file_get_contents($file); $this->targets[] = $target; } } } IncrementalEstimator.php 0000644 00000000262 15152065050 0011405 0 ustar 00 <?php declare(strict_types=1); namespace Phpml; interface IncrementalEstimator { public function partialTrain(array $samples, array $targets, array $labels = []): void; } FeatureExtraction/TokenCountVectorizer.php 0000644 00000007531 15152065050 0015064 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\FeatureExtraction; use Phpml\Tokenization\Tokenizer; use Phpml\Transformer; class TokenCountVectorizer implements Transformer { /** * @var Tokenizer */ private $tokenizer; /** * @var StopWords|null */ private $stopWords; /** * @var float */ private $minDF; /** * @var array */ private $vocabulary = []; /** * @var array */ private $frequencies = []; public function __construct(Tokenizer $tokenizer, ?StopWords $stopWords = null, float $minDF = 0.0) { $this->tokenizer = $tokenizer; $this->stopWords = $stopWords; $this->minDF = $minDF; } public function fit(array $samples, ?array $targets = null): void { $this->buildVocabulary($samples); } public function transform(array &$samples, ?array &$targets = null): void { array_walk($samples, function (string &$sample): void { $this->transformSample($sample); }); $this->checkDocumentFrequency($samples); } public function getVocabulary(): array { return array_flip($this->vocabulary); } private function buildVocabulary(array &$samples): void { foreach ($samples as $sample) { $tokens = $this->tokenizer->tokenize($sample); foreach ($tokens as $token) { $this->addTokenToVocabulary($token); } } } private function transformSample(string &$sample): void { $counts = []; $tokens = $this->tokenizer->tokenize($sample); foreach ($tokens as $token) { $index = $this->getTokenIndex($token); if ($index !== false) { $this->updateFrequency($token); if (!isset($counts[$index])) { $counts[$index] = 0; } ++$counts[$index]; } } foreach ($this->vocabulary as $index) { if (!isset($counts[$index])) { $counts[$index] = 0; } } ksort($counts); $sample = $counts; } /** * @return int|bool */ private function getTokenIndex(string $token) { if ($this->isStopWord($token)) { return false; } return $this->vocabulary[$token] ?? false; } private function addTokenToVocabulary(string $token): void { if ($this->isStopWord($token)) { return; } if (!isset($this->vocabulary[$token])) { $this->vocabulary[$token] = count($this->vocabulary); } } private function isStopWord(string $token): bool { return $this->stopWords !== null && $this->stopWords->isStopWord($token); } private function updateFrequency(string $token): void { if (!isset($this->frequencies[$token])) { $this->frequencies[$token] = 0; } ++$this->frequencies[$token]; } private function checkDocumentFrequency(array &$samples): void { if ($this->minDF > 0) { $beyondMinimum = $this->getBeyondMinimumIndexes(count($samples)); foreach ($samples as &$sample) { $this->resetBeyondMinimum($sample, $beyondMinimum); } } } private function resetBeyondMinimum(array &$sample, array $beyondMinimum): void { foreach ($beyondMinimum as $index) { $sample[$index] = 0; } } private function getBeyondMinimumIndexes(int $samplesCount): array { $indexes = []; foreach ($this->frequencies as $token => $frequency) { if (($frequency / $samplesCount) < $this->minDF) { $indexes[] = $this->getTokenIndex((string) $token); } } return $indexes; } } FeatureExtraction/TfIdfTransformer.php 0000644 00000002322 15152065050 0014126 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\FeatureExtraction; use Phpml\Transformer; class TfIdfTransformer implements Transformer { /** * @var array */ private $idf = []; public function __construct(array $samples = []) { if (count($samples) > 0) { $this->fit($samples); } } public function fit(array $samples, ?array $targets = null): void { $this->countTokensFrequency($samples); $count = count($samples); foreach ($this->idf as &$value) { $value = log((float) ($count / $value), 10.0); } } public function transform(array &$samples, ?array &$targets = null): void { foreach ($samples as &$sample) { foreach ($sample as $index => &$feature) { $feature *= $this->idf[$index]; } } } private function countTokensFrequency(array $samples): void { $this->idf = array_fill_keys(array_keys($samples[0]), 0); foreach ($samples as $sample) { foreach ($sample as $index => $count) { if ($count > 0) { ++$this->idf[$index]; } } } } } FeatureExtraction/StopWords.php 0000644 00000001417 15152065050 0012657 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\FeatureExtraction; use Phpml\Exception\InvalidArgumentException; class StopWords { /** * @var array */ protected $stopWords = []; public function __construct(array $stopWords) { $this->stopWords = array_fill_keys($stopWords, true); } public function isStopWord(string $token): bool { return isset($this->stopWords[$token]); } public static function factory(string $language = 'English'): self { $className = __NAMESPACE__."\\StopWords\\${language}"; if (!class_exists($className)) { throw new InvalidArgumentException(sprintf('Can\'t find "%s" language for StopWords', $language)); } return new $className(); } } FeatureExtraction/StopWords/Polish.php 0000644 00000002752 15152065050 0014120 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\FeatureExtraction\StopWords; use Phpml\FeatureExtraction\StopWords; final class Polish extends StopWords { /** * @var array */ protected $stopWords = [ 'ach', 'aj', 'albo', 'bardzo', 'bez', 'bo', 'być', 'ci', 'cię', 'ciebie', 'co', 'czy', 'daleko', 'dla', 'dlaczego', 'dlatego', 'do', 'dobrze', 'dokąd', 'dość', 'dużo', 'dwa', 'dwaj', 'dwie', 'dwoje', 'dziś', 'dzisiaj', 'gdyby', 'gdzie', 'go', 'ich', 'ile', 'im', 'inny', 'ja', 'ją', 'jak', 'jakby', 'jaki', 'je', 'jeden', 'jedna', 'jedno', 'jego', 'jej', 'jemu', 'jeśli', 'jest', 'jestem', 'jeżeli', 'już', 'każdy', 'kiedy', 'kierunku', 'kto', 'ku', 'lub', 'ma', 'mają', 'mam', 'mi', 'mną', 'mnie', 'moi', 'mój', 'moja', 'moje', 'może', 'mu', 'my', 'na', 'nam', 'nami', 'nas', 'nasi', 'nasz', 'nasza', 'nasze', 'natychmiast', 'nią', 'nic', 'nich', 'nie', 'niego', 'niej', 'niemu', 'nigdy', 'nim', 'nimi', 'niż', 'obok', 'od', 'około', 'on', 'ona', 'one', 'oni', 'ono', 'owszem', 'po', 'pod', 'ponieważ', 'przed', 'przedtem', 'są', 'sam', 'sama', 'się', 'skąd', 'tak', 'taki', 'tam', 'ten', 'to', 'tobą', 'tobie', 'tu', 'tutaj', 'twoi', 'twój', 'twoja', 'twoje', 'ty', 'wam', 'wami', 'was', 'wasi', 'wasz', 'wasza', 'wasze', 'we', 'więc', 'wszystko', 'wtedy', 'wy', 'żaden', 'zawsze', 'że', ]; public function __construct() { parent::__construct($this->stopWords); } } FeatureExtraction/StopWords/Russian.php 0000644 00000004045 15152065050 0014303 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\FeatureExtraction\StopWords; use Phpml\FeatureExtraction\StopWords; final class Russian extends StopWords { /** * @var array */ protected $stopWords = [ 'и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впрочем', 'хорошо', 'свою', 'этой', 'перед', 'иногда', 'лучше', 'чуть', 'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между', ]; public function __construct() { parent::__construct($this->stopWords); } } FeatureExtraction/StopWords/French.php 0000644 00000002473 15152065050 0014067 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\FeatureExtraction\StopWords; use Phpml\FeatureExtraction\StopWords; final class French extends StopWords { /** * @var array */ protected $stopWords = [ 'alors', 'au', 'aucuns', 'aussi', 'autre', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment', 'dans', 'des', 'du', 'dedans', 'dehors', 'depuis', 'devrait', 'doit', 'donc', 'dos', 'début', 'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'fait', 'faites', 'fois', 'font', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la', 'le', 'les', 'leur', 'là', 'ma', 'maintenant', 'mais', 'mes', 'mine', 'moins', 'mon', 'mot', 'même', 'ni', 'nommés', 'notre', 'nous', 'ou', 'où', 'par', 'parce', 'pas', 'peut', 'peu', 'plupart', 'pour', 'pourquoi', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui', 'sa', 'sans', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta', 'tandis', 'tellement', 'tels', 'tes', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'voient', 'vont', 'votre', 'vous', 'vu', 'ça', 'étaient', 'état', 'étions', 'été', 'être', ]; public function __construct() { parent::__construct($this->stopWords); } } FeatureExtraction/StopWords/English.php 0000644 00000003627 15152065050 0014255 0 ustar 00 <?php declare(strict_types=1); namespace Phpml\FeatureExtraction\StopWords; use Phpml\FeatureExtraction\StopWords; final class English extends StopWords { /** * @var array */ protected $stopWords = [ 'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'aren\'t', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could', 'couldn\'t', 'did', 'didn\'t', 'do', 'does', 'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn\'t', 'has', 'hasn\'t', 'have', 'haven\'t', 'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if', 'in', 'into', 'is', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'let\'s', 'me', 'more', 'most', 'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'oursourselves', 'out', 'over', 'own', 'same', 'shan\'t', 'she', 'she\'d', 'she\'ll', 'she\'s', 'should', 'shouldn\'t', 'so', 'some', 'such', 'than', 'that', 'that\'s', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'there\'s', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re', 'they\'ve', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'were', 'weren\'t', 'what', 'what\'s', 'when', 'when\'s', 'where', 'where\'s', 'which', 'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with', 'won\'t', 'would', 'wouldn\'t', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves', ]; public function __construct() { parent::__construct($this->stopWords); } }
| ver. 1.4 |
Github
|
.
| PHP 7.4.33 | ���֧ߧ֧�ѧ�ڧ� ����ѧߧڧ��: 0 |
proxy
|
phpinfo
|
���ѧ����ۧܧ�