<?php

declare(strict_types=1);

/**
 * Example script: trains an SVM text classifier on data/spam.csv and prints
 * its accuracy on a held-out part of the data.
 *
 * Steps: load the CSV, turn each text sample into token counts, re-weight
 * the counts with TF-IDF, split into train/test sets, train an SVC and
 * score its predictions.
 *
 * Both 'vendor/autoload.php' and 'data/spam.csv' are relative paths, so the
 * script is expected to be started from the directory that contains them.
 */

namespace PhpmlExamples;

// require (not include): without the autoloader nothing below can work, so
// stop right here instead of failing later with a "class not found" error.
require_once 'vendor/autoload.php';

use Phpml\Classification\SVC;
use Phpml\CrossValidation\StratifiedRandomSplit;
use Phpml\Dataset\ArrayDataset;
use Phpml\Dataset\CsvDataset;
use Phpml\FeatureExtraction\TfIdfTransformer;
use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Metric\Accuracy;
use Phpml\SupportVectorMachine\Kernel;
use Phpml\Tokenization\WordTokenizer;

// Lift the memory limit for the duration of this script; the vectorized
// dataset is large.
ini_set('memory_limit', '-1');

echo 'Loading dataset...' . PHP_EOL;
// Second argument: number of feature columns to read from each CSV row.
$dataset = new CsvDataset('data/spam.csv', 1);
$vectorizer = new TokenCountVectorizer(new WordTokenizer());
$tfIdfTransformer = new TfIdfTransformer();

echo 'Extracting samples ...' . PHP_EOL;
// Each CSV sample is a one-element row; keep only that element so the
// vectorizer receives a flat list of samples.
$samples = array_column($dataset->getSamples(), 0);

echo 'Vectorizing samples ...' . PHP_EOL;
// The return values of transform() are not used: $samples itself is handed
// to the next step, so the transformers are relied on to rewrite it in place.
$vectorizer->fit($samples);
$vectorizer->transform($samples);

$tfIdfTransformer->fit($samples);
$tfIdfTransformer->transform($samples);

// Re-attach the original targets to the transformed samples.
$dataset = new ArrayDataset($samples, $dataset->getTargets());

// NOTE(review): 0.1 is taken to be the test-set fraction - confirm against
// the php-ml StratifiedRandomSplit documentation.
$randomSplit = new StratifiedRandomSplit($dataset, 0.1);

echo 'Training model ...' . PHP_EOL;
$classifier = new SVC(Kernel::RBF, 1000);
$classifier->train($randomSplit->getTrainSamples(), $randomSplit->getTrainLabels());

echo 'Performing prediction ...' . PHP_EOL;
$predictedLabels = $classifier->predict($randomSplit->getTestSamples());

echo 'Accuracy: ' . Accuracy::score($randomSplit->getTestLabels(), $predictedLabels) . PHP_EOL;