Skip to content

Commit 02294eb

Browse files
Sezekielakondas
authored and committed
Spam filter classification example (#17)
* Added spam dataset * Added spam dataset * complete sms spam classifier * Fix type and add info to readme
1 parent 7d74fa8 commit 02294eb

File tree

3 files changed

+5623
-0
lines changed

3 files changed

+5623
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ Classification:
2020

2121
* `languageDetection.php` - classifier built for language detection
2222
* `minst.php` - recognize handwritten digits from MNIST dataset (to download dataset use `bin/download-mnist.sh`)
23+
* `spamFilter.php` - simple spam filter with example dataset
2324

2425
Regression:
2526

classification/spamFilter.php

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
<?php

declare(strict_types=1);

/**
 * Spam filter example: loads data/spam.csv, turns each text sample into a
 * TF-IDF weighted token-count vector, trains an RBF-kernel SVC on a
 * stratified split of the data and prints the accuracy on the held-out part.
 *
 * Paths are relative, so the script is expected to be run from the directory
 * that contains both `vendor/` and `data/`.
 */

namespace PhpmlExamples;

// Fail fast when the Composer autoloader is missing: a plain `include` only
// raises a warning, after which the script would die later with a confusing
// "class not found" error.
require_once 'vendor/autoload.php';

use Phpml\Classification\SVC;
use Phpml\CrossValidation\StratifiedRandomSplit;
use Phpml\Dataset\ArrayDataset;
use Phpml\Dataset\CsvDataset;
use Phpml\FeatureExtraction\TfIdfTransformer;
use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Metric\Accuracy;
use Phpml\SupportVectorMachine\Kernel;
use Phpml\Tokenization\WordTokenizer;

// Temporarily lift the memory limit for such a large dataset.
ini_set('memory_limit', '-1');

echo 'Loading dataset...' . PHP_EOL;
// Second argument: number of feature columns to read from each CSV row.
$csvDataset = new CsvDataset('data/spam.csv', 1);
$vectorizer = new TokenCountVectorizer(new WordTokenizer());
$tfIdfTransformer = new TfIdfTransformer();

echo 'Extracting samples ...' . PHP_EOL;
// Each CSV sample is a one-element row; keep only its first (and only)
// feature so the vectorizer receives a flat list of values
// (presumably the raw message text - confirm against data/spam.csv).
$samples = array_column($csvDataset->getSamples(), 0);

echo 'Vectorizing samples ...' . PHP_EOL;
// Both transformers rewrite $samples in place; no return value is used.
$vectorizer->fit($samples);
$vectorizer->transform($samples);

$tfIdfTransformer->fit($samples);
$tfIdfTransformer->transform($samples);

// Re-attach the original labels to the vectorized samples.
$vectorizedDataset = new ArrayDataset($samples, $csvDataset->getTargets());

// 0.1 is the split size passed to php-ml (the test-set fraction per its
// documented split API).
$randomSplit = new StratifiedRandomSplit($vectorizedDataset, 0.1);

echo 'Training model ...' . PHP_EOL;
// RBF kernel; 1000 is the second constructor argument (the SVM cost
// parameter per php-ml's SVC signature).
$classifier = new SVC(Kernel::RBF, 1000);
$classifier->train($randomSplit->getTrainSamples(), $randomSplit->getTrainLabels());

echo 'Performing prediction ...' . PHP_EOL;
$predictedLabels = $classifier->predict($randomSplit->getTestSamples());

echo 'Accuracy: ' . Accuracy::score($randomSplit->getTestLabels(), $predictedLabels) . PHP_EOL;

0 commit comments

Comments
 (0)