|
6 | 6 | use Phpml\Dataset\ArrayDataset; |
7 | 7 | use Phpml\FeatureExtraction\TokenCountVectorizer; |
8 | 8 | use Phpml\Tokenization\WordTokenizer; |
9 | | -use Phpml\CrossValidation\RandomSplit; |
| 9 | +use Phpml\CrossValidation\StratifiedRandomSplit; |
| 10 | +use Phpml\FeatureExtraction\TfIdfTransformer; |
10 | 11 | use Phpml\Metric\Accuracy; |
11 | 12 | use Phpml\Classification\SVC; |
12 | 13 | use Phpml\SupportVectorMachine\Kernel; |
13 | 14 |
|
14 | 15 | $dataset = new CsvDataset('data/languages.csv', 1); |
15 | 16 | $vectorizer = new TokenCountVectorizer(new WordTokenizer()); |
| 17 | +$tfIdfTransformer = new TfIdfTransformer(); |
16 | 18 |
|
17 | 19 | $samples = []; |
18 | 20 | foreach ($dataset->getSamples() as $sample) { |
19 | 21 | $samples[] = $sample[0]; |
20 | 22 | } |
21 | 23 |
|
| 24 | +$vectorizer->fit($samples); |
22 | 25 | $vectorizer->transform($samples); |
| 26 | + |
| 27 | +$tfIdfTransformer->fit($samples); |
| 28 | +$tfIdfTransformer->transform($samples); |
| 29 | + |
23 | 30 | $dataset = new ArrayDataset($samples, $dataset->getTargets()); |
24 | 31 |
|
25 | | -$randomSplit = new RandomSplit($dataset, 0.25); |
| 32 | +$randomSplit = new StratifiedRandomSplit($dataset, 0.1); |
26 | 33 |
|
27 | | -$classifier = new SVC(Kernel::RBF, 100); |
| 34 | +$classifier = new SVC(Kernel::RBF, 10000); |
28 | 35 | $classifier->train($randomSplit->getTrainSamples(), $randomSplit->getTrainLabels()); |
29 | 36 |
|
30 | 37 | $predictedLabels = $classifier->predict($randomSplit->getTestSamples()); |
|
0 commit comments