1
+ """Calculate data for learning rate calculation.
2
+
3
+ Taking data from sampling_learning_rate.py
4
+ Output may be processed by plot_learning_rate.py
5
+ """
1
6
# @Author: Joey Teng
2
7
# @Email: joey.teng.dev@gmail.com
3
8
# @Filename: learning_rate.py
22
27
23
28
24
29
def split_data_target (dataset ):
30
+ """Split the input CSV files into X, y vectors for sklearn implementations.
31
+
32
+ Args:
33
+ dataset (list): List of list of floats.
34
+ [
35
+ [0...n - 1]: X, feature vector
36
+ [-1]: y, label
37
+ ]
38
+
39
+ Returns:
40
+ tuple: (X, y) for sklearn implementations
41
+
42
+ """
25
43
try :
26
44
return ([[float (element )
27
45
for element in row .strip ().split (',' )[:- 1 ]]
@@ -34,15 +52,59 @@ def split_data_target(dataset):
34
52
35
53
36
54
def generate_training_sets(dataset, percentage, copies):
    """Resample the training pool to generate smaller training sets.

    Each new set is built by deep-copying ``dataset``, shuffling the copy and
    keeping its first ``percentage``% instances, so no instance appears more
    than once within a single new training set.

    Args:
        dataset (list): List of vectors (features + label).
        percentage (int or float): Size of each new training set, expressed
            as a percentage of ``len(dataset)``. The resulting instance count
            is rounded down.
        copies (int): The number of new training sets required.

    Returns:
        list: ``copies`` new training sets, each a list of vectors. The
            vectors are deep copies, so mutating them leaves ``dataset``
            untouched.

    """
    # Floor division involving a float yields a float, which is not a valid
    # slice bound; coerce to int so fractional percentages (e.g. 12.5) work.
    size = int(len(dataset) * percentage // 100)

    training_sets = []
    for _ in range(copies):
        # Shuffle a private copy so the caller's ordering is preserved.
        population = copy.deepcopy(dataset)
        random.shuffle(population)
        training_sets.append(population[:size])

    return training_sets
43
81
44
82
45
83
def generate_result (datasets , classifier , path ):
84
+ """Generate the learning rate accuracies.
85
+
86
+ Args:
87
+ datasets (dict): {
88
+ 'test set': testing set for the specific dataset
89
+ 'remainder': instances in the dataset but not testing set
90
+ }
91
+ classifier (func): a function that will return an instance of
92
+ sklearn classifier.
93
+ path (str): path of the dataset, for logging only.
94
+
95
+ Returns:
96
+ dict: dict of dict {
97
+ percentage: results under respective portion of training data {
98
+ 'raw' (list): raw accuracy values [
99
+ accuracy values of each training set-testing set pairs
100
+ ]
101
+ 'average': average of 'raw'
102
+ 'standard deviation': standard deviation of 'raw'
103
+ 'range': range of 'raw'
104
+ }
105
+ }
106
+
107
+ """
46
108
results = []
47
109
for dataset in datasets :
48
110
test_set = dataset ['test set' ]
@@ -84,11 +146,15 @@ def generate_result(datasets, classifier, path):
84
146
85
147
86
148
def RandomForestClassifier():
    """Build a new scikit-learn random forest with a fixed tree count.

    Takes no arguments so it can be passed around as a classifier factory.

    Returns:
        sklearn.ensemble.RandomForestClassifier: a freshly constructed
            classifier created with ``n_estimators=64``.

    """
    settings = {'n_estimators': 64}
    forest = sklearn.ensemble.RandomForestClassifier(**settings)
    return forest
88
151
89
152
90
153
def main (path ):
91
- """main"""
154
+ """Start main function here.
155
+
156
+ Run tasks and dump result files.
157
+ """
92
158
print ("{} Start" .format (path ), flush = True )
93
159
94
160
datasets = json .load (open (path , 'r' ))
@@ -103,6 +169,15 @@ def main(path):
103
169
104
170
105
171
def traverse (paths ):
172
+ """Traverse to append all files in children folders into the task queue.
173
+
174
+ Args:
175
+ paths (list): Paths of all folders to be detected
176
+
177
+ Returns:
178
+ list: Paths of all files added in the task queue
179
+
180
+ """
106
181
print ("Starting Traverse Through" , flush = True )
107
182
files = []
108
183
while paths :
@@ -119,6 +194,14 @@ def traverse(paths):
119
194
120
195
121
196
def parse_path ():
197
+ """Parse the arguments.
198
+
199
+ No argument is required for calling this function.
200
+
201
+ Returns:
202
+ Namespace: parsed arguments enclosed by an object defined in argparse
203
+
204
+ """
122
205
parser = argparse .ArgumentParser (
123
206
description = "Generate Datasets for Detecting Learning Rate" )
124
207
parser .add_argument ('-r' , action = 'store' , nargs = '+' , default = [],
0 commit comments