@@ -233,6 +233,51 @@ def gen_noisy_linear(size=10000, n_dim=100, n_valid=5, noise_scale=0.5, test_rat
233
233
return (x_train_noise , y_train ), (x_test , y_test )
234
234
return (x_train_noise , DataUtil .get_one_hot (y_train , 2 )), (x_test , DataUtil .get_one_hot (y_test , 2 ))
235
235
236
@staticmethod
def gen_noisy_poly(size=10000, p=3, n_dim=100, n_valid=5, noise_scale=0.5, test_ratio=0.15, one_hot=True):
    """
    Build a random binary classification problem with a polynomial decision rule.

    Standard-normal features are drawn for a train set of `size` rows and a
    test set of `int(size * test_ratio)` rows, each with `n_dim` columns.
    For every degree d in 1..p, `n_valid` columns are picked at random and the
    d-th element-wise power of those columns is combined with a random weight
    vector. A sample is labelled 1 when the sum of all per-degree terms is
    positive, 0 otherwise.

    Labels are computed from the clean features. The returned training
    features additionally carry Gaussian noise scaled by `noise_scale`; the
    returned test features are left clean.

    :param size: number of training rows
    :param p: highest polynomial degree (cast to int, must be > 1)
    :param n_dim: number of feature columns
    :param n_valid: number of columns used by each degree's term
    :param noise_scale: multiplier applied to the noise added to the training features
    :param test_ratio: test-set size as a fraction of `size`
    :param one_hot: when true, labels are passed through `DataUtil.get_one_hot(labels, 2)`
    :return: `((x_train, y_train), (x_test, y_test))`
    """
    p = int(p)
    assert p > 1, "p should be greater than 1"
    higher_degrees = range(2, p + 1)

    # NOTE: the order of the np.random calls below is part of the behaviour
    # (it fixes the output for a given seed), so it must not be shuffled.
    train_clean = np.random.randn(size, n_dim)
    train_powers = [train_clean]
    train_powers.extend(train_clean ** degree for degree in higher_degrees)
    train_features = train_clean + np.random.randn(size, n_dim) * noise_scale

    test_features = np.random.randn(int(size * test_ratio), n_dim)
    test_powers = [test_features]
    test_powers.extend(test_features ** degree for degree in higher_degrees)

    # One column subset and one weight vector per polynomial degree
    column_picks = [np.random.permutation(n_dim)[:n_valid] for _ in range(p)]
    weights = [np.random.randn(n_valid, 1) for _ in range(p)]

    def make_labels(powers):
        # Sum the per-degree linear responses, then threshold at zero
        terms = [
            features[..., columns].dot(weight)
            for features, columns, weight in zip(powers, column_picks, weights)
        ]
        return (np.sum(terms, axis=0) > 0).astype(np.int8).ravel()

    y_train = make_labels(train_powers)
    y_test = make_labels(test_powers)

    if one_hot:
        return (
            (train_features, DataUtil.get_one_hot(y_train, 2)),
            (test_features, DataUtil.get_one_hot(y_test, 2)),
        )
    return (train_features, y_train), (test_features, y_test)
254
+
255
@staticmethod
def gen_special_linear(size=10000, n_dim=10, n_redundant=3, n_categorical=3,
                       cv_ratio=0.15, test_ratio=0.15, one_hot=True):
    """
    Build a linearly separable binary problem padded with uninformative columns.

    Each sample is made of three horizontally stacked parts:

    * `n_dim` standard-normal columns - the only ones the labels depend on;
    * `n_redundant` columns that are constant within a split
      (one random integer per column);
    * `n_categorical` columns of random integers.

    Labels are 1 where the normal columns dotted with a random weight vector
    are positive, 0 otherwise. The last `int(size * cv_ratio)` generated
    training rows are split off as the cross-validation set.

    NOTE(review): train and test draw the redundant / categorical columns from
    different integer ranges (train: [0, 3) and [3, 8); test: [3, 6) and
    [0, 5)). This looks deliberate (a shift between splits) - confirm.

    :param size: number of rows generated before the train / cv split
    :param n_dim: number of informative normal columns
    :param n_redundant: number of constant columns
    :param n_categorical: number of random-integer columns
    :param cv_ratio: fraction of `size` moved from the training rows into the cv set
    :param test_ratio: test-set size as a fraction of `size`
    :param one_hot: when true, labels are passed through `DataUtil.get_one_hot(labels, 2)`
    :return: `((x_train, y_train), (x_cv, y_cv), (x_test, y_test))`
    """
    # The order of the np.random calls is kept stable so seeded runs reproduce.
    x_train = np.random.randn(size, n_dim)
    # Broadcasting one integer per column yields columns constant across rows
    x_train_redundant = np.ones([size, n_redundant]) * np.random.randint(0, 3, n_redundant)
    x_train_categorical = np.random.randint(3, 8, [size, n_categorical])
    x_train_stacked = np.hstack([x_train, x_train_redundant, x_train_categorical])

    n_test = int(size * test_ratio)
    x_test = np.random.randn(n_test, n_dim)
    x_test_redundant = np.ones([n_test, n_redundant]) * np.random.randint(3, 6, n_redundant)
    x_test_categorical = np.random.randint(0, 5, [n_test, n_categorical])
    x_test_stacked = np.hstack([x_test, x_test_redundant, x_test_categorical])

    # Labels only look at the informative columns
    w = np.random.randn(n_dim, 1)
    y_train = (x_train.dot(w) > 0).astype(np.int8).ravel()
    y_test = (x_test.dot(w) > 0).astype(np.int8).ravel()

    # Split with an explicit index instead of `[:-n_cv]` / `[-n_cv:]`:
    # when n_cv == 0 those slices become `[:0]` / `[0:]`, which would empty
    # the training set and put every row into the cv set.
    n_cv = int(size * cv_ratio)
    split = max(size - n_cv, 0)
    x_train_stacked, x_cv_stacked = x_train_stacked[:split], x_train_stacked[split:]
    y_train, y_cv = y_train[:split], y_train[split:]

    if not one_hot:
        return (x_train_stacked, y_train), (x_cv_stacked, y_cv), (x_test_stacked, y_test)
    return (
        (x_train_stacked, DataUtil.get_one_hot(y_train, 2)),
        (x_cv_stacked, DataUtil.get_one_hot(y_cv, 2)),
        (x_test_stacked, DataUtil.get_one_hot(y_test, 2)),
    )
280
+
236
281
@staticmethod
237
282
def quantize_data (x , y , wc = None , continuous_rate = 0.1 , separate = False ):
238
283
if isinstance (x , list ):
0 commit comments