Xgb datasets adding #60


Merged — 39 commits, Apr 26, 2021
Changes from 1 commit
62f87c3
Applied mypy + flake8 for all files
Mar 22, 2021
132d73f
Sorted imports with ISort
Mar 22, 2021
4aa4898
Moved env change to runner
Mar 22, 2021
5a8db33
fixed all mypy errors and added mypy check to CI
Mar 22, 2021
5594efd
Yet another mypy fixes
Mar 22, 2021
35b55b8
Small runner refactoring
Mar 23, 2021
56de8f7
First attempt of adding nvidia datasets
Mar 29, 2021
0ee5f05
Merge branch 'master' into mypy-applying
Mar 29, 2021
04e7a64
removed E265 ignoring for flake8 job
Mar 29, 2021
8268747
Merge remote-tracking branch 'my/mypy-applying' into xgb-nvidia-datasets
Mar 30, 2021
b6a7eb0
NVidia benchmarks are working now
Mar 30, 2021
7e780bb
Added higgs, msrank and airline fetching
Mar 30, 2021
670c289
small fixes of env
Mar 30, 2021
dc0e9c9
Applying comments
Apr 1, 2021
f64ae68
Merge branch 'mypy-applying' into xgb-nvidia-datasets
Apr 1, 2021
873754b
Split dataset loading to different files
Apr 1, 2021
93ea32d
Merge remote-tracking branch 'origin/master' into xgb-nvidia-datasets
Apr 1, 2021
dcfc5b9
Why doesnt mypy work?
Apr 1, 2021
340402e
Added abalone + letters, updated all GB configs
Apr 15, 2021
6e47423
Added links and descriptions for new datasets
Apr 15, 2021
340a628
Merge remote-tracking branch 'origin/master' into xgb-nvidia-datasets
Apr 15, 2021
4be3720
handling mypy
Apr 15, 2021
8184016
Handled skex fake message throwing
Apr 15, 2021
cf5ee76
Trying to handle mypy, at. 3
Apr 15, 2021
9db3177
Trying to handle mypy, at. 4
Apr 15, 2021
5e76a0b
Trying to handle mypy, at. 5
Apr 15, 2021
13fcd20
Changed configs readme and made small fixes in GB testing configs
Apr 20, 2021
0873f97
Merge branch 'master' of https://github.com/IntelPython/scikit-learn_…
Apr 20, 2021
877e0fd
Applying more comments, updating readme's
Apr 20, 2021
8bdc7f2
Applying comments: renamed configs
Apr 20, 2021
f9cf09b
Changed all datasets to npy, applied Kirill's comments
Apr 23, 2021
41e003f
Merge branch 'master' of https://github.com/IntelPython/scikit-learn_…
Apr 23, 2021
523df30
Cleanup after someone's commit
Apr 23, 2021
59303fa
Applying mypy
Apr 23, 2021
b56e42c
Applied Ekaterina's suggestions
Apr 23, 2021
ad176e5
Applied other Ekaterina's comments
Apr 23, 2021
b92a27f
Merge branch 'xgb-nvidia-datasets' of https://github.com/RukhovichIV/…
Apr 23, 2021
11a8ffc
Final commits applying
Apr 26, 2021
37d5461
Alexander's final comments
Apr 26, 2021
Changed configs readme and made small fixes in GB testing configs
Igor Rukhovich committed Apr 20, 2021
commit 13fcd20204a2aeecc1569d8f061fdeaca1ea2e1e
60 changes: 30 additions & 30 deletions configs/README.md
@@ -1,4 +1,4 @@
-## Config JSON Schema
+## Config JSON Schema

 Configure benchmarks by editing the `config.json` file.
 You can configure some algorithm parameters, datasets, a list of frameworks to use, and the usage of some environment variables.
@@ -11,57 +11,57 @@ Refer to the tables below for descriptions of all fields in the configuration fi
 - [Training Object](#training-object)
 - [Testing Object](#testing-object)

-### Root Config Object
+### Root Config Object
 | Field Name | Type | Description |
 | ----- | ---- |------------ |
 |omp_env| array[string] | For xgboost only. Specify an environment variable to set the number of omp threads |
 |common| [Common Object](#common-object)| **REQUIRED** common benchmarks setting: frameworks and input data settings |
-|cases| array[[Case Object](#case-object)] | **REQUIRED** list of algorithms, their parameters and training data |
+|cases| List[[Case Object](#case-object)] | **REQUIRED** list of algorithms, their parameters and training data |

-### Common Object
+### Common Object

 | Field Name | Type | Description |
 | ----- | ---- |------------ |
 |lib| array[string] | **REQUIRED** list of test frameworks. It can be *sklearn*, *daal4py*, *cuml* or *xgboost* |
-|data-format| array[string] | **REQUIRED** input data format. Data formats: *numpy*, *pandas* or *cudf* |
-|data-order| array[string] | **REQUIRED** input data order. Data order: *C* (row-major, default) or *F* (column-major) |
-|dtype| array[string] | **REQUIRED** input data type. Data type: *float64* (default) or *float32* |
-|check-finitness| array[] | Check finiteness in sklearn input check(disabled by default) |
+|data-format| Union[str, List[str]] | **REQUIRED** input data format. Data formats: *numpy*, *pandas* or *cudf* |
+|data-order| Union[str, List[str]] | **REQUIRED** input data order. Data order: *C* (row-major, default) or *F* (column-major) |
+|dtype| Union[str, List[str]] | **REQUIRED** input data type. Data type: *float64* (default) or *float32* |
+|check-finitness| List[] | Check finiteness in sklearn input check(disabled by default) |

-### Case Object
+### Case Object

 | Field Name | Type | Description |
 | ----- | ---- |------------ |
-|lib| array[string] | **REQUIRED** list of test frameworks. It can be *sklearn*, *daal4py*, *cuml* or *xgboost*|
-|algorithm| string | **REQUIRED** benchmark name |
-|dataset| array[[Dataset Object](#dataset-object)] | **REQUIRED** input data specifications. |
-|benchmark parameters| array[Any] | **REQUIRED** algorithm parameters. a list of supported parameters can be found here |
+|lib| Union[str, List[str]] | **REQUIRED** Test framework or list of frameworks. Must be from [*sklearn*, *daal4py*, *cuml* or *xgboost*] |
+|algorithm| string | **REQUIRED** benchmark file name. |
+|dataset| List[[Dataset Object](#dataset-object)] | **REQUIRED** input data specifications. |
+|**specific algorithm parameters**| Union[int, float, str, List[int], List[float], List[str]] | other specific algorithm parameters. The list of supported parameters can be found here |

-### Dataset Object
+#### **Important:** feel free to move any parameter from **cases** to **common** section since this parameter is common for all cases
+
+### Dataset Object

 | Field Name | Type | Description |
 | ----- | ---- |------------ |
-|source| string | **REQUIRED** data source. It can be *synthetic* or *csv* |
-|type| string | **REQUIRED** for synthetic data only. The type of task for which the dataset is generated. It can be *classification*, *blobs* or *regression* |
+|source| string | **REQUIRED** data source. It can be *synthetic*, *csv* or *npy* |
+|type| string | **REQUIRED for synthetic data**. The type of task for which the dataset is generated. It can be *classification*, *blobs* or *regression* |
 |n_classes| int | For *synthetic* data and for *classification* type only. The number of classes (or labels) of the classification problem |
 |n_clusters| int | For *synthetic* data and for *blobs* type only. The number of centers to generate |
-|n_features| int | **REQUIRED** For *synthetic* data only. The number of features to generate |
-|name| string | Name of dataset |
-|training| [Training Object](#training-object) | **REQUIRED** algorithm parameters. a list of supported parameters can be found here |
-|testing| [Testing Object](#testing-object) | **REQUIRED** algorithm parameters. a list of supported parameters can be found here |
+|n_features| int | **REQUIRED for *synthetic* data**. The number of features to generate |
+|name| string | Name of the dataset |
+|training| [Training Object](#training-object) | **REQUIRED** An object with training dataset paths |
+|testing| [Testing Object](#testing-object) | An object with testing dataset paths. If not provided, training datasets are used |

-### Training Object
+### Training Object

 | Field Name | Type | Description |
 | ----- | ---- |------------ |
-| n_samples | int | The total number of the training points |
-| x | str | The path to the training samples |
-| y | str | The path to the training labels |
+| n_samples | int | **REQUIRED** The total number of the training samples |
+| x | str | **REQUIRED** The path to the training samples |
+| y | str | **REQUIRED** The path to the training labels |

-### Testing Object
+### Testing Object

 | Field Name | Type | Description |
 | ----- | ---- |------------ |
-| n_samples | int | The total number of the testing points |
-| x | str | The path to the testing samples |
-| y | str | The path to the testing labels |
+| n_samples | int | **REQUIRED** The total number of the testing samples |
+| x | str | **REQUIRED** The path to the testing samples |
+| y | str | **REQUIRED** The path to the testing labels |
28 changes: 14 additions & 14 deletions configs/testing/daal4py_xgboost.json
@@ -1,20 +1,21 @@
 {
     "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"],
     "common": {
-        "lib": ["modelbuilders"],
-        "data-format": ["pandas"],
-        "data-order": ["F"],
-        "dtype": ["float32"]
+        "lib": "modelbuilders",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": "float32",
+        "algorithm": "xgb_mb",
+        "tree-method": "hist",
+        "count-dmatrix":""
     },
     "cases": [
         {
-            "algorithm": "xgb_mb",
             "dataset": [
                 {
-                    "source": "synthetic",
-                    "type": "classification",
-                    "n_classes": 5,
-                    "n_features": 10,
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 5,
+                    "n_features": 10,
                     "training": {
                         "n_samples": 100
                     },
@@ -23,10 +24,9 @@
                     }
                 }
             ],
-            "n-estimators": [10],
-            "tree-method": ["hist"],
-            "objective": ["multi:softprob"],
-            "max-depth": [8]
+            "n-estimators": 10,
+            "max-depth": 8,
+            "objective": "multi:softprob"
         }
     ]
 }
53 changes: 25 additions & 28 deletions configs/testing/xgboost.json
@@ -1,21 +1,21 @@
 {
     "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"],
     "common": {
-        "lib": ["xgboost"],
-        "data-format": ["pandas"],
-        "data-order": ["F"],
-        "dtype": ["float64"]
+        "lib": "xgboost",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": "float32",
+        "algorithm": "gbt",
+        "tree-method": "hist",
+        "count-dmatrix":""
     },
     "cases": [

         {
-            "algorithm": "gbt",
             "dataset": [
                 {
-                    "source": "synthetic",
-                    "type": "classification",
-                    "n_classes": 5,
-                    "n_features": 10,
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 5,
+                    "n_features": 10,
                     "training": {
                         "n_samples": 1000
                     },
@@ -24,21 +24,19 @@
                     }
                 }
             ],
-            "n-estimators": [50],
-            "objective": ["multi:softprob"],
-            "tree-method": ["hist"],
-            "max-depth": [7],
-            "subsample": [0.7],
-            "colsample-bytree": [0.7]
+            "n-estimators": 50,
+            "max-depth": 7,
+            "subsample": 0.7,
+            "colsample-bytree": 0.7,
+            "objective": "multi:softprob"
         },
         {
-            "algorithm": "gbt",
             "dataset": [
                 {
-                    "source": "synthetic",
-                    "type": "regression",
-                    "n_classes": 5,
-                    "n_features": 10,
+                    "source": "synthetic",
+                    "type": "regression",
+                    "n_classes": 5,
+                    "n_features": 10,
                     "training": {
                         "n_samples": 100
                     },
@@ -47,12 +45,11 @@
                     }
                 }
             ],
-            "n-estimators": [50],
-            "objective": ["reg:squarederror"],
-            "tree-method": ["hist"],
-            "max-depth": [8],
-            "learning-rate": [0.1],
-            "reg-alpha": [0.9]
+            "n-estimators": 50,
+            "max-depth": 8,
+            "learning-rate": 0.1,
+            "reg-alpha": 0.9,
+            "objective": "reg:squarederror"
         }
     ]
 }
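The README now types most case fields as `Union[value, List[value]]`, which is why this commit can replace one-element lists like `"max-depth": [7]` with plain scalars. One common way such fields are interpreted is to expand every list-valued parameter into a cartesian product of concrete runs. The sketch below illustrates that idea only; `expand_params` is a hypothetical helper, not the repository's actual runner code.

```python
import itertools

def expand_params(case):
    """Expand each list-valued parameter in a case dict into one concrete
    run per combination (cartesian product). Scalar values are treated as
    one-element lists. Illustrative only; not the repo's runner."""
    keys = list(case)
    value_lists = [v if isinstance(v, list) else [v] for v in case.values()]
    return [dict(zip(keys, combo)) for combo in itertools.product(*value_lists)]

# The old config style wrapped single values in lists; both spellings
# expand to the same single run:
old_style = {"n-estimators": [50], "max-depth": [7], "objective": "multi:softprob"}
assert expand_params(old_style) == [
    {"n-estimators": 50, "max-depth": 7, "objective": "multi:softprob"}
]

# A list with several values yields one run per combination:
sweep = {"n-estimators": [50, 100], "max-depth": [7, 8]}
assert len(expand_params(sweep)) == 4
```

Under this reading, the scalar form in the updated configs is purely cosmetic: it declares a single benchmark run without the visual noise of one-element lists.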