Rechecked dataset configs

FalaahArifKhan · Sep 17, 2024 · cded508 · cded508
1 parent 10cab77
commit cded508
Show file tree

Hide file tree

Showing 6 changed files with 103 additions and 63 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,10 @@ results/*
 .pt_tmp
 lightning_logs
 saved_models
+cluster/run_baselines/sbatch_files
+cluster/run_exp1/sbatch_files
+cluster/run_exp2_3/sbatch_files
+cluster/run_mixed_exp/sbatch_files
 
 # Created by https://www.gitignore.io/api/python,pycharm+all
 # Edit at https://www.gitignore.io/?templates=python,pycharm+all

diff --git a/cluster/run_baselines/run-baselines.sh b/cluster/run_baselines/run-baselines.sh
@@ -1,11 +1,10 @@
 # Define the list of tuples (dataset_name, model_name, run_nums)
 declare -a job_configs=(
+    "bank gandalf_clf 6"
+    "diabetes gandalf_clf 1,2,3,4,5,6"
     "folk gandalf_clf 1,2"
     "folk gandalf_clf 3,4"
     "folk gandalf_clf 5,6"
-    "law_school gandalf_clf 6"
-    "bank gandalf_clf 2,4"
-    "bank gandalf_clf 6"
 )
 
 TEMPLATE_FILE="../cluster/run_baselines/run-baselines-template.sbatch"

diff --git a/cluster/run_exp1/run-exp1-models-gpu.sh b/cluster/run_exp1/run-exp1-models-gpu.sh
@@ -1,23 +1,8 @@
 # Define the list of tuples (dataset_name, null_imputer_name, evaluation_scenario, model_name, run_nums)
 declare -a job_configs=(
-    "heart deletion exp1_mcar3 gandalf_clf 1,2,3"
-    "heart deletion exp1_mcar3 gandalf_clf 4,5,6"
-    "heart deletion exp1_mar3 gandalf_clf 1,2,3"
-    "heart deletion exp1_mar3 gandalf_clf 4,5,6"
-    "heart deletion exp1_mnar3 gandalf_clf 1,2,3"
-    "heart deletion exp1_mnar3 gandalf_clf 4,5,6"
-    "heart median-mode exp1_mcar3 gandalf_clf 1,2,3"
-    "heart median-mode exp1_mcar3 gandalf_clf 4,5,6"
-    "heart median-mode exp1_mar3 gandalf_clf 1,2,3"
-    "heart median-mode exp1_mar3 gandalf_clf 4,5,6"
-    "heart median-mode exp1_mnar3 gandalf_clf 1,2,3"
-    "heart median-mode exp1_mnar3 gandalf_clf 4,5,6"
-    "heart median-dummy exp1_mcar3 gandalf_clf 1,2,3"
-    "heart median-dummy exp1_mcar3 gandalf_clf 4,5,6"
-    "heart median-dummy exp1_mar3 gandalf_clf 1,2,3"
-    "heart median-dummy exp1_mar3 gandalf_clf 4,5,6"
-    "heart median-dummy exp1_mnar3 gandalf_clf 1,2,3"
-    "heart median-dummy exp1_mnar3 gandalf_clf 4,5,6"
+    "heart deletion exp1_mar3 gandalf_clf 3"
+    "heart deletion exp1_mcar3 gandalf_clf 6"
+    "heart deletion exp1_mnar3 gandalf_clf 3,6"
 )
 
 TEMPLATE_FILE="../cluster/run_exp1/run-exp1-models-gpu-template.sbatch"

diff --git a/cluster/run_exp1/run-exp1-models.sh b/cluster/run_exp1/run-exp1-models.sh
@@ -9,14 +9,9 @@ declare -a job_configs=(
     "diabetes median-dummy exp1_mcar3 gandalf_clf 1,2,3,4,5,6"
     "diabetes median-dummy exp1_mar3 gandalf_clf 1,2,3,4,5,6"
     "diabetes median-dummy exp1_mnar3 gandalf_clf 1,2,3,4,5,6"
-    "german deletion exp1_mcar3 gandalf_clf 1,2,3,4,5,6"
-    "german deletion exp1_mar3 gandalf_clf 1,2,3,4,5,6"
-    "german deletion exp1_mnar3 gandalf_clf 1,2,3,4,5,6"
-    "german median-mode exp1_mcar3 gandalf_clf 1,2,3,4,5,6"
-    "german median-mode exp1_mar3 gandalf_clf 1,2,3,4,5,6"
-    "german median-mode exp1_mnar3 gandalf_clf 1,2,3,4,5,6"
-    "german median-dummy exp1_mcar3 gandalf_clf 1,2,3,4,5,6"
-    "german median-dummy exp1_mar3 gandalf_clf 1,2,3,4,5,6"
+    "german deletion exp1_mcar3 gandalf_clf 2,3,4,5,6"
+    "german median-mode exp1_mnar3 gandalf_clf 3,4,5,6"
+    "german median-dummy exp1_mcar3 gandalf_clf 4,5,6"
     "german median-dummy exp1_mnar3 gandalf_clf 1,2,3,4,5,6"
     "folk deletion exp1_mcar3 gandalf_clf 1,2"
     "folk deletion exp1_mcar3 gandalf_clf 3,4"
@@ -45,41 +40,10 @@ declare -a job_configs=(
     "folk median-dummy exp1_mnar3 gandalf_clf 1,2"
     "folk median-dummy exp1_mnar3 gandalf_clf 3,4"
     "folk median-dummy exp1_mnar3 gandalf_clf 5,6"
-    "law_school deletion exp1_mcar3 gandalf_clf 1,2"
-    "law_school deletion exp1_mcar3 gandalf_clf 3,4"
-    "law_school deletion exp1_mcar3 gandalf_clf 5,6"
     "law_school deletion exp1_mar3 gandalf_clf 1,2"
     "law_school deletion exp1_mar3 gandalf_clf 3,4"
     "law_school deletion exp1_mar3 gandalf_clf 5,6"
-    "law_school deletion exp1_mnar3 gandalf_clf 1,2"
-    "law_school deletion exp1_mnar3 gandalf_clf 3,4"
-    "law_school deletion exp1_mnar3 gandalf_clf 5,6"
-    "law_school median-mode exp1_mcar3 gandalf_clf 1,2"
-    "law_school median-mode exp1_mcar3 gandalf_clf 3,4"
-    "law_school median-mode exp1_mcar3 gandalf_clf 5,6"
-    "law_school median-mode exp1_mar3 gandalf_clf 1,2"
-    "law_school median-mode exp1_mar3 gandalf_clf 3,4"
-    "law_school median-mode exp1_mar3 gandalf_clf 5,6"
-    "law_school median-mode exp1_mnar3 gandalf_clf 1,2"
-    "law_school median-mode exp1_mnar3 gandalf_clf 3,4"
-    "law_school median-mode exp1_mnar3 gandalf_clf 5,6"
-    "law_school median-dummy exp1_mcar3 gandalf_clf 1,2"
-    "law_school median-dummy exp1_mcar3 gandalf_clf 3,4"
-    "law_school median-dummy exp1_mcar3 gandalf_clf 5,6"
-    "law_school median-dummy exp1_mar3 gandalf_clf 1,2"
-    "law_school median-dummy exp1_mar3 gandalf_clf 3,4"
-    "law_school median-dummy exp1_mar3 gandalf_clf 5,6"
-    "law_school median-dummy exp1_mnar3 gandalf_clf 1,2"
-    "law_school median-dummy exp1_mnar3 gandalf_clf 3,4"
-    "law_school median-dummy exp1_mnar3 gandalf_clf 5,6"
-    "bank deletion exp1_mcar3 gandalf_clf 1,2"
-    "bank deletion exp1_mcar3 gandalf_clf 3,4"
-    "bank deletion exp1_mcar3 gandalf_clf 5,6"
-    "bank deletion exp1_mar3 gandalf_clf 1,2"
-    "bank deletion exp1_mar3 gandalf_clf 3,4"
-    "bank deletion exp1_mar3 gandalf_clf 5,6"
-    "bank deletion exp1_mnar3 gandalf_clf 1,2"
-    "bank deletion exp1_mnar3 gandalf_clf 3,4"
+    "bank deletion exp1_mar3 gandalf_clf 2"
     "bank deletion exp1_mnar3 gandalf_clf 5,6"
     "bank median-mode exp1_mcar3 gandalf_clf 1,2"
     "bank median-mode exp1_mcar3 gandalf_clf 3,4"

diff --git a/cluster/useful_mongo_queries.js b/cluster/useful_mongo_queries.js
@@ -0,0 +1,89 @@
+// ==============================================================
+// Find all successfully executed experiments for a specific model
+// ==============================================================
+db.exp_nulls_data_cleaning.aggregate([
+  {
+    $match: {
+      model_name: 'gandalf_clf',
+      tag: 'OK'
+    }
+  },
+  {
+    $group: {
+      _id: {
+        dataset_name: "$dataset_name",
+        null_imputer_name: "$null_imputer_name",
+        evaluation_scenario: "$evaluation_scenario",
+        model_name: "$model_name",
+        experiment_iteration: "$experiment_iteration"
+      }
+    }
+  },
+  {
+    $sort: {
+      "_id.dataset_name": 1,
+      "_id.null_imputer_name": 1,
+      "_id.evaluation_scenario": 1,
+      "_id.experiment_iteration": 1
+    }
+  },
+  {
+    $project: {
+      _id: 0,
+      dataset_name: "$_id.dataset_name",
+      null_imputer_name: "$_id.null_imputer_name",
+      evaluation_scenario: "$_id.evaluation_scenario",
+      model_name: "$_id.model_name",
+      experiment_iteration: "$_id.experiment_iteration"
+    }
+  }
+]);
+
+
+// ==========================================================================================================
+// Find the best models by F1 for a specific dataset in each experiment iteration.
+// If several models tie on the maximum F1, all of them are displayed for that experiment iteration.
+// ==========================================================================================================
+db.exp_nulls_data_cleaning.aggregate([
+  {
+    $match: { dataset_name: "law_school", evaluation_scenario: "baseline", model_name: {$ne: "tabpfn_clf"}, metric: "F1", subgroup: "overall", tag: "OK" }
+  },
+  {
+    $group: {
+      _id: "$experiment_iteration",                 // Group by experiment iteration
+      maxTotalValue: { $max: "$metric_value" }      // Find the maximum metric_value (F1) in each group
+    }
+  },
+  {
+    $lookup: {                                     // Use $lookup to join original documents back to max values
+      from: "exp_nulls_data_cleaning",             // Perform a self-join
+      let: { groupId: "$_id", maxValue: "$maxTotalValue" },
+      pipeline: [
+        { $match: {
+            $expr: {
+              $and: [
+                { $eq: ["$experiment_iteration", "$$groupId"] },     // Match group
+                { $eq: ["$metric_value", "$$maxValue"] },            // Match max value
+                { $eq: ["$dataset_name", "law_school"] },
+                { $eq: ["$evaluation_scenario", "baseline"] },
+                { $eq: ["$metric", "F1"] },
+                { $eq: ["$subgroup", "overall"] },
+                { $eq: ["$tag", "OK"] },
+                { $ne: ["$model_name", "tabpfn_clf"] }
+              ]
+            }
+        }}
+      ],
+      as: "maxValueDocs"
+    }
+  },
+  {
+    $unwind: "$maxValueDocs"                        // Unwind the array to return documents
+  },
+  {
+    $replaceRoot: { newRoot: "$maxValueDocs" }      // Replace root to return original documents
+  },
+  {
+    $project: { _id: 0, dataset_split_seed: 0, exp_pipeline_guid: 0, model_params: 0, model_init_seed: 0, null_imputer_name: 0 }   // Exclude fields that are not needed in the output
+  }
+]);
diff --git a/configs/yaml_files/diabetes_config.yaml b/configs/yaml_files/diabetes_config.yaml
@@ -1,6 +1,5 @@
 dataset_name: diabetes
 bootstrap_fraction: 0.8
-n_estimators: 5
-#n_estimators: 50
+n_estimators: 50
 computation_mode: error_analysis
 sensitive_attributes_dct: {'Gender': 'Female'}