From d4e91ee905f8a221b3206c2ac758097b8ba5c190 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Tue, 3 Dec 2024 14:14:27 -0500 Subject: [PATCH 1/5] update rhelai 1.2 to 1.3 Signed-off-by: Michael Clifford --- importer-pipeline.yaml | 2 +- pipeline.yaml | 12 ++++++------ training/components.py | 2 +- utils/consts.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/importer-pipeline.yaml b/importer-pipeline.yaml index c4486c7e..b34e0d97 100644 --- a/importer-pipeline.yaml +++ b/importer-pipeline.yaml @@ -32,7 +32,7 @@ deploymentSpec: env: - name: REGISTRY_AUTH_FILE value: /mnt/containers/.dockerconfigjson - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 pipelineInfo: description: Helper pipeline to the InstructLab pipeline which allows users to seed/import a new base model diff --git a/pipeline.yaml b/pipeline.yaml index 23b8999c..38e42e69 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -645,7 +645,7 @@ deploymentSpec: \ max_seq_len=train_args.max_seq_len,\n chat_tmpl_path=train_args.chat_tmpl_path,\n\ \ )\n )\n\n data_processing(train_args=skill_training_args)\n\ \ data_processing(train_args=knowledge_training_args)\n\n" - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 exec-deletepvc: container: image: argostub/deletepvc @@ -744,7 +744,7 @@ deploymentSpec: \ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\ \ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\ \ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\ - Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.2\"\ + Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.3\"\ \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ \ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\ @@ -951,7 +951,7 @@ deploymentSpec: \ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\ \ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\ \ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\ - Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.2\"\ + Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.3\"\ \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ \ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\ @@ -1376,7 +1376,7 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 resources: accelerator: count: '1' @@ -1512,7 +1512,7 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 resources: accelerator: count: '1' @@ -1569,7 +1569,7 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 exec-sdg-to-artifact-op: container: args: diff --git a/training/components.py b/training/components.py index a12ad747..3a007cd0 100644 --- a/training/components.py +++ b/training/components.py @@ -167,7 +167,7 @@ def list_phase1_final_model(): else: raise RuntimeError(f"Unsupported value of {phase_num=}") - image = "quay.io/redhat-et/ilab:1.2" + image = "quay.io/redhat-et/ilab:1.3" manifest = inspect.cleandoc( f""" diff --git a/utils/consts.py b/utils/consts.py index f8116212..aad0d11f 100644 --- a/utils/consts.py +++ b/utils/consts.py @@ 
-1,4 +1,4 @@ PYTHON_IMAGE = "quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111" TOOLBOX_IMAGE = "registry.access.redhat.com/ubi9/toolbox" OC_IMAGE = "registry.redhat.io/openshift4/ose-cli" -RHELAI_IMAGE = "quay.io/redhat-et/ilab:1.2" +RHELAI_IMAGE = "quay.io/redhat-et/ilab:1.3" From ba97769b8be626cace865821d1231896e398a395 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Tue, 3 Dec 2024 19:46:26 -0500 Subject: [PATCH 2/5] wip: working on changes needed for RHELAI1.3 Signed-off-by: Michael Clifford --- pipeline.py | 1 + pipeline.yaml | 222 ++++++++++++++++++++--------------------- sdg/components.py | 8 +- training/components.py | 10 +- 4 files changed, 119 insertions(+), 122 deletions(-) diff --git a/pipeline.py b/pipeline.py index 5269088b..177179ed 100644 --- a/pipeline.py +++ b/pipeline.py @@ -271,6 +271,7 @@ def pipeline( ) data_processing_task.after(model_to_pvc_task, sdg_task) data_processing_task.set_caching_options(False) + data_processing_task.set_env_variable("XDG_CACHE_HOME", "/tmp") set_image_pull_secrets(data_processing_task, [IMAGE_PULL_SECRET]) diff --git a/pipeline.yaml b/pipeline.yaml index 38e42e69..e89b9f47 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -645,6 +645,9 @@ deploymentSpec: \ max_seq_len=train_args.max_seq_len,\n chat_tmpl_path=train_args.chat_tmpl_path,\n\ \ )\n )\n\n data_processing(train_args=skill_training_args)\n\ \ data_processing(train_args=knowledge_training_args)\n\n" + env: + - name: XDG_CACHE_HOME + value: /tmp image: quay.io/redhat-et/ilab:1.3 exec-deletepvc: container: @@ -773,29 +776,28 @@ deploymentSpec: \ --log_level=INFO \\\n \ \ --max_batch_len={max_batch_len} \\\n \ \ --seed={seed} \\\n --cpu_offload_optimizer\ - \ \\\n --cpu_offload_params \\\n \ - \ --distributed_training_framework fsdp \\\n \ - \ --is_granite \\\n --checkpoint_at_epoch\n\ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath:\ - \ /input_data\n name: input-data\n \ - \ readOnly: true\n - mountPath: /input_model\n\ - \ name: model\n readOnly:\ - \ true\n - mountPath: /output\n \ - \ name: output\n env:\n \ - \ - name: NNODES\n value: \\\"{nnodes}\\\"\n\ - \ - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\ + \ \\\n --cpu_offload_params_fsdp \\\n \ + \ --distributed_training_framework fsdp \\\n \ + \ --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n image:\ + \ {image}\n name: pytorch\n volumeMounts:\n\ + \ - mountPath: /input_data\n \ + \ name: input-data\n readOnly: true\n \ + \ - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n name: output\n\ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n \ + \ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\ + \"\n - name: XDG_CACHE_HOME\n \ + \ value: /tmp\n - name: TRITON_CACHE_DIR\n\ \ value: /tmp\n - name:\ - \ TRITON_CACHE_DIR\n value: /tmp\n \ - \ - name: HF_HOME\n value: /tmp\n \ - \ - name: TRANSFORMERS_CACHE\n \ - \ value: /tmp\n resources:\n \ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n limits:\n\ - \ cpu: 8\n \"nvidia.com/gpu\"\ + \ HF_HOME\n value: /tmp\n \ + \ - name: TRANSFORMERS_CACHE\n value: /tmp\n\ + \ resources:\n requests:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n \ + \ limits:\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n volumes:\n - name:\ \ input-data\n persistentVolumeClaim:\n \ \ claimName: {input_pvc_name}\n - name: model\n\ @@ -824,41 +826,39 @@ 
deploymentSpec: \ --save_samples={save_samples} \\\n \ \ --log_level=INFO \\\n --max_batch_len={max_batch_len}\ \ \\\n --seed={seed} \\\n \ - \ --cpu_offload_optimizer \\\n --cpu_offload_params\ + \ --cpu_offload_optimizer \\\n --cpu_offload_params_fsdp\ \ \\\n --distributed_training_framework fsdp\ - \ \\\n --is_granite \\\n \ - \ --checkpoint_at_epoch\n command:\n \ - \ - /bin/bash\n - '-c'\n \ - \ - '--'\n image: {image}\n \ - \ name: pytorch\n volumeMounts:\n \ - \ - mountPath: /input_data\n \ - \ name: input-data\n readOnly: true\n \ - \ - mountPath: /input_model\n name:\ - \ model\n readOnly: true\n \ - \ - mountPath: /output\n name: output\n \ - \ readOnly: true\n env:\n \ - \ - name: NNODES\n value: \\\ - \"{nnodes}\\\"\n - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n \ - \ - name: XDG_CACHE_HOME\n value: /tmp\n \ - \ - name: TRITON_CACHE_DIR\n \ - \ value: /tmp\n - name: HF_HOME\n \ - \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ + \ \\\n --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ + \ name: input-data\n readOnly:\ + \ true\n - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n \ + \ name: output\n readOnly: true\n \ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n - name: NPROC_PER_NODE\n\ + \ value: \\\"{nproc_per_node}\\\"\n \ + \ - name: XDG_CACHE_HOME\n value: /tmp\n\ + \ - name: TRITON_CACHE_DIR\n \ + \ value: /tmp\n - name: HF_HOME\n \ + \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ \ value: /tmp\n resources:\n\ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ limits:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ volumes:\n - name: input-data\n \ - \ persistentVolumeClaim:\n claimName: {input_pvc_name}\n\ - \ - name: model\n persistentVolumeClaim:\n\ - \ claimName: {model_pvc_name}\n \ - \ - name: output\n persistentVolumeClaim:\n \ - \ claimName: {output_pvc_name}\n \"\"\"\n )\n\ - \n try:\n manifest_yaml = yaml.safe_load(manifest)\n except\ - \ yaml.YAMLError as exc:\n raise RuntimeError(f\"Error parsing manifest:\ - \ {exc}\") from exc\n\n # Discover the namespace in which the pod is\ - \ running\n with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ + \ requests:\n \"nvidia.com/gpu\"\ + : {nproc_per_node}\n limits:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\ + \ - name: input-data\n persistentVolumeClaim:\n\ + \ claimName: {input_pvc_name}\n \ + \ - name: model\n persistentVolumeClaim:\n \ + \ claimName: {model_pvc_name}\n - name:\ + \ output\n persistentVolumeClaim:\n \ + \ claimName: {output_pvc_name}\n \"\"\"\n )\n\n try:\n\ + \ manifest_yaml = yaml.safe_load(manifest)\n except yaml.YAMLError\ + \ as exc:\n raise RuntimeError(f\"Error parsing manifest: {exc}\"\ + ) from exc\n\n # Discover the namespace in which the pod is running\n\ + \ with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ , \"r\", encoding=\"utf-8\"\n ) as f:\n namespace = f.read().strip()\n\ \ print(f\"The pod is running in the namespace: {namespace}\")\n\n\ \ try:\n kubernetes.config.load_kube_config()\n print(\"\ @@ -980,29 +980,28 @@ deploymentSpec: \ --log_level=INFO \\\n \ \ --max_batch_len={max_batch_len} \\\n \ \ --seed={seed} \\\n --cpu_offload_optimizer\ - \ \\\n --cpu_offload_params \\\n \ - \ --distributed_training_framework fsdp \\\n \ - \ --is_granite \\\n --checkpoint_at_epoch\n\ - \ command:\n - 
/bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath:\ - \ /input_data\n name: input-data\n \ - \ readOnly: true\n - mountPath: /input_model\n\ - \ name: model\n readOnly:\ - \ true\n - mountPath: /output\n \ - \ name: output\n env:\n \ - \ - name: NNODES\n value: \\\"{nnodes}\\\"\n\ - \ - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\ + \ \\\n --cpu_offload_params_fsdp \\\n \ + \ --distributed_training_framework fsdp \\\n \ + \ --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n image:\ + \ {image}\n name: pytorch\n volumeMounts:\n\ + \ - mountPath: /input_data\n \ + \ name: input-data\n readOnly: true\n \ + \ - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n name: output\n\ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n \ + \ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\ + \"\n - name: XDG_CACHE_HOME\n \ + \ value: /tmp\n - name: TRITON_CACHE_DIR\n\ \ value: /tmp\n - name:\ - \ TRITON_CACHE_DIR\n value: /tmp\n \ - \ - name: HF_HOME\n value: /tmp\n \ - \ - name: TRANSFORMERS_CACHE\n \ - \ value: /tmp\n resources:\n \ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n limits:\n\ - \ cpu: 8\n \"nvidia.com/gpu\"\ + \ HF_HOME\n value: /tmp\n \ + \ - name: TRANSFORMERS_CACHE\n value: /tmp\n\ + \ resources:\n requests:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n \ + \ limits:\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n volumes:\n - name:\ \ input-data\n persistentVolumeClaim:\n \ \ claimName: {input_pvc_name}\n - name: model\n\ @@ -1031,41 +1030,39 @@ deploymentSpec: \ --save_samples={save_samples} \\\n \ \ --log_level=INFO \\\n --max_batch_len={max_batch_len}\ \ \\\n --seed={seed} \\\n \ - \ --cpu_offload_optimizer \\\n --cpu_offload_params\ + \ --cpu_offload_optimizer \\\n --cpu_offload_params_fsdp\ \ \\\n --distributed_training_framework fsdp\ - \ \\\n --is_granite \\\n \ - \ --checkpoint_at_epoch\n command:\n \ - \ - /bin/bash\n - '-c'\n \ - \ - '--'\n image: {image}\n \ - \ name: pytorch\n volumeMounts:\n \ - \ - mountPath: /input_data\n \ - \ name: input-data\n readOnly: true\n \ - \ - mountPath: /input_model\n name:\ - \ model\n readOnly: true\n \ - \ - mountPath: /output\n name: output\n \ - \ readOnly: true\n env:\n \ - \ - name: NNODES\n value: \\\ - \"{nnodes}\\\"\n - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n \ - \ - name: XDG_CACHE_HOME\n value: /tmp\n \ - \ - name: TRITON_CACHE_DIR\n \ - \ value: /tmp\n - name: HF_HOME\n \ - \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ + \ \\\n --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ + \ name: input-data\n readOnly:\ + \ true\n - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n \ + \ name: output\n readOnly: true\n \ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n - name: NPROC_PER_NODE\n\ + \ value: \\\"{nproc_per_node}\\\"\n \ + \ - name: XDG_CACHE_HOME\n value: /tmp\n\ + \ - name: TRITON_CACHE_DIR\n \ + \ value: /tmp\n - name: HF_HOME\n \ + \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ \ value: /tmp\n resources:\n\ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ limits:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ volumes:\n - name: input-data\n \ - \ persistentVolumeClaim:\n claimName: 
{input_pvc_name}\n\ - \ - name: model\n persistentVolumeClaim:\n\ - \ claimName: {model_pvc_name}\n \ - \ - name: output\n persistentVolumeClaim:\n \ - \ claimName: {output_pvc_name}\n \"\"\"\n )\n\ - \n try:\n manifest_yaml = yaml.safe_load(manifest)\n except\ - \ yaml.YAMLError as exc:\n raise RuntimeError(f\"Error parsing manifest:\ - \ {exc}\") from exc\n\n # Discover the namespace in which the pod is\ - \ running\n with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ + \ requests:\n \"nvidia.com/gpu\"\ + : {nproc_per_node}\n limits:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\ + \ - name: input-data\n persistentVolumeClaim:\n\ + \ claimName: {input_pvc_name}\n \ + \ - name: model\n persistentVolumeClaim:\n \ + \ claimName: {model_pvc_name}\n - name:\ + \ output\n persistentVolumeClaim:\n \ + \ claimName: {output_pvc_name}\n \"\"\"\n )\n\n try:\n\ + \ manifest_yaml = yaml.safe_load(manifest)\n except yaml.YAMLError\ + \ as exc:\n raise RuntimeError(f\"Error parsing manifest: {exc}\"\ + ) from exc\n\n # Discover the namespace in which the pod is running\n\ + \ with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ , \"r\", encoding=\"utf-8\"\n ) as f:\n namespace = f.read().strip()\n\ \ print(f\"The pod is running in the namespace: {namespace}\")\n\n\ \ try:\n kubernetes.config.load_kube_config()\n print(\"\ @@ -1556,8 +1553,9 @@ deploymentSpec: \ http_client=custom_http_client\n )\n else:\n client =\ \ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\ \ = \"main\" if repo_branch or (repo_pr and int(repo_pr) > 0) else \"empty\"\ - \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(read_taxonomy(taxonomy_path,\ - \ taxonomy_base))\n\n set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)\n\ + \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\ + \ read_taxonomy(\n taxonomy_path, taxonomy_base, document_output_dir=f\"\ + {sdg_path}/documents\"\n )\n )\n\n # sset_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)\n\ \n # generate_data has a magic word for its taxonomy_base argument -\ \ 'empty'\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ \ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\ diff --git a/sdg/components.py b/sdg/components.py index aa2cdfd9..a81acb01 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -73,9 +73,13 @@ def set_precomputed_skills_data_ratio(sampling_size: float): print("Generating synthetic dataset for:") print() - print(read_taxonomy(taxonomy_path, taxonomy_base)) + print( + read_taxonomy( + taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents" + ) + ) - set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size) + # sset_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size) # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: diff --git a/training/components.py b/training/components.py index 3a007cd0..8059bda4 100644 --- a/training/components.py +++ b/training/components.py @@ -211,9 +211,8 @@ def list_phase1_final_model(): --max_batch_len={max_batch_len} \ --seed={seed} \ --cpu_offload_optimizer \ - --cpu_offload_params \ + --cpu_offload_params_fsdp \ --distributed_training_framework fsdp \ - --is_granite \ --checkpoint_at_epoch command: 
- /bin/bash @@ -245,10 +244,8 @@ def list_phase1_final_model(): value: /tmp resources: requests: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} limits: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} volumes: - name: input-data @@ -292,9 +289,8 @@ def list_phase1_final_model(): --max_batch_len={max_batch_len} \ --seed={seed} \ --cpu_offload_optimizer \ - --cpu_offload_params \ + --cpu_offload_params_fsdp \ --distributed_training_framework fsdp \ - --is_granite \ --checkpoint_at_epoch command: - /bin/bash @@ -327,10 +323,8 @@ def list_phase1_final_model(): value: /tmp resources: requests: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} limits: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} volumes: - name: input-data From b235539466a276dddb53ac28d355b33f8d25bbe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Wed, 4 Dec 2024 10:10:18 +0100 Subject: [PATCH 3/5] fix: set_precomputed_skills_data_ratio if EACCES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 1.3, we cannot edit /usr/share/instructlab/sdg/default_data_recipes/skills.yaml, thus we had to make adjustments to override the SDG DataMixer class to pass a different skills file. Also, sdg_sampling_size is now optional in the pipeline. Signed-off-by: Sébastien Han --- pipeline.py | 2 +- pipeline.yaml | 95 ++++++++++++++++++++------- sdg/components.py | 134 +++++++++++++++++++++++++++++--------- standalone/standalone.py | 136 ++++++++++++++++++++++++++++++--------- 4 files changed, 281 insertions(+), 86 deletions(-) diff --git a/pipeline.py b/pipeline.py index 177179ed..64118bda 100644 --- a/pipeline.py +++ b/pipeline.py @@ -115,7 +115,7 @@ def pipeline( sdg_scale_factor: int = 2, # Renamed upstream https://github.com/instructlab/instructlab/blob/f7d40f6ed5112d59132dd832bd332fa6fbbe7010/src/instructlab/configuration.py#L279-L290 sdg_pipeline: str = SDG_PIPELINE, sdg_max_batch_len: int = MAX_BATCH_LEN, - sdg_sample_size: float = 1.0, + sdg_sample_size: float = None, # Training phase train_nproc_per_node: int = 3, train_nnodes: int = 2, diff --git a/pipeline.yaml b/pipeline.yaml index e89b9f47..d8cef59a 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -15,7 +15,7 @@ # sdg_repo_branch: str # sdg_repo_pr: int # sdg_repo_url: str [Default: 'https://github.com/instructlab/taxonomy.git'] -# sdg_sample_size: float [Default: 1.0] +# sdg_sample_size: float # sdg_scale_factor: int [Default: 2.0] # train_effective_batch_size_phase_1: int [Default: 3840.0] # train_effective_batch_size_phase_2: int [Default: 3840.0] @@ -522,7 +522,6 @@ components: isOptional: true parameterType: STRING sdg_sampling_size: - defaultValue: 1.0 isOptional: true parameterType: NUMBER_DOUBLE taxonomy_path: @@ -1536,32 +1535,81 @@ deploymentSpec: \ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\ \ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \ \ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\ - ,\n sdg_sampling_size: float = 1.0,\n):\n from os import getenv, path\n\ - \n import openai\n import yaml\n from instructlab.sdg import generate_data\n\ - \ from instructlab.sdg.utils.taxonomy import read_taxonomy\n\n def\ - \ set_precomputed_skills_data_ratio(sampling_size: float):\n skills_recipe\ - \ = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\n \ - \ if path.exists(skills_recipe):\n with open(skills_recipe,\ - \ \"r\") as file:\n skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\ - \n skills_yaml[\"datasets\"][0][\"sampling_size\"] = 
sampling_size\n\ - \n with open(skills_recipe, \"w\", encoding=\"utf-8\") as file:\n\ - \ yaml.dump(skills_yaml, file)\n\n api_key = getenv(\"\ - api_key\")\n model = getenv(\"model\")\n endpoint = getenv(\"endpoint\"\ - )\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"):\n import\ - \ httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\ + ,\n sdg_sampling_size: float = None,\n):\n from os import getenv,\ + \ path\n\n import instructlab.sdg\n import openai\n import yaml\n\ + \n api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n \ + \ endpoint = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\ + ):\n import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\ \ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\ \ http_client=custom_http_client\n )\n else:\n client =\ \ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\ \ = \"main\" if repo_branch or (repo_pr and int(repo_pr) > 0) else \"empty\"\ \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\ - \ read_taxonomy(\n taxonomy_path, taxonomy_base, document_output_dir=f\"\ - {sdg_path}/documents\"\n )\n )\n\n # sset_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)\n\ - \n # generate_data has a magic word for its taxonomy_base argument -\ - \ 'empty'\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ - \ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\ - \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n \ - \ taxonomy_base=taxonomy_base,\n model_name=model,\n pipeline=pipeline,\n\ - \ chunk_word_count=1000,\n server_ctx_size=4096,\n )\n\n" + \ instructlab.sdg.utils.taxonomy.read_taxonomy(\n taxonomy_path,\ + \ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n \ + \ )\n )\n\n # Generate synthetic dataset\n if sdg_sampling_size\ + \ is None:\n # generate_data has a magic word for its taxonomy_base\ + \ argument - 'empty'\n # it allows generating from the whole repo,\ + \ see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ + \ instructlab.sdg.generate_data(\n client=client,\n \ + \ num_instructions_to_generate=num_instructions_to_generate,\n\ + \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\ + \ taxonomy_base=taxonomy_base,\n model_name=model,\n\ + \ pipeline=pipeline,\n chunk_word_count=1000,\n \ + \ server_ctx_size=4096,\n )\n # Tweak precomputed skills\ + \ data ratio if needed\n else:\n skills_recipe = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\ + \n\n def set_precomputed_skills_data_ratio(sampling_size: float,\ + \ skills_recipe: str):\n if path.exists(skills_recipe):\n \ + \ with open(skills_recipe, \"r\", encoding=\"utf-8\") as file:\n\ + \ skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\ + \n skills_yaml[\"datasets\"][0][\"sampling_size\"] = sampling_size\n\ + \n with open(skills_recipe, \"w\", encoding=\"utf-8\") as\ + \ file:\n yaml.dump(skills_yaml, file)\n\n try:\n\ + \ set_precomputed_skills_data_ratio(\n sampling_size=sdg_sampling_size,\ + \ skills_recipe=skills_recipe\n )\n except PermissionError:\n\ + \ print(\"Failed to set precomputed skills data ratio: Permission\ + \ denied\")\n print(\"Attempting to override DataMixer class\ + \ to set the ratio\")\n import os\n import shutil\n\ + \ import tempfile\n\n 
import xdg_base_dirs\n\n \ + \ # Create a temporary directory\n with tempfile.TemporaryDirectory()\ + \ as temp_dir:\n # Create a default_data_recipes directory\n\ + \ temp_dir = path.join(temp_dir, \"default_data_recipes\"\ + )\n os.mkdir(temp_dir)\n\n # Copy default_data_recipes/skills.yaml\ + \ to the temporary directory\n shutil.copy(skills_recipe,\ + \ temp_dir)\n\n # Also copy the current pipeline directory\ + \ to the temporary directory - it's a small\n # directory\ + \ like 28KB\n # This isn't needed if the pipeline is either\ + \ \"full\" or \"simple\" but it's future-proofing\n data_dirs\ + \ = [\n os.path.join(str(dir), \"instructlab\", \"sdg\"\ + )\n for dir in xdg_base_dirs.xdg_data_dirs()\n \ + \ ]\n temp_pipeline_dir = path.join(temp_dir, \"\ + pipeline\")\n os.mkdir(temp_pipeline_dir)\n \ + \ for d in data_dirs:\n pipeline_path = os.path.join(d,\ + \ \"pipelines\", pipeline)\n if os.path.exists(pipeline_path):\n\ + \ shutil.copytree(pipeline_path, temp_pipeline_dir)\n\ + \ break\n\n # Build new skills.yaml\ + \ path\n new_skills_recipe = path.join(temp_dir, \"skills.yaml\"\ + )\n print(f\"New skills recipe path: {new_skills_recipe}\"\ + )\n\n # Override XDG_DATA_DIRS with the temporary directory\n\ + \ # This allows SDG to read the new skills.yaml since it's\ + \ looking into XDG_DATA_DIRS\n # and looks for a default_data_recipes\ + \ directory with a skills.yaml file\n os.environ[\"XDG_DATA_DIRS\"\ + ] = f\"{temp_dir}\"\n\n # Try to set the precomputed skills\ + \ data ratio again\n try:\n set_precomputed_skills_data_ratio(\n\ + \ sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe\n\ + \ )\n print(\"Successfully set precomputed\ + \ skills data ratio\")\n\n # generate_data has a magic\ + \ word for its taxonomy_base argument - 'empty'\n # it\ + \ allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ + \ instructlab.sdg.generate_data(\n \ + \ client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\ + \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\ + \ taxonomy_base=taxonomy_base,\n \ + \ model_name=model,\n pipeline=pipeline,\n\ + \ chunk_word_count=1000,\n \ + \ server_ctx_size=4096,\n )\n except\ + \ Exception as e:\n print(f\"Failed to set precomputed\ + \ skills data ratio: {e}\")\n raise\n\n" env: - name: HOME value: /tmp @@ -2091,7 +2139,6 @@ root: isOptional: true parameterType: STRING sdg_sample_size: - defaultValue: 1.0 description: SDG parameter. Represents the sdg skills recipe sampling size as percentage in decimal form. 
isOptional: true diff --git a/sdg/components.py b/sdg/components.py index a81acb01..941815b9 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -35,25 +35,13 @@ def sdg_op( repo_pr: Optional[int], taxonomy_path: str = "/data/taxonomy", sdg_path: str = "/data/sdg", - sdg_sampling_size: float = 1.0, + sdg_sampling_size: float = None, ): from os import getenv, path + import instructlab.sdg import openai import yaml - from instructlab.sdg import generate_data - from instructlab.sdg.utils.taxonomy import read_taxonomy - - def set_precomputed_skills_data_ratio(sampling_size: float): - skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" - if path.exists(skills_recipe): - with open(skills_recipe, "r") as file: - skills_yaml = yaml.load(file, Loader=yaml.Loader) - - skills_yaml["datasets"][0]["sampling_size"] = sampling_size - - with open(skills_recipe, "w", encoding="utf-8") as file: - yaml.dump(skills_yaml, file) api_key = getenv("api_key") model = getenv("model") @@ -74,27 +62,111 @@ def set_precomputed_skills_data_ratio(sampling_size: float): print("Generating synthetic dataset for:") print() print( - read_taxonomy( + instructlab.sdg.utils.taxonomy.read_taxonomy( taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents" ) ) - # sset_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size) - - # generate_data has a magic word for its taxonomy_base argument - 'empty' - # it allows generating from the whole repo, see: - # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 - generate_data( - client=client, - num_instructions_to_generate=num_instructions_to_generate, - output_dir=sdg_path, - taxonomy=taxonomy_path, - taxonomy_base=taxonomy_base, - model_name=model, - pipeline=pipeline, - chunk_word_count=1000, - server_ctx_size=4096, - ) + # Generate synthetic dataset + if sdg_sampling_size is None: + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + # Tweak precomputed skills data ratio if needed + else: + skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" + + def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): + if path.exists(skills_recipe): + with open(skills_recipe, "r", encoding="utf-8") as file: + skills_yaml = yaml.load(file, Loader=yaml.Loader) + + skills_yaml["datasets"][0]["sampling_size"] = sampling_size + + with open(skills_recipe, "w", encoding="utf-8") as file: + yaml.dump(skills_yaml, file) + + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=skills_recipe + ) + except PermissionError: + print("Failed to set precomputed skills data ratio: Permission denied") + print("Attempting to override DataMixer class to set the ratio") + import os + import shutil + import tempfile + + import xdg_base_dirs + + # Create a temporary directory + with tempfile.TemporaryDirectory() as temp_dir: + # Create a default_data_recipes directory + temp_dir = path.join(temp_dir, "default_data_recipes") + 
os.mkdir(temp_dir) + + # Copy default_data_recipes/skills.yaml to the temporary directory + shutil.copy(skills_recipe, temp_dir) + + # Also copy the current pipeline directory to the temporary directory - it's a small + # directory like 28KB + # This isn't needed if the pipeline is either "full" or "simple" but it's future-proofing + data_dirs = [ + os.path.join(str(dir), "instructlab", "sdg") + for dir in xdg_base_dirs.xdg_data_dirs() + ] + temp_pipeline_dir = path.join(temp_dir, "pipeline") + os.mkdir(temp_pipeline_dir) + for d in data_dirs: + pipeline_path = os.path.join(d, "pipelines", pipeline) + if os.path.exists(pipeline_path): + shutil.copytree(pipeline_path, temp_pipeline_dir) + break + + # Build new skills.yaml path + new_skills_recipe = path.join(temp_dir, "skills.yaml") + print(f"New skills recipe path: {new_skills_recipe}") + + # Override XDG_DATA_DIRS with the temporary directory + # This allows SDG to read the new skills.yaml since it's looking into XDG_DATA_DIRS + # and looks for a default_data_recipes directory with a skills.yaml file + os.environ["XDG_DATA_DIRS"] = f"{temp_dir}" + + # Try to set the precomputed skills data ratio again + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe + ) + print("Successfully set precomputed skills data ratio") + + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + except Exception as e: + print(f"Failed to set precomputed skills data ratio: {e}") + raise @dsl.container_component diff --git a/standalone/standalone.py b/standalone/standalone.py index 028ea995..7c1f41ed 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -1130,25 +1130,13 @@ def sdg_op( repo_pr: Optional[int], taxonomy_path: str = "/data/taxonomy", sdg_path: str = "/data/sdg", - sdg_sampling_size: float = 1.0, + sdg_sampling_size: float = None, ): from os import getenv, path + import instructlab.sdg import openai import yaml - from instructlab.sdg import generate_data - from instructlab.sdg.utils.taxonomy import read_taxonomy - - def set_precomputed_skills_data_ratio(sampling_size: float): - skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" - if path.exists(skills_recipe): - with open(skills_recipe, "r") as file: - skills_yaml = yaml.load(file, Loader=yaml.Loader) - - skills_yaml["datasets"][0]["sampling_size"] = sampling_size - - with open(skills_recipe, "w", encoding="utf-8") as file: - yaml.dump(skills_yaml, file) api_key = getenv("api_key") model = getenv("model") @@ -1168,24 +1156,112 @@ def set_precomputed_skills_data_ratio(sampling_size: float): print("Generating synthetic dataset for:") print() - print(read_taxonomy(taxonomy_path, taxonomy_base)) + print( + instructlab.sdg.utils.taxonomy.read_taxonomy( + taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents" + ) + ) - set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size) + # Generate synthetic dataset + if sdg_sampling_size is None: + # generate_data has a magic word for its taxonomy_base argument 
- 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + # Tweak precomputed skills data ratio if needed + else: + skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" - # generate_data has a magic word for its taxonomy_base argument - 'empty' - # it allows generating from the whole repo, see: - # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 - generate_data( - client=client, - num_instructions_to_generate=num_instructions_to_generate, - output_dir=sdg_path, - taxonomy=taxonomy_path, - taxonomy_base=taxonomy_base, - model_name=model, - pipeline=pipeline, - chunk_word_count=1000, - server_ctx_size=4096, - ) + def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): + if path.exists(skills_recipe): + with open(skills_recipe, "r", encoding="utf-8") as file: + skills_yaml = yaml.load(file, Loader=yaml.Loader) + + skills_yaml["datasets"][0]["sampling_size"] = sampling_size + + with open(skills_recipe, "w", encoding="utf-8") as file: + yaml.dump(skills_yaml, file) + + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=skills_recipe + ) + except PermissionError: + print("Failed to set precomputed skills data ratio: Permission denied") + print("Attempting to override DataMixer class to set the ratio") + import os + import shutil + import tempfile + + import xdg_base_dirs + + # Create a temporary directory + with tempfile.TemporaryDirectory() as temp_dir: + # Create a default_data_recipes directory + temp_dir = path.join(temp_dir, "default_data_recipes") + os.mkdir(temp_dir) + + # Copy default_data_recipes/skills.yaml to the temporary directory + shutil.copy(skills_recipe, temp_dir) + + # Also copy the current pipeline directory to the temporary directory - it's a small + # directory like 28KB + # This isn't needed if the pipeline is either "full" or "simple" but it's future-proofing + data_dirs = [ + os.path.join(str(dir), "instructlab", "sdg") + for dir in xdg_base_dirs.xdg_data_dirs() + ] + temp_pipeline_dir = path.join(temp_dir, "pipeline") + os.mkdir(temp_pipeline_dir) + for d in data_dirs: + pipeline_path = os.path.join(d, "pipelines", pipeline) + if os.path.exists(pipeline_path): + shutil.copytree(pipeline_path, temp_pipeline_dir) + break + + # Build new skills.yaml path + new_skills_recipe = path.join(temp_dir, "skills.yaml") + print(f"New skills recipe path: {new_skills_recipe}") + + # Override XDG_DATA_DIRS with the temporary directory + # This allows SDG to read the new skills.yaml since it's looking into XDG_DATA_DIRS + # and looks for a default_data_recipes directory with a skills.yaml file + os.environ["XDG_DATA_DIRS"] = f"{temp_dir}" + + # Try to set the precomputed skills data ratio again + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe + ) + print("Successfully set precomputed skills data ratio") + + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # 
https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + except Exception as e: + print(f"Failed to set precomputed skills data ratio: {e}") + raise """ exec_sdg_op_args = f""" sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", sdg_sampling_size={sdg_sampling_size}) From 262fb94ab94a2225b91108777e6b17134defcc77 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Thu, 5 Dec 2024 15:25:24 -0500 Subject: [PATCH 4/5] set copytree dirs_exist_ok to True in sdg op Signed-off-by: Michael Clifford --- pipeline.yaml | 21 ++++++++++++--------- sdg/components.py | 12 +++++++++--- standalone/standalone.py | 12 +++++++++--- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/pipeline.yaml b/pipeline.yaml index d8cef59a..9a73e564 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -1568,8 +1568,8 @@ deploymentSpec: \ set_precomputed_skills_data_ratio(\n sampling_size=sdg_sampling_size,\ \ skills_recipe=skills_recipe\n )\n except PermissionError:\n\ \ print(\"Failed to set precomputed skills data ratio: Permission\ - \ denied\")\n print(\"Attempting to override DataMixer class\ - \ to set the ratio\")\n import os\n import shutil\n\ + \ denied\")\n print(\"Attempting to move default data recipes\ + \ to temporary directory\")\n import os\n import shutil\n\ \ import tempfile\n\n import xdg_base_dirs\n\n \ \ # Create a temporary directory\n with tempfile.TemporaryDirectory()\ \ as temp_dir:\n # Create a default_data_recipes directory\n\ @@ -1586,19 +1586,22 @@ deploymentSpec: pipeline\")\n os.mkdir(temp_pipeline_dir)\n \ \ for d in data_dirs:\n pipeline_path = os.path.join(d,\ \ \"pipelines\", pipeline)\n if os.path.exists(pipeline_path):\n\ - \ shutil.copytree(pipeline_path, temp_pipeline_dir)\n\ - \ break\n\n # Build new skills.yaml\ - \ path\n new_skills_recipe = path.join(temp_dir, \"skills.yaml\"\ - )\n print(f\"New skills recipe path: {new_skills_recipe}\"\ - )\n\n # Override XDG_DATA_DIRS with the temporary directory\n\ + \ shutil.copytree(\n pipeline_path,\n\ + \ temp_pipeline_dir,\n \ + \ dirs_exist_ok=True,\n )\n \ + \ break\n\n # Build new skills.yaml path\n \ + \ new_skills_recipe = path.join(temp_dir, \"skills.yaml\")\n \ + \ print(f\"New skills recipe path: {new_skills_recipe}\")\n\n\ + \ # Override XDG_DATA_DIRS with the temporary directory\n\ \ # This allows SDG to read the new skills.yaml since it's\ \ looking into XDG_DATA_DIRS\n # and looks for a default_data_recipes\ \ directory with a skills.yaml file\n os.environ[\"XDG_DATA_DIRS\"\ ] = f\"{temp_dir}\"\n\n # Try to set the precomputed skills\ \ data ratio again\n try:\n set_precomputed_skills_data_ratio(\n\ \ sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe\n\ - \ )\n print(\"Successfully set precomputed\ - \ skills data ratio\")\n\n # generate_data has a magic\ + \ )\n print(\n \ + \ f\"Successfully set precomputed skills data ratio to {sdg_sampling_size}\"\ + \n )\n\n # generate_data has a magic\ \ word for its taxonomy_base argument - 'empty'\n # it\ \ 
allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ \ instructlab.sdg.generate_data(\n \ diff --git a/sdg/components.py b/sdg/components.py index 941815b9..40f0b4bd 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -103,7 +103,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): ) except PermissionError: print("Failed to set precomputed skills data ratio: Permission denied") - print("Attempting to override DataMixer class to set the ratio") + print("Attempting to move default data recipes to temporary directory") import os import shutil import tempfile @@ -131,7 +131,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): - shutil.copytree(pipeline_path, temp_pipeline_dir) + shutil.copytree( + pipeline_path, + temp_pipeline_dir, + dirs_exist_ok=True, + ) break # Build new skills.yaml path @@ -148,7 +152,9 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): set_precomputed_skills_data_ratio( sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe ) - print("Successfully set precomputed skills data ratio") + print( + f"Successfully set precomputed skills data ratio to {sdg_sampling_size}" + ) # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: diff --git a/standalone/standalone.py b/standalone/standalone.py index 7c1f41ed..f85c4764 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -1198,7 +1198,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): ) except PermissionError: print("Failed to set precomputed skills data ratio: Permission denied") - print("Attempting to override DataMixer class to set the ratio") + print("Attempting to move default data recipes to temporary directory") import os import shutil import tempfile @@ -1226,7 +1226,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): - shutil.copytree(pipeline_path, temp_pipeline_dir) + shutil.copytree( + pipeline_path, + temp_pipeline_dir, + dirs_exist_ok=True, + ) break # Build new skills.yaml path @@ -1243,7 +1247,9 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): set_precomputed_skills_data_ratio( sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe ) - print("Successfully set precomputed skills data ratio") + print( + f"Successfully set precomputed skills data ratio to {sdg_sampling_size}" + ) # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: From 46b91495d8e92d3e362137497662140a99174dc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Mon, 9 Dec 2024 14:18:32 +0100 Subject: [PATCH 5/5] fix: set a default to sdg_sample_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not use None since it is not supported by the pipeline. Use the default 1.0 and compare against it to determine whether we need to tweak it. 
Signed-off-by: Sébastien Han --- pipeline.py | 2 +- pipeline.yaml | 20 +++++++++++--------- sdg/components.py | 5 +++-- standalone/standalone.py | 5 +++-- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/pipeline.py b/pipeline.py index 64118bda..177179ed 100644 --- a/pipeline.py +++ b/pipeline.py @@ -115,7 +115,7 @@ def pipeline( sdg_scale_factor: int = 2, # Renamed upstream https://github.com/instructlab/instructlab/blob/f7d40f6ed5112d59132dd832bd332fa6fbbe7010/src/instructlab/configuration.py#L279-L290 sdg_pipeline: str = SDG_PIPELINE, sdg_max_batch_len: int = MAX_BATCH_LEN, - sdg_sample_size: float = None, + sdg_sample_size: float = 1.0, # Training phase train_nproc_per_node: int = 3, train_nnodes: int = 2, diff --git a/pipeline.yaml b/pipeline.yaml index 9a73e564..823d7567 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -15,7 +15,7 @@ # sdg_repo_branch: str # sdg_repo_pr: int # sdg_repo_url: str [Default: 'https://github.com/instructlab/taxonomy.git'] -# sdg_sample_size: float +# sdg_sample_size: float [Default: 1.0] # sdg_scale_factor: int [Default: 2.0] # train_effective_batch_size_phase_1: int [Default: 3840.0] # train_effective_batch_size_phase_2: int [Default: 3840.0] @@ -522,6 +522,7 @@ components: isOptional: true parameterType: STRING sdg_sampling_size: + defaultValue: 1.0 isOptional: true parameterType: NUMBER_DOUBLE taxonomy_path: @@ -1535,10 +1536,10 @@ deploymentSpec: \ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\ \ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \ \ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\ - ,\n sdg_sampling_size: float = None,\n):\n from os import getenv,\ - \ path\n\n import instructlab.sdg\n import openai\n import yaml\n\ - \n api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n \ - \ endpoint = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\ + ,\n sdg_sampling_size: float = 1.0,\n):\n from os import getenv, path\n\ + \n import instructlab.sdg\n import openai\n import yaml\n\n \ + \ api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n endpoint\ + \ = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\ ):\n import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\ \ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\ \ http_client=custom_http_client\n )\n else:\n client =\ @@ -1547,10 +1548,10 @@ deploymentSpec: \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\ \ instructlab.sdg.utils.taxonomy.read_taxonomy(\n taxonomy_path,\ \ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n \ - \ )\n )\n\n # Generate synthetic dataset\n if sdg_sampling_size\ - \ is None:\n # generate_data has a magic word for its taxonomy_base\ - \ argument - 'empty'\n # it allows generating from the whole repo,\ - \ see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ + \ )\n )\n\n # Generate synthetic dataset\n # 1.0 is the default\ + \ size\n if sdg_sampling_size == 1.0:\n # generate_data has a\ + \ magic word for its taxonomy_base argument - 'empty'\n # it allows\ + \ generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ \ instructlab.sdg.generate_data(\n client=client,\n \ \ num_instructions_to_generate=num_instructions_to_generate,\n\ \ output_dir=sdg_path,\n 
taxonomy=taxonomy_path,\n\ @@ -2142,6 +2143,7 @@ root: isOptional: true parameterType: STRING sdg_sample_size: + defaultValue: 1.0 description: SDG parameter. Represents the sdg skills recipe sampling size as percentage in decimal form. isOptional: true diff --git a/sdg/components.py b/sdg/components.py index 40f0b4bd..e3370e67 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -35,7 +35,7 @@ def sdg_op( repo_pr: Optional[int], taxonomy_path: str = "/data/taxonomy", sdg_path: str = "/data/sdg", - sdg_sampling_size: float = None, + sdg_sampling_size: float = 1.0, ): from os import getenv, path @@ -68,7 +68,8 @@ def sdg_op( ) # Generate synthetic dataset - if sdg_sampling_size is None: + # 1.0 is the default size + if sdg_sampling_size == 1.0: # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 diff --git a/standalone/standalone.py b/standalone/standalone.py index f85c4764..9078702e 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -1130,7 +1130,7 @@ def sdg_op( repo_pr: Optional[int], taxonomy_path: str = "/data/taxonomy", sdg_path: str = "/data/sdg", - sdg_sampling_size: float = None, + sdg_sampling_size: float = 1.0, ): from os import getenv, path @@ -1163,7 +1163,8 @@ def sdg_op( ) # Generate synthetic dataset - if sdg_sampling_size is None: + # 1.0 is the default size + if sdg_sampling_size == 1.0: # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
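
Patches 3-5 together make sdg_op honour a non-default sdg_sampling_size by rewriting the skills recipe, and, when the packaged file is not writable (EACCES on RHEL AI 1.3), fall back to a temporary copy of default_data_recipes exposed through XDG_DATA_DIRS. Below is a minimal standalone sketch of that fallback, assuming only PyYAML; the recipe path comes from the patches, while apply_sampling_size/set_sampling_size are illustrative names, the copy of the pipeline directory is omitted for brevity, and the real component goes on to call instructlab.sdg.generate_data inside the with-block.

import os
import shutil
import tempfile

import yaml

# Packaged recipe path used by the component (read-only on RHEL AI 1.3).
SKILLS_RECIPE = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"


def set_sampling_size(recipe_path: str, sampling_size: float) -> None:
    # Rewrite the first dataset's sampling_size in a skills recipe file.
    with open(recipe_path, "r", encoding="utf-8") as f:
        recipe = yaml.safe_load(f)
    recipe["datasets"][0]["sampling_size"] = sampling_size
    with open(recipe_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(recipe, f)


def apply_sampling_size(sampling_size: float) -> None:
    try:
        # Happy path: the packaged recipe is writable in place.
        set_sampling_size(SKILLS_RECIPE, sampling_size)
    except PermissionError:
        # Fallback mirrored from the patches: copy the recipe into a temporary
        # default_data_recipes directory and point XDG_DATA_DIRS at it so SDG
        # reads the modified copy instead of the read-only packaged file.
        with tempfile.TemporaryDirectory() as tmp:
            recipes_dir = os.path.join(tmp, "default_data_recipes")
            os.mkdir(recipes_dir)
            shutil.copy(SKILLS_RECIPE, recipes_dir)
            set_sampling_size(os.path.join(recipes_dir, "skills.yaml"), sampling_size)
            os.environ["XDG_DATA_DIRS"] = recipes_dir
            # instructlab.sdg.generate_data(...) would run here, still inside
            # the with-block, while the temporary directory exists.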