diff --git a/importer-pipeline.yaml b/importer-pipeline.yaml index c4486c7e..b34e0d97 100644 --- a/importer-pipeline.yaml +++ b/importer-pipeline.yaml @@ -32,7 +32,7 @@ deploymentSpec: env: - name: REGISTRY_AUTH_FILE value: /mnt/containers/.dockerconfigjson - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 pipelineInfo: description: Helper pipeline to the InstructLab pipeline which allows users to seed/import a new base model diff --git a/pipeline.py b/pipeline.py index 5269088b..177179ed 100644 --- a/pipeline.py +++ b/pipeline.py @@ -271,6 +271,7 @@ def pipeline( ) data_processing_task.after(model_to_pvc_task, sdg_task) data_processing_task.set_caching_options(False) + data_processing_task.set_env_variable("XDG_CACHE_HOME", "/tmp") set_image_pull_secrets(data_processing_task, [IMAGE_PULL_SECRET]) diff --git a/pipeline.yaml b/pipeline.yaml index 23b8999c..823d7567 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -645,7 +645,10 @@ deploymentSpec: \ max_seq_len=train_args.max_seq_len,\n chat_tmpl_path=train_args.chat_tmpl_path,\n\ \ )\n )\n\n data_processing(train_args=skill_training_args)\n\ \ data_processing(train_args=knowledge_training_args)\n\n" - image: quay.io/redhat-et/ilab:1.2 + env: + - name: XDG_CACHE_HOME + value: /tmp + image: quay.io/redhat-et/ilab:1.3 exec-deletepvc: container: image: argostub/deletepvc @@ -744,7 +747,7 @@ deploymentSpec: \ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\ \ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\ \ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\ - Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.2\"\ + Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.3\"\ \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ \ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\ @@ -773,29 +776,28 @@ deploymentSpec: \ --log_level=INFO \\\n \ \ --max_batch_len={max_batch_len} \\\n \ \ --seed={seed} \\\n --cpu_offload_optimizer\ - \ \\\n --cpu_offload_params \\\n \ - \ --distributed_training_framework fsdp \\\n \ - \ --is_granite \\\n --checkpoint_at_epoch\n\ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath:\ - \ /input_data\n name: input-data\n \ - \ readOnly: true\n - mountPath: /input_model\n\ - \ name: model\n readOnly:\ - \ true\n - mountPath: /output\n \ - \ name: output\n env:\n \ - \ - name: NNODES\n value: \\\"{nnodes}\\\"\n\ - \ - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\ + \ \\\n --cpu_offload_params_fsdp \\\n \ + \ --distributed_training_framework fsdp \\\n \ + \ --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n image:\ + \ {image}\n name: pytorch\n volumeMounts:\n\ + \ - mountPath: /input_data\n \ + \ name: input-data\n readOnly: true\n \ + \ - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n name: output\n\ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n \ + \ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\ + \"\n - name: XDG_CACHE_HOME\n \ + \ value: /tmp\n - name: TRITON_CACHE_DIR\n\ \ value: /tmp\n - name:\ - \ TRITON_CACHE_DIR\n value: /tmp\n \ - \ - name: HF_HOME\n value: /tmp\n \ - \ - name: TRANSFORMERS_CACHE\n \ - \ value: /tmp\n resources:\n \ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n 
limits:\n\ - \ cpu: 8\n \"nvidia.com/gpu\"\ + \ HF_HOME\n value: /tmp\n \ + \ - name: TRANSFORMERS_CACHE\n value: /tmp\n\ + \ resources:\n requests:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n \ + \ limits:\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n volumes:\n - name:\ \ input-data\n persistentVolumeClaim:\n \ \ claimName: {input_pvc_name}\n - name: model\n\ @@ -824,41 +826,39 @@ deploymentSpec: \ --save_samples={save_samples} \\\n \ \ --log_level=INFO \\\n --max_batch_len={max_batch_len}\ \ \\\n --seed={seed} \\\n \ - \ --cpu_offload_optimizer \\\n --cpu_offload_params\ + \ --cpu_offload_optimizer \\\n --cpu_offload_params_fsdp\ \ \\\n --distributed_training_framework fsdp\ - \ \\\n --is_granite \\\n \ - \ --checkpoint_at_epoch\n command:\n \ - \ - /bin/bash\n - '-c'\n \ - \ - '--'\n image: {image}\n \ - \ name: pytorch\n volumeMounts:\n \ - \ - mountPath: /input_data\n \ - \ name: input-data\n readOnly: true\n \ - \ - mountPath: /input_model\n name:\ - \ model\n readOnly: true\n \ - \ - mountPath: /output\n name: output\n \ - \ readOnly: true\n env:\n \ - \ - name: NNODES\n value: \\\ - \"{nnodes}\\\"\n - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n \ - \ - name: XDG_CACHE_HOME\n value: /tmp\n \ - \ - name: TRITON_CACHE_DIR\n \ - \ value: /tmp\n - name: HF_HOME\n \ - \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ + \ \\\n --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ + \ name: input-data\n readOnly:\ + \ true\n - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n \ + \ name: output\n readOnly: true\n \ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n - name: NPROC_PER_NODE\n\ + \ value: \\\"{nproc_per_node}\\\"\n \ + \ - name: XDG_CACHE_HOME\n value: /tmp\n\ + \ - name: TRITON_CACHE_DIR\n \ + \ value: /tmp\n - name: HF_HOME\n \ + \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ \ value: /tmp\n resources:\n\ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ limits:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ volumes:\n - name: input-data\n \ - \ persistentVolumeClaim:\n claimName: {input_pvc_name}\n\ - \ - name: model\n persistentVolumeClaim:\n\ - \ claimName: {model_pvc_name}\n \ - \ - name: output\n persistentVolumeClaim:\n \ - \ claimName: {output_pvc_name}\n \"\"\"\n )\n\ - \n try:\n manifest_yaml = yaml.safe_load(manifest)\n except\ - \ yaml.YAMLError as exc:\n raise RuntimeError(f\"Error parsing manifest:\ - \ {exc}\") from exc\n\n # Discover the namespace in which the pod is\ - \ running\n with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ + \ requests:\n \"nvidia.com/gpu\"\ + : {nproc_per_node}\n limits:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\ + \ - name: input-data\n persistentVolumeClaim:\n\ + \ claimName: {input_pvc_name}\n \ + \ - name: model\n persistentVolumeClaim:\n \ + \ claimName: {model_pvc_name}\n - name:\ + \ output\n persistentVolumeClaim:\n \ + \ claimName: {output_pvc_name}\n \"\"\"\n )\n\n try:\n\ + \ manifest_yaml = yaml.safe_load(manifest)\n except yaml.YAMLError\ + \ as exc:\n raise RuntimeError(f\"Error parsing manifest: {exc}\"\ + ) from exc\n\n # Discover the namespace in which the pod is running\n\ + \ with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ , \"r\", encoding=\"utf-8\"\n ) as f:\n namespace = f.read().strip()\n\ \ print(f\"The pod is running in the 
namespace: {namespace}\")\n\n\ \ try:\n kubernetes.config.load_kube_config()\n print(\"\ @@ -951,7 +951,7 @@ deploymentSpec: \ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\ \ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\ \ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\ - Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.2\"\ + Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.3\"\ \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ \ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\ @@ -980,29 +980,28 @@ deploymentSpec: \ --log_level=INFO \\\n \ \ --max_batch_len={max_batch_len} \\\n \ \ --seed={seed} \\\n --cpu_offload_optimizer\ - \ \\\n --cpu_offload_params \\\n \ - \ --distributed_training_framework fsdp \\\n \ - \ --is_granite \\\n --checkpoint_at_epoch\n\ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath:\ - \ /input_data\n name: input-data\n \ - \ readOnly: true\n - mountPath: /input_model\n\ - \ name: model\n readOnly:\ - \ true\n - mountPath: /output\n \ - \ name: output\n env:\n \ - \ - name: NNODES\n value: \\\"{nnodes}\\\"\n\ - \ - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\ + \ \\\n --cpu_offload_params_fsdp \\\n \ + \ --distributed_training_framework fsdp \\\n \ + \ --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n image:\ + \ {image}\n name: pytorch\n volumeMounts:\n\ + \ - mountPath: /input_data\n \ + \ name: input-data\n readOnly: true\n \ + \ - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n name: output\n\ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n \ + \ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\ + \"\n - name: XDG_CACHE_HOME\n \ + \ value: /tmp\n - name: TRITON_CACHE_DIR\n\ \ value: /tmp\n - name:\ - \ TRITON_CACHE_DIR\n value: /tmp\n \ - \ - name: HF_HOME\n value: /tmp\n \ - \ - name: TRANSFORMERS_CACHE\n \ - \ value: /tmp\n resources:\n \ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n limits:\n\ - \ cpu: 8\n \"nvidia.com/gpu\"\ + \ HF_HOME\n value: /tmp\n \ + \ - name: TRANSFORMERS_CACHE\n value: /tmp\n\ + \ resources:\n requests:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n \ + \ limits:\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n volumes:\n - name:\ \ input-data\n persistentVolumeClaim:\n \ \ claimName: {input_pvc_name}\n - name: model\n\ @@ -1031,41 +1030,39 @@ deploymentSpec: \ --save_samples={save_samples} \\\n \ \ --log_level=INFO \\\n --max_batch_len={max_batch_len}\ \ \\\n --seed={seed} \\\n \ - \ --cpu_offload_optimizer \\\n --cpu_offload_params\ + \ --cpu_offload_optimizer \\\n --cpu_offload_params_fsdp\ \ \\\n --distributed_training_framework fsdp\ - \ \\\n --is_granite \\\n \ - \ --checkpoint_at_epoch\n command:\n \ - \ - /bin/bash\n - '-c'\n \ - \ - '--'\n image: {image}\n \ - \ name: pytorch\n volumeMounts:\n \ - \ - mountPath: /input_data\n \ - \ name: input-data\n readOnly: true\n \ - \ - mountPath: /input_model\n name:\ - \ model\n readOnly: true\n \ - \ - mountPath: /output\n name: output\n \ - \ readOnly: true\n env:\n \ - \ - name: NNODES\n value: \\\ - \"{nnodes}\\\"\n - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n \ - \ - name: XDG_CACHE_HOME\n value: /tmp\n \ - \ - name: TRITON_CACHE_DIR\n \ - \ 
value: /tmp\n - name: HF_HOME\n \ - \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ + \ \\\n --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ + \ name: input-data\n readOnly:\ + \ true\n - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n \ + \ name: output\n readOnly: true\n \ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n - name: NPROC_PER_NODE\n\ + \ value: \\\"{nproc_per_node}\\\"\n \ + \ - name: XDG_CACHE_HOME\n value: /tmp\n\ + \ - name: TRITON_CACHE_DIR\n \ + \ value: /tmp\n - name: HF_HOME\n \ + \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ \ value: /tmp\n resources:\n\ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ limits:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ volumes:\n - name: input-data\n \ - \ persistentVolumeClaim:\n claimName: {input_pvc_name}\n\ - \ - name: model\n persistentVolumeClaim:\n\ - \ claimName: {model_pvc_name}\n \ - \ - name: output\n persistentVolumeClaim:\n \ - \ claimName: {output_pvc_name}\n \"\"\"\n )\n\ - \n try:\n manifest_yaml = yaml.safe_load(manifest)\n except\ - \ yaml.YAMLError as exc:\n raise RuntimeError(f\"Error parsing manifest:\ - \ {exc}\") from exc\n\n # Discover the namespace in which the pod is\ - \ running\n with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ + \ requests:\n \"nvidia.com/gpu\"\ + : {nproc_per_node}\n limits:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\ + \ - name: input-data\n persistentVolumeClaim:\n\ + \ claimName: {input_pvc_name}\n \ + \ - name: model\n persistentVolumeClaim:\n \ + \ claimName: {model_pvc_name}\n - name:\ + \ output\n persistentVolumeClaim:\n \ + \ claimName: {output_pvc_name}\n \"\"\"\n )\n\n try:\n\ + \ manifest_yaml = yaml.safe_load(manifest)\n except yaml.YAMLError\ + \ as exc:\n raise RuntimeError(f\"Error parsing manifest: {exc}\"\ + ) from exc\n\n # Discover the namespace in which the pod is running\n\ + \ with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ , \"r\", encoding=\"utf-8\"\n ) as f:\n namespace = f.read().strip()\n\ \ print(f\"The pod is running in the namespace: {namespace}\")\n\n\ \ try:\n kubernetes.config.load_kube_config()\n print(\"\ @@ -1376,7 +1373,7 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 resources: accelerator: count: '1' @@ -1512,7 +1509,7 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 resources: accelerator: count: '1' @@ -1540,36 +1537,89 @@ deploymentSpec: \ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \ \ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\ ,\n sdg_sampling_size: float = 1.0,\n):\n from os import getenv, path\n\ - \n import openai\n import yaml\n from instructlab.sdg import generate_data\n\ - \ from instructlab.sdg.utils.taxonomy import read_taxonomy\n\n def\ - \ set_precomputed_skills_data_ratio(sampling_size: float):\n skills_recipe\ - \ = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\n \ - \ if path.exists(skills_recipe):\n with open(skills_recipe,\ - \ \"r\") as file:\n skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\ - \n skills_yaml[\"datasets\"][0][\"sampling_size\"] = sampling_size\n\ - \n with open(skills_recipe, \"w\", 
encoding=\"utf-8\") as file:\n\ - \ yaml.dump(skills_yaml, file)\n\n api_key = getenv(\"\ - api_key\")\n model = getenv(\"model\")\n endpoint = getenv(\"endpoint\"\ - )\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"):\n import\ - \ httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\ + \n import instructlab.sdg\n import openai\n import yaml\n\n \ + \ api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n endpoint\ + \ = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\ + ):\n import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\ \ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\ \ http_client=custom_http_client\n )\n else:\n client =\ \ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\ \ = \"main\" if repo_branch or (repo_pr and int(repo_pr) > 0) else \"empty\"\ - \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(read_taxonomy(taxonomy_path,\ - \ taxonomy_base))\n\n set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)\n\ - \n # generate_data has a magic word for its taxonomy_base argument -\ - \ 'empty'\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ - \ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\ - \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n \ - \ taxonomy_base=taxonomy_base,\n model_name=model,\n pipeline=pipeline,\n\ - \ chunk_word_count=1000,\n server_ctx_size=4096,\n )\n\n" + \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\ + \ instructlab.sdg.utils.taxonomy.read_taxonomy(\n taxonomy_path,\ + \ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n \ + \ )\n )\n\n # Generate synthetic dataset\n # 1.0 is the default\ + \ size\n if sdg_sampling_size == 1.0:\n # generate_data has a\ + \ magic word for its taxonomy_base argument - 'empty'\n # it allows\ + \ generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ + \ instructlab.sdg.generate_data(\n client=client,\n \ + \ num_instructions_to_generate=num_instructions_to_generate,\n\ + \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\ + \ taxonomy_base=taxonomy_base,\n model_name=model,\n\ + \ pipeline=pipeline,\n chunk_word_count=1000,\n \ + \ server_ctx_size=4096,\n )\n # Tweak precomputed skills\ + \ data ratio if needed\n else:\n skills_recipe = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\ + \n\n def set_precomputed_skills_data_ratio(sampling_size: float,\ + \ skills_recipe: str):\n if path.exists(skills_recipe):\n \ + \ with open(skills_recipe, \"r\", encoding=\"utf-8\") as file:\n\ + \ skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\ + \n skills_yaml[\"datasets\"][0][\"sampling_size\"] = sampling_size\n\ + \n with open(skills_recipe, \"w\", encoding=\"utf-8\") as\ + \ file:\n yaml.dump(skills_yaml, file)\n\n try:\n\ + \ set_precomputed_skills_data_ratio(\n sampling_size=sdg_sampling_size,\ + \ skills_recipe=skills_recipe\n )\n except PermissionError:\n\ + \ print(\"Failed to set precomputed skills data ratio: Permission\ + \ denied\")\n print(\"Attempting to move default data recipes\ + \ to temporary directory\")\n import os\n import shutil\n\ + \ import tempfile\n\n import xdg_base_dirs\n\n \ + \ # Create a temporary directory\n with 
tempfile.TemporaryDirectory()\ + \ as temp_dir:\n # Create a default_data_recipes directory\n\ + \ temp_dir = path.join(temp_dir, \"default_data_recipes\"\ + )\n os.mkdir(temp_dir)\n\n # Copy default_data_recipes/skills.yaml\ + \ to the temporary directory\n shutil.copy(skills_recipe,\ + \ temp_dir)\n\n # Also copy the current pipeline directory\ + \ to the temporary directory - it's a small\n # directory\ + \ like 28KB\n # This isn't needed if the pipeline is either\ + \ \"full\" or \"simple\" but it's future-proofing\n data_dirs\ + \ = [\n os.path.join(str(dir), \"instructlab\", \"sdg\"\ + )\n for dir in xdg_base_dirs.xdg_data_dirs()\n \ + \ ]\n temp_pipeline_dir = path.join(temp_dir, \"\ + pipeline\")\n os.mkdir(temp_pipeline_dir)\n \ + \ for d in data_dirs:\n pipeline_path = os.path.join(d,\ + \ \"pipelines\", pipeline)\n if os.path.exists(pipeline_path):\n\ + \ shutil.copytree(\n pipeline_path,\n\ + \ temp_pipeline_dir,\n \ + \ dirs_exist_ok=True,\n )\n \ + \ break\n\n # Build new skills.yaml path\n \ + \ new_skills_recipe = path.join(temp_dir, \"skills.yaml\")\n \ + \ print(f\"New skills recipe path: {new_skills_recipe}\")\n\n\ + \ # Override XDG_DATA_DIRS with the temporary directory\n\ + \ # This allows SDG to read the new skills.yaml since it's\ + \ looking into XDG_DATA_DIRS\n # and looks for a default_data_recipes\ + \ directory with a skills.yaml file\n os.environ[\"XDG_DATA_DIRS\"\ + ] = f\"{temp_dir}\"\n\n # Try to set the precomputed skills\ + \ data ratio again\n try:\n set_precomputed_skills_data_ratio(\n\ + \ sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe\n\ + \ )\n print(\n \ + \ f\"Successfully set precomputed skills data ratio to {sdg_sampling_size}\"\ + \n )\n\n # generate_data has a magic\ + \ word for its taxonomy_base argument - 'empty'\n # it\ + \ allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ + \ instructlab.sdg.generate_data(\n \ + \ client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\ + \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\ + \ taxonomy_base=taxonomy_base,\n \ + \ model_name=model,\n pipeline=pipeline,\n\ + \ chunk_word_count=1000,\n \ + \ server_ctx_size=4096,\n )\n except\ + \ Exception as e:\n print(f\"Failed to set precomputed\ + \ skills data ratio: {e}\")\n raise\n\n" env: - name: HOME value: /tmp - name: HF_HOME value: /tmp - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 exec-sdg-to-artifact-op: container: args: diff --git a/sdg/components.py b/sdg/components.py index aa2cdfd9..e3370e67 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -39,21 +39,9 @@ def sdg_op( ): from os import getenv, path + import instructlab.sdg import openai import yaml - from instructlab.sdg import generate_data - from instructlab.sdg.utils.taxonomy import read_taxonomy - - def set_precomputed_skills_data_ratio(sampling_size: float): - skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" - if path.exists(skills_recipe): - with open(skills_recipe, "r") as file: - skills_yaml = yaml.load(file, Loader=yaml.Loader) - - skills_yaml["datasets"][0]["sampling_size"] = sampling_size - - with open(skills_recipe, "w", encoding="utf-8") as file: - yaml.dump(skills_yaml, file) api_key = getenv("api_key") model = getenv("model") @@ -73,25 +61,120 @@ def set_precomputed_skills_data_ratio(sampling_size: float): print("Generating synthetic dataset for:") 
print() - print(read_taxonomy(taxonomy_path, taxonomy_base)) - - set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size) - - # generate_data has a magic word for its taxonomy_base argument - 'empty' - # it allows generating from the whole repo, see: - # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 - generate_data( - client=client, - num_instructions_to_generate=num_instructions_to_generate, - output_dir=sdg_path, - taxonomy=taxonomy_path, - taxonomy_base=taxonomy_base, - model_name=model, - pipeline=pipeline, - chunk_word_count=1000, - server_ctx_size=4096, + print( + instructlab.sdg.utils.taxonomy.read_taxonomy( + taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents" + ) ) + # Generate synthetic dataset + # 1.0 is the default size + if sdg_sampling_size == 1.0: + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + # Tweak precomputed skills data ratio if needed + else: + skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" + + def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): + if path.exists(skills_recipe): + with open(skills_recipe, "r", encoding="utf-8") as file: + skills_yaml = yaml.load(file, Loader=yaml.Loader) + + skills_yaml["datasets"][0]["sampling_size"] = sampling_size + + with open(skills_recipe, "w", encoding="utf-8") as file: + yaml.dump(skills_yaml, file) + + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=skills_recipe + ) + except PermissionError: + print("Failed to set precomputed skills data ratio: Permission denied") + print("Attempting to move default data recipes to temporary directory") + import os + import shutil + import tempfile + + import xdg_base_dirs + + # Create a temporary directory + with tempfile.TemporaryDirectory() as temp_dir: + # Create a default_data_recipes directory + temp_dir = path.join(temp_dir, "default_data_recipes") + os.mkdir(temp_dir) + + # Copy default_data_recipes/skills.yaml to the temporary directory + shutil.copy(skills_recipe, temp_dir) + + # Also copy the current pipeline directory to the temporary directory - it's a small + # directory like 28KB + # This isn't needed if the pipeline is either "full" or "simple" but it's future-proofing + data_dirs = [ + os.path.join(str(dir), "instructlab", "sdg") + for dir in xdg_base_dirs.xdg_data_dirs() + ] + temp_pipeline_dir = path.join(temp_dir, "pipeline") + os.mkdir(temp_pipeline_dir) + for d in data_dirs: + pipeline_path = os.path.join(d, "pipelines", pipeline) + if os.path.exists(pipeline_path): + shutil.copytree( + pipeline_path, + temp_pipeline_dir, + dirs_exist_ok=True, + ) + break + + # Build new skills.yaml path + new_skills_recipe = path.join(temp_dir, "skills.yaml") + print(f"New skills recipe path: {new_skills_recipe}") + + # Override XDG_DATA_DIRS with the temporary directory + # This allows SDG to read the new skills.yaml since it's looking into XDG_DATA_DIRS + # and looks for a 
default_data_recipes directory with a skills.yaml file + os.environ["XDG_DATA_DIRS"] = f"{temp_dir}" + + # Try to set the precomputed skills data ratio again + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe + ) + print( + f"Successfully set precomputed skills data ratio to {sdg_sampling_size}" + ) + + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + except Exception as e: + print(f"Failed to set precomputed skills data ratio: {e}") + raise + @dsl.container_component def taxonomy_to_artifact_op( diff --git a/standalone/standalone.py b/standalone/standalone.py index 028ea995..9078702e 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -1134,21 +1134,9 @@ def sdg_op( ): from os import getenv, path + import instructlab.sdg import openai import yaml - from instructlab.sdg import generate_data - from instructlab.sdg.utils.taxonomy import read_taxonomy - - def set_precomputed_skills_data_ratio(sampling_size: float): - skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" - if path.exists(skills_recipe): - with open(skills_recipe, "r") as file: - skills_yaml = yaml.load(file, Loader=yaml.Loader) - - skills_yaml["datasets"][0]["sampling_size"] = sampling_size - - with open(skills_recipe, "w", encoding="utf-8") as file: - yaml.dump(skills_yaml, file) api_key = getenv("api_key") model = getenv("model") @@ -1168,24 +1156,119 @@ def set_precomputed_skills_data_ratio(sampling_size: float): print("Generating synthetic dataset for:") print() - print(read_taxonomy(taxonomy_path, taxonomy_base)) + print( + instructlab.sdg.utils.taxonomy.read_taxonomy( + taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents" + ) + ) - set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size) + # Generate synthetic dataset + # 1.0 is the default size + if sdg_sampling_size == 1.0: + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + # Tweak precomputed skills data ratio if needed + else: + skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" - # generate_data has a magic word for its taxonomy_base argument - 'empty' - # it allows generating from the whole repo, see: - # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 - generate_data( - client=client, - num_instructions_to_generate=num_instructions_to_generate, - output_dir=sdg_path, - taxonomy=taxonomy_path, - taxonomy_base=taxonomy_base, - model_name=model, - pipeline=pipeline, - 
chunk_word_count=1000, - server_ctx_size=4096, - ) + def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): + if path.exists(skills_recipe): + with open(skills_recipe, "r", encoding="utf-8") as file: + skills_yaml = yaml.load(file, Loader=yaml.Loader) + + skills_yaml["datasets"][0]["sampling_size"] = sampling_size + + with open(skills_recipe, "w", encoding="utf-8") as file: + yaml.dump(skills_yaml, file) + + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=skills_recipe + ) + except PermissionError: + print("Failed to set precomputed skills data ratio: Permission denied") + print("Attempting to move default data recipes to temporary directory") + import os + import shutil + import tempfile + + import xdg_base_dirs + + # Create a temporary directory + with tempfile.TemporaryDirectory() as temp_dir: + # Create a default_data_recipes directory + temp_dir = path.join(temp_dir, "default_data_recipes") + os.mkdir(temp_dir) + + # Copy default_data_recipes/skills.yaml to the temporary directory + shutil.copy(skills_recipe, temp_dir) + + # Also copy the current pipeline directory to the temporary directory - it's a small + # directory like 28KB + # This isn't needed if the pipeline is either "full" or "simple" but it's future-proofing + data_dirs = [ + os.path.join(str(dir), "instructlab", "sdg") + for dir in xdg_base_dirs.xdg_data_dirs() + ] + temp_pipeline_dir = path.join(temp_dir, "pipeline") + os.mkdir(temp_pipeline_dir) + for d in data_dirs: + pipeline_path = os.path.join(d, "pipelines", pipeline) + if os.path.exists(pipeline_path): + shutil.copytree( + pipeline_path, + temp_pipeline_dir, + dirs_exist_ok=True, + ) + break + + # Build new skills.yaml path + new_skills_recipe = path.join(temp_dir, "skills.yaml") + print(f"New skills recipe path: {new_skills_recipe}") + + # Override XDG_DATA_DIRS with the temporary directory + # This allows SDG to read the new skills.yaml since it's looking into XDG_DATA_DIRS + # and looks for a default_data_recipes directory with a skills.yaml file + os.environ["XDG_DATA_DIRS"] = f"{temp_dir}" + + # Try to set the precomputed skills data ratio again + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe + ) + print( + f"Successfully set precomputed skills data ratio to {sdg_sampling_size}" + ) + + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + except Exception as e: + print(f"Failed to set precomputed skills data ratio: {e}") + raise """ exec_sdg_op_args = f""" sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", sdg_sampling_size={sdg_sampling_size}) diff --git a/training/components.py b/training/components.py index a12ad747..8059bda4 100644 --- a/training/components.py +++ b/training/components.py @@ -167,7 +167,7 @@ def list_phase1_final_model(): 
else: raise RuntimeError(f"Unsupported value of {phase_num=}") - image = "quay.io/redhat-et/ilab:1.2" + image = "quay.io/redhat-et/ilab:1.3" manifest = inspect.cleandoc( f""" @@ -211,9 +211,8 @@ def list_phase1_final_model(): --max_batch_len={max_batch_len} \ --seed={seed} \ --cpu_offload_optimizer \ - --cpu_offload_params \ + --cpu_offload_params_fsdp \ --distributed_training_framework fsdp \ - --is_granite \ --checkpoint_at_epoch command: - /bin/bash @@ -245,10 +244,8 @@ def list_phase1_final_model(): value: /tmp resources: requests: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} limits: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} volumes: - name: input-data @@ -292,9 +289,8 @@ def list_phase1_final_model(): --max_batch_len={max_batch_len} \ --seed={seed} \ --cpu_offload_optimizer \ - --cpu_offload_params \ + --cpu_offload_params_fsdp \ --distributed_training_framework fsdp \ - --is_granite \ --checkpoint_at_epoch command: - /bin/bash @@ -327,10 +323,8 @@ def list_phase1_final_model(): value: /tmp resources: requests: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} limits: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} volumes: - name: input-data diff --git a/utils/consts.py b/utils/consts.py index f8116212..aad0d11f 100644 --- a/utils/consts.py +++ b/utils/consts.py @@ -1,4 +1,4 @@ PYTHON_IMAGE = "quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111" TOOLBOX_IMAGE = "registry.access.redhat.com/ubi9/toolbox" OC_IMAGE = "registry.redhat.io/openshift4/ose-cli" -RHELAI_IMAGE = "quay.io/redhat-et/ilab:1.2" +RHELAI_IMAGE = "quay.io/redhat-et/ilab:1.3"
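
Note on the sdg_op changes above: when the packaged skills recipe
(/usr/share/instructlab/sdg/default_data_recipes/skills.yaml) is not writable, the new
code copies it into a temporary default_data_recipes directory, patches the
sampling_size there, and points XDG_DATA_DIRS at the copy so SDG resolves the patched
recipe instead of the read-only one. The standalone sketch below illustrates that
fallback under simplified assumptions: the helper names (patch_sampling_size,
apply_with_fallback) are illustrative and not part of the patch, the pipeline-directory
copy and the TemporaryDirectory context used in the real component are omitted, and
yaml.safe_load stands in for yaml.load.

import os
import shutil
import tempfile

import yaml


def patch_sampling_size(recipe_path: str, sampling_size: float) -> None:
    # Rewrite the first dataset's sampling_size in a skills recipe file.
    with open(recipe_path, "r", encoding="utf-8") as f:
        recipe = yaml.safe_load(f)
    recipe["datasets"][0]["sampling_size"] = sampling_size
    with open(recipe_path, "w", encoding="utf-8") as f:
        yaml.dump(recipe, f)


def apply_with_fallback(packaged_recipe: str, sampling_size: float) -> str:
    # Try to patch the packaged recipe in place; on PermissionError, fall back to a
    # writable copy under a default_data_recipes directory exposed via XDG_DATA_DIRS.
    try:
        patch_sampling_size(packaged_recipe, sampling_size)
        return packaged_recipe
    except PermissionError:
        tmp_root = tempfile.mkdtemp()  # left in place so SDG can still read it later
        recipes_dir = os.path.join(tmp_root, "default_data_recipes")
        os.mkdir(recipes_dir)
        new_recipe = shutil.copy(packaged_recipe, recipes_dir)
        patch_sampling_size(new_recipe, sampling_size)
        # SDG locates data recipes through XDG_DATA_DIRS, so point it at the copy
        # (mirroring the component above, which sets it to the recipes directory).
        os.environ["XDG_DATA_DIRS"] = recipes_dir
        return new_recipe

As in the patched component, the environment override only takes effect for SDG calls
made after it is set, so generate_data must run while the temporary directory still
exists.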