From d4e91ee905f8a221b3206c2ac758097b8ba5c190 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Tue, 3 Dec 2024 14:14:27 -0500 Subject: [PATCH 1/5] update rhelai 1.2 to 1.3 Signed-off-by: Michael Clifford --- importer-pipeline.yaml | 2 +- pipeline.yaml | 12 ++++++------ training/components.py | 2 +- utils/consts.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/importer-pipeline.yaml b/importer-pipeline.yaml index c4486c7e..b34e0d97 100644 --- a/importer-pipeline.yaml +++ b/importer-pipeline.yaml @@ -32,7 +32,7 @@ deploymentSpec: env: - name: REGISTRY_AUTH_FILE value: /mnt/containers/.dockerconfigjson - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 pipelineInfo: description: Helper pipeline to the InstructLab pipeline which allows users to seed/import a new base model diff --git a/pipeline.yaml b/pipeline.yaml index 23b8999c..38e42e69 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -645,7 +645,7 @@ deploymentSpec: \ max_seq_len=train_args.max_seq_len,\n chat_tmpl_path=train_args.chat_tmpl_path,\n\ \ )\n )\n\n data_processing(train_args=skill_training_args)\n\ \ data_processing(train_args=knowledge_training_args)\n\n" - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 exec-deletepvc: container: image: argostub/deletepvc @@ -744,7 +744,7 @@ deploymentSpec: \ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\ \ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\ \ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\ - Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.2\"\ + Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.3\"\ \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ \ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\ @@ -951,7 +951,7 @@ deploymentSpec: \ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\ \ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\ \ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\ - Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.2\"\ + Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.3\"\ \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ \ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\ @@ -1376,7 +1376,7 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 resources: accelerator: count: '1' @@ -1512,7 +1512,7 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 resources: accelerator: count: '1' @@ -1569,7 +1569,7 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 exec-sdg-to-artifact-op: container: args: diff --git a/training/components.py b/training/components.py index a12ad747..3a007cd0 100644 --- a/training/components.py +++ b/training/components.py @@ -167,7 +167,7 @@ def list_phase1_final_model(): else: raise RuntimeError(f"Unsupported value of {phase_num=}") - image = "quay.io/redhat-et/ilab:1.2" + image = "quay.io/redhat-et/ilab:1.3" manifest = inspect.cleandoc( f""" diff --git a/utils/consts.py b/utils/consts.py index f8116212..aad0d11f 100644 --- a/utils/consts.py +++ b/utils/consts.py @@ 
-1,4 +1,4 @@ PYTHON_IMAGE = "quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111" TOOLBOX_IMAGE = "registry.access.redhat.com/ubi9/toolbox" OC_IMAGE = "registry.redhat.io/openshift4/ose-cli" -RHELAI_IMAGE = "quay.io/redhat-et/ilab:1.2" +RHELAI_IMAGE = "quay.io/redhat-et/ilab:1.3" From ba97769b8be626cace865821d1231896e398a395 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Tue, 3 Dec 2024 19:46:26 -0500 Subject: [PATCH 2/5] wip: working on changes needed for RHELAI1.3 Signed-off-by: Michael Clifford --- pipeline.py | 1 + pipeline.yaml | 222 ++++++++++++++++++++--------------------- sdg/components.py | 8 +- training/components.py | 10 +- 4 files changed, 119 insertions(+), 122 deletions(-) diff --git a/pipeline.py b/pipeline.py index 5269088b..177179ed 100644 --- a/pipeline.py +++ b/pipeline.py @@ -271,6 +271,7 @@ def pipeline( ) data_processing_task.after(model_to_pvc_task, sdg_task) data_processing_task.set_caching_options(False) + data_processing_task.set_env_variable("XDG_CACHE_HOME", "/tmp") set_image_pull_secrets(data_processing_task, [IMAGE_PULL_SECRET]) diff --git a/pipeline.yaml b/pipeline.yaml index 38e42e69..e89b9f47 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -645,6 +645,9 @@ deploymentSpec: \ max_seq_len=train_args.max_seq_len,\n chat_tmpl_path=train_args.chat_tmpl_path,\n\ \ )\n )\n\n data_processing(train_args=skill_training_args)\n\ \ data_processing(train_args=knowledge_training_args)\n\n" + env: + - name: XDG_CACHE_HOME + value: /tmp image: quay.io/redhat-et/ilab:1.3 exec-deletepvc: container: @@ -773,29 +776,28 @@ deploymentSpec: \ --log_level=INFO \\\n \ \ --max_batch_len={max_batch_len} \\\n \ \ --seed={seed} \\\n --cpu_offload_optimizer\ - \ \\\n --cpu_offload_params \\\n \ - \ --distributed_training_framework fsdp \\\n \ - \ --is_granite \\\n --checkpoint_at_epoch\n\ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath:\ - \ /input_data\n name: input-data\n \ - \ readOnly: true\n - mountPath: /input_model\n\ - \ name: model\n readOnly:\ - \ true\n - mountPath: /output\n \ - \ name: output\n env:\n \ - \ - name: NNODES\n value: \\\"{nnodes}\\\"\n\ - \ - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\ + \ \\\n --cpu_offload_params_fsdp \\\n \ + \ --distributed_training_framework fsdp \\\n \ + \ --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n image:\ + \ {image}\n name: pytorch\n volumeMounts:\n\ + \ - mountPath: /input_data\n \ + \ name: input-data\n readOnly: true\n \ + \ - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n name: output\n\ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n \ + \ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\ + \"\n - name: XDG_CACHE_HOME\n \ + \ value: /tmp\n - name: TRITON_CACHE_DIR\n\ \ value: /tmp\n - name:\ - \ TRITON_CACHE_DIR\n value: /tmp\n \ - \ - name: HF_HOME\n value: /tmp\n \ - \ - name: TRANSFORMERS_CACHE\n \ - \ value: /tmp\n resources:\n \ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n limits:\n\ - \ cpu: 8\n \"nvidia.com/gpu\"\ + \ HF_HOME\n value: /tmp\n \ + \ - name: TRANSFORMERS_CACHE\n value: /tmp\n\ + \ resources:\n requests:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n \ + \ limits:\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n volumes:\n - name:\ \ input-data\n persistentVolumeClaim:\n \ \ claimName: {input_pvc_name}\n - name: model\n\ @@ -824,41 +826,39 @@ 
deploymentSpec: \ --save_samples={save_samples} \\\n \ \ --log_level=INFO \\\n --max_batch_len={max_batch_len}\ \ \\\n --seed={seed} \\\n \ - \ --cpu_offload_optimizer \\\n --cpu_offload_params\ + \ --cpu_offload_optimizer \\\n --cpu_offload_params_fsdp\ \ \\\n --distributed_training_framework fsdp\ - \ \\\n --is_granite \\\n \ - \ --checkpoint_at_epoch\n command:\n \ - \ - /bin/bash\n - '-c'\n \ - \ - '--'\n image: {image}\n \ - \ name: pytorch\n volumeMounts:\n \ - \ - mountPath: /input_data\n \ - \ name: input-data\n readOnly: true\n \ - \ - mountPath: /input_model\n name:\ - \ model\n readOnly: true\n \ - \ - mountPath: /output\n name: output\n \ - \ readOnly: true\n env:\n \ - \ - name: NNODES\n value: \\\ - \"{nnodes}\\\"\n - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n \ - \ - name: XDG_CACHE_HOME\n value: /tmp\n \ - \ - name: TRITON_CACHE_DIR\n \ - \ value: /tmp\n - name: HF_HOME\n \ - \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ + \ \\\n --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ + \ name: input-data\n readOnly:\ + \ true\n - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n \ + \ name: output\n readOnly: true\n \ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n - name: NPROC_PER_NODE\n\ + \ value: \\\"{nproc_per_node}\\\"\n \ + \ - name: XDG_CACHE_HOME\n value: /tmp\n\ + \ - name: TRITON_CACHE_DIR\n \ + \ value: /tmp\n - name: HF_HOME\n \ + \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ \ value: /tmp\n resources:\n\ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ limits:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ volumes:\n - name: input-data\n \ - \ persistentVolumeClaim:\n claimName: {input_pvc_name}\n\ - \ - name: model\n persistentVolumeClaim:\n\ - \ claimName: {model_pvc_name}\n \ - \ - name: output\n persistentVolumeClaim:\n \ - \ claimName: {output_pvc_name}\n \"\"\"\n )\n\ - \n try:\n manifest_yaml = yaml.safe_load(manifest)\n except\ - \ yaml.YAMLError as exc:\n raise RuntimeError(f\"Error parsing manifest:\ - \ {exc}\") from exc\n\n # Discover the namespace in which the pod is\ - \ running\n with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ + \ requests:\n \"nvidia.com/gpu\"\ + : {nproc_per_node}\n limits:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\ + \ - name: input-data\n persistentVolumeClaim:\n\ + \ claimName: {input_pvc_name}\n \ + \ - name: model\n persistentVolumeClaim:\n \ + \ claimName: {model_pvc_name}\n - name:\ + \ output\n persistentVolumeClaim:\n \ + \ claimName: {output_pvc_name}\n \"\"\"\n )\n\n try:\n\ + \ manifest_yaml = yaml.safe_load(manifest)\n except yaml.YAMLError\ + \ as exc:\n raise RuntimeError(f\"Error parsing manifest: {exc}\"\ + ) from exc\n\n # Discover the namespace in which the pod is running\n\ + \ with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ , \"r\", encoding=\"utf-8\"\n ) as f:\n namespace = f.read().strip()\n\ \ print(f\"The pod is running in the namespace: {namespace}\")\n\n\ \ try:\n kubernetes.config.load_kube_config()\n print(\"\ @@ -980,29 +980,28 @@ deploymentSpec: \ --log_level=INFO \\\n \ \ --max_batch_len={max_batch_len} \\\n \ \ --seed={seed} \\\n --cpu_offload_optimizer\ - \ \\\n --cpu_offload_params \\\n \ - \ --distributed_training_framework fsdp \\\n \ - \ --is_granite \\\n --checkpoint_at_epoch\n\ - \ command:\n - 
/bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath:\ - \ /input_data\n name: input-data\n \ - \ readOnly: true\n - mountPath: /input_model\n\ - \ name: model\n readOnly:\ - \ true\n - mountPath: /output\n \ - \ name: output\n env:\n \ - \ - name: NNODES\n value: \\\"{nnodes}\\\"\n\ - \ - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\ + \ \\\n --cpu_offload_params_fsdp \\\n \ + \ --distributed_training_framework fsdp \\\n \ + \ --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n image:\ + \ {image}\n name: pytorch\n volumeMounts:\n\ + \ - mountPath: /input_data\n \ + \ name: input-data\n readOnly: true\n \ + \ - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n name: output\n\ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n \ + \ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\ + \"\n - name: XDG_CACHE_HOME\n \ + \ value: /tmp\n - name: TRITON_CACHE_DIR\n\ \ value: /tmp\n - name:\ - \ TRITON_CACHE_DIR\n value: /tmp\n \ - \ - name: HF_HOME\n value: /tmp\n \ - \ - name: TRANSFORMERS_CACHE\n \ - \ value: /tmp\n resources:\n \ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n limits:\n\ - \ cpu: 8\n \"nvidia.com/gpu\"\ + \ HF_HOME\n value: /tmp\n \ + \ - name: TRANSFORMERS_CACHE\n value: /tmp\n\ + \ resources:\n requests:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n \ + \ limits:\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n volumes:\n - name:\ \ input-data\n persistentVolumeClaim:\n \ \ claimName: {input_pvc_name}\n - name: model\n\ @@ -1031,41 +1030,39 @@ deploymentSpec: \ --save_samples={save_samples} \\\n \ \ --log_level=INFO \\\n --max_batch_len={max_batch_len}\ \ \\\n --seed={seed} \\\n \ - \ --cpu_offload_optimizer \\\n --cpu_offload_params\ + \ --cpu_offload_optimizer \\\n --cpu_offload_params_fsdp\ \ \\\n --distributed_training_framework fsdp\ - \ \\\n --is_granite \\\n \ - \ --checkpoint_at_epoch\n command:\n \ - \ - /bin/bash\n - '-c'\n \ - \ - '--'\n image: {image}\n \ - \ name: pytorch\n volumeMounts:\n \ - \ - mountPath: /input_data\n \ - \ name: input-data\n readOnly: true\n \ - \ - mountPath: /input_model\n name:\ - \ model\n readOnly: true\n \ - \ - mountPath: /output\n name: output\n \ - \ readOnly: true\n env:\n \ - \ - name: NNODES\n value: \\\ - \"{nnodes}\\\"\n - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n \ - \ - name: XDG_CACHE_HOME\n value: /tmp\n \ - \ - name: TRITON_CACHE_DIR\n \ - \ value: /tmp\n - name: HF_HOME\n \ - \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ + \ \\\n --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ + \ name: input-data\n readOnly:\ + \ true\n - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n \ + \ name: output\n readOnly: true\n \ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n - name: NPROC_PER_NODE\n\ + \ value: \\\"{nproc_per_node}\\\"\n \ + \ - name: XDG_CACHE_HOME\n value: /tmp\n\ + \ - name: TRITON_CACHE_DIR\n \ + \ value: /tmp\n - name: HF_HOME\n \ + \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ \ value: /tmp\n resources:\n\ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ limits:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ volumes:\n - name: input-data\n \ - \ persistentVolumeClaim:\n claimName: 
{input_pvc_name}\n\ - \ - name: model\n persistentVolumeClaim:\n\ - \ claimName: {model_pvc_name}\n \ - \ - name: output\n persistentVolumeClaim:\n \ - \ claimName: {output_pvc_name}\n \"\"\"\n )\n\ - \n try:\n manifest_yaml = yaml.safe_load(manifest)\n except\ - \ yaml.YAMLError as exc:\n raise RuntimeError(f\"Error parsing manifest:\ - \ {exc}\") from exc\n\n # Discover the namespace in which the pod is\ - \ running\n with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ + \ requests:\n \"nvidia.com/gpu\"\ + : {nproc_per_node}\n limits:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\ + \ - name: input-data\n persistentVolumeClaim:\n\ + \ claimName: {input_pvc_name}\n \ + \ - name: model\n persistentVolumeClaim:\n \ + \ claimName: {model_pvc_name}\n - name:\ + \ output\n persistentVolumeClaim:\n \ + \ claimName: {output_pvc_name}\n \"\"\"\n )\n\n try:\n\ + \ manifest_yaml = yaml.safe_load(manifest)\n except yaml.YAMLError\ + \ as exc:\n raise RuntimeError(f\"Error parsing manifest: {exc}\"\ + ) from exc\n\n # Discover the namespace in which the pod is running\n\ + \ with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ , \"r\", encoding=\"utf-8\"\n ) as f:\n namespace = f.read().strip()\n\ \ print(f\"The pod is running in the namespace: {namespace}\")\n\n\ \ try:\n kubernetes.config.load_kube_config()\n print(\"\ @@ -1556,8 +1553,9 @@ deploymentSpec: \ http_client=custom_http_client\n )\n else:\n client =\ \ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\ \ = \"main\" if repo_branch or (repo_pr and int(repo_pr) > 0) else \"empty\"\ - \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(read_taxonomy(taxonomy_path,\ - \ taxonomy_base))\n\n set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)\n\ + \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\ + \ read_taxonomy(\n taxonomy_path, taxonomy_base, document_output_dir=f\"\ + {sdg_path}/documents\"\n )\n )\n\n # sset_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)\n\ \n # generate_data has a magic word for its taxonomy_base argument -\ \ 'empty'\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ \ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\ diff --git a/sdg/components.py b/sdg/components.py index aa2cdfd9..a81acb01 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -73,9 +73,13 @@ def set_precomputed_skills_data_ratio(sampling_size: float): print("Generating synthetic dataset for:") print() - print(read_taxonomy(taxonomy_path, taxonomy_base)) + print( + read_taxonomy( + taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents" + ) + ) - set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size) + # sset_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size) # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: diff --git a/training/components.py b/training/components.py index 3a007cd0..8059bda4 100644 --- a/training/components.py +++ b/training/components.py @@ -211,9 +211,8 @@ def list_phase1_final_model(): --max_batch_len={max_batch_len} \ --seed={seed} \ --cpu_offload_optimizer \ - --cpu_offload_params \ + --cpu_offload_params_fsdp \ --distributed_training_framework fsdp \ - --is_granite \ --checkpoint_at_epoch command: 
- /bin/bash @@ -245,10 +244,8 @@ def list_phase1_final_model(): value: /tmp resources: requests: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} limits: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} volumes: - name: input-data @@ -292,9 +289,8 @@ def list_phase1_final_model(): --max_batch_len={max_batch_len} \ --seed={seed} \ --cpu_offload_optimizer \ - --cpu_offload_params \ + --cpu_offload_params_fsdp \ --distributed_training_framework fsdp \ - --is_granite \ --checkpoint_at_epoch command: - /bin/bash @@ -327,10 +323,8 @@ def list_phase1_final_model(): value: /tmp resources: requests: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} limits: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} volumes: - name: input-data From b235539466a276dddb53ac28d355b33f8d25bbe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Wed, 4 Dec 2024 10:10:18 +0100 Subject: [PATCH 3/5] fix: set_precomputed_skills_data_ratio if EACCES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 1.3, we cannot edit /usr/share/instructlab/sdg/default_data_recipes/skills.yaml, thus we had to make adjustments to override the SDG DataMixer class to pass a different skills file. Also, sdg_sampling_size is now optional in the pipeline. Signed-off-by: Sébastien Han --- pipeline.py | 2 +- pipeline.yaml | 95 ++++++++++++++++++++------- sdg/components.py | 134 +++++++++++++++++++++++++++++--------- standalone/standalone.py | 136 ++++++++++++++++++++++++++++++--------- 4 files changed, 281 insertions(+), 86 deletions(-) diff --git a/pipeline.py b/pipeline.py index 177179ed..64118bda 100644 --- a/pipeline.py +++ b/pipeline.py @@ -115,7 +115,7 @@ def pipeline( sdg_scale_factor: int = 2, # Renamed upstream https://github.com/instructlab/instructlab/blob/f7d40f6ed5112d59132dd832bd332fa6fbbe7010/src/instructlab/configuration.py#L279-L290 sdg_pipeline: str = SDG_PIPELINE, sdg_max_batch_len: int = MAX_BATCH_LEN, - sdg_sample_size: float = 1.0, + sdg_sample_size: float = None, # Training phase train_nproc_per_node: int = 3, train_nnodes: int = 2, diff --git a/pipeline.yaml b/pipeline.yaml index e89b9f47..d8cef59a 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -15,7 +15,7 @@ # sdg_repo_branch: str # sdg_repo_pr: int # sdg_repo_url: str [Default: 'https://github.com/instructlab/taxonomy.git'] -# sdg_sample_size: float [Default: 1.0] +# sdg_sample_size: float # sdg_scale_factor: int [Default: 2.0] # train_effective_batch_size_phase_1: int [Default: 3840.0] # train_effective_batch_size_phase_2: int [Default: 3840.0] @@ -522,7 +522,6 @@ components: isOptional: true parameterType: STRING sdg_sampling_size: - defaultValue: 1.0 isOptional: true parameterType: NUMBER_DOUBLE taxonomy_path: @@ -1536,32 +1535,81 @@ deploymentSpec: \ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\ \ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \ \ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\ - ,\n sdg_sampling_size: float = 1.0,\n):\n from os import getenv, path\n\ - \n import openai\n import yaml\n from instructlab.sdg import generate_data\n\ - \ from instructlab.sdg.utils.taxonomy import read_taxonomy\n\n def\ - \ set_precomputed_skills_data_ratio(sampling_size: float):\n skills_recipe\ - \ = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\n \ - \ if path.exists(skills_recipe):\n with open(skills_recipe,\ - \ \"r\") as file:\n skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\ - \n skills_yaml[\"datasets\"][0][\"sampling_size\"] = 
sampling_size\n\ - \n with open(skills_recipe, \"w\", encoding=\"utf-8\") as file:\n\ - \ yaml.dump(skills_yaml, file)\n\n api_key = getenv(\"\ - api_key\")\n model = getenv(\"model\")\n endpoint = getenv(\"endpoint\"\ - )\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"):\n import\ - \ httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\ + ,\n sdg_sampling_size: float = None,\n):\n from os import getenv,\ + \ path\n\n import instructlab.sdg\n import openai\n import yaml\n\ + \n api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n \ + \ endpoint = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\ + ):\n import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\ \ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\ \ http_client=custom_http_client\n )\n else:\n client =\ \ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\ \ = \"main\" if repo_branch or (repo_pr and int(repo_pr) > 0) else \"empty\"\ \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\ - \ read_taxonomy(\n taxonomy_path, taxonomy_base, document_output_dir=f\"\ - {sdg_path}/documents\"\n )\n )\n\n # sset_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)\n\ - \n # generate_data has a magic word for its taxonomy_base argument -\ - \ 'empty'\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ - \ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\ - \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n \ - \ taxonomy_base=taxonomy_base,\n model_name=model,\n pipeline=pipeline,\n\ - \ chunk_word_count=1000,\n server_ctx_size=4096,\n )\n\n" + \ instructlab.sdg.utils.taxonomy.read_taxonomy(\n taxonomy_path,\ + \ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n \ + \ )\n )\n\n # Generate synthetic dataset\n if sdg_sampling_size\ + \ is None:\n # generate_data has a magic word for its taxonomy_base\ + \ argument - 'empty'\n # it allows generating from the whole repo,\ + \ see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ + \ instructlab.sdg.generate_data(\n client=client,\n \ + \ num_instructions_to_generate=num_instructions_to_generate,\n\ + \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\ + \ taxonomy_base=taxonomy_base,\n model_name=model,\n\ + \ pipeline=pipeline,\n chunk_word_count=1000,\n \ + \ server_ctx_size=4096,\n )\n # Tweak precomputed skills\ + \ data ratio if needed\n else:\n skills_recipe = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\ + \n\n def set_precomputed_skills_data_ratio(sampling_size: float,\ + \ skills_recipe: str):\n if path.exists(skills_recipe):\n \ + \ with open(skills_recipe, \"r\", encoding=\"utf-8\") as file:\n\ + \ skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\ + \n skills_yaml[\"datasets\"][0][\"sampling_size\"] = sampling_size\n\ + \n with open(skills_recipe, \"w\", encoding=\"utf-8\") as\ + \ file:\n yaml.dump(skills_yaml, file)\n\n try:\n\ + \ set_precomputed_skills_data_ratio(\n sampling_size=sdg_sampling_size,\ + \ skills_recipe=skills_recipe\n )\n except PermissionError:\n\ + \ print(\"Failed to set precomputed skills data ratio: Permission\ + \ denied\")\n print(\"Attempting to override DataMixer class\ + \ to set the ratio\")\n import os\n import shutil\n\ + \ import tempfile\n\n 
import xdg_base_dirs\n\n \ + \ # Create a temporary directory\n with tempfile.TemporaryDirectory()\ + \ as temp_dir:\n # Create a default_data_recipes directory\n\ + \ temp_dir = path.join(temp_dir, \"default_data_recipes\"\ + )\n os.mkdir(temp_dir)\n\n # Copy default_data_recipes/skills.yaml\ + \ to the temporary directory\n shutil.copy(skills_recipe,\ + \ temp_dir)\n\n # Also copy the current pipeline directory\ + \ to the temporary directory - it's a small\n # directory\ + \ like 28KB\n # This isn't needed if the pipeline is either\ + \ \"full\" or \"simple\" but it's future-proofing\n data_dirs\ + \ = [\n os.path.join(str(dir), \"instructlab\", \"sdg\"\ + )\n for dir in xdg_base_dirs.xdg_data_dirs()\n \ + \ ]\n temp_pipeline_dir = path.join(temp_dir, \"\ + pipeline\")\n os.mkdir(temp_pipeline_dir)\n \ + \ for d in data_dirs:\n pipeline_path = os.path.join(d,\ + \ \"pipelines\", pipeline)\n if os.path.exists(pipeline_path):\n\ + \ shutil.copytree(pipeline_path, temp_pipeline_dir)\n\ + \ break\n\n # Build new skills.yaml\ + \ path\n new_skills_recipe = path.join(temp_dir, \"skills.yaml\"\ + )\n print(f\"New skills recipe path: {new_skills_recipe}\"\ + )\n\n # Override XDG_DATA_DIRS with the temporary directory\n\ + \ # This allows SDG to read the new skills.yaml since it's\ + \ looking into XDG_DATA_DIRS\n # and looks for a default_data_recipes\ + \ directory with a skills.yaml file\n os.environ[\"XDG_DATA_DIRS\"\ + ] = f\"{temp_dir}\"\n\n # Try to set the precomputed skills\ + \ data ratio again\n try:\n set_precomputed_skills_data_ratio(\n\ + \ sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe\n\ + \ )\n print(\"Successfully set precomputed\ + \ skills data ratio\")\n\n # generate_data has a magic\ + \ word for its taxonomy_base argument - 'empty'\n # it\ + \ allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ + \ instructlab.sdg.generate_data(\n \ + \ client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\ + \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\ + \ taxonomy_base=taxonomy_base,\n \ + \ model_name=model,\n pipeline=pipeline,\n\ + \ chunk_word_count=1000,\n \ + \ server_ctx_size=4096,\n )\n except\ + \ Exception as e:\n print(f\"Failed to set precomputed\ + \ skills data ratio: {e}\")\n raise\n\n" env: - name: HOME value: /tmp @@ -2091,7 +2139,6 @@ root: isOptional: true parameterType: STRING sdg_sample_size: - defaultValue: 1.0 description: SDG parameter. Represents the sdg skills recipe sampling size as percentage in decimal form. 
isOptional: true diff --git a/sdg/components.py b/sdg/components.py index a81acb01..941815b9 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -35,25 +35,13 @@ def sdg_op( repo_pr: Optional[int], taxonomy_path: str = "/data/taxonomy", sdg_path: str = "/data/sdg", - sdg_sampling_size: float = 1.0, + sdg_sampling_size: float = None, ): from os import getenv, path + import instructlab.sdg import openai import yaml - from instructlab.sdg import generate_data - from instructlab.sdg.utils.taxonomy import read_taxonomy - - def set_precomputed_skills_data_ratio(sampling_size: float): - skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" - if path.exists(skills_recipe): - with open(skills_recipe, "r") as file: - skills_yaml = yaml.load(file, Loader=yaml.Loader) - - skills_yaml["datasets"][0]["sampling_size"] = sampling_size - - with open(skills_recipe, "w", encoding="utf-8") as file: - yaml.dump(skills_yaml, file) api_key = getenv("api_key") model = getenv("model") @@ -74,27 +62,111 @@ def set_precomputed_skills_data_ratio(sampling_size: float): print("Generating synthetic dataset for:") print() print( - read_taxonomy( + instructlab.sdg.utils.taxonomy.read_taxonomy( taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents" ) ) - # sset_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size) - - # generate_data has a magic word for its taxonomy_base argument - 'empty' - # it allows generating from the whole repo, see: - # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 - generate_data( - client=client, - num_instructions_to_generate=num_instructions_to_generate, - output_dir=sdg_path, - taxonomy=taxonomy_path, - taxonomy_base=taxonomy_base, - model_name=model, - pipeline=pipeline, - chunk_word_count=1000, - server_ctx_size=4096, - ) + # Generate synthetic dataset + if sdg_sampling_size is None: + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + # Tweak precomputed skills data ratio if needed + else: + skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" + + def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): + if path.exists(skills_recipe): + with open(skills_recipe, "r", encoding="utf-8") as file: + skills_yaml = yaml.load(file, Loader=yaml.Loader) + + skills_yaml["datasets"][0]["sampling_size"] = sampling_size + + with open(skills_recipe, "w", encoding="utf-8") as file: + yaml.dump(skills_yaml, file) + + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=skills_recipe + ) + except PermissionError: + print("Failed to set precomputed skills data ratio: Permission denied") + print("Attempting to override DataMixer class to set the ratio") + import os + import shutil + import tempfile + + import xdg_base_dirs + + # Create a temporary directory + with tempfile.TemporaryDirectory() as temp_dir: + # Create a default_data_recipes directory + temp_dir = path.join(temp_dir, "default_data_recipes") + 
os.mkdir(temp_dir) + + # Copy default_data_recipes/skills.yaml to the temporary directory + shutil.copy(skills_recipe, temp_dir) + + # Also copy the current pipeline directory to the temporary directory - it's a small + # directory like 28KB + # This isn't needed if the pipeline is either "full" or "simple" but it's future-proofing + data_dirs = [ + os.path.join(str(dir), "instructlab", "sdg") + for dir in xdg_base_dirs.xdg_data_dirs() + ] + temp_pipeline_dir = path.join(temp_dir, "pipeline") + os.mkdir(temp_pipeline_dir) + for d in data_dirs: + pipeline_path = os.path.join(d, "pipelines", pipeline) + if os.path.exists(pipeline_path): + shutil.copytree(pipeline_path, temp_pipeline_dir) + break + + # Build new skills.yaml path + new_skills_recipe = path.join(temp_dir, "skills.yaml") + print(f"New skills recipe path: {new_skills_recipe}") + + # Override XDG_DATA_DIRS with the temporary directory + # This allows SDG to read the new skills.yaml since it's looking into XDG_DATA_DIRS + # and looks for a default_data_recipes directory with a skills.yaml file + os.environ["XDG_DATA_DIRS"] = f"{temp_dir}" + + # Try to set the precomputed skills data ratio again + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe + ) + print("Successfully set precomputed skills data ratio") + + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + except Exception as e: + print(f"Failed to set precomputed skills data ratio: {e}") + raise @dsl.container_component diff --git a/standalone/standalone.py b/standalone/standalone.py index 028ea995..7c1f41ed 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -1130,25 +1130,13 @@ def sdg_op( repo_pr: Optional[int], taxonomy_path: str = "/data/taxonomy", sdg_path: str = "/data/sdg", - sdg_sampling_size: float = 1.0, + sdg_sampling_size: float = None, ): from os import getenv, path + import instructlab.sdg import openai import yaml - from instructlab.sdg import generate_data - from instructlab.sdg.utils.taxonomy import read_taxonomy - - def set_precomputed_skills_data_ratio(sampling_size: float): - skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" - if path.exists(skills_recipe): - with open(skills_recipe, "r") as file: - skills_yaml = yaml.load(file, Loader=yaml.Loader) - - skills_yaml["datasets"][0]["sampling_size"] = sampling_size - - with open(skills_recipe, "w", encoding="utf-8") as file: - yaml.dump(skills_yaml, file) api_key = getenv("api_key") model = getenv("model") @@ -1168,24 +1156,112 @@ def set_precomputed_skills_data_ratio(sampling_size: float): print("Generating synthetic dataset for:") print() - print(read_taxonomy(taxonomy_path, taxonomy_base)) + print( + instructlab.sdg.utils.taxonomy.read_taxonomy( + taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents" + ) + ) - set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size) + # Generate synthetic dataset + if sdg_sampling_size is None: + # generate_data has a magic word for its taxonomy_base argument 
- 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + # Tweak precomputed skills data ratio if needed + else: + skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" - # generate_data has a magic word for its taxonomy_base argument - 'empty' - # it allows generating from the whole repo, see: - # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 - generate_data( - client=client, - num_instructions_to_generate=num_instructions_to_generate, - output_dir=sdg_path, - taxonomy=taxonomy_path, - taxonomy_base=taxonomy_base, - model_name=model, - pipeline=pipeline, - chunk_word_count=1000, - server_ctx_size=4096, - ) + def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): + if path.exists(skills_recipe): + with open(skills_recipe, "r", encoding="utf-8") as file: + skills_yaml = yaml.load(file, Loader=yaml.Loader) + + skills_yaml["datasets"][0]["sampling_size"] = sampling_size + + with open(skills_recipe, "w", encoding="utf-8") as file: + yaml.dump(skills_yaml, file) + + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=skills_recipe + ) + except PermissionError: + print("Failed to set precomputed skills data ratio: Permission denied") + print("Attempting to override DataMixer class to set the ratio") + import os + import shutil + import tempfile + + import xdg_base_dirs + + # Create a temporary directory + with tempfile.TemporaryDirectory() as temp_dir: + # Create a default_data_recipes directory + temp_dir = path.join(temp_dir, "default_data_recipes") + os.mkdir(temp_dir) + + # Copy default_data_recipes/skills.yaml to the temporary directory + shutil.copy(skills_recipe, temp_dir) + + # Also copy the current pipeline directory to the temporary directory - it's a small + # directory like 28KB + # This isn't needed if the pipeline is either "full" or "simple" but it's future-proofing + data_dirs = [ + os.path.join(str(dir), "instructlab", "sdg") + for dir in xdg_base_dirs.xdg_data_dirs() + ] + temp_pipeline_dir = path.join(temp_dir, "pipeline") + os.mkdir(temp_pipeline_dir) + for d in data_dirs: + pipeline_path = os.path.join(d, "pipelines", pipeline) + if os.path.exists(pipeline_path): + shutil.copytree(pipeline_path, temp_pipeline_dir) + break + + # Build new skills.yaml path + new_skills_recipe = path.join(temp_dir, "skills.yaml") + print(f"New skills recipe path: {new_skills_recipe}") + + # Override XDG_DATA_DIRS with the temporary directory + # This allows SDG to read the new skills.yaml since it's looking into XDG_DATA_DIRS + # and looks for a default_data_recipes directory with a skills.yaml file + os.environ["XDG_DATA_DIRS"] = f"{temp_dir}" + + # Try to set the precomputed skills data ratio again + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe + ) + print("Successfully set precomputed skills data ratio") + + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # 
https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + except Exception as e: + print(f"Failed to set precomputed skills data ratio: {e}") + raise """ exec_sdg_op_args = f""" sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", sdg_sampling_size={sdg_sampling_size}) From 262fb94ab94a2225b91108777e6b17134defcc77 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Thu, 5 Dec 2024 15:25:24 -0500 Subject: [PATCH 4/5] set copytree dirs_exist_ok to True in sdg op Signed-off-by: Michael Clifford --- pipeline.yaml | 21 ++++++++++++--------- sdg/components.py | 12 +++++++++--- standalone/standalone.py | 12 +++++++++--- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/pipeline.yaml b/pipeline.yaml index d8cef59a..9a73e564 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -1568,8 +1568,8 @@ deploymentSpec: \ set_precomputed_skills_data_ratio(\n sampling_size=sdg_sampling_size,\ \ skills_recipe=skills_recipe\n )\n except PermissionError:\n\ \ print(\"Failed to set precomputed skills data ratio: Permission\ - \ denied\")\n print(\"Attempting to override DataMixer class\ - \ to set the ratio\")\n import os\n import shutil\n\ + \ denied\")\n print(\"Attempting to move default data recipes\ + \ to temporary directory\")\n import os\n import shutil\n\ \ import tempfile\n\n import xdg_base_dirs\n\n \ \ # Create a temporary directory\n with tempfile.TemporaryDirectory()\ \ as temp_dir:\n # Create a default_data_recipes directory\n\ @@ -1586,19 +1586,22 @@ deploymentSpec: pipeline\")\n os.mkdir(temp_pipeline_dir)\n \ \ for d in data_dirs:\n pipeline_path = os.path.join(d,\ \ \"pipelines\", pipeline)\n if os.path.exists(pipeline_path):\n\ - \ shutil.copytree(pipeline_path, temp_pipeline_dir)\n\ - \ break\n\n # Build new skills.yaml\ - \ path\n new_skills_recipe = path.join(temp_dir, \"skills.yaml\"\ - )\n print(f\"New skills recipe path: {new_skills_recipe}\"\ - )\n\n # Override XDG_DATA_DIRS with the temporary directory\n\ + \ shutil.copytree(\n pipeline_path,\n\ + \ temp_pipeline_dir,\n \ + \ dirs_exist_ok=True,\n )\n \ + \ break\n\n # Build new skills.yaml path\n \ + \ new_skills_recipe = path.join(temp_dir, \"skills.yaml\")\n \ + \ print(f\"New skills recipe path: {new_skills_recipe}\")\n\n\ + \ # Override XDG_DATA_DIRS with the temporary directory\n\ \ # This allows SDG to read the new skills.yaml since it's\ \ looking into XDG_DATA_DIRS\n # and looks for a default_data_recipes\ \ directory with a skills.yaml file\n os.environ[\"XDG_DATA_DIRS\"\ ] = f\"{temp_dir}\"\n\n # Try to set the precomputed skills\ \ data ratio again\n try:\n set_precomputed_skills_data_ratio(\n\ \ sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe\n\ - \ )\n print(\"Successfully set precomputed\ - \ skills data ratio\")\n\n # generate_data has a magic\ + \ )\n print(\n \ + \ f\"Successfully set precomputed skills data ratio to {sdg_sampling_size}\"\ + \n )\n\n # generate_data has a magic\ \ word for its taxonomy_base argument - 'empty'\n # it\ \ 
allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ \ instructlab.sdg.generate_data(\n \ diff --git a/sdg/components.py b/sdg/components.py index 941815b9..40f0b4bd 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -103,7 +103,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): ) except PermissionError: print("Failed to set precomputed skills data ratio: Permission denied") - print("Attempting to override DataMixer class to set the ratio") + print("Attempting to move default data recipes to temporary directory") import os import shutil import tempfile @@ -131,7 +131,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): - shutil.copytree(pipeline_path, temp_pipeline_dir) + shutil.copytree( + pipeline_path, + temp_pipeline_dir, + dirs_exist_ok=True, + ) break # Build new skills.yaml path @@ -148,7 +152,9 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): set_precomputed_skills_data_ratio( sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe ) - print("Successfully set precomputed skills data ratio") + print( + f"Successfully set precomputed skills data ratio to {sdg_sampling_size}" + ) # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: diff --git a/standalone/standalone.py b/standalone/standalone.py index 7c1f41ed..f85c4764 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -1198,7 +1198,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): ) except PermissionError: print("Failed to set precomputed skills data ratio: Permission denied") - print("Attempting to override DataMixer class to set the ratio") + print("Attempting to move default data recipes to temporary directory") import os import shutil import tempfile @@ -1226,7 +1226,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): - shutil.copytree(pipeline_path, temp_pipeline_dir) + shutil.copytree( + pipeline_path, + temp_pipeline_dir, + dirs_exist_ok=True, + ) break # Build new skills.yaml path @@ -1243,7 +1247,9 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): set_precomputed_skills_data_ratio( sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe ) - print("Successfully set precomputed skills data ratio") + print( + f"Successfully set precomputed skills data ratio to {sdg_sampling_size}" + ) # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: From 46b91495d8e92d3e362137497662140a99174dc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Mon, 9 Dec 2024 14:18:32 +0100 Subject: [PATCH 5/5] fix: set a default to sdg_sample_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not use None since it is not supported by the pipeline. Use the default 1.0 and compare against it to determine whether we need to tweak it. 
Signed-off-by: Sébastien Han --- pipeline.py | 2 +- pipeline.yaml | 20 +++++++++++--------- sdg/components.py | 5 +++-- standalone/standalone.py | 5 +++-- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/pipeline.py b/pipeline.py index 64118bda..177179ed 100644 --- a/pipeline.py +++ b/pipeline.py @@ -115,7 +115,7 @@ def pipeline( sdg_scale_factor: int = 2, # Renamed upstream https://github.com/instructlab/instructlab/blob/f7d40f6ed5112d59132dd832bd332fa6fbbe7010/src/instructlab/configuration.py#L279-L290 sdg_pipeline: str = SDG_PIPELINE, sdg_max_batch_len: int = MAX_BATCH_LEN, - sdg_sample_size: float = None, + sdg_sample_size: float = 1.0, # Training phase train_nproc_per_node: int = 3, train_nnodes: int = 2, diff --git a/pipeline.yaml b/pipeline.yaml index 9a73e564..823d7567 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -15,7 +15,7 @@ # sdg_repo_branch: str # sdg_repo_pr: int # sdg_repo_url: str [Default: 'https://github.com/instructlab/taxonomy.git'] -# sdg_sample_size: float +# sdg_sample_size: float [Default: 1.0] # sdg_scale_factor: int [Default: 2.0] # train_effective_batch_size_phase_1: int [Default: 3840.0] # train_effective_batch_size_phase_2: int [Default: 3840.0] @@ -522,6 +522,7 @@ components: isOptional: true parameterType: STRING sdg_sampling_size: + defaultValue: 1.0 isOptional: true parameterType: NUMBER_DOUBLE taxonomy_path: @@ -1535,10 +1536,10 @@ deploymentSpec: \ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\ \ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \ \ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\ - ,\n sdg_sampling_size: float = None,\n):\n from os import getenv,\ - \ path\n\n import instructlab.sdg\n import openai\n import yaml\n\ - \n api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n \ - \ endpoint = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\ + ,\n sdg_sampling_size: float = 1.0,\n):\n from os import getenv, path\n\ + \n import instructlab.sdg\n import openai\n import yaml\n\n \ + \ api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n endpoint\ + \ = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\ ):\n import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\ \ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\ \ http_client=custom_http_client\n )\n else:\n client =\ @@ -1547,10 +1548,10 @@ deploymentSpec: \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\ \ instructlab.sdg.utils.taxonomy.read_taxonomy(\n taxonomy_path,\ \ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n \ - \ )\n )\n\n # Generate synthetic dataset\n if sdg_sampling_size\ - \ is None:\n # generate_data has a magic word for its taxonomy_base\ - \ argument - 'empty'\n # it allows generating from the whole repo,\ - \ see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ + \ )\n )\n\n # Generate synthetic dataset\n # 1.0 is the default\ + \ size\n if sdg_sampling_size == 1.0:\n # generate_data has a\ + \ magic word for its taxonomy_base argument - 'empty'\n # it allows\ + \ generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ \ instructlab.sdg.generate_data(\n client=client,\n \ \ num_instructions_to_generate=num_instructions_to_generate,\n\ \ output_dir=sdg_path,\n 
taxonomy=taxonomy_path,\n\ @@ -2142,6 +2143,7 @@ root: isOptional: true parameterType: STRING sdg_sample_size: + defaultValue: 1.0 description: SDG parameter. Represents the sdg skills recipe sampling size as percentage in decimal form. isOptional: true diff --git a/sdg/components.py b/sdg/components.py index 40f0b4bd..e3370e67 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -35,7 +35,7 @@ def sdg_op( repo_pr: Optional[int], taxonomy_path: str = "/data/taxonomy", sdg_path: str = "/data/sdg", - sdg_sampling_size: float = None, + sdg_sampling_size: float = 1.0, ): from os import getenv, path @@ -68,7 +68,8 @@ def sdg_op( ) # Generate synthetic dataset - if sdg_sampling_size is None: + # 1.0 is the default size + if sdg_sampling_size == 1.0: # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 diff --git a/standalone/standalone.py b/standalone/standalone.py index f85c4764..9078702e 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -1130,7 +1130,7 @@ def sdg_op( repo_pr: Optional[int], taxonomy_path: str = "/data/taxonomy", sdg_path: str = "/data/sdg", - sdg_sampling_size: float = None, + sdg_sampling_size: float = 1.0, ): from os import getenv, path @@ -1163,7 +1163,8 @@ def sdg_op( ) # Generate synthetic dataset - if sdg_sampling_size is None: + # 1.0 is the default size + if sdg_sampling_size == 1.0: # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
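
Patches 3-5 together make sdg_op honour a non-default sdg_sampling_size by rewriting the skills recipe, and, when the packaged file is not writable (EACCES on RHEL AI 1.3), fall back to a temporary copy of default_data_recipes exposed through XDG_DATA_DIRS. Below is a minimal standalone sketch of that fallback, assuming only PyYAML; the recipe path comes from the patches, while apply_sampling_size/set_sampling_size are illustrative names, the copy of the pipeline directory is omitted for brevity, and the real component goes on to call instructlab.sdg.generate_data inside the with-block.

import os
import shutil
import tempfile

import yaml

# Packaged recipe path used by the component (read-only on RHEL AI 1.3).
SKILLS_RECIPE = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"


def set_sampling_size(recipe_path: str, sampling_size: float) -> None:
    # Rewrite the first dataset's sampling_size in a skills recipe file.
    with open(recipe_path, "r", encoding="utf-8") as f:
        recipe = yaml.safe_load(f)
    recipe["datasets"][0]["sampling_size"] = sampling_size
    with open(recipe_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(recipe, f)


def apply_sampling_size(sampling_size: float) -> None:
    try:
        # Happy path: the packaged recipe is writable in place.
        set_sampling_size(SKILLS_RECIPE, sampling_size)
    except PermissionError:
        # Fallback mirrored from the patches: copy the recipe into a temporary
        # default_data_recipes directory and point XDG_DATA_DIRS at it so SDG
        # reads the modified copy instead of the read-only packaged file.
        with tempfile.TemporaryDirectory() as tmp:
            recipes_dir = os.path.join(tmp, "default_data_recipes")
            os.mkdir(recipes_dir)
            shutil.copy(SKILLS_RECIPE, recipes_dir)
            set_sampling_size(os.path.join(recipes_dir, "skills.yaml"), sampling_size)
            os.environ["XDG_DATA_DIRS"] = recipes_dir
            # instructlab.sdg.generate_data(...) would run here, still inside
            # the with-block, while the temporary directory exists.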