diff --git a/importer-pipeline.yaml b/importer-pipeline.yaml index c4486c7e..b34e0d97 100644 --- a/importer-pipeline.yaml +++ b/importer-pipeline.yaml @@ -32,7 +32,7 @@ deploymentSpec: env: - name: REGISTRY_AUTH_FILE value: /mnt/containers/.dockerconfigjson - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 pipelineInfo: description: Helper pipeline to the InstructLab pipeline which allows users to seed/import a new base model diff --git a/pipeline.py b/pipeline.py index 5269088b..177179ed 100644 --- a/pipeline.py +++ b/pipeline.py @@ -271,6 +271,7 @@ def pipeline( ) data_processing_task.after(model_to_pvc_task, sdg_task) data_processing_task.set_caching_options(False) + data_processing_task.set_env_variable("XDG_CACHE_HOME", "/tmp") set_image_pull_secrets(data_processing_task, [IMAGE_PULL_SECRET]) diff --git a/pipeline.yaml b/pipeline.yaml index 23b8999c..823d7567 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -645,7 +645,10 @@ deploymentSpec: \ max_seq_len=train_args.max_seq_len,\n chat_tmpl_path=train_args.chat_tmpl_path,\n\ \ )\n )\n\n data_processing(train_args=skill_training_args)\n\ \ data_processing(train_args=knowledge_training_args)\n\n" - image: quay.io/redhat-et/ilab:1.2 + env: + - name: XDG_CACHE_HOME + value: /tmp + image: quay.io/redhat-et/ilab:1.3 exec-deletepvc: container: image: argostub/deletepvc @@ -744,7 +747,7 @@ deploymentSpec: \ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\ \ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\ \ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\ - Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.2\"\ + Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.3\"\ \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ \ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\ @@ -773,29 +776,28 @@ deploymentSpec: \ --log_level=INFO \\\n \ \ --max_batch_len={max_batch_len} \\\n \ \ --seed={seed} \\\n --cpu_offload_optimizer\ - \ \\\n --cpu_offload_params \\\n \ - \ --distributed_training_framework fsdp \\\n \ - \ --is_granite \\\n --checkpoint_at_epoch\n\ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath:\ - \ /input_data\n name: input-data\n \ - \ readOnly: true\n - mountPath: /input_model\n\ - \ name: model\n readOnly:\ - \ true\n - mountPath: /output\n \ - \ name: output\n env:\n \ - \ - name: NNODES\n value: \\\"{nnodes}\\\"\n\ - \ - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\ + \ \\\n --cpu_offload_params_fsdp \\\n \ + \ --distributed_training_framework fsdp \\\n \ + \ --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n image:\ + \ {image}\n name: pytorch\n volumeMounts:\n\ + \ - mountPath: /input_data\n \ + \ name: input-data\n readOnly: true\n \ + \ - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n name: output\n\ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n \ + \ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\ + \"\n - name: XDG_CACHE_HOME\n \ + \ value: /tmp\n - name: TRITON_CACHE_DIR\n\ \ value: /tmp\n - name:\ - \ TRITON_CACHE_DIR\n value: /tmp\n \ - \ - name: HF_HOME\n value: /tmp\n \ - \ - name: TRANSFORMERS_CACHE\n \ - \ value: /tmp\n resources:\n \ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n 
limits:\n\ - \ cpu: 8\n \"nvidia.com/gpu\"\ + \ HF_HOME\n value: /tmp\n \ + \ - name: TRANSFORMERS_CACHE\n value: /tmp\n\ + \ resources:\n requests:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n \ + \ limits:\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n volumes:\n - name:\ \ input-data\n persistentVolumeClaim:\n \ \ claimName: {input_pvc_name}\n - name: model\n\ @@ -824,41 +826,39 @@ deploymentSpec: \ --save_samples={save_samples} \\\n \ \ --log_level=INFO \\\n --max_batch_len={max_batch_len}\ \ \\\n --seed={seed} \\\n \ - \ --cpu_offload_optimizer \\\n --cpu_offload_params\ + \ --cpu_offload_optimizer \\\n --cpu_offload_params_fsdp\ \ \\\n --distributed_training_framework fsdp\ - \ \\\n --is_granite \\\n \ - \ --checkpoint_at_epoch\n command:\n \ - \ - /bin/bash\n - '-c'\n \ - \ - '--'\n image: {image}\n \ - \ name: pytorch\n volumeMounts:\n \ - \ - mountPath: /input_data\n \ - \ name: input-data\n readOnly: true\n \ - \ - mountPath: /input_model\n name:\ - \ model\n readOnly: true\n \ - \ - mountPath: /output\n name: output\n \ - \ readOnly: true\n env:\n \ - \ - name: NNODES\n value: \\\ - \"{nnodes}\\\"\n - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n \ - \ - name: XDG_CACHE_HOME\n value: /tmp\n \ - \ - name: TRITON_CACHE_DIR\n \ - \ value: /tmp\n - name: HF_HOME\n \ - \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ + \ \\\n --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ + \ name: input-data\n readOnly:\ + \ true\n - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n \ + \ name: output\n readOnly: true\n \ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n - name: NPROC_PER_NODE\n\ + \ value: \\\"{nproc_per_node}\\\"\n \ + \ - name: XDG_CACHE_HOME\n value: /tmp\n\ + \ - name: TRITON_CACHE_DIR\n \ + \ value: /tmp\n - name: HF_HOME\n \ + \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ \ value: /tmp\n resources:\n\ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ limits:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ volumes:\n - name: input-data\n \ - \ persistentVolumeClaim:\n claimName: {input_pvc_name}\n\ - \ - name: model\n persistentVolumeClaim:\n\ - \ claimName: {model_pvc_name}\n \ - \ - name: output\n persistentVolumeClaim:\n \ - \ claimName: {output_pvc_name}\n \"\"\"\n )\n\ - \n try:\n manifest_yaml = yaml.safe_load(manifest)\n except\ - \ yaml.YAMLError as exc:\n raise RuntimeError(f\"Error parsing manifest:\ - \ {exc}\") from exc\n\n # Discover the namespace in which the pod is\ - \ running\n with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ + \ requests:\n \"nvidia.com/gpu\"\ + : {nproc_per_node}\n limits:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\ + \ - name: input-data\n persistentVolumeClaim:\n\ + \ claimName: {input_pvc_name}\n \ + \ - name: model\n persistentVolumeClaim:\n \ + \ claimName: {model_pvc_name}\n - name:\ + \ output\n persistentVolumeClaim:\n \ + \ claimName: {output_pvc_name}\n \"\"\"\n )\n\n try:\n\ + \ manifest_yaml = yaml.safe_load(manifest)\n except yaml.YAMLError\ + \ as exc:\n raise RuntimeError(f\"Error parsing manifest: {exc}\"\ + ) from exc\n\n # Discover the namespace in which the pod is running\n\ + \ with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ , \"r\", encoding=\"utf-8\"\n ) as f:\n namespace = f.read().strip()\n\ \ print(f\"The pod is running in the 
namespace: {namespace}\")\n\n\ \ try:\n kubernetes.config.load_kube_config()\n print(\"\ @@ -951,7 +951,7 @@ deploymentSpec: \ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\ \ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\ \ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\ - Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.2\"\ + Unsupported value of {phase_num=}\")\n\n image = \"quay.io/redhat-et/ilab:1.3\"\ \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ \ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\ @@ -980,29 +980,28 @@ deploymentSpec: \ --log_level=INFO \\\n \ \ --max_batch_len={max_batch_len} \\\n \ \ --seed={seed} \\\n --cpu_offload_optimizer\ - \ \\\n --cpu_offload_params \\\n \ - \ --distributed_training_framework fsdp \\\n \ - \ --is_granite \\\n --checkpoint_at_epoch\n\ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath:\ - \ /input_data\n name: input-data\n \ - \ readOnly: true\n - mountPath: /input_model\n\ - \ name: model\n readOnly:\ - \ true\n - mountPath: /output\n \ - \ name: output\n env:\n \ - \ - name: NNODES\n value: \\\"{nnodes}\\\"\n\ - \ - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\ + \ \\\n --cpu_offload_params_fsdp \\\n \ + \ --distributed_training_framework fsdp \\\n \ + \ --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n image:\ + \ {image}\n name: pytorch\n volumeMounts:\n\ + \ - mountPath: /input_data\n \ + \ name: input-data\n readOnly: true\n \ + \ - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n name: output\n\ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n \ + \ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\ + \"\n - name: XDG_CACHE_HOME\n \ + \ value: /tmp\n - name: TRITON_CACHE_DIR\n\ \ value: /tmp\n - name:\ - \ TRITON_CACHE_DIR\n value: /tmp\n \ - \ - name: HF_HOME\n value: /tmp\n \ - \ - name: TRANSFORMERS_CACHE\n \ - \ value: /tmp\n resources:\n \ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n limits:\n\ - \ cpu: 8\n \"nvidia.com/gpu\"\ + \ HF_HOME\n value: /tmp\n \ + \ - name: TRANSFORMERS_CACHE\n value: /tmp\n\ + \ resources:\n requests:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n \ + \ limits:\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n volumes:\n - name:\ \ input-data\n persistentVolumeClaim:\n \ \ claimName: {input_pvc_name}\n - name: model\n\ @@ -1031,41 +1030,39 @@ deploymentSpec: \ --save_samples={save_samples} \\\n \ \ --log_level=INFO \\\n --max_batch_len={max_batch_len}\ \ \\\n --seed={seed} \\\n \ - \ --cpu_offload_optimizer \\\n --cpu_offload_params\ + \ --cpu_offload_optimizer \\\n --cpu_offload_params_fsdp\ \ \\\n --distributed_training_framework fsdp\ - \ \\\n --is_granite \\\n \ - \ --checkpoint_at_epoch\n command:\n \ - \ - /bin/bash\n - '-c'\n \ - \ - '--'\n image: {image}\n \ - \ name: pytorch\n volumeMounts:\n \ - \ - mountPath: /input_data\n \ - \ name: input-data\n readOnly: true\n \ - \ - mountPath: /input_model\n name:\ - \ model\n readOnly: true\n \ - \ - mountPath: /output\n name: output\n \ - \ readOnly: true\n env:\n \ - \ - name: NNODES\n value: \\\ - \"{nnodes}\\\"\n - name: NPROC_PER_NODE\n \ - \ value: \\\"{nproc_per_node}\\\"\n \ - \ - name: XDG_CACHE_HOME\n value: /tmp\n \ - \ - name: TRITON_CACHE_DIR\n \ - \ 
value: /tmp\n - name: HF_HOME\n \ - \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ + \ \\\n --checkpoint_at_epoch\n \ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath: /input_data\n\ + \ name: input-data\n readOnly:\ + \ true\n - mountPath: /input_model\n \ + \ name: model\n readOnly: true\n \ + \ - mountPath: /output\n \ + \ name: output\n readOnly: true\n \ + \ env:\n - name: NNODES\n \ + \ value: \\\"{nnodes}\\\"\n - name: NPROC_PER_NODE\n\ + \ value: \\\"{nproc_per_node}\\\"\n \ + \ - name: XDG_CACHE_HOME\n value: /tmp\n\ + \ - name: TRITON_CACHE_DIR\n \ + \ value: /tmp\n - name: HF_HOME\n \ + \ value: /tmp\n - name: TRANSFORMERS_CACHE\n\ \ value: /tmp\n resources:\n\ - \ requests:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ limits:\n cpu: 8\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ volumes:\n - name: input-data\n \ - \ persistentVolumeClaim:\n claimName: {input_pvc_name}\n\ - \ - name: model\n persistentVolumeClaim:\n\ - \ claimName: {model_pvc_name}\n \ - \ - name: output\n persistentVolumeClaim:\n \ - \ claimName: {output_pvc_name}\n \"\"\"\n )\n\ - \n try:\n manifest_yaml = yaml.safe_load(manifest)\n except\ - \ yaml.YAMLError as exc:\n raise RuntimeError(f\"Error parsing manifest:\ - \ {exc}\") from exc\n\n # Discover the namespace in which the pod is\ - \ running\n with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ + \ requests:\n \"nvidia.com/gpu\"\ + : {nproc_per_node}\n limits:\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\ + \ - name: input-data\n persistentVolumeClaim:\n\ + \ claimName: {input_pvc_name}\n \ + \ - name: model\n persistentVolumeClaim:\n \ + \ claimName: {model_pvc_name}\n - name:\ + \ output\n persistentVolumeClaim:\n \ + \ claimName: {output_pvc_name}\n \"\"\"\n )\n\n try:\n\ + \ manifest_yaml = yaml.safe_load(manifest)\n except yaml.YAMLError\ + \ as exc:\n raise RuntimeError(f\"Error parsing manifest: {exc}\"\ + ) from exc\n\n # Discover the namespace in which the pod is running\n\ + \ with open(\n \"/var/run/secrets/kubernetes.io/serviceaccount/namespace\"\ , \"r\", encoding=\"utf-8\"\n ) as f:\n namespace = f.read().strip()\n\ \ print(f\"The pod is running in the namespace: {namespace}\")\n\n\ \ try:\n kubernetes.config.load_kube_config()\n print(\"\ @@ -1376,7 +1373,7 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 resources: accelerator: count: '1' @@ -1512,7 +1509,7 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 resources: accelerator: count: '1' @@ -1540,36 +1537,89 @@ deploymentSpec: \ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \ \ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\ ,\n sdg_sampling_size: float = 1.0,\n):\n from os import getenv, path\n\ - \n import openai\n import yaml\n from instructlab.sdg import generate_data\n\ - \ from instructlab.sdg.utils.taxonomy import read_taxonomy\n\n def\ - \ set_precomputed_skills_data_ratio(sampling_size: float):\n skills_recipe\ - \ = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\n \ - \ if path.exists(skills_recipe):\n with open(skills_recipe,\ - \ \"r\") as file:\n skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\ - \n skills_yaml[\"datasets\"][0][\"sampling_size\"] = sampling_size\n\ - \n with open(skills_recipe, \"w\", 
encoding=\"utf-8\") as file:\n\ - \ yaml.dump(skills_yaml, file)\n\n api_key = getenv(\"\ - api_key\")\n model = getenv(\"model\")\n endpoint = getenv(\"endpoint\"\ - )\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"):\n import\ - \ httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\ + \n import instructlab.sdg\n import openai\n import yaml\n\n \ + \ api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n endpoint\ + \ = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\ + ):\n import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\ \ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\ \ http_client=custom_http_client\n )\n else:\n client =\ \ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\ \ = \"main\" if repo_branch or (repo_pr and int(repo_pr) > 0) else \"empty\"\ - \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(read_taxonomy(taxonomy_path,\ - \ taxonomy_base))\n\n set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)\n\ - \n # generate_data has a magic word for its taxonomy_base argument -\ - \ 'empty'\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ - \ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\ - \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n \ - \ taxonomy_base=taxonomy_base,\n model_name=model,\n pipeline=pipeline,\n\ - \ chunk_word_count=1000,\n server_ctx_size=4096,\n )\n\n" + \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\ + \ instructlab.sdg.utils.taxonomy.read_taxonomy(\n taxonomy_path,\ + \ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n \ + \ )\n )\n\n # Generate synthetic dataset\n # 1.0 is the default\ + \ size\n if sdg_sampling_size == 1.0:\n # generate_data has a\ + \ magic word for its taxonomy_base argument - 'empty'\n # it allows\ + \ generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ + \ instructlab.sdg.generate_data(\n client=client,\n \ + \ num_instructions_to_generate=num_instructions_to_generate,\n\ + \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\ + \ taxonomy_base=taxonomy_base,\n model_name=model,\n\ + \ pipeline=pipeline,\n chunk_word_count=1000,\n \ + \ server_ctx_size=4096,\n )\n # Tweak precomputed skills\ + \ data ratio if needed\n else:\n skills_recipe = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\ + \n\n def set_precomputed_skills_data_ratio(sampling_size: float,\ + \ skills_recipe: str):\n if path.exists(skills_recipe):\n \ + \ with open(skills_recipe, \"r\", encoding=\"utf-8\") as file:\n\ + \ skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\ + \n skills_yaml[\"datasets\"][0][\"sampling_size\"] = sampling_size\n\ + \n with open(skills_recipe, \"w\", encoding=\"utf-8\") as\ + \ file:\n yaml.dump(skills_yaml, file)\n\n try:\n\ + \ set_precomputed_skills_data_ratio(\n sampling_size=sdg_sampling_size,\ + \ skills_recipe=skills_recipe\n )\n except PermissionError:\n\ + \ print(\"Failed to set precomputed skills data ratio: Permission\ + \ denied\")\n print(\"Attempting to move default data recipes\ + \ to temporary directory\")\n import os\n import shutil\n\ + \ import tempfile\n\n import xdg_base_dirs\n\n \ + \ # Create a temporary directory\n with 
tempfile.TemporaryDirectory()\ + \ as temp_dir:\n # Create a default_data_recipes directory\n\ + \ temp_dir = path.join(temp_dir, \"default_data_recipes\"\ + )\n os.mkdir(temp_dir)\n\n # Copy default_data_recipes/skills.yaml\ + \ to the temporary directory\n shutil.copy(skills_recipe,\ + \ temp_dir)\n\n # Also copy the current pipeline directory\ + \ to the temporary directory - it's a small\n # directory\ + \ like 28KB\n # This isn't needed if the pipeline is either\ + \ \"full\" or \"simple\" but it's future-proofing\n data_dirs\ + \ = [\n os.path.join(str(dir), \"instructlab\", \"sdg\"\ + )\n for dir in xdg_base_dirs.xdg_data_dirs()\n \ + \ ]\n temp_pipeline_dir = path.join(temp_dir, \"\ + pipeline\")\n os.mkdir(temp_pipeline_dir)\n \ + \ for d in data_dirs:\n pipeline_path = os.path.join(d,\ + \ \"pipelines\", pipeline)\n if os.path.exists(pipeline_path):\n\ + \ shutil.copytree(\n pipeline_path,\n\ + \ temp_pipeline_dir,\n \ + \ dirs_exist_ok=True,\n )\n \ + \ break\n\n # Build new skills.yaml path\n \ + \ new_skills_recipe = path.join(temp_dir, \"skills.yaml\")\n \ + \ print(f\"New skills recipe path: {new_skills_recipe}\")\n\n\ + \ # Override XDG_DATA_DIRS with the temporary directory\n\ + \ # This allows SDG to read the new skills.yaml since it's\ + \ looking into XDG_DATA_DIRS\n # and looks for a default_data_recipes\ + \ directory with a skills.yaml file\n os.environ[\"XDG_DATA_DIRS\"\ + ] = f\"{temp_dir}\"\n\n # Try to set the precomputed skills\ + \ data ratio again\n try:\n set_precomputed_skills_data_ratio(\n\ + \ sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe\n\ + \ )\n print(\n \ + \ f\"Successfully set precomputed skills data ratio to {sdg_sampling_size}\"\ + \n )\n\n # generate_data has a magic\ + \ word for its taxonomy_base argument - 'empty'\n # it\ + \ allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ + \ instructlab.sdg.generate_data(\n \ + \ client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\ + \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\ + \ taxonomy_base=taxonomy_base,\n \ + \ model_name=model,\n pipeline=pipeline,\n\ + \ chunk_word_count=1000,\n \ + \ server_ctx_size=4096,\n )\n except\ + \ Exception as e:\n print(f\"Failed to set precomputed\ + \ skills data ratio: {e}\")\n raise\n\n" env: - name: HOME value: /tmp - name: HF_HOME value: /tmp - image: quay.io/redhat-et/ilab:1.2 + image: quay.io/redhat-et/ilab:1.3 exec-sdg-to-artifact-op: container: args: diff --git a/sdg/components.py b/sdg/components.py index aa2cdfd9..e3370e67 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -39,21 +39,9 @@ def sdg_op( ): from os import getenv, path + import instructlab.sdg import openai import yaml - from instructlab.sdg import generate_data - from instructlab.sdg.utils.taxonomy import read_taxonomy - - def set_precomputed_skills_data_ratio(sampling_size: float): - skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" - if path.exists(skills_recipe): - with open(skills_recipe, "r") as file: - skills_yaml = yaml.load(file, Loader=yaml.Loader) - - skills_yaml["datasets"][0]["sampling_size"] = sampling_size - - with open(skills_recipe, "w", encoding="utf-8") as file: - yaml.dump(skills_yaml, file) api_key = getenv("api_key") model = getenv("model") @@ -73,25 +61,120 @@ def set_precomputed_skills_data_ratio(sampling_size: float): print("Generating synthetic dataset for:") 
print() - print(read_taxonomy(taxonomy_path, taxonomy_base)) - - set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size) - - # generate_data has a magic word for its taxonomy_base argument - 'empty' - # it allows generating from the whole repo, see: - # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 - generate_data( - client=client, - num_instructions_to_generate=num_instructions_to_generate, - output_dir=sdg_path, - taxonomy=taxonomy_path, - taxonomy_base=taxonomy_base, - model_name=model, - pipeline=pipeline, - chunk_word_count=1000, - server_ctx_size=4096, + print( + instructlab.sdg.utils.taxonomy.read_taxonomy( + taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents" + ) ) + # Generate synthetic dataset + # 1.0 is the default size + if sdg_sampling_size == 1.0: + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + # Tweak precomputed skills data ratio if needed + else: + skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" + + def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): + if path.exists(skills_recipe): + with open(skills_recipe, "r", encoding="utf-8") as file: + skills_yaml = yaml.load(file, Loader=yaml.Loader) + + skills_yaml["datasets"][0]["sampling_size"] = sampling_size + + with open(skills_recipe, "w", encoding="utf-8") as file: + yaml.dump(skills_yaml, file) + + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=skills_recipe + ) + except PermissionError: + print("Failed to set precomputed skills data ratio: Permission denied") + print("Attempting to move default data recipes to temporary directory") + import os + import shutil + import tempfile + + import xdg_base_dirs + + # Create a temporary directory + with tempfile.TemporaryDirectory() as temp_dir: + # Create a default_data_recipes directory + temp_dir = path.join(temp_dir, "default_data_recipes") + os.mkdir(temp_dir) + + # Copy default_data_recipes/skills.yaml to the temporary directory + shutil.copy(skills_recipe, temp_dir) + + # Also copy the current pipeline directory to the temporary directory - it's a small + # directory like 28KB + # This isn't needed if the pipeline is either "full" or "simple" but it's future-proofing + data_dirs = [ + os.path.join(str(dir), "instructlab", "sdg") + for dir in xdg_base_dirs.xdg_data_dirs() + ] + temp_pipeline_dir = path.join(temp_dir, "pipeline") + os.mkdir(temp_pipeline_dir) + for d in data_dirs: + pipeline_path = os.path.join(d, "pipelines", pipeline) + if os.path.exists(pipeline_path): + shutil.copytree( + pipeline_path, + temp_pipeline_dir, + dirs_exist_ok=True, + ) + break + + # Build new skills.yaml path + new_skills_recipe = path.join(temp_dir, "skills.yaml") + print(f"New skills recipe path: {new_skills_recipe}") + + # Override XDG_DATA_DIRS with the temporary directory + # This allows SDG to read the new skills.yaml since it's looking into XDG_DATA_DIRS + # and looks for a 
default_data_recipes directory with a skills.yaml file + os.environ["XDG_DATA_DIRS"] = f"{temp_dir}" + + # Try to set the precomputed skills data ratio again + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe + ) + print( + f"Successfully set precomputed skills data ratio to {sdg_sampling_size}" + ) + + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + except Exception as e: + print(f"Failed to set precomputed skills data ratio: {e}") + raise + @dsl.container_component def taxonomy_to_artifact_op( diff --git a/standalone/standalone.py b/standalone/standalone.py index 028ea995..9078702e 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -1134,21 +1134,9 @@ def sdg_op( ): from os import getenv, path + import instructlab.sdg import openai import yaml - from instructlab.sdg import generate_data - from instructlab.sdg.utils.taxonomy import read_taxonomy - - def set_precomputed_skills_data_ratio(sampling_size: float): - skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" - if path.exists(skills_recipe): - with open(skills_recipe, "r") as file: - skills_yaml = yaml.load(file, Loader=yaml.Loader) - - skills_yaml["datasets"][0]["sampling_size"] = sampling_size - - with open(skills_recipe, "w", encoding="utf-8") as file: - yaml.dump(skills_yaml, file) api_key = getenv("api_key") model = getenv("model") @@ -1168,24 +1156,119 @@ def set_precomputed_skills_data_ratio(sampling_size: float): print("Generating synthetic dataset for:") print() - print(read_taxonomy(taxonomy_path, taxonomy_base)) + print( + instructlab.sdg.utils.taxonomy.read_taxonomy( + taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents" + ) + ) - set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size) + # Generate synthetic dataset + # 1.0 is the default size + if sdg_sampling_size == 1.0: + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + # Tweak precomputed skills data ratio if needed + else: + skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml" - # generate_data has a magic word for its taxonomy_base argument - 'empty' - # it allows generating from the whole repo, see: - # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 - generate_data( - client=client, - num_instructions_to_generate=num_instructions_to_generate, - output_dir=sdg_path, - taxonomy=taxonomy_path, - taxonomy_base=taxonomy_base, - model_name=model, - pipeline=pipeline, - 
chunk_word_count=1000, - server_ctx_size=4096, - ) + def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str): + if path.exists(skills_recipe): + with open(skills_recipe, "r", encoding="utf-8") as file: + skills_yaml = yaml.load(file, Loader=yaml.Loader) + + skills_yaml["datasets"][0]["sampling_size"] = sampling_size + + with open(skills_recipe, "w", encoding="utf-8") as file: + yaml.dump(skills_yaml, file) + + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=skills_recipe + ) + except PermissionError: + print("Failed to set precomputed skills data ratio: Permission denied") + print("Attempting to move default data recipes to temporary directory") + import os + import shutil + import tempfile + + import xdg_base_dirs + + # Create a temporary directory + with tempfile.TemporaryDirectory() as temp_dir: + # Create a default_data_recipes directory + temp_dir = path.join(temp_dir, "default_data_recipes") + os.mkdir(temp_dir) + + # Copy default_data_recipes/skills.yaml to the temporary directory + shutil.copy(skills_recipe, temp_dir) + + # Also copy the current pipeline directory to the temporary directory - it's a small + # directory like 28KB + # This isn't needed if the pipeline is either "full" or "simple" but it's future-proofing + data_dirs = [ + os.path.join(str(dir), "instructlab", "sdg") + for dir in xdg_base_dirs.xdg_data_dirs() + ] + temp_pipeline_dir = path.join(temp_dir, "pipeline") + os.mkdir(temp_pipeline_dir) + for d in data_dirs: + pipeline_path = os.path.join(d, "pipelines", pipeline) + if os.path.exists(pipeline_path): + shutil.copytree( + pipeline_path, + temp_pipeline_dir, + dirs_exist_ok=True, + ) + break + + # Build new skills.yaml path + new_skills_recipe = path.join(temp_dir, "skills.yaml") + print(f"New skills recipe path: {new_skills_recipe}") + + # Override XDG_DATA_DIRS with the temporary directory + # This allows SDG to read the new skills.yaml since it's looking into XDG_DATA_DIRS + # and looks for a default_data_recipes directory with a skills.yaml file + os.environ["XDG_DATA_DIRS"] = f"{temp_dir}" + + # Try to set the precomputed skills data ratio again + try: + set_precomputed_skills_data_ratio( + sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe + ) + print( + f"Successfully set precomputed skills data ratio to {sdg_sampling_size}" + ) + + # generate_data has a magic word for its taxonomy_base argument - 'empty' + # it allows generating from the whole repo, see: + # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 + instructlab.sdg.generate_data( + client=client, + num_instructions_to_generate=num_instructions_to_generate, + output_dir=sdg_path, + taxonomy=taxonomy_path, + taxonomy_base=taxonomy_base, + model_name=model, + pipeline=pipeline, + chunk_word_count=1000, + server_ctx_size=4096, + ) + except Exception as e: + print(f"Failed to set precomputed skills data ratio: {e}") + raise """ exec_sdg_op_args = f""" sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", sdg_sampling_size={sdg_sampling_size}) diff --git a/training/components.py b/training/components.py index a12ad747..8059bda4 100644 --- a/training/components.py +++ b/training/components.py @@ -167,7 +167,7 @@ def list_phase1_final_model(): 
else: raise RuntimeError(f"Unsupported value of {phase_num=}") - image = "quay.io/redhat-et/ilab:1.2" + image = "quay.io/redhat-et/ilab:1.3" manifest = inspect.cleandoc( f""" @@ -211,9 +211,8 @@ def list_phase1_final_model(): --max_batch_len={max_batch_len} \ --seed={seed} \ --cpu_offload_optimizer \ - --cpu_offload_params \ + --cpu_offload_params_fsdp \ --distributed_training_framework fsdp \ - --is_granite \ --checkpoint_at_epoch command: - /bin/bash @@ -245,10 +244,8 @@ def list_phase1_final_model(): value: /tmp resources: requests: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} limits: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} volumes: - name: input-data @@ -292,9 +289,8 @@ def list_phase1_final_model(): --max_batch_len={max_batch_len} \ --seed={seed} \ --cpu_offload_optimizer \ - --cpu_offload_params \ + --cpu_offload_params_fsdp \ --distributed_training_framework fsdp \ - --is_granite \ --checkpoint_at_epoch command: - /bin/bash @@ -327,10 +323,8 @@ def list_phase1_final_model(): value: /tmp resources: requests: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} limits: - cpu: 8 "nvidia.com/gpu": {nproc_per_node} volumes: - name: input-data diff --git a/utils/consts.py b/utils/consts.py index f8116212..aad0d11f 100644 --- a/utils/consts.py +++ b/utils/consts.py @@ -1,4 +1,4 @@ PYTHON_IMAGE = "quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111" TOOLBOX_IMAGE = "registry.access.redhat.com/ubi9/toolbox" OC_IMAGE = "registry.redhat.io/openshift4/ose-cli" -RHELAI_IMAGE = "quay.io/redhat-et/ilab:1.2" +RHELAI_IMAGE = "quay.io/redhat-et/ilab:1.3"
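
Note on the sdg_op changes above: when the packaged skills recipe
(/usr/share/instructlab/sdg/default_data_recipes/skills.yaml) is not writable, the new
code copies it into a temporary default_data_recipes directory, patches the
sampling_size there, and points XDG_DATA_DIRS at the copy so SDG resolves the patched
recipe instead of the read-only one. The standalone sketch below illustrates that
fallback under simplified assumptions: the helper names (patch_sampling_size,
apply_with_fallback) are illustrative and not part of the patch, the pipeline-directory
copy and the TemporaryDirectory context used in the real component are omitted, and
yaml.safe_load stands in for yaml.load.

import os
import shutil
import tempfile

import yaml


def patch_sampling_size(recipe_path: str, sampling_size: float) -> None:
    # Rewrite the first dataset's sampling_size in a skills recipe file.
    with open(recipe_path, "r", encoding="utf-8") as f:
        recipe = yaml.safe_load(f)
    recipe["datasets"][0]["sampling_size"] = sampling_size
    with open(recipe_path, "w", encoding="utf-8") as f:
        yaml.dump(recipe, f)


def apply_with_fallback(packaged_recipe: str, sampling_size: float) -> str:
    # Try to patch the packaged recipe in place; on PermissionError, fall back to a
    # writable copy under a default_data_recipes directory exposed via XDG_DATA_DIRS.
    try:
        patch_sampling_size(packaged_recipe, sampling_size)
        return packaged_recipe
    except PermissionError:
        tmp_root = tempfile.mkdtemp()  # left in place so SDG can still read it later
        recipes_dir = os.path.join(tmp_root, "default_data_recipes")
        os.mkdir(recipes_dir)
        new_recipe = shutil.copy(packaged_recipe, recipes_dir)
        patch_sampling_size(new_recipe, sampling_size)
        # SDG locates data recipes through XDG_DATA_DIRS, so point it at the copy
        # (mirroring the component above, which sets it to the recipes directory).
        os.environ["XDG_DATA_DIRS"] = recipes_dir
        return new_recipe

As in the patched component, the environment override only takes effect for SDG calls
made after it is set, so generate_data must run while the temporary directory still
exists.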