From 46b91495d8e92d3e362137497662140a99174dc0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?= <seb@redhat.com>
Date: Mon, 9 Dec 2024 14:18:32 +0100
Subject: [PATCH] fix: set a default for sdg_sample_size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Do not use None as the default since the pipeline does not support it.
Default to 1.0 instead and compare against that value to determine
whether the sampling size needs to be tweaked.

Signed-off-by: Sébastien Han <seb@redhat.com>
---
 pipeline.py              |  2 +-
 pipeline.yaml            | 20 +++++++++++---------
 sdg/components.py        |  5 +++--
 standalone/standalone.py |  5 +++--
 4 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/pipeline.py b/pipeline.py
index 64118bd..177179e 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -115,7 +115,7 @@ def pipeline(
         sdg_scale_factor: int = 2,  # Renamed upstream https://github.com/instructlab/instructlab/blob/f7d40f6ed5112d59132dd832bd332fa6fbbe7010/src/instructlab/configuration.py#L279-L290
         sdg_pipeline: str = SDG_PIPELINE,
         sdg_max_batch_len: int = MAX_BATCH_LEN,
-        sdg_sample_size: float = None,
+        sdg_sample_size: float = 1.0,
         # Training phase
         train_nproc_per_node: int = 3,
         train_nnodes: int = 2,
diff --git a/pipeline.yaml b/pipeline.yaml
index 9a73e56..823d756 100644
--- a/pipeline.yaml
+++ b/pipeline.yaml
@@ -15,7 +15,7 @@
 #    sdg_repo_branch: str
 #    sdg_repo_pr: int
 #    sdg_repo_url: str [Default: 'https://github.com/instructlab/taxonomy.git']
-#    sdg_sample_size: float
+#    sdg_sample_size: float [Default: 1.0]
 #    sdg_scale_factor: int [Default: 2.0]
 #    train_effective_batch_size_phase_1: int [Default: 3840.0]
 #    train_effective_batch_size_phase_2: int [Default: 3840.0]
@@ -522,6 +522,7 @@ components:
           isOptional: true
           parameterType: STRING
         sdg_sampling_size:
+          defaultValue: 1.0
           isOptional: true
           parameterType: NUMBER_DOUBLE
         taxonomy_path:
@@ -1535,10 +1536,10 @@ deploymentSpec:
           \ *\n\ndef sdg_op(\n    num_instructions_to_generate: int,\n    pipeline:\
           \ str,\n    repo_branch: Optional[str],\n    repo_pr: Optional[int],\n \
           \   taxonomy_path: str = \"/data/taxonomy\",\n    sdg_path: str = \"/data/sdg\"\
-          ,\n    sdg_sampling_size: float = None,\n):\n    from os import getenv,\
-          \ path\n\n    import instructlab.sdg\n    import openai\n    import yaml\n\
-          \n    api_key = getenv(\"api_key\")\n    model = getenv(\"model\")\n   \
-          \ endpoint = getenv(\"endpoint\")\n\n    if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\
+          ,\n    sdg_sampling_size: float = 1.0,\n):\n    from os import getenv, path\n\
+          \n    import instructlab.sdg\n    import openai\n    import yaml\n\n   \
+          \ api_key = getenv(\"api_key\")\n    model = getenv(\"model\")\n    endpoint\
+          \ = getenv(\"endpoint\")\n\n    if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\
           ):\n        import httpx\n\n        custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\
           \        client = openai.OpenAI(\n            base_url=endpoint, api_key=api_key,\
           \ http_client=custom_http_client\n        )\n    else:\n        client =\
@@ -1547,10 +1548,10 @@ deploymentSpec:
           \n\n    print(\"Generating synthetic dataset for:\")\n    print()\n    print(\n\
           \        instructlab.sdg.utils.taxonomy.read_taxonomy(\n            taxonomy_path,\
           \ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n       \
-          \ )\n    )\n\n    # Generate synthetic dataset\n    if sdg_sampling_size\
-          \ is None:\n        # generate_data has a magic word for its taxonomy_base\
-          \ argument - 'empty'\n        # it allows generating from the whole repo,\
-          \ see:\n        # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
+          \ )\n    )\n\n    # Generate synthetic dataset\n    # 1.0 is the default\
+          \ size\n    if sdg_sampling_size == 1.0:\n        # generate_data has a\
+          \ magic word for its taxonomy_base argument - 'empty'\n        # it allows\
+          \ generating from the whole repo, see:\n        # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
           \        instructlab.sdg.generate_data(\n            client=client,\n  \
           \          num_instructions_to_generate=num_instructions_to_generate,\n\
           \            output_dir=sdg_path,\n            taxonomy=taxonomy_path,\n\
@@ -2142,6 +2143,7 @@ root:
         isOptional: true
         parameterType: STRING
       sdg_sample_size:
+        defaultValue: 1.0
         description: SDG parameter. Represents the sdg skills recipe sampling size
           as percentage in decimal form.
         isOptional: true
diff --git a/sdg/components.py b/sdg/components.py
index 40f0b4b..e3370e6 100644
--- a/sdg/components.py
+++ b/sdg/components.py
@@ -35,7 +35,7 @@ def sdg_op(
     repo_pr: Optional[int],
     taxonomy_path: str = "/data/taxonomy",
     sdg_path: str = "/data/sdg",
-    sdg_sampling_size: float = None,
+    sdg_sampling_size: float = 1.0,
 ):
     from os import getenv, path
 
@@ -68,7 +68,8 @@ def sdg_op(
     )
 
     # Generate synthetic dataset
-    if sdg_sampling_size is None:
+    # 1.0 is the default size
+    if sdg_sampling_size == 1.0:
         # generate_data has a magic word for its taxonomy_base argument - 'empty'
         # it allows generating from the whole repo, see:
         # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
diff --git a/standalone/standalone.py b/standalone/standalone.py
index f85c476..9078702 100755
--- a/standalone/standalone.py
+++ b/standalone/standalone.py
@@ -1130,7 +1130,7 @@ def sdg_op(
     repo_pr: Optional[int],
     taxonomy_path: str = "/data/taxonomy",
     sdg_path: str = "/data/sdg",
-    sdg_sampling_size: float = None,
+    sdg_sampling_size: float = 1.0,
 ):
     from os import getenv, path
 
@@ -1163,7 +1163,8 @@ def sdg_op(
     )
 
     # Generate synthetic dataset
-    if sdg_sampling_size is None:
+    # 1.0 is the default size
+    if sdg_sampling_size == 1.0:
         # generate_data has a magic word for its taxonomy_base argument - 'empty'
         # it allows generating from the whole repo, see:
         # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230