From 46b91495d8e92d3e362137497662140a99174dc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Mon, 9 Dec 2024 14:18:32 +0100 Subject: [PATCH] fix: set a default to sdg_sample_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not use None since it is not supported by the pipeline. Use the default 1.0 and compare against it to determine whether we need to tweak it. Signed-off-by: Sébastien Han --- pipeline.py | 2 +- pipeline.yaml | 20 +++++++++++--------- sdg/components.py | 5 +++-- standalone/standalone.py | 5 +++-- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/pipeline.py b/pipeline.py index 64118bd..177179e 100644 --- a/pipeline.py +++ b/pipeline.py @@ -115,7 +115,7 @@ def pipeline( sdg_scale_factor: int = 2, # Renamed upstream https://github.com/instructlab/instructlab/blob/f7d40f6ed5112d59132dd832bd332fa6fbbe7010/src/instructlab/configuration.py#L279-L290 sdg_pipeline: str = SDG_PIPELINE, sdg_max_batch_len: int = MAX_BATCH_LEN, - sdg_sample_size: float = None, + sdg_sample_size: float = 1.0, # Training phase train_nproc_per_node: int = 3, train_nnodes: int = 2, diff --git a/pipeline.yaml b/pipeline.yaml index 9a73e56..823d756 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -15,7 +15,7 @@ # sdg_repo_branch: str # sdg_repo_pr: int # sdg_repo_url: str [Default: 'https://github.com/instructlab/taxonomy.git'] -# sdg_sample_size: float +# sdg_sample_size: float [Default: 1.0] # sdg_scale_factor: int [Default: 2.0] # train_effective_batch_size_phase_1: int [Default: 3840.0] # train_effective_batch_size_phase_2: int [Default: 3840.0] @@ -522,6 +522,7 @@ components: isOptional: true parameterType: STRING sdg_sampling_size: + defaultValue: 1.0 isOptional: true parameterType: NUMBER_DOUBLE taxonomy_path: @@ -1535,10 +1536,10 @@ deploymentSpec: \ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\ \ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \ \ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\ - ,\n sdg_sampling_size: float = None,\n):\n from os import getenv,\ - \ path\n\n import instructlab.sdg\n import openai\n import yaml\n\ - \n api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n \ - \ endpoint = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\ + ,\n sdg_sampling_size: float = 1.0,\n):\n from os import getenv, path\n\ + \n import instructlab.sdg\n import openai\n import yaml\n\n \ + \ api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n endpoint\ + \ = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\ ):\n import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\ \ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\ \ http_client=custom_http_client\n )\n else:\n client =\ @@ -1547,10 +1548,10 @@ deploymentSpec: \n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\ \ instructlab.sdg.utils.taxonomy.read_taxonomy(\n taxonomy_path,\ \ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n \ - \ )\n )\n\n # Generate synthetic dataset\n if sdg_sampling_size\ - \ is None:\n # generate_data has a magic word for its taxonomy_base\ - \ argument - 'empty'\n # it allows generating from the whole repo,\ - \ see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ + \ )\n )\n\n # Generate synthetic dataset\n # 1.0 is the default\ + \ size\n if sdg_sampling_size == 1.0:\n # generate_data has a\ + \ magic word for its taxonomy_base argument - 'empty'\n # it allows\ + \ generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\ \ instructlab.sdg.generate_data(\n client=client,\n \ \ num_instructions_to_generate=num_instructions_to_generate,\n\ \ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\ @@ -2142,6 +2143,7 @@ root: isOptional: true parameterType: STRING sdg_sample_size: + defaultValue: 1.0 description: SDG parameter. Represents the sdg skills recipe sampling size as percentage in decimal form. isOptional: true diff --git a/sdg/components.py b/sdg/components.py index 40f0b4b..e3370e6 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -35,7 +35,7 @@ def sdg_op( repo_pr: Optional[int], taxonomy_path: str = "/data/taxonomy", sdg_path: str = "/data/sdg", - sdg_sampling_size: float = None, + sdg_sampling_size: float = 1.0, ): from os import getenv, path @@ -68,7 +68,8 @@ def sdg_op( ) # Generate synthetic dataset - if sdg_sampling_size is None: + # 1.0 is the default size + if sdg_sampling_size == 1.0: # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230 diff --git a/standalone/standalone.py b/standalone/standalone.py index f85c476..9078702 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -1130,7 +1130,7 @@ def sdg_op( repo_pr: Optional[int], taxonomy_path: str = "/data/taxonomy", sdg_path: str = "/data/sdg", - sdg_sampling_size: float = None, + sdg_sampling_size: float = 1.0, ): from os import getenv, path @@ -1163,7 +1163,8 @@ def sdg_op( ) # Generate synthetic dataset - if sdg_sampling_size is None: + # 1.0 is the default size + if sdg_sampling_size == 1.0: # generate_data has a magic word for its taxonomy_base argument - 'empty' # it allows generating from the whole repo, see: # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230