Skip to content

Commit

Permalink
fix: set a default for sdg_sample_size
Browse files Browse the repository at this point in the history
Do not use None as the default, since it is not supported by the pipeline.
Use 1.0 as the default instead, and compare against it to determine whether
the sample size needs to be tweaked.

Signed-off-by: Sébastien Han <seb@redhat.com>
  • Loading branch information
leseb committed Dec 9, 2024
1 parent 262fb94 commit 46b9149
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 14 deletions.
2 changes: 1 addition & 1 deletion pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def pipeline(
sdg_scale_factor: int = 2, # Renamed upstream https://github.com/instructlab/instructlab/blob/f7d40f6ed5112d59132dd832bd332fa6fbbe7010/src/instructlab/configuration.py#L279-L290
sdg_pipeline: str = SDG_PIPELINE,
sdg_max_batch_len: int = MAX_BATCH_LEN,
sdg_sample_size: float = None,
sdg_sample_size: float = 1.0,
# Training phase
train_nproc_per_node: int = 3,
train_nnodes: int = 2,
Expand Down
20 changes: 11 additions & 9 deletions pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# sdg_repo_branch: str
# sdg_repo_pr: int
# sdg_repo_url: str [Default: 'https://github.com/instructlab/taxonomy.git']
# sdg_sample_size: float
# sdg_sample_size: float [Default: 1.0]
# sdg_scale_factor: int [Default: 2.0]
# train_effective_batch_size_phase_1: int [Default: 3840.0]
# train_effective_batch_size_phase_2: int [Default: 3840.0]
Expand Down Expand Up @@ -522,6 +522,7 @@ components:
isOptional: true
parameterType: STRING
sdg_sampling_size:
defaultValue: 1.0
isOptional: true
parameterType: NUMBER_DOUBLE
taxonomy_path:
Expand Down Expand Up @@ -1535,10 +1536,10 @@ deploymentSpec:
\ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\
\ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \
\ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\
,\n sdg_sampling_size: float = None,\n):\n from os import getenv,\
\ path\n\n import instructlab.sdg\n import openai\n import yaml\n\
\n api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n \
\ endpoint = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\
,\n sdg_sampling_size: float = 1.0,\n):\n from os import getenv, path\n\
\n import instructlab.sdg\n import openai\n import yaml\n\n \
\ api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n endpoint\
\ = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\
):\n import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\
\ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\
\ http_client=custom_http_client\n )\n else:\n client =\
Expand All @@ -1547,10 +1548,10 @@ deploymentSpec:
\n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\
\ instructlab.sdg.utils.taxonomy.read_taxonomy(\n taxonomy_path,\
\ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n \
\ )\n )\n\n # Generate synthetic dataset\n if sdg_sampling_size\
\ is None:\n # generate_data has a magic word for its taxonomy_base\
\ argument - 'empty'\n # it allows generating from the whole repo,\
\ see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
\ )\n )\n\n # Generate synthetic dataset\n # 1.0 is the default\
\ size\n if sdg_sampling_size == 1.0:\n # generate_data has a\
\ magic word for its taxonomy_base argument - 'empty'\n # it allows\
\ generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
\ instructlab.sdg.generate_data(\n client=client,\n \
\ num_instructions_to_generate=num_instructions_to_generate,\n\
\ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\
Expand Down Expand Up @@ -2142,6 +2143,7 @@ root:
isOptional: true
parameterType: STRING
sdg_sample_size:
defaultValue: 1.0
description: SDG parameter. Represents the sdg skills recipe sampling size
as percentage in decimal form.
isOptional: true
Expand Down
5 changes: 3 additions & 2 deletions sdg/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def sdg_op(
repo_pr: Optional[int],
taxonomy_path: str = "/data/taxonomy",
sdg_path: str = "/data/sdg",
sdg_sampling_size: float = None,
sdg_sampling_size: float = 1.0,
):
from os import getenv, path

Expand Down Expand Up @@ -68,7 +68,8 @@ def sdg_op(
)

# Generate synthetic dataset
if sdg_sampling_size is None:
# 1.0 is the default size
if sdg_sampling_size == 1.0:
# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
Expand Down
5 changes: 3 additions & 2 deletions standalone/standalone.py
Original file line number Diff line number Diff line change
Expand Up @@ -1130,7 +1130,7 @@ def sdg_op(
repo_pr: Optional[int],
taxonomy_path: str = "/data/taxonomy",
sdg_path: str = "/data/sdg",
sdg_sampling_size: float = None,
sdg_sampling_size: float = 1.0,
):
from os import getenv, path
Expand Down Expand Up @@ -1163,7 +1163,8 @@ def sdg_op(
)
# Generate synthetic dataset
if sdg_sampling_size is None:
# 1.0 is the default size
if sdg_sampling_size == 1.0:
# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
Expand Down

0 comments on commit 46b9149

Please sign in to comment.