worker-config.json
{
"0.5.3": {
"categories": [
{
"title": "LLM Settings",
"settings": [
{
"TOKENIZER": {
"value": "",
"title": "Tokenizer",
"description": "Name or path of the Hugging Face tokenizer to use.",
"required": false,
"type": "text",
"category": "LLM Settings"
},
"TOKENIZER_MODE": {
"value": "auto",
"title": "Tokenizer Mode",
"description": "The tokenizer mode.",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "auto", "label": "auto" },
{ "value": "slow", "label": "slow" }
]
},
"SKIP_TOKENIZER_INIT": {
"value": false,
"title": "Skip Tokenizer Init",
"description": "Skip initialization of tokenizer and detokenizer.",
"required": false,
"type": "toggle",
"category": "LLM Settings"
},
"TRUST_REMOTE_CODE": {
"value": false,
"title": "Trust Remote Code",
"description": "Trust remote code from Hugging Face.",
"required": false,
"type": "toggle",
"category": "LLM Settings"
},
"DOWNLOAD_DIR": {
"value": "",
"title": "Download Directory",
"description": "Directory to download and load the weights.",
"required": false,
"type": "text",
"category": "LLM Settings"
},
"LOAD_FORMAT": {
"value": "auto",
"title": "Load Format",
"description": "The format of the model weights to load.",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "auto", "label": "auto" },
{ "value": "pt", "label": "pt" },
{ "value": "safetensors", "label": "safetensors" },
{ "value": "npcache", "label": "npcache" },
{ "value": "dummy", "label": "dummy" },
{ "value": "tensorizer", "label": "tensorizer" },
{ "value": "bitsandbytes", "label": "bitsandbytes" }
]
},
"DTYPE": {
"value": "auto",
"title": "Data Type",
"description": "Data type for model weights and activations.",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "auto", "label": "auto" },
{ "value": "half", "label": "half" },
{ "value": "float16", "label": "float16" },
{ "value": "bfloat16", "label": "bfloat16" },
{ "value": "float", "label": "float" },
{ "value": "float32", "label": "float32" }
]
},
"KV_CACHE_DTYPE": {
"value": "auto",
"title": "KV Cache Data Type",
"description": "Data type for KV cache storage.",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "auto", "label": "auto" },
{ "value": "fp8", "label": "fp8" }
]
},
"QUANTIZATION_PARAM_PATH": {
"value": "",
"title": "Quantization Param Path",
"description": "Path to the JSON file containing the KV cache scaling factors.",
"required": false,
"type": "text",
"category": "LLM Settings"
},
"MAX_MODEL_LEN": {
"value": "",
"title": "Max Model Length",
"description": "Model context length.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"GUIDED_DECODING_BACKEND": {
"value": "outlines",
"title": "Guided Decoding Backend",
"description": "Which engine will be used for guided decoding by default.",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "outlines", "label": "outlines" },
{ "value": "lm-format-enforcer", "label": "lm-format-enforcer" }
]
},
"DISTRIBUTED_EXECUTOR_BACKEND": {
"value": "",
"title": "Distributed Executor Backend",
"description": "Backend to use for distributed serving.",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "ray", "label": "ray" },
{ "value": "mp", "label": "mp" }
]
},
"WORKER_USE_RAY": {
"value": false,
"title": "Worker Use Ray",
"description": "Deprecated, use --distributed-executor-backend=ray.",
"required": false,
"type": "toggle",
"category": "LLM Settings"
},
"RAY_WORKERS_USE_NSIGHT": {
"value": false,
"title": "Ray Workers Use Nsight",
"description": "If specified, use nsight to profile Ray workers.",
"required": false,
"type": "toggle",
"category": "LLM Settings"
},
"PIPELINE_PARALLEL_SIZE": {
"value": 1,
"title": "Pipeline Parallel Size",
"description": "Number of pipeline stages.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"TENSOR_PARALLEL_SIZE": {
"value": 1,
"title": "Tensor Parallel Size",
"description": "Number of tensor parallel replicas.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"MAX_PARALLEL_LOADING_WORKERS": {
"value": "",
"title": "Max Parallel Loading Workers",
"description": "Load model sequentially in multiple batches.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"ENABLE_PREFIX_CACHING": {
"value": false,
"title": "Enable Prefix Caching",
"description": "Enables automatic prefix caching.",
"required": false,
"type": "toggle",
"category": "LLM Settings"
},
"DISABLE_SLIDING_WINDOW": {
"value": false,
"title": "Disable Sliding Window",
"description": "Disables sliding window, capping to sliding window size.",
"required": false,
"type": "toggle",
"category": "LLM Settings"
},
"USE_V2_BLOCK_MANAGER": {
"value": false,
"title": "Use V2 Block Manager",
"description": "Use BlockSpaceMangerV2.",
"required": false,
"type": "toggle",
"category": "LLM Settings"
},
"NUM_LOOKAHEAD_SLOTS": {
"value": 0,
"title": "Num Lookahead Slots",
"description": "Experimental scheduling config necessary for speculative decoding.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"SEED": {
"value": 0,
"title": "Seed",
"description": "Random seed for operations.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"NUM_GPU_BLOCKS_OVERRIDE": {
"value": "",
"title": "Num GPU Blocks Override",
"description": "If specified, ignore GPU profiling result and use this number of GPU blocks.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"MAX_NUM_BATCHED_TOKENS": {
"value": "",
"title": "Max Num Batched Tokens",
"description": "Maximum number of batched tokens per iteration.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"MAX_NUM_SEQS": {
"value": 256,
"title": "Max Num Seqs",
"description": "Maximum number of sequences per iteration.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"MAX_LOGPROBS": {
"value": 20,
"title": "Max Logprobs",
"description": "Max number of log probs to return when logprobs is specified in SamplingParams.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"DISABLE_LOG_STATS": {
"value": false,
"title": "Disable Log Stats",
"description": "Disable logging statistics.",
"required": false,
"type": "toggle",
"category": "LLM Settings"
},
"QUANTIZATION": {
"value": "",
"title": "Quantization",
"description": "Method used to quantize the weights.",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "None", "label": "None" },
{ "value": "awq", "label": "AWQ" },
{ "value": "squeezellm", "label": "SqueezeLLM" },
{ "value": "gptq", "label": "GPTQ" }
]
},
"ROPE_SCALING": {
"value": "",
"title": "RoPE Scaling",
"description": "RoPE scaling configuration in JSON format.",
"required": false,
"type": "text",
"category": "LLM Settings"
},
"ROPE_THETA": {
"value": "",
"title": "RoPE Theta",
"description": "RoPE theta. Use with rope_scaling.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"TOKENIZER_POOL_SIZE": {
"value": 0,
"title": "Tokenizer Pool Size",
"description": "Size of tokenizer pool to use for asynchronous tokenization.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"TOKENIZER_POOL_TYPE": {
"value": "ray",
"title": "Tokenizer Pool Type",
"description": "Type of tokenizer pool to use for asynchronous tokenization.",
"required": false,
"type": "text",
"category": "LLM Settings"
},
"TOKENIZER_POOL_EXTRA_CONFIG": {
"value": "",
"title": "Tokenizer Pool Extra Config",
"description": "Extra config for tokenizer pool.",
"required": false,
"type": "text",
"category": "LLM Settings"
},
"ENABLE_LORA": {
"value": false,
"title": "Enable LoRA",
"description": "If True, enable handling of LoRA adapters.",
"required": false,
"type": "toggle",
"category": "LLM Settings"
},
"MAX_LORAS": {
"value": 1,
"title": "Max LoRAs",
"description": "Max number of LoRAs in a single batch.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"MAX_LORA_RANK": {
"value": 16,
"title": "Max LoRA Rank",
"description": "Max LoRA rank.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"LORA_EXTRA_VOCAB_SIZE": {
"value": 256,
"title": "LoRA Extra Vocab Size",
"description": "Maximum size of extra vocabulary for LoRA adapters.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"LORA_DTYPE": {
"value": "auto",
"title": "LoRA Data Type",
"description": "Data type for LoRA.",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "auto", "label": "auto" },
{ "value": "float16", "label": "float16" },
{ "value": "bfloat16", "label": "bfloat16" },
{ "value": "float32", "label": "float32" }
]
},
"LONG_LORA_SCALING_FACTORS": {
"value": "",
"title": "Long LoRA Scaling Factors",
"description": "Specify multiple scaling factors for LoRA adapters.",
"required": false,
"type": "text",
"category": "LLM Settings"
},
"MAX_CPU_LORAS": {
"value": "",
"title": "Max CPU LoRAs",
"description": "Maximum number of LoRAs to store in CPU memory.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"FULLY_SHARDED_LORAS": {
"value": false,
"title": "Fully Sharded LoRAs",
"description": "Enable fully sharded LoRA layers.",
"required": false,
"type": "toggle",
"category": "LLM Settings"
},
"DEVICE": {
"value": "auto",
"title": "Device",
"description": "Device type for vLLM execution.",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "auto", "label": "auto" },
{ "value": "cuda", "label": "cuda" },
{ "value": "neuron", "label": "neuron" },
{ "value": "cpu", "label": "cpu" },
{ "value": "openvino", "label": "openvino" },
{ "value": "tpu", "label": "tpu" },
{ "value": "xpu", "label": "xpu" }
]
},
"SCHEDULER_DELAY_FACTOR": {
"value": 0.0,
"title": "Scheduler Delay Factor",
"description": "Apply a delay before scheduling next prompt.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"ENABLE_CHUNKED_PREFILL": {
"value": false,
"title": "Enable Chunked Prefill",
"description": "Enable chunked prefill requests.",
"required": false,
"type": "toggle",
"category": "LLM Settings"
},
"SPECULATIVE_MODEL": {
"value": "",
"title": "Speculative Model",
"description": "The name of the draft model to be used in speculative decoding.",
"required": false,
"type": "text",
"category": "LLM Settings"
},
"NUM_SPECULATIVE_TOKENS": {
"value": "",
"title": "Num Speculative Tokens",
"description": "The number of speculative tokens to sample from the draft model.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE": {
"value": "",
"title": "Speculative Draft Tensor Parallel Size",
"description": "Number of tensor parallel replicas for the draft model.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"SPECULATIVE_MAX_MODEL_LEN": {
"value": "",
"title": "Speculative Max Model Length",
"description": "The maximum sequence length supported by the draft model.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"SPECULATIVE_DISABLE_BY_BATCH_SIZE": {
"value": "",
"title": "Speculative Disable by Batch Size",
"description": "Disable speculative decoding if the number of enqueue requests is larger than this value.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"NGRAM_PROMPT_LOOKUP_MAX": {
"value": "",
"title": "Ngram Prompt Lookup Max",
"description": "Max size of window for ngram prompt lookup in speculative decoding.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"NGRAM_PROMPT_LOOKUP_MIN": {
"value": "",
"title": "Ngram Prompt Lookup Min",
"description": "Min size of window for ngram prompt lookup in speculative decoding.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"SPEC_DECODING_ACCEPTANCE_METHOD": {
"value": "rejection_sampler",
"title": "Speculative Decoding Acceptance Method",
"description": "Specify the acceptance method for draft token verification in speculative decoding.",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "rejection_sampler", "label": "rejection_sampler" },
{ "value": "typical_acceptance_sampler", "label": "typical_acceptance_sampler" }
]
},
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD": {
"value": "",
"title": "Typical Acceptance Sampler Posterior Threshold",
"description": "Set the lower bound threshold for the posterior probability of a token to be accepted.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA": {
"value": "",
"title": "Typical Acceptance Sampler Posterior Alpha",
"description": "A scaling factor for the entropy-based threshold for token acceptance.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"MODEL_LOADER_EXTRA_CONFIG": {
"value": "",
"title": "Model Loader Extra Config",
"description": "Extra config for model loader.",
"required": false,
"type": "text",
"category": "LLM Settings"
},
"PREEMPTION_MODE": {
"value": "",
"title": "Preemption Mode",
"description": "If 'recompute', the engine performs preemption-aware recomputation. If 'save', the engine saves activations into the CPU memory as preemption happens.",
"required": false,
"type": "text",
"category": "LLM Settings"
},
"PREEMPTION_CHECK_PERIOD": {
"value": 1.0,
"title": "Preemption Check Period",
"description": "How frequently the engine checks if a preemption happens.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"PREEMPTION_CPU_CAPACITY": {
"value": 2,
"title": "Preemption CPU Capacity",
"description": "The percentage of CPU memory used for the saved activations.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"MAX_LOG_LEN": {
"value": "",
"title": "Max Log Length",
"description": "Max number of characters or ID numbers being printed in log.",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"DISABLE_LOGGING_REQUEST": {
"value": false,
"title": "Disable Logging Request",
"description": "Disable logging requests.",
"required": false,
"type": "toggle",
"category": "LLM Settings"
}
}
]
},
{
"title": "Tokenizer Settings",
"settings": [
{
"TOKENIZER_NAME": {
"value": "",
"title": "Tokenizer Name",
"description": "Tokenizer repo to use a different tokenizer than the model's default",
"required": false,
"type": "text",
"category": "Tokenizer Settings"
},
"TOKENIZER_REVISION": {
"value": "",
"title": "Tokenizer Revision",
"description": "Tokenizer revision to load",
"required": false,
"type": "text",
"category": "Tokenizer Settings"
},
"CUSTOM_CHAT_TEMPLATE": {
"value": "",
"title": "Custom Chat Template",
"description": "Custom chat jinja template",
"required": false,
"type": "text",
"category": "Tokenizer Settings"
}
}
]
},
{
"title": "System Settings",
"settings": [
{
"GPU_MEMORY_UTILIZATION": {
"value": "0.95",
"title": "GPU Memory Utilization",
"description": "Sets GPU VRAM utilization",
"required": false,
"type": "number",
"category": "System Settings"
},
"MAX_PARALLEL_LOADING_WORKERS": {
"value": "",
"title": "Max Parallel Loading Workers",
"description": "Load model sequentially in multiple batches. Leave empty for auto",
"required": false,
"type": "number",
"category": "System Settings"
},
"BLOCK_SIZE": {
"value": "16",
"title": "Block Size",
"description": "Token block size for contiguous chunks of tokens",
"required": false,
"type": "number",
"category": "System Settings"
},
"SWAP_SPACE": {
"value": "4",
"title": "Swap Space",
"description": "CPU swap space size (GiB) per GPU",
"required": false,
"type": "number",
"category": "System Settings"
},
"ENFORCE_EAGER": {
"value": false,
"title": "Enforce Eager",
"description": "Always use eager-mode PyTorch. If False (0), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility",
"required": false,
"type": "toggle",
"category": "System Settings"
},
"MAX_SEQ_LEN_TO_CAPTURE": {
"value": "8192",
"title": "CUDA Graph Max Content Length",
"description": "Maximum context length covered by CUDA graphs. If a sequence has context length larger than this, we fall back to eager mode",
"required": false,
"type": "number",
"category": "System Settings"
},
"DISABLE_CUSTOM_ALL_REDUCE": {
"value": false,
"title": "Disable Custom All Reduce",
"description": "Enables or disables custom all reduce",
"required": false,
"type": "toggle",
"category": "System Settings"
}
}
]
},
{
"title": "Streaming Settings",
"settings": [
{
"DEFAULT_BATCH_SIZE": {
"value": "50",
"title": "Default Final Batch Size",
"description": "Default and Maximum batch size for token streaming to reduce HTTP calls",
"required": false,
"type": "number",
"category": "Streaming Settings"
},
"DEFAULT_MIN_BATCH_SIZE": {
"value": "1",
"title": "Default Starting Batch Size",
"description": "Batch size for the first request, which will be multiplied by the growth factor every subsequent request",
"required": false,
"type": "number",
"category": "Streaming Settings"
},
"DEFAULT_BATCH_SIZE_GROWTH_FACTOR": {
"value": "3",
"title": "Default Batch Size Growth Factor",
"description": "Growth factor for dynamic batch size",
"required": false,
"type": "number",
"category": "Streaming Settings"
}
}
]
},
{
"title": "OpenAI Settings",
"settings": [
{
"RAW_OPENAI_OUTPUT": {
"value": true,
"title": "Raw OpenAI Output",
"description": "Raw OpenAI output instead of just the text",
"required": false,
"type": "toggle",
"category": "OpenAI Settings"
},
"OPENAI_RESPONSE_ROLE": {
"value": "assistant",
"title": "OpenAI Response Role",
"description": "Role of the LLM's Response in OpenAI Chat Completions",
"required": false,
"type": "text",
"category": "OpenAI Settings"
},
"OPENAI_SERVED_MODEL_NAME_OVERRIDE": {
"value": "",
"title": "OpenAI Served Model Name Override",
"description": "Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests",
"required": false,
"type": "text",
"category": "OpenAI Settings"
}
}
]
},
{
"title": "Serverless Settings",
"settings": [
{
"MAX_CONCURRENCY": {
"value": "300",
"title": "Max Concurrency",
"description": "Max concurrent requests per worker. vLLM has an internal queue, so you don't have to worry about limiting by VRAM, this is for improving scaling/load balancing efficiency",
"required": false,
"type": "number",
"category": "Serverless Settings"
},
"DISABLE_LOG_STATS": {
"value": true,
"title": "Disable Log Stats",
"description": "Enables or disables vLLM stats logging",
"required": false,
"type": "toggle",
"category": "Serverless Settings"
},
"DISABLE_LOG_REQUESTS": {
"value": true,
"title": "Disable Log Requests",
"description": "Enables or disables vLLM request logging",
"required": false,
"type": "toggle",
"category": "Serverless Settings"
}
}
]
}
]
},
"0.4.2": {
"categories": [
{
"title": "LLM Settings",
"settings": [
{
"MODEL_REVISION": {
"value": "",
"title": "Model Revision",
"description": "Model revision (branch) to load",
"required": false,
"type": "text",
"category": "LLM Settings"
},
"MAX_MODEL_LEN": {
"value": "",
"title": "Max Model Length",
"description": "Maximum number of tokens for the engine to handle per request",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"BASE_PATH": {
"value": "/runpod-volume",
"title": "Base Path",
"description": "Storage directory for Huggingface cache and model",
"required": false,
"type": "text",
"category": "LLM Settings"
},
"LOAD_FORMAT": {
"value": "auto",
"title": "Load Format",
"description": "Format to load model in",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "auto", "label": "auto" },
{ "value": ".safetensors", "label": ".safetensors" },
{ "value": ".bin", "label": ".bin" },
{ "value": ".pt", "label": ".pt" }
]
},
"QUANTIZATION": {
"value": "None",
"title": "Quantization",
"description": "Quantization of given model. The model must already be quantized",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "None", "label": "None" },
{ "value": "awq", "label": "AWQ" },
{ "value": "squeezellm", "label": "SqueezeLLM" },
{ "value": "gptq", "label": "GPTQ" }
]
},
"TRUST_REMOTE_CODE": {
"value": "0",
"title": "Trust Remote Code",
"description": "Trust remote code for HuggingFace models",
"required": false,
"type": "toggle",
"category": "LLM Settings"
},
"SEED": {
"value": "",
"title": "Seed",
"description": "Sets random seed for operations",
"required": false,
"type": "number",
"category": "LLM Settings"
},
"KV_CACHE_DTYPE": {
"value": "auto",
"title": "KV Cache Data Type",
"description": "Data type for kv cache storage. Uses DTYPE if set to auto",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "auto", "label": "auto" },
{ "value": "fp8_e5m2", "label": "fp8_e5m2" }
]
},
"DTYPE": {
"value": "auto",
"title": "Weights Datatype/Precision",
"description": "Sets datatype/precision for model weights and activations",
"required": false,
"type": "select",
"category": "LLM Settings",
"options": [
{ "value": "auto", "label": "auto" },
{ "value": "half", "label": "half" },
{ "value": "float16", "label": "float16" },
{ "value": "bfloat16", "label": "bfloat16" },
{ "value": "float", "label": "float" },
{ "value": "float32", "label": "float32" }
]
}
}
]
},
{
"title": "Tokenizer Settings",
"settings": [
{
"TOKENIZER_NAME": {
"value": "",
"title": "Tokenizer Name",
"description": "Tokenizer repo to use a different tokenizer than the model's default",
"required": false,
"type": "text",
"category": "Tokenizer Settings"
},
"TOKENIZER_REVISION": {
"value": "",
"title": "Tokenizer Revision",
"description": "Tokenizer revision to load",
"required": false,
"type": "text",
"category": "Tokenizer Settings"
},
"CUSTOM_CHAT_TEMPLATE": {
"value": "",
"title": "Custom Chat Template",
"description": "Custom chat jinja template",
"required": false,
"type": "text",
"category": "Tokenizer Settings"
}
}
]
},
{
"title": "System Settings",
"settings": [
{
"GPU_MEMORY_UTILIZATION": {
"value": "0.95",
"title": "GPU Memory Utilization",
"description": "Sets GPU VRAM utilization",
"required": false,
"type": "number",
"category": "System Settings"
},
"MAX_PARALLEL_LOADING_WORKERS": {
"value": "",
"title": "Max Parallel Loading Workers",
"description": "Load model sequentially in multiple batches. Leave empty for auto",
"required": false,
"type": "number",
"category": "System Settings"
},
"BLOCK_SIZE": {
"value": "16",
"title": "Block Size",
"description": "Token block size for contiguous chunks of tokens",
"required": false,
"type": "number",
"category": "System Settings"
},
"SWAP_SPACE": {
"value": "4",
"title": "Swap Space",
"description": "CPU swap space size (GiB) per GPU",
"required": false,
"type": "number",
"category": "System Settings"
},
"ENFORCE_EAGER": {
"value": "0",
"title": "Enforce Eager",
"description": "Always use eager-mode PyTorch. If False (0), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility",
"required": false,
"type": "toggle",
"category": "System Settings"
},
"MAX_SEQ_LEN_TO_CAPTURE": {
"value": "8192",
"title": "CUDA Graph Max Content Length",
"description": "Maximum context length covered by CUDA graphs. If a sequence has context length larger than this, we fall back to eager mode",
"required": false,
"type": "number",
"category": "System Settings"
},
"DISABLE_CUSTOM_ALL_REDUCE": {
"value": "0",
"title": "Disable Custom All Reduce",
"description": "Enables or disables custom all reduce",
"required": false,
"type": "toggle",
"category": "System Settings"
}
}
]
},
{
"title": "Streaming Settings",
"settings": [
{
"DEFAULT_BATCH_SIZE": {
"value": "50",
"title": "Default Final Batch Size",
"description": "Default and Maximum batch size for token streaming to reduce HTTP calls",
"required": false,
"type": "number",
"category": "Streaming Settings"
},
"DEFAULT_MIN_BATCH_SIZE": {
"value": "1",
"title": "Default Starting Batch Size",
"description": "Batch size for the first request, which will be multiplied by the growth factor every subsequent request",
"required": false,
"type": "number",
"category": "Streaming Settings"
},
"DEFAULT_BATCH_SIZE_GROWTH_FACTOR": {
"value": "3",
"title": "Default Batch Size Growth Factor",
"description": "Growth factor for dynamic batch size",
"required": false,
"type": "number",
"category": "Streaming Settings"
}
}
]
},
{
"title": "OpenAI Settings",
"settings": [
{
"RAW_OPENAI_OUTPUT": {
"value": "1",
"title": "Raw OpenAI Output",
"description": "Raw OpenAI output instead of just the text",
"required": false,
"type": "toggle",
"category": "OpenAI Settings"
},
"OPENAI_RESPONSE_ROLE": {
"value": "assistant",
"title": "OpenAI Response Role",
"description": "Role of the LLM's Response in OpenAI Chat Completions",
"required": false,
"type": "text",
"category": "OpenAI Settings"
},
"OPENAI_SERVED_MODEL_NAME_OVERRIDE": {
"value": "",
"title": "OpenAI Served Model Name Override",
"description": "Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests",
"required": false,
"type": "text",
"category": "OpenAI Settings"
}
}
]
},
{
"title": "Serverless Settings",