docs/source/en/quantization/torchao.md (29 changes: 16 additions & 13 deletions)
@@ -86,6 +86,9 @@ Create a [`TorchAoConfig`] and specify the quantization type and `group_size` of

We'll show examples for recommended quantization methods based on hardware, e.g. A100 GPU, H100 GPU, CPU.

+> [!WARNING]
+> torchao automatically compiles the model during the first inference. The model is recompiled every time batch size or `max_new_tokens` is modified. Pass `disable_compile=True` in [`~GenerationMixin.generate`] to quantize without compilation.

Comment on lines +89 to +91
Suggested change
-> [!WARNING]
-> torchao automatically compiles the model during the first inference. The model is recompiled every time batch size or `max_new_tokens` is modified. Pass `disable_compile=True` in [`~GenerationMixin.generate`] to quantize without compilation.
+> [!WARNING]
+> torchao automatically compiles the model during the first inference if we set `cache_implementation="static"`. The model is recompiled every time batch size or `max_new_tokens` is modified. Pass `disable_compile=True` in [`~GenerationMixin.generate`] to quantize without compilation.
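To make the behavior described in this warning concrete, the sketch below contrasts a compiled and an uncompiled `generate` call. The model name matches the examples on this page, while the `int4_weight_only` quantization type and `group_size` are illustrative assumptions; swap in the config recommended for your hardware.

```python
# Minimal sketch of compiled vs. uncompiled generation with a torchao-quantized model.
# The quantization type and group_size below are assumptions chosen for illustration.
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

quantization_config = TorchAoConfig("int4_weight_only", group_size=128)  # assumed config
quantized_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    torch_dtype="auto",
    device_map="auto",
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_ids = tokenizer("What are we having for dinner?", return_tensors="pt").to(quantized_model.device)

# With a static cache, the first call triggers compilation and is slow; later calls with the
# same batch size and max_new_tokens reuse the compiled graph.
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")

# Pass disable_compile=True to skip compilation, e.g. when batch size or max_new_tokens
# changes between calls and recompilation would dominate the runtime.
output = quantized_model.generate(
    **input_ids, max_new_tokens=10, cache_implementation="static", disable_compile=True
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Skipping compilation gives up the compiled-decoding speedup but avoids the repeated warm-up cost the warning mentions.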

### H100 GPU

<hfoptions id="examples-H100-GPU">
@@ -111,7 +114,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device, quantized_model.dtype)

# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -140,7 +143,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device, quantized_model.dtype)

# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -207,7 +210,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device, quantized_model.dtype)

# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -243,7 +246,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device, quantized_model.dtype)

# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -275,7 +278,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

tokenizer = AutoTokenizer.from_pretrained("RedHatAI/Sparse-Llama-3.1-8B-2of4")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device, quantized_model.dtype)

# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -310,7 +313,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device, quantized_model.dtype)

# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -342,7 +345,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device)
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device, quantized_model.dtype)

# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -376,7 +379,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt")
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device, quantized_model.dtype)

# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -408,7 +411,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt")
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device, quantized_model.dtype)

# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@@ -443,7 +446,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)

# Manual Testing
prompt = "Hey, are you conscious? Can you talk to me?"
-inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device.type)
+inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device, quantized_model.dtype)
generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
output_text = tokenizer.batch_decode(
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
@@ -482,7 +485,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)

# Manual Testing
prompt = "Hey, are you conscious? Can you talk to me?"
-inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
+inputs = tokenizer(prompt, return_tensors="pt").to("cpu", quantized_model.dtype)
generated_ids = quantized_model.generate(**inputs, max_new_tokens=128, cache_implementation="static")
output_text = tokenizer.batch_decode(
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
@@ -577,7 +580,7 @@ print("Prompt:", prompt)
inputs = tokenizer(
prompt,
return_tensors="pt",
).to("cuda")
).to(quantized_model.device, quantized_model.dtype)
# setting temperature to 0 to make sure result deterministic
generated_ids = quantized_model.generate(**inputs, max_new_tokens=128, temperature=0)

@@ -739,7 +742,7 @@ reloaded_model = AutoModelForCausalLM.from_pretrained(
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt")
+input_ids = tokenizer(input_text, return_tensors="pt").to(reloaded_model.device.type)

output = reloaded_model.generate(**input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))