pytorch
diff --git a/‎.github/workflows/build-wheels_m1.yml
Lines changed: 0 additions & 74 deletions b/‎.github/workflows/build-wheels_m1.yml
Lines changed: 0 additions & 74 deletions
diff --git a/‎.github/workflows/build_wheels_aarch64_linux.yml
Lines changed: 0 additions & 87 deletions b/‎.github/workflows/build_wheels_aarch64_linux.yml
Lines changed: 0 additions & 87 deletions
diff --git a/‎.github/workflows/build_wheels_linux.yml
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/build_wheels_linux.yml
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/build_wheels_windows.yml
Lines changed: 0 additions & 97 deletions b/‎.github/workflows/build_wheels_windows.yml
Lines changed: 0 additions & 97 deletions
diff --git a/‎.github/workflows/dashboard_perf_test.yml
Lines changed: 5 additions & 5 deletions b/‎.github/workflows/dashboard_perf_test.yml
Lines changed: 5 additions & 5 deletions
diff --git a/‎.github/workflows/float8nocompile_test.yaml
Lines changed: 0 additions & 2 deletions b/‎.github/workflows/float8nocompile_test.yaml
Lines changed: 0 additions & 2 deletions
diff --git a/‎.github/workflows/torchao_experimental_test.yml
Lines changed: 3 additions & 3 deletions b/‎.github/workflows/torchao_experimental_test.yml
Lines changed: 3 additions & 3 deletions
diff --git a/‎README.md
Lines changed: 2 additions & 2 deletions b/‎README.md
Lines changed: 2 additions & 2 deletions
diff --git a/‎benchmarks/_models/llama/__init__.py b/‎benchmarks/_models/llama/__init__.py
diff --git a/‎benchmarks/_models/sam/__init__.py b/‎benchmarks/_models/sam/__init__.py
@@ -30,8 +30,8 @@ jobs:
       with-cuda: enable
       with-rocm: enable
       with-xpu: enable
-      # please note: excluding 3.13t for aarch64 builds for now
-      python-versions: '["3.9", "3.10", "3.11", "3.12", "3.13"]'
+      # Note: if free-threaded python is required add py3.13t here
+      python-versions: '["3.9"]'
 
   build:
     needs: generate-matrix
 
@@ -42,19 +42,19 @@ jobs:
 
           mkdir -p ${{ runner.temp }}/benchmark-results
           # llama3 - compile baseline
-          ${CONDA_RUN} python benchmarks/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json
+          ${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json
 
           # llama3 - autoquant
-          ${CONDA_RUN} python benchmarks/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --quantization autoquant --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json
+          ${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --quantization autoquant --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json
 
           # skipping SAM because of https://hud.pytorch.org/pr/pytorch/ao/1407
           # # SAM
           # ${CONDA_RUN} pip install git+https://github.com/pytorch-labs/segment-anything-fast.git@main
           # # SAM compile baseline
-          # ${CONDA_RUN} sh benchmarks/_models/sam/setup.sh
-          # ${CONDA_RUN} python benchmarks/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json
+          # ${CONDA_RUN} sh torchao/_models/sam/setup.sh
+          # ${CONDA_RUN} python torchao/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json
 
-          # ${CONDA_RUN} python benchmarks/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --compression autoquant --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json
+          # ${CONDA_RUN} python torchao/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --compression autoquant --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json
 
           # SAM 2.1
           # ${CONDA_RUN} sh scripts/download_sam2_ckpts.sh ${CHECKPOINT_PATH}/sam2
 
@@ -7,14 +7,12 @@ on:
       - 'gh/**'
     paths:
       - 'torchao/prototype/float8nocompile/**'
-      - '!torchao/prototype/float8nocompile/**'
   pull_request:
     branches:
       - main
       - 'gh/**'
     paths:
       - 'torchao/prototype/float8nocompile/**'
-      - '!torchao/prototype/float8nocompile/**'
 
 concurrency:
   group: floatnocompile_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
 
@@ -33,7 +33,7 @@ jobs:
       - name: Install requirements
         run: |
           conda activate venv
-          pip install --extra-index-url "https://download.pytorch.org/whl/nightly/cpu" torch=="2.7.0.dev20250131"
+          pip install torch --index-url "https://download.pytorch.org/whl/nightly/cpu"
           pip install numpy
           pip install pytest
           USE_CPP=1 pip install .
@@ -53,8 +53,8 @@ jobs:
         run: |
           conda activate venv
           pushd torchao/experimental/ops/tests
-          # sh build_and_run_tests.sh
-          # rm -rf /tmp/cmake-out
+          sh build_and_run_tests.sh
+          rm -rf /tmp/cmake-out
           popd
 
   test-mps-ops:
 
@@ -19,7 +19,7 @@ torchao just works with `torch.compile()` and `FSDP2` over most PyTorch models o
 
 ### Post Training Quantization
 
-Quantizing and Sparsifying your models is a 1 liner that should work on any model with an `nn.Linear` including your favorite HuggingFace model. You can find more comprehensive usage instructions [here](torchao/quantization/), sparsity [here](/benchmarks/_models/sam/README.md) and a HuggingFace inference example [here](scripts/hf_eval.py)
+Quantizing and Sparsifying your models is a 1 liner that should work on any model with an `nn.Linear` including your favorite HuggingFace model. You can find more comprehensive usage instructions [here](torchao/quantization/), sparsity [here](/torchao/_models/sam/README.md) and a HuggingFace inference example [here](scripts/hf_eval.py)
 
 For inference, we have the option of
 1. Quantize only the weights: works best for memory bound models
@@ -52,7 +52,7 @@ We also provide a developer facing API so you can implement your own quantizatio
 
 We've added kv cache quantization and other features in order to enable long context length (and necessarily memory efficient) inference.
 
-In practice these features alongside int4 weight only quantization allow us to **reduce peak memory by ~55%**, meaning we can run Llama3.1-8B inference with a **130k context length with only 18.9 GB of peak memory.** More details can be found [here](benchmarks/_models/llama/README.md)
+In practice these features alongside int4 weight only quantization allow us to **reduce peak memory by ~55%**, meaning we can run Llama3.1-8B inference with a **130k context length with only 18.9 GB of peak memory.** More details can be found [here](torchao/_models/llama/README.md)
 
 ## Training