pytorch
diff --git a/‎.github/workflows/regression_test.yml
Lines changed: 26 additions & 0 deletions b/‎.github/workflows/regression_test.yml
Lines changed: 26 additions & 0 deletions
diff --git a/‎.lintrunner.toml
Lines changed: 59 additions & 0 deletions b/‎.lintrunner.toml
Lines changed: 59 additions & 0 deletions
diff --git a/‎CODEOWNERS
Lines changed: 2 additions & 0 deletions b/‎CODEOWNERS
Lines changed: 2 additions & 0 deletions
diff --git a/‎benchmarks/intmm.py
Lines changed: 96 additions & 0 deletions b/‎benchmarks/intmm.py
Lines changed: 96 additions & 0 deletions
diff --git a/‎benchmarks/intmm_shapes.csv
Lines changed: 127 additions & 0 deletions b/‎benchmarks/intmm_shapes.csv
Lines changed: 127 additions & 0 deletions
diff --git a/‎benchmarks/print_config_shapes.py
Lines changed: 16 additions & 0 deletions b/‎benchmarks/print_config_shapes.py
Lines changed: 16 additions & 0 deletions
diff --git a/‎benchmarks/sam_vit_b_shapes.csv
Lines changed: 7 additions & 0 deletions b/‎benchmarks/sam_vit_b_shapes.csv
Lines changed: 7 additions & 0 deletions
diff --git a/‎dev-requirements.txt
Lines changed: 2 additions & 1 deletion b/‎dev-requirements.txt
Lines changed: 2 additions & 1 deletion
diff --git a/‎requirements-lintrunner.txt
Lines changed: 22 additions & 0 deletions b/‎requirements-lintrunner.txt
Lines changed: 22 additions & 0 deletions
diff --git a/‎requirements.txt
Lines changed: 1 addition & 0 deletions b/‎requirements.txt
Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,32 @@ jobs:
         pip install torch
         
 
+    - name: Install package
+      run: |
+        pip install .
+
+    - name: Run tests
+      run: |
+        pytest test
+
+  test-nightly:
+    runs-on: 4-core-ubuntu-gpu-t4
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.9
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install -r dev-requirements.txt
+        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
+        
+
     - name: Install package
       run: |
         pip install .
 
@@ -0,0 +1,59 @@
+merge_base_with = "origin/main"
+
+[[linter]]
+code = 'FLAKE8'
+include_patterns = ['**/*.py']
+exclude_patterns = [
+    'third-party/**',
+    '**/third-party/**',
+]
+command = [
+    'python',
+    '-m',
+    'lintrunner_adapters',
+    'run',
+    'flake8_linter',
+    '--',
+    '@{{PATHSFILE}}'
+]
+init_command = [
+    'python',
+    '-m',
+    'lintrunner_adapters',
+    'run',
+    'pip_init',
+    '--dry-run={{DRYRUN}}',
+    '--requirement=requirements-lintrunner.txt',
+]
+
+# Black + usort
+[[linter]]
+code = 'UFMT'
+include_patterns = [
+    '**/*.py',
+    '**/*.pyi',
+]
+exclude_patterns = [
+    'third-party/**',
+    '**/third-party/**',
+]
+command = [
+    'python',
+    '-m',
+    'lintrunner_adapters',
+    'run',
+    'ufmt_linter',
+    '--',
+    '@{{PATHSFILE}}'
+]
+init_command = [
+    'python',
+    '-m',
+    'lintrunner_adapters',
+    'run',
+    'pip_init',
+    '--dry-run={{DRYRUN}}',
+    '--no-black-binary',
+    '--requirement=requirements-lintrunner.txt',
+]
+is_formatter = true
@@ -0,0 +1,2 @@
+msaroufim
+cpuhrsch
@@ -0,0 +1,96 @@
+import argparse
+import csv
+import itertools
+import math
+import pathlib
+
+import torch
+import torch.nn.functional as F
+import torch.utils.benchmark as benchmark
+from torchao.kernel.intmm_triton import int_matmul, int_scaled_matmul
+
+torch._dynamo.config.cache_size_limit = 128
+torch._dynamo.config.accumulated_cache_size_limit = 128
+
+dtype = torch.float16
+device = "cuda"
+
+
+def benchmark_in_ms(warmup, iters, f, *args, **kwargs):
+    """Return the average time of one ``f(*args, **kwargs)`` call.
+
+    Timing uses a pair of CUDA events recorded around the measured loop.
+
+    Args:
+        warmup: Number of untimed calls made before measuring.
+        iters: Number of timed calls; the elapsed time is divided by this.
+        f: Callable to benchmark.
+        *args: Positional arguments forwarded to ``f``.
+        **kwargs: Keyword arguments forwarded to ``f``.
+
+    Returns:
+        ``start_event.elapsed_time(end_event)`` divided by ``iters``
+        (milliseconds per call, going by the function name).
+    """
+    for _ in range(warmup):
+        f(*args, **kwargs)
+    # Synchronize after the warmup calls, before the start event is recorded.
+    torch.cuda.synchronize()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+
+    for _ in range(iters):
+        f(*args, **kwargs)
+
+    end_event.record()
+    # Synchronize again before reading the elapsed time between the events.
+    torch.cuda.synchronize()
+    return start_event.elapsed_time(end_event) / float(iters)
+
+
+@torch.compile(mode="max-autotune")
+def compiled_mm(x, w):
+    """Return ``torch.mm(x, w)``; compiled with ``mode="max-autotune"``."""
+    return torch.mm(x, w)
+
+
+@torch.compile(mode="max-autotune")
+def compiled_int_mm(x, w):
+    """Return ``torch._int_mm(x, w)``; compiled with ``mode="max-autotune"``."""
+    return torch._int_mm(x, w)
+
+
+def run_int_mm_benchmark(x, w, b):
+    fp_time = benchmark_in_ms(10, 100, torch.mm, x, w)
+    x_int = x.to(dtype=torch.int8)
+    w_int = w.to(dtype=torch.int8)
+    int_mm_time = benchmark_in_ms(10, 100, int_matmul, x_int, w_int)
+    return fp_time, int_mm_time
+
+
+def run_int_scaled_mm_benchmark(x, w, b):
+    scales = x.sum(-1, keepdim=True)
+    fp_time = benchmark_in_ms(10, 100, lambda x, w, s: torch.mm(x, w) * s, x, w, scales)
+    x_int = x.to(dtype=torch.int8)
+    w_int = w.to(dtype=torch.int8)
+    int_scaled_mm_time = benchmark_in_ms(
+        10, 100, int_scaled_matmul, x_int, w_int, scales
+    )
+    return fp_time, int_scaled_mm_time
+
+
+def run_benchmarks(shapes):
+    print("fn,m,k,n,fp_time,int_mm_time,ratio")
+    positives = []
+    dtype = torch.bfloat16
+    device = "cuda"
+    for fn, (m, k, n) in itertools.product(
+        [run_int_mm_benchmark, run_int_scaled_mm_benchmark], shapes
+    ):
+        x = torch.randn(m, k, dtype=dtype, device=device)
+        w = torch.randn(n, k, dtype=dtype, device=device).t()
+        b = torch.randn(m, n, dtype=dtype, device=device)
+
+        fp_time, int_mm_time = fn(x, w, b)
+        ratio = fp_time / int_mm_time
+        result = ",".join(map(str, [fn, m, k, n, fp_time, int_mm_time, ratio]))
+        print(result)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="integer matmul benchmarks")
+    parser.add_argument("file_path", type=str, help="Path to csv file with shapes")
+    args = parser.parse_args()
+    # Access the file path provided as an argument
+    file_path = args.file_path
+    file_path = pathlib.Path(file_path)
+    assert file_path.is_file()
+
+    # Format is (m, k, n)
+    shapes = list(csv.reader(open(file_path, "r")))[1:]
+    # Turn into list of int tuples
+    shapes = list(map(lambda x: tuple(map(int, x)), shapes))
+
+    run_benchmarks(shapes)
@@ -0,0 +1,127 @@
+m,k,n
+1024,1024,2304
+1024,1024,4608
+1024,8192,2304
+1024,8192,4608
+1152,1024,2048
+1152,2048,16384
+1152,2048,2048
+1152,3072,2048
+1152,4096,2048
+1152,8192,2048
+1,2048,1024
+1,2048,2048
+1,2048,4096
+144,2048,16384
+144,2048,2048
+144,4096,2048
+144,8192,2048
+1472,1024,154
+1472,1024,308
+1472,2048,154
+1472,2048,308
+1472,512,154
+1472,512,308
+1,512,2048
+154,1472,1024
+154,1472,2048
+154,1472,512
+18432,1024,512
+18432,1536,512
+18432,2048,512
+18432,512,4096
+18432,512,512
+2048,1024,1
+2048,1024,2
+2048,16384,1152
+2048,16384,144
+2048,16384,288
+2048,16384,576
+2048,2048,1
+2048,2048,1152
+2048,2048,144
+2048,2048,2
+2048,2048,288
+2048,2048,576
+2048,4096,1
+2048,4096,2
+2048,512,18432
+2048,512,9216
+2,2048,1024
+2,2048,2048
+2,2048,4096
+2304,1024,1024
+2304,1024,8192
+2304,1536,1024
+2304,2048,1024
+2304,3072,1024
+2304,4096,1024
+2304,512,1024
+231,4096,1024
+231,4096,2048
+231,4096,512
+231,768,1024
+231,768,2048
+231,768,512
+2,512,2048
+288,2048,16384
+288,2048,2048
+288,4096,2048
+288,8192,2048
+308,1472,1024
+308,1472,2048
+308,1472,512
+4096,1024,2304
+4096,1024,231
+4096,1024,4608
+4096,1024,462
+4096,2048,231
+4096,2048,462
+4096,512,231
+4096,512,462
+4608,1024,1024
+4608,1024,8192
+4608,1536,1024
+4608,2048,1024
+4608,3072,1024
+4608,4096,1024
+4608,512,1024
+462,4096,1024
+462,4096,2048
+462,4096,512
+462,768,1024
+462,768,2048
+462,768,512
+512,2048,1
+512,2048,2
+512,4096,18432
+512,4096,9216
+512,512,18432
+512,512,9216
+576,1024,2048
+576,2048,16384
+576,2048,2048
+576,3072,2048
+576,4096,2048
+576,8192,2048
+768,1024,231
+768,1024,462
+768,2048,231
+768,2048,462
+768,512,231
+768,512,462
+8192,2048,1152
+8192,2048,144
+8192,2048,288
+8192,2048,576
+9216,1024,512
+9216,1536,512
+9216,2048,512
+9216,512,4096
+9216,512,512
+32768,3072,768
+32768,768,2304
+32768,768,3072
+32768,768,768
+39200,768,2304
+39200,768,768
@@ -0,0 +1,16 @@
+import torchao
+
+from torchao.kernel import autotuner
+
+configs = autotuner._load_best_configs()
+
+print("m,k,n")
+for k, v in configs.items():
+    a_shape = k[1]
+    b_shape = k[4]
+    M, K0 = a_shape
+    K1, N = b_shape
+
+    assert K0 == K1
+
+    print(f"{M},{K0},{N}")
@@ -0,0 +1,7 @@
+m,k,n
+32768,3072,768
+32768,768,2304
+32768,768,3072
+32768,768,768
+39200,768,2304
+39200,768,768
@@ -1,3 +1,4 @@
 pytest
 expecttest
-packaging
+parameterized
+packaging
@@ -0,0 +1,22 @@
+# Lintrunner itself
+lintrunner==0.11.0
+lintrunner-adapters==0.11.0
+
+# Flake 8 and its dependencies
+flake8==6.0.0
+flake8-breakpoint==1.1.0
+flake8-bugbear==23.6.5
+flake8-comprehensions==3.12.0
+flake8-pyi==23.5.0
+mccabe==0.7.0
+pycodestyle==2.10.0
+torchfix==0.1.1
+
+# UFMT
+black==24.3.0
+ufmt==2.5.1
+usort==1.0.5
+
+# Other linters
+clang-format==12.0.1
+cmakelint==1.4.1
@@ -1,3 +1,4 @@
 torch
 numpy
 sentencepiece
+packaging
-Original file line number
+Diff line change
@@ @@ -1,3 +1,4 @@ @@
 torch
 numpy
 sentencepiece
 +packaging