
[torch.compile] directly register custom op #9896

Merged: 25 commits, Nov 1, 2024
Changes from 1 commit
hack fix library
Signed-off-by: youkaichao <youkaichao@gmail.com>
youkaichao committed Nov 1, 2024
commit d515d619a4f2f97b0bdd17d1b8652a4d82629fc4
tests/compile/piecewise/piecewise_compilation_config.json (2 changes: 1 addition & 1 deletion)
@@ -1,4 +1,4 @@
 {
     "use_cudagraph": true,
-    "non_cudagraph_ops": ["silly.attention"]
+    "non_cudagraph_ops": ["vllm.toy_attention"]
 }
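Note: the string in non_cudagraph_ops is the fully qualified op name, "<library_name>.<op_name>", so this JSON entry has to track the registration rename in the tests below. A minimal sketch of that naming relationship (illustrative values only, mirroring this commit; not part of the diff):

    # Illustrative sketch: how the config string relates to the registration
    # arguments used in the tests below.
    library_name = "vllm"
    op_name = "toy_attention"
    qualified_name = f"{library_name}.{op_name}"   # -> "vllm.toy_attention"
    assert qualified_name == "vllm.toy_attention"  # matches the JSON entry above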
tests/compile/piecewise/test_simple.py (8 changes: 4 additions & 4 deletions)
@@ -33,8 +33,8 @@ def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
 
 
 direct_register_custom_op(
-    library_name="silly",
-    op_name="attention",
+    library_name="vllm",
+    op_name="toy_attention",
     op_func=silly_attention,
     mutates_args=["out"],
     fake_impl=silly_attention_fake,
@@ -57,12 +57,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = x + 1
         x = x + 2
         out = torch.empty_like(x)
-        torch.ops.silly.attention(x, x, x, out)
+        torch.ops.vllm.toy_attention(x, x, x, out)
         x = out
         x = x - 2
         x = x - 1
         out = torch.empty_like(x)
-        torch.ops.silly.attention(x, x, x, out)
+        torch.ops.vllm.toy_attention(x, x, x, out)
         x = out
         x = x + 1
         return x
tests/compile/piecewise/test_toy_llama.py (8 changes: 4 additions & 4 deletions)
@@ -31,8 +31,8 @@ def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
 
 
 direct_register_custom_op(
-    library_name="silly",
-    op_name="attention",
+    library_name="vllm",
+    op_name="toy_attention",
     op_func=silly_attention,
     mutates_args=["out"],
     fake_impl=silly_attention_fake,
@@ -103,7 +103,7 @@ def forward(
         k = k + positions.unsqueeze(1)
 
         attn_output = torch.empty_like(q)
-        torch.ops.silly.attention(q, k, v, attn_output)
+        torch.ops.vllm.toy_attention(q, k, v, attn_output)
 
         output = self.output_projection(attn_output)
         return output
@@ -179,7 +179,7 @@ def run_model(llama_config,
         set_compilation_config(
             CompilationConfig(
                 use_cudagraph=True,
-                non_cudagraph_ops=["silly.attention"],
+                non_cudagraph_ops=["vllm.toy_attention"],
             ))
     else:
         set_compilation_config(CompilationConfig(use_cudagraph=True, ))
vllm/utils.py (7 changes: 6 additions & 1 deletion)
@@ -1515,6 +1515,9 @@ def weak_ref_tensors(
         raise ValueError("Invalid type for tensors")
 
 
+vllm_lib = Library("vllm", "FRAGMENT")
+
+
 def direct_register_custom_op(
     library_name: str,
     op_name: str,
@@ -1530,7 +1533,9 @@ def direct_register_custom_op(
     for more details.
     """
     schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
-    my_lib = Library(library_name, "FRAGMENT")
+    # FIXME after https://github.com/pytorch/pytorch/issues/139444 is resolved
+    assert library_name == "vllm"
+    my_lib = vllm_lib
     my_lib.define(op_name + schema_str)
     my_lib.impl(op_name, op_func, "CUDA")
     if fake_impl is not None:
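For readers following just this hunk: the previous code created a fresh Library(library_name, "FRAGMENT") on every call, and the FIXME points at pytorch/pytorch#139444 as the reason to switch to a single module-level vllm_lib that lives for the whole process. Below is a self-contained sketch of the resulting registration pattern. The toy op body, the CUDA guard, and the torch.library.register_fake call are assumptions for illustration (the hunk is truncated before the fake-impl registration); the module-level library plus the define/impl sequence mirror the shown code. It assumes a recent PyTorch that provides torch.library.infer_schema.

    import torch
    from torch.library import Library

    # One long-lived library object for the whole process; keeping a module-level
    # reference means the op registrations are not tied to a short-lived Library.
    vllm_lib = Library("vllm", "FRAGMENT")

    def toy_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                      out: torch.Tensor) -> None:
        # Stand-in body for illustration; the tests use a similarly simple op.
        out.copy_(q + k + v)

    def toy_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                           out: torch.Tensor) -> None:
        # Fake (meta) implementation: no computation, only metadata matters
        # when torch.compile traces the op.
        return

    # Same define/impl sequence as direct_register_custom_op in the hunk above.
    schema = torch.library.infer_schema(toy_attention, mutates_args=["out"])
    vllm_lib.define("toy_attention" + schema)
    vllm_lib.impl("toy_attention", toy_attention, "CUDA")
    # Public-API way to attach the fake impl (the helper may wire this differently).
    torch.library.register_fake("vllm::toy_attention", toy_attention_fake)

    if torch.cuda.is_available():
        q = torch.randn(4, 8, device="cuda")
        out = torch.empty_like(q)
        torch.ops.vllm.toy_attention(q, q, q, out)  # the registered op is callable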