
Commit 06eca3f

[Torch] Fix decomposition of matmul to bmm (#4404)
This change prevents decomposing `torch.matmul` to `torch.bmm` when the batch dimensions are broadcast, because `torch.bmm` does not support broadcasting. Before this change, the added test case would result in a compilation failure.

Signed-off-by: Ian Wood <ianwood@u.northwestern.edu>
1 parent: 0844d4d
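For context, a minimal eager-PyTorch sketch (not part of this commit) that reproduces the mismatch with the same shapes as the added test case:

import torch

lhs = torch.rand(4, 8, 5)
rhs = torch.rand(1, 5, 6)

# torch.matmul broadcasts the leading batch dimensions (1 -> 4) and succeeds.
out = torch.matmul(lhs, rhs)
print(out.shape)  # torch.Size([4, 8, 6])

# torch.bmm requires two 3-D inputs with identical batch sizes, so the same
# operands raise a RuntimeError instead of broadcasting.
try:
    torch.bmm(lhs, rhs)
except RuntimeError as err:
    print("bmm rejects broadcast batches:", err)

Rewriting the first call as the second turns a valid program into an invalid one, which is the failure mode the guard below removes.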

File tree: 3 files changed, +64 −4 lines


lib/Dialect/Torch/Transforms/DecomposeComplexOps.cpp

Lines changed: 21 additions & 1 deletion
@@ -3222,7 +3222,27 @@ class DecomposeAtenMatmulOp : public OpRewritePattern<AtenMatmulOp> {
       // If both lhs and rhs ranks are 2 then map it to `aten.mm` op.
       rewriter.replaceOpWithNewOp<AtenMmOp>(op, op.getType(), lhs, rhs);
     } else if (lhsRank == 3 && rhsRank == 3) {
-      // If both lhs and rhs ranks are 3 then map it to `aten.bmm` op.
+      // If both lhs and rhs ranks are 3, we can only map it to `aten.bmm` op
+      // if the batch dimensions are equal (since bmm doesn't support
+      // broadcasting).
+      auto lhsType = cast<BaseTensorType>(lhs.getType());
+      auto rhsType = cast<BaseTensorType>(rhs.getType());
+
+      if (!lhsType.hasSizes() || !rhsType.hasSizes())
+        return failure();
+
+      ArrayRef<int64_t> lhsShape = lhsType.getSizes();
+      ArrayRef<int64_t> rhsShape = rhsType.getSizes();
+      int64_t lhsBatchDim = lhsShape[0];
+      int64_t rhsBatchDim = rhsShape[0];
+
+      // Batch dimensions must be statically known and equal for bmm.
+      // Dynamic dimensions (kUnknownSize) or unequal dimensions require the
+      // general matmul lowering which handles broadcasting.
+      if (lhsBatchDim == kUnknownSize || rhsBatchDim == kUnknownSize ||
+          lhsBatchDim != rhsBatchDim)
+        return failure();
+
       rewriter.replaceOpWithNewOp<AtenBmmOp>(op, op.getType(), lhs, rhs);
     } else {
       return failure();
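The guard declines the bmm decomposition unless both batch dimensions are statically known and equal. A hypothetical Python rendering of the same predicate, for illustration only (the helper name is invented here, and None stands in for kUnknownSize):

def can_decompose_to_bmm(lhs_shape, rhs_shape):
    # Mirrors the C++ check: both operands must be rank 3 with statically
    # known, equal batch dimensions; None models a dynamic dimension.
    if len(lhs_shape) != 3 or len(rhs_shape) != 3:
        return False
    lhs_batch, rhs_batch = lhs_shape[0], rhs_shape[0]
    if lhs_batch is None or rhs_batch is None:
        return False
    return lhs_batch == rhs_batch

assert can_decompose_to_bmm([4, 8, 5], [4, 5, 6])         # static, equal: bmm is safe
assert not can_decompose_to_bmm([4, 8, 5], [1, 5, 6])     # broadcast: keep matmul
assert not can_decompose_to_bmm([None, 8, 5], [4, 5, 6])  # dynamic: keep matmul

The check is deliberately conservative: two dynamic batch dimensions may turn out equal at runtime, but without static evidence the pattern returns failure() and leaves the general matmul lowering, which handles broadcasting, in place.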

projects/pt1/python/torch_mlir_e2e_test/test_suite/matmul.py

Lines changed: 24 additions & 0 deletions
@@ -156,6 +156,30 @@ def Matmul_3d(module, tu: TestUtils):
 # ==============================================================================
 
 
+class Matmul3DStaticBroadcast(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    @export
+    @annotate_args(
+        [
+            None,
+            ([4, 8, 5], torch.float32, True),
+            ([1, 5, 6], torch.float32, True),
+        ]
+    )
+    def forward(self, lhs, rhs):
+        return torch.matmul(lhs, rhs)
+
+
+@register_test_case(module_factory=lambda: Matmul3DStaticBroadcast())
+def Matmul3DStaticBroadcast_basic(module, tu: TestUtils):
+    module.forward(tu.rand(4, 8, 5), tu.rand(1, 5, 6))
+
+
+# ==============================================================================
+
+
 class Matmul4d(torch.nn.Module):
     def __init__(self):
         super().__init__()
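The test shapes differ in their batch dimension but are broadcast-compatible, so the module must go through the general matmul lowering. As a quick sanity check of the intended semantics in stock PyTorch (independent of this repo's harness):

import torch

# Batch sizes 4 and 1 broadcast together; the 1 stretches to 4.
print(torch.broadcast_shapes((4,), (1,)))  # torch.Size([4])

out = torch.matmul(torch.rand(4, 8, 5), torch.rand(1, 5, 6))
assert out.shape == (4, 8, 6)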

test/Dialect/Torch/decompose-complex-ops.mlir

Lines changed: 19 additions & 3 deletions
@@ -18,13 +18,29 @@ func.func @matmul_decompose_2d(%arg0: !torch.vtensor<[?,?],f32>, %arg1: !torch.v
 }
 
 // -----
-// CHECK-LABEL: func.func @matmul_decompose_3d(
-// CHECK: torch.aten.bmm %arg0, %arg1 : !torch.vtensor<[?,?,?],f32>, !torch.vtensor<[?,?,?],f32> -> !torch.tensor
-func.func @matmul_decompose_3d(%arg0: !torch.vtensor<[?,?,?],f32>, %arg1: !torch.vtensor<[?,?,?],f32>) -> !torch.tensor {
+// CHECK-LABEL: func.func @matmul_no_decompose_3d_dynamic(
+// CHECK: torch.aten.matmul %arg0, %arg1 : !torch.vtensor<[?,?,?],f32>, !torch.vtensor<[?,?,?],f32> -> !torch.tensor
+func.func @matmul_no_decompose_3d_dynamic(%arg0: !torch.vtensor<[?,?,?],f32>, %arg1: !torch.vtensor<[?,?,?],f32>) -> !torch.tensor {
   %0 = torch.aten.matmul %arg0, %arg1 : !torch.vtensor<[?,?,?],f32>, !torch.vtensor<[?,?,?],f32> -> !torch.tensor
   return %0 : !torch.tensor
 }
 
+// -----
+// CHECK-LABEL: func.func @matmul_decompose_3d_static(
+// CHECK: torch.aten.bmm %arg0, %arg1 : !torch.vtensor<[4,?,?],f32>, !torch.vtensor<[4,?,?],f32> -> !torch.tensor
+func.func @matmul_decompose_3d_static(%arg0: !torch.vtensor<[4,?,?],f32>, %arg1: !torch.vtensor<[4,?,?],f32>) -> !torch.tensor {
+  %0 = torch.aten.matmul %arg0, %arg1 : !torch.vtensor<[4,?,?],f32>, !torch.vtensor<[4,?,?],f32> -> !torch.tensor
+  return %0 : !torch.tensor
+}
+
+// -----
+// CHECK-LABEL: func.func @matmul_no_decompose_3d_broadcast(
+// CHECK: torch.aten.matmul %arg0, %arg1 : !torch.vtensor<[4,?,?],f32>, !torch.vtensor<[1,?,?],f32> -> !torch.tensor
+func.func @matmul_no_decompose_3d_broadcast(%arg0: !torch.vtensor<[4,?,?],f32>, %arg1: !torch.vtensor<[1,?,?],f32>) -> !torch.tensor {
+  %0 = torch.aten.matmul %arg0, %arg1 : !torch.vtensor<[4,?,?],f32>, !torch.vtensor<[1,?,?],f32> -> !torch.tensor
+  return %0 : !torch.tensor
+}
+
 // -----
 // CHECK-LABEL: func.func @argmax_rank_1
 // CHECK: %[[I0:.*]] = torch.constant.int 0