diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
index f8e30f31a961..669d1d1eb539 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
@@ -21,6 +21,9 @@ using llvm::APIntOps::GreatestCommonDivisor;
 
 namespace mlir::iree_compiler {
 
+// Threshold used to determine whether a matmul dimension is 'very skinny'.
+constexpr int64_t kVerySkinnyDimThreshold = 4;
+
 template <typename T>
 static llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
                                      const llvm::SmallVectorImpl<T> &vector) {
@@ -77,17 +80,17 @@ calculateResultSharedMemoryUsedInBytes(const GPUMMASchedule &schedule,
 static bool isScheduleAligned(const GPUMatmulShapeType &problem,
                               const GPUMMASchedule &schedule,
                               bool mustBeAligned) {
-  SmallVector<int64_t> alignedMSizes(problem.mSizes);
+  SmallVector<int64_t, 2> alignedMSizes(problem.mSizes);
   alignedMSizes.back() =
       mustBeAligned ? problem.mSizes.back()
                     : llvm::divideCeil(problem.mSizes.back(), schedule.mSize) *
                           schedule.mSize;
-  SmallVector<int64_t> alignedNSizes(problem.nSizes);
+  SmallVector<int64_t, 2> alignedNSizes(problem.nSizes);
   alignedNSizes.back() =
       mustBeAligned ? problem.nSizes.back()
                     : llvm::divideCeil(problem.nSizes.back(), schedule.nSize) *
                           schedule.nSize;
-  SmallVector<int64_t> alignedKSizes(problem.kSizes);
+  SmallVector<int64_t, 2> alignedKSizes(problem.kSizes);
   alignedKSizes.back() =
       mustBeAligned ? problem.kSizes.back()
                     : llvm::divideCeil(problem.kSizes.back(), schedule.kSize) *
@@ -106,7 +109,7 @@ static bool isScheduleAligned(const GPUMatmulShapeType &problem,
   };
   // Checks whether the elements of `a` are evenly divisible by the
   // corresponding elements of `b`.
-  auto areAligned = [](SmallVector<int64_t> a, SmallVector<int64_t> b) {
+  auto areAligned = [](SmallVector<int64_t, 2> a, SmallVector<int64_t, 2> b) {
     for (auto [aVal, bVal] : llvm::zip_equal(a, b)) {
       if (aVal % bVal != 0) {
         return false;
       }
     }
@@ -223,6 +226,7 @@ static FailureOr<GPUMMASchedule> fitScheduleInSharedMemory(
 
 static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
                                         const GPUMatmulShapeType &intrinsic,
+                                        int64_t preferredSubgroupSize,
                                         bool canUpcastAcc, bool mustBeAligned) {
   assert(intrinsic.mSizes.size() == 1 && intrinsic.nSizes.size() == 1 &&
          intrinsic.kSizes.size() == 1 &&
@@ -240,12 +244,17 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
     }
   }
 
-  if (mustBeAligned && (problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
-                        problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
-                        problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
-    return failure(); // Cannot use this intrinsic for misaligned cases.
+  if (mustBeAligned) {
+    if ((problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
+         problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
+         problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
+      return failure();
+    }
+    return success();
   }
 
+  // Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction
+  // pipeline, similar to matvec.
   // TODO: Figure out what the precise cutoff is, this may be machine dependent.
   // In situation when alignment isn't required, we disallow intrinsics to be
   // picked if the tile size is too small. For example, this will force a matmul
@@ -255,10 +264,15 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
   // established after we sweep the different tile sizes for a problem config.
   // Once a precise threshold is established, replace 4 with the threshold and
   // remove this todo.
-  if (!mustBeAligned &&
-      (problem.mSizes.back() < 4 || problem.nSizes.back() < 4 ||
-       problem.kSizes.back() < 4)) {
-    return failure();
+  if (llvm::all_equal({problem.mSizes.size(), problem.nSizes.size(),
+                       problem.kSizes.size(), size_t{1}}) &&
+      problem.batchSizes.empty()) {
+    int64_t mSize = problem.mSizes.back();
+    int64_t nSize = problem.nSizes.back();
+    if ((mSize <= kVerySkinnyDimThreshold && (nSize > preferredSubgroupSize)) ||
+        (nSize <= kVerySkinnyDimThreshold && (mSize > preferredSubgroupSize))) {
+      return failure();
+    }
   }
   return success();
 }
@@ -279,8 +293,8 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
   // 16x16x16 intrinsic, then:
   //  - mTotalTileCounts would be 4 * (16/16) = 4
   //  - nTotalTileCounts would be 2 * (32/16) = 4
-  SmallVector<int64_t> mTotalTileCounts = problem.mSizes;
-  SmallVector<int64_t> nTotalTileCounts = problem.nSizes;
+  SmallVector<int64_t, 2> mTotalTileCounts = problem.mSizes;
+  SmallVector<int64_t, 2> nTotalTileCounts = problem.nSizes;
   mTotalTileCounts.back() =
       llvm::divideCeil(problem.mSizes.back(), intrinsic.mSizes[0]);
   nTotalTileCounts.back() =
@@ -361,7 +375,7 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
   // For the problem described above {M:[4, 16], N:[2, 32], K[3, 128]} with a
   // 16x16x16 intrinsic, then:
   //  - kTotalTileCounts would be 3 * (128/16) = 24
-  SmallVector<int64_t> kTotalTileCounts = problem.kSizes;
+  SmallVector<int64_t, 2> kTotalTileCounts = problem.kSizes;
   kTotalTileCounts.back() =
       llvm::divideCeil(problem.kSizes.back(), intrinsic.kSizes[0]);
   // Compute the ideal number of intrinsics along K per subgroup based on the
@@ -395,8 +409,8 @@ FailureOr<GPUMMASchedule> deduceMMASchedule(
     int64_t subgroupSize, bool transposedLhs, bool transposedRhs,
     bool canUpcastAcc, bool mustBeAligned, bool doCPromotion) {
   for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
-    if (failed(canTargetIntrinsic(problem, intrinsic, canUpcastAcc,
-                                  mustBeAligned))) {
+    if (failed(canTargetIntrinsic(problem, intrinsic, subgroupSize,
+                                  canUpcastAcc, mustBeAligned))) {
       continue;
     }
 
@@ -450,13 +464,13 @@ FailureOr<GPUMMASchedule> deduceAttentionSchedule(
          qkMatmul.nSizes.size() == 1 && qkMatmul.kSizes.size() == 1 &&
          "unimplemented: multi M/N/K attention schedule");
   for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
-    if (failed(canTargetIntrinsic(qkMatmul, intrinsic, canUpcastAcc,
-                                  mustBeAligned))) {
+    if (failed(canTargetIntrinsic(qkMatmul, intrinsic, subgroupSize,
+                                  canUpcastAcc, mustBeAligned))) {
       continue;
     }
 
-    if (failed(canTargetIntrinsic(pvMatmul, intrinsic, canUpcastAcc,
-                                  mustBeAligned))) {
+    if (failed(canTargetIntrinsic(pvMatmul, intrinsic, subgroupSize,
+                                  canUpcastAcc, mustBeAligned))) {
       continue;
     }
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
index a35e2b464632..6542d11ebf18 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
@@ -10,18 +10,22 @@ namespace mlir::iree_compiler {
 
 /// Struct containing information about a matmul's shape and type.
 struct GPUMatmulShapeType {
-  SmallVector<int64_t> mSizes;
-  SmallVector<int64_t> nSizes;
-  SmallVector<int64_t> kSizes;
+  SmallVector<int64_t, 2> mSizes;
+  SmallVector<int64_t, 2> nSizes;
+  SmallVector<int64_t, 2> kSizes;
+  SmallVector<int64_t, 2> batchSizes;
   Type aType;
   Type bType;
   Type cType;
 
   GPUMatmulShapeType(int64_t m, int64_t n, int64_t k, Type a, Type b, Type c)
-      : mSizes({m}), nSizes({n}), kSizes({k}), aType(a), bType(b), cType(c) {}
-  GPUMatmulShapeType(SmallVector<int64_t> m, SmallVector<int64_t> n,
-                     SmallVector<int64_t> k, Type a, Type b, Type c)
-      : mSizes(m), nSizes(n), kSizes(k), aType(a), bType(b), cType(c) {}
+      : mSizes({m}), nSizes({n}), kSizes({k}), batchSizes({}), aType(a),
+        bType(b), cType(c) {}
+  GPUMatmulShapeType(ArrayRef<int64_t> m, ArrayRef<int64_t> n,
+                     ArrayRef<int64_t> k, ArrayRef<int64_t> batch, Type a,
+                     Type b, Type c)
+      : mSizes(m), nSizes(n), kSizes(k), batchSizes(batch), aType(a), bType(b),
+        cType(c) {}
 };
 
 /// Struct containing seed tile sizes for GPU MMA heuristics deduction logic.
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 7b62d6955b10..48bfcc9a7c2a 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -202,23 +202,29 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
   // Gather all static M, N, and K dimensions to deduce the MMASchedule. Dynamic
   // dimensions will be tiled to 1 in workgroup tiling, so they are ignored when
   // computing an MMA schedule.
-  SmallVector<int64_t> mDims, nDims, kDims;
-  for (auto mDim : contractionDims.m) {
+  SmallVector<int64_t> mDims, nDims, kDims, batchDims;
+  for (int64_t mDim : contractionDims.m) {
     if (!ShapedType::isDynamic(bounds[mDim])) {
       mDims.push_back(mDim);
     }
   }
-  for (auto nDim : contractionDims.n) {
+  for (int64_t nDim : contractionDims.n) {
     if (!ShapedType::isDynamic(bounds[nDim])) {
       nDims.push_back(nDim);
     }
   }
-  for (auto kDim : contractionDims.k) {
+  for (int64_t kDim : contractionDims.k) {
     if (!ShapedType::isDynamic(bounds[kDim])) {
       kDims.push_back(kDim);
     }
   }
+  for (int64_t batchDim : contractionDims.batch) {
+    if (!ShapedType::isDynamic(bounds[batchDim])) {
+      batchDims.push_back(batchDim);
+    }
+  }
+
   auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
     return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
   };
 
@@ -233,8 +239,9 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
   Type initElemType = getElementTypeOrSelf(init);
 
   GPUMatmulShapeType problem{getDimBounds(mDims), getDimBounds(nDims),
-                             getDimBounds(kDims), lhsElemType,
-                             rhsElemType, initElemType};
+                             getDimBounds(kDims), getDimBounds(batchDims),
+                             lhsElemType, rhsElemType,
+                             initElemType};
 
   // Infer if lhs or rhs is transposed to help generate better schedule.
   // TODO: Drop this. This is only a consideration for other pipelines.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 2ee7e8241c17..8833b4203156 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -536,13 +536,24 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
     rhsElemType = getElementTypeOrSelf(rhsOp.getDpsInputs()[0]);
   }
 
+  SmallVector<int64_t> batchDims;
+  for (int64_t batchDim : contractionDims->batch) {
+    if (!ShapedType::isDynamic(bounds[batchDim])) {
+      batchDims.push_back(batchDim);
+    }
+  }
+  auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
+    return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
+  };
+
   // TODO(Max191): Support multiple M/N/K dimension problems for MMASchedules
   // once the pipeline is able to support it. After adding multiple dimensions,
   // all instances of schedule->m/nSubgroupCounts[0] and
   // schedule->m/n/kTileSizes[0] need to use the full list of sizes instead of
   // just the first element.
-  GPUMatmulShapeType problem{bounds[mDim], bounds[nDim], bounds[kDim],
-                             lhsElemType, rhsElemType, initElemType};
+  GPUMatmulShapeType problem{
+      {bounds[mDim]}, {bounds[nDim]}, {bounds[kDim]}, getDimBounds(batchDims),
+      lhsElemType,    rhsElemType,    initElemType};
 
   // Helper fn to store mma information.
   auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index 910eca3c1768..ec6038f47dee 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -282,12 +282,12 @@ module {
 
 // -----
 
 module {
-func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> {
+func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x2x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x2x577xf32> {
  %c0 = arith.constant 0.0 : f32
-  %empty = tensor.empty() : tensor<12x577x577xf32>
-  %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
-  %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x577x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
-  return %mm : tensor<12x577x577xf32>
+  %empty = tensor.empty() : tensor<12x2x577xf32>
+  %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
+  %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x2x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
+  return %mm : tensor<12x2x577xf32>
 }
 }
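The standalone sketch below is not part of the patch; it only illustrates the "very skinny" gate the change adds to canTargetIntrinsic. For an unaligned problem with a single M, N, and K dimension and no batch dimensions, the MMA intrinsic is rejected when one of M/N is at most kVerySkinnyDimThreshold while the other exceeds the preferred subgroup size, so the matmul falls through to the vector reduction pipeline, similar to a matvec. The helper name and the plain-integer interface are assumptions made for illustration; the real code operates on GPUMatmulShapeType and returns LogicalResult.

#include <cstdint>
#include <iostream>

// Same threshold the patch introduces in GPUHeuristics.cpp.
constexpr std::int64_t kVerySkinnyDimThreshold = 4;

// Hypothetical helper (illustration only): returns false when the
// unaligned-case gate decides the MMA intrinsic should not be used, mirroring
// the check added to canTargetIntrinsic. A problem where one of M/N is very
// skinny (<= 4) while the other side is wider than the preferred subgroup
// size is better served by the vector reduction pipeline.
bool canUseIntrinsicForUnalignedMatmul(std::int64_t mSize, std::int64_t nSize,
                                       std::int64_t preferredSubgroupSize) {
  if ((mSize <= kVerySkinnyDimThreshold && nSize > preferredSubgroupSize) ||
      (nSize <= kVerySkinnyDimThreshold && mSize > preferredSubgroupSize)) {
    return false;
  }
  return true;
}

int main() {
  // M=2, N=577 (the shape used in the updated batched matmul test) with a
  // 64-wide subgroup: very skinny, so the intrinsic is rejected -> prints 0.
  std::cout << canUseIntrinsicForUnalignedMatmul(2, 577, 64) << "\n";
  // M=N=128: both dimensions are wide enough, the intrinsic stays eligible
  // -> prints 1.
  std::cout << canUseIntrinsicForUnalignedMatmul(128, 128, 64) << "\n";
  return 0;
}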