Skip to content

Commit

Permalink
Use compile-time promotion to reduce bitwise op size & build time
Browse files Browse the repository at this point in the history
Summary: Finally getting close to the end of compile-time promotion for Tensor ops!

Differential Revision: D56855548
  • Loading branch information
swolchok authored and facebook-github-bot committed May 3, 2024
1 parent 18d869e commit f826372
Showing 1 changed file with 65 additions and 15 deletions.
80 changes: 65 additions & 15 deletions kernels/portable/cpu/pattern/bitwise_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,60 @@
namespace torch {
namespace executor {
namespace native {
namespace internal {

// Compile-time dispatcher for bitwise Tensor ops. Declared here only; the
// two partial specializations below (selected by the `can_cast` flag)
// provide the behavior. Callers instantiate it with
// can_cast<CTYPE_IN, CTYPE_OUT>::value as the first argument, so the
// elementwise kernel is only instantiated for castable type pairs.
template <
bool can_cast,
template <typename>
typename OpFunc,
typename CTYPE_A,
typename CTYPE_B,
typename CTYPE_IN,
typename CTYPE_OUT>
struct BitwiseOpInner;

// Implementation chosen when CTYPE_IN is castable to CTYPE_OUT: promote both
// operands to the common type CTYPE_IN, evaluate the op there, and narrow
// the result to the output element type.
template <
template <typename>
typename OpFunc,
typename CTYPE_A,
typename CTYPE_B,
typename CTYPE_IN,
typename CTYPE_OUT>
struct BitwiseOpInner<true, OpFunc, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
  static void run(const Tensor& a, const Tensor& b, Tensor& out) {
    apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
        [](const CTYPE_A lhs, const CTYPE_B rhs) {
          // Do the bitwise computation in the promoted type, then narrow
          // for storage in the output tensor.
          const CTYPE_IN promoted = OpFunc<CTYPE_IN>()(
              static_cast<CTYPE_IN>(lhs), static_cast<CTYPE_IN>(rhs));
          return static_cast<CTYPE_OUT>(promoted);
        },
        a,
        b,
        out);
  }
};

// Fallback whose run() is selected when canCast(CTYPE_IN, CTYPE_OUT) is
// false. The caller is expected to have verified castability before
// dispatching (per the message below), so reaching this at runtime
// indicates a bug; ET_DCHECK_MSG aborts in debug builds.
struct ReportCanCastBug {
static void run(const Tensor&, const Tensor&, Tensor&) {
ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
}
};

// Specialization for the non-castable case: inherits the debug-abort run()
// from ReportCanCastBug instead of instantiating the elementwise kernel,
// avoiding code generation for type combinations that can never execute
// (the commit's stated goal: smaller ops and faster builds).
template <
template <typename>
typename OpFunc,
typename CTYPE_A,
typename CTYPE_B,
typename CTYPE_IN,
typename CTYPE_OUT>
struct BitwiseOpInner<false, OpFunc, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
: public ReportCanCastBug {};

} // namespace internal

template <template <typename> typename OpFunc>
Tensor& bitwise_op_out(
RuntimeContext& ctx,
Expand All @@ -36,21 +90,17 @@ Tensor& bitwise_op_out(

ET_SWITCH_INT_TYPES_AND(Bool, a_type, ctx, op_name, CTYPE_A, [&]() {
ET_SWITCH_INT_TYPES_AND(Bool, b_type, ctx, op_name, CTYPE_B, [&]() {
ET_SWITCH_INT_TYPES_AND(Bool, common_type, ctx, op_name, CTYPE_IN, [&]() {
ET_SWITCH_REAL_TYPES_AND(
Bool, out_type, ctx, op_name, CTYPE_OUT, [&]() {
apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
[](const CTYPE_A val_a, const CTYPE_B val_b) {
CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
CTYPE_IN value = OpFunc<CTYPE_IN>()(a_casted, b_casted);

return static_cast<CTYPE_OUT>(value);
},
a,
b,
out);
});
using CTYPE_IN =
typename torch::executor::promote_types<CTYPE_A, CTYPE_B>::type;
ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, op_name, CTYPE_OUT, [&]() {
internal::BitwiseOpInner<
can_cast<CTYPE_IN, CTYPE_OUT>::value,
OpFunc,
CTYPE_A,
CTYPE_B,
CTYPE_IN,
CTYPE_OUT>::run(a, b, out);
});
});
});
Expand Down

0 comments on commit f826372

Please sign in to comment.