Remove unneeded branches in fmin/fmax helpers #329

Merged · 1 commit · May 11, 2023
runtime/helpers.cu: 6 additions & 14 deletions
@@ -92,9 +92,7 @@ __device__ double fmax(double a, double b) {
   // check and propagate NaN
   if (a != a) {
     return a;
-  } else if (b != b) {
-    return b;
-  } else {
+  } else { // If b is nan, it will be returned in the next line
     return a > b ? a : b;
   }
 }
@@ -103,9 +101,7 @@ __device__ float fmax(float a, float b) {
   // check and propagate NaN
   if (a != a) {
     return a;
-  } else if (b != b) {
-    return b;
-  } else {
+  } else { // If b is nan, it will be returned in the next line
     return a > b ? a : b;
   }
 }
@@ -128,22 +124,18 @@ __device__ constexpr int64_t min(int64_t a, int64_t b) {

 __device__ double fmin(double a, double b) {
   // check and propagate NaN
-  if (a != a) {
-    return a;
-  } else if (b != b) {
+  if (b != b) {
     return b;
-  } else {
+  } else { // If a is nan, it will be returned in the next line
     return a > b ? b : a;
   }
 }

 __device__ float fmin(float a, float b) {
   // check and propagate NaN
-  if (a != a) {
-    return a;
-  } else if (b != b) {
+  if (b != b) {
     return b;
-  } else {
+  } else { // If a is nan, it will be returned in the next line
     return a > b ? b : a;
   }
 }
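The removed else-if branches are redundant because of IEEE 754 comparison semantics: any ordered comparison involving NaN is false. In fmax, if b is NaN then a > b is false and the ternary already returns b; in fmin, if a is NaN then a > b is false and the ternary already returns a. A minimal host-side C++ sketch (not part of this PR; fmax_like and fmin_like are illustrative names) that exercises the same ternary logic:

#include <cassert>
#include <cmath>
#include <limits>

// Mirrors the simplified device helpers: one explicit NaN check per function;
// the other operand's NaN falls through the ternary, whose comparison is
// false whenever either operand is NaN.
double fmax_like(double a, double b) {
  if (a != a) { // a is NaN
    return a;
  } else { // if b is NaN, a > b is false, so b (NaN) is returned
    return a > b ? a : b;
  }
}

double fmin_like(double a, double b) {
  if (b != b) { // b is NaN
    return b;
  } else { // if a is NaN, a > b is false, so a (NaN) is returned
    return a > b ? b : a;
  }
}

int main() {
  const double nan = std::numeric_limits<double>::quiet_NaN();
  assert(std::isnan(fmax_like(1.0, nan)));
  assert(std::isnan(fmax_like(nan, 1.0)));
  assert(std::isnan(fmin_like(1.0, nan)));
  assert(std::isnan(fmin_like(nan, 1.0)));
  assert(fmax_like(1.0, 2.0) == 2.0);
  assert(fmin_like(1.0, 2.0) == 1.0);
  return 0;
}

Note that this NaN-propagating behavior differs from std::fmax and std::fmin, which per the C standard return the non-NaN operand when exactly one argument is NaN; the helpers propagate NaN so the fused reductions match the ATen reference used in the test below.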
test/test_gpu3.cpp: 45 additions & 0 deletions
@@ -8296,6 +8296,51 @@ TEST_F(NVFuserTest, FusionClearGmemBetweenSegments_CUDA) {
   testValidate(
       executor_cache.fusion(), outputs, {at_x}, {t4}, __LINE__, __FILE__);
 }
+
+// Test nan propagation during min/max with floats and doubles
+TEST_F(NVFuserTest, FusionMinMaxNanPropagation_CUDA) {
+  for (auto dtype : {DataType::Float, DataType::Double}) {
+    for (auto do_min : {true, false}) {
+      auto fusion = std::make_unique<Fusion>();
+      FusionGuard fg(fusion.get());
+
+      auto tv0 = makeSymbolicTensor(2, dtype);
+      fusion->addInput(tv0);
+      auto tv1 = do_min ? min(tv0, {1}) : max(tv0, {1});
+      fusion->addOutput(tv1);
+
+      FusionExecutorCache executor_cache(std::move(fusion));
+
+      auto options =
+          at::TensorOptions()
+              .dtype(dtype == DataType::Float ? at::kFloat : at::kDouble)
+              .device(at::kCUDA, 0);
+      // Test size 1 since it will have a single comparison, which checks
+      // missing propagation in one position even if it propagates properly in
+      // the other position
+      for (auto size : {1, 2, 5}) {
+        // To check nans in multiple positions along reduction axis create a 2D
+        // tensor that is ones except the diagonal, which are nans
+        auto at_x = at::eye(size, options);
+        at_x = (1 - at_x) / (1 - at_x);
+        std::vector<c10::IValue> inputs{at_x};
+
+        std::vector<at::Tensor> at_outputs(
+            {do_min ? at_x.amin(1) : at_x.amax(1)});
+        auto nvf_outputs = executor_cache.runFusionWithInputs(inputs);
+
+        testValidate(
+            executor_cache.fusion(),
+            nvf_outputs,
+            inputs,
+            at_outputs,
+            __LINE__,
+            __FILE__);
+      }
+    }
+  }
+}
+
 // Test file size should be up to 10K LoC. Create a new file for more tests.

 } // namespace nvfuser
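
The new test builds its NaN pattern from an identity matrix: (1 - eye) / (1 - eye) is 0/0 = NaN on the diagonal and 1/1 = 1 everywhere else, so each row of the reduction axis contains exactly one NaN in a different column, and size 1 additionally covers the degenerate single-comparison case. A standalone ATen sketch of the same construction (assuming a libtorch build with CUDA; make_nan_diagonal is an illustrative name, not part of the PR):

#include <ATen/ATen.h>
#include <iostream>

// size x size tensor that is 1 everywhere except the diagonal, which is NaN
// (0/0), matching the input used by FusionMinMaxNanPropagation_CUDA.
at::Tensor make_nan_diagonal(int64_t size, at::ScalarType dtype) {
  auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
  auto x = at::eye(size, options);
  return (1 - x) / (1 - x); // 0/0 -> NaN on the diagonal, 1/1 -> 1 elsewhere
}

int main() {
  auto t = make_nan_diagonal(3, at::kFloat);
  // The ATen reductions used as the reference propagate NaN, so every row
  // reduces to NaN; the fused kernel is expected to match.
  std::cout << t.amax(1) << std::endl; // nan nan nan
  std::cout << t.amin(1) << std::endl; // nan nan nan
  return 0;
}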