onnx · xadupre · Nov 14, 2024 · Oct 18, 2024 · Oct 18, 2024 · Nov 14, 2024
diff --git a/skl2onnx/operator_converters/ordinal_encoder.py b/skl2onnx/operator_converters/ordinal_encoder.py
@@ -24,6 +24,12 @@ def convert_sklearn_ordinal_encoder(
     result = []
     input_idx = 0
     dimension_idx = 0
+
+    # handle the 'handle_unknown=use_encoded_value' case
+    default_value = (
+        None if ordinal_op.handle_unknown == "error" else int(ordinal_op.unknown_value)
+    )
+
     for categories in ordinal_op.categories_:
         if len(categories) == 0:
             continue
@@ -82,6 +88,7 @@ def convert_sklearn_ordinal_encoder(
             feature_column = casted_feature_column
 
         attrs = {"name": scope.get_unique_operator_name("LabelEncoder")}
+
         if isinstance(feature_column.type, FloatTensorType):
             attrs["keys_floats"] = np.array(
                 [float(s) for s in categories], dtype=np.float32
@@ -94,7 +101,26 @@ def convert_sklearn_ordinal_encoder(
             attrs["keys_strings"] = np.array(
                 [str(s).encode("utf-8") for s in categories]
             )
-        attrs["values_int64s"] = np.arange(len(categories)).astype(np.int64)
+
+        # handle encoded_missing_value
+        if not np.isnan(ordinal_op.encoded_missing_value) and (
+            isinstance(categories[-1], float) and np.isnan(categories[-1])
+        ):
+            # sklearn always places np.nan as the last entry
+            # in its categories if it was in the training data
+            # => we simply add the 'ordinal_op.encoded_missing_value'
+            # as our last entry in 'values_int64s' if it was in the training data
+            encoded_missing_value = np.array(
+                [int(ordinal_op.encoded_missing_value)]
+            ).astype(np.int64)
+            attrs["values_int64s"] = np.concatenate(
+                (np.arange(len(categories) - 1).astype(np.int64), encoded_missing_value)
+            )
+        else:
+            attrs["values_int64s"] = np.arange(len(categories)).astype(np.int64)
+
+        if default_value:
+            attrs["default_int64"] = default_value
 
         result.append(scope.get_unique_variable_name("ordinal_output"))
         label_encoder_output = scope.get_unique_variable_name("label_encoder")

diff --git a/tests/test_sklearn_ordinal_encoder.py b/tests/test_sklearn_ordinal_encoder.py
@@ -55,7 +55,7 @@ def test_model_ordinal_encoder(self):
             [("input", Int64TensorType([None, 3]))],
             target_opset=TARGET_OPSET,
         )
-        self.assertTrue(model_onnx is not None)
+        self.assertIsNotNone(model_onnx)
         dump_data_and_model(
             data, model, model_onnx, basename="SklearnOrdinalEncoderInt64-SkipDim1"
         )
@@ -182,6 +182,120 @@ def test_model_ordinal_encoder_cat_list(self):
             data, model, model_onnx, basename="SklearnOrdinalEncoderCatList"
         )
 
+    @unittest.skipIf(
+        not ordinal_encoder_support(),
+        reason="OrdinalEncoder was not available before 0.20",
+    )
+    def test_model_ordinal_encoder_unknown_value(self):
+        from onnxruntime import InferenceSession
+
+        model = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=42)
+        data = np.array([["a"], ["b"], ["c"], ["d"]], dtype=np.object_)
+        data_with_missing_value = np.array(
+            [["a"], ["b"], ["c"], ["d"], [np.nan], ["e"], [None]], dtype=np.object_
+        )
+
+        model.fit(data)
+        # np.nan, 'e' and None were not seen during fit, so each becomes 42.
+        expected = model.transform(data_with_missing_value)
+
+        model_onnx = convert_sklearn(
+            model,
+            "scikit-learn ordinal encoder",
+            [("input", StringTensorType([None, 1]))],
+            target_opset=TARGET_OPSET,
+        )
+        self.assertIsNotNone(model_onnx)
+        dump_data_and_model(
+            data, model, model_onnx, basename="SklearnOrdinalEncoderUnknownValue"
+        )
+
+        sess = InferenceSession(
+            model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
+        )
+        got = sess.run(
+            None,
+            {
+                "input": data_with_missing_value,
+            },
+        )
+
+        assert_almost_equal(expected.reshape(-1), got[0].reshape(-1))
+
+    @unittest.skipIf(
+        not ordinal_encoder_support(),
+        reason="OrdinalEncoder was not available before 0.20",
+    )
+    def test_model_ordinal_encoder_encoded_missing_value(self):
+        from onnxruntime import InferenceSession
+
+        model = OrdinalEncoder(encoded_missing_value=42)
+        data = np.array([["a"], ["b"], [np.nan], ["c"], ["d"]], dtype=np.object_)
+
+        # 'np.nan' becomes 42
+        expected = model.fit_transform(data)
+
+        model_onnx = convert_sklearn(
+            model,
+            "scikit-learn ordinal encoder",
+            [("input", StringTensorType([None, 1]))],
+            target_opset=TARGET_OPSET,
+        )
+        self.assertIsNotNone(model_onnx)
+        dump_data_and_model(
+            data, model, model_onnx, basename="SklearnOrdinalEncoderEncodedMissingValue"
+        )
+
+        sess = InferenceSession(
+            model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
+        )
+        got = sess.run(
+            None,
+            {
+                "input": data,
+            },
+        )
+
+        assert_almost_equal(expected.reshape(-1), got[0].reshape(-1))
+
+    @unittest.skipIf(
+        not ordinal_encoder_support(),
+        reason="OrdinalEncoder was not available before 0.20",
+    )
+    def test_model_ordinal_encoder_encoded_missing_value_no_nan(self):
+        from onnxruntime import InferenceSession
+
+        model = OrdinalEncoder(encoded_missing_value=42)
+        data = np.array([["a"], ["b"], ["c"], ["d"]], dtype=np.object_)
+
+        expected = model.fit_transform(data)
+
+        model_onnx = convert_sklearn(
+            model,
+            "scikit-learn ordinal encoder",
+            [("input", StringTensorType([None, 1]))],
+            target_opset=TARGET_OPSET,
+        )
+        self.assertIsNotNone(model_onnx)
+        dump_data_and_model(
+            data,
+            model,
+            model_onnx,
+            basename="SklearnOrdinalEncoderEncodedMissingValueNoNan",
+        )
+
+        sess = InferenceSession(
+            model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
+        )
+        got = sess.run(
+            None,
+            {
+                "input": data,
+            },
+        )
+
+        assert_almost_equal(expected.reshape(-1), got[0].reshape(-1))
+
     @unittest.skipIf(
         not set_output_support(),
         reason="'ColumnTransformer' object has no attribute 'set_output'",