From 6daf0b813ecd2feaa85c63ea3a31078a73211cdd Mon Sep 17 00:00:00 2001
From: "Mahadik, Mukul Chandrakant"
Date: Thu, 21 Nov 2024 23:18:48 -0800
Subject: [PATCH] Using train / test data split + Added value-check tests +
 Reduced instance variables

1. Split up mock trips data into train / test data.
   - This was already being done in one of the tests in TestForestModelLoadandSave.py
     itself, as well as in TestGreedySimilarityBinning.py.
   - Hence, added it to all the forest model tests for uniformity.

2. Reduced the number of instance variables, since they were only used inside setUp().
   This addresses the review comment originally made for TestForestModelIntegration:
   https://github.com/e-mission/e-mission-server/pull/938#discussion_r1485790099

3. Cleaned up TestForestModelIntegration.py
   - Added equality tests that check the prediction values generated in the pipeline.
     Addresses review comment:
     https://github.com/e-mission/e-mission-server/pull/938#discussion_r1486602894
   - Added train / test data split.
   - Removed the check for empty data in setUp().
     Addresses review comment:
     https://github.com/e-mission/e-mission-server/pull/938#discussion_r1486610794
---
 .../TestForestModelIntegration.py             | 102 ++++++++++--------
 .../TestForestModelLoadandSave.py             |  62 +++++------
 .../modellingTests/TestRunForestModel.py      |   8 +-
 3 files changed, 89 insertions(+), 83 deletions(-)

diff --git a/emission/tests/modellingTests/TestForestModelIntegration.py b/emission/tests/modellingTests/TestForestModelIntegration.py
index 6677221a3..e08345f5d 100644
--- a/emission/tests/modellingTests/TestForestModelIntegration.py
+++ b/emission/tests/modellingTests/TestForestModelIntegration.py
@@ -1,7 +1,8 @@
-# This tests the label inference pipeline. It uses real data and placeholder inference algorithms
 import unittest
 import numpy as np
 import time
+import logging
+import bson.objectid as boi
 import emission.analysis.classification.inference.labels.pipeline as eacilp
 import emission.analysis.classification.inference.labels.inferrers as eacili
 import emission.core.wrapper.labelprediction as ecwl
@@ -11,30 +12,29 @@
 import emission.core.get_database as edb
 import emission.tests.common as etc
 import emission.pipeline.intake_stage as epi
-import logging
-import bson.objectid as boi
-
 import emission.analysis.modelling.trip_model.config as eamtc
-
 import emission.analysis.modelling.trip_model.run_model as eamur
 import emission.analysis.modelling.trip_model.model_type as eamumt
 import emission.analysis.modelling.trip_model.model_storage as eamums
 import emission.tests.modellingTests.modellingTestAssets as etmm
 import emission.storage.timeseries.abstract_timeseries as esta
-
 class TestForestModelIntegration(unittest.TestCase):
-    # Test if the forest model for label prediction is smoothly integrated with the inference pipeline.
-    # In the initial setup, build a dummy forest model. Then run the pipeline on real example data.
-    # Finally in the test, assert the type of label predictions expected.
-
+    """
+    This tests the label inference pipeline. It uses real data and placeholder inference algorithms.
+    Test if the forest model for label prediction is smoothly integrated with the inference pipeline.
+    In the initial setup, build a dummy forest model. Then run the pipeline on real example data.
+    Finally in the test, assert the type of label predictions expected.
+    The label_data dict and mock_trip_data are copied over from TestRunGreedyModel.py
+    """
     def setUp(self):
         np.random.seed(91)
         self.test_algorithms = eacilp.primary_algorithms
         forest_model_config = eamtc.get_config_value_or_raise('model_parameters.forest')
-
         etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-07-22") ##maybe use a different file
         ts = esta.TimeSeries.get_time_series(self.testUUID)
+
+        # Generate labels with a known sample weight that we can rely on in the test
         label_data = {
             "mode_confirm": ['ebike', 'bike'],
             "purpose_confirm": ['happy-hour', 'dog-park'],
@@ -43,11 +43,10 @@ def setUp(self):
             "purpose_weights": [0.1, 0.9]
         }
 
-        self.total_trips=100
-        ## generate mock trips
-        train = etmm.generate_mock_trips(
+        # Configuration values for randomly-generated test data copied over from TestRunGreedyModel.py
+        mock_trip_data = etmm.generate_mock_trips(
             user_id=self.testUUID,
-            trips=self.total_trips,
+            trips=100,
             origin=(-105.1705977, 39.7402654),
             destination=(-105.1755606, 39.7673075),
             trip_part='od',
@@ -56,35 +55,37 @@ def setUp(self):
             threshold=0.004, # ~400m
             has_label_p=0.9
         )
-        ## Required for Forest model inference
-        for result_entry in train:
+
+        # Required for Forest model inference
+        for result_entry in mock_trip_data:
             result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt']
             result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt']
             result_entry['data']['start_place']=boi.ObjectId()
             result_entry['data']['end_place']=boi.ObjectId()
-        ts.bulk_insert(train)
-        # confirm data write did not fail
-        check_data = esda.get_entries(key="analysis/confirmed_trip", user_id=self.testUUID, time_query=None)
-        if len(check_data) != self.total_trips:
-            logging.debug(f'test invariant failed after generating test data')
-            self.fail()
-        else:
-            logging.debug(f'found {self.total_trips} trips in database')
-        ## Build an already existing model or a new model
+
+        split = int(len(mock_trip_data)*0.7)
+        mock_train_data = mock_trip_data[:split]
+        self.mock_test_data = mock_trip_data[split:]
+
+        ts.bulk_insert(mock_train_data)
+
+        # Build and train model
+        logging.debug(f'(TRAIN) creating a model based on trips in database')
         eamur.update_trip_model(
             user_id=self.testUUID,
             model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER,
             model_storage=eamums.ModelStorage.DOCUMENT_DATABASE,
-            min_trips=4,
+            min_trips=14,
             model_config=forest_model_config
         )
-        ## run inference pipeline
+
+        # Run inference pipeline
        self.run_pipeline(self.test_algorithms)
         time_range = estt.TimeQuery("metadata.write_ts", None, time.time())
         self.inferred_trips = esda.get_entries(esda.INFERRED_TRIP_KEY, self.testUUID, time_query=time_range)
 
     def tearDown(self):
-        self.reset_all()
+        etc.dropAllCollections(edb._get_current_db())
 
     def run_pipeline(self, algorithms):
         default_primary_algorithms = eacilp.primary_algorithms
@@ -92,25 +93,42 @@ def run_pipeline(self, algorithms):
         epi.run_intake_pipeline_for_user(self.testUUID,skip_if_no_new_data = False)
         eacilp.primary_algorithms = default_primary_algorithms
 
-    def reset_all(self):
-        edb.get_analysis_timeseries_db().delete_many({'user_id': self.testUUID})
-        edb.get_model_db().delete_many({'user_id': self.testUUID})
-        edb.get_pipeline_state_db().delete_many({'user_id': self.testUUID})
-
-
-    # Tests that forest algorithm being tested runs successfully
     def testForestAlgorithm(self):
+        '''
+        Tests that the forest algorithm runs successfully when called from the analysis pipeline.
+        The tests are based on the existing tests in TestLabelInferencePipeline.py
+        '''
+        valid_modes = ['ebike', 'bike']
+        valid_purposes = ['happy-hour', 'dog-park']
+
         for trip in self.inferred_trips:
             entries = esdt.get_sections_for_trip("inference/labels", self.testUUID, trip.get_id())
             self.assertEqual(len(entries), len(self.test_algorithms))
             for entry in entries:
-                self.assertGreater(len(entry["data"]["prediction"]), 0)
+                # Test 1: Check that a non-empty prediction list is generated
+                self.assertGreater(len(entry["data"]["prediction"]), 0, "Prediction list should not be empty - model failed to generate any predictions")
+
+                # Test 2: Check for equality of trip inferred labels and the prediction value in the entry
+                self.assertEqual(trip["data"]["inferred_labels"], entry["data"]["prediction"])
+
+                # Test 3: Check that the prediction value in the entry is equal to the prediction generated by the algorithm
+                this_algorithm = ecwl.AlgorithmTypes(entry["data"]["algorithm_id"])
+                self.assertIn(this_algorithm, self.test_algorithms)
+                self.assertEqual(entry["data"]["prediction"], self.test_algorithms[this_algorithm]([trip])[0])
+
                 for singleprediction in entry["data"]["prediction"]:
-                    self.assertIsInstance(singleprediction, dict, " should be an instance of the dictionary class")
-                    self.assertIsInstance(singleprediction['labels'], dict, " should be an instance of the dictionary class")
-                    self.assertIn('mode_confirm',singleprediction['labels'].keys())
-                    self.assertIn('replaced_mode',singleprediction['labels'].keys())
-                    self.assertIn('purpose_confirm',singleprediction['labels'].keys())
+                    # Test 4: Check that the prediction is a dictionary
+                    self.assertIsInstance(singleprediction, dict, "should be an instance of the dictionary class")
+                    self.assertIsInstance(singleprediction['labels'], dict, "should be an instance of the dictionary class")
+
+                    # Test 5: Check that the prediction dictionary contains the required keys
+                    self.assertIn('mode_confirm', singleprediction['labels'].keys())
+                    self.assertIn('replaced_mode', singleprediction['labels'].keys())
+                    self.assertIn('purpose_confirm', singleprediction['labels'].keys())
+
+                    # Test 6: Check that the prediction dictionary contains the correct values
+                    self.assertIn(singleprediction['labels']['mode_confirm'], valid_modes)
+                    self.assertIn(singleprediction['labels']['purpose_confirm'], valid_purposes)
 
 def main():
     etc.configLogging()
diff --git a/emission/tests/modellingTests/TestForestModelLoadandSave.py b/emission/tests/modellingTests/TestForestModelLoadandSave.py
index 431b9ddb3..e92d4273a 100644
--- a/emission/tests/modellingTests/TestForestModelLoadandSave.py
+++ b/emission/tests/modellingTests/TestForestModelLoadandSave.py
@@ -17,27 +17,14 @@ class TestForestModelLoadandSave(unittest.TestCase):
     """
     Tests to make sure the model load and save properly
+    The label_data dict and mock_trip_data are copied over from TestRunGreedyModel.py
     """
-    def setUp(self):
-        """
-        sets up the end-to-end run model test with Confirmedtrip data
-        """
-        # configuration for randomly-generated test data
-        self.user_id = user_id = 'TestForestModelLoadAndSave-TestData'
-        self.origin = (-105.1705977, 39.7402654,)
-        self.destination = (-105.1755606, 39.7673075)
-        self.min_trips = 14
-        self.total_trips = 100
-        self.clustered_trips = 33    # must have at least self.min_trips similar trips by default
-        self.has_label_percent = 0.9 # let's make a few that don't have a label, but invariant
-        # $clustered_trips * $has_label_percent > self.min_trips
-        # must be correct or else this test could fail under some random test cases.
-
+    def setUp(self):
+        self.user_id = 'TestForestModelLoadAndSave-TestData'
         self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl'
+        ts = esta.TimeSeries.get_time_series(self.user_id)
 
-        ts = esta.TimeSeries.get_time_series(user_id)
-
-        # generate labels with a known sample weight that we can rely on in the test
+        # Generate labels with a known sample weight that we can rely on in the test
         label_data = {
             "mode_confirm": ['ebike', 'bike'],
             "purpose_confirm": ['happy-hour', 'dog-park'],
@@ -46,24 +33,29 @@ def setUp(self):
             "purpose_weights": [0.1, 0.9]
         }
 
-        # generate test data for the database
-        test_data = etmm.generate_mock_trips(
-            user_id=user_id,
-            trips=self.total_trips,
-            origin=self.origin,
-            destination=self.destination,
+        # Configuration values for randomly-generated test data copied over from TestRunGreedyModel.py
+        mock_trip_data = etmm.generate_mock_trips(
+            user_id=self.user_id,
+            trips=100,
+            origin=(-105.1705977, 39.7402654,),
+            destination=(-105.1755606, 39.7673075),
             trip_part='od',
             label_data=label_data,
-            within_threshold=self.clustered_trips,
+            within_threshold=33,
             threshold=0.004, # ~400m
-            has_label_p=self.has_label_percent
+            has_label_p=0.9
         )
 
-        for result_entry in test_data:
+        # Required for Forest model inference
+        for result_entry in mock_trip_data:
             result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt']
             result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt']
 
-        ts.bulk_insert(test_data)
+        split = int(len(mock_trip_data)*0.7)
+        mock_train_data = mock_trip_data[:split]
+        self.mock_test_data = mock_trip_data[split:]
+
+        ts.bulk_insert(mock_train_data)
 
         self.forest_model_config= eamtc.get_config_value_or_raise('model_parameters.forest')
 
@@ -73,7 +65,7 @@ def setUp(self):
             user_id=self.user_id,
             model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER,
             model_storage=eamums.ModelStorage.DOCUMENT_DATABASE,
-            min_trips=self.min_trips,
+            min_trips=14,
             model_config=self.forest_model_config
         )
 
@@ -98,10 +90,8 @@ def testForestModelPredictionsEquality(self):
         The type of deserialized model attributes and
         the predictions of this must match those of initial model.
""" - test_trip_data = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=self.user_id, time_query=None) - predictions_list = eamur.predict_labels_with_n( - trip_list = test_trip_data, + trip_list = self.mock_test_data, model=self.model ) @@ -111,7 +101,7 @@ def testForestModelPredictionsEquality(self): deserialized_model.from_dict(model_data) predictions_deserialized_model_list = eamur.predict_labels_with_n( - trip_list = test_trip_data, + trip_list = self.mock_test_data, model=deserialized_model ) @@ -130,10 +120,8 @@ def testForestModelConsistency(self): ConsistencyTest : To Verify that the serialization and deserialization process is consistent across multiple executions """ - test_trip_data = esda.get_entries(key=esda.CONFIRMED_TRIP_KEY, user_id=self.user_id, time_query=None) - predictions_list_model1 = eamur.predict_labels_with_n( - trip_list = test_trip_data, + trip_list = self.mock_test_data, model=self.model ) @@ -145,7 +133,7 @@ def testForestModelConsistency(self): ) predictions_list_model2 = eamur.predict_labels_with_n( - trip_list = test_trip_data, + trip_list = self.mock_test_data, model=model_iter2 ) diff --git a/emission/tests/modellingTests/TestRunForestModel.py b/emission/tests/modellingTests/TestRunForestModel.py index 672775483..6ecad60a5 100644 --- a/emission/tests/modellingTests/TestRunForestModel.py +++ b/emission/tests/modellingTests/TestRunForestModel.py @@ -155,7 +155,7 @@ def testTrainForestModelWithZeroTrips(self): "pipeline should not have a current timestamp for the test user") - def test1RoundPredictForestModel(self): + def testRoundPredictForestModel(self): """ forest model takes config arguments via the constructor for testing purposes but will load from a file in /conf/analysis/ which is tested here @@ -204,11 +204,11 @@ def test1RoundPredictForestModel(self): ) for prediction, n in predictions_list: [logging.debug(p) for p in sorted(prediction, key=lambda r: r['p'], reverse=True)] - self.assertNotEqual(len(prediction), 0, "should have a prediction") + self.assertNotEqual(len(prediction), 0, "Prediction list should not be empty - model failed to generate any predictions") self.assertIn('labels',prediction[0].keys()) self.assertIn('p',prediction[0].keys()) - self.assertIsInstance(prediction[0], dict, " should be an instance of the dictionary class") - self.assertIsInstance(prediction[0]['labels'], dict, " should be an instance of the dictionary class") + self.assertIsInstance(prediction[0], dict, "should be an instance of the dictionary class") + self.assertIsInstance(prediction[0]['labels'], dict, "should be an instance of the dictionary class") self.assertIn('mode_confirm',prediction[0]['labels'].keys()) self.assertIn('replaced_mode',prediction[0]['labels'].keys()) self.assertIn('purpose_confirm',prediction[0]['labels'].keys()) \ No newline at end of file