misc/cnn2d-fixes (#93)

* minor fixes for light curves
* update changelog
spacetelescope · Sep 17, 2024 · 1ab2cc2 · 1ab2cc2
1 parent 0149d56
commit 1ab2cc2
Show file tree

Hide file tree

Showing 5 changed files with 46 additions and 35 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -5,6 +5,10 @@ preprocessor
 ------------
 - explicitly pass `encoding=bytes` in transform.hypersonic_pliers for numpy 2 compatibility where this will no longer be the default for np.loadtxt [#92]
 
+builder
+-------
+- Minor fixes for CNN 2D model usage: use explicit `is not None` checks on `X_train` and index `filters` by position when stacking Conv1D layers [#93]
+
 
 1.1.1 (2024-07-11)
 ==================

diff --git a/spacekit/builder/architect.py b/spacekit/builder/architect.py
@@ -898,7 +898,7 @@ def ensemble_cnn(self):
         self.cnn.output_name = "svm_image_output"
         self.cnn.name = "svm_cnn"
         self.cnn.ensemble = True
-        self.cnn.input_shape = self.X_train[1].shape[1:] if self.X_train else None
+        self.cnn.input_shape = self.X_train[1].shape[1:] if self.X_train is not None else None
         self.cnn.output_shape = 1
         self.cnn.layers = [18, 32, 64, 32, 18]
         self.cnn.activation = "leaky_relu"
@@ -1016,7 +1016,7 @@ def __init__(
             **builder_kwargs,
         )
         self.blueprint = blueprint
-        self.input_shape = self.X_train.shape[1:] if self.X_train else None
+        self.input_shape = self.X_train.shape[1:] if self.X_train is not None else None
         self.output_shape = 1
         self.input_name = "cnn2d_inputs"
         self.output_name = "cnn2d_output"
@@ -1035,7 +1035,7 @@ def __init__(
         self.early_stopping = None
         self.batch_size = 32
         self.cost_function = "sigmoid"
-        self.step_size = X_train.shape[1] if X_train else None
+        self.step_size = X_train.shape[1] if X_train is not None else None
         self.steps_per_epoch = self.step_size // self.batch_size
         self.batch_maker = self.batch
 
@@ -1054,17 +1054,17 @@ def build(self):
         )(inputs)
         x = MaxPool1D(strides=self.strides)(x)
         x = BatchNormalization()(x)
-        count = 1
-        for f in self.filters[1:]:
+        for f in list(range(len(self.filters))):
+            if f == 0:
+                continue
             x = Conv1D(
                 filters=self.filters[f],
                 kernel_size=self.kernel,
                 activation=self.activation,
             )(x)
             x = MaxPool1D(strides=self.strides)(x)
-            if count < len(self.filters):
+            if f < len(self.filters) - 1:
                 x = BatchNormalization()(x)
-                count += 1
             else:
                 x = Flatten()(x)
         self.log.info("DROPOUT")

diff --git a/spacekit/extractor/scrape.py b/spacekit/extractor/scrape.py
@@ -325,10 +325,10 @@ def __init__(
         self.fpaths = []
 
     def scrape(self):
-        """Using the key-pair values in `dataset` dictionary attribute, download the files from a github
-        repo and check the hash keys match before extracting. Extraction and hash-key checking is handled
-        externally by the `keras.utils.data_utils.get_file` method. If extraction is successful, the
-        archive file will be deleted.
+        """Using the key-value pairs in the `dataset` dictionary attribute, download the files from a website
+        (such as Zenodo) and check that the hash keys match before extracting. Extraction and hash-key checking
+        are handled externally by the `keras.utils.data_utils.get_file` function. If extraction is successful,
+        the archive file will be deleted. See spacekit.datasets.meta for dictionary formatting examples.
 
         Returns
         -------

diff --git a/spacekit/preprocessor/transform.py b/spacekit/preprocessor/transform.py
@@ -869,7 +869,7 @@ def tensors_to_arrays(X_train, y_train, X_test, y_test):
 
 
 def hypersonic_pliers(
-    path_to_train, path_to_test, y_col=[0], skip=1, dlm=",", encoding=bytes, subtract_y=0.0
+    path_to_train, path_to_test, y_col=[0], skip=1, dlm=",", encoding='bytes', subtract_y=0.0, reshape=False
 ):
     """Extracts data into 1-dimensional arrays, using separate target classes (y) for training and test data. Assumes y (target)
     is first column in dataframe. If the target (y) classes in the raw data are 0 and 2, but you'd like them to be binaries (0
@@ -900,16 +900,15 @@ def hypersonic_pliers(
     Train = np.loadtxt(path_to_train, skiprows=skip, delimiter=dlm, encoding=encoding)
     cols = list(range(Train.shape[1]))
     xcols = [c for c in cols if c not in y_col]
-    # X_train = Train[:, 1:]
     X_train = Train[:, xcols]
-    # y_train = Train[:, 0, np.newaxis] - subtract_y
     y_train = Train[:, y_col, np.newaxis] - subtract_y
 
     Test = np.loadtxt(path_to_test, skiprows=skip, delimiter=dlm, encoding=encoding)
     X_test = Test[:, xcols]
     y_test = Test[:, y_col, np.newaxis] - subtract_y
-    # X_test = Test[:, 1:]
-    # y_test = Test[:, 0, np.newaxis] - subtract_y
+    if reshape is True:
+        y_train = y_train.reshape(y_train.shape[0], 1)
+        y_test = y_test.reshape(y_test.shape[0], 1)
 
     del Train, Test
     print("X_train: ", X_train.shape)

diff --git a/spacekit/skopes/kepler/light_curves.py b/spacekit/skopes/kepler/light_curves.py
@@ -5,9 +5,21 @@
     babel_fish_dispenser,
 )
 from spacekit.builder.architect import BuilderCNN2D
-from spacekit.datasets.k2_exo import k2_uri, k2_data
+from spacekit.datasets.meta import k2 as k2meta
 from spacekit.extractor.scrape import WebScraper
 
+def downloads_exist(scraper, k2_meta):
+    base_path = os.path.join(scraper.cache_dir, scraper.cache_subdir)
+    filepaths = []
+    for k, v in k2_meta.items():
+        fpath = os.path.join(base_path, v['key'])
+        filepaths.append(fpath)
+    for fp in filepaths:
+        if not os.path.exists(fp):
+            return []
+    print("Found existing datasets, skipping download.")
+    return filepaths
+
 
 class LaunchK2:
     def __init__(self, fpaths):
@@ -20,54 +32,50 @@ def __init__(self, fpaths):
         self.history = None
 
     def launch_prep(self):
-        self.X_train, self.X_test, self.y_train, self.y_test = self.split_data()
-        self.X_train, self.X_test = self.scale_data()
-        self.X_train, self.X_test = self.add_filter()
-        return self.X_train, self.X_test, self.y_train, self.y_test
+        self.split_data()
+        self.scale_data()
+        self.add_filter()
 
     def split_data(self):
         print("Splitting train-test feature and target data...")
         for fpath in self.fpaths:
-            if fpath.endswith("Train"):
+            if "Train" in fpath:
                 train = fpath
             else:
                 test = fpath
         self.X_train, self.X_test, self.y_train, self.y_test = hypersonic_pliers(
-            train, test
+            train, test, subtract_y=1.0, reshape=True
         )
         print("Data split successful")
-        return self.X_train, self.X_test, self.y_train, self.y_test
 
     def scale_data(self):
         print("Scaling data to Zero Mean and Unit Variance...")
         self.X_train, self.X_test = thermo_fusion_chisel(self.X_train, self.X_test)
         print("Data scaling successful.")
-        return self.X_train, self.X_test
 
     def add_filter(self):
         print("Adding noise filter...")
         self.X_train, self.X_test = babel_fish_dispenser(self.X_train, self.X_test)
         print("Noise filter added successfully.")
-        return self.X_train, self.X_test
 
     def deploy(self):
         self.builder = BuilderCNN2D(
-            self.X_train, self.y_train, self.X_test, self.y_test
+            X_train=self.X_train, y_train=self.y_train, X_test=self.X_test, y_test=self.y_test
         )
         self.builder.build()
-        return self.builder
 
     def takeoff(self):
         self.history = self.builder.batch_fit()
 
 
 if __name__ == "__main__":
-    home = os.getcwd()
-    data = os.path.join(home, "data")
     print("Extracting data...")
-    fpaths = WebScraper(k2_uri, k2_data).scrape_repo()
-    print("Data extraction successful.")
-    k2 = LaunchK2(fpaths)
+    scraper = WebScraper(k2meta['uri'], k2meta['data'])
+    scraper.fpaths = downloads_exist(scraper, k2meta['data'])
+    if not scraper.fpaths:
+        scraper.scrape()
+        print("Data extraction successful.")
+    k2 = LaunchK2(scraper.fpaths)
     k2.launch_prep()
-    k2.builder = k2.deploy()
-    k2.history = k2.takeoff()
+    k2.deploy()
+    k2.takeoff()