minor changes

CardioKit · Feb 4, 2024 · 0d2e5be
1 parent 9dadcd6
commit 0d2e5be
Show file tree

Hide file tree

Showing 10 changed files with 1,164 additions and 606 deletions.
diff --git a/.gitignore b/.gitignore
@@ -166,3 +166,4 @@ cython_debug/
 /mlruns/
 /results/
 /src/mlruns/
+/test/
diff --git a/analysis/article.ipynb b/analysis/article.ipynb
diff --git a/src/main.py b/src/main.py
@@ -2,6 +2,7 @@
 import datetime
 import os
 
+import numpy as np
 import tensorflow as tf
 from keras.src.callbacks import ReduceLROnPlateau, TerminateOnNaN, CSVLogger, EarlyStopping, ModelCheckpoint
 from keras.src.optimizers import RMSprop
@@ -34,7 +35,6 @@ def main(parameters):
     ######################################################
     # DATA LOADING
     ######################################################
-    #train, size_train = Helper.load_dataset(parameters['train_dataset'])
     train, size_train = Helper.load_multiple_datasets(parameters['train_dataset'])
     val, size_val = Helper.load_multiple_datasets(parameters['val_dataset'])
 
@@ -55,7 +55,6 @@ def main(parameters):
 
     encoder = Encoder(parameters['latent_dimension'])
     decoder = Decoder(parameters['latent_dimension'])
-
     vae = TCVAE(encoder, decoder, parameters['coefficients'], size_train)
     vae.compile(optimizer=RMSprop(learning_rate=parameters['learning_rate']))
     vae.fit(
@@ -79,7 +78,7 @@ def main(parameters):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
-        prog='VECG', description='Representational Learning of ECG using TC-VAE',
+        prog='VECG', description='Representational Learning of ECG using disentangling VAE',
     )
     parser.add_argument(
         '-p', '--path_config', type=str, default='./params.yml',
@@ -88,4 +87,18 @@ def main(parameters):
 
     args = parser.parse_args()
     parameters = Helper.load_yaml_file(args.path_config)
+
+    print(type(parameters['coefficients']['alpha']))
+
     main(parameters)
+
+    #for latent_dim in [4, 8, 16, 24]:
+    #    for alpha in [0.1, 0.5]:
+    #        for beta in [0.5, 1.0, 4.0]:
+    #            for gamma in [0.1, 0.5, 1.0]:
+    #                parameters['latent_dimension'] = latent_dim
+    #                parameters['coefficients']['alpha'] = float(alpha)
+    #                parameters['coefficients']['beta'] = float(beta)
+    #                parameters['coefficients']['gamma'] = float(gamma)
+    #                print(parameters)
+    #                main(parameters)
diff --git a/src/metrics/disentanglement.py b/src/metrics/disentanglement.py
@@ -1,6 +1,7 @@
 from sklearn.preprocessing import KBinsDiscretizer
 from sklearn.metrics import mutual_info_score
 import numpy as np
+from sklearn import svm
 
 
 class Disentanglement:
@@ -30,6 +31,36 @@ def _make_discretizer(target, num_bins):
         dis = KBinsDiscretizer(num_bins, encode='ordinal').fit(target)
         return dis.transform(target)
 
+    @staticmethod
+    def compute_score_matrix(mus, ys, mus_test, ys_test, continuous_factors):
+        """Score how well each single latent explains each factor (SAP score matrix).
+
+        Args:
+            mus: Training codes, indexed as ``mus[latent, sample]``.
+            ys: Training factors, indexed as ``ys[factor, sample]``.
+            mus_test: Held-out codes laid out like ``mus``; only read when
+                ``continuous_factors`` is falsy.
+            ys_test: Held-out factors laid out like ``ys``; only read when
+                ``continuous_factors`` is falsy.
+            continuous_factors: If truthy, an entry is the squared correlation
+                between latent and factor on the training data. Otherwise it is
+                the held-out accuracy of a linear SVM that predicts the factor
+                from that single latent.
+
+        Returns:
+            Array of shape ``[num_latents, num_factors]``.
+        """
+        num_latents = mus.shape[0]
+        num_factors = ys.shape[0]
+        score_matrix = np.zeros([num_latents, num_factors])
+        for i in range(num_latents):
+            # The latent row does not depend on the factor, so slice it once.
+            mu_i = mus[i, :]
+            for j in range(num_factors):
+                y_j = ys[j, :]
+                if continuous_factors:
+                    # Attribute is considered continuous.
+                    cov_mu_i_y_j = np.cov(mu_i, y_j, ddof=1)
+                    cov_mu_y = cov_mu_i_y_j[0, 1] ** 2
+                    var_mu = cov_mu_i_y_j[0, 0]
+                    var_y = cov_mu_i_y_j[1, 1]
+                    # A collapsed latent or a constant factor carries no
+                    # information; score it 0 instead of dividing by zero,
+                    # which would put NaN into the matrix and the SAP score.
+                    if var_mu > 1e-12 and var_y > 0.:
+                        score_matrix[i, j] = cov_mu_y * 1. / (var_mu * var_y)
+                    else:
+                        score_matrix[i, j] = 0.
+                else:
+                    # Attribute is considered discrete.
+                    mu_i_test = mus_test[i, :]
+                    y_j_test = ys_test[j, :]
+                    classifier = svm.LinearSVC(C=0.01, class_weight="balanced")
+                    # The classifier sees one feature: the i-th latent.
+                    classifier.fit(mu_i[:, np.newaxis], y_j)
+                    pred = classifier.predict(mu_i_test[:, np.newaxis])
+                    score_matrix[i, j] = np.mean(pred == y_j_test)
+        return score_matrix
+
     @staticmethod
     def compute_mig(mus_train, ys_train, num_bins=10):
         """Computes score based on both training and testing codes and factors."""
@@ -44,3 +75,20 @@ def compute_mig(mus_train, ys_train, num_bins=10):
         sorted_m = np.sort(m, axis=0)[::-1]
         score_dict["discrete_mig"] = np.mean(np.divide(sorted_m[0, :] - sorted_m[1, :], entropy[:]))
         return score_dict
+
+    @staticmethod
+    def compute_avg_diff_top_two(matrix):
+        """Return the mean, over columns, of the gap between the two largest entries."""
+        # Ascending sort along rows: the last two rows hold each column's top two.
+        ranked = np.sort(matrix, axis=0)
+        best = ranked[-1, :]
+        runner_up = ranked[-2, :]
+        return np.mean(best - runner_up)
+
+    @staticmethod
+    def _compute_sap(mus, ys, mus_test, ys_test, continuous_factors):
+        """Compute the SAP score from training and held-out codes and factors.
+
+        Returns:
+            Dict with the single key ``"SAP_score"``.
+        """
+        scores = Disentanglement.compute_score_matrix(
+            mus, ys, mus_test, ys_test, continuous_factors)
+        # One row per latent and one column per factor is expected.
+        num_latents, num_factors = mus.shape[0], ys.shape[0]
+        assert scores.shape[0] == num_latents
+        assert scores.shape[1] == num_factors
+        return {"SAP_score": Disentanglement.compute_avg_diff_top_two(scores)}
diff --git a/src/model/decoder.py b/src/model/decoder.py
@@ -10,15 +10,15 @@ def __init__(self, latent_dim):
         self.latent_inputs = keras.Input(shape=(latent_dim,))
         self.x = tf.keras.layers.Dense(20)(self.latent_inputs)
         self.x = tf.keras.layers.Reshape((5, 4))(self.x)
-        self.x = tf.keras.layers.Conv1DTranspose(filters=128, kernel_size=2, strides=1, padding='same')(self.x)
+        self.x = tf.keras.layers.Conv1DTranspose(filters=256, kernel_size=5, strides=1, padding='same')(self.x)
         self.x = tf.keras.layers.LeakyReLU()(self.x)
-        self.x = tf.keras.layers.Conv1DTranspose(filters=64, kernel_size=5, strides=2, padding='same')(self.x)
+        self.x = tf.keras.layers.Conv1DTranspose(filters=128, kernel_size=5, strides=2, padding='same')(self.x)
         self.x = tf.keras.layers.LeakyReLU()(self.x)
-        self.x = tf.keras.layers.Conv1DTranspose(filters=32, kernel_size=10, strides=2, padding='same')(self.x)
+        self.x = tf.keras.layers.Conv1DTranspose(filters=64, kernel_size=5, strides=2, padding='same')(self.x)
         self.x = tf.keras.layers.LeakyReLU()(self.x)
-        self.x = tf.keras.layers.Conv1DTranspose(filters=16, kernel_size=20, strides=5, padding='same')(self.x)
+        self.x = tf.keras.layers.Conv1DTranspose(filters=32, kernel_size=5, strides=5, padding='same')(self.x)
         self.x = tf.keras.layers.LeakyReLU()(self.x)
-        self.x = tf.keras.layers.Conv1DTranspose(filters=1, kernel_size=50, strides=5, padding='same')(self.x)
+        self.x = tf.keras.layers.Conv1DTranspose(filters=1, kernel_size=5, strides=5, padding='same')(self.x)
         self.x = tf.keras.layers.LeakyReLU()(self.x)
         self.x = tf.keras.layers.Flatten()(self.x)
         self.decoder_outputs = tf.keras.layers.Reshape((500,))(self.x)

diff --git a/src/model/encoder.py b/src/model/encoder.py
@@ -10,9 +10,10 @@ def __init__(self, latent_dim):
 
         self.encoder_inputs = keras.Input(shape=(500,))
         self.x = keras.layers.Reshape((500, 1))(self.encoder_inputs)
-        self.x = self.conv_block_enc(self.x, 32, 5, 1)
+        self.x = self.conv_block_enc(self.x, 64, 5, 1)
         self.x = self.conv_block_enc(self.x, 32, 5, 16)
         self.x = self.conv_block_enc(self.x, 16, 5, 32)
+        self.x = self.conv_block_enc(self.x, 8, 5, 64)
         self.x = tf.keras.layers.MaxPooling1D()(self.x)
         self.x = tf.keras.layers.Flatten()(self.x)
         self.x = tf.keras.layers.Dense(64)(self.x)

diff --git a/src/model/tcvae.py b/src/model/tcvae.py
@@ -1,5 +1,8 @@
 from model.vae import VAE
 import tensorflow as tf
+import tensorflow_probability as tfp
+
+tfd = tfp.distributions
 
 
 class TCVAE(VAE):
@@ -34,7 +37,6 @@ def gamma(self):
     def gamma(self, value):
         self._gamma.assign(value)
 
-
     def _loss(self, reconstruction, x, mu, log_var, z):
         size_batch = tf.shape(x)[0]
         logiw_mat = self.log_importance_weight_matrix(size_batch)

diff --git a/src/params.yml b/src/params.yml
@@ -9,21 +9,21 @@ train_dataset:
   batch_size: 1024
 val_dataset:
   name:
-  - zheng
-  split: 'train'
+  - medalcare
+  split: 'validation'
   shuffle_size: 1024
   batch_size: 1024
 save_results_path: ../results/
 seed: 42
 epochs: 200
 latent_dimension: 8
-learning_rate: 0.001
+learning_rate: 0.002
 coefficients:
-  alpha: 1.0
-  beta: 64.0
-  gamma: 1.0
+  alpha: 0.1
+  beta: 1.0
+  gamma: 0.1
 coefficients_raise: 20
-early_stopping: 50000
+early_stopping: 50
 period_reconstruction_plot: 20
 index_tracked_sample: 5
 encode_data:
@@ -37,7 +37,7 @@ encode_data:
     fine_tune: False
   medalcare:
     name: 'medalcare'
-    splits: [ 'train' ]
+    splits: [ 'train', 'test', 'validation']
     fine_tune: False
   ptb:
     name: 'ptb'

diff --git a/src/utils/visualizations.py b/src/utils/visualizations.py
@@ -1,6 +1,9 @@
 import numpy as np
+import pandas as pd
 from matplotlib import pyplot as plt
 import seaborn as sns
+from neurokit2.signal import signal_smooth
+from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
 
 
 class Visualizations:
@@ -58,18 +61,79 @@ def eval_reconstruction(X, reconstruction, indices, path_eval, titles=None, xlab
         plt.close()
 
     @staticmethod
-    def eval_dimensions(res, path_eval):
-        mean = np.mean(res, axis=0)
-        std = np.std(res, axis=0)
-        plt.figure(figsize=(15, 5))
+    def eval_dimensions(df, ld, model, dimension, path, l_bound=-10.0, u_bound=10.0, num_samples=1000):
+        """Plot mean +/- std of the signals decoded while sweeping one latent dimension.
+
+        The first ``ld`` columns of ``df`` are averaged to form a base latent
+        vector. It is repeated ``num_samples`` times, and column ``dimension``
+        is replaced by evenly spaced values from ``l_bound`` to ``u_bound``.
+        Each decoded row is smoothed with ``signal_smooth`` before the
+        per-position mean and standard deviation are plotted and saved.
+
+        Args:
+            df: Table whose first ``ld`` columns are read as latent values.
+            ld: Number of leading columns of ``df`` to use.
+            model: Object providing ``decode``; its output rows must expose
+                ``.numpy()``.
+            dimension: Index of the latent column that is swept.
+            path: Target passed to ``fig.savefig`` (written at 300 dpi).
+            l_bound: First value of the sweep.
+            u_bound: Last value of the sweep.
+            num_samples: Number of sweep steps, i.e. decoded signals.
+        """
+
+        # NOTE(review): std_values is computed but never used below.
+        mean_values, std_values = np.mean(df.iloc[:, :ld], axis=0), np.std(df.iloc[:, :ld], axis=0)
+        result_matrix = np.tile(mean_values, (num_samples, 1))
+        result_matrix[:, dimension] = np.linspace(l_bound, u_bound, num_samples)
+        X = model.decode(result_matrix)
+
+        # Smooth every decoded row into a plain array with the shape of X.
+        M = np.zeros((X.shape))
+        for k, _ in enumerate(X):
+            M[k] = signal_smooth(X[k].numpy())
+
+        mean = np.mean(M, axis=0)
+        std = np.std(M, axis=0)
+        fig = plt.figure(figsize=(15, 5))
+        # NOTE(review): tight_layout runs before anything is drawn on the figure.
+        fig.tight_layout()
         plt.plot(range(0, len(mean)), mean, 'k-')
         plt.fill_between(range(0, len(mean)), mean - std, mean + std)
-        plt.savefig(path_eval + '.png')
-        plt.close()
+        plt.title("ECG reconstruction by toggling dimension " + str(dimension) + ".")
+        # NOTE(review): the figure is left open; the previous version closed it.
+        fig.savefig(path, dpi=300)
 
     @staticmethod
     def plot_trainings_process(train_progress, metrics):
+        """Draw one line per metric against the 'epoch' column on a log-scaled y axis.
+
+        Args:
+            train_progress: Data handed to ``sns.lineplot``; it must provide an
+                'epoch' column plus one column per entry of ``metrics``
+                (presumably a DataFrame of training logs; confirm with caller).
+            metrics: Iterable of column names to plot.
+        """
-        plt.figure(figsize=(10, 5))
+        fig = plt.figure(figsize=(10, 5))
+        # NOTE(review): tight_layout runs before anything is drawn on the figure.
+        fig.tight_layout()
         for k in metrics:
-            sns.lineplot(train_progress, x='epoch', y=k)
-        # ax.set_yscale("log")
+            ax = sns.lineplot(train_progress, x='epoch', y=k)
+            # The log scale is re-applied on every iteration.
+            ax.set_yscale("log")
+
+    @staticmethod
+    def plot_variations(df, ld, model, dimension=0, num_rows=1000):
+        """Line-plot the signals decoded while sweeping one latent dimension from -10 to 10.
+
+        The mean of the first ``ld`` columns of ``df`` is repeated ``num_rows``
+        times, column ``dimension`` is replaced by the sweep, and the decoded
+        values are plotted against a position index that cycles 0..499.
+        """
+        latent_columns = df.iloc[:, :ld]
+        latent_means = np.mean(latent_columns, axis=0)
+        latent_stds = np.std(latent_columns, axis=0)  # computed but not used below
+        sweep = np.tile(latent_means, (num_rows, 1))
+        sweep[:, dimension] = np.linspace(-10.0, 10.0, num_rows)
+        decoded = model.decode(sweep)
+
+        reconstruct = pd.DataFrame()
+        reconstruct['values'] = decoded.numpy().flatten()
+
+        # Position index cycling 0..499; assumes 500 samples per decoded signal (confirm).
+        period = len(list(range(0, 500)))
+        reconstruct['timestamp'] = [position % period for position in range(len(reconstruct))]
+
+        plt.figure(figsize=(15, 5))
+        sns.lineplot(data=reconstruct, x="timestamp", y="values")
+
+    @staticmethod
+    def plot_embedding_slice(df, dim_x, dim_y, hue, title_legend, path):
+        """Scatter column ``dim_x`` against ``dim_y`` of ``df``, coloured by ``hue``.
+
+        The plot is shown and then written to ``path`` at 300 dpi.
+        """
+        figure = plt.figure(figsize=(10, 10))
+        figure.tight_layout()
+        axes = sns.scatterplot(data=df, x=dim_x, y=dim_y, hue=hue)
+        axis_labels = {
+            'xlabel': f'Dimension {dim_x!s}',
+            'ylabel': f'Dimension {dim_y!s}',
+            'title': "Slice through the embedding space.",
+        }
+        axes.set(**axis_labels)
+        plt.legend(title=title_legend)
+        plt.tight_layout()
+        plt.show()
+        # Saving goes through the kept figure reference, not through pyplot.
+        figure.savefig(path, dpi=300)
+
+    @staticmethod
+    def plot_confustion_matrix(X_train, X_test, y_train, y_test, predictor, path):
+        """Fit ``predictor``, then plot, show and save its test-set confusion matrix.
+
+        Args:
+            X_train: Training features passed to ``predictor.fit``.
+            X_test: Test features; must provide ``fillna`` (missing values are
+                replaced by 0 before prediction).
+            y_train: Training labels.
+            y_test: True test labels.
+            predictor: Estimator providing ``fit``, ``predict`` and ``classes_``.
+            path: Target passed to ``fig.savefig`` (written at 300 dpi).
+
+        Returns:
+            The confusion matrix, with rows and columns ordered as
+            ``predictor.classes_``.
+        """
+        # NOTE: the misspelled method name is kept so existing callers keep working.
+        predictor.fit(X_train, y_train)
+        predictions = predictor.predict(X_test.fillna(0))
+        cm = confusion_matrix(y_test, predictions, labels=predictor.classes_)
+        fig, ax = plt.subplots(figsize=(15, 15))
+        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=predictor.classes_)
+        # Draw onto our own axes: without `ax`, plot() opens a separate figure
+        # and the 15x15 figure saved below would be blank.
+        disp.plot(ax=ax)
+        # Lay out after drawing so labels and colorbar are taken into account.
+        fig.tight_layout()
+        plt.show()
+        fig.savefig(path, dpi=300)
+        return cm
diff --git a/tests/tcvae.py b/tests/tcvae.py
@@ -0,0 +1,7 @@
+# Placeholder smoke test adapted from pytest's "test_sample.py" getting-started example.
+def inc(x):
+    """Return ``x`` plus one."""
+    incremented = x + 1
+    return incremented
+
+
+def test_answer():
+    """Check ``inc`` against one known input/output pair."""
+    expected = 4
+    assert inc(3) == expected