Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update PCA_Embedder Saving and Loading #2204 #2222

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def point_based_aug(out_dir, orig_img_list, orig_point_list, num_samples, num_di
else:
PointEmbedder = Embedder.PCA_Embbeder(point_matrix, num_dim, percent_variability)
num_dim = PointEmbedder.num_dim
PointEmbedder.write_PCA(out_dir + "PCA_Particle_Info/", "particles") # write PCA info for DeepSSM testing
PointEmbedder.write_PCA(out_dir + "PCA_Particle_Info/", suffix="particles") # write PCA info for DeepSSM testing
embedded_matrix = PointEmbedder.getEmbeddedMatrix()
# Get sampler
if sampler_type == "gaussian":
Expand Down
261 changes: 203 additions & 58 deletions Python/DataAugmentationUtilsPackage/DataAugmentationUtils/Embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import numpy as np
from abc import ABC, abstractmethod
from shapeworks.utils import sw_message
from pathlib import Path
from glob import glob

# abstract base class for embedders
class Embedder(ABC):
Expand All @@ -19,61 +21,204 @@ def project(self, PCA_instance):

# instance of embedder that uses PCA for dimension reduction
class PCA_Embbeder(Embedder):
# overriding abstract methods
def __init__(self, data_matrix, num_dim=0, percent_variability=0.95):
self.data_matrix = data_matrix
num_dim = self.run_PCA(num_dim, percent_variability)
self.num_dim = num_dim
# run PCA on data_matrix for PCA_Embedder
def run_PCA(self, num_dim, percent_variability):
# get covariance matrix (uses compact trick)
N = self.data_matrix.shape[0]
data_matrix_2d = self.data_matrix.reshape(self.data_matrix.shape[0], -1).T # flatten data instances and transpose
mean = np.mean(data_matrix_2d, axis=1)
centered_data_matrix_2d = (data_matrix_2d.T - mean).T
trick_cov_matrix = np.dot(centered_data_matrix_2d.T,centered_data_matrix_2d) * 1.0/np.sqrt(N-1)
# get eignevectors and eigenvalues
eigen_values, eigen_vectors = np.linalg.eigh(trick_cov_matrix)
eigen_vectors = np.dot(centered_data_matrix_2d, eigen_vectors)
for i in range(N):
eigen_vectors[:,i] = eigen_vectors[:,i]/np.linalg.norm(eigen_vectors[:,i])
eigen_values = np.flip(eigen_values)
eigen_vectors = np.flip(eigen_vectors, 1)
# get num PCA components
cumDst = np.cumsum(eigen_values) / np.sum(eigen_values)
if num_dim == 0:
cumDst = np.cumsum(eigen_values) / np.sum(eigen_values)
num_dim = np.where(cumDst > float(percent_variability))[0][0] + 1
W = eigen_vectors[:, :num_dim]
PCA_scores = np.matmul(centered_data_matrix_2d.T, W)
sw_message(f"The PCA modes of particles being retained : {num_dim}")
sw_message(f"Variablity preserved: {str(float(cumDst[num_dim-1]))}")
self.num_dim = num_dim
self.PCA_scores = PCA_scores
self.eigen_vectors = eigen_vectors
self.eigen_values = eigen_values
return num_dim
# write PCA info to files
# @TODO do we need all of this?
def write_PCA(self, out_dir, suffix):
if not os.path.exists(out_dir):
os.makedirs(out_dir)
np.save(out_dir + 'original_PCA_scores.npy', self.PCA_scores)
mean = np.mean(self.data_matrix, axis=0)
np.savetxt(out_dir + 'mean.' + suffix, mean)
np.savetxt(out_dir + 'eigenvalues.txt', self.eigen_values)
for i in range(self.data_matrix.shape[0]):
nm = out_dir + 'pcamode' + str(i) + '.' + suffix
data = self.eigen_vectors[:, i]
data = data.reshape(self.data_matrix.shape[1:])
np.savetxt(nm, data)
# returns embedded form of dtat_matrix
def getEmbeddedMatrix(self):
return self.PCA_scores
# projects embbed array into data
def project(self, PCA_instance):
W = self.eigen_vectors[:, :self.num_dim].T
mean = np.mean(self.data_matrix, axis=0)
data_instance = np.matmul(PCA_instance, W) + mean.reshape(-1)
data_instance = data_instance.reshape((self.data_matrix.shape[1:]))
return data_instance
def __init__(self, data_matrix=None, num_dim=0, percent_variability=0.95):
"""
Initialize the PCA_Embedder. If data_matrix is provided, a PCA model is generated.
Otherwise, the attributes defining the model are initialized as None. A model can then be initialized from arrays
using load_pca.

Parameters
----------
data_matrix
Data to use to generate a PCA model
num_dim
Number of PCA dimensions to keep in the generated PCA scores. (Max is data_matrix.shape[0]-1, i.e., the maximum number of
modes of variation is one less than the number of samples used to build the model.
If set to zero, the maximum number of dimensions are kept.
percent_variability
Percentage of the variation in the input data to keep in the generated PCA scores, scaled to between 0 and 1.
This is only used if num_dim is not set.
"""
super().__init__(data_matrix)
self.PCA_scores = None
self.eigen_vectors = None
self.eigen_values = None

if data_matrix is not None:
self.data_matrix = data_matrix
self.mean_data = np.mean(self.data_matrix, axis=0)
self.run_PCA(num_dim, percent_variability)

def run_PCA(self, num_dim, percent_variability):
"""
Perform principal component analysis on the data_matrix.

Parameters
----------
num_dim
Number of PCA dimensions to keep in the model. (Max is data_matrix.shape[0]-1, i.e., the maximum number of
modes of variation is one less than the number of samples used to build the model.
If set to zero, the maximum number of dimensions are kept.
percent_variability
Percentage of the variation in the input data to keep in the model, scaled to between 0 and 1.
This is only used if num_dim is not set.

Returns
-------
num_dim
num_dim actually used
"""
# get covariance matrix (uses compact trick)
N = self.data_matrix.shape[0]
data_matrix_2d = self.data_matrix.reshape(self.data_matrix.shape[0],
-1).T # flatten data instances and transpose
mean = np.mean(data_matrix_2d, axis=1)
centered_data_matrix_2d = (data_matrix_2d.T - mean).T
trick_cov_matrix = np.dot(centered_data_matrix_2d.T, centered_data_matrix_2d) * 1.0 / np.sqrt(N - 1)
# get eignevectors and eigenvalues
eigen_values, eigen_vectors = np.linalg.eigh(trick_cov_matrix)
eigen_vectors = np.dot(centered_data_matrix_2d, eigen_vectors)
for i in range(N):
eigen_vectors[:, i] = eigen_vectors[:, i] / np.linalg.norm(eigen_vectors[:, i])
eigen_values = np.flip(eigen_values)
eigen_vectors = np.flip(eigen_vectors, 1)
# get num PCA components
# Note that the number of the eigen_values and eigen_vectors is equal to the dimension of the data
# matrix, but the last column is not used in the model because it describes no variation.
cumDst = np.cumsum(eigen_values) / np.sum(eigen_values)
if num_dim == 0:
num_dim = np.where(cumDst >= float(percent_variability))[0][0] + 1
W = eigen_vectors[:, :num_dim]
PCA_scores = np.matmul(centered_data_matrix_2d.T, W)
sw_message(f"The PCA modes of particles being retained : {num_dim}")
sw_message(f"Variablity preserved: {str(float(cumDst[num_dim - 1]))}")

self.PCA_scores = PCA_scores
self.eigen_vectors = eigen_vectors
self.eigen_values = eigen_values
self.num_dim = num_dim
return num_dim

def write_PCA(self, out_dir: Path, score_option="full", suffix="txt"):
"""
Write PCA data to a specified directory.

Parameters
----------
out_dir
Directory in which to save PCA data
score_option
Option for how to save PCA scores. The full scores can be used to recreate the data used to create the
model, which may be privileged information, so options are provided to save no information about the scores.
Options are:
full: Save complete scores
Otherwise: Don't save scores
suffix
File extension to use
"""
out_dir = Path(out_dir)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
if score_option == "full":
np.savetxt(str(out_dir / f'original_PCA_scores.{suffix}'), self.PCA_scores)
mean = np.mean(self.data_matrix, axis=0)
np.savetxt(str(out_dir / f'mean.{suffix}'), mean)
np.savetxt(str(out_dir / f'eigenvalues.txt'), self.eigen_values)
for i in range(self.data_matrix.shape[0]):
nm = str(out_dir / f'pcamode{i}.{suffix}')
data = self.eigen_vectors[:, i]
data = data.reshape(self.data_matrix.shape[1:])
np.savetxt(nm, data)

def project(self, PCA_instance):
"""
Maps a given set of scores to the data values (e.g., coordinate points) they represent, given the embedded
PCA model.

Parameters
----------
PCA_instance
A row vector containing one score for each PCA mode.

Returns
-------
data instance
Data represented by the input scores for this PCA model

"""
num_dim = len(PCA_instance)
W = self.eigen_vectors[:, :num_dim].T
data_instance = np.matmul(PCA_instance, W) + self.mean_data.reshape(-1)
data_instance = data_instance.reshape(self.mean_data.shape)
return data_instance

def load_PCA(self, mean_data, eigen_values, eigen_vectors, scores=None):
"""
Load PCA model from arrays

Parameters
----------
mean_data
eigen_values
eigen_vectors
scores
"""
self.mean_data = mean_data
self.eigen_values = eigen_values
self.eigen_vectors = eigen_vectors
self.PCA_scores = scores

@classmethod
def from_directory(cls, directory: Path):
"""
Factory function to create a PCA_embedder instance by loading saved data from a specified directory.

Parameters
----------
directory
Directory from which to load data

Returns
-------
embedder
PCA_embedder instance

"""
directory = Path(directory)

mean = np.loadtxt(glob(str(directory / "mean*"))[0])
eigen_values = np.loadtxt(glob(str(directory / "eigenvalues*"))[0])
eigen_vectors = []

eigen_vector_files = glob(str(directory / "pcamode*"))
eigen_vector_files.sort(key=lambda f: int(str(Path(f).stem).split("pcamode")[-1])) # Sort numerically by name
for file in eigen_vector_files:
eigen_vector = np.flip(np.loadtxt(file))
eigen_vectors.append(eigen_vector.reshape((-1)))

eigen_vectors = np.rot90(np.array(eigen_vectors))

embedder = cls()

scores = None
if scores_glob := glob(str(directory / "original_PCA_scores*")):
scores = np.loadtxt(scores_glob[0])

embedder.load_PCA(mean, eigen_values, eigen_vectors, scores=scores)

return embedder

def getEmbeddedMatrix(self):
"""
Get the embedded form of data_matrix

Returns
-------
PCA_scores
A matrix with one row for each input data sample
The columns are floats that represent the value for each PCA mode that together represent the input data
sample.
The number of columns indicates the number of PCA modes that were used to generate the scores.

"""
return self.PCA_scores
4 changes: 4 additions & 0 deletions Testing/PythonTests/PythonTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,10 @@ TEST(pythonTests, pcaTest) {
run_test("pca.py");
}

TEST(pythonTests, pcaEmbedderTest) {
run_test("pcaembedder.py");
}

TEST(pythonTests, findreferencemeshTest) {
run_test("findReferenceMesh.py");
}
Expand Down
Loading
Loading