Spectral Initialization #263

Open · wants to merge 6 commits into main
105 changes: 95 additions & 10 deletions sourcecode/scoring/matrix_factorization/matrix_factorization.py
@@ -9,6 +9,7 @@
import numpy as np
import pandas as pd
import torch
from scipy.sparse.linalg import svds


logger = logging.getLogger("birdwatch.matrix_factorization")
@@ -159,6 +160,26 @@ def get_note_and_rater_id_maps(

return noteIdMap, raterIdMap, ratingFeaturesAndLabels

def _form_data_df(self):
"""
Returns: pd.DataFrame, a (number of notes) x (number of raters) df filled with ratings data,
with validation data nan'ed out.

This takes up a lot of memeory - I avoid ever having multiple of them around by deleting the reference
whenever I'm done with it, but one could also store it as attribute of self to trade off that memory for
not having to call this multiple times.

Currently _form_data_df is only called when a Spectral Initialization occurs.
"""
data_df = self.ratingFeaturesAndLabels.pivot(
  index="noteId", columns="raterParticipantId", values="helpfulNum"
)
if self.validateModelData is not None:
  notes_map_to_id = self.noteIdMap.set_index(Constants.noteIndexKey)[c.noteIdKey]
  rater_map_to_id = self.raterIdMap.set_index(Constants.raterIndexKey)[c.raterParticipantIdKey]
  # No .detach() is needed before .numpy(): these index tensors carry no gradients.
  valid_row_pos = data_df.index.get_indexer(
    pd.Series(self.validateModelData.note_indexes.numpy()).map(notes_map_to_id)
  )
  valid_col_pos = data_df.columns.get_indexer(
    pd.Series(self.validateModelData.user_indexes.numpy()).map(rater_map_to_id)
  )
  data_df.values[valid_row_pos, valid_col_pos] = np.nan
return data_df
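A minimal standalone sketch (toy IDs and values, not the PR's code) of what `_form_data_df` produces: the long-format ratings pivot into a notes x raters matrix, and held-out validation cells are NaN'ed via positional indexing, just as above.

```python
import numpy as np
import pandas as pd

ratings = pd.DataFrame({
  "noteId": [1, 1, 2, 2],
  "raterParticipantId": ["a", "b", "a", "c"],
  "helpfulNum": [1.0, 0.0, 0.5, 1.0],
})
data_df = ratings.pivot(index="noteId", columns="raterParticipantId", values="helpfulNum")
# data_df is now a 2x3 matrix with NaN wherever a rater never rated a note.

# NaN out one held-out (note, rater) pair the way _form_data_df masks validation data:
valid_row_pos = data_df.index.get_indexer([2])       # positional row of noteId 2
valid_col_pos = data_df.columns.get_indexer(["a"])   # positional column of rater "a"
data_df.values[valid_row_pos, valid_col_pos] = np.nan
print(data_df)
```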

def _initialize_parameters(
self,
noteInit: Optional[pd.DataFrame] = None,
@@ -183,17 +204,17 @@ def _initialize_parameters(
noteInit = self.noteIdMap.merge(
noteInit,
on=c.noteIdKey,
how="left",
unsafeAllowed={c.noteIdKey, "noteIndex_y"},
how="left" # ,
# unsafeAllowed={c.noteIdKey, "noteIndex_y"}, my code wouldn't run with this line and I don't see it in the docs?
)

noteInit[c.internalNoteInterceptKey].fillna(0.0, inplace=True)
# The inplace=True fillna calls were replaced with assignment to silence a pandas warning;
# pandas likely makes a temporary copy internally either way, so inplace probably saved no memory.
noteInit[c.internalNoteInterceptKey] = noteInit[c.internalNoteInterceptKey].fillna(0.0)
self.mf_model.note_intercepts.weight.data = torch.tensor(
np.expand_dims(noteInit[c.internalNoteInterceptKey].astype(np.float32).values, axis=1)
)
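Regarding the `fillna` change above: a tiny standalone illustration (assuming a recent pandas version) of the warning-free pattern the PR switches to.

```python
import pandas as pd

df = pd.DataFrame({"x": [1.0, None, 3.0]})
# Warning-prone in recent pandas when applied to a selected column:
#   df["x"].fillna(0.0, inplace=True)
df["x"] = df["x"].fillna(0.0)  # assignment form: no warning, same result
```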

for i in range(1, self._numFactors + 1):
noteInit[c.note_factor_key(i)].fillna(0.0, inplace=True)
noteInit[c.note_factor_key(i)] = noteInit[c.note_factor_key(i)].fillna(0.0)
self.mf_model.note_factors.weight.data = torch.tensor(
noteInit[[c.note_factor_key(i) for i in range(1, self._numFactors + 1)]]
.astype(np.float32)
@@ -205,13 +226,13 @@
logger.info("initializing users")
userInit = self.raterIdMap.merge(userInit, on=c.raterParticipantIdKey, how="left")

userInit[c.internalRaterInterceptKey].fillna(0.0, inplace=True)
userInit[c.internalRaterInterceptKey] = userInit[c.internalRaterInterceptKey].fillna(0.0)
self.mf_model.user_intercepts.weight.data = torch.tensor(
np.expand_dims(userInit[c.internalRaterInterceptKey].astype(np.float32).values, axis=1)
)

for i in range(1, self._numFactors + 1):
userInit[c.rater_factor_key(i)].fillna(0.0, inplace=True)
userInit[c.rater_factor_key(i)] = userInit[c.rater_factor_key(i)].fillna(0.0)
self.mf_model.user_factors.weight.data = torch.tensor(
userInit[[c.rater_factor_key(i) for i in range(1, self._numFactors + 1)]]
.astype(np.float32)
@@ -221,9 +242,7 @@
if globalInterceptInit is not None:
if self._log:
logger.info("initialized global intercept")
self.mf_model.global_intercept = torch.nn.parameter.Parameter(
torch.ones(1, 1, dtype=torch.float32) * globalInterceptInit
)
self.mf_model.global_intercept.data = torch.ones(1, 1, dtype=torch.float32) * globalInterceptInit
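A standalone sketch (not part of the PR) of why assigning to `.data` above differs from rebinding the attribute to a fresh `Parameter`: mutating `.data` preserves the tensor identity that an already-constructed optimizer holds, while rebinding leaves the optimizer pointing at the old tensor.

```python
import torch

# Rebinding creates a new Parameter; the optimizer still points at the old tensor.
lin = torch.nn.Linear(1, 1, bias=False)
opt = torch.optim.SGD(lin.parameters(), lr=0.1)
lin.weight = torch.nn.parameter.Parameter(torch.ones(1, 1))
assert lin.weight is not opt.param_groups[0]["params"][0]

# Assigning to .data mutates the registered Parameter in place, so existing
# optimizer references remain valid.
lin2 = torch.nn.Linear(1, 1, bias=False)
opt2 = torch.optim.SGD(lin2.parameters(), lr=0.1)
lin2.weight.data = torch.ones(1, 1)
assert lin2.weight is opt2.param_groups[0]["params"][0]
```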

def _get_parameters_from_trained_model(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
@@ -434,7 +453,8 @@ def _fit_model(
rating (torch.FloatTensor)
"""
assert self.mf_model is not None
self._create_train_validate_sets(validate_percent)
if self.trainModelData is None:  # may already exist if spectral initialization created it
  self._create_train_validate_sets(validate_percent)
assert self.trainModelData is not None

prev_loss = 1e10
@@ -495,6 +515,8 @@ def run_mf(
noteInit: pd.DataFrame = None,
userInit: pd.DataFrame = None,
globalInterceptInit: Optional[float] = None,
useSpectralInit: bool = False,
additionalSpectralInitIters: int = 0,
specificNoteId: Optional[int] = None,
validatePercent: Optional[float] = None,
freezeRaterParameters: bool = False,
@@ -511,6 +533,8 @@
noteInit (pd.DataFrame, optional)
userInit (pd.DataFrame, optional)
globalInterceptInit (float, optional).
useSpectralInit (bool, optional): Whether to use SVD to initialize the factors.
additionalSpectralInitIters (int, optional): How many times to reinitialize and refit with SVD.
specificNoteId (int, optional): Do approximate analysis to score a particular note.

Returns:
@@ -550,7 +574,68 @@
self.mf_model.freeze_rater_and_global_parameters()
self.prepare_features_and_labels(specificNoteId)

if useSpectralInit:
  self._create_train_validate_sets(validatePercent)
  data_df = self._form_data_df()
  data_matrix = data_df.values
  # Missing entries are filled with a row/column-mean baseline. The 1/2-1/2 weighting between
  # row and column means may be worth tuning; limited experiments showed no large difference,
  # but they were not extensive enough to be definitive. The "mean of empty slice" warning
  # from nanmean can be ignored: all-NaN slices are handled by the nan_to_num wrappers.
  grand_mean = np.nanmean(data_matrix)
  mean_matrix = (
    1 / 2 * np.nan_to_num(np.nanmean(data_matrix, axis=1), nan=grand_mean)[:, np.newaxis]
    + 1 / 2 * np.nan_to_num(np.nanmean(data_matrix, axis=0), nan=grand_mean)
    - grand_mean
  )
  filled_matrix = np.where(np.isnan(data_matrix), mean_matrix, data_matrix)

  U, S, Vt = svds(filled_matrix, k=self._numFactors)
  # Split the singular value across the two factor vectors, so their outer product
  # reconstructs the rank-1 SVD approximation of the filled matrix.
  note_factor_init_vals = np.sqrt(S[0]) * U.T[0]
  user_factor_init_vals = np.sqrt(S[0]) * Vt[0]

  noteInit = pd.DataFrame({
    c.noteIdKey: data_df.index,
    c.note_factor_key(1): note_factor_init_vals,
    c.internalNoteInterceptKey: np.zeros(len(note_factor_init_vals)),
  })
  userInit = pd.DataFrame({
    c.raterParticipantIdKey: data_df.columns,
    c.rater_factor_key(1): user_factor_init_vals,
    c.internalRaterInterceptKey: np.zeros(len(user_factor_init_vals)),
  })
  globalInterceptInit = grand_mean
  del data_df, data_matrix  # free a large amount of memory
  # To save even more, data_df could be deleted as soon as data_matrix is formed, at the cost
  # of re-deriving the note and rater ID orderings when building noteInit and userInit.

self._initialize_parameters(noteInit, userInit, globalInterceptInit)

train_loss, loss, validate_loss = self._fit_model(validatePercent)
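To make the fill-then-SVD initialization above concrete, a toy example (illustrative only, random data): missing entries get the half-row-mean plus half-column-mean minus grand-mean baseline, and the singular triplet is split as sqrt(s)·u and sqrt(s)·v so the outer product of the two init vectors equals the rank-1 approximation.

```python
import numpy as np
from scipy.sparse.linalg import svds

rng = np.random.default_rng(0)
data = rng.standard_normal((6, 5))
data[rng.random((6, 5)) < 0.3] = np.nan  # simulate unobserved ratings

grand = np.nanmean(data)
baseline = (
  0.5 * np.nan_to_num(np.nanmean(data, axis=1), nan=grand)[:, np.newaxis]
  + 0.5 * np.nan_to_num(np.nanmean(data, axis=0), nan=grand)
  - grand
)
filled = np.where(np.isnan(data), baseline, data)

U, S, Vt = svds(filled, k=1)
note_init = np.sqrt(S[0]) * U.T[0]   # one value per note (row)
user_init = np.sqrt(S[0]) * Vt[0]    # one value per rater (column)
# The outer product of the two init vectors is the rank-1 SVD approximation:
assert np.allclose(np.outer(note_init, user_init), S[0] * np.outer(U.T[0], Vt[0]))
```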

if useSpectralInit:
  for _ in range(additionalSpectralInitIters):
    data_df = self._form_data_df()
    data_matrix = data_df.values
    noteParams, raterParams = self._get_parameters_from_trained_model()
    # Refill missing entries using the fitted intercepts rather than the raw mean baseline.
    intercepts_matrix = np.add.outer(
      noteParams["internalNoteIntercept"].to_numpy(),
      raterParams["internalRaterIntercept"].to_numpy(),
    )
    if self._useGlobalIntercept:
      intercepts_matrix = intercepts_matrix + self.mf_model.global_intercept.item()
    filled_matrix = np.where(np.isnan(data_matrix), intercepts_matrix, data_matrix)

    U, S, Vt = svds(filled_matrix, k=self._numFactors)
    note_factor_init_vals = np.sqrt(S[0]) * U.T[0]
    user_factor_init_vals = np.sqrt(S[0]) * Vt[0]

    noteInit = pd.DataFrame({
      c.noteIdKey: data_df.index,
      c.note_factor_key(1): note_factor_init_vals,
      c.internalNoteInterceptKey: np.zeros(len(note_factor_init_vals)),
    })
    userInit = pd.DataFrame({
      c.raterParticipantIdKey: data_df.columns,
      c.rater_factor_key(1): user_factor_init_vals,
      c.internalRaterInterceptKey: np.zeros(len(user_factor_init_vals)),
    })
    del data_df, data_matrix

    self._initialize_parameters(noteInit, userInit, None)
    train_loss, loss, validate_loss = self._fit_model(validatePercent)
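For the refit loop above, a small standalone illustration (hypothetical values) of how `np.add.outer` builds the notes x raters intercept grid used to refill unobserved cells: entry [i, j] is note_i's intercept plus rater_j's intercept, plus the global intercept when enabled.

```python
import numpy as np

note_intercepts = np.array([0.1, -0.2, 0.3])   # one per note
rater_intercepts = np.array([0.05, -0.05])     # one per rater
global_intercept = 0.15

# (3 x 2) grid: entry [i, j] = note_i + rater_j + global, the intercept-only
# baseline the PR substitutes for unobserved ratings before the next SVD pass.
baseline = np.add.outer(note_intercepts, rater_intercepts) + global_intercept
print(baseline)
```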

if self._normalizedLossHyperparameters is not None:
_, raterParams = self._get_parameters_from_trained_model()
assert self.modelData is not None