Merge pull request #38 from clulab/headAndLabels
Aligned label and head predictions in predictWithScores. This is needed
kwalcock authored Aug 25, 2023
2 parents 4cf79d9 + d47574f commit 02b6e4a
Showing 4 changed files with 74 additions and 46 deletions.
4 changes: 2 additions & 2 deletions encoder/src/main/python/clu_trainer.py
@@ -90,7 +90,7 @@ def compute_metrics(self, eval_pred: EvalPrediction) -> Dict[str, float]:
ShortTaskDef("NER", "conll-ner/", "train.txt", "dev.txt", "test.txt"),
ShortTaskDef("POS", "pos/", "train.txt", "dev.txt", "test.txt"),
ShortTaskDef("Chunking", "chunking/", "train.txt", "test.txt", "test.txt"), # this dataset has no dev
ShortTaskDef("Deps Head", "deps-wsj/", "train.heads", "dev.heads", "test.heads"),
ShortTaskDef("Deps Label", "deps-wsj/", "train.labels", "dev.labels", "test.labels", dual_mode=True)
ShortTaskDef("Deps Head", "deps-combined/", "wsjtrain-wsjdev-geniatrain-geniadev.heads", "dev.heads", "test.heads"),
ShortTaskDef("Deps Label", "deps-combined/", "wsjtrain-wsjdev-geniatrain-geniadev.labels", "dev.labels", "test.labels", dual_mode=True)
])
CluTrainer(tokenizer).train(tasks)
@@ -56,7 +56,7 @@ class LinearLayer(

/** Predict all labels and their scores per token */
def predictWithScores(inputSentence: DenseMatrix[Float],
heads: Option[Array[Int]],
heads: Option[Array[Array[Int]]],
masks: Option[Array[Boolean]]): Array[Array[(String, Float)]] = {
val batchSentences = Array(inputSentence)
val batchHeads = heads.map(Array(_))
@@ -66,7 +66,7 @@ class LinearLayer(

/** Predict all labels and their scores per token in each sentence in the batch */
def predictWithScores(inputBatch: Array[DenseMatrix[Float]],
batchHeads: Option[Array[Array[Int]]],
batchHeads: Option[Array[Array[Array[Int]]]],
batchMasks: Option[Array[Array[Boolean]]]): Array[Array[Array[(String, Float)]]] = {
if (dual) predictDualWithScores(inputBatch, batchHeads, batchMasks)
else predictPrimalWithScores(inputBatch)
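The head argument is no longer a single predicted head per token but an array of candidate head offsets per token. A minimal sketch (object name and values invented) of how the new parameter is shaped and how the single-sentence overload's `heads.map(Array(_))` wraps it into the batched form:

```scala
// Sketch only: the shape of the new head argument, not code from this PR.
object HeadShapeSketch extends App {
  // one array of candidate head offsets per token; offsets are relative positions
  // (e.g., +1 = next token), best-scoring candidate first
  val headsPerToken: Array[Array[Int]] = Array(
    Array(1, 2, -1),  // token 0
    Array(-1, 1),     // token 1
    Array(-2, 1, 3)   // token 2
  )
  val heads: Option[Array[Array[Int]]] = Some(headsPerToken)

  // the single-sentence overload wraps this into a batch of size 1,
  // matching the Option[Array[Array[Array[Int]]]] parameter of the batched method
  val batchHeads: Option[Array[Array[Array[Int]]]] = heads.map(Array(_))
  println(batchHeads.get.length) // 1 sentence in the batch
}
```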
@@ -77,7 +77,7 @@ class LinearLayer(
headRelativePositions: Array[Int]): DenseMatrix[Float] = {

// this matrix concatenates the hidden states of modifier + corresponding head
// rows = number of tokens in the sentence; cols = hidden state size
// rows = number of tokens in the sentence; cols = hidden state size x 2
val concatMatrix = DenseMatrix.zeros[Float](rows = sentenceHiddenStates.rows, cols = 2 * sentenceHiddenStates.cols)

// traverse all modifiers
@@ -100,6 +100,37 @@ class LinearLayer(
concatMatrix
}

/**
 * Generates a 1-row matrix containing a concatenation of the modifier and head embeddings
 */
def concatenateModifierAndHead(
sentenceHiddenStates: DenseMatrix[Float],
modifierAbsolutePosition: Int,
headRelativePosition: Int): DenseMatrix[Float] = {

// this matrix concatenates the hidden states of modifier + corresponding head
// rows = 1; cols = hidden state size x 2
val concatMatrix = DenseMatrix.zeros[Float](rows = 1, cols = 2 * sentenceHiddenStates.cols)

// embedding of the modifier
val modHiddenState = sentenceHiddenStates(modifierAbsolutePosition, ::)

// embedding of the head
val rawHeadAbsPos = modifierAbsolutePosition + headRelativePosition
val headAbsolutePosition =
if(rawHeadAbsPos >= 0 && rawHeadAbsPos < sentenceHiddenStates.rows) rawHeadAbsPos
else modifierAbsolutePosition // if the absolute position is invalid (e.g., root node or an incorrect prediction), duplicate the modifier embedding
val headHiddenState = sentenceHiddenStates(headAbsolutePosition, ::)

// concatenation of the modifier and head embeddings
// vector concatenation in Breeze operates over vertical vectors, hence the transposing here
val concatState = DenseVector.vertcat(modHiddenState.t, headHiddenState.t).t

concatMatrix(0, ::) :+= concatState
concatMatrix
}
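For readers less familiar with Breeze, here is a self-contained toy sketch of the same concatenation trick (matrix values and object name are invented). It reproduces the transpose/vertcat/transpose dance and the fallback to the modifier position for out-of-range heads:

```scala
// Toy illustration, not part of the PR.
import breeze.linalg.{DenseMatrix, DenseVector}

object ConcatSketch extends App {
  // 3 tokens, hidden size 2; invented values: row i holds (2i+1, 2i+2)
  val hidden = DenseMatrix.tabulate[Float](3, 2) { (i, j) => (i * 2 + j + 1).toFloat }

  val modifierAbsolutePosition = 0
  val headRelativePosition = 2
  val rawHeadAbsPos = modifierAbsolutePosition + headRelativePosition
  // fall back to the modifier itself when the candidate head falls outside the sentence
  val headAbsolutePosition =
    if (rawHeadAbsPos >= 0 && rawHeadAbsPos < hidden.rows) rawHeadAbsPos
    else modifierAbsolutePosition

  // a row slice is a Transpose[DenseVector], so transpose before vertcat
  val concat = DenseVector.vertcat(
    hidden(modifierAbsolutePosition, ::).t,
    hidden(headAbsolutePosition, ::).t)
  println(concat) // DenseVector(1.0, 2.0, 5.0, 6.0)
}
```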

/** Predict the top label for each combination of modifier token and corresponding head token */
def predictDual(inputBatch: Array[DenseMatrix[Float]],
batchHeads: Option[Array[Array[Int]]] = None,
@@ -108,67 +139,58 @@ class LinearLayer(
assert(batchMasks.isDefined)
val indexToLabel = labelsOpt.getOrElse(throw new RuntimeException("ERROR: can't predict without labels!"))

val outputBatch = new Array[Array[String]](inputBatch.length)

// we process one sentence at a time because the dual setting makes it harder to batch
for(i <- inputBatch.indices) {
val input = inputBatch(i)
val heads = batchHeads.get(i)

val outputBatch = inputBatch.zip(batchHeads.get).map { case (input, heads) =>
// generate a matrix that is twice as wide to concatenate the embeddings of the mod + head
val concatInput = concatenateModifiersAndHeads(input, heads)

// get the logits for the current sentence produced by this linear layer
val logitsPerSentence = forward(Array(concatInput))(0)

// one token per row; pick argmax per token
val bestLabels = Range(0, logitsPerSentence.rows).map { i =>
val row = logitsPerSentence(i, ::) // picks line i from a 2D matrix
val bestIndex = argmax(row.t)

indexToLabel(bestIndex)
}

outputBatch(i) = bestLabels.toArray
bestLabels.toArray
}

outputBatch
}
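The refactor above replaces the pre-allocated `outputBatch` array and index-based loop with `zip`/`map`, which keeps each sentence paired with its heads and preserves order without mutation. A trivial sketch of the pattern (toy strings stand in for the matrices; names invented):

```scala
// Sketch of the zip/map pattern, not code from this PR.
object ZipMapSketch extends App {
  val inputBatch = Array("sentence0", "sentence1")     // stands in for Array[DenseMatrix[Float]]
  val batchHeads = Array(Array(1, -1), Array(2, 0, 1)) // relative heads per sentence

  // zip keeps sentences aligned with their heads; map returns results in the same order,
  // so no mutable output array or outputBatch(i) = ... assignment is needed
  val outputBatch: Array[String] = inputBatch.zip(batchHeads).map { case (input, heads) =>
    s"$input -> ${heads.length} head predictions"
  }
  outputBatch.foreach(println)
}
```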

// predicts the top label for each of the candidate heads
// out dimensions: sentence in batch x token in sentence x label/score per token
// batchHeads dimensions: sentence in batch x token in sentence x heads per token
// labels are sorted in descending order of their scores
def predictDualWithScores(inputBatch: Array[DenseMatrix[Float]],
batchHeads: Option[Array[Array[Int]]] = None,
batchHeads: Option[Array[Array[Array[Int]]]] = None,
batchMasks: Option[Array[Array[Boolean]]] = None): Array[Array[Array[(String, Float)]]] = {
assert(batchHeads.isDefined)
assert(batchMasks.isDefined)
val indexToLabel = labelsOpt.getOrElse(throw new RuntimeException("ERROR: can't predict without labels!"))

val outputBatch = new Array[Array[Array[(String, Float)]]](inputBatch.length)

// dimensions: sent in batch x token in sentence x label per candidate head
// we process one sentence at a time because the dual setting makes it harder to batch
for (i <- inputBatch.indices) {
val input = inputBatch(i)
val heads = batchHeads.get(i)

// generate a matrix that is twice as wide to concatenate the embeddings of the mod + head
val concatInput = concatenateModifiersAndHeads(input, heads)

// get the logits for the current sentence produced by this linear layer
val logitsPerSentence = forward(Array(concatInput))(0)

// one token per row; store scores for all labels for this token
val allLabels = Range(0, logitsPerSentence.rows).map { i =>
// picks line i from a 2D matrix and converts it to Array
val scores = logitsPerSentence(i, ::).t.toArray
// extract the label at each position in the row and its score
val labelsAndScores = indexToLabel.zip(scores)

// keep scores in descending order (largest first)
labelsAndScores.sortBy(-_._2)
}

outputBatch(i) = allLabels.toArray
}
val outputBatch = inputBatch.zip(batchHeads.get).map { case (input, headCandidatesPerSentence) =>
// now process each token separately
headCandidatesPerSentence.zipWithIndex.map { case (headCandidatesPerToken, modifierAbsolutePosition) =>
// process each head candidate for this token
headCandidatesPerToken.map { headRelativePosition =>
// generate a matrix that is twice as wide to concatenate the embeddings of the mod + head
val concatInput = concatenateModifierAndHead(input, modifierAbsolutePosition, headRelativePosition)
// get the logits for the current pair of modifier and head
val logitsPerSentence = forward(Array(concatInput))(0)
val labelScores = logitsPerSentence(0, ::)
val bestIndex = argmax(labelScores.t)
val bestScore = labelScores(bestIndex)
val bestLabel = indexToLabel(bestIndex)

// println(s"Top prediction for mod $modifierAbsolutePosition and relative head $headRelativePosition is $bestLabel with score $bestScore")
(bestLabel, bestScore)
} // end head candidates for this token
} // end this token
} // end sentence batch

outputBatch
}
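This is the alignment the commit title refers to: entry `k` of a token's output corresponds to entry `k` of its candidate-head array, rather than being sorted by score. A sketch (all values and labels invented) of reading the result alongside `batchHeads`:

```scala
// Sketch only: consuming the realigned output of predictDualWithScores.
object AlignedOutputSketch extends App {
  // batchHeads: sentence x token x candidate head offsets
  val batchHeads: Array[Array[Array[Int]]] =
    Array(Array(Array(1, -1), Array(-1, 2)))

  // output: sentence x token x (label, score), one entry per head candidate above
  val output: Array[Array[Array[(String, Float)]]] =
    Array(Array(Array(("det", 3.1f), ("amod", 0.4f)), Array(("root", 5.2f), ("nsubj", 1.0f))))

  for (s <- batchHeads.indices; t <- batchHeads(s).indices; c <- batchHeads(s)(t).indices) {
    val headOffset = batchHeads(s)(t)(c)
    val (label, score) = output(s)(t)(c) // same index c: the label is aligned with its head candidate
    println(s"sentence $s, token $t: head offset $headOffset -> $label ($score)")
  }
}
```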
@@ -206,7 +228,7 @@ class LinearLayer(
val labelsAndScores = labels.zip(scores)

// keep scores in descending order (largest first)
labelsAndScores.sortBy(_._2)
labelsAndScores.sortBy(- _._2) // - score guarantees sorting in descending order of scores
}

allLabels.toArray
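A one-line illustration (invented pairs) of why negating the score in `sortBy` yields descending order, so the best label always comes first:

```scala
// Sketch of the sortBy fix above, not code from this PR.
object SortSketch extends App {
  val labelsAndScores = Array(("nsubj", 0.2f), ("det", 1.7f), ("amod", 0.9f))
  println(labelsAndScores.sortBy(-_._2).mkString(", ")) // (det,1.7), (amod,0.9), (nsubj,0.2)
}
```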
@@ -45,13 +45,16 @@ class TokenClassifier(
val tokenization = LongTokenization(tokenizer.tokenize(words.toArray))
val inputIds = tokenization.tokenIds
val wordIds = tokenization.wordIds
//val tokens = tokenization.tokens

// run the sentence through the transformer encoder
val encOutput = encoder.forward(inputIds)

// outputs for all tasks stored here: task x tokens in sentence x scores per token
val allLabels = new Array[Array[Array[(String, Float)]]](tasks.length)
var heads: Option[Array[Int]] = None
// all heads predicted for every token
// dimensions: token x heads
var heads: Option[Array[Array[Int]]] = None

// now generate token label predictions for all primary tasks (not dual!)
for (i <- tasks.indices) {
@@ -61,17 +64,19 @@
allLabels(i) = wordLabels

// if this is the task that predicts head positions, then save them for the dual tasks
// here we save only the head predicted with the highest score (hence the .head)
// we save all the heads predicted for each token
if (tasks(i).name == headTaskName) {
heads = Some(tokenLabels.map(_.head._1.toInt))
heads = Some(tokenLabels.map(_.map(_._1.toInt)))
}
}
}

// generate outputs for the dual tasks, if heads were predicted by one of the primary tasks
// the dual task(s) must be aligned with the heads.
// that is, we predict the top label for each of the head candidates
if (heads.isDefined) {
//println("Tokens: " + tokens.mkString(", "))
//println("Heads: " + heads.get.mkString(", "))
//println("Heads:\n\t" + heads.get.map(_.slice(0, 3).mkString(", ")).mkString("\n\t"))
//println("Masks: " + TokenClassifier.mkTokenMask(wordIds).mkString(", "))
val masks = Some(TokenClassifier.mkTokenMask(wordIds))
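Under the assumption (consistent with the surrounding code) that the head task's labels are string-encoded relative offsets sorted by descending score, the change above keeps every candidate head per token instead of only the top one. A sketch of the shape difference (values invented):

```scala
// Sketch of the old vs. new head extraction, not code from this PR.
object HeadExtractionSketch extends App {
  // tokenLabels: token x (label, score); for the head task the labels are relative offsets as strings
  val tokenLabels: Array[Array[(String, Float)]] = Array(
    Array(("1", 4.2f), ("2", 1.1f), ("-1", 0.3f)),
    Array(("-1", 3.8f), ("0", 2.0f))
  )

  // old behavior: keep only the best candidate per token
  val topHeadOnly: Array[Int] = tokenLabels.map(_.head._1.toInt)       // [1, -1]

  // new behavior: keep every candidate, aligned with the dual task's predictions
  val allHeads: Array[Array[Int]] = tokenLabels.map(_.map(_._1.toInt)) // [[1, 2, -1], [-1, 0]]

  println(topHeadOnly.mkString(", "))
  println(allHeads.map(_.mkString("[", ",", "]")).mkString(" "))
}
```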

@@ -102,6 +107,7 @@ class TokenClassifier(
val tokenization = LongTokenization(tokenizer.tokenize(words.toArray))
val inputIds = tokenization.tokenIds
val wordIds = tokenization.wordIds
//val tokens = tokenization.tokens

// run the sentence through the transformer encoder
val encOutput = encoder.forward(inputIds)
2 changes: 1 addition & 1 deletion trainer.sh
@@ -1,3 +1,3 @@
#!/bin/bash

PYTHONHASHSEED=1 python encoder/src/main/python/trainer.py
PYTHONHASHSEED=1 python encoder/src/main/python/clu_trainer.py
