Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AJ-1371: link snapshots during AnVIL pfb import #386

Merged
merged 21 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@
import org.apache.avro.generic.GenericRecord;
import org.databiosphere.workspacedataservice.dao.JobDao;
import org.databiosphere.workspacedataservice.jobexec.QuartzJob;
import org.databiosphere.workspacedataservice.retry.RestClientRetry;
import org.databiosphere.workspacedataservice.service.model.exception.PfbParsingException;
import org.databiosphere.workspacedataservice.workspacemanager.WorkspaceManagerDao;
import org.quartz.JobDataMap;
import org.quartz.JobExecutionContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

/** Shell/starting point for PFB import via Quartz. */
Expand All @@ -36,9 +38,15 @@ public class PfbQuartzJob extends QuartzJob {
private final JobDao jobDao;
private final WorkspaceManagerDao wsmDao;

public PfbQuartzJob(JobDao jobDao, WorkspaceManagerDao wsmDao) {
@Value("${twds.instance.workspace-id}")
davidangb marked this conversation as resolved.
Show resolved Hide resolved
UUID workspaceId;

private final RestClientRetry restClientRetry;

/**
 * Construct the PFB-import Quartz job.
 *
 * @param jobDao dao used to track this job's status
 * @param wsmDao dao for Workspace Manager (WSM) calls
 * @param restClientRetry retry/error-handling wrapper applied to WSM REST calls
 */
public PfbQuartzJob(JobDao jobDao, WorkspaceManagerDao wsmDao, RestClientRetry restClientRetry) {
  this.jobDao = jobDao;
  this.wsmDao = wsmDao;
  this.restClientRetry = restClientRetry;
}

@Override
Expand All @@ -59,7 +67,7 @@ protected void executeInternal(UUID jobId, JobExecutionContext context) {
false);

// process the stream into a list of unique snapshotIds
List<String> snapshotIds =
List<UUID> snapshotIds =
recordStream
.map(rec -> rec.get("object")) // Records in a pfb are stored under the key "object"
.filter(GenericRecord.class::isInstance) // which we expect to be a GenericRecord
Expand All @@ -70,18 +78,13 @@ protected void executeInternal(UUID jobId, JobExecutionContext context) {
.map(obj -> obj.get(SNAPSHOT_ID_IDENTIFIER)) // within the GenericRecord, find the
// source_datarepo_snapshot_id
.filter(Objects::nonNull) // expect source_datarepo_snapshot_id to be non-null
.map(Object::toString)
.map(obj -> maybeUuid(obj.toString()))
.filter(Objects::nonNull)
.distinct() // find only the unique snapshotids
.toList();

// TODO AJ-1371 pass snapshotIds to WSM
for (String id : snapshotIds) {
try {
wsmDao.createDataRepoSnapshotReference(new SnapshotModel().id(UUID.fromString(id)));
} catch (Exception e) {
throw new PfbParsingException("Error processing PFB: Invalid snapshot UUID", e);
}
}
// link the found snapshots to the workspace, skipping any that were previously linked
linkSnapshots(snapshotIds);

} catch (IOException e) {
throw new PfbParsingException("Error processing PFB", e);
Expand All @@ -90,4 +93,42 @@ protected void executeInternal(UUID jobId, JobExecutionContext context) {
// TODO: AJ-1227 implement PFB import.
logger.info("TODO: implement PFB import.");
}

protected void linkSnapshots(List<UUID> snapshotIds) {
// list existing snapshots linked to this workspace
PfbQuartzJobSupport pfbQuartzJobSupport =
new PfbQuartzJobSupport(workspaceId, wsmDao, restClientRetry);
List<UUID> existingSnapshotIds =
calypsomatic marked this conversation as resolved.
Show resolved Hide resolved
pfbQuartzJobSupport.existingPolicySnapshotIds(/* pageSize= */ 50);
Copy link
Contributor

@jladieu jladieu Oct 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the arg comment ❤️

// find the snapshots in this PFB that are not already linked to this workspace
List<UUID> newSnapshotIds =
snapshotIds.stream().filter(id -> !existingSnapshotIds.contains(id)).toList();

logger.info(
"PFB contains {} snapshot ids. {} of these are already linked to the workspace; {} new links will be created.",
snapshotIds.size(),
snapshotIds.size() - newSnapshotIds.size(),
newSnapshotIds.size());

// pass snapshotIds to WSM
for (UUID uuid : newSnapshotIds) {
try {
RestClientRetry.VoidRestCall voidRestCall =
(() -> wsmDao.createDataRepoSnapshotReference(new SnapshotModel().id(uuid)));
restClientRetry.withRetryAndErrorHandling(
voidRestCall, "WSM.createDataRepoSnapshotReference");
} catch (Exception e) {
throw new PfbParsingException("Error processing PFB: Invalid snapshot UUID", e);
}
}
}

private UUID maybeUuid(String input) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly a place for an Optional instead of a null? (disclosure: I generally steer clear of nulls in Java if possible)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm gonna push back here - I think that's sound advice to use Optional instead of null for public methods, but this is a private method used only within the class. If it returned Optional, it would require additional steps inside of executeInternal(). Whereas now we can use one step .filter(Objects::nonNull), we'd need to change to .filter(Optional::isPresent).map(Optional::get).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds fine, particularly making the distinction between public and private APIs. I have a hunch that there'd be some optional-friendly predicate that could stand in for (and have equivalent terseness to) what's already there, but don't know off the top of my head what it'd be so carry on!

try {
return UUID.fromString(input);
} catch (Exception e) {
logger.warn("found unparseable snapshot id '{}' in PFB contents", input);
return null;
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
package org.databiosphere.workspacedataservice.dataimport;

import bio.terra.workspace.model.DataRepoSnapshotAttributes;
import bio.terra.workspace.model.ResourceAttributesUnion;
import bio.terra.workspace.model.ResourceDescription;
import bio.terra.workspace.model.ResourceList;
import java.util.List;
import java.util.Objects;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
import org.databiosphere.workspacedataservice.retry.RestClientRetry;
import org.databiosphere.workspacedataservice.service.model.exception.PfbImportException;
import org.databiosphere.workspacedataservice.workspacemanager.WorkspaceManagerDao;

public class PfbQuartzJobSupport {
davidangb marked this conversation as resolved.
Show resolved Hide resolved

private final UUID workspaceId;
private final WorkspaceManagerDao wsmDao;
private final RestClientRetry restClientRetry;

public PfbQuartzJobSupport(
UUID workspaceId, WorkspaceManagerDao wsmDao, RestClientRetry restClientRetry) {
this.workspaceId = workspaceId;
this.wsmDao = wsmDao;
this.restClientRetry = restClientRetry;
}

/**
* Query WSM for the full list of referenced snapshots in this workspace, then return the list of
* unique snapshotIds from those references.
*
* @param pageSize how many references to return in each paginated request to WSM
* @return the list of unique ids for all pre-existing snapshot references
*/
protected List<UUID> existingPolicySnapshotIds(int pageSize) {
return extractSnapshotIds(listAllSnapshots(pageSize));
}

/**
* Given a ResourceList, find all the valid ids of referenced snapshots in that list
*
* @param resourceList the list in which to look for snapshotIds
* @return the list of unique ids in the provided list
*/
protected List<UUID> extractSnapshotIds(ResourceList resourceList) {
return resourceList.getResources().stream()
.map(this::safeGetSnapshotId)
.filter(Objects::nonNull)
.distinct()
.toList();
}

/**
* Get the full list of all snapshot references for the current workspace. WSM returns these
* results paginated; this method retrieves pages from WSM and aggregates the results.
*
* @param pageSize number of results to return from WSM at once
* @return the full list of all snapshot references for the workspace.
*/
protected ResourceList listAllSnapshots(int pageSize) {
final AtomicInteger offset = new AtomicInteger(0);
calypsomatic marked this conversation as resolved.
Show resolved Hide resolved
final int hardLimit =
10000; // under no circumstances return more than this many snapshots from WSM

ResourceList finalList = new ResourceList(); // collect our results

while (offset.get() < hardLimit) {
// get a page of results from WSM
RestClientRetry.RestCall<ResourceList> restCall =
(() -> wsmDao.enumerateDataRepoSnapshotReferences(workspaceId, offset.get(), pageSize));
ResourceList thisPage =
restClientRetry.withRetryAndErrorHandling(
restCall, "WSM.enumerateDataRepoSnapshotReferences");

// add this page of results to our collector
finalList.getResources().addAll(thisPage.getResources());

if (thisPage.getResources().size() < pageSize) {
// fewer results from WSM than we requested; this is the last page of results
return finalList;
} else {
// bump our offset and request another page of results
offset.addAndGet(pageSize);
}
}

throw new PfbImportException(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any sensible way to fail faster if we'll exceed the hard limit? Maybe by pushing the hard limit enforcement down to WSM?

Feels like this approach ends up doing a whole bunch of wasted work and would potentially fail slowly and cause resource waste/exhaustion when paired with some form of retries.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only other approach I can think of is to pick a number (maybe not as high as 10000), and make a single request to WSM for that many resources - don't attempt to paginate. Do you think that would be better?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if by your guess this is a really really unlikely scenario that it's maybe not something to worry about and this might be the perfect safeguard...wait till it happens, then worry about it.

I don't think I understand well enough how these snapshots get written to WSM, but the only thing that I think might be better is if we could somehow head them off at write time (into WSM) rather than here at read time.

"Exceeded hard limit of %d for number of pre-existing snapshot references"
.formatted(hardLimit));
}

/**
* Given a ResourceDescription representing a snapshot reference, retrieve that snapshot's UUID.
*
* @param resourceDescription the WSM object in which to find a snapshotId
* @return the snapshotId if found, else null
*/
protected UUID safeGetSnapshotId(ResourceDescription resourceDescription) {
ResourceAttributesUnion resourceAttributes = resourceDescription.getResourceAttributes();
if (resourceAttributes != null) {
DataRepoSnapshotAttributes dataRepoSnapshot = resourceAttributes.getGcpDataRepoSnapshot();
if (dataRepoSnapshot != null) {
String snapshotIdStr = dataRepoSnapshot.getSnapshot();
try {
return UUID.fromString(snapshotIdStr);
} catch (Exception e) {
// noop; this will return null
davidangb marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
return null;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another possible place for an Optional?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same pushback here - as a non-public method (it's only protected instead of private so I can unit-test it), I'm optimizing for performance instead of a good API - this one is also used directly as a stream mapping function

}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package org.databiosphere.workspacedataservice.service.model.exception;

import org.springframework.http.HttpStatus;
import org.springframework.web.bind.annotation.ResponseStatus;

@ResponseStatus(code = HttpStatus.INTERNAL_SERVER_ERROR)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way PfbImportException gets thrown (for blowing the hard limit), and the fact that it subclasses IllegalArgumentException make it seem more like some kind of client error than an internal error to me. Should this be a different HTTP status?

That said, I think we'd want to know about these early on as we learn how this feature gets used, so if it makes more sense that it's an 'us' problem than a client problem, then you can keep it as INTERNAL_SERVER_ERROR...I just wonder if it should subclass a different kind of exception in that case.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The use of IllegalArgumentException here was a copy/paste mistake on my part, I have changed to extend RuntimeException!

I do consider this an "us" problem. Even though I think it highly unlikely that a workspace will have > 10000 snapshot references, it's our fault that we can't handle such a case, so I agree on keeping it as INTERNAL_SERVER_ERROR.

fwiw, I originally wrote the loop inside listAllSnapshots as a while(true) loop, but I got nervous about infinity and decided to add a sanity check and thus imposed a hard limit.

public class PfbImportException extends IllegalArgumentException {

public PfbImportException(String message) {
super(message);
}

public PfbImportException(String message, Exception e) {
super(message, e);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,15 @@ public void createDataRepoSnapshotReference(SnapshotModel snapshotModel) {
}
}

/**
 * Retrieve one page of referenced Data Repo snapshot resources for the given workspace.
 *
 * @param workspaceId workspace to query
 * @param offset zero-based index of the first reference to return
 * @param limit maximum number of references to return in this page
 * @return a page of snapshot references from Workspace Manager
 * @throws ApiException on Workspace Manager client errors
 */
public ResourceList enumerateDataRepoSnapshotReferences(UUID workspaceId, int offset, int limit)
    throws ApiException {
  // get a page of results from WSM
  return enumerateResources(
      workspaceId, offset, limit, ResourceType.DATA_REPO_SNAPSHOT, StewardshipType.REFERENCED);
}

/** Retrieves the azure storage container url and sas token for a given workspace. */
public String getBlobStorageUrl(String storageWorkspaceId, String authToken) {
final ResourceApi resourceApi = this.workspaceManagerClientFactory.getResourceApi(authToken);
final ControlledAzureResourceApi azureResourceApi =
this.workspaceManagerClientFactory.getAzureResourceApi(authToken);
int count = 0;
Expand All @@ -59,8 +65,7 @@ public String getBlobStorageUrl(String storageWorkspaceId, String authToken) {
LOGGER.debug(
"Finding storage resource for workspace {} from Workspace Manager ...", workspaceUUID);
ResourceList resourceList =
resourceApi.enumerateResources(
workspaceUUID, 0, 5, ResourceType.AZURE_STORAGE_CONTAINER, null);
enumerateResources(workspaceUUID, 0, 5, ResourceType.AZURE_STORAGE_CONTAINER, null);
// note: it is possible a workspace may have more than one storage container associated with
// it
// but currently there is no way to tell which one is the primary except for checking the
Expand Down Expand Up @@ -94,4 +99,17 @@ public UUID extractResourceId(ResourceList resourceList, String storageWorkspace
}
return null;
}

/**
 * Fetch one page of resources of the given type and stewardship from Workspace Manager.
 *
 * @param workspaceId workspace to query
 * @param offset zero-based index of the first resource to return
 * @param limit maximum number of resources to return in this page
 * @param resourceType resource type to filter by
 * @param stewardshipType stewardship type to filter by; callers in this file pass null when no
 *     stewardship filter is wanted
 * @return the requested page of resources
 * @throws ApiException on Workspace Manager client errors
 */
private ResourceList enumerateResources(
    UUID workspaceId,
    int offset,
    int limit,
    ResourceType resourceType,
    StewardshipType stewardshipType)
    throws ApiException {
  // NOTE(review): passes a null auth token to the client factory — presumably the factory
  // falls back to ambient/request-scoped credentials; confirm against the factory impl.
  ResourceApi resourceApi = this.workspaceManagerClientFactory.getResourceApi(null);
  // TODO: retries
  return resourceApi.enumerateResources(
      workspaceId, offset, limit, resourceType, stewardshipType);
}
}
Loading
Loading