Skip to content

Commit

Permalink
Adjust FITS mimetype conflict resolution to rank returned options, fi…
Browse files Browse the repository at this point in the history
…lter out invalid results. Use a temp dir for storing symlinks, since nano time can very occasionally produce the same name. Move symlinks to deposit directory so that FITS will have read permission (#1745)
  • Loading branch information
bbpennel authored Jun 13, 2024
1 parent 46a2151 commit 5350cf0
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import edu.unc.lib.boxc.deposit.work.AbstractConcurrentDepositJob;
import edu.unc.lib.boxc.deposit.work.JobFailedException;
import edu.unc.lib.boxc.deposit.work.JobInterruptedException;
import edu.unc.lib.boxc.model.api.exceptions.RepositoryException;
import edu.unc.lib.boxc.model.api.ids.PID;
import edu.unc.lib.boxc.model.api.rdf.CdrDeposit;
import edu.unc.lib.boxc.model.fcrepo.ids.DatastreamPids;
Expand Down Expand Up @@ -43,13 +44,15 @@
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import static edu.unc.lib.boxc.common.xml.SecureXMLFactory.createSAXBuilder;
import static edu.unc.lib.boxc.model.api.rdf.CdrDeposit.mimetype;
Expand All @@ -73,7 +76,7 @@ public class ExtractTechnicalMetadataJob extends AbstractConcurrentDepositJob {

private static final String FITS_SINGLE_STATUS = "SINGLE_RESULT";
private static final String FITS_EXAMINE_PATH = "examine";
private static final Path TMP_PATH = Paths.get(System.getProperty("java.io.tmpdir"));
private static final String MIMETYPE_ATTR = "mimetype";

private CloseableHttpClient httpClient;

Expand Down Expand Up @@ -215,10 +218,12 @@ public void run() {

// Symlink the file before processing
Path linkPath = makeSymlinkForStagedPath(stagedPath, providedLabel);
// Generate the FITS report as a document
Document fitsDoc = getFitsDocument(objPid, linkPath);

Document fitsDoc = null;
try {
// Generate the FITS report as a document
fitsDoc = getFitsDocument(objPid, linkPath);

// Create the PREMIS report wrapper for the FITS results
Document premisDoc = generatePremisReport(objPid, fitsDoc);
Element premisObjCharsEl = getObjectCharacteristics(premisDoc);
Expand All @@ -234,11 +239,11 @@ public void run() {
writePremisReport(objPid, premisDoc);

receiveResult(result);
} catch (JobFailedException | JobInterruptedException e) {
} catch (JobFailedException | JobInterruptedException | RepositoryException e) {
throw e;
} catch (Exception e) {
failJob(e, "Failed to extract FITS details for file '{0}' with id {1} from document:\n{2}",
stagedPath, objPid.getId(), getXMLOutputter().outputString(fitsDoc));
stagedPath, objPid.getId(), fitsDoc != null ? getXMLOutputter().outputString(fitsDoc) : "null");
} finally {
try {
Files.delete(linkPath);
Expand All @@ -263,7 +268,7 @@ private void addFileIdentification(Document fitsDoc, Element premisObjCharsEl) {
String fitsMimetype = null;
String format;
if (identity != null) {
fitsMimetype = identity.getAttributeValue("mimetype");
fitsMimetype = identity.getAttributeValue(MIMETYPE_ATTR);
format = identity.getAttributeValue("format");
} else {
format = "Unknown";
Expand Down Expand Up @@ -347,15 +352,18 @@ protected Path makeSymlinkForStagedPath(String stagedUriString, String label) {
// Resolve the path from a URI and make it absolute
URI stagedUri = URI.create(stagedUriString);
Path stagedPath;
File depositDirectory = getDepositDirectory();
if (!stagedUri.isAbsolute()) {
stagedPath = Paths.get(getDepositDirectory().toString(), stagedUriString);
stagedPath = Paths.get(depositDirectory.toString(), stagedUriString);
} else {
stagedPath = Paths.get(stagedUri);
}
try {
// Create a unique parent directory for the symlink to avoid filename conflicts
var parentDir = Files.createTempDirectory(depositDirectory.toPath(), "fits_staging");
// Assign the same permissions as the parent directory to the temp dir, since createTempDirectory is restrictive
Files.setPosixFilePermissions(parentDir, Files.getPosixFilePermissions(parentDir.getParent()));
// Create a symlink to the file to make use of the original filename and avoid issues with non-ascii characters
var parentDir = TMP_PATH.resolve(Long.toString(System.nanoTime()));
Files.createDirectories(parentDir);
String symlinkName = label != null ? label : stagedPath.getFileName().toString();
var linkPath = sanitizeCliPath(parentDir.resolve(symlinkName));
Files.createSymbolicLink(linkPath, stagedPath);
Expand Down Expand Up @@ -511,19 +519,22 @@ private Element getFitsIdentificationInformation(Document fitsDoc) {
return null;
}

// Conflicting identification from FITS, try to resolve
// Don't trust Exiftool if it detects a symlink, which it does not follow to the file.
// Trust any answer agreed on by multiple tools
for (Element el : identification.getChildren("identity", FITS_NS)) {
if (el.getChildren("tool", FITS_NS).size() > 1
|| !("Exiftool".equals(el.getChild("tool", FITS_NS).getAttributeValue("toolname"))
&& "application/x-symlink".equals(el.getAttributeValue("mimetype")))) {
return el;
}
}
// Sort the identification elements to find the best value returned by FITS
var identityEls = identification.getChildren("identity", FITS_NS).stream()
// Filter out any invalid entries
.filter(el -> MimetypeHelpers.isValidMimetype(el.getAttributeValue(MIMETYPE_ATTR)))
// Primarily sort by the best ranking mimetype
.sorted(Comparator.comparingInt((Element el) -> rankMimetype(el.getAttributeValue(MIMETYPE_ATTR)))
// Then rank by the number of tools that agreed on the mimetype
.thenComparingInt(el -> el.getChildren("tool", FITS_NS).size())
// Reverse so both rank and tool count are in descending order
.reversed()
// And then favor more application specific mimetypes
.thenComparingInt(el -> el.getAttributeValue(MIMETYPE_ATTR).contains("x-") ? -1 : 0))
.collect(Collectors.toList());
// Return the best ranking identification, or null if none are valid
return identityEls.isEmpty() ? null : identityEls.get(0);
}

return null;
}

private int rankMimetype(String mimetype) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,20 @@ public void exifSymlinkConflictMimetypeTest() throws Exception {
verifyFileResults(filePid, CONFLICT_MIMETYPE, CONFLICT_FORMAT, CONFLICT_MD5, 1);
}

/**
 * Runs the job against the "conflictRankingReport.xml" FITS response and checks
 * that the file ends up identified as a Nikon NEF image ("image/x-nikon-nef",
 * format "NEF EXIF") rather than keeping the octet stream mimetype it was
 * registered with.
 *
 * @throws Exception if any step of the setup, job run or verification fails
 */
@Test
public void multiRankingSpecificityConflictMimetypeTest() throws Exception {
    // Expected identification after the conflicting FITS identities are resolved
    final String expectedMimetype = "image/x-nikon-nef";
    final String expectedFormat = "NEF EXIF";

    respondWithFile("/fitsReports/conflictRankingReport.xml");

    // The file is registered with an octet stream mimetype, which the FITS result should override
    final PID nefFilePid = addFileObject(depositBag, CONFLICT_FILEPATH, OCTET_MIMETYPE, null);
    job.closeModel();

    job.run();

    // The request must have targeted the staged file, and the stored results must match expectations
    verifyRequestParameters(CONFLICT_FILEPATH);
    verifyFileResults(nefFilePid, expectedMimetype, expectedFormat, CONFLICT_MD5, 1);
}

@Test
public void exifMimetypeTest() throws Exception {
respondWithFile("/fitsReports/exifReport.xml");
Expand Down Expand Up @@ -568,7 +582,7 @@ private void verifyRequestParameters(String expectedFilepath) throws Exception {
String submittedPath = getSubmittedFilePath(request);

String failMessage = "FITS service called with wrong path. Expected " + expectedFilepath + " but got " + submittedPath;
assertTrue(submittedPath.startsWith(TMP_PATH.toString().replace("/", "%2F")), failMessage);
assertTrue(submittedPath.startsWith(job.getDepositDirectory().toString().replace("/", "%2F")), failMessage);
assertTrue(submittedPath.endsWith("%2F" + Paths.get(expectedFilepath).getFileName()), failMessage);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
<?xml version="1.0" encoding="UTF-8"?>
<fits xmlns="http://hul.harvard.edu/ois/xml/ns/fits/fits_output" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://hul.harvard.edu/ois/xml/ns/fits/fits_output http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd" version="1.6.0" timestamp="2/19/24 11:32 AM">
<identification status="CONFLICT">
<identity format="symbolic link to `/path/to/2217c4ce-9ab7-409a-9538-ef961ba119cf/data/_e5c9e83b-0427-4ec3-bb0f-fd24a992a56e'" mimetype="inode/symlink" toolname="FITS" toolversion="1.6.0">
<tool toolname="file utility" toolversion="5.11" />
</identity>
<identity format="Text Plain" mimetype="text/plain" toolname="FITS" toolversion="1.6.0">
<tool toolname="Droid" toolversion="6.1.5" />
</identity>
<identity format="Tagged Image File Format" mimetype="image/tiff" toolname="FITS" toolversion="1.6.0">
<tool toolname="Tika" toolversion="2.6.0" />
</identity>
<identity format="NEF EXIF" mimetype="image/x-nikon-nef" toolname="FITS" toolversion="1.6.0">
<tool toolname="Exiftool" toolversion="12.50" />
</identity>
</identification>
<fileinfo>
<lastmodified toolname="Exiftool" toolversion="12.50">2017-08-22T17:00:41</lastmodified>
<created toolname="Exiftool" toolversion="12.50">2017-08-22T17:00:41</created>
<filepath toolname="OIS File Information" toolversion="1.0" status="SINGLE_RESULT">/path/to/temp/9064692952992/20170822_068.nef</filepath>
<filename toolname="OIS File Information" toolversion="1.0" status="SINGLE_RESULT">20170822_068.nef</filename>
<size toolname="OIS File Information" toolversion="1.0">13887221</size>
<md5checksum toolname="OIS File Information" toolversion="1.0" status="SINGLE_RESULT">238b8c4a61fda89ef829c6b7d69b57fa</md5checksum>
<fslastmodified toolname="OIS File Information" toolversion="1.0" status="SINGLE_RESULT">1717086826457</fslastmodified>
</fileinfo>
<filestatus />
<metadata>
<image>
<compressionScheme toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">JPEG</compressionScheme>
<imageWidth toolname="Exiftool" toolversion="12.50">4304</imageWidth>
<imageHeight toolname="Exiftool" toolversion="12.50">2864</imageHeight>
<colorSpace toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">RGB</colorSpace>
<referenceBlackWhite toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">0 255 0 255 0 255</referenceBlackWhite>
<YCbCrPositioning toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">2</YCbCrPositioning>
<orientation toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">normal*</orientation>
<samplingFrequencyUnit toolname="Exiftool" toolversion="12.50">in.</samplingFrequencyUnit>
<xSamplingFrequency toolname="Exiftool" toolversion="12.50">300</xSamplingFrequency>
<ySamplingFrequency toolname="Exiftool" toolversion="12.50">300</ySamplingFrequency>
<bitsPerSample toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">8 8 8</bitsPerSample>
<samplesPerPixel toolname="Exiftool" toolversion="12.50" status="CONFLICT">3</samplesPerPixel>
<samplesPerPixel toolname="Tika" toolversion="2.6.0" status="CONFLICT">1</samplesPerPixel>
<captureDevice toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">digital still camera</captureDevice>
<digitalCameraManufacturer toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">NIKON CORPORATION</digitalCameraManufacturer>
<digitalCameraModelName toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">NIKON D500</digitalCameraModelName>
<scanningSoftwareName toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">Ver.1.13</scanningSoftwareName>
<fNumber toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">3.5</fNumber>
<exposureTime toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">0.002</exposureTime>
<exposureProgram toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">Manual</exposureProgram>
<isoSpeedRating toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">6400</isoSpeedRating>
<exposureBiasValue toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">0</exposureBiasValue>
<maxApertureValue toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">2.8</maxApertureValue>
<meteringMode toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">Pattern</meteringMode>
<lightSource toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">unknown</lightSource>
<flash toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">Flash did not fire</flash>
<focalLength toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">200.0</focalLength>
<sensingMethod toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">One-chip color area sensor</sensingMethod>
<cfaPattern toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">[Red,Green][Green,Blue]</cfaPattern>
<cfaPattern2 toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">0 1 1 2</cfaPattern2>
<gpsVersionID toolname="Exiftool" toolversion="12.50" status="SINGLE_RESULT">2.3.0.0</gpsVersionID>
</image>
</metadata>
<statistics fitsExecutionTime="8234">
<tool toolname="MediaInfo" toolversion="23.09" status="did not run" />
<tool toolname="OIS Audio Information" toolversion="0.1" status="did not run" />
<tool toolname="ADL Tool" toolversion="0.1" status="did not run" />
<tool toolname="VTT Tool" toolversion="0.1" status="did not run" />
<tool toolname="Droid" toolversion="6.5.2" executionTime="5814" />
<tool toolname="jpylyzer" toolversion="2.1.0" status="did not run" />
<tool toolname="embARC" toolversion="0.2" status="did not run" />
<tool toolname="file utility" toolversion="5.11" executionTime="8031" />
<tool toolname="Exiftool" toolversion="12.50" executionTime="8053" />
<tool toolname="NLNZ Metadata Extractor" toolversion="3.6GA" status="did not run" />
<tool toolname="OIS File Information" toolversion="1.0" executionTime="5717" />
<tool toolname="OIS XML Metadata" toolversion="0.2" status="did not run" />
<tool toolname="Tika" toolversion="2.6.0" executionTime="7168" />
</statistics>
</fits>
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<tool toolname="Exiftool" toolversion="10.00" />
<externalIdentifier toolname="Droid" toolversion="6.1.5" type="puid">fmt/141</externalIdentifier>
</identity>
<identity format="WAVE" mimetype="audio/x-wave" toolname="FITS" toolversion="1.0.5">
<identity format="WAVERLY" mimetype="audio/x-waverly" toolname="FITS" toolversion="1.0.5">
<tool toolname="Jhove" toolversion="1.11" />
</identity>
</identification>
Expand Down

0 comments on commit 5350cf0

Please sign in to comment.