From a186a4692e07f29b070eb1a8d5319005eb1414d2 Mon Sep 17 00:00:00 2001 From: nwoodward Date: Fri, 27 Sep 2024 10:48:24 -0500 Subject: [PATCH 1/2] switch from HashMap to LinkedHashMap so that order is maintained --- .../java/org/duraspace/bagit/BagWriter.java | 15 ++++++----- .../org/duraspace/bagit/BagWriterTest.java | 26 +++++++++++++------ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/duraspace/bagit/BagWriter.java b/src/main/java/org/duraspace/bagit/BagWriter.java index 47f7318..1f2fcf1 100644 --- a/src/main/java/org/duraspace/bagit/BagWriter.java +++ b/src/main/java/org/duraspace/bagit/BagWriter.java @@ -14,6 +14,7 @@ import java.security.DigestOutputStream; import java.security.MessageDigest; import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.Map; import java.util.Set; import java.util.TreeMap; @@ -30,8 +31,8 @@ public class BagWriter { private final Set tagAlgorithms; private final Set payloadAlgorithms; - private final Map> payloadRegistry; - private final Map> tagFileRegistry; + private final Map> payloadRegistry; + private final Map> tagFileRegistry; private final Map> tagRegistry; /** @@ -97,7 +98,7 @@ public File getRootDir() { * @param algorithm Checksum digest algorithm name (e.g., "SHA-1") * @param filemap Map of Files to checksum values */ - public void registerChecksums(final BagItDigest algorithm, final Map filemap) { + public void registerChecksums(final BagItDigest algorithm, final LinkedHashMap filemap) { if (!payloadAlgorithms.contains(algorithm)) { throw new IllegalArgumentException("Invalid algorithm: " + algorithm); } @@ -112,7 +113,8 @@ public void registerChecksums(final BagItDigest algorithm, final Map values) { - final Map tagValues = tagRegistry.computeIfAbsent(key, k -> new HashMap<>()); + final Map tagValues = + tagRegistry.computeIfAbsent(key, k -> new LinkedHashMap<>()); tagValues.putAll(values); } @@ -146,7 +148,7 @@ public void write() throws IOException { * @param 
registerToTags flag to check if the hash of the output should be stored in the {@code tagFileRegistry} * @throws IOException if there's an error writing to the OutputStream */ - private void writeManifests(final String prefix, final Map> registry, + private void writeManifests(final String prefix, final Map> registry, final boolean registerToTags) throws IOException { final String delimiter = " "; final char backslash = '\\'; @@ -224,7 +226,8 @@ private OutputStream streamFor(final Path file) throws IOException { private void addTagChecksum(final BagItDigest algorithm, final File f, final MessageDigest digest) { if (digest != null) { - final Map m = tagFileRegistry.computeIfAbsent(algorithm, key -> new HashMap<>()); + final LinkedHashMap m = + tagFileRegistry.computeIfAbsent(algorithm, key -> new LinkedHashMap<>()); m.put(f, HexEncoder.toString(digest.digest())); } } diff --git a/src/test/java/org/duraspace/bagit/BagWriterTest.java b/src/test/java/org/duraspace/bagit/BagWriterTest.java index 83bad12..e9c029e 100644 --- a/src/test/java/org/duraspace/bagit/BagWriterTest.java +++ b/src/test/java/org/duraspace/bagit/BagWriterTest.java @@ -19,8 +19,8 @@ import java.nio.file.Paths; import java.security.MessageDigest; import java.time.LocalDate; -import java.util.Collections; import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Objects; @@ -96,12 +96,18 @@ public void write() throws IOException { Files.createDirectories(bag); final BagWriter writer = new BagWriter(bag.toFile(), Sets.newHashSet(sha1, sha256, sha512)); - // Setup the data files + // Set up the data files final Path data = bag.resolve("data"); final Path file = Files.createFile(data.resolve(filename)); - final Map sha1Sums = Maps.newHashMap(file.toFile(), HexEncoder.toString(sha1MD.digest())); - final Map sha256Sums = Maps.newHashMap(file.toFile(), HexEncoder.toString(sha256MD.digest())); - final Map sha512Sums = 
Maps.newHashMap(file.toFile(), HexEncoder.toString(sha512MD.digest())); + + final LinkedHashMap sha1Sums = new LinkedHashMap<>(); + sha1Sums.put(file.toFile(), HexEncoder.toString(sha1MD.digest())); + + final LinkedHashMap sha256Sums = new LinkedHashMap<>(); + sha256Sums.put(file.toFile(), HexEncoder.toString(sha256MD.digest())); + + final LinkedHashMap sha512Sums = new LinkedHashMap<>(); + sha512Sums.put(file.toFile(), HexEncoder.toString(sha512MD.digest())); // second file final Path file2 = Files.createFile(data.resolve(filename + "2")); @@ -170,8 +176,12 @@ public void testWriteDistinctManifests() throws Exception { // Setup the data files final Path data = bag.resolve("data"); final Path file = Files.createFile(data.resolve(filename)); - final Map sha1Sums = Maps.newHashMap(file.toFile(), HexEncoder.toString(sha1MD.digest())); - final Map sha256Sums = Maps.newHashMap(file.toFile(), HexEncoder.toString(sha256MD.digest())); + + final LinkedHashMap sha1Sums = new LinkedHashMap<>(); + sha1Sums.put(file.toFile(), HexEncoder.toString(sha1MD.digest())); + + final LinkedHashMap sha256Sums = new LinkedHashMap<>(); + sha256Sums.put(file.toFile(), HexEncoder.toString(sha256MD.digest())); // second file final Path file2 = Files.createFile(data.resolve(filename + "2")); @@ -274,7 +284,7 @@ public void testAddInvalidAlgorithm() throws IOException { final BagWriter writer = new BagWriter(bag.toFile(), Sets.newHashSet(sha1)); // we don't need to pass any files, just the errant BagItDigest - writer.registerChecksums(sha256, Collections.emptyMap()); + writer.registerChecksums(sha256, new LinkedHashMap<>()); }); } From 786ffb7b2e6e888b7f7feaef8a349341ca29b2d2 Mon Sep 17 00:00:00 2001 From: nwoodward Date: Fri, 27 Sep 2024 10:49:09 -0500 Subject: [PATCH 2/2] added serializeWithTimestamp method to ZipBagSerializer that ensures MD5 checksums will match --- .../bagit/serialize/BagSerializer.java | 26 ++++++- .../bagit/serialize/TarBagSerializer.java | 4 ++ 
.../bagit/serialize/TarGzBagSerializer.java | 5 ++ .../bagit/serialize/ZipBagSerializer.java | 67 +++++++++++++++++++ .../bagit/serialize/BagSerializerTest.java | 27 +++++++- 5 files changed, 127 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/duraspace/bagit/serialize/BagSerializer.java b/src/main/java/org/duraspace/bagit/serialize/BagSerializer.java index 94d1bf9..59e78c9 100644 --- a/src/main/java/org/duraspace/bagit/serialize/BagSerializer.java +++ b/src/main/java/org/duraspace/bagit/serialize/BagSerializer.java @@ -13,9 +13,18 @@ * @author mikejritter * @since 2020-02-24 */ -@FunctionalInterface public interface BagSerializer { + /** + * Default date/time (in milliseconds since epoch) to set for Zip Entries + * that do not have a last modified date. If the date/time is not set + * then it will default to current system date/time. + * This is less than ideal, as it causes the MD5 checksum of the Zip file to + * change whenever a Zip file is regenerated (even if compressed files are unchanged). + * 1589346000 seconds * 1000 = May 13, 2020 GMT (the date BagIt-Support 1.0.0 was released) + */ + long DEFAULT_MODIFIED_DATE = 1589346000L * 1000; + /** * Serialize a BagIt bag depending on the format defined by the implementing class. This only puts the files into * an archive, with the name of the {@code root} directory serving as the name of the final file. @@ -26,4 +35,19 @@ public interface BagSerializer { */ Path serialize(Path root) throws IOException; + /** + * Serialize a BagIt bag and set file creation, last modified, and access times for each zip entry. + * Setting these times is required to ensure that MD5 checksums of identical bags created at + * different times will match. + * + * This only puts the files into an archive, with the name of the {@code root} directory serving + * as the name of the final file.
+ * + * @param root the {@link Path} which is the top level directory of the BagIt bag + * @param lastModifiedTime the time (in milliseconds) to set time fields in file metadata + * @return the {@link Path} to the serialized BagIt bag + * @throws IOException if there is an error writing to the archive + * @throws UnsupportedOperationException if the child class does not implement this method + */ + Path serializeWithTimestamp(Path root, Long lastModifiedTime) throws IOException; } diff --git a/src/main/java/org/duraspace/bagit/serialize/TarBagSerializer.java b/src/main/java/org/duraspace/bagit/serialize/TarBagSerializer.java index 88b920f..45689c2 100644 --- a/src/main/java/org/duraspace/bagit/serialize/TarBagSerializer.java +++ b/src/main/java/org/duraspace/bagit/serialize/TarBagSerializer.java @@ -52,4 +52,8 @@ public Path serialize(final Path root) throws IOException { return serializedBag; } + @Override + public Path serializeWithTimestamp(final Path root, final Long lastModifiedTime) throws IOException { + throw new UnsupportedOperationException("This method is not supported."); + } } diff --git a/src/main/java/org/duraspace/bagit/serialize/TarGzBagSerializer.java b/src/main/java/org/duraspace/bagit/serialize/TarGzBagSerializer.java index d5a2913..b2b84e5 100644 --- a/src/main/java/org/duraspace/bagit/serialize/TarGzBagSerializer.java +++ b/src/main/java/org/duraspace/bagit/serialize/TarGzBagSerializer.java @@ -53,4 +53,9 @@ public Path serialize(final Path root) throws IOException { return serializedBag; } + + @Override + public Path serializeWithTimestamp(final Path root, final Long lastModifiedTime) throws IOException { + throw new UnsupportedOperationException("This method is not supported."); + } } diff --git a/src/main/java/org/duraspace/bagit/serialize/ZipBagSerializer.java b/src/main/java/org/duraspace/bagit/serialize/ZipBagSerializer.java index 4043e25..6d629f4 100644 --- a/src/main/java/org/duraspace/bagit/serialize/ZipBagSerializer.java +++ 
b/src/main/java/org/duraspace/bagit/serialize/ZipBagSerializer.java @@ -8,12 +8,19 @@ import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.attribute.FileTime; +import java.text.DateFormat; +import java.text.SimpleDateFormat; import java.util.Iterator; import java.util.stream.Stream; +import org.apache.commons.compress.archivers.zip.X000A_NTFS; +import org.apache.commons.compress.archivers.zip.X5455_ExtendedTimestamp; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.io.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Serialize a BagIt bag into a zip archive without compression @@ -24,6 +31,8 @@ public class ZipBagSerializer implements BagSerializer { private final String extension = ".zip"; + private final Logger logger = LoggerFactory.getLogger(ZipBagSerializer.class); + @Override public Path serialize(final Path root) throws IOException { final Path parent = root.getParent().toAbsolutePath(); @@ -52,4 +61,62 @@ public Path serialize(final Path root) throws IOException { return serializedBag; } + + @Override + public Path serializeWithTimestamp(final Path root, final Long lastModifiedTime) throws IOException { + logger.info("Serializing bag with timestamp: {}", root.getFileName()); + + final Path parent = root.getParent().toAbsolutePath(); + final String bagName = root.getFileName().toString(); + final DateFormat df = new SimpleDateFormat("MM/dd/yyyy"); + + final Path serializedBag = parent.resolve(bagName + extension); + try(final OutputStream os = Files.newOutputStream(serializedBag); + final ZipArchiveOutputStream zip = new ZipArchiveOutputStream(os); + final Stream files = Files.walk(root)) { + + // Use given last modified time or the default value; avoid an invalid timestamp + final FileTime time; + if ((lastModifiedTime != null) && (lastModifiedTime > 0)) { + time = 
FileTime.fromMillis(lastModifiedTime); + } else { + time = FileTime.fromMillis(DEFAULT_MODIFIED_DATE); + } + + // it would be nice not to have to collect the files which are walked, but we're required to try/catch + // inside of a lambda which isn't the prettiest. maybe a result could be returned which contains either a + // Path or the Exception thrown... just an idea + final Iterator itr = files.iterator(); + while (itr.hasNext()) { + final Path bagEntry = itr.next(); + final String name = parent.relativize(bagEntry).toString(); + final ZipArchiveEntry entry = zip.createArchiveEntry(bagEntry.toFile(), name); + + logger.debug("Setting ZipArchiveEntry creation, last modified and last access times to: {}", + df.format(time.toMillis())); + + Files.setLastModifiedTime(bagEntry, time); + + final X5455_ExtendedTimestamp extendedTimestamp = new X5455_ExtendedTimestamp(); + extendedTimestamp.setCreateFileTime(time); + extendedTimestamp.setModifyFileTime(time); + extendedTimestamp.setAccessFileTime(time); + entry.addExtraField(extendedTimestamp); + + final X000A_NTFS ntfsTimestamp = new X000A_NTFS(); + ntfsTimestamp.setCreateFileTime(time); + ntfsTimestamp.setModifyFileTime(time); + ntfsTimestamp.setAccessFileTime(time); + entry.addExtraField(ntfsTimestamp); + + zip.putArchiveEntry(entry); + if (bagEntry.toFile().isFile()) { + FileUtils.copyFile(bagEntry.toFile(), zip); + } + zip.closeArchiveEntry(); + } + } + + return serializedBag; + } } diff --git a/src/test/java/org/duraspace/bagit/serialize/BagSerializerTest.java b/src/test/java/org/duraspace/bagit/serialize/BagSerializerTest.java index cbaed62..99635dc 100644 --- a/src/test/java/org/duraspace/bagit/serialize/BagSerializerTest.java +++ b/src/test/java/org/duraspace/bagit/serialize/BagSerializerTest.java @@ -5,6 +5,8 @@ package org.duraspace.bagit.serialize; import static org.assertj.core.api.Assertions.assertThat; +import static org.duraspace.bagit.serialize.BagSerializer.DEFAULT_MODIFIED_DATE; +import static 
org.junit.jupiter.api.Assertions.assertEquals; import java.io.IOException; import java.net.URI; @@ -12,6 +14,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.nio.file.attribute.FileTime; import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; @@ -19,6 +22,7 @@ import org.apache.commons.compress.archivers.ArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; import org.duraspace.bagit.profile.BagProfile; import org.junit.jupiter.api.BeforeEach; @@ -69,6 +73,28 @@ public void testZipSerializer() throws Exception { Files.delete(writtenBag); } + @Test + public void testZipSerializerWithTimestamp() throws IOException { + final BagSerializer zipper = SerializationSupport.serializerFor("zip", profile); + final Path writtenBag = zipper.serializeWithTimestamp(bag, DEFAULT_MODIFIED_DATE); + + assertThat(writtenBag).exists(); + assertThat(writtenBag).isRegularFile(); + + // just make sure we can read it + try (ZipArchiveInputStream zipIn = new ZipArchiveInputStream(Files.newInputStream(writtenBag))) { + ZipArchiveEntry entry; + while ((entry = zipIn.getNextEntry()) != null) { + assertThat(bagFiles).contains(Paths.get(entry.getName())); + assertEquals(entry.getCreationTime(), FileTime.fromMillis(DEFAULT_MODIFIED_DATE)); + assertEquals(entry.getLastModifiedTime(), FileTime.fromMillis(DEFAULT_MODIFIED_DATE)); + assertEquals(entry.getLastAccessTime(), FileTime.fromMillis(DEFAULT_MODIFIED_DATE)); + } + } + + Files.delete(writtenBag); + } + @Test public void testTarSerializer() throws Exception { final BagSerializer serializer = SerializationSupport.serializerFor("tar", profile); @@ -107,5 +133,4 @@ public void testGZipSerializer() throws Exception { Files.delete(writtenBag); } - } \ No newline at end of file