The MD5 from a local file and the MD5 (eTag) from S3 are not the same
I get the MD5 of a local file but it is different from the MD5 (eTag) of the "same" file in Amazon S3. What I would like to achieve is to figure out whether the latest file I have in S3 is the same one that I have locally. If I cannot compare MD5 hashes, then how should I do it?
Generating MD5 from the local file (truncated code):
// Question's original (buggy) code: Files.getDigest already returns the raw
// MD5 of the file, and DigestUtils.md5Hex then hashes those 16 bytes AGAIN —
// so hashtext is MD5(MD5(file)), which can never match the S3 eTag.
// (See the accepted answer below for this diagnosis.)
MessageDigest md = MessageDigest.getInstance("MD5");
byte[] md5 = Files.getDigest(localFile, md);
String hashtext = DigestUtils.md5Hex(md5);
Retrieving MD5 (eTag) from S3 (truncated code):
// Lists objects in the bucket and reads each one's eTag.
// NOTE(review): for a multipart-uploaded object the eTag is NOT the plain MD5
// of the content — it carries a "-<partCount>" suffix (see the Etag class
// below) — so comparing it against a local MD5 only works for single-part
// uploads. Also, listObjects paginates; this loop presumably only sees the
// first page of results — TODO confirm isTruncated() handling in real code.
ObjectListing objectListing = s3.listObjects(new ListObjectsRequest().withBucketName(bucketName));
List<S3ObjectSummary> objectSummaries = objectListing.getObjectSummaries();
for(S3ObjectSummary objectSummary : objectSummaries) {
String MD5 = objectSummary.getETag();
}
PS: I use org.apache.commons.codec.digest.DigestUtils
and com.google.common.io.Files
libraries.
String hashtext = DigestUtils.md5Hex(md5);
actually calculates the MD5 of the MD5 you just computed. See the DigestUtils.md5Hex documentation.
hashtext
is in fact MD5(MD5(file)) and not MD5(file).
Bruno's answer nails it, but I wanted to point out that if you want to do this without the Google Guava dependency, it's actually not that difficult — especially if you're already using Apache Commons.
You'd replace this:
byte[] md5 = Files.getDigest(localFile, md);
with this (using a Java 7 try-with-resources block):
try (FileInputStream fis = new FileInputStream(localFile)) {
    // Fixed: the original passed 'fileInputStream', a name that is never
    // declared — the try-with-resources variable is 'fis'. It would not compile.
    byte[] md5 = DigestUtils.md5(fis);
}
This md5(InputStream) method has been in Apache Commons since version 1.4.
This is my own implementation of S3's eTag. I tested it with a large file I uploaded on S3 to get a reference value for multipart eTag.
Keep in mind that compression and Client-Side Encryption make eTag useless when it comes to check downloaded file.
Etag.java
package io.github.caillette.s3;
import com.amazonaws.services.s3.transfer.TransferManagerConfiguration;
import com.google.common.io.ByteSource;
import org.apache.commons.codec.digest.DigestUtils;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.security.DigestException;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.google.common.base.Preconditions.checkArgument;
/**
* Represents the
* <a href="http://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html" >eTag</a>
* calculated by Amazon S3.
*/
public final class Etag {

  /** Lowercase hexadecimal MD5, always 32 characters long. */
  private final String md5 ;

  /** Part count of a multipart upload, or {@code null} for a single-part one. */
  private final Integer partNumber ;

  private static final Pattern MD5_PATTERN = Pattern.compile( "[a-f0-9]{32}" ) ;

  private static final Pattern FULL_ETAG_PATTERN
      = Pattern.compile( "(" + MD5_PATTERN.pattern() + ")(?:-([0-9]+))?" ) ;

  private Etag( final byte[] md5, final Integer partNumber ) {
    this( md5asString( md5 ), partNumber ) ;
  }

  /**
   * Renders a raw MD5 digest as lowercase hexadecimal.
   *
   * @param md5 a 16-byte MD5 digest, never {@code null}.
   * @throws IllegalArgumentException if {@code md5} is not exactly 16 bytes.
   */
  public static String md5asString( final byte[] md5 ) {
    checkArgument( md5.length == 16 ) ;
    return DigestTools.toHex( md5 );
  }

  private Etag( final String md5, final Integer partNumber ) {
    checkArgument( MD5_PATTERN.matcher( md5 ).matches() ) ;
    checkArgument( partNumber == null || partNumber > 0 ) ;
    this.md5 = md5 ;
    this.partNumber = partNumber ;
  }

  /** Returns the canonical S3 form: {@code <md5>} or {@code <md5>-<partCount>}. */
  public String asString() {
    return md5 + ( partNumber == null ? "" : "-" + partNumber ) ;
  }

  /**
   * Parses an eTag string as returned by S3.
   *
   * @throws IllegalArgumentException if {@code string} is not a 32-character
   *     lowercase hexadecimal MD5, optionally followed by {@code -<partCount>}.
   */
  public static Etag parse( final String string ) {
    final Matcher matcher = FULL_ETAG_PATTERN.matcher( string ) ;
    checkArgument( matcher.matches(), "Invalid format: " + string ) ;
    final String md5 = matcher.group( 1 ) ;
    final String partNumber = matcher.group( 2 ) ;
    return new Etag( md5, partNumber == null ? null : Integer.parseInt( partNumber ) ) ;
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "{" + asString() + "}" ;
  }

  @Override
  public boolean equals( final Object other ) {
    if( this == other ) {
      return true ;
    }
    if( other == null || getClass() != other.getClass() ) {
      return false ;
    }
    final Etag etag = ( Etag ) other ;
    if( ! md5.equals( etag.md5 ) ) {
      return false ;
    }
    if( partNumber != null ? ! partNumber.equals( etag.partNumber ) : etag.partNumber != null ) {
      return false ;
    }
    return true ;
  }

  @Override
  public int hashCode() {
    int result = md5.hashCode() ;
    result = 31 * result + ( partNumber != null ? partNumber.hashCode() : 0 ) ;
    return result ;
  }

  public static final long DEFAULT_MINIMUM_UPLOAD_PART_SIZE
      = new TransferManagerConfiguration().getMinimumUploadPartSize() ;

  // =======
  // Compute
  // =======

  /**
   * Calculates the {@link Etag} (MD5 checksum the AWS way).
   * For content uploaded in a single part it is the plain MD5. For multipart
   * uploads it is the MD5 of the concatenated per-part MD5s, suffixed with the
   * part count.
   *
   * NOTE(review): the original comment said the single-part limit is
   * "practically 5 GB" — that figure is S3's single-PUT maximum, while the
   * SDK's default minimum part size is much smaller; confirm against the SDK
   * version in use.
   *
   * http://permalink.gmane.org/gmane.comp.file-systems.s3.s3tools/583
   * https://github.com/Teachnova/s3md5
   * http://stackoverflow.com/questions/12186993/what-is-the-algorithm-to-compute-the-amazon-s3-etag-for-a-file-larger-than-5gb
   *
   * @param byteSource the content to checksum, never {@code null}.
   * @param chunkSize the part size used for the upload, strictly positive.
   * @throws IllegalArgumentException if {@code chunkSize} is not positive
   *     (the previous version looped forever in that case).
   */
  public static Etag compute( final ByteSource byteSource, final int chunkSize )
      throws IOException, DigestException
  {
    checkArgument( chunkSize > 0, "chunkSize must be > 0, got " + chunkSize ) ;
    final List< byte[] > md5s = new ArrayList<>() ;
    try( final PushbackInputStream inputStream =
        new PushbackInputStream( byteSource.openBufferedStream() )
    ) {
      // Peek one byte to detect end of stream. The previous version relied on
      // InputStream#available(), which may legally return 0 before the end of
      // the stream and would then silently truncate the checksum.
      int probe ;
      while( ( probe = inputStream.read() ) != -1 ) {
        inputStream.unread( probe ) ;
        md5s.add( computeMd5( inputStream, chunkSize ) ) ;
      }
    }
    if( md5s.isEmpty() ) {
      // Empty content: S3 reports the plain MD5 of zero bytes, no part suffix.
      // The previous version fell into the multipart branch with a part count
      // of 0 and threw IllegalArgumentException from the constructor.
      return new Etag( DigestUtils.md5( new byte[ 0 ] ), null ) ;
    } else if( md5s.size() == 1 ) {
      return new Etag( md5s.get( 0 ), null ) ;
    } else {
      // Multipart: MD5 of the concatenation of every part's raw 16-byte MD5.
      final byte[] md5concatenation = new byte[ md5s.size() * 16 ] ;
      for( int i = 0 ; i < md5s.size() ; i ++ ) {
        final byte[] md5 = md5s.get( i ) ;
        System.arraycopy( md5, 0, md5concatenation, i * 16, 16 ) ;
      }
      final byte[] finalMd5 = DigestUtils.md5( md5concatenation ) ;
      return new Etag( finalMd5, md5s.size() ) ;
    }
  }

  /**
   * Reads at most {@code length} bytes from {@code inputStream} and returns
   * their raw 16-byte MD5. Stops early if the stream ends first.
   */
  /*package*/ static byte[] computeMd5(
      final InputStream inputStream,
      final int length
  ) throws IOException, DigestException {
    final MessageDigest md5Digest = DigestUtils.getMd5Digest() ;
    final byte[] buffer = new byte[ 8192 ] ;
    long totalRead = 0 ;
    while( true ) {
      final long greatestRemainder = length - totalRead ;
      final int sizeToRead = greatestRemainder > buffer.length
          ? buffer.length : ( int ) greatestRemainder ;
      final int read = inputStream.read( buffer, 0, sizeToRead ) ;
      if( read > 0 ) {
        md5Digest.update( buffer, 0, read ) ;
        totalRead += read ;
      } else {
        // read == 0 happens when sizeToRead == 0 (chunk fully consumed);
        // read == -1 means end of stream. Either way the chunk is complete.
        return md5Digest.digest() ;
      }
    }
  }
}
EtagTest.java
package io.github.caillette.s3;
import com.google.common.io.Files;
import org.apache.commons.codec.digest.DigestUtils;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.novelang.testing.junit.MethodSupport;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import static org.assertj.core.api.Assertions.assertThat ;
/** Tests for {@link Etag#compute} and {@link Etag#parse}. */
public class EtagTest {

  /**
   * This test checks equality with an eTag calculated by S3 itself.
   * To trigger multipart upload (which causes special eTag calculation),
   * generate a garbage file with a size of 120_000_000L and upload it
   * with {@link TransferManagerConfigurator#multipartCopyThreshold} set
   * to 115343360 bytes (110 MBi).
   */
  @Test
  public void bigMultipart() throws Exception {
    final File file = createGarbageFile( 120_000_000 ) ;
    final int chunkSize = 5 * 1024 * 1024 ;
    final long start = System.currentTimeMillis() ;
    final Etag etag = Etag.compute( Files.asByteSource( file ), chunkSize ) ;
    LOGGER.info( "Calculated " + etag + " in " + ( System.currentTimeMillis() - start ) + " ms." ) ;
    // Reference value obtained from S3 itself for this exact garbage file.
    assertThat( etag.asString() ).isEqualTo( "94b81d1e846ec106c09eabc984314008-23" ) ;
  }

  // 30_000 bytes with 10_000-byte chunks: exactly 3 parts, multipart eTag.
  @Test
  public void smallMultipart() throws Exception {
    final File file = createGarbageFile( 30_000 ) ;
    final int chunkSize = 10_000 ;
    final Etag etag = Etag.compute( Files.asByteSource( file ), chunkSize ) ;
    assertThat( etag.asString() ).isEqualTo( "056b4552c5ace587b5d62305d99e8555-3" ) ;
  }

  // Round-trip parsing of a single-part eTag string.
  @Test
  public void parseMonopart() throws Exception {
    final Etag etag = Etag.parse( "056b4552c5ace587b5d62305d99e8555" ) ;
    assertThat( etag.asString() ).isEqualTo( "056b4552c5ace587b5d62305d99e8555" ) ;
  }

  // Round-trip parsing of a multipart eTag string (MD5 plus "-<partCount>").
  @Test
  public void parseMultipart() throws Exception {
    final Etag etag = Etag.parse( "056b4552c5ace587b5d62305d99e8555-33" ) ;
    assertThat( etag.asString() ).isEqualTo( "056b4552c5ace587b5d62305d99e8555-33" ) ;
  }

  // File smaller than one chunk: plain MD5, no part suffix.
  @Test
  public void smallMonopart() throws Exception {
    final File file = createGarbageFile( 1_000 ) ;
    final int chunkSize = 10_000 ;
    final Etag etag = Etag.compute( Files.asByteSource( file ), chunkSize ) ;
    assertThat( etag.asString() ).isEqualTo( "cc24b86af8f8c18ca90703db6834f3f3" ) ;
  }

  // =======
  // Fixture
  // =======

  private static final Logger LOGGER = LoggerFactory.getLogger( EtagTest.class ) ;

  // Provides a per-test-method scratch directory for generated files
  // (see createGarbageFile below).
  @Rule
  public final MethodSupport methodSupport = new MethodSupport() { } ;

  // NOTE(review): currently unused by any test in this class.
  private byte[] createGarbageByteArray( final long length ) throws IOException {
    final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream() ;
    GarbageTools.generate( byteArrayOutputStream, length ) ;
    return byteArrayOutputStream.toByteArray() ;
  }

  // Creates a temp file of exactly fileLength bytes of deterministic garbage.
  private File createGarbageFile( final long fileLength ) throws IOException {
    final File garbageFile
        = File.createTempFile( "garbage-", ".txt", methodSupport.getDirectory() ) ;
    // garbageFile.deleteOnExit() ;
    final long start = System.currentTimeMillis() ;
    GarbageTools.generate( garbageFile, fileLength ) ;
    LOGGER.info( "Generated file of " + fileLength + " bytes: " + garbageFile.getAbsolutePath()
        + " in " + ( System.currentTimeMillis() - start ) + " ms.") ;
    return garbageFile ;
  }
}
GarbageTools.java
package io.github.caillette.s3;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import com.google.common.base.Charsets;
/**
* Generates file with deterministic garbage.
*/
/**
 * Generates files with deterministic garbage: the sequence of increasing
 * integers separated by single spaces ("0 1 2 3 ..."), truncated to the exact
 * requested byte length. Deterministic output keeps checksum tests reproducible.
 */
public final class GarbageTools {

  private GarbageTools() { }

  /**
   * Writes exactly {@code length} bytes of deterministic garbage to {@code file}.
   *
   * @param file the destination file, overwritten if it already exists.
   * @param length the exact number of bytes to write, must be >= 0.
   * @throws IOException if the file cannot be written.
   */
  public static void generate( final File file, final long length ) throws IOException {
    try(
        final FileOutputStream fileOutputStream = new FileOutputStream( file ) ;
        final OutputStream outputStream = new BufferedOutputStream( fileOutputStream )
    ) {
      generate( outputStream, length ) ;
    }
  }

  /**
   * Writes exactly {@code length} bytes of deterministic garbage to
   * {@code outputStream}. Slow but it works.
   *
   * @param outputStream the destination; not closed by this method.
   * @param length the exact number of bytes to write, must be >= 0.
   * @throws IllegalArgumentException if {@code length} is negative (the
   *     previous version threw an obscure StringIndexOutOfBoundsException).
   * @throws IOException if writing fails.
   */
  public static void generate( final OutputStream outputStream, final long length )
      throws IOException
  {
    if( length < 0 ) {
      throw new IllegalArgumentException( "length must be >= 0, got " + length ) ;
    }
    long bytesWritten = 0 ;
    long counter = 0 ;
    final StringBuilder stringBuilder = new StringBuilder() ;
    while( true ) {
      stringBuilder.append( counter ++ ).append( " " ) ;
      final int lineLength = stringBuilder.length() ;
      final boolean done = bytesWritten + lineLength >= length ;
      if( done ) {
        // Truncate the final token so the total output is exactly 'length' bytes.
        final int remainder = ( int ) ( length - bytesWritten ) ;
        stringBuilder.delete( remainder, stringBuilder.length() ) ;
      }
      // US-ASCII guarantees one byte per character, so character counts equal
      // byte counts. StandardCharsets (JDK 7+, already used by this codebase's
      // try-with-resources style) replaces the former Guava Charsets constant.
      outputStream.write( stringBuilder.toString().getBytes( StandardCharsets.US_ASCII ) ) ;
      bytesWritten += stringBuilder.length() ;
      stringBuilder.delete( 0, stringBuilder.length() ) ;
      if( done ) {
        break ;
      }
    }
  }
}
Use Md5Utils from AWS SDK For Java which already provides the md5 calculation out of box.