开发者

The MD5 from a local file and the MD5 (eTag) from S3 is not the same

I get the MD5 of a local file but it is different than the MD5 (eTag) of the "same" file in Amazon S3. What I would like to achieve is to figure out whether the latest file I have in S3 is the same one that I have locally. If I cannot compare MD5, then how should I do it?

Generating MD5 from the local file (truncated code):

MessageDigest md = MessageDigest.getInstance("MD5");
byte[] md5 = Files.getDigest(localFile, md);
String hashtext = DigestUtils.md5Hex(md5);

Retrieving MD5 (eTag) from S3 (truncated code):

ObjectListing objectListing = s3.listObjects(new ListObjectsRequest().withBucketName(bucketName));
List<S3ObjectSummary> objectSummaries = objectListing.getObjectSummaries();
for(S3ObjectSummary objectSummary : objectSummaries) {
    String MD5 = objectSummary.getETag();
}

PS: I use org.apache.commons.codec.digest.DigestUtils and com.google.common.io.Files libraries.


String hashtext = DigestUtils.md5Hex(md5);

Does calculate the MD5 of the MD5 you just calculated. See DigestUtils.md5Hex documentation.

hashtext is in fact MD5(MD5(file)) and not MD5(file).


Bruno's answer nails it, but I wanted to point out that if you want to do this without the Google Guava dependency, it's actually not that difficult (especially since/if you're already using Apache Commons)

You'd replace this:

byte[] md5 = Files.getDigest(localFile, md);

with this (using a Java 7 try-with-resources block):

// Reads the whole file through the stream and digests it; the stream is
// closed automatically by try-with-resources.
try (FileInputStream fis = new FileInputStream(localFile)) {
    byte[] md5 = DigestUtils.md5(fis);
}

This md5(InputStream) method has been in Apache Commons since version 1.4.


This is my own implementation of S3's eTag. I tested it with a large file I uploaded on S3 to get a reference value for multipart eTag.

Keep in mind that compression and Client-Side Encryption make eTag useless when it comes to check downloaded file.


Etag.java

package io.github.caillette.s3;

import com.amazonaws.services.s3.transfer.TransferManagerConfiguration;
import com.google.common.io.ByteSource;
import org.apache.commons.codec.digest.DigestUtils;

import java.io.IOException;
import java.io.InputStream;
import java.security.DigestException;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static com.google.common.base.Preconditions.checkArgument;

/**
 * Represents the
 * <a href="http://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html" >eTag</a>
 * calculated by Amazon S3.
 */
/**
 * Represents the
 * <a href="http://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html" >eTag</a>
 * calculated by Amazon S3.
 *
 * <p>An eTag is either a plain lowercase-hex MD5 (single-part upload), or
 * {@code <md5>-<partCount>} where the MD5 is computed over the concatenation of the
 * raw per-part MD5 digests (multipart upload).
 */
public final class Etag {

  /** Lowercase hex MD5, always 32 characters (enforced by the constructor). */
  private final String md5 ;

  /** Part count for a multipart eTag, or {@code null} for a single-part eTag. */
  private final Integer partNumber ;

  private static final Pattern MD5_PATTERN = Pattern.compile( "[a-f0-9]{32}" ) ;
  private static final Pattern FULL_ETAG_PATTERN
      = Pattern.compile( "(" + MD5_PATTERN.pattern() + ")(?:-([0-9]+))?" ) ;

  private Etag( final byte[] md5, final Integer partNumber ) {
    this( md5asString( md5 ), partNumber ) ;
  }

  /**
   * Converts a raw 16-byte MD5 digest into its lowercase hexadecimal representation.
   *
   * @param md5 a raw digest, must be exactly 16 bytes long.
   * @throws IllegalArgumentException if the length is not 16.
   */
  public static String md5asString( final byte[] md5 ) {
    checkArgument( md5.length == 16 ) ;
    return DigestTools.toHex( md5 );
  }

  private Etag( final String md5, final Integer partNumber ) {
    checkArgument( MD5_PATTERN.matcher( md5 ).matches() ) ;
    checkArgument( partNumber == null || partNumber > 0 ) ;
    this.md5 = md5 ;
    this.partNumber = partNumber ;
  }

  /** Returns the canonical S3 form: {@code md5} or {@code md5-partCount}. */
  public String asString() {
    return md5 + ( partNumber == null ? "" : "-" + partNumber ) ;
  }

  /**
   * Parses an eTag string as returned by S3 (without surrounding quotes).
   *
   * @param string a 32-character lowercase-hex MD5, optionally followed by
   *     {@code -<partCount>}.
   * @throws IllegalArgumentException on any other format.
   */
  public static Etag parse( final String string ) {
    final Matcher matcher = FULL_ETAG_PATTERN.matcher( string ) ;
    checkArgument( matcher.matches(), "Invalid format: " + string ) ;
    final String md5 = matcher.group( 1 ) ;
    final String partNumber = matcher.group( 2 ) ;
    return new Etag( md5, partNumber == null ? null : Integer.parseInt( partNumber ) ) ;
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "{" + asString() + "}" ;
  }

  @Override
  public boolean equals( final Object other ) {
    if( this == other ) {
      return true ;
    }
    if( other == null || getClass() != other.getClass() ) {
      return false ;
    }

    final Etag etag = ( Etag ) other ;

    if( ! md5.equals( etag.md5 ) ) {
      return false ;
    }
    if( partNumber != null ? !partNumber.equals( etag.partNumber ) : etag.partNumber != null ) {
      return false;
    }

    return true ;
  }

  @Override
  public int hashCode() {
    int result = md5.hashCode();
    result = 31 * result + ( partNumber != null ? partNumber.hashCode() : 0 ) ;
    return result;
  }


  /**
   * Minimum upload part size used by the AWS TransferManager.
   * NOTE(review): the original comment described this as "practically 5 GB";
   * the SDK default for {@code getMinimumUploadPartSize()} is 5 MB — confirm
   * against the SDK version in use.
   */
  public static final long DEFAULT_MINIMUM_UPLOAD_PART_SIZE
      = new TransferManagerConfiguration().getMinimumUploadPartSize() ;



// =======
// Compute
// =======

  /**
   * Calculates the {@link Etag} (MD5 checksum in the AWS way).
   * Content fitting in a single chunk yields the plain MD5; bigger content yields
   * the MD5 of the concatenated per-chunk MD5s, suffixed with the chunk count.
   *
   * <p>End of input is detected by the chunk digester reading zero bytes, NOT by
   * {@link InputStream#available()}: {@code available()} may return 0 while bytes
   * remain (per its contract), which would silently truncate the digest.
   * An empty source yields the MD5 of zero bytes with no part suffix.
   *
   * http://permalink.gmane.org/gmane.comp.file-systems.s3.s3tools/583
   * https://github.com/Teachnova/s3md5
   * http://stackoverflow.com/questions/12186993/what-is-the-algorithm-to-compute-the-amazon-s3-etag-for-a-file-larger-than-5gb
   *
   * @param byteSource the content to digest.
   * @param chunkSize the S3 part size in bytes, must be strictly positive.
   * @throws IllegalArgumentException if {@code chunkSize <= 0} (a non-positive size
   *     would otherwise loop forever accumulating empty digests).
   */
  public static Etag compute( final ByteSource byteSource, final int chunkSize )
      throws IOException, DigestException
  {
    checkArgument( chunkSize > 0, "chunkSize must be > 0, was: " + chunkSize ) ;
    final List< byte[] > md5s = new ArrayList<>() ;
    try( final InputStream inputStream = byteSource.openBufferedStream() ) {
      while( true ) {
        final byte[] chunkMd5 = computeChunkMd5( inputStream, chunkSize ) ;
        if( chunkMd5 == null ) {
          break ;  // Stream exhausted before this chunk started.
        }
        md5s.add( chunkMd5 ) ;
      }
    }
    if( md5s.isEmpty() ) {
      // Empty content: S3 reports the MD5 of zero bytes, without a part suffix.
      return new Etag( DigestUtils.md5( new byte[ 0 ] ), null ) ;
    } else if( md5s.size() == 1 ) {
      return new Etag( md5s.get( 0 ), null ) ;
    } else {
      final byte[] md5concatenation = new byte[ md5s.size() * 16 ] ;
      for( int i = 0 ; i < md5s.size() ; i ++ ) {
        System.arraycopy( md5s.get( i ), 0, md5concatenation, i * 16, 16 ) ;
      }
      final byte[] finalMd5 = DigestUtils.md5( md5concatenation ) ;
      return new Etag( finalMd5, md5s.size() ) ;
    }
  }

  /**
   * Digests up to {@code length} bytes from {@code inputStream}.
   *
   * @return the 16-byte MD5 of what was read, or {@code null} if the stream was
   *     already exhausted (zero bytes read) — this is how {@link #compute} detects
   *     end of input.
   */
  private static byte[] computeChunkMd5(
      final InputStream inputStream,
      final int length
  ) throws IOException {
    final MessageDigest md5Digest = DigestUtils.getMd5Digest() ;
    final byte[] buffer = new byte[ 8192 ] ;
    long totalRead = 0 ;
    while( totalRead < length ) {
      final int sizeToRead = ( int ) Math.min( buffer.length, length - totalRead ) ;
      final int read = inputStream.read( buffer, 0, sizeToRead ) ;
      if( read <= 0 ) {
        break ;  // -1 is end of stream; stop on 0 too, as the previous version did.
      }
      md5Digest.update( buffer, 0, read ) ;
      totalRead += read ;
    }
    return totalRead == 0 ? null : md5Digest.digest() ;
  }

  /**
   * Digests up to {@code length} bytes from {@code inputStream}.
   * Kept with its original signature for package-internal compatibility; an
   * exhausted stream yields the digest of zero bytes.
   */
  /*package*/ static byte[] computeMd5(
      final InputStream inputStream,
      final int length
  ) throws IOException, DigestException {
    final byte[] md5 = computeChunkMd5( inputStream, length ) ;
    return md5 == null ? DigestUtils.getMd5Digest().digest() : md5 ;
  }
}

EtagTest.java

package io.github.caillette.s3;

import com.google.common.io.Files;
import org.apache.commons.codec.digest.DigestUtils;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.novelang.testing.junit.MethodSupport;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;

import static org.assertj.core.api.Assertions.assertThat ;

/**
 * Tests {@link Etag} computation and parsing against reference values, some of
 * them obtained from S3 itself.
 */
public class EtagTest {

  /**
   * This test checks equality with an eTag calculated by S3 itself.
   * To trigger multipart upload (which causes the special eTag calculation),
   * generate a garbage file with a size of 120_000_000 bytes and upload it with
   * {@code TransferManagerConfiguration#setMultipartCopyThreshold} set
   * to 115343360 bytes (110 MiB).
   */
  @Test
  public void bigMultipart() throws Exception {
    final File file = createGarbageFile( 120_000_000 ) ;
    final int chunkSize = 5 * 1024 * 1024 ;
    final long start = System.currentTimeMillis() ;
    final Etag etag = Etag.compute( Files.asByteSource( file ), chunkSize ) ;
    LOGGER.info( "Calculated " + etag + " in " + ( System.currentTimeMillis() - start ) + " ms." ) ;
    // Reference value obtained from S3 for the same deterministic garbage content.
    assertThat( etag.asString() ).isEqualTo( "94b81d1e846ec106c09eabc984314008-23" ) ;
  }

  /** 30 000 bytes in 10 000-byte chunks: expects a 3-part eTag. */
  @Test
  public void smallMultipart() throws Exception {
    final File file = createGarbageFile( 30_000 ) ;
    final int chunkSize = 10_000 ;
    final Etag etag = Etag.compute( Files.asByteSource( file ), chunkSize ) ;
    assertThat( etag.asString() ).isEqualTo( "056b4552c5ace587b5d62305d99e8555-3" ) ;
  }

  @Test
  public void parseMonopart() throws Exception {
    final Etag etag = Etag.parse( "056b4552c5ace587b5d62305d99e8555" ) ;
    assertThat( etag.asString() ).isEqualTo( "056b4552c5ace587b5d62305d99e8555" ) ;
  }

  @Test
  public void parseMultipart() throws Exception {
    final Etag etag = Etag.parse( "056b4552c5ace587b5d62305d99e8555-33" ) ;
    assertThat( etag.asString() ).isEqualTo( "056b4552c5ace587b5d62305d99e8555-33" ) ;
  }

  /** Content smaller than the chunk size: expects a plain MD5 with no part suffix. */
  @Test
  public void smallMonopart() throws Exception {
    final File file = createGarbageFile( 1_000 ) ;
    final int chunkSize = 10_000 ;
    final Etag etag = Etag.compute( Files.asByteSource( file ), chunkSize ) ;
    assertThat( etag.asString() ).isEqualTo( "cc24b86af8f8c18ca90703db6834f3f3" ) ;
  }


// =======
// Fixture
// =======

  private static final Logger LOGGER = LoggerFactory.getLogger( EtagTest.class ) ;

  @Rule
  public final MethodSupport methodSupport = new MethodSupport() { } ;

  /**
   * Creates a temporary file of exactly {@code fileLength} bytes of deterministic
   * garbage inside the test method's own directory.
   * The file is deliberately not deleted on exit so it can be inspected (and
   * uploaded to S3 to produce reference eTags) after the run.
   */
  private File createGarbageFile( final long fileLength ) throws IOException {
    final File garbageFile
        = File.createTempFile( "garbage-", ".txt", methodSupport.getDirectory() )  ;
    final long start = System.currentTimeMillis() ;
    GarbageTools.generate( garbageFile, fileLength ) ;
    LOGGER.info( "Generated file of " + fileLength + " bytes: " + garbageFile.getAbsolutePath()
        + " in " + ( System.currentTimeMillis() - start ) + " ms.") ;
    return garbageFile ;
  }

}

GarbageTools.java

package io.github.caillette.s3;

import com.google.common.base.Charsets;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

/**
 * Generates file with deterministic garbage.
 */
/**
 * Generates files with deterministic garbage: the decimal representations of
 * increasing integers separated by single spaces ("0 1 2 3 …"), truncated to the
 * exact requested byte count. Deterministic content makes checksums reproducible
 * across runs.
 */
public final class GarbageTools {

  private GarbageTools() { }

  /**
   * Writes exactly {@code length} bytes of deterministic garbage into {@code file},
   * overwriting any previous content.
   *
   * @param file the destination file.
   * @param length the exact number of bytes to write.
   * @throws IOException if writing fails.
   */
  public static void generate( final File file, final long length ) throws IOException {
    try(
        final FileOutputStream fileOutputStream = new FileOutputStream( file ) ;
        final OutputStream outputStream = new BufferedOutputStream( fileOutputStream )
    ) {
      generate( outputStream, length ) ;
    }
  }

  /**
   * Writes exactly {@code length} bytes of deterministic garbage to
   * {@code outputStream}, which is left open.
   * Slow but it works.
   *
   * @param outputStream the destination stream.
   * @param length the exact number of bytes to write.
   * @throws IOException if writing fails.
   */
  public static void generate( final OutputStream outputStream, final long length )
      throws IOException
  {
    long bytesWritten = 0 ;
    long counter = 0 ;
    final StringBuilder stringBuilder = new StringBuilder() ;
    while( true ) {
      stringBuilder.append( counter ++ ).append( " " ) ;
      // US-ASCII guarantees one byte per char, so char count equals byte count.
      final int lineLength = stringBuilder.length() ;
      final boolean done = bytesWritten + lineLength >= length ;
      if( done ) {
        // Truncate the last token so the total is exactly `length` bytes.
        final int remainder = ( int ) ( length - bytesWritten ) ;
        stringBuilder.setLength( remainder ) ;
      }
      outputStream.write( stringBuilder.toString().getBytes( StandardCharsets.US_ASCII ) ) ;
      bytesWritten += stringBuilder.length() ;
      stringBuilder.setLength( 0 ) ;
      if( done ) {
        break ;
      }
    }
  }
}


Use Md5Utils from the AWS SDK for Java, which already provides the MD5 calculation out of the box.

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜