compute crc of file in python

2022-12-12 03:18 问答作者：

I want to calculate the CRC of file and get output like: E45A12AC. Here's my code:

#!/usr/bin/env python 
import os, sys
import zlib

def crc(fileName):
    fd = open(fileName,"rb")
    content = fd.readlines()
    fd.close()
    for eachLine in content:
        zlib.crc32(eachLine)

for eachFile in sys.argv[1:]:
    crc(eachFile)

This calculates the CRC for each line, but its output (e.g. -1767935985) is not what I want.

Hashlib works the way I want, but it computes the md5:

import hashlib
m = hashlib.md5()
for line in open('data.txt', 'rb'):
    m.update(line)
print m.hexdigest()

Is it possible to get something similar usin开发者_C百科g zlib.crc32?

A little more compact and optimized code

def crc(fileName):
    prev = 0
    for eachLine in open(fileName,"rb"):
        prev = zlib.crc32(eachLine, prev)
    return "%X"%(prev & 0xFFFFFFFF)

PS2: Old PS is deprecated - therefore deleted -, because of the suggestion in the comment. Thank you. I don't get, how I missed this, but it was really good.

A modified version of kobor42's answer, with performance improved by a factor 2-3 by reading fixed size chunks instead of "lines":

import zlib

def crc32(fileName):
    with open(fileName, 'rb') as fh:
        hash = 0
        while True:
            s = fh.read(65536)
            if not s:
                break
            hash = zlib.crc32(s, hash)
        return "%08X" % (hash & 0xFFFFFFFF)

Also includes leading zeroes in the returned string.

hashlib-compatible interface for CRC-32 support:

import zlib

class crc32(object):
    name = 'crc32'
    digest_size = 4
    block_size = 1

    def __init__(self, arg=''):
        self.__digest = 0
        self.update(arg)

    def copy(self):
        copy = super(self.__class__, self).__new__(self.__class__)
        copy.__digest = self.__digest
        return copy

    def digest(self):
        return self.__digest

    def hexdigest(self):
        return '{:08x}'.format(self.__digest)

    def update(self, arg):
        self.__digest = zlib.crc32(arg, self.__digest) & 0xffffffff

# Now you can define hashlib.crc32 = crc32
import hashlib
hashlib.crc32 = crc32

# Python > 2.7: hashlib.algorithms += ('crc32',)
# Python > 3.2: hashlib.algorithms_available.add('crc32')

To show any integer's lowest 32 bits as 8 hexadecimal digits, without sign, you can "mask" the value by bit-and'ing it with a mask made of 32 bits all at value 1, then apply formatting. I.e.:

>>> x = -1767935985
>>> format(x & 0xFFFFFFFF, '08x')
'969f700f'

It's quite irrelevant whether the integer you are thus formatting comes from zlib.crc32 or any other computation whatsoever.

Python 3.8+ (using the walrus operator):

import zlib

def crc32(filename, chunksize=65536):
    """Compute the CRC-32 checksum of the contents of the given filename"""
    with open(filename, "rb") as f:
        checksum = 0
        while (chunk := f.read(chunksize)) :
            checksum = zlib.crc32(chunk, checksum)
        return checksum

chunksize is how many bytes to read from the file at a time. You will get the same CRC for the same file no matter what you set chunksize to (it has to be > 0), but setting it too low might make your code slow, too high might use too much memory.

The result is a 32 bit integer. The CRC-32 checksum of an empty file is 0.

Edited to include Altren's solution below.

A modified and more compact version of CrouZ's answer, with a slightly improved performance, using a for loop and file buffering:

def forLoopCrc(fpath):
    """With for loop and buffer."""
    crc = 0
    with open(fpath, 'rb', 65536) as ins:
        for x in range(int((os.stat(fpath).st_size / 65536)) + 1):
            crc = zlib.crc32(ins.read(65536), crc)
    return '%08X' % (crc & 0xFFFFFFFF)

Results, in a 6700k, HDD:

(Note: Retested multiple times and it was consistently faster.)

Warming up the machine...
Finished.

Beginning tests...
File size: 90288KB
Test cycles: 500

With for loop and buffer.
Result 45.24728019630359 

CrouZ solution
Result 45.433838356097894 

kobor42 solution
Result 104.16215688703986 

Altren solution
Result 101.7247863946586

Tested in Python 3.6.4 x64 using the script below:

import os, timeit, zlib, random, binascii

def forLoopCrc(fpath):
    """With for loop and buffer."""
    crc = 0
    with open(fpath, 'rb', 65536) as ins:
        for x in range(int((os.stat(fpath).st_size / 65536)) + 1):
            crc = zlib.crc32(ins.read(65536), crc)
    return '%08X' % (crc & 0xFFFFFFFF)

def crc32(fileName):
    """CrouZ solution"""
    with open(fileName, 'rb') as fh:
        hash = 0
        while True:
            s = fh.read(65536)
            if not s:
                break
            hash = zlib.crc32(s, hash)
        return "%08X" % (hash & 0xFFFFFFFF)

def crc(fileName):
    """kobor42 solution"""
    prev = 0
    for eachLine in open(fileName,"rb"):
        prev = zlib.crc32(eachLine, prev)
    return "%X"%(prev & 0xFFFFFFFF)

def crc32altren(filename):
    """Altren solution"""
    buf = open(filename,'rb').read()
    hash = binascii.crc32(buf) & 0xFFFFFFFF
    return "%08X" % hash

fpath = r'D:\test\test.dat'
tests = {forLoopCrc: 'With for loop and buffer.', 
     crc32: 'CrouZ solution', crc: 'kobor42 solution',
         crc32altren: 'Altren solution'}
count = 500

# CPU, HDD warmup
randomItm = [x for x in tests.keys()]
random.shuffle(randomItm)
print('\nWarming up the machine...')
for c in range(count):
    randomItm[0](fpath)
print('Finished.\n')

# Begin test
print('Beginning tests...\nFile size: %dKB\nTest cycles: %d\n' % (
    os.stat(fpath).st_size/1024, count))
for x in tests:
    print(tests[x])
    start_time = timeit.default_timer()
    for c in range(count):
        x(fpath)
    print('Result', timeit.default_timer() - start_time, '\n')

It is faster because for loops are faster than while loops (sources: here and here).

Merge the above 2 codes as below:

try:
    fd = open(decompressedFile,"rb")
except IOError:
    logging.error("Unable to open the file in readmode:" + decompressedFile)
    return 4
eachLine = fd.readline()
prev = 0
while eachLine:
    prev = zlib.crc32(eachLine, prev)
    eachLine = fd.readline()
fd.close()

There is faster and more compact way to compute CRC using binascii:

import binascii

def crc32(filename):
    buf = open(filename,'rb').read()
    hash = binascii.crc32(buf) & 0xFFFFFFFF
    return "%08X" % hash

You can use base64 for getting out like [ERD45FTR]. And zlib.crc32 provides update options.

import os, sys
import zlib
import base64

def crc(fileName):
  fd = open(fileName,"rb")
  content = fd.readlines()
  fd.close()
  prev = None
  for eachLine in content:
   if not prev:
     prev = zlib.crc32(eachLine)
   else:
     prev = zlib.crc32(eachLine, prev)
  return prev

for eachFile in sys.argv[1:]:
  print base64.b64encode(str(crc(eachFile)))

solution:

import os, sys
import zlib

def crc(fileName, excludeLine="", includeLine=""):
  try:
        fd = open(fileName,"rb")
  except IOError:
        print "Unable to open the file in readmode:", filename
        return
  eachLine = fd.readline()
  prev = None
  while eachLine:
      if excludeLine and eachLine.startswith(excludeLine):
            continue   
      if not prev:
        prev = zlib.crc32(eachLine)
      else:
        prev = zlib.crc32(eachLine, prev)
      eachLine = fd.readline()
  fd.close()    
  return format(prev & 0xFFFFFFFF, '08x') #returns 8 digits crc

for eachFile in sys.argv[1:]:
    print crc(eachFile)

don't realy know for what is (excludeLine="", includeLine="")...

继续阅读：crc hash python

compute crc of file in python

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？