Recursive diff of two dictionaries (keys and values)?
So I have a python dictionary, call it d1
, and a version of that dictionary at a later point in time, call it d2
. I want to find all the changes between d1
and d2
. In other words, everything that was added, removed or changed. The tricky bit is that the values can be ints, strings, lists, or dicts, so it needs to be recursive. This is what I 开发者_如何转开发have so far:
def dd(d1, d2, ctx=""):
print "Changes in " + ctx
for k in d1:
if k not in d2:
print k + " removed from d2"
for k in d2:
if k not in d1:
print k + " added in d2"
continue
if d2[k] != d1[k]:
if type(d2[k]) not in (dict, list):
print k + " changed in d2 to " + str(d2[k])
else:
if type(d1[k]) != type(d2[k]):
print k + " changed to " + str(d2[k])
continue
else:
if type(d2[k]) == dict:
dd(d1[k], d2[k], k)
continue
print "Done with changes in " + ctx
return
It works just fine unless the value is a list. I cant quite come up with an elegant way to deal with lists, without having a huge, slightly changed version of this function repeated after a if(type(d2) == list)
.
Any thoughts?
EDIT: This differs from this post because the keys can change
In case you want the difference recursively, I have written a package for python: https://github.com/seperman/deepdiff
Installation
Install from PyPi:
pip install deepdiff
Example usage
Importing
>>> from deepdiff import DeepDiff
>>> from pprint import pprint
>>> from __future__ import print_function # In case running on Python 2
Same object returns empty
>>> t1 = {1:1, 2:2, 3:3}
>>> t2 = t1
>>> print(DeepDiff(t1, t2))
{}
Type of an item has changed
>>> t1 = {1:1, 2:2, 3:3}
>>> t2 = {1:1, 2:"2", 3:3}
>>> pprint(DeepDiff(t1, t2), indent=2)
{ 'type_changes': { 'root[2]': { 'newtype': <class 'str'>,
'newvalue': '2',
'oldtype': <class 'int'>,
'oldvalue': 2}}}
Value of an item has changed
>>> t1 = {1:1, 2:2, 3:3}
>>> t2 = {1:1, 2:4, 3:3}
>>> pprint(DeepDiff(t1, t2), indent=2)
{'values_changed': {'root[2]': {'newvalue': 4, 'oldvalue': 2}}}
Item added and/or removed
>>> t1 = {1:1, 2:2, 3:3, 4:4}
>>> t2 = {1:1, 2:4, 3:3, 5:5, 6:6}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff)
{'dic_item_added': ['root[5]', 'root[6]'],
'dic_item_removed': ['root[4]'],
'values_changed': {'root[2]': {'newvalue': 4, 'oldvalue': 2}}}
String difference
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world"}}
>>> t2 = {1:1, 2:4, 3:3, 4:{"a":"hello", "b":"world!"}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff, indent = 2)
{ 'values_changed': { 'root[2]': {'newvalue': 4, 'oldvalue': 2},
"root[4]['b']": { 'newvalue': 'world!',
'oldvalue': 'world'}}}
String difference 2
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world!\nGoodbye!\n1\n2\nEnd"}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world\n1\n2\nEnd"}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff, indent = 2)
{ 'values_changed': { "root[4]['b']": { 'diff': '--- \n'
'+++ \n'
'@@ -1,5 +1,4 @@\n'
'-world!\n'
'-Goodbye!\n'
'+world\n'
' 1\n'
' 2\n'
' End',
'newvalue': 'world\n1\n2\nEnd',
'oldvalue': 'world!\n'
'Goodbye!\n'
'1\n'
'2\n'
'End'}}}
>>>
>>> print (ddiff['values_changed']["root[4]['b']"]["diff"])
---
+++
@@ -1,5 +1,4 @@
-world!
-Goodbye!
+world
1
2
End
Type change
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world\n\n\nEnd"}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff, indent = 2)
{ 'type_changes': { "root[4]['b']": { 'newtype': <class 'str'>,
'newvalue': 'world\n\n\nEnd',
'oldtype': <class 'list'>,
'oldvalue': [1, 2, 3]}}}
List difference
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3, 4]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2]}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff, indent = 2)
{'iterable_item_removed': {"root[4]['b'][2]": 3, "root[4]['b'][3]": 4}}
List difference 2:
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 3, 2, 3]}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff, indent = 2)
{ 'iterable_item_added': {"root[4]['b'][3]": 3},
'values_changed': { "root[4]['b'][1]": {'newvalue': 3, 'oldvalue': 2},
"root[4]['b'][2]": {'newvalue': 2, 'oldvalue': 3}}}
List difference ignoring order or duplicates: (with the same dictionaries as above)
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 3, 2, 3]}}
>>> ddiff = DeepDiff(t1, t2, ignore_order=True)
>>> print (ddiff)
{}
List that contains dictionary:
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, {1:1, 2:2}]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, {1:3}]}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff, indent = 2)
{ 'dic_item_removed': ["root[4]['b'][2][2]"],
'values_changed': {"root[4]['b'][2][1]": {'newvalue': 3, 'oldvalue': 1}}}
Sets:
>>> t1 = {1, 2, 8}
>>> t2 = {1, 2, 3, 5}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (DeepDiff(t1, t2))
{'set_item_added': ['root[3]', 'root[5]'], 'set_item_removed': ['root[8]']}
Named Tuples:
>>> from collections import namedtuple
>>> Point = namedtuple('Point', ['x', 'y'])
>>> t1 = Point(x=11, y=22)
>>> t2 = Point(x=11, y=23)
>>> pprint (DeepDiff(t1, t2))
{'values_changed': {'root.y': {'newvalue': 23, 'oldvalue': 22}}}
Custom objects:
>>> class ClassA(object):
... a = 1
... def __init__(self, b):
... self.b = b
...
>>> t1 = ClassA(1)
>>> t2 = ClassA(2)
>>>
>>> pprint(DeepDiff(t1, t2))
{'values_changed': {'root.b': {'newvalue': 2, 'oldvalue': 1}}}
Object attribute added:
>>> t2.c = "new attribute"
>>> pprint(DeepDiff(t1, t2))
{'attribute_added': ['root.c'],
'values_changed': {'root.b': {'newvalue': 2, 'oldvalue': 1}}}
Here's an implementation inspired by Winston Ewert
def recursive_compare(d1, d2, level='root'):
if isinstance(d1, dict) and isinstance(d2, dict):
if d1.keys() != d2.keys():
s1 = set(d1.keys())
s2 = set(d2.keys())
print('{:<20} + {} - {}'.format(level, s1-s2, s2-s1))
common_keys = s1 & s2
else:
common_keys = set(d1.keys())
for k in common_keys:
recursive_compare(d1[k], d2[k], level='{}.{}'.format(level, k))
elif isinstance(d1, list) and isinstance(d2, list):
if len(d1) != len(d2):
print('{:<20} len1={}; len2={}'.format(level, len(d1), len(d2)))
common_len = min(len(d1), len(d2))
for i in range(common_len):
recursive_compare(d1[i], d2[i], level='{}[{}]'.format(level, i))
else:
if d1 != d2:
print('{:<20} {} != {}'.format(level, d1, d2))
if __name__ == '__main__':
d1={'a':[0,2,3,8], 'b':0, 'd':{'da':7, 'db':[99,88]}}
d2={'a':[0,2,4], 'c':0, 'd':{'da':3, 'db':7}}
recursive_compare(d1, d2)
will return:
root + {'b'} - {'c'}
root.a len1=4; len2=3
root.a[2] 3 != 4
root.d.db [99, 88] != 7
root.d.da 7 != 3
One option would be to convert any lists you run into as dictionaries with the index as a key. For example:
# add this function to the same module
def list_to_dict(l):
return dict(zip(map(str, range(len(l))), l))
# add this code under the 'if type(d2[k]) == dict' block
elif type(d2[k]) == list:
dd(list_to_dict(d1[k]), list_to_dict(d2[k]), k)
Here is the output with the sample dictionaries you gave in comments:
>>> d1 = {"name":"Joe", "Pets":[{"name":"spot", "species":"dog"}]}
>>> d2 = {"name":"Joe", "Pets":[{"name":"spot", "species":"cat"}]}
>>> dd(d1, d2, "base")
Changes in base
Changes in Pets
Changes in 0
species changed in d2 to cat
Done with changes in 0
Done with changes in Pets
Done with changes in base
Note that this will compare index by index, so it will need some modification to work well for list items being added or removed.
Just a thought: You could try an object-oriented approach where you derive your own dictionary class that keeps track of any changes made to it (and reports them). Seems like this might have many advantages over trying to compare two dicts...one is noted at the end.
To show how that might be done, here's a reasonably complete and minimally tested sample implementation which should work with both Python 2 and 3:
import sys
_NUL = object() # unique object
if sys.version_info[0] > 2:
def iterkeys(d, **kw):
return iter(d.keys(**kw))
else:
def iterkeys(d, **kw):
return d.iterkeys(**kw)
class TrackingDict(dict):
""" Dict subclass which tracks all changes in a _changelist attribute. """
def __init__(self, *args, **kwargs):
super(TrackingDict, self).__init__(*args, **kwargs)
self.clear_changelist()
for key in sorted(iterkeys(self)):
self._changelist.append(AddKey(key, self[key]))
def clear_changelist(self): # additional public method
self._changelist = []
def __setitem__(self, key, value):
modtype = ChangeKey if key in self else AddKey
super(TrackingDict, self).__setitem__(key, value)
self._changelist.append(modtype(key, self[key]))
def __delitem__(self, key):
super(TrackingDict, self).__delitem__(key)
self._changelist.append(RemoveKey(key))
def clear(self):
deletedkeys = self.keys()
super(TrackingDict, self).clear()
for key in sorted(deletedkeys):
self._changelist.append(RemoveKey(key))
def update(self, other=_NUL):
if other is not _NUL:
otherdict = dict(other) # convert to dict if necessary
changedkeys = set(k for k in otherdict if k in self)
super(TrackingDict, self).update(other)
for key in sorted(iterkeys(otherdict)):
if key in changedkeys:
self._changelist.append(ChangeKey(key, otherdict[key]))
else:
self._changelist.append(AddKey(key, otherdict[key]))
def setdefault(self, key, default=None):
if key not in self:
self[key] = default # will append an AddKey to _changelist
return self[key]
def pop(self, key, default=_NUL):
if key in self:
ret = self[key] # save value
self.__delitem__(key)
return ret
elif default is not _NUL: # default specified
return default
else: # not there & no default
self[key] # allow KeyError to be raised
def popitem(self):
key, value = super(TrackingDict, self).popitem()
self._changelist.append(RemoveKey(key))
return key, value
# change-tracking record classes
class DictMutator(object):
def __init__(self, key, value=_NUL):
self.key = key
self.value = value
def __repr__(self):
return '%s(%r%s)' % (self.__class__.__name__, self.key,
'' if self.value is _NUL else ': '+repr(self.value))
class AddKey(DictMutator): pass
class ChangeKey(DictMutator): pass
class RemoveKey(DictMutator): pass
if __name__ == '__main__':
import traceback
import sys
td = TrackingDict({'one': 1, 'two': 2})
print('changelist: {}'.format(td._changelist))
td['three'] = 3
print('changelist: {}'.format(td._changelist))
td['two'] = -2
print('changelist: {}'.format(td._changelist))
td.clear()
print('changelist: {}'.format(td._changelist))
td.clear_changelist()
td['newkey'] = 42
print('changelist: {}'.format(td._changelist))
td.setdefault('another') # default None value
print('changelist: {}'.format(td._changelist))
td.setdefault('one more', 43)
print('changelist: {}'.format(td._changelist))
td.update(zip(('another', 'one', 'two'), (17, 1, 2)))
print('changelist: {}'.format(td._changelist))
td.pop('newkey')
print('changelist: {}'.format(td._changelist))
try:
td.pop("won't find")
except KeyError:
print("KeyError as expected:")
traceback.print_exc(file=sys.stdout)
print('...and no change to _changelist:')
print('changelist: {}'.format(td._changelist))
td.clear_changelist()
while td:
td.popitem()
print('changelist: {}'.format(td._changelist))
Note that unlike a simple comparison of the before and after state of a dictionary, this class will tell you about keys which were added and then deleted—in other words, it keeps a complete history until its _changelist
is cleared.
Output:
changelist: [AddKey('one': 1), AddKey('two': 2)]
changelist: [AddKey('one': 1), AddKey('two': 2), AddKey('three': 3)]
changelist: [AddKey('one': 1), AddKey('two': 2), AddKey('three': 3), ChangeKey('two': -2)]
changelist: [AddKey('one': 1), AddKey('two': 2), AddKey('three': 3), ChangeKey('two': -2), RemoveKey('one'), RemoveKey('three'), RemoveKey('two')]
changelist: [AddKey('newkey': 42)]
changelist: [AddKey('newkey': 42), AddKey('another': None)]
changelist: [AddKey('newkey': 42), AddKey('another': None), AddKey('one more': 43)]
changelist: [AddKey('newkey': 42), AddKey('another': None), AddKey('one more': 43), ChangeKey('another': 17), AddKey('one': 1), AddKey('two': 2)]
changelist: [AddKey('newkey': 42), AddKey('another': None), AddKey('one more': 43), ChangeKey('another': 17), AddKey('one': 1), AddKey('two': 2), RemoveKey('newkey')]
KeyError as expected:
Traceback (most recent call last):
File "trackingdict.py", line 122, in <module>
td.pop("won't find")
File "trackingdict.py", line 67, in pop
self[key] # allow KeyError to be raised
KeyError: "won't find"
...and no change to _changelist:
changelist: [AddKey('newkey': 42), AddKey('another': None), AddKey('one more': 43), ChangeKey('another': 17), AddKey('one': 1), AddKey('two': 2), RemoveKey('newkey')]
changelist: [RemoveKey('one'), RemoveKey('two'), RemoveKey('another'), RemoveKey('one more')]
Your function should begin by checking the type of its arguments, write the function so that it can handle lists, dictionaries, ints, and strings. That way you don't have to duplicate anything, you just call recursively.
Psuedocode:
def compare(d1, d2):
if d1 and d2 are dicts
compare the keys, pass values to compare
if d1 and d2 are lists
compare the lists, pass values to compare
if d1 and d2 are strings/ints
compare them
As suggested by Serge I found this solution helpful to get a quick boolean return on whether two dictionaries match "all the way down":
import json
def match(d1, d2):
return json.dumps(d1, sort_keys=True) == json.dumps(d2, sort_keys=True)
Consider using hasattr(obj, '__iter__')
as you recurse through the object. If an object implements the __iter__
method you know you can iterate over it.
It is fun to do something yourselves to practice and learn, yet I find that for non-trivial tasks, the ready and maintained packages often work better.
Consider convert to json and use some decent "semantic" json comparator say https://www.npmjs.com/package/compare-json or online http://jsondiff.com. Would need stringify number key.
If you can try translate jsondiff to python if you really need.
Conversion from JavaScript to Python code?
You can try the following simple implementation
def recursive_compare(obj1, obj2):
""" Compare python objects recursively, support type:
"int, float, long, basestring, set, datetime, date, dict, Sequence"
Example:
>>> recursive_compare([1, 2, 3], [1, 2, 3])
>>> True
>>> recursive_compare([1, 2, 3], [1, 2, 4])
>>> False
>>> recursive_compare({'a': 1}, {'a': 2})
>>> False
"""
def _diff(obj1, obj2):
# exclude type basestring for backward-compatible python2:
# <str, unicode>
if type(obj1) != type(obj2) and not isinstance(obj1, basestring):
return False
elif isinstance(obj1,
(int, float, long, basestring, set, datetime, date)):
if obj1 != obj2:
return False
elif isinstance(obj1, dict):
keys = obj1.viewkeys() & obj2.viewkeys()
if obj1 and len(keys) == 0 \
or keys.difference(set(obj1.keys())) \
or keys.difference(set(obj2.keys())):
return False
for k in keys:
if _diff(obj1[k], obj2[k]) is False:
return False
elif isinstance(obj1, collections.Sequence):
# require sorted sequence object
if len(obj1) != len(obj2):
return False
for i in range(len(obj1)):
if _diff(obj1[i], obj2[i]) is False:
return False
else:
raise TypeError('do not support type {} to compare'.format(
type(obj1)))
return False if _diff(obj1, obj2) is False else True
Here's a sample, which can be easily extended to handle other python data types too:
def deep_compare(a, b) -> bool:
if type(a) is not type(b): return False
if type(a) is dict:
if not deep_compare(list(a.keys()), list(b.keys())): return False
if not deep_compare(list(a.values()), list(b.values())): return False
elif isinstance(a, (list, tuple, set)):
for a_i, b_i in zip(a, b):
if not deep_compare(a_i, b_i): return False
else: # scalar, bool, str
if a != b: return False
return True
精彩评论