Remove certain consecutive duplicates in list
I have a list of strings like this:
['**', 'foo', '*', 'bar', 'bar', '**', '**', 'baz']
I want to replace the '**', '**'
with a single '**'
, but leave 'bar', 'bar'
intact. I.e. replace any consecutive number of '**'
with a single one. My curren开发者_如何学Ct code looks like this:
p = ['**', 'foo', '*', 'bar', 'bar', '**', '**', 'baz']
np = [p[0]]
for pi in range(1,len(p)):
if p[pi] == '**' and np[-1] == '**':
continue
np.append(p[pi])
Is there any more pythonic way to do this?
Not sure about pythonic, but this should work and is more terse:
star_list = ['**', 'foo', '*', 'bar', 'bar', '**', '**', 'baz']
star_list = [i for i, next_i in zip(star_list, star_list[1:] + [None])
if (i, next_i) != ('**', '**')]
The above copies the list twice; if you want to avoid that, consider Tom Zych's method. Or, you could do as follows:
from itertools import islice, izip, chain
star_list = ['**', 'foo', '*', 'bar', 'bar', '**', '**', 'baz']
sl_shift = chain(islice(star_list, 1, None), [None])
star_list = [i for i, next_i in izip(star_list, sl_shift)
if (i, next_i) != ('**', '**')]
This can be generalized and made iterator-friendly -- not to mention more readable -- using a variation on the pairwise
recipe from the itertools
docs:
from itertools import islice, izip, chain, tee
def compress(seq, x):
seq, shift = tee(seq)
shift = chain(islice(shift, 1, None), (object(),))
return (i for i, j in izip(seq, shift) if (i, j) != (x, x))
Tested:
>>> list(compress(star_list, '**'))
['**', 'foo', '*', 'bar', 'bar', '**', 'baz']
This is in my opinion pythonic
result = [v for i, v in enumerate(L) if L[i:i+2] != ["**", "**"]]
the only "trickery" being used is that L[i:i+2]
is a list of one element when i == len(L)-1
.
Note that of course the very same expression can also be used as a generator
This works. Not sure how Pythonic it is.
import itertools
p = ['**', 'foo', '*', 'bar', 'bar', '**', '**', 'baz']
q = []
for key, iter in itertools.groupby(p):
q.extend([key] * (1 if key == '**' else len(list(iter))))
print(q)
from itertools import groupby
p = ['**', 'foo', '*', 'bar', 'bar', '**', '**', 'baz']
keep = set(['foo', 'bar', 'baz'])
result = []
for k, g in groupby(p):
if k in keep:
result.extend(list(g))
else:
result.append(k)
A generalised "pythonic" solution that works with any iterable (no backing up, no copying, no indexing, no slicing, doesn't fail if iterable is empty) and any thing-to-squeeze (including None):
>>> test = ['**', 'foo', '*', 'bar', 'bar', '**', '**', '**', 'baz', '**', '**',
... 'foo', '*','*', 'bar', 'bar','bar', '**', '**','foo','bar',]
>>>
>>> def squeeze(iterable, victim, _dummy=object()):
... previous = _dummy
... for item in iterable:
... if item == victim == previous: continue
... previous = item
... yield item
...
>>> print test
['**', 'foo', '*', 'bar', 'bar', '**', '**', '**', 'baz', '**', '**', 'foo', '*'
, '*', 'bar', 'bar', 'bar', '**', '**', 'foo', 'bar']
>>> print list(squeeze(test, "**"))
['**', 'foo', '*', 'bar', 'bar', '**', 'baz', '**', 'foo', '*', '*', 'bar', 'bar
', 'bar', '**', 'foo', 'bar']
>>> print list(squeeze(["**"], "**"))
['**']
>>> print list(squeeze(["**", "**"], "**"))
['**']
>>> print list(squeeze([], "**"))
[]
>>>
Update for the edification of @eyquem who stated that victim
could not be a sequence (or, presumably a set).
Having a container of victims means that there are two possible semantics:
>>> def squeeze2(iterable, victims, _dummy=object()):
... previous = _dummy
... for item in iterable:
... if item == previous in victims: continue
... previous = item
... yield item
...
>>> def squeeze3(iterable, victims, _dummy=object()):
... previous = _dummy
... for item in iterable:
... if item in victims and previous in victims: continue
... previous = item
... yield item
...
>>> guff = "c...d..e.f,,,g,,h,i.,.,.,.j"
>>> print "".join(squeeze2(guff, ".,"))
c.d.e.f,g,h,i.,.,.,.j
>>> print "".join(squeeze3(guff, ".,"))
c.d.e.f,g,h,i.j
>>>
A solution without itertools.groupby()
:
p = ['**', 'foo', '*', 'bar', 'bar', '**', '**', '**', 'baz', '**', '**',
'foo', '*','*', 'bar', 'bar','bar', '**', '**','foo','bar',]
def treat(A):
prec = A[0]; yield prec
for x in A[1:]:
if (prec,x)!=('**','**'): yield x
prec = x
print p
print
print list(treat(p))
result
['**', 'foo', '*', 'bar', 'bar', '**', '**', '**',
'baz', '**', '**',
'foo', '*', '*', 'bar', 'bar','bar', '**', '**',
'foo', 'bar']
['**', 'foo', '*', 'bar', 'bar', '**',
'baz', '**',
'foo', '*', '*', 'bar', 'bar', 'bar', '**',
'foo', 'bar']
Another solution, inspired from dugres
from itertools import groupby
p = ['**', 'foo', '*', 'bar', 'bar', '**', '**', '**', 'baz', '**', '**',
'foo', '*','*', 'bar', 'bar','bar', '**', '**','foo','bar',]
res = []
for k, g in groupby(p):
res.extend( ['**'] if k=='**' else list(g) )
print res
It's like Tom Zych's solution, but simpler
.
EDIT
p = ['**','**', 'foo', '*', 'bar', 'bar', '**', '**', '**', 'baz', '**', '**',
'foo', '*','*', 'bar', 'bar','bar', '**', '**','foo','bar', '**', '**', '**']
q= ['**',12,'**',45, 'foo',78, '*',751, 'bar',4789, 'bar',3, '**', 5,'**',7, '**',
73,'baz',4, '**',8, '**',20,'foo', 8,'*',36,'*', 36,'bar', 11,'bar',0,'bar',9,
'**', 78,'**',21,'foo',27,'bar',355, '**',33, '**',37, '**','end']
def treat(B,dedupl):
B = iter(B)
prec = B.next(); yield prec
for x in B:
if not(prec==x==dedupl): yield x
prec = x
print 'gen = ( x for x in q[::2])'
gen = ( x for x in q[::2])
print 'list(gen)==p is ',list(gen)==p
gen = ( x for x in q[::2])
print 'list(treat(gen)==',list(treat(gen,'**'))
ch = '??h4i4???4t4y?45l????hmo4j5???'
print '\nch==',ch
print "''.join(treat(ch,'?'))==",''.join(treat(ch,'?'))
print "\nlist(treat([],'%%'))==",list(treat([],'%%'))
result
gen = ( x for x in q[::2])
list(gen)==p is True
list(treat(gen)== ['**', 'foo', '*', 'bar', 'bar', '**', 'baz', '**', 'foo', '*', '*', 'bar', 'bar', 'bar', '**', 'foo', 'bar', '**']
ch== ??h4i4???4t4y?45l????hmo4j5???
''.join(treat(ch,'?'))== ?h4i4?4t4y?45l?hmo4j5?
list(treat([],'%%'))== []
.
Remark: A generator function allows to adapt the output to the type of the input with a writing around the call of the generator, it dosn't require to change th internal code of the genrator function;
That's not the case with the Tom Zynch's solution , that can't be adapted so easely to the type of the input
.
EDIT 2
I searched a one-line method, with a list comprehension or a generator expression.
I found to ways of doing this, I think it isn't possible to do without groupby()
from itertools import groupby
from operator import concat
p = ['**', '**','foo', '*', 'bar', 'bar', '**', '**', '**',
'bar','**','foo','sun','sun','sun']
print 'p==',p,'\n'
dedupl = ("**",'sun')
print 'dedupl==',repr(dedupl)
print [ x for k, g in groupby(p) for x in ((k,) if k in dedupl else g) ]
# or
print reduce(concat,( [k] if k in dedupl else list(g) for k, g in groupby(p)),[])
Based on the same principle, it is easy to convert the function of dugres into a generator function:
from itertools import groupby
def compress(iterable, to_compress):
for k, g in groupby(iterable):
if k in to_compress:
yield k
else:
for x in g: yield x
However , this generator function has two disadvantages:
it resorts to the function groupby(), which is not easy to understand by someone not used to Python
its execution's time is longer than the ones of my generator function treat() and the generator function of John Machin, that don't use groupby().
I slightly modified them to make them able to accept a sequence of items to be de-duplicated, and I measured the durations of execution:
from time import clock
from itertools import groupby
def squeeze(iterable, victims, _dummy=object()):
if hasattr(iterable, '__iter__') and not hasattr(victims, '__iter__'):
victims = (victims,)
previous = _dummy
for item in iterable:
if item in victims and item==previous:
continue
previous = item
yield item
def treat(B,victims):
if hasattr(B, '__iter__') and not hasattr(victims, '__iter__'):
victims = (victims,)
B = iter(B)
prec = B.next(); yield prec
for x in B:
if x not in victims or x!=prec: yield x
prec = x
def compress(iterable, to_compress):
if hasattr(iterable, '__iter__') and not hasattr(to_compress, '__iter__'):
to_compress = (to_compress,)
for k, g in groupby(iterable):
if k in to_compress:
yield k
else:
for x in g: yield x
p = ['**', '**','su','foo', '*', 'bar', 'bar', '**', '**', '**',
'su','su','**','bin', '*','*','bar','bar','su','su','su']
n = 10000
te = clock()
for i in xrange(n):
a = list(compress(p,('**','sun')))
print clock()-te,' generator function with groupby()'
te = clock()
for i in xrange(n):
b = list(treat(p,('**','sun')))
print clock()-te,' generator function eyquem'
te = clock()
for i in xrange(n):
c = list(squeeze(p,('**','sun')))
print clock()-te,' generator function John Machin'
print p
print 'a==b==c is ',a==b==c
print a
The instruction
if hasattr(iterable, '__iter__') and not hasattr(to_compress, '__iter__'):
to_compress = (to_compress,)
is necessary to avoid errors when the the iterable argument is a sequence and the other argument only one string: this latter needs then to be modified into a container, provided that the iterable argument isn't a string itself.
It is based on the fact that sequences like tuples, lists, stes... have the method iter, but strings haven't. The following code shows the problems:
def compress(iterable, to_compress):
if hasattr(iterable, '__iter__') and not hasattr( to_compress, '__iter__'):
to_compress = (to_compress,)
print 't_compress==',repr(to_compress)
for k, g in groupby(iterable):
if k in to_compress:
yield k
else:
for x in g: yield x
def compress_bof(iterable, to_compress):
if not hasattr(to_compress, '__iter__'): # to_compress is a string
to_compress = (to_compress,)
print 't_compress==',repr(to_compress)
for k, g in groupby(iterable):
if k in to_compress:
yield k
else:
for x in g: yield x
def compress_bug(iterable, to_compress_bug):
print 't_compress==',repr(to_compress_bug)
for k, g in groupby(iterable):
#print 'k==',k,k in to_compress_bug
if k in to_compress_bug:
yield k
else:
for x in g: yield x
q = ';;;htr56;but78;;;;$$$$;ios4!'
print 'q==',q
dedupl = ";$"
print 'dedupl==',repr(dedupl)
print
print "''.join(compress (q,"+repr(dedupl)+")) :\n",''.join(compress (q,dedupl))+\
' <-CORRECT ONE'
print
print "''.join(compress_bof(q,"+repr(dedupl)+")) :\n",''.join(compress_bof(q,dedupl))+\
' <====== error ===='
print
print "''.join(compress_bug(q,"+repr(dedupl)+")) :\n",''.join(compress_bug(q,dedupl))
print '\n\n\n'
q = [';$', ';$',';$','foo', ';', 'bar','bar',';',';',';','$','$','foo',';$12',';$12']
print 'q==',q
dedupl = ";$12"
print 'dedupl==',repr(dedupl)
print
print 'list(compress (q,'+repr(dedupl)+')) :\n',list(compress (q,dedupl)),\
' <-CORRECT ONE'
print
print 'list(compress_bof(q,'+repr(dedupl)+')) :\n',list(compress_bof(q,dedupl))
print
print 'list(compress_bug(q,'+repr(dedupl)+')) :\n',list(compress_bug(q,dedupl)),\
' <====== error ===='
print
result
q== ;;;htr56;but78;;;;$$$$;ios4!
dedupl== ';$'
''.join(compress (q,';$')) :
t_compress== ';$'
;htr56;but78;$;ios4! <-CORRECT ONE
''.join(compress_bof(q,';$')) :
t_compress== (';$',)
;;;htr56;but78;;;;$$$$;ios4! <====== error ====
''.join(compress_bug(q,';$')) :
t_compress== ';$'
;htr56;but78;$;ios4!
q== [';$', ';$', ';$', 'foo', ';', 'bar', 'bar', ';', ';', ';', '$', '$', 'foo', ';$12', ';$12']
dedupl== ';$12'
list(compress (q,';$12')) :
t_compress== (';$12',)
[';$', ';$', ';$', 'foo', ';', 'bar', 'bar', ';', ';', ';', '$', '$', 'foo', ';$12'] <-CORRECT ONE
list(compress_bof(q,';$12')) :
t_compress== (';$12',)
[';$', ';$', ';$', 'foo', ';', 'bar', 'bar', ';', ';', ';', '$', '$', 'foo', ';$12']
list(compress_bug(q,';$12')) :
t_compress== ';$12'
[';$', 'foo', ';', 'bar', 'bar', ';', '$', 'foo', ';$12'] <====== error ====
I obtained the following execution's time:
0.390163274941 generator function with groupby()
0.324547114228 generator function eyquem
0.310176572721 generator function John Machin
['**', '**', 'su', 'foo', '*', 'bar', 'bar', '**', '**', '**', 'su', 'su', '**', 'bin', '*', '*', 'bar', 'bar', 'su', 'su', 'su']
a==b==c is True
['**', 'su', 'foo', '*', 'bar', 'bar', '**', 'su', 'su', '**', 'bin', '*', '*', 'bar', 'bar', 'su', 'su', 'su']
I prefer the solution of John Machin because there is no instruction B = iter(B) as in mine.
But the instruction previous = _dummy
with _dummy = object()
appears weird to me. So I finally think the better solution is the following code, that works even with a string as iterable argument, and in which the first object previous being defined isn't a fake:
def squeeze(iterable, victims):
if hasattr(iterable, '__iter__') and not hasattr(victims, '__iter__'):
victims = (victims,)
for item in iterable:
previous = item
break
for item in iterable:
if item in victims and item==previous:
continue
previous = item
yield item
.
EDIT 3
I had undesrtood that object() was used as a sentinel.
But I was puzzled by the fact object is CALLED. Yesterday, I was thinking that object is something so peculiar that it is impossible that object be in any iterable passed as an argument to squeeze(). So, I was wondering why you called it, John Machin, and that sowed doubt in my mind concerning its nature; that's why I asked you a confirmation that object is the super meta-class.
But today, I think I understand why object is called in your code.
In fact, it is quite possible that object be in an iterable, why not ? Super meta-class object is an object, so nothing prevents it to have been put in the iterable before a de-duplication is processed on the iterable, who knows ? Then using object itself as a sentinel is incorrect practice.
.
So you didn't use object but an instance object() as a sentinel.
But I wondered why to choose this mysterious thing that the return of a call to object is ?
My reflections went on about this point and I remarked something that must be the reason of this call:
calling object creates an instance , since object is the most base class in Python, and each time an instance is created, it is a different object from any prior created instance, with a value always different from the value of any prior object's instance:
a = object()
b = object()
c = object()
d = object()
print id(a),'\n',id(b),'\n',id(c),'\n',id(d)
print a==b,a==c,a==d
print b==c,b==d,c==d
result
10818752
10818760
10818768
10818776
False False False
False False False
So it is sure that _dummy=object()
is a unique object, having a unique id and a unique value. By the way , I wonder what is the value of an object's instance. Anyway the following code shows the problem with _dummy=object
and the absence of problem with _dummy=object()
def imperfect_squeeze(iterable, victim, _dummy=object):
previous = _dummy
print 'id(previous) ==',id(previous)
print 'id(iterable[0])==',id(iterable[0])
for item in iterable:
if item in victim and item==previous: continue
previous = item; yield item
def squeeze(iterable, victim, _dummy=object()):
previous = _dummy
print 'id(previous) ==',id(previous)
print 'id(iterable[0])==',id(iterable[0])
for item in iterable:
if item in victim and item==previous: continue
previous = item; yield item
wat = object
li = [wat,'**','**','foo',wat,wat]
print 'imperfect_squeeze\n''li before ==',li
print map(id,li)
li = list(imperfect_squeeze(li,[wat,'**']))
print 'li after ==',li
print
wat = object()
li = [wat,'**','**','foo',wat,wat]
print 'squeeze\n''li before ==',li
print map(id,li)
li = list(squeeze(li,[wat,'**']))
print 'li after ==',li
print
li = [object(),'**','**','foo',object(),object()]
print 'squeeze\n''li before ==',li
print map(id,li)
li = list(squeeze(li,[li[0],'**']))
print 'li after ==',li
result
imperfect_squeeze
li before == [<type 'object'>, '**', '**', 'foo', <type 'object'>, <type 'object'>]
[505317320, 18578968, 18578968, 13208848, 505317320, 505317320]
id(previous) == 505317320
id(iterable[0])== 505317320
li after == ['**', 'foo', <type 'object'>]
squeeze
li before == [<object object at 0x00A514C8>, '**', '**', 'foo', <object object at 0x00A514C8>, <object object at 0x00A514C8>]
[10818760, 18578968, 18578968, 13208848, 10818760, 10818760]
id(previous) == 10818752
id(iterable[0])== 10818760
li after == [<object object at 0x00A514C8>, '**', 'foo', <object object at 0x00A514C8>]
squeeze
li before == [<object object at 0x00A514D0>, '**', '**', 'foo', <object object at 0x00A514D8>, <object object at 0x00A514E0>]
[10818768, 18578968, 18578968, 13208848, 10818776, 10818784]
id(previous) == 10818752
id(iterable[0])== 10818768
li after == [<object object at 0x00A514D0>, '**', 'foo', <object object at 0x00A514D8>, <object object at 0x00A514E0>]
The problem is consisting in the absence of <type 'object'>
as first element of the list after treatment by imperfect_squeeze() .
However, we must note that the "problem" is possible only with a list whose FIRST element is object: that's a lot of reflections about such a tiny probability.... but a rigorous coder takes account of all.
If we resort to list, instead of object , the results are different:
def imperfect_sqlize(iterable, victim, _dummy=list):
previous = _dummy
print 'id(previous) ==',id(previous)
print 'id(iterable[0])==',id(iterable[0])
for item in iterable:
if item in victim and item==previous: continue
previous = item; yield item
def sqlize(iterable, victim, _dummy=list()):
previous = _dummy
print 'id(previous) ==',id(previous)
print 'id(iterable[0])==',id(iterable[0])
for item in iterable:
if item in victim and item==previous: continue
previous = item; yield item
wat = list
li = [wat,'**','**','foo',wat,wat]
print 'imperfect_sqlize\n''li before ==',li
print map(id,li)
li = list(imperfect_sqlize(li,[wat,'**']))
print 'li after ==',li
print
wat = list()
li = [wat,'**','**','foo',wat,wat]
print 'sqlize\n''li before ==',li
print map(id,li)
li = list(sqlize(li,[wat,'**']))
print 'li after ==',li
print
li = [list(),'**','**','foo',list(),list()]
print 'sqlize\n''li before ==',li
print map(id,li)
li = list(sqlize(li,[li[0],'**']))
print 'li after ==',li
result
imperfect_sqlize
li before == [<type 'list'>, '**', '**', 'foo', <type 'list'>, <type 'list'>]
[505343304, 18578968, 18578968, 13208848, 505343304, 505343304]
id(previous) == 505343304
id(iterable[0])== 505343304
li after == ['**', 'foo', <type 'list'>]
sqlize
li before == [[], '**', '**', 'foo', [], []]
[18734936, 18578968, 18578968, 13208848, 18734936, 18734936]
id(previous) == 18734656
id(iterable[0])== 18734936
li after == ['**', 'foo', []]
sqlize
li before == [[], '**', '**', 'foo', [], []]
[18734696, 18578968, 18578968, 13208848, 18735016, 18734816]
id(previous) == 18734656
id(iterable[0])== 18734696
li after == ['**', 'foo', []]
Is there any other object than object in Python that have this peculiarity ?
John Machin, why did you choose an instance of object as a sentinel in your generator function ? Did you already know the above peculiarity ?
精彩评论