Strange Map Reduce Behavior in CouchDB. Rereduce?
I have a map/reduce issue with CouchDB (both functions shown below): when I run it with group_level=2 (exact) I get accurate output:
{"rows":[
{"key":["2011-01-11","staff-1"],"value":{"total":895.72,"count":2,"services":6,"services_ignored":6,"services_liked":0,"services_disliked":0,"services_disliked_avg":0,"Revise":{"total":275.72,"count":1},"Review":{"total":620,"count":1}}},
{"key":["2011-01-11","staff-2"],"value":{"total":8461.689999999999,"count":2,"services":41,"services_ignored":37,"services_liked":4,"services_disliked":0,"services_disliked_avg":0,"Revise":{"total":4432.4,"count":1},"Review":{"total":4029.29,"count":1}}},
{"key":["2011-01-11","staff-3"],"value":{"total":2100.72,"count":1,"services":10,"services_ignored":4,"services_liked":3,"services_disliked":3,"services_disliked_avg":2.3333333333333335,"Revise":{"total":2100.72,"count":1}}},
However, changing to group_level=1, so that the values for all the different staff keys are grouped by date, no longer gives accurate output (notice the total is correct but all the others are wrong):
{"rows":[
{"key":["2011-01-11"],"value":{"total":11458.130000000001,"count":2,"services":0,"services_ignored":0,"services_liked":0,"services_disliked":0,"services_disliked_avg":0,"None":{"total":11458.130000000001,"count":2}}},
My only theory is this has something to do with rereduce, which I have not yet learned. Should I explore that option or am I missing something else here?
This is the Map function:
function(doc) {
if(doc.doc_type == 'Feedback') {
emit([doc.date.split('T')[0], doc.staff_id], doc);
}
}
And this is the Reduce:
function(keys, vals) {
// sum all key points by status: total, count, services (liked, rejected, ignored)
var ret = {
'total':0,
'count':0,
'services': 0,
'services_ignored': 0,
'services_liked': 0,
'services_disliked': 0,
'services_disliked_avg': 0,
};
var total_disliked_score = 0;
// handle status
function handle_status(doc) {
if(!doc.status || doc.status == '' || doc.status == undefined) {
status = 'None';
} else if (doc.status == 'Declined') {
status = 'Rejected';
} else {
status = doc.status;
}
if(!ret[status]) ret[status] = {'total':0, 'count':0};
ret[status]['total'] += doc.total;
ret[status]['count'] += 1;
};
// handle likes / dislikes
function handle_services(services) {
ret.services += services.length;
for(var a in services) {
if (services[a].user_likes == 10) {
ret.services_liked += 1;
} else if (services[a].user_likes >= 1) {
ret.services_disliked += 1;
total_disliked_score += services[a].user_likes;
if (total_disliked_score >= ret.services_disliked) {
ret.services_disliked_avg = total_disliked_score / ret.services_disliked;
}
} else {
ret.services_ignored += 1;
}
}
}
// loop thru docs
for(var i in vals) {
// increment the total $
ret.total += vals[i].total;
ret.count += 1;
// update totals and sums for the status of this route
handle_status(vals[i]);
// do the likes / dislikes stats
if(val开发者_如何学编程s[i].groups) {
for(var ii in vals[i].groups) {
if(vals[i].groups[ii].services) {
handle_services(vals[i].groups[ii].services);
}
}
}
// handle deleted services
if(vals[i].hidden_services) {
if (vals[i].hidden_services) {
handle_services(vals[i].hidden_services);
}
}
}
return ret;
}
This is a classic mistake. Keep in mind that CouchDB reduction happens in several steps, and some of these steps will receive as input the result of other reduction steps. However, your code seems to assume that vals[i]
will be an object of the form { "groups": _ , "hidden_services": _ , _ }
representing a single document. This code will fail when a rereduce happens, because then vals[i]
will be of the form { "count" : _ , "services" : _ , _ }
representing the result of a previous reduction step.
So, for instance, by counting with ret.count += 1
, you're counting the number of intermediary reduction results and not the number of documents.
One solution is to write two versions of your reduce code, one to handle the original reduce and another to handle the rereduce steps. You can determine whether a given call is an initial or rereduce call by looking at the third argument (false if initial, true if rereduce).
Another solution is to have the map function emit a pre-processed value of the same form { "count" : _ , "services" : _ , _ }
that is returned by the reduce function, and have the reduce function merely add up the members of those values together.
For reference, adding the following code below var ret = { ... } to handle rereduce works (the reduce function must also declare the third argument, i.e. function(keys, vals, rereduce)):
/**
 * Fold one status bucket of a partial reduce result into the accumulator.
 *
 * @param {Object} row - a partial result that may hold a bucket under stat.
 * @param {Object} ret - the accumulator; updated in place.
 * @param {string} stat - the name of the status bucket to merge.
 * @returns {Object} the same ret object that was passed in.
 */
function rereduce_status(row, ret, stat) {
  var partial = row[stat];
  // Nothing to merge when the partial result has no such bucket.
  if (!partial) {
    return ret;
  }
  // Create the accumulator's bucket the first time this status is seen.
  if (!ret[stat]) {
    ret[stat] = {'total': 0, 'count': 0};
  }
  var bucket = ret[stat];
  bucket.total += partial.total;
  bucket.count += partial.count;
  return ret;
}
if(rereduce) {
for (var i in vals) {
ret.total += vals[i].total;
ret.count += vals[i].count;
ret.services += vals[i].services;
ret.services_ignored += vals[i].services_ignored;
ret.services_liked += vals[i].services_liked;
ret.services_disliked += vals[i].services_disliked;
ret.services_disliked_score += vals[i].services_disliked_score;
if (ret.services_disliked_score >= ret.services_disliked) {
ret.services_disliked_avg = ret.services_disliked_score / ret.services_disliked;
}
ret = rereduce_status(vals[i], ret, 'None');
ret = rereduce_status(vals[i], ret, 'Review');
ret = rereduce_status(vals[i], ret, 'Revise');
ret = rereduce_status(vals[i], ret, 'Rejected');
ret = rereduce_status(vals[i], ret, 'Booked');
}
return ret;
}
精彩评论