What's the best way to perform the equivalent of DENSE_RANK on MongoDB?
SQL Server and Oracle both have DENSE_RANK functions. This allows you to, among other things, get the global ranking for a record while returning only a subset of th开发者_JAVA百科ose records, e.g.:
SELECT DENSE_RANK() OVER(ORDER BY SomeField DESC) SomeRank
What is the best way to do the same thing in MongoDB?
After some experimentation, I found that it is possible to build a ranking function based on MapReduce, assuming the result set can fit in the max document size.
For example, suppose I have a collection like this:
{ player: "joe", points: 1000, foo: 10, bar: 20, bang: "some text" }
{ player: "susan", points: 2000, foo: 10, bar: 20, bang: "some text" }
{ player: "joe", points: 1500, foo: 10, bar: 20, bang: "some text" }
{ player: "ben", points: 500, foo: 10, bar: 20, bang: "some text" }
...
I can perform the rough equivalent of a DENSE_RANK like so:
var m = function() {
++g_counter;
if ((this.player == "joe") && (g_scores.length != g_fake_limit)) {
g_scores.push({
player: this.player,
points: this.points,
foo: this.foo,
bar: this.bar,
bang: this.bang,
rank: g_counter
});
}
if (g_counter == g_final)
{
emit(this._id, g_counter);
}
}}
var r = function (k, v) { }
var f = function(k, v) { return g_scores; }
var test_mapreduce = function (limit) {
var total_scores = db.scores.count();
return db.scores.mapReduce(m, r, {
out: { inline: 1 },
sort: { points: -1 },
finalize: f,
limit: total_scores,
verbose: true,
scope: {
g_counter: 0,
g_final: total_scores,
g_fake_limit: limit,
g_scores:[]
}
}).results[0].value;
}
For comparison, here is the "naive" approach mentioned elsewhere:
var test_naive = function(limit) {
var cursor = db.scores.find({player: "joe"}).limit(limit).sort({points: -1});
var scores = [];
cursor.forEach(function(score) {
score.rank = db.scores.count({points: {"$gt": score.points}}) + 1;
scores.push(score);
});
return scores;
}
I benchmarked both approaches on a single instance of MongoDB 1.8.2 using the following code:
var rand = function(max) {
return Math.floor(Math.random() * max);
}
var create_score = function() {
var names = ["joe", "ben", "susan", "kevin", "lucy"]
return { player: names[rand(names.length)], points: rand(1000000), foo: 10, bar: 20, bang: "some kind of example text"};
}
var init_collection = function(total_records) {
db.scores.drop();
for (var i = 0; i != total_records; ++i) {
db.scores.insert(create_score());
}
db.scores.createIndex({points: -1})
}
var benchmark = function(test, count, limit) {
init_collection(count);
var durations = [];
for (var i = 0; i != 5; ++i) {
var start = new Date;
result = test(limit)
var stop = new Date;
durations.push(stop - start);
}
db.scores.drop();
return durations;
}
While MapReduce was faster than I expected, the naive approach blew it out of the water for larger collection sizes, especially once the cache was warmed up:
> benchmark(test_naive, 1000, 50);
[ 22, 16, 17, 16, 17 ]
> benchmark(test_mapreduce, 1000, 50);
[ 16, 15, 14, 11, 14 ]
>
> benchmark(test_naive, 10000, 50);
[ 56, 16, 17, 16, 17 ]
> benchmark(test_mapreduce, 10000, 50);
[ 154, 109, 116, 109, 109 ]
>
> benchmark(test_naive, 100000, 50);
[ 492, 15, 18, 17, 16 ]
> benchmark(test_mapreduce, 100000, 50);
[ 1595, 1071, 1099, 1108, 1070 ]
>
> benchmark(test_naive, 1000000, 50);
[ 6600, 16, 15, 16, 24 ]
> benchmark(test_mapreduce, 1000000, 50);
[ 17405, 10725, 10768, 10779, 11113 ]
So for now, it looks like the naive approach is the way to go, although I'll be interested to see if the story changes later this year as the MongoDB team continues improving MapReduce performance.
If your score field is directly in your documents, the dense rank is simply the index of the documents in a certain sorted order.
Suppose you have a collection of scores for a game, like:
{user: "dcrosta", score: 10}
{user: "someone", score: 18}
{user: "another", score: 5}
...
Then (assuming you have an index on score) to get ranks you can just query sorted by score (shown here in pymongo syntax):
scores = db.scores.find().sort('score', pymongo.DESCENDING)
for rank, record in enumerate(scores, start=1):
print rank, record['user']
# prints
1 someone
2 dcrosta
3 another
If you're unfamiliar with Python, the enumerate
function creates an iterator which returns pairs of (index
, element
).
EDIT: I assumed you wanted a ranking table -- if you're looking for the rank of a particular user, Richard's answer, or something like it, is what you want.
精彩评论