T-SQL: A better sliding distribution function/query
I need a T-SQL ranking approach similar to the one provided by NTILE(), except that the members of each tile would be on a sliding distribution so that higher ranking tiles have fewer memb开发者_JS百科ers.
For example
CREATE TABLE #Rank_Table(
id int identity(1,1) not null,
hits bigint not null default 0,
PERCENTILE smallint null
)
--Slant the distribution of the data
INSERT INTO #Rank_Table (hits)
select CASE
when DATA > 9500 THEN DATA*30
WHEN data > 8000 THEN DATA*5
WHEN data < 7000 THEN DATA/3 +1
ELSE DATA
END
FROM
(select top 10000 (ABS(CHECKSUM(NewId())) % 99 +1) * (ABS(CHECKSUM(NewId())) % 99 +1 ) DATA
from master..spt_values t1
cross JOIN master..spt_values t2) exponential
Declare @hitsPerGroup as bigint
Declare @numGroups as smallint
set @numGroups=100
select @hitsPerGroup=SUM(hits)/(@numGroups -1) FROM #Rank_Table
select @hitsPerGroup HITS_PER_GROUP
--This is an even distribution
SELECT id,HITS, NTILE(@numGroups) Over (Order By HITS DESC) PERCENTILE
FROM #Rank_Table
GROUP by id, HITS
--This is my best attempt, but it skips groups because of the erratic distribution
select
T1.ID,
T1.hits,
T.RunningTotal/@hitsPerGroup + 1 TILE,
T.RunningTotal
FROM #Rank_Table T1
CROSS APPLY ( Select SUM(hits) RunningTotal FROM #Rank_Table where hits <= T1.hits) T
order by T1.hits
DROP TABLE #Rank_Table
In #Rank_table, NTILE(@numGroups) creates an even distribution of @numGroups groups. What I need are @numGroups groups where the tile 1 has the fewest members, tile 2 would have one or more than tile 1, tile 3 would have 1 or more than tile 2 ... tile 100 would have the most.
I'm using SQL Server 2008. In practice this will be run against a permanent table with potentially millions of rows in order to periodically update the PERCENTILE column with its percentile from 1-100.
My best attempt above will skip percentiles and performs poorly. There must be a better way.
A better NTILE implementation? YMMV
In order to create a more linear distribution, I added a computed column to the data table, HITS_SQRT HITS_SQRT AS (CONVERT([int],sqrt(HITS*4),(0))) PERSISTED
.
Using this column you can calculate a target number of "hits per percentile".
select @hitsPerGroup=SUM(HITS_SQRT)/(@numGroups -1)-@numGroups, @dataPoints=COUNT(*) FROM #Rank_Table
The script then creates a temp table with a ROW_NUMBER() ordered by the number of hits and iterates the rows in descending order updating its percentile from 100 to 1. A running total is kept of the number of hits, and when the @hitsPerGroup
is passed, the percentile is lowered form 100 to 99, 99 to 98, etc.
Then the source data table is updated with its percentile. There is an index of the temp work table to speed the update.
Full script using #Rank_Table
as the source data table.
--Create Test Data
CREATE TABLE #Rank_Table(
id int identity(1,1) not null,
hits bigint not null default 0,
PERCENTILE smallint NULL,
HITS_SQRT AS (CONVERT([int],sqrt(HITS*4),(0))) PERSISTED
)
--Slant the distribution of the data
INSERT INTO #Rank_Table (hits)
select CASE
when DATA > 9500 THEN DATA*30
WHEN data > 8000 THEN DATA*5
WHEN data < 7000 THEN DATA/3 +1
ELSE DATA
END
FROM
(select top 10000 (ABS(CHECKSUM(NewId())) % 99 +1) * (ABS(CHECKSUM(NewId())) % 99 +1 ) DATA
from master..spt_values t1
cross JOIN master..spt_values t2) exponential
--Create temp work table and variables to calculate percentiles
Declare @hitsPerGroup as int
Declare @numGroups as int
Declare @dataPoints as int
set @numGroups=100
select @hitsPerGroup=SUM(HITS_SQRT)/(@numGroups -1)-@numGroups, @dataPoints=COUNT(*) FROM #Rank_Table
--show the number of hits that each group should have
select @hitsPerGroup HITS_PER_GROUP
--Use temp table for the calculation
CREATE TABLE #tbl (
row int,
hits int,
ID bigint,
PERCENTILE smallint null
)
--add index to row
CREATE CLUSTERED INDEX idxRow ON #tbl(row)
insert INTO #tbl
select ROW_NUMBER() over (ORDER BY HITS), hits_SQRT, ID, null from #Rank_Table
--Update each row with a running total.
--lower the percentile by one when we cross a threshold for the maximum number of hits per group (@hitsPerGroup)
DECLARE @row as int
DEClare @runningTotal as int
declare @percentile int
set @row = 0
set @runningTotal = 0
set @percentile = @numGroups
while @row <= @dataPoints
BEGIN
select @runningTotal=@runningTotal + hits from #tbl where row=@row
if @runningTotal >= @hitsPerGroup
BEGIN
update #tbl
set PERCENTILE=@percentile
WHERE PERCENTILE is null and row <@row
set @percentile = @percentile - 1
set @runningTotal = 0
END
--change rows
set @row = @row + 1
END
--get remaining
update #tbl
set PERCENTILE=@percentile
WHERE PERCENTILE is null
--update source data
UPDATE m SET PERCENTILE = t.PERCENTILE
FROM #tbl t
inner join #Rank_Table m on t.ID=m.ID
--Show the results
SELECT PERCENTILE, COUNT(id) NUMBER_RECORDS, SUM(HITS) HITS_IN_PERCENTILE
FROM #Rank_Table
GROUP BY PERCENTILE
ORDER BY PERCENTILE
--cleanup
DROP TABLE #Rank_Table
DROP TABLE #tbl
The performance is not stellar, but it achieves the goal of a smooth sliding distribution.
精彩评论