开发者

T-SQL: A better sliding distribution function/query

I need a T-SQL ranking approach similar to the one provided by NTILE(), except that the members of each tile would be on a sliding distribution so that higher ranking tiles have fewer memb开发者_JS百科ers.

For example

CREATE TABLE #Rank_Table(
id int identity(1,1) not null,
hits bigint not null default 0,
PERCENTILE smallint null
)
--Slant the distribution of the data
INSERT INTO #Rank_Table (hits)
select CASE 
  when DATA > 9500 THEN DATA*30
  WHEN data > 8000  THEN DATA*5 
  WHEN data < 7000  THEN DATA/3 +1
  ELSE DATA
 END
FROM
 (select top 10000 (ABS(CHECKSUM(NewId())) % 99 +1) * (ABS(CHECKSUM(NewId())) % 99 +1 ) DATA
 from master..spt_values t1
  cross JOIN master..spt_values t2) exponential

Declare @hitsPerGroup as bigint
Declare @numGroups as smallint
set @numGroups=100

select @hitsPerGroup=SUM(hits)/(@numGroups -1) FROM #Rank_Table 

select @hitsPerGroup HITS_PER_GROUP

--This is an even distribution
SELECT  id,HITS, NTILE(@numGroups) Over (Order By HITS DESC) PERCENTILE 
FROM #Rank_Table 
GROUP by id, HITS

--This is my best attempt, but it skips groups because of the erratic distribution
select 
    T1.ID, 
    T1.hits, 
    T.RunningTotal/@hitsPerGroup + 1 TILE,
    T.RunningTotal
FROM    #Rank_Table T1
        CROSS APPLY ( Select SUM(hits) RunningTotal FROM #Rank_Table where hits <= T1.hits) T
order by T1.hits 

DROP TABLE #Rank_Table

In #Rank_table, NTILE(@numGroups) creates an even distribution of @numGroups groups. What I need are @numGroups groups where the tile 1 has the fewest members, tile 2 would have one or more than tile 1, tile 3 would have 1 or more than tile 2 ... tile 100 would have the most.

I'm using SQL Server 2008. In practice this will be run against a permanent table with potentially millions of rows in order to periodically update the PERCENTILE column with its percentile from 1-100.

My best attempt above will skip percentiles and performs poorly. There must be a better way.


A better NTILE implementation? YMMV


In order to create a more linear distribution, I added a computed column to the data table, HITS_SQRT HITS_SQRT AS (CONVERT([int],sqrt(HITS*4),(0))) PERSISTED.

Using this column you can calculate a target number of "hits per percentile".

select @hitsPerGroup=SUM(HITS_SQRT)/(@numGroups -1)-@numGroups, @dataPoints=COUNT(*) FROM #Rank_Table 

The script then creates a temp table with a ROW_NUMBER() ordered by the number of hits and iterates the rows in descending order updating its percentile from 100 to 1. A running total is kept of the number of hits, and when the @hitsPerGroup is passed, the percentile is lowered form 100 to 99, 99 to 98, etc.

Then the source data table is updated with its percentile. There is an index of the temp work table to speed the update.

Full script using #Rank_Table as the source data table.

--Create Test Data
CREATE TABLE #Rank_Table(
id int identity(1,1) not null,
hits bigint not null default 0,
PERCENTILE smallint NULL,
HITS_SQRT  AS (CONVERT([int],sqrt(HITS*4),(0))) PERSISTED
)
--Slant the distribution of the data
INSERT INTO #Rank_Table (hits)
select CASE 
  when DATA > 9500 THEN DATA*30
  WHEN data > 8000  THEN DATA*5 
  WHEN data < 7000  THEN DATA/3 +1
  ELSE DATA
 END
FROM
 (select top 10000 (ABS(CHECKSUM(NewId())) % 99 +1) * (ABS(CHECKSUM(NewId())) % 99 +1 ) DATA
 from master..spt_values t1
  cross JOIN master..spt_values t2) exponential

--Create temp work table and variables to calculate percentiles
    Declare @hitsPerGroup as int
    Declare @numGroups as int
    Declare @dataPoints as int
    set @numGroups=100

    select @hitsPerGroup=SUM(HITS_SQRT)/(@numGroups -1)-@numGroups, @dataPoints=COUNT(*) FROM #Rank_Table 

    --show the number of hits that each group should have
    select @hitsPerGroup HITS_PER_GROUP

    --Use temp table for the calculation
    CREATE TABLE #tbl (
        row int,
        hits int,
        ID bigint,
        PERCENTILE smallint null
    )
    --add index to row
    CREATE CLUSTERED INDEX idxRow ON #tbl(row) 

    insert INTO #tbl
    select ROW_NUMBER() over (ORDER BY HITS), hits_SQRT, ID, null from #Rank_Table

    --Update each row with a running total.
    --lower the percentile by one when we cross a threshold for the maximum number of hits per group (@hitsPerGroup)
    DECLARE @row as int
    DEClare @runningTotal as int
    declare @percentile int
    set @row = 0
    set @runningTotal = 0
    set @percentile = @numGroups

    while @row <= @dataPoints
    BEGIN
        select @runningTotal=@runningTotal + hits from #tbl where row=@row

        if @runningTotal >= @hitsPerGroup
        BEGIN

            update #tbl
            set PERCENTILE=@percentile
            WHERE PERCENTILE is null and row <@row

            set @percentile = @percentile - 1

            set @runningTotal = 0
        END

        --change rows
        set @row = @row + 1
    END

    --get remaining
    update #tbl
    set PERCENTILE=@percentile
    WHERE PERCENTILE is null

    --update source data
    UPDATE m SET PERCENTILE = t.PERCENTILE
    FROM #tbl t
    inner join #Rank_Table m on t.ID=m.ID


--Show the results
    SELECT PERCENTILE, COUNT(id) NUMBER_RECORDS, SUM(HITS) HITS_IN_PERCENTILE 
    FROM #Rank_Table 
    GROUP BY PERCENTILE
    ORDER BY PERCENTILE 

--cleanup
    DROP TABLE #Rank_Table
    DROP TABLE #tbl

The performance is not stellar, but it achieves the goal of a smooth sliding distribution.

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜