T-SQL: A better sliding distribution function/query

2023-01-20 08:10 问答作者：

I need a T-SQL ranking approach similar to the one provided by NTILE(), except that the members of each tile would be on a sliding distribution so that higher ranking tiles have fewer memb开发者_JS百科ers.

For example

CREATE TABLE #Rank_Table(
id int identity(1,1) not null,
hits bigint not null default 0,
PERCENTILE smallint null
)
--Slant the distribution of the data
INSERT INTO #Rank_Table (hits)
select CASE 
  when DATA > 9500 THEN DATA*30
  WHEN data > 8000  THEN DATA*5 
  WHEN data < 7000  THEN DATA/3 +1
  ELSE DATA
 END
FROM
 (select top 10000 (ABS(CHECKSUM(NewId())) % 99 +1) * (ABS(CHECKSUM(NewId())) % 99 +1 ) DATA
 from master..spt_values t1
  cross JOIN master..spt_values t2) exponential

Declare @hitsPerGroup as bigint
Declare @numGroups as smallint
set @numGroups=100

select @hitsPerGroup=SUM(hits)/(@numGroups -1) FROM #Rank_Table 

select @hitsPerGroup HITS_PER_GROUP

--This is an even distribution
SELECT  id,HITS, NTILE(@numGroups) Over (Order By HITS DESC) PERCENTILE 
FROM #Rank_Table 
GROUP by id, HITS

--This is my best attempt, but it skips groups because of the erratic distribution
select 
    T1.ID, 
    T1.hits, 
    T.RunningTotal/@hitsPerGroup + 1 TILE,
    T.RunningTotal
FROM    #Rank_Table T1
        CROSS APPLY ( Select SUM(hits) RunningTotal FROM #Rank_Table where hits <= T1.hits) T
order by T1.hits 

DROP TABLE #Rank_Table

In #Rank_table, NTILE(@numGroups) creates an even distribution of @numGroups groups. What I need are @numGroups groups where the tile 1 has the fewest members, tile 2 would have one or more than tile 1, tile 3 would have 1 or more than tile 2 ... tile 100 would have the most.

I'm using SQL Server 2008. In practice this will be run against a permanent table with potentially millions of rows in order to periodically update the PERCENTILE column with its percentile from 1-100.

My best attempt above will skip percentiles and performs poorly. There must be a better way.

A better NTILE implementation? YMMV

In order to create a more linear distribution, I added a computed column to the data table, HITS_SQRT HITS_SQRT AS (CONVERT([int],sqrt(HITS*4),(0))) PERSISTED.

Using this column you can calculate a target number of "hits per percentile".

select @hitsPerGroup=SUM(HITS_SQRT)/(@numGroups -1)-@numGroups, @dataPoints=COUNT(*) FROM #Rank_Table

The script then creates a temp table with a ROW_NUMBER() ordered by the number of hits and iterates the rows in descending order updating its percentile from 100 to 1. A running total is kept of the number of hits, and when the @hitsPerGroup is passed, the percentile is lowered form 100 to 99, 99 to 98, etc.

Then the source data table is updated with its percentile. There is an index of the temp work table to speed the update.

Full script using #Rank_Table as the source data table.

--Create Test Data
CREATE TABLE #Rank_Table(
id int identity(1,1) not null,
hits bigint not null default 0,
PERCENTILE smallint NULL,
HITS_SQRT  AS (CONVERT([int],sqrt(HITS*4),(0))) PERSISTED
)
--Slant the distribution of the data
INSERT INTO #Rank_Table (hits)
select CASE 
  when DATA > 9500 THEN DATA*30
  WHEN data > 8000  THEN DATA*5 
  WHEN data < 7000  THEN DATA/3 +1
  ELSE DATA
 END
FROM
 (select top 10000 (ABS(CHECKSUM(NewId())) % 99 +1) * (ABS(CHECKSUM(NewId())) % 99 +1 ) DATA
 from master..spt_values t1
  cross JOIN master..spt_values t2) exponential

--Create temp work table and variables to calculate percentiles
    Declare @hitsPerGroup as int
    Declare @numGroups as int
    Declare @dataPoints as int
    set @numGroups=100

    select @hitsPerGroup=SUM(HITS_SQRT)/(@numGroups -1)-@numGroups, @dataPoints=COUNT(*) FROM #Rank_Table 

    --show the number of hits that each group should have
    select @hitsPerGroup HITS_PER_GROUP

    --Use temp table for the calculation
    CREATE TABLE #tbl (
        row int,
        hits int,
        ID bigint,
        PERCENTILE smallint null
    )
    --add index to row
    CREATE CLUSTERED INDEX idxRow ON #tbl(row) 

    insert INTO #tbl
    select ROW_NUMBER() over (ORDER BY HITS), hits_SQRT, ID, null from #Rank_Table

    --Update each row with a running total.
    --lower the percentile by one when we cross a threshold for the maximum number of hits per group (@hitsPerGroup)
    DECLARE @row as int
    DEClare @runningTotal as int
    declare @percentile int
    set @row = 0
    set @runningTotal = 0
    set @percentile = @numGroups

    while @row <= @dataPoints
    BEGIN
        select @runningTotal=@runningTotal + hits from #tbl where row=@row

        if @runningTotal >= @hitsPerGroup
        BEGIN

            update #tbl
            set PERCENTILE=@percentile
            WHERE PERCENTILE is null and row <@row

            set @percentile = @percentile - 1

            set @runningTotal = 0
        END

        --change rows
        set @row = @row + 1
    END

    --get remaining
    update #tbl
    set PERCENTILE=@percentile
    WHERE PERCENTILE is null

    --update source data
    UPDATE m SET PERCENTILE = t.PERCENTILE
    FROM #tbl t
    inner join #Rank_Table m on t.ID=m.ID


--Show the results
    SELECT PERCENTILE, COUNT(id) NUMBER_RECORDS, SUM(HITS) HITS_IN_PERCENTILE 
    FROM #Rank_Table 
    GROUP BY PERCENTILE
    ORDER BY PERCENTILE 

--cleanup
    DROP TABLE #Rank_Table
    DROP TABLE #tbl

The performance is not stellar, but it achieves the goal of a smooth sliding distribution.

继续阅读：sql-server-2008 statistics tile tsql

T-SQL: A better sliding distribution function/query

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？