开发者

Finding continuous date ranges and number of times they occur for a row_id

I have a table which shows for each company the contacts related to them and the dates over which the relationship lasted. What I am trying to do is for each company work out a the sets of ranges over which a relationship existed and the number of contacts linked during that time.

(I couldn't find a comparable question in the search field, so apologies if this is a dupe)

For the following sample data:

CREATE TABLE #overlap
(
contact_id NVARCHAR(255)
,company_id NVARCHAR(255)
,relationship_started DATETIME
,relationship_ended DATETIME
)

INSERT INTO #overlap VALUES ('19CB6330-8559-4357-BF70-8F2EBAE4CF90','AECA7DFD-4551-455A-B01D-0000AE19B712','2006-07-25 00:00:00.000','2010-04-11 10:52:00.000')
INSERT INTO #overlap VALUES ('90331A59-EED3-47D5-8885-4648825FE06F','AECA7DFD-4551-455A-B01D-0000AE19B712','2007-07-31 00:00:00.000','2007-08-24 01:09:00.000')
INSERT INTO #overlap VALUES ('CFF414A7-4AB7-4C38-9915-6A107C9044AE','AECA7DFD-4551-455A-B01D-0000AE19B712','2008-01-11 00:00:00.000','2008-02-05 08:23:00.000')
INSERT INTO #overlap VALUES ('EC520389-4B84-429B-97D2-9653CFC47669','AECA7DFD-4551-455A-B01D-0000AE19B712','2008-01-01 00:00:00.000','2011-02-08 09:00:00.000')

I would like the following results:

CREATE TABLE #results
(
company_id NVARCHAR(255)
,start_date DATETIME
,end_date DATETIME
,relationship_number INT
,num_contacts INT
)
INSERT INTO #results VALUES('AECA7DFD-4551-455A-B01D-0000AE19B712','2006-07-25 00:00:00.000','2007-07-31 00:00:00.000',1,1)
INSERT INTO #results VALUES('AECA7DFD-4551-455A-B01D-0000AE19B712','2007-07-31 00:00:00.000','2007-08-24 01:09:00.000',2,2)
INSERT INTO #results VALUES('AECA7DFD-4551-455A-B01D-0000AE19B712','2007-08-24 01:09:00.000','2008-01-01 00:00:00.000',3,1)
INSERT INTO #results VALUES('AECA7DFD-4551-455A-B01D-0000AE19B712','2008-01-01 00:00:00.000','2008-01-11 00:00:00.000',4,2)
INSERT INTO #results VALUES('AECA7DFD-4551-455A-B01D-0000AE19B712','2008-01-11 00:00:00.000','2008-02-05 08:23:00.000',5,3)
INSERT INTO #results VALUES('AECA7DFD-4551-455A-B01D-0000AE19B712','2008-02-05 08:23:00.000','2010-04-11 10:52:00.000',6,2)
INSERT INTO #results VALUES('AECA7DFD-4551-455A-B01D-0000AE19B712','2010-04-11 10:52:00.000','2011-02-08 09:00:00.000',7,1)

I started by using this to group for start dates (and then repeated for the end dates) but couldn't work out how to use the groups to distinguish between the continuous ranges

DENSE_RANK() OVER  
       (PARTITION BY company_id
        ORDER BY DATEPART(YEAR, relationship_started)*12 + DATEPART(MONTH, relationship_started)) AS start_group

Any pointers in the right direction gratefully received! :)

EDIT TO INCLUDE MORE SAMPLE DATA:

CREATE TABLE #overlap
(
contact_id NVARCHAR(255)
,company_id NVARCHAR(255)
,relationship_started DATETIME
,relationship_ended DATETIME
)

INSERT INTO #overlap VALUES ('19CB6330-8559-4357-BF70-8F2EBAE4CF90','AECA7DFD-4551-455A-B01D-0000AE19B712','2006-07-25 00:00:00.000','2010-04-11 10:52:00.000')
INSERT INTO #overlap VALUES ('90331A59-EED3-47D5-8885-4648825FE06F','AECA7DFD-4551-455A-B01D-0000AE19B712','2007-07-31 00:00:00.000','2007-08-24 01:09:00.000')
INSERT INTO #overlap VALUES ('CFF414A7-4AB7-4C38-9915-6A1开发者_StackOverflow社区07C9044AE','AECA7DFD-4551-455A-B01D-0000AE19B712','2008-01-11 00:00:00.000','2008-02-05 08:23:00.000')
INSERT INTO #overlap VALUES ('EC520389-4B84-429B-97D2-9653CFC47669','AECA7DFD-4551-455A-B01D-0000AE19B712','2008-01-01 00:00:00.000','2011-02-08 09:00:00.000')
INSERT INTO #overlap VALUES ('E892EF8B-962F-4974-A3ED-60F6E916DB86','0B10F07D-3662-454B-8FAC-87E5AE92EE8D','2008-07-01 00:00:00.000','2011-02-08 09:00:00.000')
INSERT INTO #overlap VALUES ('76F944E1-E9C8-486E-912B-4A322F5F0A05','6FB62827-A27C-4110-BBA4-BC04C84C0219','2010-12-06 00:00:00.000','2011-02-08 13:00:00.000')
INSERT INTO #overlap VALUES ('69E3768C-3118-48E1-B590-8A7D02726227','6FB62827-A27C-4110-BBA4-BC04C84C0219','1950-01-01 00:00:00.000','2010-12-06 00:00:00.000')


WITH Dates
     AS (SELECT company_id, relationship_started As Date
         FROM   #overlap
         UNION 
         SELECT company_id, relationship_ended
         FROM   #overlap),
     T
     AS (SELECT COUNT(*)                              AS Cnt,
                d.Date,
                ROW_NUMBER() OVER (PARTITION BY d.company_id ORDER BY (d.Date)) AS RN,
                d.company_id
         FROM   Dates d
                left JOIN #overlap o
                  ON d.Date >= relationship_started
                     and d.Date < relationship_ended
                     and d.company_id = o.company_id
         group  by d.Date,
                   d.company_id)
SELECT t1.company_id,
       t1.Date,
       t2.Date,
       ROW_NUMBER() over (PARTITION BY t1.company_id order by (select 0)) as relationship_number,
       t1.Cnt
FROM   T t1
       JOIN T t2
         ON t2.RN = t1.RN + 1 AND t1.company_id = t2.company_id
ORDER BY t1.company_id, t1.Date


I'm not entirely done yet - but maybe this input can take you into the right direction.

My thought was to create two CTE's (Common Table Expressions), that would list out the dates when a new relationship started (with a value of +1), and those when a relationship ended (with a value of -1).

So basically, you get a historical list of when something changed in the relations with that company.

I've come this far by now:

;WITH DatesStarted AS
(SELECT relationship_started AS 'TheDate', 1 AS 'Delta'
 FROM #overlap
),
DatesEnded AS
(SELECT relationship_ended AS 'TheDate', -1 AS 'Delta'
 FROM #overlap
)
SELECT *
FROM DatesStarted
UNION
SELECT * 
FROM DatesEnded
ORDER BY thedate

and I get an output something like this:

TheDate                Delta
2006-07-25 00:00:00.000 1
2007-07-31 00:00:00.000 1
2007-08-24 01:09:00.000 -1
2008-01-01 00:00:00.000 1
2008-01-11 00:00:00.000 1
2008-02-05 08:23:00.000 -1
2010-04-11 10:52:00.000 -1
2011-02-08 09:00:00.000 -1

So given this, you should now generate the inserts into your result table.... not quite there yet (and I have to run off to a meeting) - but maybe that helps get you started!


Here's my solution, for the record.

Somehow I've managed to restructure it a bit, and so it no longer seems enormous to me. Still, the execution plan says it is less performant than the accepted solution.

;WITH
dates AS (
  SELECT
    company_id,
    date         = relationship_started,
    count_change = +1
  FROM #overlap
  UNION ALL
  SELECT
    company_id,
    date         = relationship_ended,
    count_change = -1
  FROM #overlap
),
sorted_dates AS (
  SELECT
    company_id,
    date,
    count_change,
    rownum = ROW_NUMBER() OVER (PARTITION BY company_id ORDER BY date)
  FROM (
    SELECT
      company_id,
      date,
      count_change = SUM(count_change)
    FROM dates
    GROUP BY company_id, date
    HAVING SUM(count_change) <> 0
  ) s
)
SELECT
  sd1.company_id,
  start_date          = MAX(sd2.date),
  end_date            = sd1.date,
  relationship_number = MAX(sd2.rownum),
  num_contacts        = SUM(sd2.count_change)
FROM sorted_dates sd1
  INNER JOIN sorted_dates sd2 ON sd1.company_id = sd2.company_id
    AND sd2.rownum BETWEEN 1 AND sd1.rownum - 1
GROUP BY sd1.company_id, sd1.date


I believe this answer works for multiple companies. Comments are embedded into the code:

WITH company_dates (company_id, date) AS (
    /* get all relevant dates */
    SELECT company_id, relationship_started AS date
    FROM #overlap
    UNION
    SELECT company_id, relationship_ended
    FROM #overlap
)
, sequenced_dates (company_id, date, sequence) AS (
    /* assign dates a sequence for easy joining */
    SELECT company_id, date, row_number() OVER (PARTITION BY company_id ORDER BY DATE) AS sequence
    FROM company_dates
)
SELECT d1.company_id, d1.date AS start_date, d2.date AS end_date, row_number() OVER (PARTITION BY d1.company_id ORDER BY d1.date) AS relationship_number, count(*) AS numContacts
FROM sequenced_dates d1
JOIN sequenced_dates d2 /* join to get start and end date for each region */
    ON d1.company_id = d2.company_id
    AND d1.sequence = d2.sequence-1
JOIN #overlap o /* join to get all involved relationships */
    ON o.company_id = d1.company_id
    AND o.relationship_started <= d1.date
    AND o.relationship_ended >= d2.date
GROUP BY d1.company_id, d1.date, d2.date
ORDER BY d1.company_Id, d1.date
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜