SQL comparing sets, part II: How to join sets of sets
This question reminded me of a couple related problems with whole-set comparison. Given:
- a
collection
of sets, and - a
probe
set
Three questions:
- How do you find all sets in
collection
that matchprobe
, element for element? - How do you find all sets in
collection
that match a collection ofprobe
s, without the use of explicit looping constructs? How do you join sets of sets? - Is this relational division? If not, what is it?
I have a decent solution to question 1 (see below).
I don't have a decent relational solution to question 2. Any takers?
Test data:
IF OBJECT_ID('tempdb..#elements') IS NOT NULL DROP TABLE #elements
IF OBJECT_ID('tempdb..#sets') IS NOT NULL DROP TABLE #sets
CREATE TABLE #sets (set_no INT, PRIMARY KEY (set_no))
CREATE TABLE #elements (set_no INT, elem CHAR(1), PRIMARY KEY (set_no, elem))
INSERT #elements VALUES (1, 'A')
INSERT #elements VALUES (1, 'B')
INSERT #elements VALUES (1, 'C')
INSERT #elements VALUES (1, 'D')
INSERT #elements VALUES (1, 'E')
INSERT #elements VALUES (1, 'F')
INSERT #elements VALUES (2, 'A')
INSERT #elements VALUES (2, 'B')
INSERT #elements VALUES (2, 'C')
INSERT #elements VALUES (3, 'D')
INSERT #elements VALUES (3, 'E')
INSERT #elements VALUES (3, 'F')
INSERT #elements VALUES (4, 'B')
INSERT #elements VALUES (4, 'C')
INSERT #elements VALUES (4, 'F')
INSERT #elements VALUES (5, 'F')
INSERT #sets SELECT DISTINCT set_no FROM #elements
Setup and solution for question 1, set lookup:
IF OBJECT_ID('tempdb..#probe') IS NOT NULL DROP TABLE #probe
CREATE TABLE #probe (elem CHAR(1) PRIMARY KEY (elem))
INS开发者_开发技巧ERT #probe VALUES ('B')
INSERT #probe VALUES ('C')
INSERT #probe VALUES ('F')
-- I think this works.....upvotes for anyone who can demonstrate otherwise
SELECT set_no FROM #sets s
WHERE NOT EXISTS (
SELECT * FROM #elements i WHERE i.set_no = s.set_no AND NOT EXISTS (
SELECT * FROM #probe p WHERE p.elem = i.elem))
AND NOT EXISTS (
SELECT * FROM #probe p WHERE NOT EXISTS (
SELECT * FROM #elements i WHERE i.set_no = s.set_no AND i.elem = p.elem))
Setup for question 2, no solution:
IF OBJECT_ID('tempdb..#multi_probe') IS NOT NULL DROP TABLE #multi_probe
CREATE TABLE #multi_probe (probe_no INT, elem CHAR(1) PRIMARY KEY (probe_no, elem))
INSERT #multi_probe VALUES (1, 'B')
INSERT #multi_probe VALUES (1, 'C')
INSERT #multi_probe VALUES (1, 'F')
INSERT #multi_probe VALUES (2, 'C')
INSERT #multi_probe VALUES (2, 'F')
INSERT #multi_probe VALUES (3, 'A')
INSERT #multi_probe VALUES (3, 'B')
INSERT #multi_probe VALUES (3, 'C')
-- some magic here.....
-- result set:
-- probe_no | set_no
------------|--------
-- 1 | 4
-- 3 | 2
OK, let's solve question 2 step by step:
(1) Inner join sets and probes on their individual elements. This way we'll see how do test sets and probe sets relate (which sets have what elements in common with which probe):
SELECT
e.set_no AS [test set],
m.set_no AS [probe set],
e.elem [common element]
FROM
@elements e
JOIN
@multi_probe m ON e.elem = m.elem
Result:
test set probe set common element
----------- ----------- --------------
1 3 A
1 1 B
1 3 B
1 1 C
1 2 C
1 3 C
1 1 F
1 2 F
2 3 A
2 1 B
2 3 B
2 1 C
2 2 C
2 3 C
3 1 F
3 2 F
4 1 B
4 3 B
4 1 C
4 2 C
4 3 C
4 1 F
4 2 F
5 1 F
5 2 F
(2) Count how many common elements between each test set and probe set (inner joins mean we already left the "no matches" aside)
SELECT
e.set_no AS [test set],
m.set_no AS [probe set],
COUNT(*) AS [common element count]
FROM
@elements e
JOIN
@multi_probe m ON e.elem = m.elem
GROUP BY
e.set_no, m.set_no
ORDER BY
e.set_no, m.set_no
Result:
test set probe set common element count
----------- ----------- --------------------
1 1 3
1 2 2
1 3 3
2 1 2
2 2 1
2 3 3
3 1 1
3 2 1
4 1 3
4 2 2
4 3 2
5 1 1
5 2 1
(3) Bring the counts of the test set and probe set on each row (subqueries may not be the most elegant)
SELECT
e.set_no AS [test set],
m.set_no AS [probe set],
COUNT(*) AS [common element count],
(SELECT COUNT(*) FROM @elements e1 WHERE e1.set_no = e.set_no) AS [test set count],
(SELECT COUNT(*) FROM @multi_probe m1 WHERE m1.set_no = m.set_no) AS [probe set count]
FROM
@elements e
JOIN @multi_probe m ON e.elem = m.elem
GROUP BY
e.set_no, m.set_no
ORDER BY
e.set_no, m.set_no
Result:
test set probe set common element count test set count probe set count
----------- ----------- -------------------- -------------- ---------------
1 1 3 6 3
1 2 2 6 2
1 3 3 6 3
2 1 2 3 3
2 2 1 3 2
2 3 3 3 3
3 1 1 3 3
3 2 1 3 2
4 1 3 3 3
4 2 2 3 2
4 3 2 3 3
5 1 1 1 3
5 2 1 1 2
(4) Find the solution: only retain those test sets and probe sets that have the same number of elements AND this number is also the number of common elements, i.e. the test set and the probe set are identical
SELECT
e.set_no AS [test set],
m.set_no AS [probe set]
FROM
@elements e
JOIN
@multi_probe m ON e.elem = m.elem
GROUP BY
e.set_no, m.set_no
HAVING
COUNT(*) = (SELECT COUNT(*) FROM @elements e1 WHERE e1.set_no = e.set_no)
AND (SELECT COUNT(*) FROM @elements e1 WHERE e1.set_no = e.set_no) = (SELECT COUNT(*) FROM @multi_probe m1 WHERE m1.set_no = m.set_no)
ORDER BY
e.set_no, m.set_no
Result:
test set probe set
----------- -----------
2 3
4 1
Excuse the @
s instead of #
s, I like table variables better :)
May I submit a more "mathematically inclined" solution to question (1), in SQL Server syntax:
SELECT
s.set_no
FROM
#sets s
JOIN @elements e ON s.set_no = e.set_no
LEFT JOIN #probe p ON e.elem = p.elem
GROUP BY
s.set_no
HAVING
COUNT(DISTINCT p.elem) = COUNT(*)
AND COUNT(*) = (SELECT COUNT(*) FROM #probe)
COUNT(*)
will always denote the number of elements in each test set (because of theLEFT JOIN
)COUNT(DISTINCT p.elem)
will denote the number of "matches" between an element in the test set and an element in the probe set (because theNULL
s will not be counted), i.e. how many elements in the probe set were also present in the test set
Translated into mathematical terms COUNT(DISTINCT p.elem) = COUNT(*)
would express that the test set is a subset of the probe set ( test ⊆ probe
) while COUNT(*) = (SELECT COUNT(*) FROM #probe)
would express that the cardinality of the test set is equal to the cardinality of the probe set ( |test| = |probe|
). From these two conditions we conclude that test = probe
.
[Answering my own question....]
First, the solution. The EXCEPT syntax can handle multiple columns and NULLs gracefully, so this is closer to a general solution:
SELECT
s.set_no AS test_set_no
, p.set_no AS probe_set_no
FROM #test_sets s CROSS JOIN #probe_sets p
WHERE NOT EXISTS (
SELECT elem FROM #test_elements te WHERE te.set_no = s.set_no EXCEPT
SELECT elem FROM #probe_elements pe WHERE pe.set_no = p.set_no)
AND NOT EXISTS (
SELECT elem FROM #probe_elements pe WHERE pe.set_no = p.set_no EXCEPT
SELECT elem FROM #test_elements te WHERE te.set_no = s.set_no)
ORDER BY
test_set_no
, probe_set_no
Next, the revised data set:
IF OBJECT_ID('tempdb..#test_elements') IS NOT NULL DROP TABLE #test_elements
IF OBJECT_ID('tempdb..#test_sets') IS NOT NULL DROP TABLE #test_sets
CREATE TABLE #test_sets (set_no INT, PRIMARY KEY (set_no))
CREATE TABLE #test_elements (set_no INT, elem CHAR(1), PRIMARY KEY (set_no, elem))
INSERT #test_elements VALUES (1, 'A')
INSERT #test_elements VALUES (1, 'B')
INSERT #test_elements VALUES (1, 'C')
INSERT #test_elements VALUES (1, 'D')
INSERT #test_elements VALUES (1, 'E')
INSERT #test_elements VALUES (1, 'F')
INSERT #test_elements VALUES (2, 'A')
INSERT #test_elements VALUES (2, 'B')
INSERT #test_elements VALUES (2, 'C')
INSERT #test_elements VALUES (3, 'D')
INSERT #test_elements VALUES (3, 'E')
INSERT #test_elements VALUES (3, 'F')
INSERT #test_elements VALUES (4, 'B')
INSERT #test_elements VALUES (4, 'C')
INSERT #test_elements VALUES (4, 'F')
INSERT #test_elements VALUES (5, 'F')
INSERT #test_sets SELECT DISTINCT set_no FROM #test_elements
IF OBJECT_ID('tempdb..#probe_elements') IS NOT NULL DROP TABLE #probe_elements
IF OBJECT_ID('tempdb..#probe_sets') IS NOT NULL DROP TABLE #probe_sets
CREATE TABLE #probe_sets (set_no INT PRIMARY KEY (set_no))
CREATE TABLE #probe_elements (set_no INT, elem CHAR(1) PRIMARY KEY (set_no, elem))
INSERT #probe_elements VALUES (1, 'B')
INSERT #probe_elements VALUES (1, 'C')
INSERT #probe_elements VALUES (1, 'F')
INSERT #probe_elements VALUES (2, 'C')
INSERT #probe_elements VALUES (2, 'F')
INSERT #probe_elements VALUES (3, 'A')
INSERT #probe_elements VALUES (3, 'B')
INSERT #probe_elements VALUES (3, 'C')
INSERT #probe_sets SELECT DISTINCT set_no FROM #probe_elements
By comparison, using aggregates, as per CyberDude:
SELECT
e.set_no AS [test set]
, m.set_no AS [probe set]
FROM #test_elements e
JOIN #probe_elements m ON e.elem = m.elem
GROUP BY
e.set_no
, m.set_no
HAVING (SELECT COUNT(*) FROM #test_elements e1 WHERE e1.set_no = e.set_no)
= (SELECT COUNT(*) FROM #probe_elements m1 WHERE m1.set_no = m.set_no)
AND (SELECT COUNT(*) FROM #test_elements e1 WHERE e1.set_no = e.set_no)
= COUNT(*)
ORDER BY
e.set_no
, m.set_no
精彩评论