Interpreting a primary key from table data
I have a legacy database imported into sql server 2008 r2 that contains no indexes and no primary/foreign keys for hundreds of tables (some with hundreds of columns). The primary keys which I've identified manually (some are composite) are usually high up in the column ordinal order but I'm not going to spend weeks working them all out if I can help it.
Is there any tool or script which can be used to analyse the cardinality of the data to either suggest or script out likely primary keys?
So far I've found the following script but it's throwing some errors. I will debug it to see if I can work out what is going wrong but wondering if anyone has come across a similar problem and managed to get something working.
Script text from link is below
--Natural Key Finder
--blindman, 6/2005
--This script searches up to 63 columns of any table for permutations of fields that represent unique keys within the dataset.
--The search can be limited by the maximum columns per key, and by the maximum minutes for processing.
declare @TableName varchar(50)
declare @MaxColumns int
declare @MaxMinutes int
--Get a suitable test table from the current database
/*
set @TableName =
(select top 1 sysobjects.name
from sysobjects
inner join sysindexes
on sysobjects.id = sysindexes.id
and indid in (0, 1)
inner join syscolumns on sysobjects.id = syscolumns.id
where sysobjects.type = 'U'
group by sysobjects.name,
sysindexes.rowcnt
having count(*) between 5 and 15
order by rowcnt * count(*) desc)
*/
set @TableName = 'calendarEvents' --Enter the name of the table to analyze. Do NOT enclose it in brackets: []!
set @MaxColumns = 63 --Set the maximum number of columns per key you want to search for.
set @MaxMinutes = 5 --Set a maximum time limit on the search process.
set nocount on
--Procedure variables
declare @RecordCount bigint --The number of records in the table, and the maximum possible cardinality.
declare @ColumnMask bigint --A bitmask representing the presence or absence of columns in a column set.
declare @ColumnString varchar(8000) --A comma-delimited string including all the column names in a column set.
declare @KeyLength int --The permutation size in columns currently being searched.
declare @StartTime datetime --Used to track execution time of the script.
declare @sqlstring varchar(8000) --Dynamically created sql statement.
declare @ProcessGUID char(32) --This random value will be used to name dynamically created database objects.
declare @SampleTableName varchar(50) --This value will hold the name of a dynamically created subset of the table data.
declare @SubMask1 bigint --Bitmask for storing temporary column mask.
declare @SubMask2 bigint --Bitmask for storing temporary column mask.
declare @SubMask3 bigint --Bitmask for storing temporary column mask.
declare @BitMask bigint --Temporary bitmask for stepping through a column mask to find active columns.
declare @Counter int --counts the number permutations analyzed
set @StartTime = getdate()
set @ProcessGUID = replace(newid(), '-', '') --Generate a random string.
set @SampleTableName = 'tbl' + @ProcessGUID --Generate a random table name.
set @Counter = 0
if @MaxColumns > 63 set @MaxColumns = 63
--Create a temp table to hold the record count processed through dynamic SQL
set @RecordCount =
(select rowcnt
from sysobjects
inner join sysindexes
on sysobjects.id = sysindexes.id
where sysobjects.name = @TableName
and sysindexes.indid in (1, 0))
--Create a table for listing and tracking permutations of columns.
create table #ColumnSets
(ColumnSetID int identity,
ColumnMask bigint, --a bitmask value reprenting the presence or absence of columns in a column set.
ColumnCount int, --the number of columns in the set.
Cardinality bigint, --the number of unique values in the column set.
IsUnique int, -- 0: not a naturalkey. 1: natural key.
ColumnString varchar(4000)) --comma-delimited string listing all the columns in the column set.
exec ('CREATE CLUSTERED INDEX IX_' + @ProcessGUID + '_1 ON #ColumnSets (ColumnMask)')
--Create the parent records in the ColumnSets table
insert into #ColumnSets
(ColumnMask,
ColumnCount,
Cardinality,
ColumnString)
select case when colid < 64 then power(cast(2 as bigint), colid-1) else 0 end, --Converts the colid value to a bitmap value.
1,
@RecordCount + 1, --Cardinality is unknown, so estimate 1 higher than the maximum possible cardinality.
'[' + syscolumns.name + ']'
from syscolumns
inner join sysobjects on syscolumns.id = sysobjects.id
where sysobjects.name = @TableName
and syscolumns.prec is not null
and colid < 64
--Informational message
select 'Analyzing table "' + @TableName + '" containing ' + cast(@RecordCount as varchar(20)) + ' records in ' + cast(count(*) as varchar(3)) + ' columns.'
from #ColumnSets
where ColumnCount = 1
--First we will search for large non-primary keys in a sample set of the records. If we find any large permutations that
--are known not to be unique, then we can eliminate any subsets of these permutations from processing later on.
select cast(power(@MaxMinutes, 0.5) -1 as varchar(4)) + ' minutes will be alloted for pre-searching.'
declare @SampleSize int
set @SampleSize = power(@RecordCount, (0.5)) --Sample sqrt(RecordCount)
set @SQLString = 'select top ' + cast(@SampleSize as varchar(10)) + ' * into ' + @SampleTableName + ' from ' + @TableName
exec (@SQLString)
--Start with a columnset including all columns
insert into #ColumnSets
(ColumnMask,
ColumnCount)
select sum(ColumnMask),
count(*)
from #ColumnSets
where ColumnCount = 1
set @KeyLength = (select count(*) from #ColumnSets where ColumnCount = 1)
while exists (select * from #ColumnSets where IsUnique is null and ColumnCount > 1)--isnull((Select max(ColumnCount) from #ColumnSets where IsUnique = 0), 1))
begin
--show status
select 'Largest non-unique permutation found: ' + cast(isnull(max(ColumnCount), 0) as varchar(3)) + ' columns.'
from #ColumnSets
where IsUnique = 0
--If the estimated cardinality is less than the record count, we know the column set cannot
--possibly be unique, so mark it as non-unique.
update #ColumnSets
set IsUnique = 0
where Cardinality < @RecordCount
and IsUnique is null
while exists (select * from #ColumnSets where IsUnique is null and ColumnCount > 1)--isnull((Select max(ColumnCount) from #ColumnSets where IsUnique = 0), 1))
begin
--set @KeyLength = (select max(ColumnCount) from #ColumnSets where IsUnique is null and ColumnCount > 1)
--Get the cardinality of all untested column sets
set @Counter = @Counter +
(Select Count(*)
from #ColumnSets
where IsUnique is null
and ColumnCount = (select max(ColumnCount) from #ColumnSets where IsUnique is null and ColumnCount > 1))
declare CSCursor cursor for
Select ColumnMask,
ColumnString
from #ColumnSets
where IsUnique is null
and ColumnCount = (select max(ColumnCount) from #ColumnSets where IsUnique is null and ColumnCount > 1)
Open CSCursor
Fetch next from CSCursor into @ColumnMask, @ColumnString
while @@FETCH_STATUS = 0
begin
--Create the column string for the bitmask
select @ColumnString = isnull(@ColumnString + ', ', '') + ColumnString
from #ColumnSets
where ColumnCount = 1
and ColumnMask & @ColumnMask = ColumnMask
order by ColumnString
set @SQLString = 'Update #ColumnSets set ColumnString = ''' + @ColumnString + ''', Cardinality = (select count(*) from (select distinct ' + @ColumnString + ' from ' + @SampleTableName + ') DistinctValues), IsUnique = 0 where IsUnique is null and ColumnMask = ' + cast(@ColumnMask as varchar(20))
exec (@SQLString)
fetch next from CSCursor into @ColumnMask, @ColumnString
--Break out of this loop if we have used more than half of the alloted processing time.
if dateadd(mi, power(@MaxMinutes, 0.5), @StartTime) < getdate() break
end
Close CSCursor
Deallocate CSCursor
--Any columnsets with a cardinality equal to the rowcount represent natural keys
Update #ColumnSets set IsUnique = 1 where Cardinality = @SampleSize and ColumnCount > 1
if dateadd(mi, power(@MaxMinutes, 0.5), @StartTime) < getdate() break
--Delete subsets of combinations known not to be unique.
delete CandidateSets
from #ColumnSets CandidateSets
inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask
where SuperSets.IsUnique = 0
and CandidateSets.ColumnCount > 1
and CandidateSets.ColumnCount < SuperSets.ColumnCount
end
if dateadd(mi, power(@MaxMinutes, 0.5) -1, @StartTime) < getdate() break
--Split non-minimal unique sets into three subsets. We will shuffle these subset together
--to creat new permutations for searching.
declare CSCursor cursor for
Select ColumnMask
from #ColumnSets
where IsUnique = 1
and ColumnCount >= 6
Open CSCursor
Fetch next from CSCursor into @ColumnMask
while @@FETCH_STATUS = 0
begin
set @BitMask = 1
set @SubMask1 = 0
set @SubMask2 = 0
set @SubMask3 = 0
--Note that this permutation has already been split
update #ColumnSets set IsUnique = 2 where ColumnMask = @ColumnMask
--split the TargetMask into three distinct masks
while @BitMask < @ColumnMask
begin
while @BitMask < @ColumnMask and @BitMask & @ColumnMask <> @BitMask set @BitMask = @BitMask * 2
set @SubMask1 = @SubMask1 | @BitMask
if @BitMask > @ColumnMask/2 break
set @BitMask = @BitMask * 2
while @BitMask < @ColumnMask and @BitMask & @ColumnMask <> @BitMask set @BitMask = @BitMask * 2
set @SubMask2 = @SubMask2 | @BitMask
if @BitMask > @ColumnMask/2 break
set @BitMask = @BitMask * 2
while @BitMask < @ColumnMask and @BitMask & @ColumnMask <> @BitMask set @BitMask = @BitMask * 2
set @SubMask3 = @SubMask3 | @BitMask
if @BitMask > @ColumnMask/2 break
set @BitMask = @BitMask * 2
end
insert into #ColumnSets
(ColumnMask,
ColumnCount,
Cardinality)
select @SubMask1,
Count(*),
@RecordCount + 1
from #ColumnSets
where ColumnCount = 1
and ColumnMask & @SubMask1 = ColumnMask
insert into #ColumnSets
(ColumnMask,
ColumnCount,
Cardinality)
select @SubMask2,
Count(*),
@RecordCount + 1
from #ColumnSets
where ColumnCount = 1
and ColumnMask & @SubMask2 = ColumnMask
insert into #ColumnSets
(ColumnMask,
ColumnCount,
Cardinality)
select @SubMask3,
Count(*),
@RecordCount + 1
from #ColumnSets
where ColumnCount = 1
and ColumnMask & @SubMask3 = ColumnMask
fetch next from CSCursor into @ColumnMask
end
Close CSCursor
Deallocate CSCursor
--Create larger combinations of non-unique permutations for testing
insert into #ColumnSets
(ColumnMask,
ColumnCount,
Cardinality)
select distinct
ColumnSetsA.ColumnMask | ColumnSetsB.ColumnMask,
ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount,
@RecordCount + 1
from #ColumnSets ColumnSetsA,
#ColumnSets ColumnSetsB
where ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount > isnull((Select max(ColumnCount) from #ColumnSets where IsUnique = 0), 0)
and ColumnSetsA.IsUnique = 0
and isnull(ColumnSetsB.IsUnique, 0) = 0
and ColumnSetsA.ColumnMask <> ColumnSetsB.ColumnMask
and ColumnSetsA.ColumnMa开发者_如何学Gosk & ColumnSetsB.ColumnMask = 0
--Remove duplicates
delete #ColumnSets
from #ColumnSets
left outer join --DistinctSets
(select ColumnMask,
min(ColumnSetID) ColumnSetID
from #ColumnSets
group by ColumnMask) DistinctSets
on #ColumnSets.ColumnMask = DistinctSets.ColumnMask
and #ColumnSets.ColumnSetID = DistinctSets.ColumnSetID
where DistinctSets.ColumnSetID is null
--Delete subsets of combinations known not to be unique.
delete CandidateSets
from #ColumnSets CandidateSets
inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask
where SuperSets.IsUnique = 0
and CandidateSets.ColumnCount > 1
and CandidateSets.ColumnCount < SuperSets.ColumnCount
and CandidateSets.IsUnique is null
--Delete supersets of combinations already known to be unique.
delete CandidateSets
from #ColumnSets CandidateSets
inner join #ColumnSets UniqueSets on CandidateSets.ColumnMask & UniqueSets.ColumnMask = UniqueSets.ColumnMask
where UniqueSets.IsUnique > 0
and CandidateSets.ColumnCount > UniqueSets.ColumnCount
and CandidateSets.IsUnique is null
end
delete CandidateSets
from #ColumnSets CandidateSets
inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask
where SuperSets.IsUnique = 0
and CandidateSets.ColumnCount > 1
and CandidateSets.ColumnCount < SuperSets.ColumnCount
delete
from #ColumnSets
where (IsUnique <> 0 or IsUnique is null)
and ColumnCount > 1
set @SQLString = 'drop table ' + @SampleTableName
exec (@SQLString)
--Now start building permutations of columns and checking them for uniqueness.
set @KeyLength = 1
While @KeyLength <= @MaxColumns
begin
--If there are no more candidates left, then stop looking.
if (select count(*) from #ColumnSets where IsUnique is null) = 0 break
--If the estimated cardinality is less than the record count, we know the column set cannot possibly be unique.
update #ColumnSets
set IsUnique = 0
where Cardinality < @RecordCount
and IsUnique is null
--show status
select ColumnCount,
sum(case when IsUnique is null then 1 else 0 end) as Unknown,
sum(case when IsUnique = 0 then 1 else 0 end) as NotUnique,
sum(case when IsUnique = 1 then 1 else 0 end) as IsUnique
from #ColumnSets
group by ColumnCount
order by ColumnCount
--Get the cardinality of all untested column sets
set @Counter = @Counter +
(Select Count(*)
from #ColumnSets
where Cardinality >= @RecordCount
and IsUnique is null)
declare CSCursor cursor for
Select ColumnMask,
ColumnString
from #ColumnSets
where Cardinality >= @RecordCount
and IsUnique is null
Open CSCursor
Fetch next from CSCursor into @ColumnMask, @ColumnString
while @@FETCH_STATUS = 0
begin
set @SQLString = 'Update #ColumnSets set Cardinality = (select count(*) from (select distinct ' + @ColumnString + ' from ' + @TableName + ') DistinctValues), IsUnique = 0 where IsUnique is null and ColumnMask = ' + cast(@ColumnMask as varchar(20))
exec (@SQLString)
fetch next from CSCursor into @ColumnMask, @ColumnString
if dateadd(mi, @MaxMinutes, @StartTime) < getdate() break
end
Close CSCursor
Deallocate CSCursor
--Delete any ColumnSets with fewer than two distinct values, as they cannot be part of a natural key
Delete from #ColumnSets where Cardinality < 2
--Any columnsets with a cardinality equal to the rowcount represent natural keys.
Update #ColumnSets set IsUnique = 1 where Cardinality = @RecordCount
if dateadd(mi, @MaxMinutes, @StartTime) < getdate() break
set @SQLString = 'select ColumnString as ''' + cast(@KeyLength as varchar(3)) + '-column keys found in ' + cast(datediff(second, @StartTime, getdate()) as varchar(10)) + ' seconds.'' from #ColumnSets where IsUnique = 1 and ColumnCount = ' + cast(@KeyLength as varchar(3)) + ' order by ColumnCount, ColumnString'
exec (@SQLString)
--Remove superfluous permutations (ColumnSet has same cardinality as one of its subsets)
delete ColumnSets
from #ColumnSets ColumnSets
inner join #ColumnSets SubSets
on ColumnSets.Cardinality = SubSets.Cardinality
where ColumnSets.ColumnCount > SubSets.ColumnCount
and ColumnSets.ColumnCount = @KeyLength
and ColumnSets.ColumnMask & SubSets.ColumnMask = SubSets.ColumnMask
--Identify and insert combinations of non-unique permutations that have insufficient potential cardinality to be unique
insert into #ColumnSets
(ColumnMask,
ColumnCount,
Cardinality, --estimate the cardinality as the product of the cardinalities of the component columns.
IsUnique,
ColumnString)
select ColumnSetsA.ColumnMask + ColumnSetsB.ColumnMask as ColumnMask,
ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount as ColumnCount,
min(ColumnSetsA.Cardinality * ColumnSetsB.Cardinality) as Cardinality,
0 as IsUnique,
min(ColumnSetsA.ColumnString + ', ' + ColumnSetsB.ColumnString) as ColumnString
from #ColumnSets ColumnSetsA
inner join #ColumnSets ColumnSetsB on ColumnSetsB.ColumnCount < ColumnSetsA.ColumnCount
where ColumnSetsA.ColumnCount = @KeyLength
and ColumnSetsA.Cardinality * ColumnSetsB.Cardinality <= @RecordCount
and ColumnSetsA.ColumnMask & ColumnSetsB.ColumnMask = 0
and ColumnSetsA.IsUnique = 0
and ColumnSetsB.IsUnique = 0
group by ColumnSetsA.ColumnMask + ColumnSetsB.ColumnMask,
ColumnSetsA.ColumnCount + ColumnSetsB.ColumnCount
--Insert new column sets
set @KeyLength = @KeyLength + 1
insert into #ColumnSets
(ColumnMask,
ColumnCount,
Cardinality, --estimate the cardinality as the product of the cardinalities of the component columns.
ColumnString)
select ParentSets.ColumnMask | UnarySets.ColumnMask as ColumnMask,
ParentSets.ColumnCount + UnarySets.ColumnCount as ColumnCount,
ParentSets.Cardinality * UnarySets.Cardinality as Cardinality,
ParentSets.ColumnString + ', ' + UnarySets.ColumnString as ColumnString
from #ColumnSets ParentSets
inner join #ColumnSets UnarySets on ParentSets.IsUnique = UnarySets.IsUnique
where ParentSets.IsUnique = 0
and ParentSets.ColumnCount = @KeyLength - 1
and UnarySets.ColumnCount = 1
and ParentSets.ColumnMask & UnarySets.ColumnMask = 0
--Remove duplicates
delete #ColumnSets
from #ColumnSets
left outer join --DistinctSets
(select ColumnMask,
min(ColumnString) ColumnString
from #ColumnSets
group by ColumnMask) DistinctSets
on #ColumnSets.ColumnMask = DistinctSets.ColumnMask
and #ColumnSets.ColumnString = DistinctSets.ColumnString
where DistinctSets.ColumnString is null
--Remove supersets of combinations already known to be unique.
delete CandidateSets
from #ColumnSets CandidateSets
inner join #ColumnSets UniqueSets on CandidateSets.ColumnMask & UniqueSets.ColumnMask = UniqueSets.ColumnMask
where UniqueSets.IsUnique = 1
and CandidateSets.IsUnique is null
--Remove subsets of combinations known not to be unique.
delete CandidateSets
from #ColumnSets CandidateSets
inner join #ColumnSets Supersets on CandidateSets.ColumnMask & Supersets.ColumnMask = CandidateSets.ColumnMask
where SuperSets.IsUnique = 0
and CandidateSets.IsUnique is null
end
-- Make sure that all natural keys present their columns in a uniform alphabetical order
declare CSCursor cursor for
Select ColumnMask
from #ColumnSets
where IsUnique = 1
Open CSCursor
Fetch next from CSCursor into @ColumnMask
while @@FETCH_STATUS = 0
begin
set @ColumnString = null
select @ColumnString = isnull(@ColumnString + ', ', '') + ColumnString
from #ColumnSets
where ColumnCount = 1
and ColumnMask & @ColumnMask = ColumnMask
order by ColumnString
update #ColumnSets set ColumnString = @ColumnString where ColumnMask = @ColumnMask
fetch next from CSCursor into @ColumnMask
end
Close CSCursor
Deallocate CSCursor
--Final output:
select 'Found ' + cast(count(*) as varchar(10)) + ' natural keys with ' + cast(@MaxColumns as varchar(10)) + ' or fewer columns in ' + cast(datediff(minute, @StartTime, getdate()) as varchar(10)) + ' minutes.' from #ColumnSets where IsUnique = 1
select ColumnString as NaturalKeys
from #ColumnSets
where IsUnique = 1
order by ColumnCount, ColumnString
select convert(varchar(20), @Counter) + ' permutations tested.'
drop table #ColumnSets
If you are embracing the Microsoft Suite a possibility is to use SSIS. Intergration services has a data profiling task.
This site has the basics on how to use it: http://consultingblogs.emc.com/jamiethomson/archive/2008/03/04/ssis-data-profiling-task-part-8-candidate-key.aspx
As a caution it can take a very long time depending on table size and the number of columns to include in your candidate key. The output is very useful for identifying the strength of keys. In your case you would be looking for items that are 100%.
精彩评论