Standard Deviation for SQLite
I've searched the SQ开发者_如何学编程Lite docs and couldn't find anything, but I've also searched on Google and a few results appeared.
Does SQLite have any built-in Standard Deviation function?
You can calculate the variance in SQL:
create table t (row int);
insert into t values (1),(2),(3);
SELECT AVG((t.row - sub.a) * (t.row - sub.a)) as var from t,
(SELECT AVG(row) AS a FROM t) AS sub;
0.666666666666667
However, you still have to calculate the square root to get the standard deviation.
The aggregate functions supported by SQLite are here:
http://www.sqlite.org/lang_aggfunc.html
STDEV is not in the list.
However, the module extension-functions.c
in this page contains a STDEV function.
There is still no built-in stdev function in sqlite. However, you can define (as Alix has done) a user-defined aggregator function. Here is a complete example in Python:
import sqlite3
import math
class StdevFunc:
def __init__(self):
self.M = 0.0
self.S = 0.0
self.k = 1
def step(self, value):
if value is None:
return
tM = self.M
self.M += (value - tM) / self.k
self.S += (value - tM) * (value - self.M)
self.k += 1
def finalize(self):
if self.k < 3:
return None
return math.sqrt(self.S / (self.k-2))
with sqlite3.connect(':memory:') as con:
con.create_aggregate("stdev", 1, StdevFunc)
cur = con.cursor()
cur.execute("create table test(i)")
cur.executemany("insert into test(i) values (?)", [(1,), (2,), (3,), (4,), (5,)])
cur.execute("insert into test(i) values (null)")
cur.execute("select avg(i) from test")
print("avg: %f" % cur.fetchone()[0])
cur.execute("select stdev(i) from test")
print("stdev: %f" % cur.fetchone()[0])
This will print:
avg: 3.000000
stdev: 1.581139
Compare with MySQL: http://sqlfiddle.com/#!2/ad42f3/3/0
Use variance formula V(X) = E(X^2) - E(X)^2. In SQL sqlite
SELECT AVG(col*col) - AVG(col)*AVG(col) FROM table
To get standard deviation you need to take the square root V(X)^(1/2)
I implemented the Welford's method (the same as extension-functions.c
) as a SQLite UDF:
$db->sqliteCreateAggregate('stdev',
function (&$context, $row, $data) // step callback
{
if (isset($context) !== true) // $context is null at first
{
$context = array
(
'k' => 0,
'm' => 0,
's' => 0,
);
}
if (isset($data) === true) // the standard is non-NULL values only
{
$context['s'] += ($data - $context['m']) * ($data - ($context['m'] += ($data - $context['m']) / ++$context['k']));
}
return $context;
},
function (&$context, $row) // fini callback
{
if ($context['k'] > 0) // return NULL if no non-NULL values exist
{
return sqrt($context['s'] / $context['k']);
}
return null;
},
1);
That's in PHP ($db
is the PDO object) but it should be trivial to port to another language.
SQLite is soooo cool. <3
a little trick
select ((sum(value)*sum(value) - sum(value * value))/((count(*)-1)*(count(*))))
from the_table ;
then the only thing left is to calculate sqrt outside.
No, I searched this same issue, and ended having to do the calculations with my application (PHP)
added some error detection in the python functions
class StdevFunc:
"""
For use as an aggregate function in SQLite
"""
def __init__(self):
self.M = 0.0
self.S = 0.0
self.k = 0
def step(self, value):
try:
# automatically convert text to float, like the rest of SQLite
val = float(value) # if fails, skips this iteration, which also ignores nulls
tM = self.M
self.k += 1
self.M += ((val - tM) / self.k)
self.S += ((val - tM) * (val - self.M))
except:
pass
def finalize(self):
if self.k <= 1: # avoid division by zero
return none
else:
return math.sqrt(self.S / (self.k-1))
You don't state which version of standard deviation you wish to calculate but variances (standard deviation squared) for either version can be calculated using a combination of the sum() and count() aggregate functions.
select
(count(val)*sum(val*val) - (sum(val)*sum(val)))/((count(val)-1)*(count(val))) as sample_variance,
(count(val)*sum(val*val) - (sum(val)*sum(val)))/((count(val))*(count(val))) as population_variance
from ... ;
It will still be necessary to take the square root of these to obtain the standard deviation.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#Values produced by this script can be verified by follwing the steps
#found at https://support.microsoft.com/en-us/kb/213930 to Verify
#by chosing a non memory based database.
import sqlite3
import math
import random
import os
import sys
import traceback
import random
class StdevFunc:
def __init__(self):
self.M = 0.0 #Mean
self.V = 0.0 #Used to Calculate Variance
self.S = 0.0 #Standard Deviation
self.k = 1 #Population or Small
def step(self, value):
try:
if value is None:
return None
tM = self.M
self.M += (value - tM) / self.k
self.V += (value - tM) * (value - self.M)
self.k += 1
except Exception as EXStep:
pass
return None
def finalize(self):
try:
if ((self.k - 1) < 3):
return None
#Now with our range Calculated, and Multiplied finish the Variance Calculation
self.V = (self.V / (self.k-2))
#Standard Deviation is the Square Root of Variance
self.S = math.sqrt(self.V)
return self.S
except Exception as EXFinal:
pass
return None
def Histogram(Population):
try:
BinCount = 6
More = 0
#a = 1 #For testing Trapping
#b = 0 #and Trace Back
#c = (a / b) #with Detailed Info
#If you want to store the Database
#uncDatabase = os.path.join(os.getcwd(),"BellCurve.db3")
#con = sqlite3.connect(uncDatabase)
#If you want the database in Memory
con = sqlite3.connect(':memory:')
#row_factory allows accessing fields by Row and Col Name
con.row_factory = sqlite3.Row
#Add our Non Persistent, Runtime Standard Deviation Function to the Database
con.create_aggregate("Stdev", 1, StdevFunc)
#Lets Grab a Cursor
cur = con.cursor()
#Lets Initialize some tables, so each run with be clear of previous run
cur.executescript('drop table if exists MyData;') #executescript requires ; at the end of the string
cur.execute("create table IF NOT EXISTS MyData('ID' INTEGER PRIMARY KEY AUTOINCREMENT, 'Val' FLOAT)")
cur.executescript('drop table if exists Bins;') #executescript requires ; at the end of the string
cur.execute("create table IF NOT EXISTS Bins('ID' INTEGER PRIMARY KEY AUTOINCREMENT, 'Bin' UNSIGNED INTEGER, 'Val' FLOAT, 'Frequency' UNSIGNED BIG INT)")
#Lets generate some random data, and insert in to the Database
for n in range(0,(Population)):
sql = "insert into MyData(Val) values ({0})".format(random.uniform(-1,1))
#If Whole Number Integer greater that value of 2, Range Greater that 1.5
#sql = "insert into MyData(Val) values ({0})".format(random.randint(-1,1))
cur.execute(sql)
pass
#Now let’s calculate some built in Aggregates, that SQLite comes with
cur.execute("select Avg(Val) from MyData")
Average = cur.fetchone()[0]
cur.execute("select Max(Val) from MyData")
Max = cur.fetchone()[0]
cur.execute("select Min(Val) from MyData")
Min = cur.fetchone()[0]
cur.execute("select Count(Val) from MyData")
Records = cur.fetchone()[0]
#Now let’s get Standard Deviation using our function that we added
cur.execute("select Stdev(Val) from MyData")
Stdev = cur.fetchone()[0]
#And Calculate Range
Range = float(abs(float(Max)-float(Min)))
if (Stdev == None):
print("================================ Data Error ===============================")
print(" Insufficient Population Size, Or Bad Data.")
print("*****************************************************************************")
elif (abs(Max-Min) == 0):
print("================================ Data Error ===============================")
print(" The entire Population Contains Identical values, Distribution Incalculable.")
print("******************************************************************************")
else:
Bin = [] #Holds the Bin Values
Frequency = [] #Holds the Bin Frequency for each Bin
#Establish the 1st Bin, which is based on (Standard Deviation * 3) being subtracted from the Mean
Bin.append(float((Average - ((3 * Stdev)))))
Frequency.append(0)
#Establish the remaining Bins, which is basically adding 1 Standard Deviation
#for each interation, -3, -2, -1, 1, 2, 3
for b in range(0,(BinCount) + 1):
Bin.append((float(Bin[(b)]) + Stdev))
Frequency.append(0)
for b in range(0,(BinCount) + 1):
#Lets exploit the Database and have it do the hard work calculating distribution
#of all the Bins, with SQL's between operator, but making it left inclusive, right exclusive.
sqlBinFreq = "select count(*) as Frequency from MyData where val between {0} and {1} and Val < {2}". \
format(float((Bin[b])), float(Bin[(b + 1)]), float(Bin[(b + 1)]))
#If the Database Reports Values that fall between the Current Bin, Store the Frequency to a Bins Table.
for rowBinFreq in cur.execute(sqlBinFreq):
Frequency[(b + 1)] = rowBinFreq['Frequency']
sqlBinFreqInsert = "insert into Bins (Bin, Val, Frequency) values ({0}, {1}, {2})". \
format(b, float(Bin[b]), Frequency[(b)])
cur.execute(sqlBinFreqInsert)
#Allthough this Demo is not likley produce values that
#fall outside of Standard Distribution
#if this demo was to Calculate with real data, we want to know
#how many non-Standard data points we have.
More = (More + Frequency[b])
More = abs((Records - More))
#Add the More value
sqlBinFreqInsert = "insert into Bins (Bin, Val, Frequency) values ({0}, {1}, {2})". \
format((BinCount + 1), float(0), More)
cur.execute(sqlBinFreqInsert)
#Now Report the Analysis
print("================================ The Population ==============================")
print(" {0} {1} {2} {3} {4} {5}". \
format("Size".rjust(10, ' '), \
"Max".rjust(10, ' '), \
"Min".rjust(10, ' '), \
"Mean".rjust(10, ' '), \
"Range".rjust(10, ' '), \
"Stdev".rjust(10, ' ')))
print("Aggregates: {0:10d} {1:10.4f} {2:10.4f} {3:10.4f} {4:10.4f} {5:10.4f}". \
format(Population, Max, Min, Average, Range, Stdev))
print("================================= The Bell Curve =============================")
LabelString = "{0} {1} {2} {3}". \
format("Bin".ljust(8, ' '), \
"Ranges".rjust(8, ' '), \
"Frequency".rjust(8, ' '), \
"Histogram".rjust(6, ' '))
print(LabelString)
print("------------------------------------------------------------------------------")
#Let's Paint a Histogram
sqlChart = "select * from Bins order by Bin asc"
for rowChart in cur.execute(sqlChart):
if (rowChart['Bin'] == 7):
#Bin 7 is not really a bin, but where we place the values that did not fit into the
#Normal Distribution. This script was tested against Excel's Bell Curve Example
#https://support.microsoft.com/en-us/kb/213930
#and produces the same results. Feel free to test it.
BinName = "More"
ChartString = "{0:<6} {1:<10} {2:10.0f}". \
format(BinName, \
"", \
More)
else:
#Theses are the actual bins where values fall within the distribution.
BinName = (rowChart['Bin'] + 1)
#Scale the Chart
fPercent = ((float(rowChart['Frequency']) / float(Records) * 100))
iPrecent = int(math.ceil(fPercent))
ChartString = "{0:<6} {1:10.4f} {2:10.0f} {3}". \
format(BinName, \
rowChart['Val'], \
rowChart['Frequency'], \
"".rjust(iPrecent, '#'))
print(ChartString)
print("******************************************************************************")
#Commit to Database
con.commit()
#Clean Up
cur.close()
con.close()
except Exception as EXBellCurve:
pass
TraceInfo = traceback.format_exc()
raise Exception(TraceInfo)
精彩评论