Dynamic CSV load in SQL Server
OK, so I have been searching the internet for a solution but have not come up with anything yet.
What I have is a CSV - this CSV could have any number of columns, unknown in advance
e.g.
Col 1, Col 2, Col 3
I have used BULK INSERT #temp FROM ...
to insert from a CSV, but this relies on me having a table beforehand to load into - this is where the problem arises: I don't know my table structure before loading the CSV.
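Roughly what I have working today looks like this (just a sketch - the file path and the hand-built staging table are placeholders):
CREATE TABLE #temp (Col1 NVARCHAR(500), Col2 NVARCHAR(500), Col3 NVARCHAR(500));
BULK INSERT #temp
FROM 'C:\data\myfile.csv'
WITH (FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2); -- FIRSTROW = 2 skips the header row
SELECT * FROM #temp;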
Is there a way to dynamically create the table, based on the CSV, on the fly to load the data into?
Thanks Rob
I have been faced with the same task many times. What I ended up doing was writing a simple C# program for the load. I admit that each time I had to change it a little, because the requirements differed and each CSV file had its own peculiarities. This means my code most likely won't work for you straight away, but I hope it can help you a lot.
The main C# file is Program.cs. Here is its source:
using System;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
namespace CsvToSql
{
class Program
{
static string server = @"localhost";
static string database = @"test";
static bool hasHeaders = false;
static string fieldLength = "max";
static string fieldPattern = "[%fieldName%] [nvarchar](%fieldLength%) NULL,\n";
static string createTablePattern =
@"
IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[%tableName%]') AND type in (N'U'))
DROP TABLE [dbo].[%tableName%]
GO
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[%tableName%](
%fields%
) ON [PRIMARY]
";
static string commandScriptPattern =
@"sqlcmd -S %server% -E -d %database% -i %tableScriptName%
bcp %database%.dbo.%tableName% in %headerlessFileName% -c -t^^ -r \n -T -S %server%
";
private static void Main(string[] args)
{
server = System.Configuration.ConfigurationSettings.AppSettings["server"] ?? server;
database = System.Configuration.ConfigurationSettings.AppSettings["database"] ?? database;
hasHeaders = System.Configuration.ConfigurationSettings.AppSettings["hasHeaders"] == "true";
fieldLength = System.Configuration.ConfigurationSettings.AppSettings["fieldLength"] ?? fieldLength;
string[] fileNames = Directory.GetFiles(".", "*.csv");
foreach (string fileName in fileNames)
{
Console.WriteLine("Processing {0}", fileName);
Process(fileName);
}
WriteExecuteAllFile(fileNames);
WriteCleanUpFile(fileNames);
}
private static void Process(string fileName)
{
string[] fieldNames = ReadHeaders(fileName);
ProduceTableScript(fileName, fieldNames);
ProduceCommandScript(fileName);
}
private static void WriteExecuteAllFile(string[] fileNames)
{
StringBuilder sb = new StringBuilder();
foreach (string fileName in fileNames)
{
sb.Append("call ");
sb.AppendLine(GetCommandScriptName(fileName));
}
SaveStringToFile(sb.ToString(), "_all.cmd");
}
private static void WriteCleanUpFile(string[] fileNames)
{
StringBuilder sb = new StringBuilder();
foreach (string fileName in fileNames)
{
sb.Append("del ");
sb.AppendLine(GetCommandScriptName(fileName));
sb.Append("del ");
sb.AppendLine(GetHeaderlessFileName(fileName));
sb.Append("del ");
sb.AppendLine(GetTableScriptName(fileName));
}
sb.AppendLine("del _all.cmd");
sb.AppendLine("del _cleanup.cmd");
SaveStringToFile(sb.ToString(), "_cleanup.cmd");
}
private static string[] ReadHeaders(string fileName)
{
using (FileStream fs = File.OpenRead(fileName))
using (StreamReader sr = new StreamReader(fs))
{
if (hasHeaders)
{
string[] result = ParseQutationLineToList(sr.ReadLine());
ProduceHeaderlessFile(sr, fs.Name);
return result;
}
else
{
string s = sr.ReadLine();
string[] fields = ParseQutationLineToList(s);
fs.Seek(0, SeekOrigin.Begin);
sr.DiscardBufferedData();
string[] result = new string[fields.Length];
for (int i = 0; i < fields.Length; i++)
{
result[i] = "F" + (i + 1).ToString();
}
ProduceHeaderlessFile(sr, fs.Name);
return result;
}
}
}
private static void ProduceTableScript(string fileName, string[] fieldNames)
{
string tableName = GetTableName(fileName);
string fields = fieldNames.Aggregate("", (s, i) => s + fieldPattern.Replace("%fieldName%", i).Replace("%fieldLength%", fieldLength));
string table = createTablePattern.Replace("%fields%", fields).Replace("%tableName%", tableName);
SaveStringToFile(table, GetTableScriptName(fileName));
}
private static void ProduceCommandScript(string fileName)
{
string content = commandScriptPattern;
content = content.Replace("%server%", server);
content = content.Replace("%database%", database);
content = content.Replace("%tableName%", GetTableName(fileName));
content = content.Replace("%tableScriptName%", GetTableScriptName(fileName));
content = content.Replace("%headrelsessFileName%", GetHeaderlessFileName(fileName));
SaveStringToFile(content, GetCommandScriptName(fileName));
}
private static void ProduceHeaderlessFile(StreamReader sr, string basefileName)
{
string headerlessFileName = GetHeaderlessFileName(basefileName);
if (File.Exists(headerlessFileName))
{
return;
}
int counter = 0;
using(FileStream fs = File.Open(headerlessFileName, FileMode.Create, FileAccess.Write, FileShare.Read))
using (StreamWriter sw = new StreamWriter(fs))
{
while(!sr.EndOfStream)
{
//sw.WriteLine(sr.ReadLine().Replace("\"", ""));
sw.WriteLine(ParseLine(sr.ReadLine()));
counter++;
}
sw.Flush();
fs.Flush();
}
Console.WriteLine("Written {0} records to {1}", counter, headerlessFileName);
}
private static string ParseLine(string s)
{
if (s.TrimStart(' ', '\t').StartsWith("\""))
{
return ParseQutationLine(s);
}
return s.Replace(',', '^');
}
// Some tables has the default field terminator (comma) inside them
// this is why we have to parse
private static string ParseQutationLine(string s)
{
string[] fields = ParseQutationLineToList(s);
StringBuilder sb = new StringBuilder();
foreach (string field in fields)
{
sb.Append(field.Trim('"'));
sb.Append('^');
if (field.IndexOf('^') >= 0)
{
throw new ApplicationException("String contains separator character. " + s);
}
}
return sb.ToString().Substring(0, sb.Length - 1);
}
private static string[] ParseQutationLineToList(string s)
{
JouniHeikniemi.Tools.Strings.CsvReader cr = new JouniHeikniemi.Tools.Strings.CsvReader();
ArrayList result = new ArrayList();
cr.ParseCsvFields(result, s);
return (string[])result.ToArray(typeof(string));
}
private static void SaveStringToFile(string s, string fileName)
{
using (FileStream fs = File.Open(fileName, FileMode.Create, FileAccess.Write, FileShare.Read))
using (StreamWriter sw = new StreamWriter(fs))
{
sw.Write(s);
sw.Flush();
fs.Flush();
}
}
private static string GetTableName(string fileName)
{
return "_" + Path.GetFileNameWithoutExtension(fileName).Replace('.', '_');
}
private static string GetHeaderlessFileName(string fileName)
{
return Path.ChangeExtension(fileName, "inp");
}
private static string GetTableScriptName(string fileName)
{
return Path.ChangeExtension(fileName, "tbl");
}
private static string GetCommandScriptName(string fileName)
{
return Path.ChangeExtension(fileName, "cmd");
}
}
}
This file uses a library that I found on the internet for parsing CSV files. Note that I have seen valid CSV files that this library failed to parse. The text of the CsvReader.cs file follows:
using System;
using System.Collections;
using System.IO;
using System.Text;
namespace JouniHeikniemi.Tools.Strings {
/// <summary>
/// A data-reader style interface for reading Csv (and otherwise-char-separated) files.
/// </summary>
public class CsvReader : IDisposable {
#region Private variables
private Stream stream;
private StreamReader reader;
private char separator;
#endregion
#region Constructors
public CsvReader() { separator = ','; }
/// <summary>
/// Creates a new Csv reader for the given stream.
/// </summary>
/// <param name="s">The stream to read the CSV from.</param>
public CsvReader(Stream s) : this(s, null, ',') { }
/// <summary>
/// Creates a new reader for the given stream and separator.
/// </summary>
/// <param name="s">The stream to read the separator from.</param>
/// <param name="separator">The field separator character</param>
public CsvReader(Stream s, char separator) : this(s, null, separator) { }
/// <summary>
/// Creates a new Csv reader for the given stream and encoding.
/// </summary>
/// <param name="s">The stream to read the CSV from.</param>
/// <param name="enc">The encoding used.</param>
public CsvReader(Stream s, Encoding enc) : this(s, enc, ',') { }
/// <summary>
/// Creates a new reader for the given stream, encoding and separator character.
/// </summary>
/// <param name="s">The stream to read the data from.</param>
/// <param name="enc">The encoding used.</param>
/// <param name="separator">The separator character between the fields</param>
public CsvReader(Stream s, Encoding enc, char separator) {
this.separator = separator;
this.stream = s;
if (!s.CanRead) {
throw new CsvReaderException("Could not read the given data stream!");
}
reader = (enc != null) ? new StreamReader(s, enc) : new StreamReader(s);
}
/// <summary>
/// Creates a new Csv reader for the given text file path.
/// </summary>
/// <param name="filename">The name of the file to be read.</param>
public CsvReader(string filename) : this(filename, null, ',') { }
/// <summary>
/// Creates a new reader for the given text file path and separator character.
/// </summary>
/// <param name="filename">The name of the file to be read.</param>
/// <param name="separator">The field separator character</param>
public CsvReader(string filename, char separator) : this(filename, null, separator) { }
/// <summary>
/// Creates a new Csv reader for the given text file path and encoding.
/// </summary>
/// <param name="filename">The name of the file to be read.</param>
/// <param name="enc">The encoding used.</param>
public CsvReader(string filename, Encoding enc)
: this(filename, enc, ',') { }
/// <summary>
/// Creates a new reader for the given text file path, encoding and field separator.
/// </summary>
/// <param name="filename">The name of the file to be read.</param>
/// <param name="enc">The encoding used.</param>
/// <param name="separator">The field separator character.</param>
public CsvReader(string filename, Encoding enc, char separator)
: this(new FileStream(filename, FileMode.Open), enc, separator) { }
#endregion
#region Properties
/// <summary>
/// The separator character for the fields. Comma for normal CSV.
/// </summary>
public char Separator {
get { return separator; }
set { separator = value; }
}
#endregion
#region Parsing
/// <summary>
/// Returns the fields for the next row of data (or null if at eof)
/// </summary>
/// <returns>A string array of fields or null if at the end of file.</returns>
public string[] GetCsvLine() {
string data = reader.ReadLine();
if (data == null) return null;
if (data.Length == 0) return new string[0];
ArrayList result = new ArrayList();
ParseCsvFields(result, data);
return (string[])result.ToArray(typeof(string));
}
// Parses the fields and pushes the fields into the result arraylist
public void ParseCsvFields(ArrayList result, string data) {
int pos = -1;
while (pos < data.Length)
result.Add(ParseCsvField(data, ref pos));
}
// Parses the field at the given position of the data, modified pos to match
// the first unparsed position and returns the parsed field
private string ParseCsvField(string data, ref int startSeparatorPosition) {
if (startSeparatorPosition == data.Length-1) {
startSeparatorPosition++;
// The last field is empty
return "";
}
int fromPos = startSeparatorPosition + 1;
// Determine if this is a quoted field
if (data[fromPos] == '"') {
// If we're at the end of the string, let's consider this a field that
// only contains the quote
if (fromPos == data.Length-1) {
fromPos++;
return "\"";
}
// Otherwise, return a string of appropriate length with double quotes collapsed
// Note that FSQ returns data.Length if no single quote was found
int nextSingleQuote = FindSingleQuote(data, fromPos+1);
startSeparatorPosition = nextSingleQuote+1;
return data.Substring(fromPos+1, nextSingleQuote-fromPos-1).Replace("\"\"", "\"");
}
// The field ends in the next separator or EOL
int nextSeparator = data.IndexOf(separator, fromPos);
if (nextSeparator == -1) {
startSeparatorPosition = data.Length;
return data.Substring(fromPos);
}
else {
startSeparatorPosition = nextSeparator;
return data.Substring(fromPos, nextSeparator - fromPos);
}
}
// Returns the index of the next single quote mark in the string
// (starting from startFrom)
private static int FindSingleQuote(string data, int startFrom) {
int i = startFrom-1;
while (++i < data.Length)
if (data[i] == '"') {
// If this is a double quote, bypass the chars
if (i < data.Length-1 && data[i+1] == '"') {
i++;
continue;
}
else
return i;
}
// If no quote found, return the end value of i (data.Length)
return i;
}
#endregion
/// <summary>
/// Disposes the reader. The underlying stream is closed.
/// </summary>
public void Dispose() {
// Closing the reader closes the underlying stream, too
if (reader != null) reader.Close();
else if (stream != null)
stream.Close(); // In case we failed before the reader was constructed
GC.SuppressFinalize(this);
}
}
/// <summary>
/// Exception class for CsvReader exceptions.
/// </summary>
[Serializable]
public class CsvReaderException : ApplicationException {
/// <summary>
/// Constructs a new CsvReaderException.
/// </summary>
public CsvReaderException() : this("The CSV Reader encountered an error.") { }
/// <summary>
/// Constructs a new exception with the given message.
/// </summary>
/// <param name="message">The exception message.</param>
public CsvReaderException(string message) : base(message) { }
/// <summary>
/// Constructs a new exception with the given message and the inner exception.
/// </summary>
/// <param name="message">The exception message.</param>
/// <param name="inner">Inner exception that caused this issue.</param>
public CsvReaderException(string message, Exception inner) : base(message, inner) { }
/// <summary>
/// Constructs a new exception with the given serialization information.
/// </summary>
/// <param name="info"></param>
/// <param name="context"></param>
protected CsvReaderException(System.Runtime.Serialization.SerializationInfo info,
System.Runtime.Serialization.StreamingContext context)
: base(info, context) { }
}
}
I also have a config file CsvToSql.exe.config:
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<appSettings>
<add key="server" value="localhost"/>
<add key="database" value="test"/>
<!-- If your csv files have header, set this to true to generate field names from headers-->
<!-- Otherwise set it to false to generate names F1, F2, F3, etc.-->
<add key="hasHeaders" value="false"/>
<!-- This is the lenght of nvarchar field created can be a number or 'max'-->
<add key="fieldLength" value="500"/>
</appSettings>
</configuration>
And a script, build.cmd, that compiles the whole lot:
%systemroot%\Microsoft.NET\Framework\v3.5\csc.exe /out:CsvToSql.exe Program.cs CsvReader.cs
This is how I use it:
- Run build.cmd to compile CsvToSql.exe
- Edit CsvToSql.exe.config to fit your case
- Put the CSV files in the same folder as the executable and the config file
- Run CsvToSql.exe
- The executable does not connect to the database. Instead, it produces a number of files: *.tbl files are table creation scripts, *.inp files are input files for the bcp command-line utility, and *.cmd files run the table creation script and bcp for each CSV. It also produces _all.cmd, which runs the *.cmd file for every table, and _cleanup.cmd, which deletes all the files that CsvToSql.exe generates (see the example generated .cmd after this list)
- Run the _all.cmd file
- Go to your SQL Server and look at what has been produced. Make changes to the script and/or config, rinse and repeat
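For illustration, with the default config (server localhost, database test) and a hypothetical input file data.csv, the generated data.cmd would look roughly like this (comments added for explanation):
rem create the table from the generated .tbl script
sqlcmd -S localhost -E -d test -i data.tbl
rem bulk-load the headerless .inp file (fields separated by ^) into the new table
bcp test.dbo._data in data.inp -c -t^^ -r \n -T -S localhost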
There are a lot of assumptions that this script makes, and also a lot of stuff that is hardcoded. This is what I usually change quickly each time I need to load a new set of CSV files into SQL.
Good luck, and if you have any questions please don't hesitate to ask. The script requires .NET 3.5. If there is nothing extra-special about the data I'm loading, I'm usually up and running with this script in 15 minutes. If there is trouble, tweaking might take longer.
CSV parsing is non-trivial (taking into account text qualifiers, values that contain line breaks, qualifier escape mechanisms, etc.). There are several .NET libraries out there that do all this for you (e.g. http://www.codeproject.com/KB/database/CsvReader.aspx), so I would think it would be easier to use a different technology, e.g. PowerShell or SQL CLR, to make use of an existing library rather than trying to roll your own CSV parser in T-SQL...
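To sketch that approach (this assumes the LumenWorks CsvReader from the CodeProject link above; class and method names are quoted from memory, so check them against the library), you could read the header row, create a matching table of nvarchar columns, and stream the rows in with SqlBulkCopy:
using System;
using System.Data.SqlClient;
using System.IO;
using LumenWorks.Framework.IO.Csv;   // the CsvReader from the CodeProject article

class CsvBulkLoad
{
    static void Main(string[] args)
    {
        string csvPath = args[0];                                        // e.g. data.csv
        string tableName = "_" + Path.GetFileNameWithoutExtension(csvPath);
        string connStr = "Server=localhost;Database=test;Integrated Security=SSPI;";

        using (var csv = new CsvReader(new StreamReader(csvPath), true)) // true = first row is headers
        using (var conn = new SqlConnection(connStr))
        {
            conn.Open();

            // Build a CREATE TABLE from the header row; every column is nvarchar, as in the script above
            string[] headers = csv.GetFieldHeaders();
            string columns = string.Join(", ",
                Array.ConvertAll(headers, h => "[" + h + "] nvarchar(500) NULL"));
            using (var create = new SqlCommand("CREATE TABLE [" + tableName + "] (" + columns + ")", conn))
                create.ExecuteNonQuery();

            // CsvReader implements IDataReader, so SqlBulkCopy can consume it directly
            using (var bulk = new SqlBulkCopy(conn) { DestinationTableName = tableName })
                bulk.WriteToServer(csv);
        }
    }
}
SqlBulkCopy maps columns by ordinal by default, which is fine here because the table is created in the same order as the CSV headers.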
Huh, just found this nice and simple solution on an old forum post (http://forums.databasejournal.com/showthread.php?t=47966):
select *
from OpenRowset('MSDASQL', 'Driver={Microsoft Text Driver (*.txt; *.csv)}; DefaultDir=D:\;',
'select * from test1.csv')
Unfortunately, it doesn't work on recent Windows versions, where the text driver isn't installed by default...