Splitting a string only when the delimiter is not enclosed in quotation marks
I need to write a split function in JavaScript that splits a string into an array on commas, but only on commas that are not enclosed in quotation marks (' or ").
Here are three examples and how the result (an array) should be:
"peanut, butter, jelly"
-> ["peanut", "butter", "jelly"]
"peanut, 'butter, bread', 'jelly'"
-> ["peanut", "butter, bread", "jelly"]
'peanut, "butter, bread", "jelly"'
-> ["peanut", 'butter, bread', "jelly"]
The reason I cannot use JavaScript's split
method is because it also splits when the delimiter is enclosed in quotation marks.
How can I accomplish this, maybe with a regular expression ?
As for the context: I will be using this to split the arguments found in the third element of the third argument passed to the function you create when extending jQuery's $.expr[':']. This parameter is normally named meta, and it is an array that contains certain info about the filter.
Anyway, the third element of this array is a string that contains the parameters passed with the filter; since the parameters are in string format, I need to be able to split them correctly for parsing.
What you are asking for is essentially a Javascript CSV parser. Do a Google search on "Javascript CSV Parser" and you'll get lots of hits, many with complete scripts. See also Javascript code to parse CSV data
Well, I already have a jackhammer of a solution written (general code written for something else), so just for kicks . . .
/**
 * Regex-driven lexer.
 *
 * Rules (a regex plus a "lexeme") are registered with addRule; lex() then
 * walks a string and, at each position, picks the rule whose match starts
 * earliest (ties go to the longest match) and emits that rule's lexeme.
 *
 * Constructor arguments are optional flags, in any order:
 *   - Lexer.USE_NEW            call function lexemes with `new`
 *   - Lexer.SET_INDEX          tag emitted lexemes under Lexer.DEFAULT_INDEX
 *   - Lexer.SET_INDEX(name)    tag emitted lexemes under the property `name`
 * Any other argument is ignored.
 */
function Lexer() {
  // false = do not tag lexemes; otherwise the property name used for tagging.
  this.setIndex = false;
  this.useNew = false;
  for (var i = 0; i < arguments.length; ++i) {
    var arg = arguments[i];
    if (arg === Lexer.USE_NEW) {
      this.useNew = true;
    } else if (arg === Lexer.SET_INDEX) {
      this.setIndex = Lexer.DEFAULT_INDEX;
    } else if (arg instanceof Lexer.SET_INDEX) {
      this.setIndex = arg.indexProp;
    }
  }
  this.rules = [];
  this.errorLexeme = null;
}

// Sentinel lexeme: anything emitted as NULL_LEXEME is removed from lex() output.
Lexer.NULL_LEXEME = {};

// Ready-made sentinel that callers can pass to setErrorLexeme.
Lexer.ERROR_LEXEME = {
  toString: function () {
    return "[object Lexer.ERROR_LEXEME]";
  }
};

// Property name used for index tagging when none is given.
Lexer.DEFAULT_INDEX = "index";

// Constructor flag: invoke function lexemes as constructors.
Lexer.USE_NEW = {};

/**
 * Constructor flag carrying the property name for index tagging.
 * Works with or without `new`.
 *
 * @param {string} [indexProp] property name; defaults to Lexer.DEFAULT_INDEX
 */
Lexer.SET_INDEX = function SetIndex(indexProp) {
  // A named function expression replaces arguments.callee, which throws in
  // strict mode; the old `new arguments.callee.apply(...)` also tried to
  // construct `apply` itself and therefore always threw.
  if (!(this instanceof SetIndex)) {
    return new SetIndex(indexProp);
  }
  if (indexProp === undefined) {
    indexProp = Lexer.DEFAULT_INDEX;
  }
  this.indexProp = indexProp;
};

(function () {
  /**
   * Equivalent of `new Ctor(args[0], args[1], ...)` for an argument array.
   * Uses bind instead of building source text with `new Function`.
   */
  function construct(Ctor, args) {
    var Bound = Function.prototype.bind.apply(Ctor, [null].concat(args));
    return new Bound();
  }

  // RegExp boolean property -> flag letter. A property an engine does not
  // know reads as undefined, so its letter is simply never emitted there.
  var flagMap = [
    ["global", "g"],
    ["ignoreCase", "i"],
    ["multiline", "m"],
    ["dotAll", "s"],
    ["unicode", "u"],
    ["sticky", "y"]
  ];

  /** Rebuilds the flag string of `regex` from its boolean properties. */
  function getFlags(regex) {
    var flags = "";
    for (var i = 0; i < flagMap.length; ++i) {
      if (regex[flagMap[i][0]]) {
        flags += flagMap[i][1];
      }
    }
    return flags;
  }

  /** Returns a predicate that is true for every value not identical to x. */
  function not(x) {
    return function (y) {
      return x !== y;
    };
  }

  /** True when a property can be stored on `value` (object or function). */
  function canHoldProperty(value) {
    return value !== null &&
      (typeof value === "object" || typeof value === "function");
  }

  /**
   * A regex/lexeme pair. The regex must be global so lex() can position it
   * through lastIndex; a non-global regex is recompiled with "g" added and
   * its other flags kept.
   */
  function Rule(regex, lexeme) {
    if (!regex.global) {
      regex = new RegExp(regex.source, "g" + getFlags(regex));
    }
    this.regex = regex;
    this.lexeme = lexeme;
  }

  Lexer.prototype = {
    constructor: Lexer,

    /**
     * Registers a rule.
     * @param {RegExp} regex pattern to look for
     * @param {*} lexeme value to emit, or a function producing it
     */
    addRule: function (regex, lexeme) {
      this.rules.push(new Rule(regex, lexeme));
    },

    /**
     * Sets the lexeme emitted for input that no rule covers. While it is
     * null (the default) such input is skipped silently.
     */
    setErrorLexeme: function (lexeme) {
      this.errorLexeme = lexeme;
    },

    /**
     * Turns a rule's lexeme into the emitted value. Non-function lexemes are
     * returned unchanged; functions are called with the match, its capture
     * groups, the match index and the input string (with `new` when the
     * lexer was created with Lexer.USE_NEW).
     */
    runLexeme: function (lexeme, exec) {
      if (typeof lexeme !== "function") {
        return lexeme;
      }
      var args = exec.concat(exec.index, exec.input);
      if (this.useNew) {
        return construct(lexeme, args);
      }
      return lexeme.apply(null, args);
    },

    /**
     * Splits `str` into lexemes.
     * @param {string} str input to scan
     * @returns {Array} emitted lexemes with Lexer.NULL_LEXEME entries removed
     */
    lex: function (str) {
      var index = 0;
      var lexemes = [];
      // Captured here because inside the push override `this` is the array,
      // not the lexer.
      var indexProp = this.setIndex;
      if (indexProp) {
        lexemes.push = function () {
          for (var i = 0; i < arguments.length; ++i) {
            // Primitives cannot carry a property (and assigning one throws
            // in strict mode), so only objects and functions are tagged.
            if (canHoldProperty(arguments[i])) {
              arguments[i][indexProp] = index;
            }
          }
          return Array.prototype.push.apply(this, arguments);
        };
      }
      while (index < str.length) {
        var bestExec = null;
        var bestRule = null;
        for (var i = 0; i < this.rules.length; ++i) {
          var rule = this.rules[i];
          rule.regex.lastIndex = index;
          var exec = rule.regex.exec(str);
          if (exec) {
            // Prefer the earliest match; on a tie prefer the longest.
            var doUpdate = !bestExec ||
              exec.index < bestExec.index ||
              (exec.index === bestExec.index &&
                exec[0].length > bestExec[0].length);
            if (doUpdate) {
              bestExec = exec;
              bestRule = rule;
            }
          }
        }
        if (!bestExec) {
          if (this.errorLexeme) {
            lexemes.push(this.errorLexeme);
            break;
          }
          // Nothing matched from here: step over one character. There is no
          // winning rule to read a position from, so continue directly.
          ++index;
          continue;
        }
        if (this.errorLexeme && index !== bestExec.index) {
          // Input between the scan position and the match was not covered.
          lexemes.push(this.errorLexeme);
        }
        lexemes.push(this.runLexeme(bestRule.lexeme, bestExec));
        var end = bestExec.index + bestExec[0].length;
        // An empty match would leave the scan position unchanged forever,
        // so force one character of progress in that case.
        index = bestExec[0].length > 0 ? end : end + 1;
      }
      return lexemes.filter(not(Lexer.NULL_LEXEME));
    }
  };
})();
// Fallback for engines that ship without Array.prototype.filter.
if (!Array.prototype.filter) {
  /**
   * Returns a new array holding the elements for which `fun` returns a
   * truthy value. An optional second argument is used as `this` inside
   * `fun`. Indexes that are not present on the array are skipped.
   */
  Array.prototype.filter = function (fun) {
    var count = this.length >>> 0;
    var kept = [];
    var context = arguments[1];
    var pos = 0;
    while (pos < count) {
      if (pos in this) {
        // Read the element before calling fun, in case fun changes the array.
        var item = this[pos];
        if (fun.call(context, item, pos, this)) {
          kept.push(item);
        }
      }
      ++pos;
    }
    return kept;
  };
}
Now to use the code for your problem:
/**
 * Removes leading and trailing whitespace from a string.
 *
 * @param {string} str text to clean up
 * @returns {string} `str` without whitespace at either end
 */
function trim(str) {
  var withoutLeading = str.replace(/^\s+/, "");
  return withoutLeading.replace(/\s+$/, "");
}
// Lexer configured to split on commas that sit outside quotation marks.
var splitter = new Lexer();
// Input that no rule covers is reported with the shared error sentinel.
splitter.setErrorLexeme(Lexer.ERROR_LEXEME);
// Field with one double-quoted section; commas are allowed inside the quotes.
splitter.addRule(/[^,"]*"[^"]*"[^,"]*/g, trim);
// Same shape for a single-quoted section.
splitter.addRule(/[^,']*'[^']*'[^,']*/g, trim);
// Plain field without any quote characters.
splitter.addRule(/[^,"']+/g, trim);
// Separators are emitted as NULL_LEXEME, which lex() removes from its result.
splitter.addRule(/,/g, Lexer.NULL_LEXEME);

// Sample inputs taken from the question.
var strs = [
  "peanut, butter, jelly",
  "peanut, 'butter, bread', 'jelly'",
  'peanut, "butter, bread", "jelly"'
];

// NOTE: Array.prototype.map is used for brevity; it is not supported in
// all browsers.
var splitStrs = strs.map(splitter.lex, splitter);
// Sample input: bare words plus one double-quoted field that contains a comma.
var str = 'text, foo, "haha, dude", bar';
// Two alternatives: a run of lowercase ASCII letters, or a quote character
// (captured as group 1) followed lazily by any characters up to the next
// occurrence of that same quote character (\1). Quoted matches therefore
// include their surrounding quote characters. The g flag makes match()
// return every match.
var fragments = str.match(/[a-z]+|(['"]).*?\1/g);
Even better (supports escaped " or ' inside the strings):
// Sample input. The backslash before the inner double quote is written as
// \\ so that it is really part of the string; a lone \" inside a
// single-quoted literal is just a double quote.
var str = 'text_123 space, foo, "text, here\\", dude", bar, \'one, two\', blob';
// Two alternatives:
//  1. A bare field: starts and ends with a character that is not a quote,
//     comma or whitespace, with any non-quote, non-comma characters between
//     (a single-character field is allowed).
//  2. A quoted field: an opening quote (group 1), then any mix of
//     backslash-escaped characters and characters that are neither a
//     backslash nor the opening quote, then the matching closing quote.
//     A backreference cannot be used inside a character class, so the
//     "not the opening quote" test is done with the lookahead (?!\1).
var fragments = str.match(/[^"',\s](?:[^"',]*[^"',\s])?|(["'])(?:\\.|(?!\1)[^\\])*\1/g);
// Result:
0: text_123 space
1: foo
2: "text, here\", dude"
3: bar
4: 'one, two'
5: blob
If you can control the input to enforce that the string will be enclosed in double-quotes "
and that all elements within the string will be enclosed in single-quotes '
, and that no element can CONTAIN a single-quote, then you can split on , '
. If you CAN'T control the input, then using a regular expression to sort/filter/split the input would be about as useful as using a regular expression to match against xhtml (see: RegEx match open tags except XHTML self-contained tags)
精彩评论