what would be a quick way to do nested splitting of brackets in python?

2023-01-22 07:51 问答作者：

I have a file of the following format:

ID1 { some text }
ID2 { some text }

The entries don't have to come one per line, so we can also have:

ID1 { some [crlf]
text [crlf]
}

ID2 [crlf] { some t [crlf]
ex [crlf]
t}

and so on, meaning some text can be more than one line and there could be a CRLF immediately following ID. The main invariant is that all IDs are enclosed by { }. The thing is that some text itself could have { and } in it.

What would be a quick way to take such a file and separate it into a list of strings, each being ID { text }, while taking into account nested brackets?

Taking into account some error analysis, in case brackets are not balanced, would be great.

Using pyparsing you can knock this out in about 6 lines, and then get on with your other work. Here are two variations on a solution, depending on how you want the parse results structured:

from pyparsing import Word, alphas, alphanums, dictOf, nestedExpr, originalTextFor

# Sample input: two "ID { text }" items, the second with nested braces.
data = """ID1 { some text } ID2 { some {with some more text nested in braces} text }"""

# identifier starts with any alpha, followed by any alpha, num, or '_'
ident = Word(alphas, alphanums + "_")

# Solution 1
# list of items is a dict of pairs of idents and nested {}'s
# - returns {}'s expressions as nested structures
itemlist = dictOf(ident, nestedExpr("{", "}"))
items = itemlist.parseString(data)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(items.dump())

"""
prints:
[['ID1', ['some', 'text']], ['ID2', ['some', ['with', 'some', 'more', ...
- ID1: ['some', 'text']
- ID2: ['some', ['with', 'some', 'more', 'text', 'nested', 'in', 'braces'], 'text']
"""

# Solution 2
# list of items is a dict of pairs of idents and nested {}'s
# - returns {}'s expressions as strings of text extracted from the
# original input string
itemlist = dictOf(ident, originalTextFor(nestedExpr("{", "}")))
items = itemlist.parseString(data)
print(items.dump())

"""
prints:
[['ID1', '{ some text }'], ['ID2', '{ some {with some more text nested in ...
- ID1: { some text }
- ID2: { some {with some more text nested in braces} text }
"""

This is a simple question of "how do I write a recursive descent parser that matches brackets?"

Given this grammar:

STMT_LIST := STMT+
STMT := ID '{' DATA '}'
DATA := TEXT | STMT
ID := [a-z0-9]+
TEXT := [^}]*

A parser might look like:

import sys
import re

def parse(data):
    """Parse *data* as a sequence of statements and print each one.

    Implements the STMT_LIST rule: statements are read one after another
    until the input is exhausted, and every statement is printed as the
    ``repr`` of an ``(id, clause)`` tuple.

    Raises:
        ValueError: propagated from ``parse_statement`` when the input does
            not match the grammar.
    """
    # Input that begins with blanks or a newline would otherwise fail with
    # "No ID found", because only trailing whitespace is consumed after
    # each statement.
    data = consume_whitespace(data)
    while data:
        data, statement_id, clause = parse_statement(data)
        # Call form of print works on both Python 2 and Python 3.
        print(repr((statement_id, clause)))

def consume_whitespace(data):
    """Return *data* with any leading whitespace removed."""
    remainder = data.lstrip()
    return remainder

def parse_statement(data):
    """Parse one ``ID { ... }`` statement from the start of *data*.

    Returns:
        A tuple ``(rest, statement_id, clause)`` where ``rest`` is the
        unparsed remainder with leading whitespace removed, ``statement_id``
        is the matched identifier and ``clause`` is the list produced by
        ``parse_clause``.

    Raises:
        ValueError: if *data* does not start with an alphanumeric ID, or if
            ``parse_clause`` rejects what follows.
    """
    m = re.match(r'[a-zA-Z0-9]+', data)
    if not m:
        # Call-style raise is valid on both Python 2 and Python 3.
        raise ValueError("No ID found")
    statement_id = m.group(0)
    # Whitespace (including newlines) may separate the ID from its '{'.
    data = consume_whitespace(data[m.end():])
    data, clause = parse_clause(data)
    return consume_whitespace(data), statement_id, clause

def parse_clause(data):
    """Parse a brace-delimited clause from the start of *data*.

    *data* must begin with ``{``. Text between braces is kept verbatim
    (whitespace included); each nested ``{ ... }`` becomes a nested list.

    Returns:
        A tuple ``(rest, clause)``: ``rest`` is whatever follows the matching
        ``}``, and ``clause`` is a list alternating text pieces (str) and
        nested clauses (list), always ending with a text piece.

    Raises:
        ValueError: if *data* does not start with ``{`` or the closing ``}``
            is missing (unbalanced braces).
    """
    if not data.startswith('{'):
        raise ValueError("No { found")
    clause = []
    data = data[1:]
    while True:
        closebrace = data.find('}')
        if closebrace < 0:
            # Unbalanced input: report it explicitly rather than letting
            # str.index fail with "substring not found".
            raise ValueError("No } found")
        openbrace = data.find('{')
        if openbrace < 0 or openbrace > closebrace:
            # The next brace closes this clause.
            break
        # A nested clause opens first: keep the text before it, then recurse
        # and continue scanning after the nested clause.
        clause.append(data[:openbrace])
        data, subclause = parse_clause(data[openbrace:])
        clause.append(subclause)
    clause.append(data[:closebrace])
    return data[closebrace + 1:], clause

# Demo: one statement with a nested clause, then two statements back to back.
# Each call prints one (id, clause) tuple per statement it finds.
parse("ID { foo { bar } }")
parse("ID { foo { bar } } baz { tee fdsa { fdsa } }")

This is a nasty parser, to be honest. If you were to structure it more nicely, you would end up with a proper token stream from a lexer and pass that to the actual parser. As it is, the 'token stream' is just a string that we strip info off the start of.

I would recommend looking at pyparsing if you wanted anything more complicated.

regex is out of the question, obviously. Have you looked at pyparsing?

[EDIT]

OTOH this might work:

from functools import wraps


def transition(method):
    """Decorate a state method so the command it returns picks the next state.

    The wrapped method is called with the state and any extra arguments and
    must return a callable. That callable is invoked with the state, and its
    result is assigned to ``state.__class__``. The wrapper returns None.
    """
    @wraps(method)
    def wrapper(state, *args, **kwargs):
        next_class = method(state, *args, **kwargs)(state)
        state.__class__ = next_class
    return wrapper


class State(object):
    """Base class for objects whose class is swapped to change state.

    Every instance gets its own ``_identities`` list, which ``pushing`` and
    ``popped`` use as a stack of saved state classes.
    """

    def __new__(cls):
        # The list is attached in __new__ rather than __init__ -- presumably
        # so it exists no matter what __init__ a subclass defines.
        instance = object.__new__(cls)
        instance._identities = []
        return instance

def unchanged(state):
    """Command that keeps *state* in its current class."""
    current_class = state.__class__
    return current_class

def shifting(identity):
    """Build a command that always selects *identity* as the next class."""
    def shift_to(state):
        # The current state is ignored; the target class is fixed.
        return identity
    return shift_to

def pushing(identity, afterwards=None):
    """Build a command that saves a return class and selects *identity*.

    When the command runs, it appends *afterwards* to the state's
    ``_identities`` stack, or the state's current class if *afterwards* is
    falsy. It then returns *identity* as the next class.
    """
    def push_and_shift(state):
        resume_class = afterwards if afterwards else state.__class__
        state._identities.append(resume_class)
        return identity
    return push_and_shift

def popped(state):
    """Command that restores the most recently saved class from the stack."""
    saved_classes = state._identities
    return saved_classes.pop()


##############################################################################


import re
# tokenize(source) yields one regex match per token. ``match.lastgroup`` names
# the token type (word, braceleft, braceright, eoi, error) and
# ``match.group()`` is its text. Whitespace between tokens is skipped.
#
# ``eoi`` uses \Z, which matches only at the very end of the input. The
# earlier ``$`` combined with re.MULTILINE also matched before every newline,
# so multi-line input produced an "end of input" token at each line end.
tokenize = re.compile(flags=re.VERBOSE, pattern=r"""
    (?P<word>       \w+ ) |
    (?P<braceleft>  {   ) |
    (?P<braceright> }   ) |
    (?P<eoi>        \Z  ) |
    (?P<error>      \S  ) # catch all (except white space)
""").finditer

def parse(parser, source, builder):
    """Feed every token of *source* to the handler of the same name on *parser*.

    For each token, the method named after the token type is looked up on
    *parser* and called with the token text and *builder*. The lookup is
    repeated for every token because a handler may change the parser's class.
    """
    for match in tokenize(source):
        handler = getattr(parser, match.lastgroup)
        handler(match.group(), builder)


class ParsingState(State):
    """Common base of the parser states; rejects every token by default.

    Subclasses override the handlers for the token types they accept. Any
    other token type ends in a ValueError naming the current state.
    """

    def eoi(self, token, *args):
        # End of input is an error unless a subclass says otherwise.
        state_name = self.__class__.__name__
        raise ValueError('premature end of input in parsing state %s' %
            state_name
        )

    def error(self, token, *args):
        # The tokenizer's catch-all token is never acceptable.
        details = (self.__class__.__name__, token)
        raise ValueError(
            'parsing state %s does not understand token %s' % details
        )

    def __getattr__(self, name):
        # Only reached for attributes not found normally, i.e. token types
        # this state has no handler for: return a handler that rejects them.
        def raiser(token, *args):
            details = (self.__class__.__name__, token, name)
            raise ValueError(
                'parsing state %s does not understand token "%s" of type %s' %
                details
            )
        return raiser


class Id(ParsingState):
    """State between entries: expects an identifier or the end of input."""

    @transition
    def eoi(self, token, builder):
        # Input may end cleanly only here, between entries.
        next_state = shifting(DoneParsing)
        return next_state

    @transition
    def word(self, token, builder):
        # A word in this position starts a new entry.
        builder.add_id(token)
        next_state = shifting(BeginContent)
        return next_state

class BeginContent(ParsingState):
    """State right after an identifier: only an opening brace is accepted."""

    @transition
    def braceleft(self, token, builder):
        # The brace itself is not recorded; it only opens the content.
        next_state = shifting(Content)
        return next_state

class Content(ParsingState):
    """State inside an entry's outermost braces."""

    @transition
    def braceright(self, token, builder):
        # The outermost closing brace ends the entry and is not recorded.
        next_state = shifting(Id)
        return next_state

    @transition
    def braceleft(self, token, builder):
        # A nested opening brace is part of the text; the current class is
        # saved on the stack so the matching brace can restore it.
        builder.add_text(token)
        next_state = pushing(PushedContent)
        return next_state

    @transition
    def word(self, token, builder):
        builder.add_text(token)
        return unchanged

class PushedContent(Content):
    """State inside nested braces; behaves like Content except on '}'."""

    @transition
    def braceright(self, token, builder):
        # A nested closing brace is kept as text, then the class saved when
        # the matching '{' was seen is restored.
        builder.add_text(token)
        return popped

class DoneParsing(ParsingState):
    """Final state; defines no handlers, so any further token is rejected."""

##############################################################################


class Entry(object):
    """One parsed item: an identifier plus the text tokens inside its braces."""

    def __init__(self, idname):
        self.text = []
        self.idname = idname

    def __str__(self):
        # Tokens are re-joined with single spaces; the original spacing of
        # the input is not preserved.
        body = ' '.join(self.text)
        return '%s { %s }' % (self.idname, body)

class Builder(object):
    """Accumulates entries as the parser reports identifiers and text."""

    def __init__(self):
        self.entries = []

    def add_id(self, id_token):
        # Each identifier starts a new entry.
        new_entry = Entry(id_token)
        self.entries.append(new_entry)

    def add_text(self, text_token):
        # Text always belongs to the most recently started entry.
        current = self.entries[-1]
        current.text.append(text_token)


##############################################################################


if __name__ == '__main__':

    # Demo input: two entries, the second spanning several lines and
    # containing nested braces.
    file_content = """
    id1 { some text } id2 {
    some { text }
    }
    """

    builder = Builder()
    # Parsing starts in the Id state, i.e. expecting an identifier.
    parse(Id(), file_content, builder)
    for entry in builder.entries:
        # Call form of print works on both Python 2 and Python 3.
        print(entry)

Here's the brute force method, with error detection included or indicated:

# parsebrackets.py
def parse_brackets(data, allow_empty_key=True):
    """Split *data* into ``(key, text)`` pairs, one per top-level ``{...}``.

    This is a generator. ``key`` is the stripped text before a top-level
    ``{``; ``text`` is everything from that brace through its matching ``}``,
    nested braces included.

    Args:
        data: the string to split.
        allow_empty_key: when False, a top-level group with no key text in
            front of it raises ValueError. The default (True) yields an
            empty key.

    Raises:
        ValueError: on unbalanced braces (checked for the whole input before
            the first pair is yielded), on an empty key when
            ``allow_empty_key`` is False, or on non-blank text after the last
            ``}`` (detected after the last pair has been yielded).
    """
    # step 1: find the 0-nesting-level { and }
    lpos = []
    rpos = []
    nest = 0
    for i, c in enumerate(data):
        if c == '{':
            if nest == 0:
                lpos.append(i)
            nest += 1
        elif c == '}':
            nest -= 1
            if nest < 0:
                raise ValueError('too many } at offset %d' % i)
            if nest == 0:
                rpos.append(i)
    if nest > 0:
        raise ValueError('too many { in data')
    # step 2: extract the pieces
    prev = -1  # offset of the previous top-level '}', -1 before the first
    for start, end in zip(lpos, rpos):
        key = data[prev + 1:start].strip()
        if not key and not allow_empty_key:
            raise ValueError('empty key before { at offset %d' % start)
        text = data[start:end + 1]
        prev = end
        yield key, text
    if data[prev + 1:].strip():
        raise ValueError('non-blank text after last }')

Output:

>>> from parsebrackets import parse_brackets as pb
>>> for k, t in pb(' foo   {bar {zot\n}} guff {qwerty}'):
...    print repr(k), repr(t)
...
'foo' '{bar {zot\n}}'
'guff' '{qwerty}'
>>>

继续阅读：python

what would be a quick way to do nested splitting of brackets in python?

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？