
Boost Spirit and Lex parser problem

I've been struggling to (incrementally) modify example code from the documentation, but even with very little changed I am not getting the behavior I expect. Specifically, the "if" statement fails when (my intent is that) it should pass (there was an "else", but that part of the parser was removed during debugging). The assignment statement works fine. I also had a "while" statement with the same problem as the "if" statement, so I am sure that if I can get help figuring out why one is not working, it should be easy to get the other going. It must be something subtle, because this is almost verbatim what is in one of the examples.

#include <iostream>
#include <fstream>
#include <string>
#include <iomanip>   // for std::setfill / std::setw used in main()

#define BOOST_SPIRIT_DEBUG
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_statement.hpp>
#include <boost/spirit/include/phoenix_container.hpp>

namespace qi  = boost::spirit::qi;
namespace lex = boost::spirit::lex;

inline std::string read_from_file( const char* infile )
{
    std::ifstream instream( infile );
    if( !instream.is_open() )
    {
        std::cerr << "Could not open file: \"" << infile << "\"" << std::endl;
        exit( -1 );
    }
    instream.unsetf( std::ios::skipws );
    return( std::string(
                std::istreambuf_iterator< char >( instream.rdbuf() ),
                std::istreambuf_iterator< char >()
          ) );
}

template< typename Lexer >
struct LangLexer : lex::lexer< Lexer >
{
    LangLexer()
    {
        identifier = "[a-zA-Z][a-zA-Z0-9_]*";
        number = "[-+]?(\\d*\\.)?\\d+([eE][-+]?\\d+)?";

        if_ = "if";
        else_ = "else";

        this->self = lex::token_def<> ( '(' ) | ')' | '{' | '}' | '=' | ';';
        this->self += identifier | number | if_ | else_;

        this->self( "WS" ) = lex::token_def<>( "[ \\t\\n]+" );

    }

    lex::token_def<> if_, else_;
    lex::token_def< std::string > identifier;
    lex::token_def< double > number;
};

template< typename Iterator, typename Lexer >
struct LangGrammar : qi::grammar< Iterator, qi::in_state_skipper< Lexer > >
{
    template< typename TokenDef >
    LangGrammar( const TokenDef& tok ) : LangGrammar::base_type( program )
    {
        using boost::phoenix::val;
        using boost::phoenix::ref;
        using boost::phoenix::size;

        program = +block;
        block = '{' >> *statement >> '}';
        statement = assignment | if_stmt;
        assignment = ( tok.identifier >> '=' >> expression >> ';' );
        if_stmt = ( tok.if_ >> '(' >> expression >> ')' >> block );
        expression = ( tok.identifier[ qi::_val = qi::_1 ] | tok.number[ qi::_val = qi::_1 ] );

        BOOST_SPIRIT_DEBUG_NODE( program );
        BOOST_SPIRIT_DEBUG_NODE( block );
        BOOST_SPIRIT_DEBUG_NODE( statement );
        BOOST_SPIRIT_DEBUG_NODE( assignment );
        BOOST_SPIRIT_DEBUG_NODE( if_stmt );
        BOOST_SPIRIT_DEBUG_NODE( expression );
    }

    qi::rule< Iterator, qi::in_state_skipper< Lexer > > program, block, statement;
    qi::rule< Iterator, qi::in_state_skipper< Lexer > > assignment, if_stmt;

    typedef boost::variant< double, std::string > expression_type;
    qi::rule< Iterator, expression_type(), qi::in_state_skipper< Lexer > > expression;
};

int main( int argc, char** argv )
{
    typedef std::string::iterator base_iterator_type;
    typedef lex::lexertl::token< base_iterator_type, boost::mpl::vector< double, std::string > > token_type;
    typedef lex::lexertl::lexer< token_type > lexer_type;
    typedef LangLexer< lexer_type > LangLexer;
    typedef LangLexer::iterator_type iterator_type; 
    typedef LangGrammar< iterator_type, LangLexer::lexer_def > LangGrammar;

    LangLexer lexer;
    LangGrammar grammar( lexer );

    std::string str( read_from_file( 1 == argc ? "boostLexTest.dat" : argv[1] ) );

    base_iterator_type strBegin = str.begin();
    iterator_type tokenItor = lexer.begin( strBegin, str.end() );
    iterator_type tokenItorEnd = lexer.end(); 

    std::cout << std::setfill( '*' ) << std::setw(20) << '*' << std::endl <<
        str
        << std::endl << std::setfill( '*' ) << std::setw(20) << '*' << std::endl;

    bool result = qi::phrase_parse( tokenItor, tokenItorEnd, grammar, qi::in_state( "WS" )[ lexer.self ] );

    if( result )
    {
        std::cout << "Parsing successful" << std::endl;
    }
    else
    {
        std::cout << "Parsing error" << std::endl;
    }

    return( 0 );
}

Here is the output of running this (the file read into the string is dumped out first in main):

********************
{
    a = 5;
    if( a ){ b = 2; }
}


********************
<program>
  <try>{</try>
  <block>
    <try>{</try>
    <statement>
      <try></try>
      <assignment>
        <try></try>
<expression>
  <try></try>
  <success>;</success>
  <attributes>(5)</attributes>
</expression>
        <success></success>
        <attributes>()</attributes>
      </assignment>
      <success></success>
      <attributes>()</attributes>
    </statement>
    <statement>
      <try></try>
      <assignment>
        <try></try>
        <fail/>
      </assignment>
      <if_stmt>
        <try>
    if(</try>
        <fail/>
      </if_stmt>
      <fail/>
    </statement>
    <fail/>
  </block>
  <fail/>
</program>
Parsing error


The problem is the order in which you added the token definitions to the lexer. Your code

this->self += identifier | number | if_ | else_; 

first adds the identifier token, which matches 'if' (and any other keyword) perfectly well too. If you change that to

this->self += if_ | else_ | identifier | number; 

everything starts to work as it should.

This is nothing specific to Spirit.Lex: any tokenizer uses the order in which the tokens are defined to prioritize matching.
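For reference, here is the lexer constructor from the question with only that one line reordered (a sketch of the fix, nothing else touched):

LangLexer()
{
    identifier = "[a-zA-Z][a-zA-Z0-9_]*";
    number = "[-+]?(\\d*\\.)?\\d+([eE][-+]?\\d+)?";

    if_ = "if";
    else_ = "else";

    this->self = lex::token_def<>( '(' ) | ')' | '{' | '}' | '=' | ';';

    // Keyword tokens are registered before the identifier token, so
    // "if"/"else" take priority when both patterns would match.
    this->self += if_ | else_ | identifier | number;

    this->self( "WS" ) = lex::token_def<>( "[ \\t\\n]+" );
}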


Maybe the mini_c example needs to be modified to use the lexer? That would make it a more complete example of how to use the two together. Most of the Qi samples (as of version 2.4) do not use the lexer at all.

Although it serves to illustrate Qi usage, we tend to use a dedicated lexer for production projects for maintainability reasons (I can offload the lexer to a sub-developer, for example).
