开发者

Tokenizing a string in C?

I'm working on a terminal parser for a calculator written in C. I cannot figure out how to concatenate all of the numbers that are in between operators to put them into an array.

For example, if the input (command line argument) was "4+342", it would ideally be input[] = {"4", "+", "342"}.

Here's my code so far. I'm including <stdio.h>, <stdlib.h>, and <ctype.h>.

typedef char * string;

int main(int argc, char *argv[])
{
  string inputS = argv[1];
  string input[10];
  string temp;
  printf("%s\n", inputS);
  int i;
  int len = strlen(inputS);
  printf("parsed:\n");
  for(i = 0; i < len; inputS++, i++)
  { 
    if(isdigit(*inputS))
    {
      printf("%c",*inputS);
    }
    else
    {
      printf("\n%c\n",*inputS);
    }
  }
  printf("\n");
  retu开发者_Go百科rn 0;
}

If it is run with ./calc 4+5-546, it will output:

4
+
5
-
546

So what's the easiest way to get each line of this into its own array slot?


Try this for size...

#include <stdio.h>
#include <ctype.h>

typedef char * string;

int main(int argc, char *argv[])
{
    string inputS = argv[1];
    string input[50];   /* Up to 50 tokens */
    char   buffer[200];
    int    i;
    int    strnum = 0;
    char  *next = buffer;
    char   c;

    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s expression\n", argv[0]);
        return 1;
    }

    printf("input: <<%s>>\n", inputS);
    printf("parsing:\n");

    while ((c = *inputS++) != '\0')
    { 
        input[strnum++] = next;
        if (isdigit(c))
        {
            printf("Digit: %c\n", c);
            *next++ = c;
            while (isdigit(*inputS))
            {
                c = *inputS++;
                printf("Digit: %c\n", c);
                *next++ = c;
            }
            *next++ = '\0';
        }
        else
        {
            printf("Non-digit: %c\n", c);
            *next++ = c;
            *next++ = '\0';
        }
    }

    printf("parsed:\n");
    for (i = 0; i < strnum; i++)
    {
        printf("%d: <<%s>>\n", i, input[i]);
    }

    return 0;
}

Given the program is called tokenizer and the command:

tokenizer '(3+2)*564/((3+4)*2)'

It gives me the output:

input: <<(3+2)*564/((3+4)*2)>>
parsing:
Non-digit: (
Digit: 3
Non-digit: +
Digit: 2
Non-digit: )
Non-digit: *
Digit: 5
Digit: 6
Digit: 4
Non-digit: /
Non-digit: (
Non-digit: (
Digit: 3
Non-digit: +
Digit: 4
Non-digit: )
Non-digit: *
Digit: 2
Non-digit: )
parsed:
0: <<(>>
1: <<3>>
2: <<+>>
3: <<2>>
4: <<)>>
5: <<*>>
6: <<564>>
7: <</>>
8: <<(>>
9: <<(>>
10: <<3>>
11: <<+>>
12: <<4>>
13: <<)>>
14: <<*>>
15: <<2>>
16: <<)>>


The easiest solution is to use a tool like flex to generate your lexer and let it do the work of breaking the input into tokens (although flex expects its input to come from a file stream, not a character array).

strtok() isn't a good solution for several reasons:

  • It overwrites the input, which you may want to preserve for use later;
  • It's a brute force tool and doesn't handle badly-formed input well;
  • If you use your arithmetic operators as the token separators, then the operators themselves will get clobbered.

The usual solution is to write a state machine (which is basically what flex does for you). Here's a very quick-n-dirty (emphasis on the dirty) example:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>

/**
 * Read from a string specified by source, updating the pointer as we go.
 * We're assuming that token points to a buffer large enough to hold
 * our largest token; ideally, you would want to pass the length of the
 * target buffer and check against it, but I'm leaving it out for brevity.
 * 
 * Tokens are either integers (strings of digits) or operators. 
 *
 * Return 1 if we successfully read a token, 0 if we encountered an unexpected
 * character, and EOF if the next character is the end of the input string.
 */
int getToken(char **source, char *token)
{
  enum {START, DIGIT, ERROR, DONE} state = START;
  size_t i = 0;
  char *operators="+-*/";

  if (**source == 0)  // at end of input
    return EOF;

  while (**source != 0)
  {
    switch(state)
    {
      /**
       * Initial state for this call.
       */
      case START: 
        if (isdigit(**source))
        {
          state = DIGIT;
          token[i++] = *(*source)++; // append the digit to the token
        }
        else if (strchr(operators, **source) != NULL)
        {
          state = DONE;
          token[i++] = *(*source)++; // add the operator to the token
          token[i++] = 0;            // and terminate the string
        }
        else if (isspace(**source))
        {
          (*source)++;  // ignore whitespace
        }
        else
        {
          /**
           * We've read something that isn't a digit, operator, or 
           * whitespace; treating it as an error for now.
           */
          state = ERR;
        }
        break;

      /**
       * We've read at least one digit.
       */
      case DIGIT:
        if (isdigit(**source))
        {
          token[i++] = *(*source)++; // append next digit to token
        }
        else
        {
          /**
           * We've read a non-digit character; terminate the token
           * and signal that we're done. 
           */
          token[i++] = 0;
          state = DONE;
         }
         break;

      case DONE:
        return 1;
        break;

      case ERR:
        return 0;
        break;
    }
  }
  return 1;
}

int main(int argc, char **argv)
{
  char token[20];
  char *input = argv[1];
  for (;;)
  {
    int result = getToken(&input, token);
    if (result == 1)
      printf("%s\n", token);
    else if (result == 0)
    {
      printf("Bad character '%c'; skipping\n", *input);
      input++;
    }
    else if (result == EOF)
    {
      printf("done\n");
      break;
    }
  }
  return 0;
}

Why (*source)++ instead of *source++ or source++? I don't want to update source, I want to update what source points to, so I have to dereference the pointer before the ++ is applied. The expression *(*source)++ basically translates to "give me the value of the character that the expression *source is pointing to, then update the value of *source".


--> MAN STRCAT

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main (int argc, const char **argv)
{
    char *toto_str = "Toto";
    char *is_str = "Is";
    char *awesome_str = "Awesome";
    char *final_str;
    size_t i;

    i = strlen(toto_str);
    i += strlen(is_str);
    i += strlen(awesome_str);

    final_str = malloc((i * sizeof(char)) + 1);
    strcat(final_str, toto_str);
    strcat(final_str, is_str);
    strcat(final_str, awesome_str);

    printf("%s", final_str);
    free(final_str);

    return 0;
}


strsep is a good choice here - grab the token and then decide what you want to do with it...

char *string = "(3+(5+6)/8)"

char token; while ((token = strsep(&string, "(+/) "))) { // Store token... if it's not a ( or ) or space }

Here - token will be processed similar to a Split() in Java/C#. This does mutilate the string while processing it - however, with the correct delimiters - things will be good :)


Sounds like you want to look at the standard strtok function.


this will give you an idea:

#include <stdio.h>
#include <string.h>
main(int argc, char *argv[])
{
    printf("\nargv[1]: %s",argv[1]);
    char *p;
    p = strtok(argv[1],"+");
    printf("\np: %s", p);
    p = strtok(NULL,"+");
    printf("\np: %s", p);
    p = strtok(NULL,"+");
    printf("\np: %s", p);
    printf("\n");
}

This is just a sample code to demonstrate how it is done using addition case only.
Get the main idea of this code and apply it in your code.
Example output for this:

./a.out 5+3+9

argv[1]: 5+3+9
p: 5
p: 3
p: 9

Again, I am only demonstrating the "+" sign. You may want to check for p until it is NULL, then proceed with the next operation, say subtraction, then multiplication, then division.

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜