error in coding a lexer in c
#include<stdio.h>
#include<ctype.h>
#include<string.h>
/* this is a lexer which recognizes constants , variables ,symbols, identifiers , functions , comments and also header files . It stores the lexemes in 3 different files . One file contains all the headers and the comments . Another file will contain all the variables , another will contain all the symbols. */
int main()
{
int i=0,j,k,count=0;
char a,b[100],c[10000],d[100];
memset ( d, 0, 100 );
j=30;
FILE *fp1,*fp2;
fp1=fopen("source.txt","r"); //the source file is opened in read only mode which will passed through the lexer
fp2=fopen("lext.txt","w");
//now lets remove all the white spaces and store the rest of the words in a file
if(fp1==NULL)
{
perror("failed to open source.txt");
//return EXIT_FAILURE;
}
i=0;
k=0;
while(!feof(fp1))
{
a=fgetc(fp1);
if(a!=' '&&a!='\n')
{
if (!isalpha(a))
{
switch(a)
{
case '+':{fprintf(fp2,"+ ----> PLUS \n");
i=0;break;}
case '-':{fprintf(fp2,"- ---> MINUS \n");
i=0;break;}
case '*':{fprintf(fp2, "* --->MULT \n");
i=0;break;}
case '/':{fprintf(fp2, "/ --->DIV \n");
i=0;break;}
//case '+=':fprintf(fp2, "%.20s\n", "ADD_ASSIGN");
//case '-=':fprintf(fp2, "%.20s\n", "SUB_ASSIGN");
case '=':{fprintf(fp2, "= ---> ASSIGN \n");
i=0;break;}
case '%':{fprintf(fp2, "% ---> MOD \n");
i=0;break;}
case '<':{fprintf(fp2, "< ---> LESSER_THAN \n");
i=0;break;}
case '>':{fprintf(fp2, "> --> GREATER_THAN \n");
i=0;break;}
//case '++':fprintf(fp2, "%.20s\n", "INCREMENT");
//case '--':fprintf(fp2, "%.20s\n", "DECREMENT");
//case '==':fprintf(fp2, "%.20s\n", "ASSIGNMENT");
case ';':{fprintf(fp2, "; --->SEMI_COLUMN \n");
i=0;break;}
case ':':{fprintf(fp2, ": --->COLUMN \n");
i=0;break;}
case '(':{fprintf(fp2, "( --->LPAR \n");
i=0;break;}
case ')':{fprintf(fp2, ") --->RPAR \n");
i=0;break;}
case '{':{fprintf(fp2, "{ --->LBRACE \n");
i=0;break;}
case '}':{fprintf(fp2, "} ---> RBRACE \n");
i=0;break;}
}
}
else
{
d[i]=a;
//printf("%c\n",d[i]);
i=i+1;
}
//}
/* we can make the lexer more complex by including even more depths of checks for the symbols*/
}
else
{
d[i+1]='\0';
printf("\n");
if((strcmp(d,"if ")==0)){fprintf(fp2,"if ----> IDENTIFIER \n");
//printf("%s \n",d);
memset ( d, 0, 100 );
//printf("%s \n",d);
count=count+1;}
else if(strcmp(d,"then")==0){fprintf(fp2,"then ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"else")==0){fprintf(fp2,"else ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"switch")==0){fprintf(fp2,"switch ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"printf")==0){fprintf(fp2,"prtintf ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"scanf")==0){fprintf(fp2,"scanf ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"NULL")==0){fpr开发者_运维知识库intf(fp2,"NULL ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"int")==0){fprintf(fp2,"INT ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"char")==0){fprintf(fp2,"char ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"float")==0){fprintf(fp2,"float ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"long")==0){fprintf(fp2,"long ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"double")==0){fprintf(fp2,"double ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"const")==0){fprintf(fp2,"const ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"continue")==0)fprintf(fp2,"continue ----> IDENTIFIER \n");
else if(strcmp(d,"size of")==0){fprintf(fp2,"size of ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"register")==0){fprintf(fp2,"register ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"short")==0){fprintf(fp2,"short ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"auto")==0){fprintf(fp2,"auto ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"while")==0){fprintf(fp2,"while ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"do")==0){fprintf(fp2,"do ----> IDENTIFIER \n");
count=count+1;}
else if(strcmp(d,"case")==0){fprintf(fp2,"case ----> IDENTIFIER \n");
count=count+1;}
else if (isdigit(d[i]))
{
fprintf(fp2,"%s ---->NUMBER",d);
}
else if (isalpha(a))
{
fprintf(fp2,"%s ----> Variable",d);
//printf("%s",d);
// memset ( d, 0, 100 );}
//fprintf(fp2, "s\n", b);
i=0;
k=k+1;
continue;
}
i=i+1;
k=k+1;
}
fclose(fp1);
fclose(fp2);
printf("%d",count);
return 0;
}
In this code , my source.txt has if (a+b) stored . But only ( , + and ) is getting written into lext.txt and not the identifier if or the variable a and b . Any particular reason why?
There are quite a few problems:
Once you find a space or a newline you try and compare what is in string d
using a series of strcmp
. But you are resetting the value of i in the body of else if (isalpha(a))
which will never execute because a
is space or newline. You should unconditionally set the value of i
to 0
right after
d[i+1]='\0';
Spaces in the string matter, so
if((strcmp(d,"if ")==0)) // d will never have a space as you never stuff it with one.
should be
if((strcmp(d,"if")==0))
The comparison for the "if" keyword is comparing against "if "
(with a space), but the code does not copy the space into the buffer d
. Your best bet is to step through it with a debugger and see what is happening.
/*
ref: http://msdn2.microsoft.com/en-us/library/y39145bk(vs.80).aspx
C language Tokens
token:
keyword
identifier
constant
string-literal
operator
punctuator
operator: one of
[ ] ( ) . –> ++ & * + – ~ ! sizeof/ % << >> <> <= >= == != ^ | && !!? := *= /= %= += –= <<= >>= &= ^= |=, # ##
assignment-operator: one of
= *= /= %= += –= <<= >>= &= ^= |=
punctuator: one of
[ ] ( ) { } * , : = ; ... #
* This is a generalized program working for any kind of input( input C program).
* It fails to recognise stdio.h in "#include<stdio.h>" as one token.
* Instead it identifies stdio as Identifier, . as dot operator , h as Identifier.
* this is becoz i considered .(dot) as a separator b/w tokens of structure variables
ex:
struct book
{
int price;
}b1;
b1.price; //here dot is used as seperator b/w b1 token and price token.........
This program is not exactly correct.......
It neither parses the comments nor the header files.....
At first time if u get an error saying "Abnormal program termination"
simply reduce the TableSize macro value. Becoz TC cannot allocated more global memory...
*/
#include<stdio.h>
#include<string.h>
#include<conio.h>
//===============================================================================================
#define true 1
#define false 0
//simple macros for moving back and front in the file
#define MoveFront(units) fseek(fp,+(units),SEEK_CUR)
#define MoveBack(units) fseek(fp,-(units),SEEK_CUR)
//Token Table size
#define TableSize 9144
//===============================================================================================
//used to open the file
int FileOpen(const char * path);
//Core function which splits the given program in to tokens....
int CreateTokens();
//forms the token string based on the starting(start) of file pointer
//position and ending(end) of file pointer position of the token
void GetToken(int start,int end);
//recognises whether the formed token as keyword,identifier, numeric constant,.......
void IdentifyTokenType(int index);
int EOFReached();
//===============================================================================================
//Keywords are arranged in such a way that most frequently used onces come first.....
//So that the comparison b/w retrieved tokens and keywords become more efficient....
const char *keywords[]=
{
"int", "char", "double", "float",
"if", "else", "for", "while",
"return", "switch", "case", "break",
"do", "default", "void", "struct",
"long", "const", "static", "union",
"enum", "register", "short", "unsigned",
"continue", "goto", "sizeof", "signed",
"auto", "volatile", "typedef", "extern",
};
//===============================================================================================
const char *preprocess[]=
{
"#define", "#include", // here rest of the preprocessors can be included too.............
};
//===============================================================================================
struct TokenEntry
{
int start, //start contains starting of token's file pointer position....
end; //end contains one more than the ending of token's file pointer position....
char * type; //Type of the token ,whether it is identifier ,keyword,numeric constant....
}tokenTable[TableSize]; //TableSize is the TC limit of TokenArray :(
//===============================================================================================
struct DelimEntry
{
char * delim; //delimiters .... "[","{"....
char * type; //delimiter's name ................"LSquare","LBrace"....
};
//===============================================================================================
//These are the delimiters in C language........(Not all of them are called as Delimiters...)
struct DelimEntry const DelimTable[]=
{
//Single Character Delimiters //Set 1
{"[","LSquare"}, //0
{"]","RSquare"}, //1
{"(","LParen"},
{")","RParen"},
{"{","LBrace"},
{"}","RBrace"},
{",","Comma"},
{";","SemiColon"},
{":","Colon"},
{"?","QuestionMark"},
{"~","BitwiseNOT"},
{".","Dot"}, //11
//Singles in triple character delimiters.... //Set 2
//When ever u encounter a Set 2 character ....
//U cannot confirm it as "Singles in triple character delimiters"
//U need to check for its next character also.....
//similarly When ever u encounter a Set 3 character ....
//U need to check for its next character also.....
//
{"<","LessThan"}, //12
{">","GreaterThan"},
{"&","BitwiseAND"},
{"|","BitwiseOR"},
{"^","XOR"},
{"=","Assignment"},
{"!","Not"},
{"%","Remainder"},
{"-","Minus"},
{"+","Plus"},
{"*","Multiply"},
{"/","DividedBy"}, //23
//Doubles in triple character delimiters.... //Set 3
{"<<","LeftShift"}, //24
{"<=","LessThanOrEqual"},
{">>","RightShift"},
{">=","GreaterThanOrEqual"},
{"&&","LogicalAND"},
{"&=","BitwiseANDEqual"},
{"||","LogicalOR"},
{"|=","BitwiseOREqual"},
{"^=","XOREqual"},
{"==","LogicalEqual"},
{"!=","LogicalNotEqual"},
{"%=","RemainderEquals"},
{"-=","MinusEquals"},
{"->","PointerArrow"},
{"--","DecrementOperator"},
{"++","IncrementOperator"},
{"+=","PlusEquals"},
{"*=","MultiplyEquals"},
{"/=","DividedByEquals"}, //42
//Triples in triple character delimiters.... //Set 4
{"<<=","LeftShiftEquals"}, //43
{">>=","RightShiftEquals"}, //44
};
//===============================================================================================
FILE * fp = NULL;
int index; //Variable used to iterate over the tokenTable Array..........
char buf[200]; //temporary variable to hold the token string........
//===============================================================================================
main(int argc , char *argv[] )
{
int i;
if(!FileOpen(argv[1])){
printf("\nUnable to Open the File : %s",argv[1]);
return;
}
if(!CreateTokens()){
//It is the problem of TC ..... :(
printf("\nUnable to Create Tokens - May be the given program contains tokens more than the maximum token table size");
return;
}
//Printing the Created Tokens..........
printf("\n%-5s %-16s %-18s %-8s %-8s\n","No","Token","Token Type","Begin","End");
printf("============================================================");
for(i=0 ; i<index;i++){
GetToken(tokenTable[i].start,tokenTable[i].end);
IdentifyTokenType(i);
printf("\n%-5d %-16s %-18s %-6d %-6d",i,buf,tokenTable[i].type,tokenTable[i].start,tokenTable[i].end);
}
}
//===============================================================================================
void IdentifyTokenType(int index)
{
int no,i;
if(strcmp(tokenTable[index].type ,"Unknown")==0)
{
//determining keywords present in tokens
no = sizeof(keywords)/sizeof(int);
for(i = 0;i<no;i++)
if(strcmp(buf,keywords[i]) == 0){
tokenTable[index].type = "Keyword";
return;
}
//determining identifiers present in tokens
if((buf[0]>='a'&&buf[0]<='z')|| (buf[0]>='A'&&buf[0]<='Z')||buf[0] == '_'){
tokenTable[index].type = "Identifier";
return;
}
//determining Preprocessor directives..
no = sizeof(preprocess)/sizeof(int);
for(i = 0;i<no;i++)
if(strcmp(buf,preprocess[i]) == 0){
tokenTable[index].type = "Preprocessor";
return;
}
//Determining String Literals
if(buf[0] == '"' && buf[strlen(buf)-1] == '"'){
tokenTable[index].type = "String Literal";
return;
}
//Determining Char Literals
//Determining Numeric constants
for(i=0;buf[i]!='\0';i++)
if(!(buf[i]>='0'&&buf[i]<='9'))return;
tokenTable[index].type = "Numeric Constant";
return;
}
}
//===============================================================================================
void GetToken(int start,int end)
{
int i=0;
fseek(fp,start,SEEK_SET);
while(i<end-start)
buf[i++] = fgetc(fp);
buf[i] = '\0';
//Trim trailing newline chars....
for(i--;i>=0;i--)
if(buf[i] == '\n')buf[i] = '\0';
}
//===============================================================================================
int EOFReached()
{
return feof(fp) != 0;
}
//===============================================================================================
//Implements the state machine for splitting the given program(input) in to tokens...............
int CreateTokens()
{
int i=0,j=0,k=0;
char c[4]={'\0','\0','\0','\0'}; //Array holding temporary characters.....
do{
//state1
c[0] = fgetc(fp);
//-------- //Path A
if(c[0] == ' ' || c[0] == '\t' || c[0] == '\n' || EOFReached()) goto End;
/*
//-------- Skipping the comments.....
if(c[0] == '/')
{
c[1] = fgetc(fp);
//Skipping // type of comments..............
if(c[1] == '/')
{
do
c[2] = fgetc(fp);
while(c[2]!='\n'&&!EOFReached());
goto End;
}
//Skipping /* type of comments..............
else if(c[1] == '*')
{
do{
c[1] = fgetc(fp);
c[2] = fgetc(fp);
ungetc(c[2],fp);
}while((c[1] != '*' || c[2] != '/')&&!EOFReached());
goto End;
}
MoveBack(1);
}
*/
MoveBack(1);
tokenTable[index].start = ftell(fp);
MoveFront(1);
//String literals
if(c[0] == '"'){
do{
c[1] = fgetc(fp);
if(c[1] =='\\') { fgetc(fp); c[1] = fgetc(fp);} //skip \" character in b/w a string literal
}while(c[1] != '"'&&!EOFReached());
tokenTable[index].type = "Unknown";
tokenTable[index++].end = ftell(fp);
goto End;
}
//--------
for(i=0;i<12;i++)
{
if(strcmp(DelimTable[i].delim,c) == 0) //Path B
{
tokenTable[index].type = DelimTable[i].type;
tokenTable[index++].end = ftell(fp);
goto End;
}
}
//--------
for(i=12;i<24;i++)
{
//Checking for single char in triples
if(strcmp(DelimTable[i].delim,c) == 0) //Path C
{
c[1] = fgetc(fp); //State 2
for(j=24;j<43&&!EOFReached();j++)
{
//Checking for double char in triples
if(strcmp(DelimTable[j].delim,c) == 0) //Path E
{
c[2] = fgetc(fp); //State 3
for(k=43;k<45&& !EOFReached();k++)
{ //Checking for triplets in triples
if(strcmp(DelimTable[k].delim,c) == 0) //Path G
{
tokenTable[index].type = DelimTable[k].type;
tokenTable[index++].end = ftell(fp);
goto End;
}
}
//Path F
if(!EOFReached())MoveBack(1);
tokenTable[index].type = DelimTable[j].type;
tokenTable[index++].end = ftell(fp);
goto End;
}
}
//Path D
if(!EOFReached())MoveBack(1);
tokenTable[index].type = DelimTable[i].type;
tokenTable[index++].end = ftell(fp);
goto End;
}
}
//---------
do{
//Path H
c[0] = fgetc(fp); //State 4
//Path I
//Checking for White Spaces
if(c[0] == ' ' || c[0] == '\t' || c[0] == '\n' ||EOFReached())
{
if(!EOFReached()) MoveBack(1);
tokenTable[index].type = "Unknown";
tokenTable[index++].end = ftell(fp);
goto End;
}
//Checking for Single char Delims //Checking for Single char Delims in triplets
for(i=0;i<24;i++)
if(strcmp(DelimTable[i].delim,c) == 0)
{
MoveBack(1);
tokenTable[index].type = "Unknown";
tokenTable[index++].end = ftell(fp);
goto End;
}
}while(!feof(fp));
End:
c[0] = c[1] = c[2] = c[3] = '\0';
}while((!feof(fp))&&index<TableSize);
//Tokens May(Not) be Ready
return index < TableSize;
}
//===============================================================================================
int FileOpen(const char *path)
{
fp = fopen(path,"r");
if(fp == NULL) return false;
return true;
}
//===============================================================================================
Hope this helps you........
精彩评论