SLang Scanner
A. First Edition
This is the first edition of my SLang Scanner for Comp442 assignment 1.
In this assignment, you have to design a lexical analyzer for tokens of a programming language SLANG (our Source
LANGuage). In SLANG, the following token types exist
(l represents any letter, d any digit, and c any character):
identifier, specified by l( l + d + _(l + d))*
numerical constant, specified by dd*
character constant, specified by 'c'
//THE FOLLOWING ARE SYMBOL TYPE 18
"(", ")", ";", "+", "-", "*",
"/", ":=", "<", ">", "=", "<=",
">=", "!=", "[", "]", ",",
":",
//THE FOLLOWING ARE RESERVED TYPE 15
"begin", "end", "program", "variables","integer", "array", "char",
"module", "if", "then", "else", "loop", "exit", "read", "write"
E.Further improvement
F.File listing
1. scanner.h
2. errorNo.h
3. scanner.cpp
4. slang.cpp (main)
file name: scanner.h
///////////////////////////////////////////////////////////////////////////////////////////
//Program: SLang Scanner
//Author: Qingzhe Huang
//Date: Jan. 18, 2004
//FileName: scanner.h
//Features:
// 1. I want to improve efficiency of scanning, so I used table-driven method.
// 2. I used an enum---CharType---to classify every ASCII character, where space, tab,
// end of line and end of file are all considered to be white space.
// 3. All legal token is represented by enum TokenType.
// 4. I defined a huge amount of TokenState which is basically the state of a DFA. As
// I don't want to search reserved keyword with linear search or whatever, I have
// many states for the reserved words.
// 5. I deliberately make the sequence of first 38 TokenState elements exactly same as
// all that of TokenType, so that each final state of DFA has a 1-1 correspondence with
// type of token.
// 6. I defined a struct of Token which may be used in future parser.
// 7. I defined an errorNo variable to represent various errors. And a series error string
// for displaying information.
// 8. When class Scanner is created, it will initialize the big "state-charType" table.
// 9. When readFromFile is called, it will first read one char in advance.
// 10. When an error is encountered, the caller of Scanner should understand that no further
// char is read in. So, stop calling "nextToken()". This is a bit controversial, and I
// plan to change it in the next version.
////////////////////////////////////////////////////////////////////////////////////////////
#ifndef SCANNER_H
#define SCANNER_H
#include <cstdio>
#include <iostream>
#include "errorNo.h"
using namespace std;
// ErrorCode is a complete enum type defined in errorNo.h, which this header
// includes. Standard C++ cannot forward declare an unscoped enum without a
// fixed underlying type, and "extern" does not apply to type declarations, so
// no "extern enum ErrorCode;" declaration is used here.

// Number of enumerators in TokenState (rows of the transition table).
const int TokenStateCount=138;
// Number of enumerators in CharType (columns of the transition table).
const int CharTypeCount=72;
// Longest token text that Token::name can hold, not counting the final '\0'.
const int MaxTokenLength=255;
// Character classes. The numeric value of each enumerator is used as a column
// index into the transition table, so the order below must not change.
enum CharType
{
    // 26 lower-case letters, one class per letter
    SMALLA = 0, SMALLB, SMALLC, SMALLD, SMALLE, SMALLF, SMALLG,
    SMALLH, SMALLI, SMALLJ, SMALLK, SMALLL, SMALLM, SMALLN,
    SMALLO, SMALLP, SMALLQ, SMALLR, SMALLS, SMALLT, SMALLU,
    SMALLV, SMALLW, SMALLX, SMALLY, SMALLZ,

    // 26 upper-case letters, one class per letter
    BIGA, BIGB, BIGC, BIGD, BIGE, BIGF, BIGG,
    BIGH, BIGI, BIGJ, BIGK, BIGL, BIGM, BIGN,
    BIGO, BIGP, BIGQ, BIGR, BIGS, BIGT, BIGU,
    BIGV, BIGW, BIGX, BIGY, BIGZ,

    // a single class shared by all digits
    DIGIT,

    // 16 punctuation classes
    QUOTE,
    OPENPAR, CLOSEPAR,
    SEMICOLON,
    PLUS, MINUS, TIMES, SLASH,
    COLON, EQUAL, SMALLER, GREATER, EXCLAIM,
    OPENBRACKET, CLOSEBRACKET,
    COMMA,

    // space, tab and end of line share one class
    WHITESPACE,

    // the underscore is handled separately from the other symbols
    UNDERSCORE,

    // every other character is illegal
    ILLIGAL
};
// The 38 token categories. They are listed in exactly the same order as the
// first 38 enumerators of TokenState, so a final state value can be converted
// directly into a TokenType.
enum TokenType
{
    // 5 general categories
    IDTYPE = 0,
    NUMBERTYPE,
    CHARCONSTTYPE,
    COMMENTTYPE,
    ERRORTYPE,

    // 18 symbols
    OPENPARTYPE, CLOSEPARTYPE,
    SEMICOLONTYPE,
    PLUSTYPE, MINUSTYPE, TIMESTYPE, SLASHTYPE,
    ASSIGNMENTTYPE,
    SMALLERTYPE, GREATERTYPE, EQUALTYPE,
    SMALLEREQUALTYPE, GREATEREQUALTYPE, NOTEQUALTYPE,
    OPENBRACKETTYPE, CLOSEBRACKETTYPE,
    COMMATYPE,
    COLONTYPE,

    // 15 reserved words
    BEGINTYPE, ENDTYPE, PROGRAMTYPE, VARIABLESTYPE, INTEGERTYPE,
    ARRAYTYPE, CHARTYPE, MODULETYPE, IFTYPE, THENTYPE,
    ELSETYPE, LOOPTYPE, EXITTYPE, READTYPE, WRITETYPE
};
// States of the scanner's DFA. The numeric value of each enumerator is used as
// a row index into the transition table, so the order below must not change.
enum TokenState
{
    // ---- 38 final states, kept first and in TokenType order ----
    // 5 general
    IDEND = 0,
    NUMBEREND,
    CONSTCHAREND,
    COMMENTEND,
    ERROR,
    // 18 symbols
    OPENPAREND, CLOSEPAREND,
    SEMICOLONEND,
    PLUSEND, MINUSEND, TIMESEND, SLASHEND,
    ASSIGNMENTEND,
    SMALLEREND, GREATEREND, EQUALEND,
    SMALLEREQUALEND, GREATEREQUALEND, NOTEQUALEND,
    OPENBRACKETEND, CLOSEBRACKETEND,
    COMMAEND,
    COLONEND,
    // 15 reserved words
    BEGINEND, ENDEND, PROGRAMEND, VARIABLESEND, INTEGEREND,
    ARRAYEND, CHAREND, MODULEEND, IFEND, THENEND,
    ELSEEND, LOOPEND, EXITEND, READEND, WRITEEND,

    // ---- non-final states ----
    // start state, before the first character of a token
    READY,

    // reserved-word prefixes, grouped by how many letters have been matched
    // 1 letter (12 states)
    ARRAY1, BEGIN1, CHAR1, E1, I1, LOOP1,
    MODULE1, PROGRAM1, READ1, THEN1, VARIABLES1, WRITE1,
    // 2 letters (15 states)
    ARRAY2, BEGIN2, CHAR2, ELSE2, END2, EXIT2, IF2, INTEGER2,
    LOOP2, MODULE2, PROGRAM2, READ2, THEN2, VARIABLES2, WRITE2,
    // 3 letters (14 states)
    ARRAY3, BEGIN3, CHAR3, ELSE3, END3, EXIT3, INTEGER3,
    LOOP3, MODULE3, PROGRAM3, READ3, THEN3, VARIABLES3, WRITE3,
    // 4 letters (13 states)
    ARRAY4, BEGIN4, CHAR4, ELSE4, EXIT4, INTEGER4,
    LOOP4, MODULE4, PROGRAM4, READ4, THEN4, VARIABLES4, WRITE4,
    // 5 letters (7 states)
    ARRAY5, BEGIN5, INTEGER5, MODULE5, PROGRAM5, VARIABLES5, WRITE5,
    // 6 letters (4 states)
    INTEGER6, MODULE6, PROGRAM6, VARIABLES6,
    // 7 letters (3 states)
    INTEGER7, PROGRAM7, VARIABLES7,
    // 8 letters (1 state)
    VARIABLES8,
    // 9 letters (1 state)
    VARIABLES9,

    // identifiers, numbers, character constants and comments (9 states)
    IDBEGIN, IDUNDERSCORE,
    NUMBERBEGIN,
    CONSTCHARQUOTEBEGIN, CONSTCHARBEGIN,
    COMMENTSTARBEGIN, COMMENTBEGIN, COMMENTSTAREND, COMMENTSLASHBEGIN,

    // one state per single-character symbol (16 states)
    QUOTEBEGIN,
    OPENPARBEGIN, CLOSEPARBEGIN,
    SEMICOLONBEGIN,
    PLUSBEGIN, MINUSBEGIN, TIMESBEGIN, SLASHBEGIN,
    COLONBEGIN, SMALLERBEGIN, GREATERBEGIN, EQUALBEGIN, EXCLAIMBEGIN,
    OPENBRACKETBEGIN, CLOSEBRACKETBEGIN,
    COMMABEGIN,

    // two-character symbols (4 states)
    ASSIGNMENTBEGIN, SMALLEREQUALBEGIN, GREATEREQUALBEGIN, NOTEQUALBEGIN
};
class Scanner
{
struct Token
{
TokenType type;
char name[MaxTokenLength+1];
};
private:
unsigned char ch;
FILE* stream;
bool nextChar();
void initialize();
public:
Scanner();
ErrorCode errorNo;
Token token;
void errorHandle();
bool readFromFile(const char* fileName);
const char* getToken(){return token.name;}
bool nextToken();
};
// Fills the global character-class and state-transition tables (defined in initialize.cpp).
void initialTokenState();
#endif
file name: errorNo.h
#ifndef ERRORNO_H
#define ERRORNO_H
// Number of enumerators in ErrorCode.
const int ErrorCount = 5;
// Number of token categories.
const int TokenTypeCount = 38;

// Error conditions reported by the scanner.
enum ErrorCode
{
    IllegalToken,
    TokenTooLong,
    UnexpectedReachEOF,
    FileEmptyError,
    CannotOpenFile
};
#endif
file name: scanner.cpp
#include <iostream>
#include "scanner.h"
#include "errorNo.h"
using namespace std;
// Message text for each ErrorCode value; indexed by the enumerator, so the
// entries must stay in the same order as the enum.
char* errorStr[ErrorCount]=
{"IllegalToken", "TokenTooLong", "UnexpectedReachEOF", "FileEmptyError", "CannotOpenFile"};
// Display name for each TokenType value; indexed by the enumerator, so the
// entries must stay in the same order as the enum.
char* tokenTypeStr[TokenTypeCount]=
{
//GENERAL TYPE 5
"ID", "NUMBER", "CHARACTER CONSTANT", "COMMENT", "ERROR",
//THE FOLLOWING ARE SYMBOL TYPE 18
"(", ")", ";", "+", "-", "*",
"/", ":=", "<", ">", "=", "<=",
">=", "!=", "[", "]", ",",
":",
//THE FOLLOWING ARE RESERVED TYPE 15
"begin", "end", "program", "variables","integer", "array", "char",
"module", "if", "then", "else", "loop", "exit", "read", "write"
};
// Character class of every possible byte value; filled in by initialCharType().
CharType charType[256];
// DFA transition table: tokenState[current state][character class] is the next state.
TokenState tokenState[TokenStateCount][CharTypeCount];
// Scans the next token, starting from the look-ahead character held in "ch".
//
// On return "token.type" holds the category and "token.name" the
// null-terminated text collected for the token (comment bodies are not
// stored). When token.type is ERRORTYPE, "errorNo" tells which error occurred.
//
// Returns true when a token (or an error) was produced before the input ran
// out, and false once nextChar() reports the end of the input.
// NOTE(review): a valid token that ends exactly at end of input is stored in
// "token" but still reported with a false return value.
//
// Every character consumed is echoed to cout.
bool Scanner::nextToken()
{
    TokenState state=READY;
    int count=0;            // number of characters stored in token.name
    char* ptr=token.name;   // next free position in token.name
    bool isComment=false;   // set once the scanner is inside a comment body
    do
    {
        state=tokenState[state][charType[ch]];
        // All final states are declared before READY, in the same order as
        // TokenType, so a final state converts directly to its token type.
        if (state<READY)
        {
            *ptr='\0';
            if (state==ERROR)
            {
                errorNo=IllegalToken;
            }
            token.type=(TokenType)(state);
            // "ch" is the character that ended the token; it is not consumed.
            return true;
        }
        if (state==COMMENTBEGIN)
        {
            isComment=true;
        }
        if (count>=MaxTokenLength)
        {
            // Terminate the partial text so that getToken() returns a valid
            // C string when the caller reports the error.
            *ptr='\0';
            errorNo=TokenTooLong;
            token.type=ERRORTYPE;
            return true;
        }
        cout<<ch;
        // Leading white space (state still READY) and comment bodies are not
        // part of the stored token text.
        if (!isComment&&state!=READY)
        {
            *ptr=ch;
            ptr++;
            count++;
        }
    }while (nextChar());
    // End of input: "ch" now holds the value 255, which initialCharType()
    // classifies as WHITESPACE, so one more transition closes a pending token.
    state=tokenState[state][charType[ch]];
    if (state<READY)
    {
        *ptr='\0';
        if (state==ERROR)
        {
            // keep errorNo consistent with the in-loop error path
            errorNo=IllegalToken;
        }
        token.type=(TokenType)(state);
    }
    else
    {
        // READY here means only white space was left, which is not an error.
        if (state!=READY)
        {
            // the input ended in the middle of a token
            *ptr='\0';
            errorNo=UnexpectedReachEOF;
            token.type=ERRORTYPE;
        }
    }
    return false;
}
// Creates a scanner with no input attached and builds the lookup tables.
// All data members are given defined values so that reading "token" or
// "errorNo" before the first scan does not touch indeterminate memory.
Scanner::Scanner()
    : ch(0), stream(NULL), errorNo(IllegalToken)
{
    token.type=IDTYPE;      // first enumerator; any non-error value would do
    token.name[0]='\0';     // empty token text
    initialize();
}
// Writes the message for the current errorNo to cout, on a line of its own.
void Scanner::errorHandle()
{
    const char* message=errorStr[errorNo];
    cout<<"\n"<<message<<endl;
}
void Scanner::initialize()
{
initialTokenState();
}
// Opens fileName for reading and loads the first look-ahead character.
// Returns true on success. On failure returns false with errorNo set to
// CannotOpenFile (the file could not be opened) or FileEmptyError (no
// character could be read).
bool Scanner::readFromFile(const char* fileName)
{
    stream=fopen(fileName, "r");
    if (stream==NULL)
    {
        errorNo=CannotOpenFile;
        return false;
    }
    // nextToken() expects one character to be available in advance.
    if (nextChar())
    {
        return true;
    }
    errorNo=FileEmptyError;
    return false;
}
// Reads the next character of the input into "ch".
// Returns true when a character was read. Returns false at end of input (or
// on a read error; fgetc reports both as EOF), in which case "ch" is set to
// 255, the value the scanner tables treat as the end-of-input delimiter.
bool Scanner::nextChar()
{
    // Keep the result as int: fgetc returns EOF as a value outside the
    // unsigned char range, so a real data byte 0xFF is not mistaken for the
    // end of the file.
    const int next=fgetc(stream);
    if (next==EOF)
    {
        ch=255;
        return false;
    }
    ch=static_cast<unsigned char>(next);
    return true;
}
file name: initialize.cpp
#include "scanner.h"
// CharType and TokenState are complete enum types from scanner.h, so they need
// no further declaration here ("extern enum X;" is not valid standard C++).
//extern const int TokenStateCount;
//extern const int CharTypeCount;

// Lookup tables defined in scanner.cpp.
extern CharType charType[256];
extern TokenState tokenState[TokenStateCount][CharTypeCount];

// Helpers used to build the tables; all are defined later in this file.
void finalSymbolToken(TokenState state, TokenState endState);
void finalReservedToken(TokenState state, TokenState endState);
void initialCharType();
void setFinalTokenState();
void initialReserved(TokenState state);
void setRange(TokenState state, CharType start, CharType end, TokenState target);
void setState(TokenState state, TokenState targetState);
void setDefaultState();
// Gives every entry of the transition table its default value:
//   1. every state goes to ERROR on every character class;
//   2. from READY, every letter starts an identifier;
//   3. every reserved-word prefix state behaves like an identifier;
//   4. the final-state transitions are installed.
void setDefaultState()
{
    // Declared once for both loops: standard C++ limits a variable declared
    // in a for-init to that loop, while older compilers leak it into the
    // enclosing scope. A single declaration compiles under both rules.
    int i;
    for (i=0; i<TokenStateCount; i++)
    {
        setState((TokenState)i, ERROR);
    }
    //the default for all letters is IDBEGIN
    setRange(READY, SMALLA, BIGZ, IDBEGIN);
    //All reserved-word prefix states are declared contiguously in TokenState
    //(ARRAY1 through VARIABLES9), so they can be initialized in one loop.
    for (i=ARRAY1; i<=VARIABLES9; i++)
    {
        initialReserved((TokenState)i);
    }
    setFinalTokenState();
}
void setFinalTokenState()
{
//FOR ID
finalReservedToken(IDBEGIN, IDEND);
//for number
finalReservedToken(NUMBERBEGIN, NUMBEREND);
//THESE FOR RESERVED WORDS
finalReservedToken(ARRAY5, ARRAYEND);
finalReservedToken(BEGIN5, BEGINEND);
finalReservedToken(CHAR4, CHAREND);
finalReservedToken(ELSE4, ELSEEND);
finalReservedToken(END3, ENDEND);
finalReservedToken(EXIT4, EXITEND);
finalReservedToken(IF2, IFEND);
finalReservedToken(INTEGER7, INTEGEREND);
finalReservedToken(LOOP4, LOOPEND);
finalReservedToken(MODULE6, MODULEEND);
finalReservedToken(PROGRAM7, PROGRAMEND);
finalReservedToken(READ4, READEND);
finalReservedToken(THEN4, THENEND);
finalReservedToken(VARIABLES9, VARIABLESEND);
finalReservedToken(WRITE5, WRITEEND);
//THESE FOR SYMBOLS
finalSymbolToken(OPENPARBEGIN, OPENPAREND);
finalSymbolToken(CLOSEPARBEGIN, CLOSEPAREND);
finalSymbolToken(SEMICOLONBEGIN, SEMICOLONEND);
finalSymbolToken(PLUSBEGIN, PLUSEND);
finalSymbolToken(MINUSBEGIN, MINUSEND);
finalSymbolToken(TIMESBEGIN, TIMESEND);
finalSymbolToken(SLASHBEGIN, SLASHEND);
finalSymbolToken(ASSIGNMENTBEGIN, ASSIGNMENTEND);
finalSymbolToken(SMALLERBEGIN, SMALLEREND);
finalSymbolToken(GREATERBEGIN, GREATEREND);
finalSymbolToken(EQUALBEGIN, EQUALEND);
finalSymbolToken(SMALLEREQUALBEGIN, SMALLEREQUALEND);
finalSymbolToken(GREATEREQUALBEGIN, GREATEREQUALEND);
finalSymbolToken(NOTEQUALBEGIN, NOTEQUALEND);
finalSymbolToken(OPENBRACKETBEGIN, OPENBRACKETEND);
finalSymbolToken(CLOSEBRACKETBEGIN, CLOSEBRACKETEND);
finalSymbolToken(COMMABEGIN, COMMAEND);
finalSymbolToken(COLONBEGIN, COLONEND);
//COMMENT
finalSymbolToken(COMMENTSLASHBEGIN, COMMENTEND);
//CONSTCHAR
finalSymbolToken(CONSTCHARQUOTEBEGIN, CONSTCHAREND);
}
// Builds the complete scanner tables: first the character classes, then the
// default transitions, then every specific transition of the DFA.
// Later assignments deliberately overwrite the defaults installed earlier, so
// the order of the sections below matters.
void initialTokenState()
{
    //initialize all charType
    initialCharType();
    //default is always error
    setDefaultState();

    //leading white space keeps the scanner in READY
    tokenState[READY][WHITESPACE]=READY;

    //number: one or more digits
    tokenState[READY][DIGIT]=NUMBERBEGIN;
    tokenState[NUMBERBEGIN][DIGIT]=NUMBERBEGIN;

    //identifier: letters and digits loop; an underscore must be followed by
    //a letter or digit. (READY -> IDBEGIN on letters is set in setDefaultState.)
    setRange(IDBEGIN, SMALLA, DIGIT, IDBEGIN);
    tokenState[IDBEGIN][UNDERSCORE]=IDUNDERSCORE;
    setRange(IDUNDERSCORE, SMALLA, DIGIT, IDBEGIN);

    //reserved words, 1st letter ("else", "end", "exit" share E1; "if", "integer" share I1)
    tokenState[READY][SMALLA]=ARRAY1;
    tokenState[READY][SMALLB]=BEGIN1;
    tokenState[READY][SMALLC]=CHAR1;
    tokenState[READY][SMALLE]=E1;
    tokenState[READY][SMALLI]=I1;
    tokenState[READY][SMALLL]=LOOP1;
    tokenState[READY][SMALLM]=MODULE1;
    tokenState[READY][SMALLP]=PROGRAM1;
    tokenState[READY][SMALLR]=READ1;
    tokenState[READY][SMALLT]=THEN1;
    tokenState[READY][SMALLV]=VARIABLES1;
    tokenState[READY][SMALLW]=WRITE1;

    //reserved words, 2nd letter
    tokenState[ARRAY1][SMALLR]=ARRAY2;
    tokenState[BEGIN1][SMALLE]=BEGIN2;
    tokenState[CHAR1][SMALLH]=CHAR2;
    tokenState[E1][SMALLL]=ELSE2;
    tokenState[E1][SMALLX]=EXIT2;
    tokenState[E1][SMALLN]=END2;
    tokenState[I1][SMALLF]=IF2;
    tokenState[I1][SMALLN]=INTEGER2;
    tokenState[LOOP1][SMALLO]=LOOP2;
    tokenState[MODULE1][SMALLO]=MODULE2;
    tokenState[PROGRAM1][SMALLR]=PROGRAM2;
    tokenState[READ1][SMALLE]=READ2;
    tokenState[THEN1][SMALLH]=THEN2;
    tokenState[VARIABLES1][SMALLA]=VARIABLES2;
    tokenState[WRITE1][SMALLR]=WRITE2;

    //reserved words, 3rd letter
    tokenState[ARRAY2][SMALLR]=ARRAY3;
    tokenState[BEGIN2][SMALLG]=BEGIN3;
    tokenState[CHAR2][SMALLA]=CHAR3;
    tokenState[ELSE2][SMALLS]=ELSE3;
    tokenState[END2][SMALLD]=END3;
    tokenState[EXIT2][SMALLI]=EXIT3;
    tokenState[INTEGER2][SMALLT]=INTEGER3;
    tokenState[LOOP2][SMALLO]=LOOP3;
    tokenState[MODULE2][SMALLD]=MODULE3;
    tokenState[PROGRAM2][SMALLO]=PROGRAM3;
    tokenState[READ2][SMALLA]=READ3;
    tokenState[THEN2][SMALLE]=THEN3;
    tokenState[VARIABLES2][SMALLR]=VARIABLES3;
    tokenState[WRITE2][SMALLI]=WRITE3;

    //reserved words, 4th letter
    tokenState[ARRAY3][SMALLA]=ARRAY4;
    tokenState[BEGIN3][SMALLI]=BEGIN4;
    tokenState[CHAR3][SMALLR]=CHAR4;
    tokenState[ELSE3][SMALLE]=ELSE4;
    tokenState[EXIT3][SMALLT]=EXIT4;
    tokenState[INTEGER3][SMALLE]=INTEGER4;
    tokenState[LOOP3][SMALLP]=LOOP4;
    tokenState[MODULE3][SMALLU]=MODULE4;
    tokenState[PROGRAM3][SMALLG]=PROGRAM4;
    tokenState[READ3][SMALLD]=READ4;
    tokenState[THEN3][SMALLN]=THEN4;
    tokenState[VARIABLES3][SMALLI]=VARIABLES4;
    tokenState[WRITE3][SMALLT]=WRITE4;

    //reserved words, 5th letter
    tokenState[ARRAY4][SMALLY]=ARRAY5;
    tokenState[BEGIN4][SMALLN]=BEGIN5;
    tokenState[INTEGER4][SMALLG]=INTEGER5;
    tokenState[MODULE4][SMALLL]=MODULE5;
    tokenState[PROGRAM4][SMALLR]=PROGRAM5;
    tokenState[VARIABLES4][SMALLA]=VARIABLES5;
    tokenState[WRITE4][SMALLE]=WRITE5;

    //reserved words, 6th letter
    tokenState[INTEGER5][SMALLE]=INTEGER6;
    tokenState[MODULE5][SMALLE]=MODULE6;
    tokenState[PROGRAM5][SMALLA]=PROGRAM6;
    tokenState[VARIABLES5][SMALLB]=VARIABLES6;

    //reserved words, 7th letter
    tokenState[INTEGER6][SMALLR]=INTEGER7;
    tokenState[PROGRAM6][SMALLM]=PROGRAM7;
    tokenState[VARIABLES6][SMALLL]=VARIABLES7;

    //reserved words, 8th and 9th letter ("variables" only)
    tokenState[VARIABLES7][SMALLE]=VARIABLES8;
    tokenState[VARIABLES8][SMALLS]=VARIABLES9;

    //first character of every symbol
    tokenState[READY][QUOTE]=QUOTEBEGIN;                // '
    tokenState[READY][OPENPAR]=OPENPARBEGIN;            // (
    tokenState[READY][CLOSEPAR]=CLOSEPARBEGIN;          // )
    tokenState[READY][SEMICOLON]=SEMICOLONBEGIN;        // ;
    tokenState[READY][PLUS]=PLUSBEGIN;                  // +
    tokenState[READY][MINUS]=MINUSBEGIN;                // -
    tokenState[READY][TIMES]=TIMESBEGIN;                // *
    tokenState[READY][SLASH]=SLASHBEGIN;                // /
    tokenState[READY][COLON]=COLONBEGIN;                // :
    tokenState[READY][SMALLER]=SMALLERBEGIN;            // <
    tokenState[READY][GREATER]=GREATERBEGIN;            // >
    tokenState[READY][EQUAL]=EQUALBEGIN;                // =
    tokenState[READY][EXCLAIM]=EXCLAIMBEGIN;            // !
    tokenState[READY][OPENBRACKET]=OPENBRACKETBEGIN;    // [
    tokenState[READY][CLOSEBRACKET]=CLOSEBRACKETBEGIN;  // ]
    tokenState[READY][COMMA]=COMMABEGIN;                // ,

    //character constant: after the opening quote any character is accepted,
    //including illegal ones; it must then be followed by the closing quote
    //(every other class stays ERROR by default)
    setRange(QUOTEBEGIN, SMALLA, ILLIGAL, CONSTCHARBEGIN);
    tokenState[CONSTCHARBEGIN][QUOTE]=CONSTCHARQUOTEBEGIN;

    //two-character tokens: these override the single-symbol defaults that
    //setFinalTokenState installed for the first character
    tokenState[SLASHBEGIN][TIMES]= COMMENTSTARBEGIN;    // "/*" opens a comment
    tokenState[COLONBEGIN][EQUAL]= ASSIGNMENTBEGIN;     // ":="
    tokenState[SMALLERBEGIN][EQUAL]=SMALLEREQUALBEGIN;  // "<="
    tokenState[GREATERBEGIN][EQUAL]= GREATEREQUALBEGIN; // ">="
    tokenState[EXCLAIMBEGIN][EQUAL]= NOTEQUALBEGIN;     // "!="

    //comment body: every character loops inside the comment, except '*'
    //which may be the start of the closing "*/"
    setRange(COMMENTSTARBEGIN, SMALLA, ILLIGAL, COMMENTBEGIN);
    tokenState[COMMENTSTARBEGIN][TIMES]=COMMENTSTAREND;
    setRange(COMMENTBEGIN, SMALLA, ILLIGAL, COMMENTBEGIN);
    tokenState[COMMENTBEGIN][TIMES]=COMMENTSTAREND;
    //after a '*' inside a comment: '/' closes the comment, another '*' keeps
    //waiting for the '/', anything else returns to the comment body
    setRange(COMMENTSTAREND, SMALLA, ILLIGAL, COMMENTBEGIN);
    //Without this entry the second '*' of "**/" would fall back to
    //COMMENTBEGIN and the closing "*/" would be missed.
    tokenState[COMMENTSTAREND][TIMES]=COMMENTSTAREND;
    tokenState[COMMENTSTAREND][SLASH]=COMMENTSLASHBEGIN;
}
// Makes a reserved-word prefix state behave like an identifier state. The
// three steps write disjoint groups of columns, so their order is irrelevant.
void initialReserved(TokenState state)
{
    // an underscore continues the word as an identifier, e.g. "a_"
    tokenState[state][UNDERSCORE]=IDUNDERSCORE;
    // a delimiter ends the word as a plain identifier
    finalReservedToken(state, IDEND);
    // any letter or digit continues the word as an identifier
    setRange(state, SMALLA, DIGIT, IDBEGIN);
}
// From "state", every character class from SMALLA through WHITESPACE leads to
// "endState". UNDERSCORE and ILLIGAL are left untouched.
void finalSymbolToken(TokenState state, TokenState endState)
{
    setRange(state, SMALLA, WHITESPACE, endState);
}
// From "state", every delimiter class leads to "endState". The delimiters are
// the classes from QUOTE through WHITESPACE, i.e. everything that is neither a
// letter nor a digit; UNDERSCORE and ILLIGAL are left untouched.
void finalReservedToken(TokenState state, TokenState endState)
{
    setRange(state, QUOTE, WHITESPACE, endState);
}
// Fills the charType table, which maps every byte value to its character
// class. Anything not listed below stays ILLIGAL.
void initialCharType()
{
    int chType;
    // Declared once for all loops: standard C++ limits a variable declared in
    // a for-init to that loop, while older compilers leak it into the
    // enclosing scope. A single declaration compiles under both rules.
    int i;
    //the default charType is ILLIGAL
    for (i=0; i<256; i++)
    {
        charType[i]=ILLIGAL;
    }
    //lower-case letters: SMALLA, SMALLB, ... in alphabetical order
    chType=SMALLA;
    for (i='a'; i<='z'; i++)
    {
        charType[i]=(CharType)(chType);
        chType++;
    }
    //upper-case letters: restart explicitly at BIGA instead of relying on
    //BIGA directly following SMALLZ in the enum
    chType=BIGA;
    for (i='A'; i<='Z'; i++)
    {
        charType[i]=(CharType)(chType);
        chType++;
    }
    //all digits share the single class DIGIT
    chType=DIGIT;
    for (i='0'; i<='9'; i++)
    {
        charType[i]=(CharType)(chType);
    }
    //symbols
    charType['_']=UNDERSCORE;
    charType['\'']=QUOTE;
    charType['(']=OPENPAR;
    charType[')']=CLOSEPAR;
    charType[';']=SEMICOLON;
    charType['+']=PLUS;
    charType['-']=MINUS;
    charType['*']=TIMES;
    charType['/']=SLASH;
    charType[':']=COLON;
    charType['=']=EQUAL;
    charType['<']=SMALLER;
    charType['>']=GREATER;
    charType['!']=EXCLAIM;
    charType['[']=OPENBRACKET;
    charType[']']=CLOSEBRACKET;
    charType[',']=COMMA;
    //white space: space, tab, line feed, carriage return
    charType[' ']=WHITESPACE;
    charType['\t']=WHITESPACE;
    charType[10]=WHITESPACE;
    charType[13]=WHITESPACE;
    //255 is the value held by the look-ahead at end of input; treating it as
    //white space lets it act as a delimiter for the last token
    charType[255]=WHITESPACE;
}
// In the row of "state", sets the columns start..end (both inclusive) to "target".
void setRange(TokenState state, CharType start, CharType end, TokenState target)
{
    TokenState* row=tokenState[state];
    int column=start;
    while (column<=end)
    {
        row[column]=target;
        column++;
    }
}
// Sets every column in the row of "state" to "targetState".
void setState(TokenState state, TokenState targetState)
{
    TokenState* row=tokenState[state];
    for (int column=CharTypeCount-1; column>=0; column--)
    {
        row[column]=targetState;
    }
}
file name: slang.cpp (main)
#include <iostream>
#include "scanner.h"
#include "errorno.h"
using namespace std;
extern char* tokenTypeStr[];
// Entry point: scans the hard-coded input file and prints the type of every
// token until the input ends or an error token is produced.
// Returns 0 after scanning, or 1 when the input file cannot be read.
int main()
{
    Scanner S;
    if (!S.readFromFile("c:\\scannerSource.txt"))
    {
        S.errorHandle();
        // Stop here: without a successfully opened, non-empty input there is
        // nothing for nextToken() to read from.
        return 1;
    }
    while (S.nextToken())
    {
        /*
        if (S.token.type!=COMMENTTYPE)
        {
            cout<<"\nthe token is:"<<S.getToken()<<endl;
        }
        */
        if (S.token.type==ERRORTYPE)
        {
            break;
        }
        cout<<"\nthe type of token is:"<<tokenTypeStr[S.token.type]<<endl;
    }
    // Reached either through the break above or because nextToken() detected
    // an error at the end of the input.
    if (S.token.type==ERRORTYPE)
    {
        cout<<"\nthe token is:"<<S.getToken()<<endl;
        S.errorHandle();
    }
    return 0;
}
Here is the result. The input file is "c:\scannerSource.txt".
Sorry, I do not provide the text file. You can even use the source code of the program itself, except that you have
to remove some illegal symbols first, such as ", #, {, }, etc.