CFGReader

name: M
   isNull: false
   isTerminal: false
   first: [0]=program
   follow: [0]=$
name: Dl
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=variables,[2]=module
   follow: [0]=begin
name: B
   isNull: false
   isTerminal: false
   first: [0]=begin
   follow: [0]=$,[1]=module,[2]=begin
name: Dv
   isNull: true
   isTerminal: false
   first: [0]=variables,[1]=e
   follow: [0]=module,[1]=begin
name: Ml
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=module
   follow: [0]=begin
name: Mo
   isNull: false
   isTerminal: false
   first: [0]=module
   follow:
name: Vl
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=module,[1]=begin,[2]=)
name: V
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=i,[1]=module,[2]=begin,[3]=)
name: Il
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=:
name: T
   isNull: false
   isTerminal: false
   first: [0]=integer,[1]=char
   follow: [0]=;
name: Ad
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=array
   follow: [0]=;
name: L
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=,,[1]=;,[2]=)
name: Ar
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=[
   follow: [0]=,,[1]=:=,[2]=;,[3]=),[4]=*,[5]=/,[6]=+,[7]=-,[8]=],[9]==,[10]=<,[11]=>,[12]=<=,[13]=>
=,[14]=!=,[15]=then
name: Sl
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;
   follow: [0]=end
name: S
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;
   follow: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;,[8]=end,[9]=else
name: E
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=(,[3]=c
   follow: [0]=],[1]=),[2]=;,[3]=then
name: C
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=(,[3]=c
   follow: [0]=then
name: Lp
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=i
   follow: [0]=)
name: Ln
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=;,[1]=)
name: Lo
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=c
   follow: [0]=;
name: Lr
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=c
   follow: [0]=,,[1]=;
name: F
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=(,[3]=c
   follow: [0]=+,[1]=-,[2]=],[3]=),[4]=;,[5]==,[6]=<,[7]=>,[8]=<=,[9]=>=,[10]=!=,[11]=then
name: Oa
   isNull: false
   isTerminal: false
   first: [0]=+,[1]=-
   follow:
name: R
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=(,[3]=c
   follow: [0]=*,[1]=/,[2]=+,[3]=-,[4]=],[5]=),[6]=;,[7]==,[8]=<,[9]=>,[10]=<=,[11]=>=,[12]=!=,[13]=
then
name: Om
   isNull: false
   isTerminal: false
   first: [0]=*,[1]=/
   follow:
name: Or
   isNull: false
   isTerminal: false
   first: [0]==,[1]=<,[2]=>,[3]=<=,[4]=>=,[5]=!=
   follow: [0]=i,[1]=n,[2]=(,[3]=c
name: M0
   isNull: true
   isTerminal: false
   first: [0]=+,[1]=e,[2]=-
   follow: [0]=],[1]=),[2]=;,[3]==,[4]=<,[5]=>,[6]=<=,[7]=>=,[8]=!=,[9]=then
name: M1
   isNull: true
   isTerminal: false
   first: [0]=,,[1]=e
   follow: [0]=;,[1]=)
name: M2
   isNull: true
   isTerminal: false
   first: [0]=,,[1]=e
   follow: [0]=;
name: M3
   isNull: true
   isTerminal: false
   first: [0]=*,[1]=e,[2]=/
   follow: [0]=+,[1]=-,[2]=],[3]=),[4]=;,[5]==,[6]=<,[7]=>,[8]=<=,[9]=>=,[10]=!=,[11]=then
name: Ml0
   isNull: true
   isTerminal: false
   first: [0]=module,[1]=e
   follow: [0]=begin
name: Vl0
   isNull: true
   isTerminal: false
   first: [0]=i,[1]=e
   follow: [0]=module,[1]=begin,[2]=)
name: Il0
   isNull: true
   isTerminal: false
   first: [0]=,,[1]=e
   follow: [0]=:
name: Sl0
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=i,[2]=if,[3]=loop,[4]=exit,[5]=begin,[6]=read,[7]=write,[8]=;
   follow: [0]=end
name: S0
   isNull: false
   isTerminal: false
   first: [0]=[,[1]=:=,[2]=(
   follow: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;,[8]=end,[9]=else
name: START
   isNull: false
   isTerminal: false
   first: [0]=program
   follow:
Press any key to continue

#include "Grammar.h" class CFGReader { private: char buffer[BufferLength]; FILE* stream; void newRule(const char* str); void readRule(); void addRule(const char* str); int addToken(const char* str, bool isTerminal=true); void printRule(int index); public: void readFromFile(const char* fileName); void optimize(); void print(); };

#include <iostream> #include "CFGReader.h" using namespace std; Grammar grammar; const char* deliStr=" |\n"; //this would only remove the immediate left recursion //and should I expand it to remove all kinds of left-recursion? void CFGReader::readFromFile(const char* fileName) { if ((stream=fopen(fileName, "r"))==NULL) { cout<<"cannot open file "<<fileName<<endl; } else { readRule(); } } void CFGReader::readRule() { char* ptr=buffer; char* hold; int count=0; while (!feof(stream)) { count=0; fgets(buffer, BufferLength-1, stream); ptr=buffer; while ((ptr=strtok(ptr, " \n"))!=NULL) { if (strcmp(ptr, "|")!=0) { //test the first two token to see if old rule continues if (count==0) { hold=ptr; } if (count==1) { if (strcmp("->", ptr)==0) { /* curIndex=addToken(hold, false);//the index of cur token grammar[curIndex]->terminal=false; newRule(); */ newRule(hold); } else { addRule(hold); addRule(ptr); } } if (count>=2)//always add { addRule(ptr); } count++; } else { //this is an alternative production rule newRule(NULL); } ptr=NULL; } } } void CFGReader::newRule(const char* str) { grammar.newRule(str); } void CFGReader::optimize() { grammar.optimize(); } void CFGReader::addRule(const char* str) { grammar.addRule(str); } int CFGReader::addToken(const char* str, bool isTerminal) { return grammar.addToken(str, isTerminal); } void CFGReader::printRule(int index) { grammar.printRule(index); } void CFGReader::print() { grammar.print(); grammar.printToken(true); //grammar.buildTable(); //grammar.printTable(); //grammar.printAllRules(); }

#ifndef GRAMMAR_H #define GRAMMAR_H const int BufferLength=256; const int MaxTokenCount=80; const int MaxRHSCount=80; const int MaxRuleCount=200; const int MaxGrammarTokenLength=10; const int MaxProduction=10; const int TokenTypeCount=38; struct GrammarToken { //bool isMeta; bool terminal; bool isNull; char* name; int production[MaxProduction];//pointer to the production it gives int count; int firstCount; int followCount; int follow[MaxTokenCount]; int first[MaxTokenCount]; }; class Grammar { friend class Parser; private: GrammarToken* token[MaxTokenCount]; int tokenCount; int curIndex;//the index of current token int curPos;//the position at production rule //MaxRuleCount>=MaxVariableCount!!! int production[MaxRuleCount][MaxRHSCount]; int prodIndex;//pointing to current production, initialized to -1 int checkRecursion(int tIndex, int curToken); void grammarError(int theVar, int theToken, int rule1, int rule2); void addFirstIntoTable(int theVariable, int theFirst, int theRule); void addFollowIntoTable(int theVariable, int theRule); void addBeginEnd(); void addEpsilonForToken(int tIndex); bool addIntoFirst(int target, int source); bool addFirst(int target, int source); void shiftRule(int ruleIndex, bool Left2Right, int offset); void addAtEnd(int ruleIndex, int toAdd); void addRuleForToken(int tIndex, int ruleIndex); int copyRule(int source, int target);//return the position of -1 void replaceRule(int curToken, int ruleIndex); int findMetaSymbol(int& begin, int& end); void initialize(); void doReplaceMetaSymbol(int ruleIndex, int begin, int end); void removeRuleFromToken(int tIndex, int ruleIndex); void checkEpsilon(int ruleIndex); void removeImmediate(int tIndex, int ruleIndex); void doAddRule(int tIndex); void commonLeftFactor(); int findCommonFactor(int tokenIndex, int ruleIndex); void removeLeftRecursion(); void doCommonFactor(int tokenIndex, int ruleIndex, int index); int forgeToken(int index);//create a new variable void epsilonRule(int ruleIndex); bool isRedundant(int tIndex); void replaceMetaSymbol(); void calculateFirst(); void calculateFollow(); void calculateNull(); bool Null(int tIndex); bool addFollow(int target, int source); bool addIntoFollow(int target, int source); bool addFollowIntoFollow(int target, int source); void removeRedundant(); void removeToken(int tIndex); // void calculateProperty(); public: Grammar(); void buildTable(); void optimize(); void printRule(int index); void print(); void printTable(); void printAllRules(); void printToken(bool onlyVar=false); int variableCount(); int terminalCount(); void addRule(const char* str); void newRule(const char* str);//this is an internal method, not suitable for outside GrammarToken* operator[](int index); int addToken(const char* str, bool isTerminal=true); GrammarToken* createToken(const char* theName, bool isTerminal); int tokenIndex(const char* str); }; #endif

#include <iostream> #include "Grammar.h" using namespace std; const char* emptyStr="e"; const char* optionBegin="{"; const char* optionEnd="}"; const char* startStr="START"; const char* endStr="$"; int startSymbolIndex=-1; int stackBottomIndex=-1; int beginIndex=-1; int endIndex=-1; int emptyIndex=-1; int epsilonIndex=-1; int table[MaxTokenCount][MaxTokenCount]; char* terminalStr[TokenTypeCount]= { //GENERAL TYPE 5 "i", "n", "c", "COMMENT", "ERROR", //THE FOLLOWING ARE SYMBOL TYPE 18 "(", ")", ";", "+", "-", "*", "/", ":=", "<", ">", "=", "<=", ">=", "!=", "[", "]", ",", ":", //THE FOLLOWING ARE RESERVED TYPE 15 "begin", "end", "program", "variables","integer", "array", "char", "module", "if", "then", "else", "loop", "exit", "read", "write" }; //these are "hash-table-like" tables //this is exactly the opposite of below int token2type[MaxTokenCount]; //this is to translate tokenType in scanner to token-index of grammar int type2token[MaxTokenCount]; int matchTokens(const char* str) { for (int i=0; i<TokenTypeCount; i++) { if (strcmp(terminalStr[i], str)==0) { return i; } } return -1; } void Grammar::addEpsilonForToken(int tIndex) { if (epsilonIndex==-1) { epsilonRule(++prodIndex); addRuleForToken(tIndex, prodIndex); } else { addRuleForToken(tIndex, epsilonIndex); } } void Grammar::printTable() { /* cout<<" "; for (int i=0; i<tokenCount; i++) { cout<<"| "<<i; } cout<<endl; */ for (int r=0; r<tokenCount; r++) { if (token[r]->terminal) { continue; } cout<<token[r]->name<<":"; for (int c=0; c<tokenCount; c++) { if (table[r][c]!=-1) { cout<<token[c]->name<<"="<<table[r][c]<<","; } } cout<<endl; } } void Grammar::buildTable() { int typeIndex; for (int r=0; r<tokenCount; r++) { if (token[r]->terminal) { typeIndex=matchTokens(token[r]->name); if (typeIndex!=-1) { type2token[typeIndex]=r; token2type[r]=typeIndex; } } //initialize for (int c=0; c<MaxTokenCount; c++) { table[r][c]=-1; } } for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { continue; } for (int j=0; j<token[i]->count; j++) { int k=0, theRule=token[i]->production[j]; int theToken=production[theRule][k]; while (theToken!=-1) { addFirstIntoTable(i, theToken, theRule); if (!token[theToken]->isNull) { break; } k++; theToken=production[theRule][k]; } if (theToken==-1) { addFollowIntoTable(i, theRule); } } } } /* nCount=tCount=0; for (int r=0; r<MaxTokenCount; r++) { if (r<tokenCount) { if (token[r]->terminal) { typeIndex=matchTokens(token[r]->name); if (typeIndex!=-1) { type2token[typeIndex]=r; token2type[r]=typeIndex; } //I am making a invertable table!!! tArray[r]=tCount++; } else { //it is an invertable table!!! nArray[r]=nCount++; } } for (int c=0; c<MaxTokenCount; c++) { table[r][c]=-1; } } for (int i=0; i<tCount; i++) { for (int j=0; j<token[i]->count; j++) { int k=0, theRule=token[i]->production[j]; int theToken=production[theRule][k]; while (theToken!=-1) { addFirstIntoTable(theToken, j); if (!token[theToken]->isNull) { break; } k++; theToken=production[theRule][k]; } if (theToken==-1) { addFollowIntoTable(j); } } } } */ void Grammar::grammarError(int theVar, int theToken, int rule1, int rule2) { cout<<"error! the conflict of grammar at "; cout<<token[theVar]->name<<" for token of " <<token[theToken]->name <<" between rules of \n"; printRule(rule1); cout<<" and "; printRule(rule2); cout<<endl; } void Grammar::addFollowIntoTable(int theVariable, int theRule) { int temp; for (int i=0; i<token[theVariable]->followCount; i++) { temp=token[theVariable]->follow[i]; if (table[theVariable][temp]!=theRule) { if (table[theVariable][temp]!=-1) { grammarError(theVariable, temp, theRule, table[theVariable][temp]); } table[theVariable][temp]=theRule; } } } void Grammar::addFirstIntoTable(int theVariable, int theFirst, int theRule) { for (int i=0; i<token[theFirst]->firstCount; i++) { if (table[theVariable][token[theFirst]->first[i]]!=theRule) { if (table[theVariable][token[theFirst]->first[i]]!=-1) { grammarError(theVariable, token[theFirst]->first[i], theRule, table[theVariable][token[theFirst]->first[i]]); } table[theVariable][token[theFirst]->first[i]]=theRule; } } } int Grammar::variableCount() { int result=0; for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { result++; } } return result; } int Grammar::terminalCount() { int result=0; for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { result++; } } return result; } //leave the rule unremoved!!! as I have no way to do it!!! void Grammar::removeToken(int tIndex) { delete[]token[tIndex]->name; delete token[tIndex]; tokenCount--; for (int i=tIndex; i<tokenCount; i++) { token[i]=token[i+1]; } } bool Grammar::isRedundant(int tIndex) { int k, theRule, theToken; for (int i=0; i<tokenCount; i++) { for (int j=0; j<token[i]->count; j++) { k=0; theRule=token[i]->production[j]; theToken=production[theRule][k]; while (theToken!=-1) { if (theToken==tIndex) { return false; } k++; theToken=production[theRule][k]; } } } return true; } //what is redundant? except the start variable //the non-terminal never appears in other rules! void Grammar::removeRedundant() { int tIndex=1; bool findNew=false; while (tIndex<tokenCount) { findNew=false; if (!token[tIndex]->terminal) { if (isRedundant(tIndex)) { removeToken(tIndex); findNew=true; } } if (findNew) { tIndex=1; } else { tIndex++; } } } void Grammar::printAllRules() { /* cout<<"\nnow print all rules\n"; for (int i=0; i<=prodIndex; i++) { cout<<"rule index "<<i<<": "; printRule(i); cout<<"\n"; } */ int sum=0; for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { sum+=token[i]->count; } } cout<<" total rules is:"<<prodIndex+1; cout<<"\n and the sum is "<<sum<<endl; } void Grammar::addBeginEnd() { int begin=addToken(startStr, false); int end=addToken(endStr, true); prodIndex++; production[prodIndex][0]=0; production[prodIndex][1]=end; production[prodIndex][2]=-1; addRuleForToken(begin, prodIndex); startSymbolIndex=begin; stackBottomIndex=end; } /* void Grammar::calculateProperty() { calculateNull(); calculateFirst(); calculateFollow(); } */ void Grammar::printToken(bool onlyVar) { for (int i=0; i<tokenCount; i++) { if (onlyVar) { if (token[i]->terminal) { continue; } } cout<<"name: "<<token[i]->name<<endl; cout<<" isNull: "<<(token[i]->isNull?"true":"false")<<endl; cout<<" isTerminal: "<<(token[i]->terminal?"true":"false")<<endl; cout<<" first: "; for (int j=0; j<token[i]->firstCount; j++) { if (j!=0) { cout<<","; } cout<<"["<<j<<"]="<<token[token[i]->first[j]]->name; } cout<<"\n follow: "; for (j=0; j<token[i]->followCount; j++) { if (j!=0) { cout<<","; } cout<<"["<<j<<"]="<<token[token[i]->follow[j]]->name; } cout<<endl; } } //must use while loop to discover new set!!!!! void Grammar::calculateFirst() { int i; bool addNew; do { i=0; addNew=false; while (i<tokenCount) { //the terminal contains itself if (token[i]->terminal) { //addFirst don't judge if it is nullable!! addFirst(i, i); //token[i]->first[token[i]->firstCount++]=i; } else { //for all its rules for (int j=0; j<token[i]->count; j++) { int theToken, k=0; theToken=production[token[i]->production[j]][k]; //for each token in each rule do { //add non epsilon set if (addIntoFirst(i, theToken)) { addNew=true; } //if it is not null, means it is end if (!token[theToken]->isNull) { break; } //if it is nullable, continue k++; theToken=production[token[i]->production[j]][k]; }while (theToken!=-1); //it means all token in this rule is nullable, so if (theToken==-1) { //it is nullable //addEpsilonIntoFirst(i); addFirst(i, emptyIndex); } } } i++; } }while (addNew); } bool Grammar::addFollowIntoFollow(int target, int source) { bool addNew=false; for (int i=0; i<token[source]->followCount; i++) { if (addFollow(target, token[source]->follow[i])) { addNew=true; } } return addNew; } bool Grammar::addIntoFollow(int target, int source) { bool addNew=false; if (source==-1) { return false; } for (int i=0; i<token[source]->firstCount; i++) { //add non-epsilon if (!token[token[source]->first[i]]->isNull) { if (addFollow(target, token[source]->first[i])) { addNew=true; } } } return addNew; } void Grammar::calculateFollow() { int i; bool addNew, started; //token[startSymbolIndex]->follow[0]=stackBottomIndex; //token[startSymbolIndex]->followCount=1; do { i=0; addNew=false; while (i<tokenCount) { //the terminal contains itself if (!token[i]->terminal) { for (int tIndex=0; tIndex<tokenCount; tIndex++) { //for all its rules if (token[tIndex]->terminal) { continue; } //for each its rule for (int j=0; j<token[tIndex]->count; j++) { int theToken, k=0, theRule=token[tIndex]->production[j]; started=false; theToken=production[theRule][k]; //for each token in each rule do { //the token appears here if (started) { if (addIntoFollow(i, theToken)) { addNew=true; } if (!token[theToken]->isNull) { break; } } if (theToken==i) { started=true; //add non epsilon set, including -1 situation!!! } //if it is not null, means it is end //if it is nullable, continue k++; theToken=production[theRule][k]; }while (theToken!=-1); //it means all token in this rule is nullable, so if (started&&theToken==-1) { //it is nullable //add current variable Follow(j) into Follow(i); if (addFollowIntoFollow(i, tIndex)) { addNew=true; } } } } } i++; } }while (addNew); } void Grammar::addRuleForToken(int tIndex, int ruleIndex) { token[tIndex]->production[token[tIndex]->count++]=ruleIndex; } void Grammar::shiftRule(int ruleIndex, bool left2Right, int offset) { int end=0; while (production[ruleIndex][end]!=-1) { end++; } if (left2Right) { for (int i=end; i>=0; i--) { production[ruleIndex][i+offset]=production[ruleIndex][i]; } } else { for (int i=0; i<=end-offset; i++) { production[ruleIndex][i]=production[ruleIndex][i+offset]; } checkEpsilon(ruleIndex); } } void Grammar::checkEpsilon(int ruleIndex) { if (production[ruleIndex][0]==-1) { epsilonRule(ruleIndex); } } bool Grammar::addFollow(int target, int source) { //check if it is already there for (int i=0; i<token[target]->followCount; i++) { if (token[target]->follow[i]==source) { return false; } } //add at end token[target]->follow[token[target]->followCount++]=source; return true; } bool Grammar::addFirst(int target, int source) { //check if it is already there for (int i=0; i<token[target]->firstCount; i++) { if (token[target]->first[i]==source) { return false; } } //add at end token[target]->first[token[target]->firstCount++]=source; return true; } //add non epsilon into it. bool Grammar::addIntoFirst(int target, int source) { bool addNew=false; if (token[source]->terminal) { if (!token[source]->isNull) { return addFirst(target, source); } else { return false; } } for (int i=0; i<token[source]->firstCount; i++) { //add non epsilon if (!token[token[source]->first[i]]->isNull) { if (addFirst(target, token[source]->first[i])) { addNew=true; } } } return addNew; } bool Grammar::Null(int tIndex) { if (token[tIndex]->terminal) { return token[tIndex]->isNull; } for (int i=0; i<token[tIndex]->count; i++) { int j=0, theToken; theToken=production[token[tIndex]->production[i]][j]; do { if (theToken==tIndex||!Null(theToken)) { break; } j++; theToken=production[token[tIndex]->production[i]][j]; }while (theToken!=-1); if (theToken==-1) { return true; } } return false; } void Grammar::calculateNull() { for (int i=0; i<tokenCount; i++) { token[i]->isNull=Null(i); } } int Grammar::findMetaSymbol(int& begin, int& end) { int theRule, theToken, k; begin=end=-1; for (int i=0; i<tokenCount; i++) { for (int j=0; j<token[i]->count; j++) { k=0; theRule=token[i]->production[j]; theToken=production[theRule][k]; while (theToken!=-1) { if (theToken==beginIndex) { begin=k; } if (theToken==endIndex) { end=k; return theRule; } k++; theToken=production[theRule][k]; } } } return -1; } void Grammar::doReplaceMetaSymbol(int ruleIndex, int begin, int end) { int newTokenIndex=forgeToken(0), i=0; addRuleForToken(newTokenIndex, ++prodIndex); //token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex; //token[newTokenIndex]->terminal=false; copyRule(ruleIndex, prodIndex); //shrink while (production[ruleIndex][i+end+1]!=-1) { production[ruleIndex][begin+i]=production[ruleIndex][i+end+1]; i++; } production[ruleIndex][begin+i]=-1; addAtEnd(ruleIndex, newTokenIndex); /* production[ruleIndex][begin]=newTokenIndex; production[ruleIndex][begin+1]=-1; */ shiftRule(prodIndex, false, begin+1); production[prodIndex][end-begin-1]=newTokenIndex; production[prodIndex][end-begin]=-1; addEpsilonForToken(newTokenIndex); /* if (epsilonIndex==-1) { eRuleIndex=++prodIndex; epsilonRule(eRuleIndex); } else { eRuleIndex=epsilonIndex; } addRuleForToken(newTokenIndex, eRuleIndex); */ } void Grammar::replaceMetaSymbol() { int begin, end, ruleIndex; while ((ruleIndex=findMetaSymbol(begin, end))!=-1) { doReplaceMetaSymbol(ruleIndex, begin, end); } /* for (int i=0; i<tokenCount; i++) { //if (token[i]->isMeta) if (i==beginIndex||i==endIndex) { removeToken(i); } } */ } void Grammar::addAtEnd(int ruleIndex, int toAdd) { int end=0; while (production[ruleIndex][end]!=-1) { end++; } production[ruleIndex][end++]=toAdd; production[ruleIndex][end]=-1; } //left-recursion: A -> A a | b | c //change to: A -> b A' | c A' // A' -> a A' | empty void Grammar::removeImmediate(int tIndex, int ruleIndex) { int newIndex=forgeToken(tIndex); int holdRuleIndex=token[tIndex]->production[ruleIndex]; //sequence matters! //change to: A -> b A' for (int i=0; i<token[tIndex]->count; i++) { if (i!=ruleIndex) { addAtEnd(token[tIndex]->production[i], newIndex); } } //shift removeRuleFromToken(tIndex, ruleIndex); addRuleForToken(newIndex, holdRuleIndex); //token[newIndex]->production[token[newIndex]->count++]=holdRuleIndex; shiftRule(holdRuleIndex, false, 1); addAtEnd(holdRuleIndex, newIndex); //add epsilon rule for new variable addEpsilonForToken(newIndex); /* epsilonRule(++prodIndex); addRuleForToken(newIndex, prodIndex); */ } int Grammar::forgeToken(int index) { char str[MaxGrammarTokenLength+2], ch; int len=strlen(token[index]->name); int temp=0, i=0; strcpy(str, token[index]->name); ch=str[len-1];//get last char of name if (ch>='0'&&ch<'9') { str[len-1]=ch+i+1; } else { str[len]='0'+i; str[len+1]='\0'; } //I won't bother to check repetitation of name while (tokenIndex(str)!=-1) { i++; if (ch>='0'&&ch<'9') { str[len-1]=ch+i+1; } else { str[len]='0'+i; str[len+1]='\0'; } } return addToken(str, false);//it is non-terminal for sure } int Grammar::copyRule(int source, int target) { int i=0; while (production[source][i]!=-1) { production[target][i]=production[source][i]; i++; } production[target][i]=-1; return i; } void Grammar::doAddRule(int tIndex) { token[tIndex]->production[token[tIndex]->count++]=++prodIndex; } void Grammar::addRule(const char* str) { int index; index=addToken(str); production[prodIndex][curPos++]=index; production[prodIndex][curPos]=-1;//set end } //if the token is already there, it return the index //otherwise, it add new node in token array int Grammar::addToken(const char* str, bool isTerminal) { int index; index=tokenIndex(str); if (index==-1) { index=tokenCount; } else { return index; } token[index]=createToken(str, isTerminal); if (strcmp(str, optionBegin)==0) { beginIndex=index; } if (strcmp(str, optionEnd)==0) { endIndex=index; } tokenCount++; if (strcmp(str, emptyStr)==0) { token[index]->isNull=true; emptyIndex=index; } return index; } void Grammar::newRule(const char* str) { if (str!=NULL) { curIndex=addToken(str, false); } //add one pointer token[curIndex]->production[token[curIndex]->count++]=++prodIndex; token[curIndex]->terminal=false; curPos=0;//reset to restart; } GrammarToken* Grammar::createToken(const char* theName, bool isTerminal) { GrammarToken* ptr=new GrammarToken; ptr->name=new char[strlen(theName)+1]; strcpy(ptr->name, theName); ptr->terminal=isTerminal; ptr->count=ptr->firstCount=ptr->followCount=0; ptr->isNull=false; return ptr; } int Grammar::tokenIndex(const char* str) { for (int i=0; i<tokenCount; i++) { if (strcmp(str, token[i]->name)==0) { return i; } } return -1; } int Grammar::checkRecursion(int tIndex, int curToken) { for (int i=0; i<token[curToken]->count; i++) { //token[tIndex]->production[i]=ruleIndex //production[ruleIndex][0] is the first left-recursion one if (production[token[curToken]->production[i]][0]<=curToken) { return i; } } return -1; } void Grammar::printRule(int index) { int nodeIndex=0; //cout<<" ~"<<index<<"~ "; while (production[index][nodeIndex]!=-1) { cout<<token[production[index][nodeIndex]]->name<<" "; nodeIndex++; } } void Grammar::initialize() { tokenCount=curIndex=curPos=0; prodIndex=-1;//in order to ++ blindly } Grammar::Grammar() { initialize(); } void Grammar::removeLeftRecursion() { int tIndex=0, curToken=0; bool newChange=false; while (tIndex<tokenCount) { if (!token[tIndex]->terminal) { for (int i=0; i<token[tIndex]->count; i++) { curToken=production[token[tIndex]->production[i]][0]; if (curToken<=tIndex&&!token[curToken]->terminal) { if (curToken!=tIndex) { replaceRule(tIndex, i); } else { removeImmediate(tIndex, i); } newChange=true; } } } //whenever there is some new findings, restart if (!newChange) { tIndex++; } else { tIndex=0; newChange=false; } } } void Grammar::replaceRule(int tIndex, int ruleIndex) { int pos, j, targetEnd, sourceEnd, curRule; curRule=token[tIndex]->production[ruleIndex]; int curToken=production[curRule][0]; for (int i=token[curToken]->count-1; i>=0; i--) { if (i>0) { doAddRule(tIndex); pos=copyRule(token[curToken]->production[i], prodIndex); j=0; while (production[curRule][j+1]!=-1) { production[prodIndex][pos+j]=production[curRule][j+1]; j++; } production[prodIndex][pos+j]=-1; //addRuleForToken(curToken, prodIndex); } else { targetEnd=sourceEnd=0; //curRule=token[tIndex]->production[ruleIndex]; while (true) { if (production[token[curToken]->production[0]][sourceEnd]==-1&& production[curRule][targetEnd]==-1) { break; } if (production[token[curToken]->production[0]][sourceEnd]!=-1) { sourceEnd++; } if (production[curRule][targetEnd]!=-1) { targetEnd++; } } j=targetEnd+sourceEnd-1; while (j>=0) { if (j>=sourceEnd) { production[curRule][j]=production[curRule][j-sourceEnd+1]; } else { production[curRule][j]=production[token[curToken]->production[0]][j]; } j--; } } } } //optimize sequence is first common-factor then remove left recursion //therefore I don't have to check if for same variable if there will be //more than one left-recursion void Grammar::optimize() { replaceMetaSymbol(); removeLeftRecursion(); commonLeftFactor(); //removeRedundant(); addBeginEnd(); calculateNull(); calculateFirst(); calculateFollow(); buildTable(); } int Grammar::findCommonFactor(int tIndex, int ruleIndex) { for (int i=ruleIndex+1; i<token[tIndex]->count; i++) { //if the two rule has the same first token if (production[token[tIndex]->production[ruleIndex]][0]== production[token[tIndex]->production[i]][0]) { /* //calculate if there is epsilon if (emptyIndex==-1) { emptyIndex=tokenIndex(emptyStr); } //if it is not epsilon if (production[token[tIndex]->production[i]][0]!=emptyIndex) { return i; } */ return i; } } return -1; } void Grammar::epsilonRule(int ruleIndex) { production[ruleIndex][0]=addToken(emptyStr); production[ruleIndex][1]=-1; } //sample: x -> Aa // x -> Ab //changed to: x -> Ax' //this is to change the old rule // x' -> b //this is to change the old rule // x' -> a //this is the new-added rule void Grammar::doCommonFactor(int tIndex, int ruleIndex, int index) { int newTokenIndex=forgeToken(tIndex);//create a new variable //move the second and after part to the new rule of new variable //doing: x' -> a //curPos=0; //prepare to add one new rule addRuleForToken(newTokenIndex, ++prodIndex); //token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex; token[newTokenIndex]->terminal=false; copyRule(token[tIndex]->production[ruleIndex], prodIndex); shiftRule(prodIndex, false, 1); /* do { //do copying production[prodIndex][curPos]= production[token[tIndex]->production[ruleIndex]][curPos+1]; curPos++; //even the -1 at end is copied }while (production[token[tIndex]->production[ruleIndex]][curPos]!=-1); */ //in order to show an empty string, in case the string is "epsilon" /* if (curPos==1) { epsilonRule(prodIndex); } */ //replace x -> Aa with x -> Ax' production[token[tIndex]->production[ruleIndex]][1]=newTokenIndex; production[token[tIndex]->production[ruleIndex]][2]=-1;//end //doing: x' -> b //curPos=0; //prepare to add one new rule //pointing new token to where old rule lies addRuleForToken(newTokenIndex, token[tIndex]->production[index]); /* token[newTokenIndex]->production[token[newTokenIndex]->count++]= token[tIndex]->production[index]; */ shiftRule(token[tIndex]->production[index], false, 1); removeRuleFromToken(tIndex, index); } void Grammar::removeRuleFromToken(int tIndex, int ruleIndex) { token[tIndex]->count--; for (int i=ruleIndex; i<token[tIndex]->count; i++) { token[tIndex]->production[i]=token[tIndex]->production[i+1]; } } void Grammar::commonLeftFactor() { int index=-1, tIndex=0, ruleIndex=0; bool flag; //whenever a newrule is done, restart! while (tIndex<tokenCount) { ruleIndex=0; flag=false; while (ruleIndex<token[tIndex]->count) { index=findCommonFactor(tIndex, ruleIndex); if (index!=-1) { doCommonFactor(tIndex, ruleIndex, index); //restart flag=true; break; } else { ruleIndex++; } } if (flag) { tIndex=0; } else { tIndex++; } } } GrammarToken* Grammar::operator[](int index) { if (index>=0&&index<tokenCount) { return token[index]; } else { return NULL; } } void Grammar::print() { for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { cout<<token[i]->name<<" ==> "; for (int j=0; j<token[i]->count; j++) { printRule(token[i]->production[j]); if (j!=token[i]->count-1) { cout<<" | "; } } cout<<"\n"; } } }

#include "grammar.h" const int MaxStackLength=50; class Parser { private: int stack[MaxStackLength]; bool push(int num); bool pop(int& num); int top; void initialize(); bool pushToken(int tIndex, int theToken); void prepare(); void pushRule(int theRule); public: Parser(); void parseFile(const char* fileName); };

#include <iostream> #include "parser.h" #include "scanner.h" #include "errorNo.h" using namespace std; Scanner scanner; extern Grammar grammar; const char* defaultListFile="c:\\nickListFile.txt"; extern int startSymbolIndex; extern int stackBottomIndex; extern int table[MaxTokenCount][MaxTokenCount]; extern int token2type[MaxTokenCount]; extern int type2token[MaxTokenCount]; extern int emptyIndex; bool Parser::pushToken(int tIndex, int theToken) { if (table[tIndex][theToken]!=-1) { cout<<grammar.token[tIndex]->name<<" => "; grammar.printRule(table[tIndex][theToken]); cout<<endl; pushRule(table[tIndex][theToken]); return true; } return false; } void Parser::pushRule(int theRule) { int len=0; while (grammar.production[theRule][len]!=-1) { len++; } for (int i=len-1; i>=0; i--) { if (!push(grammar.production[theRule][i])) { errorHandle(StackOverFlow); } } } void Parser::parseFile(const char*fileName) { int theToken, theVar; bool canPop=true; scanner.readFromFile(fileName, defaultListFile); prepare(); while (scanner.nextToken()) { if (!canPop) { errorHandle(UnexpectedEmptyStack); } if (scanner.token.type==COMMENTTYPE) { continue; } theToken=type2token[scanner.token.type]; while ((canPop=pop(theVar))==true) { if (grammar.token[theVar]->terminal) { if (theToken==theVar)//match { break; } else { if (theVar==emptyIndex) { continue; } errorHandle(IllegalGrammarToken); } } else { if (!pushToken(theVar, theToken)) { errorHandle(IllegalGrammarToken); } } } } if (pop(theVar)) { if (theVar!=stackBottomIndex) { errorHandle(NotEmptyStack); } /* if (theVar!=stackBottomIndex) { errorHandle(NotEmptyStack); } */ } } Parser::Parser() { initialize(); } void Parser::initialize() { top=0; } bool Parser::pop(int& num) { if (top==0) { return false; } num=stack[--top]; return true; } bool Parser::push(int num) { if (top==MaxStackLength-1) { return false; } stack[top++]=num; return true; } void Parser::prepare() { top=0; pushRule(grammar.token[startSymbolIndex]->production[0]); }

/////////////////////////////////////////////////////////////////////////////////////////// //Program: SLang Scanner //Author: Qingzhe Huang //Date: Jan. 18, 2004 //FileName: scanner.h //Features: // 1. I want to improve efficiency of scanning, so I used table-driven method. // 2. I used enum to represent character of all ASCII---CharType---where "space, tab, // end of line, end of file are all considered to be White Space. // 3. All legal token is represented by enum TokenType. // 4. I defined a huge amount of TokenState which is basically the state of a DFA. As // I don't want to search reserved keyword with linear search or whatever, I have // many states for the reserved words. // 5. I deliberately make the sequence of first 38 TokenState elements exactly same as // all that of TokenType, so that each final state of DFA has a 1-1 correspondence with // type of token. // 6. I defined a struct of Token which may be used in future parser. // 7. I defined an errorNo variable to represent various errors. And a series error string // for displaying information. // 8. When class Scanner is created, it will initialize the big "state-charType" table. // 9. When readFromFile is called, it will first read one char in advance. // 10. When an error is encountered, the caller of Scanner should understand that no further // char is read in. So, stop calling "nextToken()". This is a bit controvercial, and I // plan to change it in next version. //////////////////////////////////////////////////////////////////////////////////////////// /*//////////////////////////////////////////////////////////////////////////// Program: SLang Scanner Author: Qingzhe Huang Date: Jan. 21, 2004 FileName: scanner.h Features: 1. I restructured the struct Token, to make it a union field in order to store int value for number. 2. I restructured the function "nextToken()" in order to give out correct line no. when error occurs. *//////////////////////////////////////////////////////////////////////////////// #ifndef SCANNER_H #define SCANNER_H #include <iostream> using namespace std; extern enum ErrorCode; const int TokenStateCount=138; const int CharTypeCount=72; const int MaxTokenLength=255; const int MaxNumberLength=12; enum CharType { //all small letters 26 SMALLA,SMALLB,SMALLC,SMALLD,SMALLE,SMALLF,SMALLG,SMALLH,SMALLI,SMALLJ,SMALLK,SMALLL, SMALLM,SMALLN,SMALLO,SMALLP,SMALLQ,SMALLR,SMALLS,SMALLT,SMALLU,SMALLV,SMALLW,SMALLX, SMALLY,SMALLZ, //all big letters 26 BIGA,BIGB,BIGC,BIGD,BIGE,BIGF,BIGG,BIGH,BIGI,BIGJ,BIGK,BIGL,BIGM,BIGN,BIGO,BIGP,BIGQ, BIGR,BIGS,BIGT,BIGU,BIGV,BIGW,BIGX,BIGY,BIGZ, //all digit 1 DIGIT, //all symbols 16 QUOTE, OPENPAR, CLOSEPAR, SEMICOLON,PLUS, MINUS, TIMES, SLASH, COLON, EQUAL,SMALLER,GREATER,EXCLAIM,OPENBRACKET, CLOSEBRACKET,COMMA, //space, tab, end of line are regarded as whitespace, 1 WHITESPACE, //UNDERSCORE IS A SPECIAL SYMBOL 1 UNDERSCORE, //all other ASCII is regarded as illegal 1 ILLEGAL }; //TOTAL 38, JUST 1-1 WITH THE FIRST 38 OF TOKENSTATE enum TokenType { //GENERAL TYPE 5 IDTYPE, NUMBERTYPE, CHARCONSTTYPE, COMMENTTYPE, ERRORTYPE, //THE FOLLOWING ARE SYMBOL TYPE 18 OPENPARTYPE, CLOSEPARTYPE, SEMICOLONTYPE, PLUSTYPE, MINUSTYPE, TIMESTYPE, SLASHTYPE, ASSIGNMENTTYPE, SMALLERTYPE, GREATERTYPE, EQUALTYPE, SMALLEREQUALTYPE, GREATEREQUALTYPE, NOTEQUALTYPE, OPENBRACKETTYPE, CLOSEBRACKETTYPE, COMMATYPE, COLONTYPE, //THE FOLLOWING ARE RESERVED TYPE 15 BEGINTYPE, ENDTYPE, PROGRAMTYPE, VARIABLESTYPE,INTEGERTYPE, ARRAYTYPE, CHARTYPE, MODULETYPE, IFTYPE, THENTYPE, ELSETYPE, LOOPTYPE, EXITTYPE, READTYPE, WRITETYPE }; //total 138 states enum TokenState { //THE FINAL STATE 38, in order to easy initialize "finalState", I put them in beginning //5 generals IDEND, NUMBEREND, CONSTCHAREND, COMMENTEND, ERROR, //18 symbols OPENPAREND, CLOSEPAREND, SEMICOLONEND, PLUSEND, MINUSEND, TIMESEND, SLASHEND, ASSIGNMENTEND, SMALLEREND, GREATEREND, EQUALEND, SMALLEREQUALEND, GREATEREQUALEND, NOTEQUALEND, OPENBRACKETEND, CLOSEBRACKETEND, COMMAEND, COLONEND, //15 reserved BEGINEND, ENDEND, PROGRAMEND, VARIABLESEND, INTEGEREND, ARRAYEND, CHAREND, MODULEEND, IFEND, THENEND, ELSEEND, LOOPEND, EXITEND, READEND, WRITEEND, //THE FOLLOWING ARE ALL NON-FINAL STATES //THE very FIRST CHAR 1 READY, //THE FOLLOWING ARE ALL RESERVED STATE //the first char 12 ARRAY1, BEGIN1, CHAR1, E1, I1, LOOP1, MODULE1, PROGRAM1, READ1, THEN1, VARIABLES1, WRITE1, //THE SECOND CHAR 15 ARRAY2, BEGIN2, CHAR2, ELSE2, END2, EXIT2, IF2, INTEGER2, LOOP2, MODULE2, PROGRAM2, READ2, THEN2, VARIABLES2, WRITE2, //THE THIRD CHAR 14 ARRAY3, BEGIN3, CHAR3, ELSE3, END3, EXIT3, INTEGER3, LOOP3, MODULE3, PROGRAM3, READ3, THEN3, VARIABLES3, WRITE3, //THE FOURTH CHAR 13 ARRAY4, BEGIN4, CHAR4, ELSE4, EXIT4, INTEGER4, LOOP4, MODULE4, PROGRAM4, READ4, THEN4, VARIABLES4, WRITE4, //THE FIFTH CHAR 7 ARRAY5, BEGIN5, INTEGER5, MODULE5, PROGRAM5, VARIABLES5, WRITE5, //THE SIXTH CHAR 4 INTEGER6, MODULE6, PROGRAM6, VARIABLES6, //THE SEVENTH CHAR 3 INTEGER7, PROGRAM7, VARIABLES7, //THE EIGHTH CHAR 1 VARIABLES8, //THE NINETH CHAR 1 VARIABLES9, //THESE ARE NON-RESERVED //THESE ARE GENERAL 9 IDBEGIN, IDUNDERSCORE, NUMBERBEGIN, CONSTCHARQUOTEBEGIN, CONSTCHARBEGIN, COMMENTSTARBEGIN, COMMENTBEGIN, COMMENTSTAREND, COMMENTSLASHBEGIN, //the SINGLE symbols 16 QUOTEBEGIN, OPENPARBEGIN, CLOSEPARBEGIN, SEMICOLONBEGIN, PLUSBEGIN, MINUSBEGIN, TIMESBEGIN, SLASHBEGIN, COLONBEGIN, SMALLERBEGIN, GREATERBEGIN, EQUALBEGIN, EXCLAIMBEGIN, OPENBRACKETBEGIN, CLOSEBRACKETBEGIN, COMMABEGIN, //MULTI SYMBOL 4 ASSIGNMENTBEGIN, SMALLEREQUALBEGIN, GREATEREQUALBEGIN, NOTEQUALBEGIN }; //extern ErrorCode errorNo; extern void errorHandle(ErrorCode errorNo); struct Token { TokenType type; union { char name[MaxTokenLength+1]; int number; }; }; class Scanner { private: int tokenCount; unsigned char ch; void printLineNo(); FILE* stream; bool nextChar(); void initialize(); bool resume(); public: Scanner(); static Token token; bool readFromFile(const char* fileName, const char* listFileName); const char* getToken(){return token.name;} bool nextToken(); void report(); }; void initialTokenState(); #endif

/*//////////////////////////////////////////////////////////////////////////// Program: SLang Scanner Author: Qingzhe Huang Date: Jan. 21, 2004 FileName: scanner.cpp Features: 1. As Dr. Optrany said, the number should be stored as int or double whatever. 2. I restructured the function "nextToken()" in order to give out correct line no. when error occurs. *//////////////////////////////////////////////////////////////////////////////// #include <iostream> #include <fstream> #include "scanner.h" #include "errorNo.h" using namespace std; ofstream fList; //this will determine how many errors of maximum the scanner will tolerant const int MaxErrortolerant=10; //as integer usually have max 12 digit roughly int errorCount=0; int lineCount=1; //static memeber Token Scanner::token; const int TokenTypeCount=38; //extern void errorHandle(int errorNo); extern char* errorStr[ErrorCount]; //this is purely for displaying purpose char* tokenTypeStr[TokenTypeCount]= { //GENERAL TYPE 5 "ID", "NUMBER", "CHARACTER CONSTANT", "COMMENT", "ERROR", //THE FOLLOWING ARE SYMBOL TYPE 18 "(", ")", ";", "+", "-", "*", "/", ":=", "<", ">", "=", "<=", ">=", "!=", "[", "]", ",", ":", //THE FOLLOWING ARE RESERVED TYPE 15 "begin", "end", "program", "variables","integer", "array", "char", "module", "if", "then", "else", "loop", "exit", "read", "write" }; CharType charType[256]; TokenState tokenState[TokenStateCount][CharTypeCount]; //when error occurs, no message is immediately output, it is //postponed to next time, because when '\n' is read in, //lineCount is not incremented until token is decided. //Therefore, when a token is ended with '\n', the line no is not updated //until next round. So, we can keep the correct line no. for each token bool Scanner::nextToken() { TokenState state=READY; int digitCount=0; int value=0; int count=0;//to count the length of token char* ptr=token.name; bool isComment=false; do { //map ch to CharType reducing 256 ASCII to 73 CharTypes //the table for "state" and "CharType is 138x73, each entry is a //index for state. state=tokenState[state][charType[ch]]; if (state==NUMBERBEGIN) { digitCount++; if (digitCount>=MaxNumberLength) { errorHandle(ExceedNumberLimit); return resume(); } //to accumulate the value value*=10; value+=ch-'0'; } //because I put all final state in the first 38 positions if (state<38) { //This is a dirty trick! Because I make the "TokenType" 1-1 with //TokenState for the 38 finals. *ptr='\0'; token.type=(TokenType)(state); if (state==ERROR) { errorHandle(IllegalToken); //printLineNo(); return resume(); } tokenCount++; if (state==NUMBEREND) { token.number=value; } //printLineNo(); return true; } if (state==COMMENTBEGIN) { isComment=true; } if (count>=MaxTokenLength) { errorHandle(TokenTooLong); token.type=ERRORTYPE; //printLineNo(); return false; } //cout<<ch; if (!isComment&&state!=READY) { *ptr=ch; ptr++; count++; } //it is only at end to update line no. printLineNo(); }while (nextChar()); state=tokenState[state][charType[ch]]; //at this point, it is either in ready state, or error state if (state==ERROR) { token.type=(TokenType)(state); errorHandle(UnexpectedReachEOF); } //but in all case it means end of file, so return false return false; } bool Scanner::resume() { //the scanner will try to continue if error number is within 10 if (errorCount==MaxErrortolerant) { return false; } return nextChar(); } void Scanner::report() { fList<<"\ntotal number of tokens is "<<tokenCount; fList<<"\ntotal number of errors is "<<errorCount; } void Scanner::printLineNo() { if (ch=='\n') { fList<<++lineCount<<" "; } } Scanner::Scanner() { initialize(); } void Scanner::initialize() { errorCount=0; lineCount=1; tokenCount=0; initialTokenState(); } bool Scanner::readFromFile(const char* fileName, const char* listFileName) { if ((stream=fopen(fileName, "r"))==NULL) { errorHandle(CannotOpenFile); return false; } else { fList.open(listFileName); fList<<lineCount<<" "; //this is to prevent the empty file situation in which //you cannot even read one single char because my scanner need to read //one char ahead if (!nextChar()) { errorHandle(FileEmptyError); return false; } } return true; } bool Scanner::nextChar() { ch=fgetc(stream); fList<<ch; return ch!=255; }

#ifndef ERRORNO_H #define ERRORNO_H extern char* errorStr[]; void errorHandle(int errorNo); const int ErrorCount=10; const int ScannerErrorCount=6; const int ParserErrorCount=4; //errors of scanner = 6 #define IllegalToken 0 #define TokenTooLong 1 #define UnexpectedReachEOF 2 #define FileEmptyError 3 #define CannotOpenFile 4 #define ExceedNumberLimit 5 //error of parser =4 #define UnexpectedEmptyStack 6 #define IllegalGrammarToken 7 #define NotEmptyStack 8 #define StackOverFlow 9 #endif

#include <iostream> #include <fstream> #include "scanner.h" #include "errorNo.h" using namespace std; char* errorStr[ErrorCount]= { //these are scanner errors: "IllegalToken", "TokenTooLong", "UnexpectedReachEOF", "FileEmptyError", "CannotOpenFile", "ExceedNumberLimit", //these are parser errors "UnexpectedEmptyStack", "IllegalGrammarToken", "NotEmptyStack", "StackOverFlow" }; extern int errorCount; extern int lineCount; extern ofstream fList; //this is going to be improved in future as parser need to //call it, too. so, more parameter should be added? //No! the error no. itself specifies the error and it is //error handler to try to find necessary info to display. void errorHandle(int errorNo) { if (errorNo<ScannerErrorCount) { errorCount++; //the illegal token may be for various reason and I only suggest //a possible nearby place to spot the error occurs. fList<<"\nerror of "<<errorStr[errorNo]<<" occurred at line " <<lineCount<<" near token "<<Scanner::token.name<<endl; } else { if (errorNo<ScannerErrorCount+ParserErrorCount) { errorCount++; fList<<"\nerror of "<<errorStr[errorNo]<<" occurred at line " <<lineCount<<" near token "<<Scanner::token.name<<endl; exit(errorNo); } } }

/*//////////////////////////////////////////////////////////////////////////// Program: SLang Scanner Author: Qingzhe Huang Date: Jan. 18, 2004 FileName: initialize.cpp Features: 1. This is purely mechnical job, you know to initialize a huge state table: 138x72 is really a boring, routine job. 2. For EOF, I want "ch" to be able to be an index in "CharType" array, so, it cannot be -1, but 255 for "unsigned char" which is declared in class Scanner. *//////////////////////////////////////////////////////////////////////////////// #include "scanner.h" extern enum CharType; extern enum TokenState; extern CharType charType[256]; extern TokenState tokenState[TokenStateCount][CharTypeCount]; void finalSymbolToken(TokenState state, TokenState endState); void finalReservedToken(TokenState state, TokenState endState); void initialCharType(); void setFinalTokenState(); void initialReserved(TokenState state); void setRange(TokenState state, CharType start, CharType end, TokenState target); void setState(TokenState state, TokenState targetState); void setDefaultState(); void setDefaultState() { //all states are by default error for (int i=0; i<TokenStateCount; i++) { setState((TokenState)i, ERROR); } //the default for all letters are IDBEGIN setRange(READY, SMALLA, BIGZ, IDBEGIN); //THIS IS another dirty trick, since I put all reserved states together //so you can initialize them together. for (i=ARRAY1; i<=VARIABLES9; i++) { initialReserved((TokenState)i); } setFinalTokenState(); } void setFinalTokenState() { //FOR ID finalReservedToken(IDBEGIN, IDEND); //for number finalReservedToken(NUMBERBEGIN, NUMBEREND); //THESE FOR RESERVED WORDS finalReservedToken(ARRAY5, ARRAYEND); finalReservedToken(BEGIN5, BEGINEND); finalReservedToken(CHAR4, CHAREND); finalReservedToken(ELSE4, ELSEEND); finalReservedToken(END3, ENDEND); finalReservedToken(EXIT4, EXITEND); finalReservedToken(IF2, IFEND); finalReservedToken(INTEGER7, INTEGEREND); finalReservedToken(LOOP4, LOOPEND); finalReservedToken(MODULE6, MODULEEND); finalReservedToken(PROGRAM7, PROGRAMEND); finalReservedToken(READ4, READEND); finalReservedToken(THEN4, THENEND); finalReservedToken(VARIABLES9, VARIABLESEND); finalReservedToken(WRITE5, WRITEEND); //THESE FOR SYMBOLS finalSymbolToken(OPENPARBEGIN, OPENPAREND); finalSymbolToken(CLOSEPARBEGIN, CLOSEPAREND); finalSymbolToken(SEMICOLONBEGIN, SEMICOLONEND); finalSymbolToken(PLUSBEGIN, PLUSEND); finalSymbolToken(MINUSBEGIN, MINUSEND); finalSymbolToken(TIMESBEGIN, TIMESEND); finalSymbolToken(SLASHBEGIN, SLASHEND); finalSymbolToken(ASSIGNMENTBEGIN, ASSIGNMENTEND); finalSymbolToken(SMALLERBEGIN, SMALLEREND); finalSymbolToken(GREATERBEGIN, GREATEREND); finalSymbolToken(EQUALBEGIN, EQUALEND); finalSymbolToken(SMALLEREQUALBEGIN, SMALLEREQUALEND); finalSymbolToken(GREATEREQUALBEGIN, GREATEREQUALEND); finalSymbolToken(NOTEQUALBEGIN, NOTEQUALEND); finalSymbolToken(OPENBRACKETBEGIN, OPENBRACKETEND); finalSymbolToken(CLOSEBRACKETBEGIN, CLOSEBRACKETEND); finalSymbolToken(COMMABEGIN, COMMAEND); finalSymbolToken(COLONBEGIN, COLONEND); //COMMENT finalSymbolToken(COMMENTSLASHBEGIN, COMMENTEND); //CONSTCHAR finalSymbolToken(CONSTCHARQUOTEBEGIN, CONSTCHAREND); } void initialTokenState() { //initialize all charType initialCharType(); //default is always error setDefaultState(); //loop tokenState[READY][WHITESPACE]=READY; //number tokenState[READY][DIGIT]=NUMBERBEGIN; tokenState[NUMBERBEGIN][DIGIT]=NUMBERBEGIN;//HOW LONG SHOULD NUMBER BE? //ID //setRange(READY, SMALLA, BIGZ, IDBEGIN); THIS IS IN DEFAULT setRange(IDBEGIN, SMALLA, DIGIT, IDBEGIN); tokenState[IDBEGIN][UNDERSCORE]=IDUNDERSCORE; setRange(IDUNDERSCORE, SMALLA, DIGIT, IDBEGIN); //reserved words //ARRAY1, BEGIN1, CHAR1, E1, I1, LOOP1, MODULE1, PROGRAM1, READ1, THEN1, WRITE1, //VARIABLES1, tokenState[READY][SMALLA]=ARRAY1; tokenState[READY][SMALLB]=BEGIN1; tokenState[READY][SMALLC]=CHAR1; tokenState[READY][SMALLE]=E1; tokenState[READY][SMALLI]=I1; tokenState[READY][SMALLL]=LOOP1; tokenState[READY][SMALLM]=MODULE1; tokenState[READY][SMALLP]=PROGRAM1; tokenState[READY][SMALLR]=READ1; tokenState[READY][SMALLT]=THEN1; tokenState[READY][SMALLV]=VARIABLES1; tokenState[READY][SMALLW]=WRITE1; /* RESERVED WORDS ARRAY2 */ tokenState[ARRAY1][SMALLR]=ARRAY2; //BEGIN2 tokenState[BEGIN1][SMALLE]=BEGIN2; //CHAR2 tokenState[CHAR1][SMALLH]=CHAR2; //ELSE2, tokenState[E1][SMALLL]=ELSE2; //EXIT2 tokenState[E1][SMALLX]=EXIT2; //END2 tokenState[E1][SMALLN]=END2; //IF2 tokenState[I1][SMALLF]=IF2; //INTEGER2 tokenState[I1][SMALLN]=INTEGER2; //LOOP2 tokenState[LOOP1][SMALLO]=LOOP2; //MODULE2 tokenState[MODULE1][SMALLO]=MODULE2; //PROGRAM2 tokenState[PROGRAM1][SMALLR]=PROGRAM2; //READ2 tokenState[READ1][SMALLE]=READ2; //THEN2 tokenState[THEN1][SMALLH]=THEN2; //VARIABLES2 tokenState[VARIABLES1][SMALLA]=VARIABLES2; //WRITE2 tokenState[WRITE1][SMALLR]=WRITE2; /* RESERVED WORDS ARRAY3 */ tokenState[ARRAY2][SMALLR]=ARRAY3; //BEGIN2 tokenState[BEGIN2][SMALLG]=BEGIN3; //CHAR2 tokenState[CHAR2][SMALLA]=CHAR3; //ELSE2, tokenState[ELSE2][SMALLS]=ELSE3; //END2 tokenState[END2][SMALLD]=END3; //EXIT2 tokenState[EXIT2][SMALLI]=EXIT3; //INTEGER2 tokenState[INTEGER2][SMALLT]=INTEGER3; //LOOP2 tokenState[LOOP2][SMALLO]=LOOP3; //MODULE2 tokenState[MODULE2][SMALLD]=MODULE3; //PROGRAM2 tokenState[PROGRAM2][SMALLO]=PROGRAM3; //READ2 tokenState[READ2][SMALLA]=READ3; //THEN2 tokenState[THEN2][SMALLE]=THEN3; //VARIABLES2 tokenState[VARIABLES2][SMALLR]=VARIABLES3; //WRITE2 tokenState[WRITE2][SMALLI]=WRITE3; /* RESERVED WORDS ARRAY3 */ tokenState[ARRAY3][SMALLA]=ARRAY4; //BEGIN2 tokenState[BEGIN3][SMALLI]=BEGIN4; //CHAR2 tokenState[CHAR3][SMALLR]=CHAR4; //ELSE2, tokenState[ELSE3][SMALLE]=ELSE4; //EXIT2 tokenState[EXIT3][SMALLT]=EXIT4; //INTEGER2 tokenState[INTEGER3][SMALLE]=INTEGER4; //LOOP2 tokenState[LOOP3][SMALLP]=LOOP4; //MODULE2 tokenState[MODULE3][SMALLU]=MODULE4; //PROGRAM2 tokenState[PROGRAM3][SMALLG]=PROGRAM4; //READ2 tokenState[READ3][SMALLD]=READ4; //THEN2 tokenState[THEN3][SMALLN]=THEN4; //VARIABLES2 tokenState[VARIABLES3][SMALLI]=VARIABLES4; //WRITE2 tokenState[WRITE3][SMALLT]=WRITE4; /* RESERVED WORDS ARRAY */ tokenState[ARRAY4][SMALLY]=ARRAY5; //BEGIN2 tokenState[BEGIN4][SMALLN]=BEGIN5; //INTEGER2 tokenState[INTEGER4][SMALLG]=INTEGER5; //MODULE2 tokenState[MODULE4][SMALLL]=MODULE5; //PROGRAM2 tokenState[PROGRAM4][SMALLR]=PROGRAM5; //VARIABLES2 tokenState[VARIABLES4][SMALLA]=VARIABLES5; //WRITE2 tokenState[WRITE4][SMALLE]=WRITE5; // RESERVED WORDS*/ //INTEGER2 tokenState[INTEGER5][SMALLE]=INTEGER6; //MODULE2 tokenState[MODULE5][SMALLE]=MODULE6; //PROGRAM2 tokenState[PROGRAM5][SMALLA]=PROGRAM6; //VARIABLES2 tokenState[VARIABLES5][SMALLB]=VARIABLES6; // RESERVED WORDS*/ //INTEGER2 tokenState[INTEGER6][SMALLR]=INTEGER7; //PROGRAM2 tokenState[PROGRAM6][SMALLM]=PROGRAM7; //VARIABLES2 tokenState[VARIABLES6][SMALLL]=VARIABLES7; // RESERVED WORDS*/ //VARIABLES2 tokenState[VARIABLES7][SMALLE]=VARIABLES8; //VARIABLES2 tokenState[VARIABLES8][SMALLS]=VARIABLES9; /* CONSTCHAR, UNDERSCOREBEGIN, ASSIGNMENTBEGIN, SMALLEREQUALBEGIN, GREATEREQUALBEGIN, NOTEQUAL, COMMENTBEGIN, IDUNDERSCORE,*/ //now is the symbols //QUOTEBEGIN, OPENPARBEGIN, CLOSEPARBEGIN, SEMICOLONBEGIN, //PLUSBEGIN, MINUSBEGIN, TIMESBEGIN, SLASHBEGIN, COLONBEGIN, SMALLERBEGIN, GREATERBEGIN, //EQUALBEGIN, EXCLAIMBEGIN, OPENBRACKETBEGIN, CLOSEBRACKETBEGIN, COMMABEGIN, //' tokenState[READY][QUOTE]=QUOTEBEGIN; //( tokenState[READY][OPENPAR]=OPENPARBEGIN; //) tokenState[READY][CLOSEPAR]=CLOSEPARBEGIN; //; tokenState[READY][SEMICOLON]=SEMICOLONBEGIN; //+ tokenState[READY][PLUS]=PLUSBEGIN; //- tokenState[READY][MINUS]=MINUSBEGIN; //* tokenState[READY][TIMES]=TIMESBEGIN; /// tokenState[READY][SLASH]=SLASHBEGIN; //: tokenState[READY][COLON]=COLONBEGIN; //< tokenState[READY][SMALLER]=SMALLERBEGIN; //> tokenState[READY][GREATER]=GREATERBEGIN; //= tokenState[READY][EQUAL]=EQUALBEGIN; //! tokenState[READY][EXCLAIM]=EXCLAIMBEGIN; //[ tokenState[READY][OPENBRACKET]=OPENBRACKETBEGIN; //] tokenState[READY][CLOSEBRACKET]=CLOSEBRACKETBEGIN; //, tokenState[READY][COMMA]=COMMABEGIN; //AFTER QUOTE IT CAN BE ANY CHARACTER, INCLUDING ILLEGAL CHAR setRange(QUOTEBEGIN, SMALLA, ILLEGAL, CONSTCHARBEGIN); //ANY OTHER STATE IS BY DEFAULT ERROR tokenState[CONSTCHARBEGIN][QUOTE]=CONSTCHARQUOTEBEGIN; //FOR /, DEFAULT IS SLASHEND, EXCEPT * WHICH IS COMMENTSTARBEGIN tokenState[SLASHBEGIN][TIMES]= COMMENTSTARBEGIN; //FOR :, DEFAULT IS COLONEND, EXCEPT FOR = WHICH IS ASSIGNMENTBEGIN tokenState[COLONBEGIN][EQUAL]= ASSIGNMENTBEGIN; //FOR <, DEFAULT IS SMALLEREND, EXCEPT FOR= WHICH IS SMALLEREQAULBEGIN tokenState[SMALLERBEGIN][EQUAL]=SMALLEREQUALBEGIN; //FOR >, DEFAULT IS GREATEREND, EXCEPT FOR= WHICH IS GREATEREQAULBEGIN tokenState[GREATERBEGIN][EQUAL]= GREATEREQUALBEGIN; tokenState[EXCLAIMBEGIN][EQUAL]= NOTEQUALBEGIN; //WITHIN COMMENT IT IS A LOOP, EXCEPT FOR * WHICH IS POSSIBLE FOR END OF COMMENT setRange(COMMENTSTARBEGIN, SMALLA, ILLEGAL, COMMENTBEGIN); tokenState[COMMENTSTARBEGIN][TIMES]=COMMENTSTAREND; setRange(COMMENTBEGIN, SMALLA, ILLEGAL, COMMENTBEGIN); tokenState[COMMENTBEGIN][TIMES]=COMMENTSTAREND; //FROM COMMENTSTARBEGIN, ALL IS BACK TO COMMENTBEGIN, EXCEPT / WHICH IS END OF COMMENT setRange(COMMENTSTAREND, SMALLA, ILLEGAL, COMMENTBEGIN); tokenState[COMMENTSTAREND][SLASH]=COMMENTSLASHBEGIN; // } void initialReserved(TokenState state) { setRange(state, SMALLA, DIGIT, IDBEGIN); finalReservedToken(state, IDEND); tokenState[state][UNDERSCORE]=IDUNDERSCORE;//a_ } void finalSymbolToken(TokenState state, TokenState endState) { for (int i=SMALLA; i<=WHITESPACE; i++) { tokenState[state][(CharType)i]=endState; } } void finalReservedToken(TokenState state, TokenState endState) { //all non-letter, non-digit is regarded to be delimeter for (int i=QUOTE; i<=WHITESPACE; i++) { tokenState[state][(CharType)i]=endState; } } //the default charType is ILLEGAL void initialCharType() { int chType; //the default charType is ILLEGAL for (int i=0; i<256; i++) { charType[i]=ILLEGAL; } //chType is SMALLA chType=SMALLA; for (i='a'; i<='z'; i++) { charType[i]=(CharType)(chType); chType++; } //chType is now BIGA chType=BIGA;//I don't want to rely on the trick. for (i='A'; i<='Z'; i++) { charType[i]=(CharType)(chType); chType++; } chType=DIGIT; for (i='0'; i<='9'; i++) { charType[i]=(CharType)(chType); } /* UNDERSCORE, QUOTE, OPENPAR, CLOSEPAR, SEMICOLON,PLUS, MINUS, TIMES, SLASH, COLON, EQUAL,SMALLER,GREATER,EXCLAIM,OPENBRACKET, CLOSEBRACKET,COMMA, SPACE,TAB, ENDLINE, ILLEGAL */ charType['_']=UNDERSCORE; charType['\'']=QUOTE; charType['(']=OPENPAR; charType[')']=CLOSEPAR; charType[';']=SEMICOLON; charType['+']=PLUS; charType['-']=MINUS; charType['*']=TIMES; charType['/']=SLASH; charType[':']=COLON; charType['=']=EQUAL; charType['<']=SMALLER; charType['>']=GREATER; charType['!']=EXCLAIM; charType['[']=OPENBRACKET; charType[']']=CLOSEBRACKET; charType[',']=COMMA; charType[' ']=WHITESPACE; charType['\t']=WHITESPACE; charType[10]=WHITESPACE; charType[13]=WHITESPACE; //pls note, since I changed the type of "ch" to be "unsigned char" //the EOF now is not -1, but 255 charType[255]=WHITESPACE;//IT IS A KIND OF DELIMETER } void setRange(TokenState state, CharType start, CharType end, TokenState target) { for (int i=start; i<=end; i++) { tokenState[state][i]=target; } } void setState(TokenState state, TokenState targetState) { for (int i=0; i<CharTypeCount; i++) { tokenState[state][i]=targetState; } }

#include <iostream> #include "CFGReader.h" #include "Parser.h" using namespace std; int main() { CFGReader R; Parser P; R.readFromFile("c:\\ruleTest.txt"); R.optimize(); //R.print(); cout<<"\n now begin parsing...\n"; P.parseFile("c:\\sourceTest.txt"); return 0; }

program average; /* a sample program. It calculates the average of integers, */ /* the first value gives the number of integers to read*/ variables n: integer; ave, total: integer; module sum(n, res :integer;) /* sums up n integers from input, puts the result into res*/ variables i: integer; val: integer array[100]; begin i:=i; res :=0; loop read val[i]; res:= res+ val[i]; i := i+1; if i>n then exit; else ; end; /* loop */ end;/* module sum*/ begin /* main program*/ read n; sum(n, total); ave:=(((total/n))); write 'a', 'v','e','r','a','g','e','='; write ave; end; /* of program*/

now begin parsing... M => program i ; Dl B Dl => Dv Ml Dv => variables Vl Vl => V Vl0 V => Il : T ; Il => i Il0 Il0 => e T => integer Ad Ad => e Vl0 => i Il0 : T ; Vl0 Il0 => , i Il0 Il0 => e T => integer Ad Ad => e Vl0 => e Ml => e Ml0 Ml0 => module i ( Vl ) Dv B Ml0 Vl => V Vl0 V => Il : T ; Il => i Il0 Il0 => , i Il0 Il0 => e T => integer Ad Ad => e Vl0 => e Dv => variables Vl Vl => V Vl0 V => Il : T ; Il => i Il0 Il0 => e T => integer Ad Ad => e Vl0 => i Il0 : T ; Vl0 Il0 => e T => integer Ad Ad => array [ n ] Vl0 => e B => begin Sl end ; Sl => S Sl0 S => i S0 S0 => Ar := E ; Ar => e E => F M0 F => R M3 R => i Ar Ar => e M3 => e M0 => e Sl0 => Sl Sl => S Sl0 S => i S0 S0 => Ar := E ; Ar => e E => F M0 F => R M3 R => n M3 => e M0 => e Sl0 => Sl Sl => S Sl0 S => loop Sl end ; Sl => S Sl0 S => read Ln ; Ln => i Ar M1 Ar => [ E ] E => F M0 F => R M3 R => i Ar Ar => e M3 => e M0 => e M1 => e Sl0 => Sl Sl => S Sl0 S => i S0 S0 => Ar := E ; Ar => e E => F M0 F => R M3 R => i Ar Ar => e M3 => e M0 => + F M0 F => R M3 R => i Ar Ar => [ E ] E => F M0 F => R M3 R => i Ar Ar => e M3 => e M0 => e M3 => e M0 => e Sl0 => Sl Sl => S Sl0 S => i S0 S0 => Ar := E ; Ar => e E => F M0 F => R M3 R => i Ar Ar => e M3 => e M0 => + F M0 F => R M3 R => n M3 => e M0 => e Sl0 => Sl Sl => S Sl0 S => if C then S else S C => F M0 Or E F => R M3 R => i Ar Ar => e M3 => e M0 => e Or => > E => F M0 F => R M3 R => i Ar Ar => e M3 => e M0 => e S => exit ; S => e ; Sl0 => e Sl0 => e Ml0 => e B => begin Sl end ; Sl => S Sl0 S => read Ln ; Ln => i Ar M1 Ar => e M1 => e Sl0 => Sl Sl => S Sl0 S => i S0 S0 => ( Lp ) ; Lp => Ln Ln => i Ar M1 Ar => e M1 => , L M1 L => i Ar Ar => e M1 => e Sl0 => Sl Sl => S Sl0 S => i S0 S0 => Ar := E ; Ar => e E => F M0 F => R M3 R => ( E ) E => F M0 F => R M3 R => ( E ) E => F M0 F => R M3 R => ( E ) E => F M0 F => R M3 R => i Ar Ar => e M3 => / R M3 R => i Ar Ar => e M3 => e M0 => e M3 => e M0 => e M3 => e M0 => e M3 => e M0 => e Sl0 => Sl Sl => S Sl0 S => write Lo ; Lo => Lr M2 Lr => c M2 => , Lr M2 Lr => c M2 => , Lr M2 Lr => c M2 => , Lr M2 Lr => c M2 => , Lr M2 Lr => c M2 => , Lr M2 Lr => c M2 => , Lr M2 Lr => c M2 => , Lr M2 Lr => c M2 => e Sl0 => Sl Sl => S Sl0 S => write Lo ; Lo => Lr M2 Lr => i Ar Ar => e M2 => e Sl0 => e Press any key to continue