CFG Reader(NT-Table)
A. Seventh Edition
This is seventh edition of my CFG Reader which stands for Context-Free-Grammar Reader. In this edition,
I finished the establishing NT-table and mapping of token-type from "scanner" with terminals in my grammar.
M -> program i ; Dl B
Dl -> Dv Ml
Ml -> Ml Mo | e
Mo -> module i ( Vl ) Dv B
Dv -> variables Vl | e
Vl -> Vl V | V
V -> Il : T ;
T -> integer Ad ; | char Ad
Il -> i | Il , i
L -> i Ar
Ad -> e | array [ n ]
B -> begin Sl end ;
Sl -> S | S Sl
S -> L := E ; | if C then S else S | loop Sl end ; | exit ; | i ( Lp ) ; | B |
read Ln ; | write Lo ; | e ;
Lp -> e | Ln
Ln -> L { , L }
Lo -> Lr { , Lr }
Lr -> i Ar | n | c
Ar -> e | [ E ]
E -> F { Oa F }
Oa -> + | -
F -> R { Om R }
Om -> * | /
R -> L | n | ( E ) | c
C -> E Or E
Or -> = | < | > | <= | >= | !=
I cannot remove Tokens! I cannot discover INDIRECT common-left-factors. I cannot remove redundant variables.
E.Further improvement
F.File listing
1. CFGReader.h
2. CFGReader.cpp
3. Grammar.h
4. Grammar.cpp
5. main.cpp (main)
file name: CFGReader.h
#include "Grammar.h" class CFGReader { private: char buffer[BufferLength]; FILE* stream; void newRule(const char* str); void readRule(); void addRule(const char* str); int addToken(const char* str, bool isTerminal=true); void printRule(int index); public: void readFromFile(const char* fileName); void optimize(); void print(); };
file name: CFGReader.cpp
#include <iostream> #include "CFGReader.h" using namespace std; Grammar grammar; const char* deliStr=" |\n"; //this would only remove the immediate left recursion //and should I expand it to remove all kinds of left-recursion? void CFGReader::readFromFile(const char* fileName) { if ((stream=fopen(fileName, "r"))==NULL) { cout<<"cannot open file "<<fileName<<endl; } else { readRule(); } } void CFGReader::readRule() { char* ptr=buffer; char* hold; int count=0; while (!feof(stream)) { count=0; fgets(buffer, BufferLength-1, stream); ptr=buffer; while ((ptr=strtok(ptr, " \n"))!=NULL) { if (strcmp(ptr, "|")!=0) { //test the first two token to see if old rule continues if (count==0) { hold=ptr; } if (count==1) { if (strcmp("->", ptr)==0) { /* curIndex=addToken(hold, false);//the index of cur token grammar[curIndex]->terminal=false; newRule(); */ newRule(hold); } else { addRule(hold); addRule(ptr); } } if (count>=2)//always add { addRule(ptr); } count++; } else { //this is an alternative production rule newRule(NULL); } ptr=NULL; } } } void CFGReader::newRule(const char* str) { grammar.newRule(str); } void CFGReader::optimize() { grammar.optimize(); } void CFGReader::addRule(const char* str) { grammar.addRule(str); } int CFGReader::addToken(const char* str, bool isTerminal) { return grammar.addToken(str, isTerminal); } void CFGReader::printRule(int index) { grammar.printRule(index); } void CFGReader::print() { grammar.print(); grammar.printToken(true); grammar.buildTable(); grammar.printTable(); //grammar.printAllRules(); }
file name: Grammar.h
const int BufferLength=256; const int MaxTokenCount=80; const int MaxRHSCount=80; const int MaxRuleCount=200; const int MaxTokenLength=10; const int MaxProduction=10; struct Token { //bool isMeta; bool terminal; bool isNull; char* name; int production[MaxProduction];//pointer to the production it gives int count; int firstCount; int followCount; int follow[MaxTokenCount]; int first[MaxTokenCount]; }; class Grammar { private: int table[MaxTokenCount][MaxTokenCount]; int variables[MaxTokenCount]; int terminals[MaxTokenCount]; int tCount; int nCount; Token* token[MaxTokenCount]; int tokenCount; int curIndex;//the index of current token int curPos;//the position at production rule //MaxRuleCount>=MaxVariableCount!!! int production[MaxRuleCount][MaxRHSCount]; int prodIndex;//pointing to current production, initialized to -1 int checkRecursion(int tIndex, int curToken); void grammarError(int theVar, int theToken, int rule1, int rule2); void addFirstIntoTable(int theVariable, int theFirst, int theRule); void addFollowIntoTable(int theVariable, int theRule); void addBeginEnd(); bool addIntoFirst(int target, int source); bool addFirst(int target, int source); void shiftRule(int ruleIndex, bool Left2Right, int offset); void addAtEnd(int ruleIndex, int toAdd); void addRuleForToken(int tIndex, int ruleIndex); int copyRule(int source, int target);//return the position of -1 void replaceRule(int curToken, int ruleIndex); int findMetaSymbol(int& begin, int& end); void initialize(); void doReplaceMetaSymbol(int ruleIndex, int begin, int end); void removeRuleFromToken(int tIndex, int ruleIndex); void checkEpsilon(int ruleIndex); void removeImmediate(int tIndex, int ruleIndex); void doAddRule(int tIndex); void commonLeftFactor(); int findCommonFactor(int tokenIndex, int ruleIndex); void removeLeftRecursion(); void doCommonFactor(int tokenIndex, int ruleIndex, int index); int forgeToken(int index);//create a new variable void epsilonRule(int ruleIndex); bool isRedundant(int tIndex); void replaceMetaSymbol(); void calculateFirst(); void calculateFollow(); void calculateNull(); bool Null(int tIndex); bool addFollow(int target, int source); bool addIntoFollow(int target, int source); bool addFollowIntoFollow(int target, int source); void removeRedundant(); void removeToken(int tIndex); // void calculateProperty(); public: Grammar(); void buildTable(); void optimize(); void printRule(int index); void print(); void printTable(); void printAllRules(); void printToken(bool onlyVar=false); int variableCount(); int terminalCount(); void addRule(const char* str); void newRule(const char* str);//this is an internal method, not suitable for outside Token* operator[](int index); int addToken(const char* str, bool isTerminal=true); Token* createToken(const char* theName, bool isTerminal); int tokenIndex(const char* str); };
file name: Grammar.cpp
#include <iostream> #include "Grammar.h" using namespace std; const int TokenTypeCount=38; const char* emptyStr="e"; const char* optionBegin="{"; const char* optionEnd="}"; const char* startStr="START"; const char* endStr="$"; int startSymbolIndex=-1; int stackBottomIndex=-1; int beginIndex=-1; int endIndex=-1; int emptyIndex=-1; char* terminalStr[TokenTypeCount]= { //GENERAL TYPE 5 "i", "n", "c", "COMMENT", "ERROR", //THE FOLLOWING ARE SYMBOL TYPE 18 "(", ")", ";", "+", "-", "*", "/", ":=", "<", ">", "=", "<=", ">=", "!=", "[", "]", ",", ":", //THE FOLLOWING ARE RESERVED TYPE 15 "begin", "end", "program", "variables","integer", "array", "char", "module", "if", "then", "else", "loop", "exit", "read", "write" }; //these are "hash-table-like" tables int token2type[MaxTokenCount]; int type2token[MaxTokenCount]; int matchTokens(const char* str) { for (int i=0; i<TokenTypeCount; i++) { if (strcmp(terminalStr[i], str)==0) { return i; } } return -1; } void Grammar::printTable() { /* cout<<" "; for (int i=0; i<tokenCount; i++) { cout<<"| "<<i; } cout<<endl; */ for (int r=0; r<tokenCount; r++) { if (token[r]->terminal) { continue; } cout<<token[r]->name<<":"; for (int c=0; c<tokenCount; c++) { if (table[r][c]!=-1) { cout<<token[c]->name<<"="<<table[r][c]<<","; } } cout<<endl; } } void Grammar::buildTable() { int typeIndex; for (int r=0; r<tokenCount; r++) { if (token[r]->terminal) { typeIndex=matchTokens(token[r]->name); if (typeIndex!=-1) { type2token[typeIndex]=r; token2type[r]=typeIndex; } } //initialize for (int c=0; c<MaxTokenCount; c++) { table[r][c]=-1; } } for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { continue; } for (int j=0; j<token[i]->count; j++) { int k=0, theRule=token[i]->production[j]; int theToken=production[theRule][k]; while (theToken!=-1) { addFirstIntoTable(i, theToken, theRule); if (!token[theToken]->isNull) { break; } k++; theToken=production[theRule][k]; } if (theToken==-1) { addFollowIntoTable(i, theRule); } } } } /* nCount=tCount=0; for (int r=0; r<MaxTokenCount; r++) { if (r<tokenCount) { if (token[r]->terminal) { typeIndex=matchTokens(token[r]->name); if (typeIndex!=-1) { type2token[typeIndex]=r; token2type[r]=typeIndex; } //I am making a invertable table!!! tArray[r]=tCount++; } else { //it is an invertable table!!! nArray[r]=nCount++; } } for (int c=0; c<MaxTokenCount; c++) { table[r][c]=-1; } } for (int i=0; i<tCount; i++) { for (int j=0; j<token[i]->count; j++) { int k=0, theRule=token[i]->production[j]; int theToken=production[theRule][k]; while (theToken!=-1) { addFirstIntoTable(theToken, j); if (!token[theToken]->isNull) { break; } k++; theToken=production[theRule][k]; } if (theToken==-1) { addFollowIntoTable(j); } } } } */ void Grammar::grammarError(int theVar, int theToken, int rule1, int rule2) { cout<<"error! the conflict of grammar at "; cout<<token[theVar]->name<<" for token of " <<token[theToken]->name <<" between rules of \n"; printRule(rule1); cout<<" and "; printRule(rule2); cout<<endl; } void Grammar::addFollowIntoTable(int theVariable, int theRule) { int temp; for (int i=0; i<token[theVariable]->followCount; i++) { temp=token[theVariable]->follow[i]; if (table[theVariable][temp]!=theRule) { if (table[theVariable][temp]!=-1) { grammarError(theVariable, temp, theRule, table[theVariable][temp]); } table[theVariable][temp]=theRule; } } } void Grammar::addFirstIntoTable(int theVariable, int theFirst, int theRule) { for (int i=0; i<token[theFirst]->firstCount; i++) { if (table[theVariable][token[theFirst]->first[i]]!=theRule) { if (table[theVariable][token[theFirst]->first[i]]!=-1) { grammarError(theVariable, token[theFirst]->first[i], theRule, table[theVariable][token[theFirst]->first[i]]); } table[theVariable][token[theFirst]->first[i]]=theRule; } } } int Grammar::variableCount() { int result=0; for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { result++; } } return result; } int Grammar::terminalCount() { int result=0; for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { result++; } } return result; } //leave the rule unremoved!!! as I have no way to do it!!! void Grammar::removeToken(int tIndex) { delete[]token[tIndex]->name; delete token[tIndex]; tokenCount--; for (int i=tIndex; i<tokenCount; i++) { token[i]=token[i+1]; } } bool Grammar::isRedundant(int tIndex) { int k, theRule, theToken; for (int i=0; i<tokenCount; i++) { for (int j=0; j<token[i]->count; j++) { k=0; theRule=token[i]->production[j]; theToken=production[theRule][k]; while (theToken!=-1) { if (theToken==tIndex) { return false; } k++; theToken=production[theRule][k]; } } } return true; } //what is redundant? except the start variable //the non-terminal never appears in other rules! void Grammar::removeRedundant() { int tIndex=1; bool findNew=false; while (tIndex<tokenCount) { findNew=false; if (!token[tIndex]->terminal) { if (isRedundant(tIndex)) { removeToken(tIndex); findNew=true; } } if (findNew) { tIndex=1; } else { tIndex++; } } } void Grammar::printAllRules() { /* cout<<"\nnow print all rules\n"; for (int i=0; i<=prodIndex; i++) { cout<<"rule index "<<i<<": "; printRule(i); cout<<"\n"; } */ int sum=0; for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { sum+=token[i]->count; } } cout<<" total rules is:"<<prodIndex+1; cout<<"\n and the sum is "<<sum<<endl; } void Grammar::addBeginEnd() { int begin=addToken(startStr, false); int end=addToken(endStr, true); prodIndex++; production[prodIndex][0]=0; production[prodIndex][1]=end; production[prodIndex][2]=-1; addRuleForToken(begin, prodIndex); startSymbolIndex=begin; stackBottomIndex=end; } /* void Grammar::calculateProperty() { calculateNull(); calculateFirst(); calculateFollow(); } */ void Grammar::printToken(bool onlyVar) { for (int i=0; i<tokenCount; i++) { if (onlyVar) { if (token[i]->terminal) { continue; } } cout<<"name: "<<token[i]->name<<endl; cout<<" isNull: "<<(token[i]->isNull?"true":"false")<<endl; cout<<" isTerminal: "<<(token[i]->terminal?"true":"false")<<endl; cout<<" first: "; for (int j=0; j<token[i]->firstCount; j++) { if (j!=0) { cout<<","; } cout<<"["<<j<<"]="<<token[token[i]->first[j]]->name; } cout<<"\n follow: "; for (j=0; j<token[i]->followCount; j++) { if (j!=0) { cout<<","; } cout<<"["<<j<<"]="<<token[token[i]->follow[j]]->name; } cout<<endl; } } //must use while loop to discover new set!!!!! void Grammar::calculateFirst() { int i; bool addNew; do { i=0; addNew=false; while (i<tokenCount) { //the terminal contains itself if (token[i]->terminal) { //addFirst don't judge if it is nullable!! addFirst(i, i); //token[i]->first[token[i]->firstCount++]=i; } else { //for all its rules for (int j=0; j<token[i]->count; j++) { int theToken, k=0; theToken=production[token[i]->production[j]][k]; //for each token in each rule do { //add non epsilon set if (addIntoFirst(i, theToken)) { addNew=true; } //if it is not null, means it is end if (!token[theToken]->isNull) { break; } //if it is nullable, continue k++; theToken=production[token[i]->production[j]][k]; }while (theToken!=-1); //it means all token in this rule is nullable, so if (theToken==-1) { //it is nullable //addEpsilonIntoFirst(i); addFirst(i, emptyIndex); } } } i++; } }while (addNew); } bool Grammar::addFollowIntoFollow(int target, int source) { bool addNew=false; for (int i=0; i<token[source]->followCount; i++) { if (addFollow(target, token[source]->follow[i])) { addNew=true; } } return addNew; } bool Grammar::addIntoFollow(int target, int source) { bool addNew=false; if (source==-1) { return false; } for (int i=0; i<token[source]->firstCount; i++) { //add non-epsilon if (!token[token[source]->first[i]]->isNull) { if (addFollow(target, token[source]->first[i])) { addNew=true; } } } return addNew; } void Grammar::calculateFollow() { int i; bool addNew, started; //token[startSymbolIndex]->follow[0]=stackBottomIndex; //token[startSymbolIndex]->followCount=1; do { i=0; addNew=false; while (i<tokenCount) { //the terminal contains itself if (!token[i]->terminal) { for (int tIndex=0; tIndex<tokenCount; tIndex++) { //for all its rules if (token[tIndex]->terminal) { continue; } //for each its rule for (int j=0; j<token[tIndex]->count; j++) { int theToken, k=0, theRule=token[tIndex]->production[j]; started=false; theToken=production[theRule][k]; //for each token in each rule do { //the token appears here if (theToken==i) { started=true; //add non epsilon set, including -1 situation!!! if (addIntoFollow(i, production[theRule][k+1])) { addNew=true; } } //if it is not null, means it is end if (started&&!token[theToken]->isNull) { break; } //if it is nullable, continue k++; theToken=production[theRule][k]; }while (theToken!=-1); //it means all token in this rule is nullable, so if (started&&theToken==-1) { //it is nullable //add current variable Follow(j) into Follow(i); if (addFollowIntoFollow(i, tIndex)) { addNew=true; } } } } } i++; } }while (addNew); } void Grammar::addRuleForToken(int tIndex, int ruleIndex) { token[tIndex]->production[token[tIndex]->count++]=ruleIndex; } void Grammar::shiftRule(int ruleIndex, bool left2Right, int offset) { int end=0; while (production[ruleIndex][end]!=-1) { end++; } if (left2Right) { for (int i=end; i>=0; i--) { production[ruleIndex][i+offset]=production[ruleIndex][i]; } } else { for (int i=0; i<=end-offset; i++) { production[ruleIndex][i]=production[ruleIndex][i+offset]; } checkEpsilon(ruleIndex); } } void Grammar::checkEpsilon(int ruleIndex) { if (production[ruleIndex][0]==-1) { epsilonRule(ruleIndex); } } bool Grammar::addFollow(int target, int source) { //check if it is already there for (int i=0; i<token[target]->followCount; i++) { if (token[target]->follow[i]==source) { return false; } } //add at end token[target]->follow[token[target]->followCount++]=source; return true; } bool Grammar::addFirst(int target, int source) { //check if it is already there for (int i=0; i<token[target]->firstCount; i++) { if (token[target]->first[i]==source) { return false; } } //add at end token[target]->first[token[target]->firstCount++]=source; return true; } //add non epsilon into it. bool Grammar::addIntoFirst(int target, int source) { bool addNew=false; if (token[source]->terminal) { return addFirst(target, source); } for (int i=0; i<token[source]->firstCount; i++) { //add non epsilon if (!token[token[source]->first[i]]->isNull) { if (addFirst(target, token[source]->first[i])) { addNew=true; } } } return addNew; } bool Grammar::Null(int tIndex) { if (token[tIndex]->terminal) { return token[tIndex]->isNull; } for (int i=0; i<token[tIndex]->count; i++) { int j=0, theToken; theToken=production[token[tIndex]->production[i]][j]; do { if (theToken==tIndex||!Null(theToken)) { break; } j++; theToken=production[token[tIndex]->production[i]][j]; }while (theToken!=-1); if (theToken==-1) { return true; } } return false; } void Grammar::calculateNull() { for (int i=0; i<tokenCount; i++) { token[i]->isNull=Null(i); } } int Grammar::findMetaSymbol(int& begin, int& end) { int theRule, theToken, k; begin=end=-1; for (int i=0; i<tokenCount; i++) { for (int j=0; j<token[i]->count; j++) { k=0; theRule=token[i]->production[j]; theToken=production[theRule][k]; while (theToken!=-1) { if (theToken==beginIndex) { begin=k; } if (theToken==endIndex) { end=k; return theRule; } k++; theToken=production[theRule][k]; } } } return -1; } void Grammar::doReplaceMetaSymbol(int ruleIndex, int begin, int end) { int newTokenIndex=forgeToken(0), i=0; addRuleForToken(newTokenIndex, ++prodIndex); //token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex; //token[newTokenIndex]->terminal=false; copyRule(ruleIndex, prodIndex); //shrink while (production[ruleIndex][i+end+1]!=-1) { production[ruleIndex][begin+i]=production[ruleIndex][i+end+1]; i++; } production[ruleIndex][begin+i]=-1; addAtEnd(ruleIndex, newTokenIndex); /* production[ruleIndex][begin]=newTokenIndex; production[ruleIndex][begin+1]=-1; */ shiftRule(prodIndex, false, begin+1); production[prodIndex][end-begin-1]=newTokenIndex; production[prodIndex][end-begin-1]=-1; epsilonRule(++prodIndex); addRuleForToken(newTokenIndex, prodIndex); } void Grammar::replaceMetaSymbol() { int begin, end, ruleIndex; while ((ruleIndex=findMetaSymbol(begin, end))!=-1) { doReplaceMetaSymbol(ruleIndex, begin, end); } /* for (int i=0; i<tokenCount; i++) { //if (token[i]->isMeta) if (i==beginIndex||i==endIndex) { removeToken(i); } } */ } void Grammar::addAtEnd(int ruleIndex, int toAdd) { int end=0; while (production[ruleIndex][end]!=-1) { end++; } production[ruleIndex][end++]=toAdd; production[ruleIndex][end]=-1; } //left-recursion: A -> A a | b | c //change to: A -> b A' | c A' // A' -> a A' | empty void Grammar::removeImmediate(int tIndex, int ruleIndex) { int newIndex=forgeToken(tIndex); int holdRuleIndex=token[tIndex]->production[ruleIndex]; //sequence matters! //change to: A -> b A' for (int i=0; i<token[tIndex]->count; i++) { if (i!=ruleIndex) { addAtEnd(token[tIndex]->production[i], newIndex); } } //shift removeRuleFromToken(tIndex, ruleIndex); addRuleForToken(newIndex, holdRuleIndex); //token[newIndex]->production[token[newIndex]->count++]=holdRuleIndex; shiftRule(holdRuleIndex, false, 1); addAtEnd(holdRuleIndex, newIndex); //add epsilon rule for new variable epsilonRule(++prodIndex); addRuleForToken(newIndex, prodIndex); } int Grammar::forgeToken(int index) { char str[MaxTokenLength+2], ch; int len=strlen(token[index]->name); int temp=0, i=0; strcpy(str, token[index]->name); ch=str[len-1];//get last char of name if (ch>='0'&&ch<'9') { str[len-1]=ch+i+1; } else { str[len]='0'+i; str[len+1]='\0'; } //I won't bother to check repetitation of name while (tokenIndex(str)!=-1) { i++; if (ch>='0'&&ch<'9') { str[len-1]=ch+i+1; } else { str[len]='0'+i; str[len+1]='\0'; } } return addToken(str, false);//it is non-terminal for sure } int Grammar::copyRule(int source, int target) { int i=0; while (production[source][i]!=-1) { production[target][i]=production[source][i]; i++; } production[target][i]=-1; return i; } void Grammar::doAddRule(int tIndex) { token[tIndex]->production[token[tIndex]->count++]=++prodIndex; } void Grammar::addRule(const char* str) { int index; index=addToken(str); production[prodIndex][curPos++]=index; production[prodIndex][curPos]=-1;//set end } //if the token is already there, it return the index //otherwise, it add new node in token array int Grammar::addToken(const char* str, bool isTerminal) { int index; index=tokenIndex(str); if (index==-1) { index=tokenCount; } else { return index; } token[index]=createToken(str, isTerminal); if (strcmp(str, optionBegin)==0) { beginIndex=index; } if (strcmp(str, optionEnd)==0) { endIndex=index; } tokenCount++; if (strcmp(str, emptyStr)==0) { token[index]->isNull=true; emptyIndex=index; } return index; } void Grammar::newRule(const char* str) { if (str!=NULL) { curIndex=addToken(str, false); } //add one pointer token[curIndex]->production[token[curIndex]->count++]=++prodIndex; token[curIndex]->terminal=false; curPos=0;//reset to restart; } Token* Grammar::createToken(const char* theName, bool isTerminal) { Token* ptr=new Token; ptr->name=new char[strlen(theName)+1]; strcpy(ptr->name, theName); ptr->terminal=isTerminal; ptr->count=ptr->firstCount=ptr->followCount=0; ptr->isNull=false; return ptr; } int Grammar::tokenIndex(const char* str) { for (int i=0; i<tokenCount; i++) { if (strcmp(str, token[i]->name)==0) { return i; } } return -1; } int Grammar::checkRecursion(int tIndex, int curToken) { for (int i=0; i<token[curToken]->count; i++) { //token[tIndex]->production[i]=ruleIndex //production[ruleIndex][0] is the first left-recursion one if (production[token[curToken]->production[i]][0]<=curToken) { return i; } } return -1; } void Grammar::printRule(int index) { int nodeIndex=0; cout<<" ~"<<index<<"~ "; while (production[index][nodeIndex]!=-1) { cout<<token[production[index][nodeIndex]]->name<<" "; nodeIndex++; } } void Grammar::initialize() { tokenCount=curIndex=curPos=0; prodIndex=-1;//in order to ++ blindly } Grammar::Grammar() { initialize(); } void Grammar::removeLeftRecursion() { int tIndex=0, curToken=0; bool newChange=false; while (tIndex<tokenCount) { if (!token[tIndex]->terminal) { for (int i=0; i<token[tIndex]->count; i++) { curToken=production[token[tIndex]->production[i]][0]; if (curToken<=tIndex&&!token[curToken]->terminal) { if (curToken!=tIndex) { replaceRule(tIndex, i); } else { removeImmediate(tIndex, i); } newChange=true; } } } //whenever there is some new findings, restart if (!newChange) { tIndex++; } else { tIndex=0; newChange=false; } } } void Grammar::replaceRule(int tIndex, int ruleIndex) { int pos, j, targetEnd, sourceEnd, curRule; curRule=token[tIndex]->production[ruleIndex]; int curToken=production[curRule][0]; for (int i=token[curToken]->count-1; i>=0; i--) { if (i>0) { doAddRule(tIndex); pos=copyRule(token[curToken]->production[i], prodIndex); j=0; while (production[curRule][j+1]!=-1) { production[prodIndex][pos+j]=production[curRule][j+1]; j++; } production[prodIndex][pos+j]=-1; //addRuleForToken(curToken, prodIndex); } else { targetEnd=sourceEnd=0; //curRule=token[tIndex]->production[ruleIndex]; while (true) { if (production[token[curToken]->production[0]][sourceEnd]==-1&& production[curRule][targetEnd]==-1) { break; } if (production[token[curToken]->production[0]][sourceEnd]!=-1) { sourceEnd++; } if (production[curRule][targetEnd]!=-1) { targetEnd++; } } j=targetEnd+sourceEnd-1; while (j>=0) { if (j>=sourceEnd) { production[curRule][j]=production[curRule][j-sourceEnd+1]; } else { production[curRule][j]=production[token[curToken]->production[0]][j]; } j--; } } } } //optimize sequence is first common-factor then remove left recursion //therefore I don't have to check if for same variable if there will be //more than one left-recursion void Grammar::optimize() { replaceMetaSymbol(); removeLeftRecursion(); commonLeftFactor(); //removeRedundant(); addBeginEnd(); calculateNull(); calculateFirst(); calculateFollow(); } int Grammar::findCommonFactor(int tIndex, int ruleIndex) { for (int i=ruleIndex+1; i<token[tIndex]->count; i++) { //if the two rule has the same first token if (production[token[tIndex]->production[ruleIndex]][0]== production[token[tIndex]->production[i]][0]) { /* //calculate if there is epsilon if (emptyIndex==-1) { emptyIndex=tokenIndex(emptyStr); } //if it is not epsilon if (production[token[tIndex]->production[i]][0]!=emptyIndex) { return i; } */ return i; } } return -1; } void Grammar::epsilonRule(int ruleIndex) { production[ruleIndex][0]=addToken(emptyStr); production[ruleIndex][1]=-1; } //sample: x -> Aa // x -> Ab //changed to: x -> Ax' //this is to change the old rule // x' -> b //this is to change the old rule // x' -> a //this is the new-added rule void Grammar::doCommonFactor(int tIndex, int ruleIndex, int index) { int newTokenIndex=forgeToken(tIndex);//create a new variable //move the second and after part to the new rule of new variable //doing: x' -> a //curPos=0; //prepare to add one new rule addRuleForToken(newTokenIndex, ++prodIndex); //token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex; token[newTokenIndex]->terminal=false; copyRule(token[tIndex]->production[ruleIndex], prodIndex); shiftRule(prodIndex, false, 1); /* do { //do copying production[prodIndex][curPos]= production[token[tIndex]->production[ruleIndex]][curPos+1]; curPos++; //even the -1 at end is copied }while (production[token[tIndex]->production[ruleIndex]][curPos]!=-1); */ //in order to show an empty string, in case the string is "epsilon" /* if (curPos==1) { epsilonRule(prodIndex); } */ //replace x -> Aa with x -> Ax' production[token[tIndex]->production[ruleIndex]][1]=newTokenIndex; production[token[tIndex]->production[ruleIndex]][2]=-1;//end //doing: x' -> b //curPos=0; //prepare to add one new rule //pointing new token to where old rule lies addRuleForToken(newTokenIndex, token[tIndex]->production[index]); /* token[newTokenIndex]->production[token[newTokenIndex]->count++]= token[tIndex]->production[index]; */ shiftRule(token[tIndex]->production[index], false, 1); removeRuleFromToken(tIndex, index); } void Grammar::removeRuleFromToken(int tIndex, int ruleIndex) { token[tIndex]->count--; for (int i=ruleIndex; i<token[tIndex]->count; i++) { token[tIndex]->production[i]=token[tIndex]->production[i+1]; } } void Grammar::commonLeftFactor() { int index=-1, tIndex=0, ruleIndex=0; bool flag; //whenever a newrule is done, restart! while (tIndex<tokenCount) { ruleIndex=0; flag=false; while (ruleIndex<token[tIndex]->count) { index=findCommonFactor(tIndex, ruleIndex); if (index!=-1) { doCommonFactor(tIndex, ruleIndex, index); //restart flag=true; break; } else { ruleIndex++; } } if (flag) { tIndex=0; } else { tIndex++; } } } Token* Grammar::operator[](int index) { if (index>=0&&index<tokenCount) { return token[index]; } else { return NULL; } } void Grammar::print() { for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { cout<<token[i]->name<<" ==> "; for (int j=0; j<token[i]->count; j++) { printRule(token[i]->production[j]); if (j!=token[i]->count-1) { cout<<" | "; } } cout<<"\n"; } } }
file name: main.cpp (main)
#include <iostream> #include "CFGReader.h" using namespace std; int main() { CFGReader R; R.readFromFile("c:\\ruleTest.txt"); R.optimize(); R.print(); return 0; }
The input is something like following:
Please note that all tokens must be separated by space, including "|" and "->" and "{", "}" is meta symbol
equivalent to Kaleen star.
M -> program i ; Dl B Dl -> Dv Ml Ml -> Ml Mo | e Mo -> module i ( Vl ) Dv B Dv -> variables Vl | e Vl -> Vl V | V V -> Il : T ; T -> integer Ad ; | char Ad Il -> i | Il , i L -> i Ar Ad -> e | array [ n ] B -> begin Sl end ; Sl -> S | S Sl S -> L := E ; | if C then S else S | loop Sl end ; | exit ; | i ( Lp ) ; | B | read Ln ; | write Lo ; | e ; Lp -> e | Ln Ln -> L { , L } Lo -> Lr { , Lr } Lr -> i Ar | n | c Ar -> e | [ E ] E -> F { Oa F } Oa -> + | - F -> R { Om R } Om -> * | / R -> L | n | ( E ) | c C -> E Or E Or -> = | < | > | <= | >= | !=
Here is the result:
M ==> ~0~ program i ; Dl B Dl ==> ~1~ Dv Ml B ==> ~17~ begin Sl end ; Dv ==> ~5~ variables Vl | ~6~ e Ml ==> ~3~ e Ml0 Mo ==> ~4~ module i ( Vl ) Dv B Vl ==> ~8~ V Vl0 V ==> ~9~ Il : T ; Il ==> ~12~ i Il0 T ==> ~10~ integer Ad ; | ~11~ char Ad Ad ==> ~15~ e | ~16~ array [ n ] L ==> ~14~ i Ar Ar ==> ~36~ e | ~37~ [ E ] Sl ==> ~18~ S Sl0 S ==> ~20~ i S0 | ~21~ if C then S else S | ~22~ loop Sl end ; | ~23~ exit ; | ~25~ begin S l end ; | ~26~ read Ln ; | ~27~ write Lo ; | ~28~ e ; E ==> ~38~ F M0 C ==> ~48~ F M0 Or E Lp ==> ~29~ e | ~30~ Ln Ln ==> ~31~ i Ar M1 Lo ==> ~32~ Lr M2 Lr ==> ~33~ i Ar | ~34~ n | ~35~ c F ==> ~41~ R M3 Oa ==> ~39~ + | ~40~ - R ==> ~44~ i Ar | ~45~ n | ~46~ ( E ) | ~47~ c Om ==> ~42~ * | ~43~ / Or ==> ~49~ = | ~50~ < | ~51~ > | ~52~ <= | ~53~ >= | ~54~ != M0 ==> ~55~ + F | ~56~ e | ~66~ - F M1 ==> ~57~ , L | ~58~ e M2 ==> ~59~ , Lr | ~60~ e M3 ==> ~61~ * R | ~62~ e | ~67~ / R Ml0 ==> ~2~ module i ( Vl ) Dv B Ml0 | ~63~ e Vl0 ==> ~7~ i Il0 : T ; Vl0 | ~64~ e Il0 ==> ~13~ , i Il0 | ~65~ e Sl0 ==> ~68~ e | ~19~ Sl S0 ==> ~69~ Ar := E ; | ~24~ ( Lp ) ; START ==> ~70~ M $ name: M isNull: false isTerminal: false first: [0]=program follow: [0]=$ name: Dl isNull: true isTerminal: false first: [0]=e,[1]=variables,[2]=module follow: [0]=begin name: B isNull: false isTerminal: false first: [0]=begin follow: [0]=module name: Dv isNull: true isTerminal: false first: [0]=variables,[1]=e follow: [0]=module,[1]=begin name: Ml isNull: true isTerminal: false first: [0]=e,[1]=module follow: [0]=begin name: Mo isNull: false isTerminal: false first: [0]=module follow: name: Vl isNull: false isTerminal: false first: [0]=i follow: [0]=) name: V isNull: false isTerminal: false first: [0]=i follow: [0]=i name: Il isNull: false isTerminal: false first: [0]=i follow: [0]=: name: T isNull: false isTerminal: false first: [0]=integer,[1]=char follow: [0]=; name: Ad isNull: true isTerminal: false first: [0]=e,[1]=array follow: [0]=; name: L isNull: false isTerminal: false first: [0]=i follow: name: Ar isNull: true isTerminal: false first: [0]=e,[1]=[ follow: [0]=,,[1]=:=,[2]=;,[3]=*,[4]=/ name: Sl isNull: false isTerminal: false first: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=; follow: [0]=end name: S isNull: false isTerminal: false first: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=e,[8]=; follow: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;,[8]=else name: E isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=(,[3]=c follow: [0]=],[1]=),[2]=; name: C isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=(,[3]=c follow: [0]=then name: Lp isNull: true isTerminal: false first: [0]=e,[1]=i follow: [0]=) name: Ln isNull: false isTerminal: false first: [0]=i follow: [0]=; name: Lo isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=c follow: [0]=; name: Lr isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=c follow: [0]=, name: F isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=(,[3]=c follow: [0]=+,[1]=- name: Oa isNull: false isTerminal: false first: [0]=+,[1]=- follow: name: R isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=(,[3]=c follow: [0]=*,[1]=/ name: Om isNull: false isTerminal: false first: [0]=*,[1]=/ follow: name: Or isNull: false isTerminal: false first: [0]==,[1]=<,[2]=>,[3]=<=,[4]=>=,[5]=!= follow: [0]=i,[1]=n,[2]=(,[3]=c name: M0 isNull: true isTerminal: false first: [0]=+,[1]=e,[2]=- follow: [0]=],[1]=),[2]=;,[3]==,[4]=<,[5]=>,[6]=<=,[7]=>=,[8]=!= name: M1 isNull: true isTerminal: false first: [0]=,,[1]=e follow: [0]=; name: M2 isNull: true isTerminal: false first: [0]=,,[1]=e follow: [0]=; name: M3 isNull: true isTerminal: false first: [0]=*,[1]=e,[2]=/ follow: [0]=+,[1]=- name: Ml0 isNull: true isTerminal: false first: [0]=module,[1]=e follow: [0]=begin name: Vl0 isNull: true isTerminal: false first: [0]=i,[1]=e follow: [0]=) name: Il0 isNull: true isTerminal: false first: [0]=,,[1]=e follow: [0]=: name: Sl0 isNull: true isTerminal: false first: [0]=e,[1]=i,[2]=if,[3]=loop,[4]=exit,[5]=begin,[6]=read,[7]=write,[8]=; follow: [0]=end name: S0 isNull: false isTerminal: false first: [0]=[,[1]=:=,[2]=( follow: name: START isNull: false isTerminal: false first: [0]=program follow: M:program=0, Dl:e=1,module=1,variables=1,begin=1, B:begin=17, Dv:e=6,module=6,variables=5,begin=6, Ml:e=3,module=3,begin=3, Mo:module=4, Vl:i=8, V:i=9, Il:i=12, T:integer=10,char=11, Ad:;=15,e=15,array=16, L:i=14, Ar:;=36,e=36,,=36,[=37,:==36,*=36,/=36, Sl:i=18,;=18,e=18,begin=18,if=18,loop=18,exit=18,read=18,write=18, S:i=20,;=28,e=28,begin=25,if=21,loop=22,exit=23,read=26,write=27, E:i=38,(=38,n=38,c=38, C:i=48,(=48,n=48,c=48, Lp:i=30,e=29,)=29, Ln:i=31, Lo:i=32,n=32,c=32, Lr:i=33,n=34,c=35, F:i=41,(=41,n=41,c=41, Oa:+=39,-=40, R:i=44,(=46,n=45,c=47, Om:*=42,/=43, Or:==49,<=50,>=51,<==52,>==53,!==54, M0:;=56,e=56,)=56,]=56,+=55,-=66,==56,<=56,>=56,<==56,>==56,!==56, M1:;=58,e=58,,=57, M2:;=60,e=60,,=59, M3:e=62,+=62,-=62,*=61,/=67, Ml0:e=63,module=2,begin=63, Vl0:i=7,e=64,)=64, Il0:e=65,:=65,,=13, Sl0:i=19,;=19,e=68,begin=19,end=68,if=19,loop=19,exit=19,read=19,write=19, S0:e=69,(=24,[=69,:==69, START:program=70, Press any key to continue