CFG Reader(removal-left-recursion1)
A. Third Edition
This is third edition of my CFG Reader which stands for Context-Free-Grammar Reader. In this edition, I finished
the first part of removal-left-recursion.
What is RLR?
Example: A -> A a | b
Here you see the variable A which appears the left-most of rule and this will produce infinite loop for Recursive-
Descent parsing. For table-driven parsing, it is the same problem. So, how to remove?
A general algorithm is to index all variables and make sure all the left-most variable in any rule has no small or
equal index number than the left-hand-side.
For example,
1) A -> b a | b b
2) B -> A a | b
on 2) LHS is B which has a bigger index than left-most variable A in "B -> A a". This should be replaced with
whatever of A represents:
1) A -> b a | b b
2) B -> b a a | b b a | b
This is just what I have done in this edition.
In next edition I will remove the immediate left recursion.
program -> stmt-sequence
stmt-sequence -> stmt-sequence ; statement | statement
statement -> if-stmt | repeat-stmt | assign-stmt | read-stmt | write-stmt
if-stmt -> if boolean-exp then stmt-sequence end
| if boolean-exp then stmt-sequence else stmt-sequence end
repeat-stmt -> repeat stmt-sequence until boolean-exp
assign-stmt -> identifier := integer-exp
read-stmt -> read identifier
write-stmt -> write integer-exp
boolean-exp -> integer-exp comparison-op integer-exp
comparison-op -> < | =
integer-exp -> integer-exp addop term | term
addop -> + | -
term -> term mulop factor | factor
mulop -> * | /
factor -> ( integer-exp ) | number | identifier
1) A -> b a
2) B -> A a | b
Modified:
1) A -> b a
2) B -> b a a | b
Case 2: The replaced variable has more than one rules, say n. We must invent n-1 rules at end. These
new n-1 rules are all beginning with the replaced plus the original rule minus the first to-be-replaced
variable. Then for the original rule, just modify it like case 1.
i.e.
1) A -> b a | b b
2) B -> A a | b
after replaced
1) A -> b a | b b
2) B -> b a a | b
| B -> b b a //this should be newly-invented rule placed at end.
E.Further improvement
F.File listing
1. CFGReader.h
2. CFGReader.cpp
3. Grammar.h
4. Grammar.cpp
5. main.cpp (main)
file name: CFGReader.h
#include "Grammar.h"
class CFGReader
{
private:
char buffer[BufferLength];
FILE* stream;
void newRule(const char* str);
void readRule();
void addRule(const char* str);
int addToken(const char* str, bool isTerminal=true);
void printRule(int index);
public:
void readFromFile(const char* fileName);
void optimize();
void print();
};
file name: CFGReader.cpp
#include <iostream>
#include "CFGReader.h"
using namespace std;
Grammar grammar;
const char* deliStr=" |\n";
//this would only remove the immediate left recursion
//and should I expand it to remove all kinds of left-recursion?
void CFGReader::readFromFile(const char* fileName)
{
if ((stream=fopen(fileName, "r"))==NULL)
{
cout<<"cannot open file "<<fileName<<endl;
}
else
{
readRule();
}
}
void CFGReader::readRule()
{
char* ptr=buffer;
char* hold;
int count=0;
while (!feof(stream))
{
count=0;
fgets(buffer, BufferLength-1, stream);
ptr=buffer;
while ((ptr=strtok(ptr, " \n"))!=NULL)
{
if (strcmp(ptr, "|")!=0)
{
//test the first two token to see if old rule continues
if (count==0)
{
hold=ptr;
}
if (count==1)
{
if (strcmp("->", ptr)==0)
{
/*
curIndex=addToken(hold, false);//the index of cur token
grammar[curIndex]->terminal=false;
newRule();
*/
newRule(hold);
}
else
{
addRule(hold);
addRule(ptr);
}
}
if (count>=2)//always add
{
addRule(ptr);
}
count++;
}
else
{
//this is an alternative production rule
newRule(NULL);
}
ptr=NULL;
}
}
}
void CFGReader::newRule(const char* str)
{
grammar.newRule(str);
}
void CFGReader::optimize()
{
grammar.optimize();
}
void CFGReader::addRule(const char* str)
{
grammar.addRule(str);
}
int CFGReader::addToken(const char* str, bool isTerminal)
{
return grammar.addToken(str, isTerminal);
}
void CFGReader::printRule(int index)
{
grammar.printRule(index);
}
void CFGReader::print()
{
grammar.print();
}
file name: Grammar.h
const int BufferLength=256;
const int MaxTokenCount=50;
const int MaxRHSCount=8;
const int MaxRuleCount=50;
const int MaxTokenLength=10;
const int MaxProduction=10;
struct Token
{
bool terminal;
char* name;
int production[MaxProduction];//pointer to the production it gives
int count;
int firstCount;
int followCount;
int follow[MaxTokenCount];
int first[MaxTokenCount];
};
class Grammar
{
private:
Token* token[MaxTokenCount];
int tokenCount;
int curIndex;//the index of current token
int curPos;//the position at production rule
//MaxRuleCount>=MaxVariableCount!!!
int production[MaxRuleCount][MaxRHSCount];
int prodIndex;//pointing to current production, initialized to -1
int checkRecursion(int tIndex, int curToken);
int copyRule(int source, int target);//return the position of -1
void replaceRule(int curToken, int ruleIndex);
void initialize();
void doAddRule(int tIndex);
void commonLeftFactor();
int findCommonFactor(int tokenIndex, int ruleIndex);
void removeLeftRecursion();
void doCommonFactor(int tokenIndex, int ruleIndex, int index);
int forgeToken(int index);//create a new variable
void epsilonRule(int ruleIndex);
public:
Grammar();
void optimize();
void printRule(int index);
void print();
void addRule(const char* str);
void newRule(const char* str);//this is an internal method, not suitable for outside
Token* operator[](int index);
int addToken(const char* str, bool isTerminal=true);
Token* createToken(const char* theName, bool isTerminal);
int tokenIndex(const char* str);
};
file name: Grammar.cpp
#include <iostream>
#include "Grammar.h"
using namespace std;
const char* emptyStr="empty";
int emptyIndex=-1;
int Grammar::forgeToken(int index)
{
char str[MaxTokenLength+2], ch;
int len=strlen(token[index]->name);
int temp=0, i=0;
strcpy(str, token[index]->name);
ch=str[len-1];//get last char of name
if (ch>='0'&&ch<'9')
{
str[len-1]=ch+i+1;
}
else
{
str[len]='0'+i;
str[len+1]='\0';
}
//I won't bother to check repetitation of name
while (tokenIndex(str)!=-1)
{
i++;
if (ch>='0'&&ch<'9')
{
str[len-1]=ch+i+1;
}
else
{
str[len]='0'+i;
str[len+1]='\0';
}
}
return addToken(str, false);//it is non-terminal for sure
}
int Grammar::copyRule(int source, int target)
{
int i=0;
while (production[source][i]!=-1)
{
production[target][i]=production[source][i];
i++;
}
production[target][i]=-1;
return i;
}
void Grammar::doAddRule(int tIndex)
{
token[tIndex]->production[token[tIndex]->count++]=++prodIndex;
}
void Grammar::addRule(const char* str)
{
int index;
index=addToken(str);
production[prodIndex][curPos++]=index;
production[prodIndex][curPos]=-1;//set end
}
//if the token is already there, it return the index
//otherwise, it add new node in token array
int Grammar::addToken(const char* str, bool isTerminal)
{
int index;
index=tokenIndex(str);
if (index==-1)
{
index=tokenCount;
}
else
{
return index;
}
token[index]=createToken(str, isTerminal);
tokenCount++;
return index;
}
void Grammar::newRule(const char* str)
{
if (str!=NULL)
{
curIndex=addToken(str, false);
}
//add one pointer
token[curIndex]->production[token[curIndex]->count++]=++prodIndex;
token[curIndex]->terminal=false;
curPos=0;//reset to restart;
}
Token* Grammar::createToken(const char* theName, bool isTerminal)
{
Token* ptr=new Token;
ptr->name=new char[strlen(theName)+1];
strcpy(ptr->name, theName);
ptr->terminal=isTerminal;
ptr->count=ptr->firstCount=ptr->followCount=0;
return ptr;
}
int Grammar::tokenIndex(const char* str)
{
for (int i=0; i<tokenCount; i++)
{
if (strcmp(str, token[i]->name)==0)
{
return i;
}
}
return -1;
}
int Grammar::checkRecursion(int tIndex, int curToken)
{
for (int i=0; i<token[curToken]->count; i++)
{
//token[tIndex]->production[i]=ruleIndex
//production[ruleIndex][0] is the first left-recursion one
if (production[token[curToken]->production[i]][0]<=curToken)
{
return i;
}
}
return -1;
}
void Grammar::printRule(int index)
{
int nodeIndex=0;
while (production[index][nodeIndex]!=-1)
{
cout<<token[production[index][nodeIndex]]->name<<" ";
nodeIndex++;
}
}
void Grammar::initialize()
{
tokenCount=curIndex=curPos=0;
prodIndex=-1;//in order to ++ blindly
}
Grammar::Grammar()
{
initialize();
}
void Grammar::removeLeftRecursion()
{
int tIndex=0, curToken=0;
bool newChange=false;
while (tIndex<tokenCount)
{
if (!token[tIndex]->terminal)
{
for (int i=0; i<token[tIndex]->count; i++)
{
curToken=production[token[tIndex]->production[i]][0];
if (curToken<=tIndex&&!token[curToken]->terminal)
{
if (curToken!=tIndex)
{
replaceRule(tIndex, i);
}
else
{
//generateRule(tIndex, i);
}
newChange=true;
}
}
}
//whenever there is some new findings, restart
if (!newChange)
{
tIndex++;
}
else
{
tIndex=0;
newChange=false;
}
}
}
void Grammar::replaceRule(int tIndex, int ruleIndex)
{
int pos, j, targetEnd, sourceEnd, curRule;
int curToken=production[token[tIndex]->production[ruleIndex]][0];
for (int i=token[curToken]->count-1; i>=0; i--)
{
if (i>0)
{
doAddRule(tIndex);
pos=copyRule(token[curToken]->production[i], prodIndex);
j=0;
while (production[token[tIndex]->production[ruleIndex]][j+1]!=-1)
{
production[prodIndex][pos+j]=
production[token[tIndex]->production[ruleIndex]][j+1];
j++;
}
production[prodIndex][pos+j]=-1;
}
else
{
targetEnd=sourceEnd=0;
curRule=token[tIndex]->production[ruleIndex];
while (true)
{
if (production[token[curToken]->production[0]][sourceEnd]==-1&&
production[curRule][targetEnd]==-1)
{
break;
}
if (production[token[curToken]->production[0]][sourceEnd]!=-1)
{
sourceEnd++;
}
if (production[curRule][targetEnd]!=-1)
{
targetEnd++;
}
}
j=targetEnd+sourceEnd-1;
while (j>=0)
{
if (j>=sourceEnd)
{
production[curRule][j]=production[curRule][j-sourceEnd+1];
}
else
{
production[curRule][j]=production[token[curToken]->production[0]][j];
}
j--;
}
}
}
}
//optimize sequence is first common-factor then remove left recursion
//therefore I don't have to check if for same variable if there will be
//more than one left-recursion
void Grammar::optimize()
{
commonLeftFactor();
removeLeftRecursion();
}
int Grammar::findCommonFactor(int tIndex, int ruleIndex)
{
for (int i=ruleIndex+1; i<token[tIndex]->count; i++)
{
//if the two rule has the same first token
if (production[token[tIndex]->production[ruleIndex]][0]==
production[token[tIndex]->production[i]][0])
{
/*
//calculate if there is epsilon
if (emptyIndex==-1)
{
emptyIndex=tokenIndex(emptyStr);
}
//if it is not epsilon
if (production[token[tIndex]->production[i]][0]!=emptyIndex)
{
return i;
}
*/
return i;
}
}
return -1;
}
void Grammar::epsilonRule(int ruleIndex)
{
production[ruleIndex][0]=addToken(emptyStr);
production[ruleIndex][1]=-1;
}
//sample: x -> Aa
// x -> Ab
//changed to: x -> Ax' //this is to change the old rule
// x' -> b //this is to change the old rule
// x' -> a //this is the new-added rule
void Grammar::doCommonFactor(int tIndex, int ruleIndex, int index)
{
int newTokenIndex=forgeToken(tIndex);//create a new variable
//move the second and after part to the new rule of new variable
//doing: x' -> a
curPos=0;
//prepare to add one new rule
token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex;
token[newTokenIndex]->terminal=false;
do
{
//do copying
production[prodIndex][curPos]=
production[token[tIndex]->production[ruleIndex]][curPos+1];
curPos++;
//even the -1 at end is copied
}while (production[token[tIndex]->production[ruleIndex]][curPos]!=-1);
//in order to show an empty string, in case the string is "epsilon"
if (curPos==1)
{
epsilonRule(prodIndex);
}
//replace x -> Aa with x -> Ax'
production[token[tIndex]->production[ruleIndex]][1]=newTokenIndex;
production[token[tIndex]->production[ruleIndex]][2]=-1;//end
//doing: x' -> b
curPos=0;
//prepare to add one new rule
//pointing new token to where old rule lies
token[newTokenIndex]->production[token[newTokenIndex]->count++]=
token[tIndex]->production[index];
do
{
//left-shift productions
production[token[tIndex]->production[index]][curPos]=
production[token[tIndex]->production[index]][curPos+1];
curPos++;
}while (production[token[tIndex]->production[index]][curPos]!=-1);
//add epsilon
if (curPos==1)
{
epsilonRule(token[tIndex]->production[index]);
}
//remove from token's production array the rule-index----"index"
//shrink the array by one
token[tIndex]->count--;
for (int i=index; i<token[tIndex]->count; i++)
{
token[tIndex]->production[i]=token[tIndex]->production[i+1];
}
}
void Grammar::commonLeftFactor()
{
int index=-1, tIndex=0, ruleIndex=0;
bool flag;
//whenever a newrule is done, restart!
while (tIndex<tokenCount)
{
ruleIndex=0;
flag=false;
while (ruleIndex<token[tIndex]->count)
{
index=findCommonFactor(tIndex, ruleIndex);
if (index!=-1)
{
doCommonFactor(tIndex, ruleIndex, index);
//restart
flag=true;
break;
}
else
{
ruleIndex++;
}
}
if (flag)
{
tIndex=0;
}
else
{
tIndex++;
}
}
}
Token* Grammar::operator[](int index)
{
if (index>=0&&index<tokenCount)
{
return token[index];
}
else
{
return NULL;
}
}
void Grammar::print()
{
for (int i=0; i<tokenCount; i++)
{
if (!token[i]->terminal)
{
cout<<token[i]->name<<" ==> ";
for (int j=0; j<token[i]->count; j++)
{
printRule(token[i]->production[j]);
if (j!=token[i]->count-1)
{
cout<<" | ";
}
}
cout<<"\n";
}
}
}
file name: main.cpp (main)
#include <iostream>
#include "CFGReader.h"
using namespace std;
int main()
{
CFGReader R;
R.readFromFile("c:\\ruleTest.txt");
R.optimize();
R.print();
return 0;
}
The input is something like following:
Please note that all tokens must be separated by space, including "|" and "->".
A -> a | b B -> A a | a b
Here is the result:
A ==> a | b B ==> a a | a b | b a Press any key to continue