CFG Reader(removal-left-recursion2)

A. Third Edition
This is fourth edition of my CFG Reader which stands for Context-Free-Grammar Reader. In this edition, I finished 
the second part of removal-left-recursion.
B.The problem
What is RLR?
Example: A  -> A a | b | c
Here you see the variable A which appears the left-most of rule and this will produce infinite loop for Recursive-
Descent parsing. For table-driven parsing, it is the same problem. So, how to remove?

program -> stmt-sequence
stmt-sequence -> stmt-sequence ; statement | statement
statement -> if-stmt | repeat-stmt | assign-stmt | read-stmt | write-stmt
if-stmt -> if boolean-exp then stmt-sequence end
| if boolean-exp then stmt-sequence else stmt-sequence end
repeat-stmt -> repeat stmt-sequence until boolean-exp
assign-stmt -> identifier := integer-exp
read-stmt -> read identifier
write-stmt -> write integer-exp
boolean-exp -> integer-exp comparison-op integer-exp
comparison-op -> < | =
integer-exp -> integer-exp addop term | term
addop -> + | -
term -> term mulop factor | factor
mulop -> * | /
factor -> ( integer-exp ) | number | identifier

 

C.The idea of program
 
Now we are in a position to remove the immediate-left-recursion! Taking the above example, the new grammar would
be:
1) A -> A a              ===>     A  -> a A'
2) A -> b                ===>     A' -> b A'
3) A -> c                ===>     A' -> c A'
                         ===>     A' -> epsilon
So, generally there are three steps: 
1) Modify current rule to such that it removes the first repeating variable and shift-to-left by 1 and at end
of rule add a new variable.
2) Change all other rules to rules belonging to new variable and at end of each rule add the new variable.
3) Add an epsilon rule for new variable.
 
D.The major functions
E.Further improvement
F.File listing
1. CFGReader.h
2. CFGReader.cpp  
3. Grammar.h
4. Grammar.cpp
5. main.cpp (main)
 
 
file name: CFGReader.h
#include "Grammar.h"


class CFGReader
{
private:
	char buffer[BufferLength];
	FILE* stream;
	void newRule(const char* str);
	void readRule();
	void addRule(const char* str);
	int addToken(const char* str, bool isTerminal=true);
	void printRule(int index);
public:
	void readFromFile(const char* fileName);
	void optimize();
	void print();
};


file name: CFGReader.cpp 
#include <iostream>
#include "CFGReader.h"

using namespace std;

Grammar grammar;

const char* deliStr=" |\n";

//this would only remove the immediate left recursion
//and should I expand it to remove all kinds of left-recursion?


void CFGReader::readFromFile(const char* fileName)
{
	if ((stream=fopen(fileName, "r"))==NULL)
	{
		cout<<"cannot open file "<<fileName<<endl;
	}
	else
	{		
		readRule();		
	}
}

void CFGReader::readRule()
{
	char* ptr=buffer;
	char* hold;
	int count=0;

	while (!feof(stream))
	{
		count=0;
		fgets(buffer, BufferLength-1, stream);
		ptr=buffer;	
		
		while ((ptr=strtok(ptr, " \n"))!=NULL)
		{
			if (strcmp(ptr, "|")!=0)
			{
				//test the first two token to see if old rule continues
				if (count==0)				
				{
					hold=ptr;
				}
				if (count==1)
				{
					if (strcmp("->", ptr)==0)
					{			
						/*
						curIndex=addToken(hold, false);//the index of cur token	
						grammar[curIndex]->terminal=false;
						newRule();	
						*/
						newRule(hold);
					}
					else
					{								
						addRule(hold);
						addRule(ptr);															
					}
				}
				if (count>=2)//always add
				{
					addRule(ptr);
				}
				count++;
			}
			else
			{
				//this is an alternative production rule
				newRule(NULL);						
			}
			ptr=NULL;
		}		
	}
}

void CFGReader::newRule(const char* str)
{
	grammar.newRule(str);
}

void CFGReader::optimize()
{
	grammar.optimize();
}


void CFGReader::addRule(const char* str)
{
	grammar.addRule(str);
}

int CFGReader::addToken(const char* str, bool isTerminal)
{
	return grammar.addToken(str, isTerminal);
}



void CFGReader::printRule(int index)
{
	grammar.printRule(index);
}

void CFGReader::print()
{
	grammar.print();
}



file name: Grammar.h
const int BufferLength=256;
const int MaxTokenCount=50;
const int MaxRHSCount=8;
const int MaxRuleCount=50;
const int MaxTokenLength=10;
const int MaxProduction=10;

struct Token
{
	bool terminal;
	bool isNull;
	char* name;
	int production[MaxProduction];//pointer to the production it gives
	int count;
	int firstCount;
	int followCount;
	int follow[MaxTokenCount];
	int first[MaxTokenCount];
};

class Grammar
{
private:
	Token* token[MaxTokenCount];
	int tokenCount;
	int curIndex;//the index of current token
	int curPos;//the position at production rule
	//MaxRuleCount>=MaxVariableCount!!!
	int production[MaxRuleCount][MaxRHSCount];
	int prodIndex;//pointing to current production, initialized to -1
	int checkRecursion(int tIndex, int curToken);
	bool addIntoFirst(int target, int source, int ruleIndex);
	void shiftRule(int ruleIndex, bool Left2Right, int offset);
	void addAtEnd(int ruleIndex, int toAdd);
	void addRuleForToken(int tIndex, int ruleIndex);
	int copyRule(int source, int target);//return the position of -1
	void replaceRule(int curToken, int ruleIndex);
	void initialize();
	void removeRuleFromToken(int tIndex, int ruleIndex);
	void checkEpsilon(int ruleIndex);
	void removeImmediate(int tIndex, int ruleIndex);
	void doAddRule(int tIndex);
	void commonLeftFactor();
	int findCommonFactor(int tokenIndex, int ruleIndex);
	void removeLeftRecursion();
	void doCommonFactor(int tokenIndex, int ruleIndex, int index);
	int forgeToken(int index);//create a new variable
	void epsilonRule(int ruleIndex);
	//void calculateNull();
	void calculateFirst();
	void calculateFollow();
public:
	Grammar();
	void optimize();
	void printRule(int index);
	void print();
	void addRule(const char* str);
	void newRule(const char* str);//this is an internal method, not suitable for outside
	Token* operator[](int index);
	int addToken(const char* str, bool isTerminal=true);
	Token* createToken(const char* theName, bool isTerminal);
	int tokenIndex(const char* str);
};
 
 
file name: Grammar.cpp 
#include <iostream>
#include "Grammar.h"

using namespace std;

const char* emptyStr="empty";
int emptyIndex=-1;

void Grammar::calculateFirst()
{
	int start=0;
	while (start<tokenCount)
	{
		if (!token[start]->terminal)
		{
		}
	}
}


void Grammar::addRuleForToken(int tIndex, int ruleIndex)
{
	token[tIndex]->production[token[tIndex]->count++]=ruleIndex;
}

void Grammar::shiftRule(int ruleIndex, bool left2Right, int offset)
{
	int end=0;
	while (production[ruleIndex][end]!=-1)
	{
		end++;
	}
	if (left2Right)
	{
		for (int i=end; i>=0; i--)
		{
			production[ruleIndex][i+offset]=production[ruleIndex][i];
		}
	}
	else
	{
		for (int i=0; i<=end-offset; i++)
		{
			production[ruleIndex][i]=production[ruleIndex][i+offset];
		}
		checkEpsilon(ruleIndex);
	}
}

void Grammar::checkEpsilon(int ruleIndex)
{
	if (production[ruleIndex][0]==-1)
	{
		epsilonRule(ruleIndex);
	}
}

bool Grammar::addIntoFirst(int target, int source, int ruleIndex)
{
	/*
	int theFirst;
	bool added;
	//if the token epsilon is added, surely the token will be nullable
	if (token[source]->terminal)
	{
		token[target]->first[token[target]->firstCount++]=source;
		if (token[source]->isNull)
		{
			token[target]->isNull=true;
		}
	}
	else
	{
		added=false;
		for (int i=0; i<token[source]->count; i++)
		{
			theFirst=production[token[source]->production[i]][0];
			if (
			addIntoFirst(target, production[token[source]->production[0]][0]);
		}
	}
	*/
	return true;
}

void Grammar::addAtEnd(int ruleIndex, int toAdd)
{
	int end=0;
	while (production[ruleIndex][end]!=-1)
	{
		end++;
	}
	production[ruleIndex][end++]=toAdd;
	production[ruleIndex][end]=-1;
}

//left-recursion:  A -> A a | b | c
//change to: A  -> b A' | c A'
//			 A' -> a A' | empty
void Grammar::removeImmediate(int tIndex, int ruleIndex)
{
	int newIndex=forgeToken(tIndex);
	int holdRuleIndex=token[tIndex]->production[ruleIndex];
	//sequence matters!

	//change to: A -> b A'
	for (int i=0; i<token[tIndex]->count; i++)
	{
		if (i!=ruleIndex)
		{
			addAtEnd(token[tIndex]->production[i], newIndex);
		}
	}
	
	//shift
	/*
	token[tIndex]->count--;
	for (i=ruleIndex; i<token[tIndex]->count; i++)
	{
		token[tIndex]->production[i]=token[tIndex]->production[i+1];
	}
	*/
	removeRuleFromToken(tIndex, ruleIndex);
	
	addRuleForToken(newIndex, holdRuleIndex);
	//token[newIndex]->production[token[newIndex]->count++]=holdRuleIndex;

	shiftRule(holdRuleIndex, false, 1);
	addAtEnd(holdRuleIndex, newIndex);
	
	//add epsilon rule for new variable
	epsilonRule(++prodIndex);
	addRuleForToken(newIndex, prodIndex);
}


int Grammar::forgeToken(int index)
{
	char str[MaxTokenLength+2], ch;
	int len=strlen(token[index]->name);
	int temp=0, i=0;
	strcpy(str, token[index]->name);
	ch=str[len-1];//get last char of name
	if (ch>='0'&&ch<'9')
	{
		str[len-1]=ch+i+1;
	}
	else
	{
		str[len]='0'+i;
		str[len+1]='\0';
	}
	//I won't bother to check repetitation of name
	while (tokenIndex(str)!=-1)
	{
		i++;
		if (ch>='0'&&ch<'9')
		{
			str[len-1]=ch+i+1;
		}
		else
		{
			str[len]='0'+i;
			str[len+1]='\0';
		}
	}

	return addToken(str, false);//it is non-terminal for sure	
}

int Grammar::copyRule(int source, int target)
{
	int i=0;
	while (production[source][i]!=-1) 
	{
		production[target][i]=production[source][i];
		i++;
	}
	production[target][i]=-1;
	return i;
}

void Grammar::doAddRule(int tIndex)
{	
	token[tIndex]->production[token[tIndex]->count++]=++prodIndex;
}

void Grammar::addRule(const char* str)
{
	int index;
	index=addToken(str);
	production[prodIndex][curPos++]=index;
	production[prodIndex][curPos]=-1;//set end
}


//if the token is already there, it return the index
//otherwise, it add new node in token array
int Grammar::addToken(const char* str, bool isTerminal)
{
	int index;
	index=tokenIndex(str);
	if (index==-1)
	{
		index=tokenCount;
	}
	else
	{		
		return index;
	}
	token[index]=createToken(str, isTerminal);	
	tokenCount++;
	if (strcmp(str, emptyStr)==0)
	{
		token[index]->isNull=true;
	}
	return index;
}

void Grammar::newRule(const char* str)
{
	if (str!=NULL)
	{
		curIndex=addToken(str, false);
	}
	//add one pointer
	token[curIndex]->production[token[curIndex]->count++]=++prodIndex;
	token[curIndex]->terminal=false;
	curPos=0;//reset to restart;
}

Token* Grammar::createToken(const char* theName, bool isTerminal)
{
	Token* ptr=new Token;
	ptr->name=new char[strlen(theName)+1];
	strcpy(ptr->name, theName);
	ptr->terminal=isTerminal;
	ptr->count=ptr->firstCount=ptr->followCount=0;
	ptr->isNull=false;
	return ptr;
}

int Grammar::tokenIndex(const char* str)
{
	for (int i=0; i<tokenCount; i++)
	{
		if (strcmp(str, token[i]->name)==0)
		{
			return i;
		}
	}
	return -1;
}

int Grammar::checkRecursion(int tIndex, int curToken)
{
	for (int i=0; i<token[curToken]->count; i++)
	{
		//token[tIndex]->production[i]=ruleIndex
		//production[ruleIndex][0] is the first left-recursion one
		if (production[token[curToken]->production[i]][0]<=curToken)
		{
			return i;
		}
	}
	return -1;
}

void Grammar::printRule(int index)
{
	int nodeIndex=0;
	while (production[index][nodeIndex]!=-1)
	{
		cout<<token[production[index][nodeIndex]]->name<<" ";
		nodeIndex++;
	}
}



void Grammar::initialize()
{
	tokenCount=curIndex=curPos=0;
	prodIndex=-1;//in order to ++ blindly
}

Grammar::Grammar()
{
	initialize();
}

void Grammar::removeLeftRecursion()
{
	int tIndex=0, curToken=0;
	bool newChange=false;
	while (tIndex<tokenCount)
	{		
		if (!token[tIndex]->terminal)
		{
			for (int i=0; i<token[tIndex]->count; i++)
			{
				curToken=production[token[tIndex]->production[i]][0];
				if (curToken<=tIndex&&!token[curToken]->terminal)
				{
					if (curToken!=tIndex)
					{
						replaceRule(tIndex, i);
					}
					else
					{
						removeImmediate(tIndex, i);
					}
					newChange=true;				
				}
			}
		}
		//whenever there is some new findings, restart
		if (!newChange)
		{
			tIndex++;
		}
		else
		{
			tIndex=0;
			newChange=false;
		}
	}
}

void Grammar::replaceRule(int tIndex, int ruleIndex)
{
	int pos, j, targetEnd, sourceEnd, curRule;
	int curToken=production[token[tIndex]->production[ruleIndex]][0];
	for (int i=token[curToken]->count-1; i>=0; i--)
	{
		if (i>0)
		{
			doAddRule(tIndex);
			pos=copyRule(token[curToken]->production[i], prodIndex);
			j=0;
			while (production[token[tIndex]->production[ruleIndex]][j+1]!=-1)
			{
				production[prodIndex][pos+j]=
					production[token[tIndex]->production[ruleIndex]][j+1];
				j++;
			}
			production[prodIndex][pos+j]=-1;
		}
		else
		{
			targetEnd=sourceEnd=0;
			curRule=token[tIndex]->production[ruleIndex];
			while (true)
			{
				if (production[token[curToken]->production[0]][sourceEnd]==-1&&
				production[curRule][targetEnd]==-1)
				{
					break;
				}
				if (production[token[curToken]->production[0]][sourceEnd]!=-1)
				{
					sourceEnd++;
				}
				if (production[curRule][targetEnd]!=-1)
				{
					targetEnd++;
				}
			}
			j=targetEnd+sourceEnd-1;
			while (j>=0)
			{
				if (j>=sourceEnd)
				{
					production[curRule][j]=production[curRule][j-sourceEnd+1];
				}
				else
				{
					production[curRule][j]=production[token[curToken]->production[0]][j];
				}
				j--;
			}
		}
	}
}


//optimize sequence is first common-factor then remove left recursion
//therefore I don't have to check if for same variable if there will be
//more than one left-recursion
void Grammar::optimize()
{
	removeLeftRecursion();
	commonLeftFactor();
}

int Grammar::findCommonFactor(int tIndex, int ruleIndex)
{
	for (int i=ruleIndex+1; i<token[tIndex]->count; i++)
	{	
		//if the two rule has the same first token
		if (production[token[tIndex]->production[ruleIndex]][0]==
			production[token[tIndex]->production[i]][0])
		{
			/*
			//calculate if there is epsilon
			if (emptyIndex==-1)
			{
				emptyIndex=tokenIndex(emptyStr);
			}
			//if it is not epsilon
			if (production[token[tIndex]->production[i]][0]!=emptyIndex)
			{
				return i;
			}
			*/
			return i;
		}
	}
	return -1;
}

void Grammar::epsilonRule(int ruleIndex)
{
	production[ruleIndex][0]=addToken(emptyStr);
	production[ruleIndex][1]=-1;
}

//sample:  x -> Aa 
//         x -> Ab
//changed to:  x -> Ax' //this is to change the old rule
//             x' -> b //this is to change the old rule
//             x' -> a //this is the new-added rule
void Grammar::doCommonFactor(int tIndex, int ruleIndex, int index)
{
	int newTokenIndex=forgeToken(tIndex);//create a new variable
	//move the second and after part to the new rule of new variable
	//doing: x' -> a
	//curPos=0;
	//prepare to add one new rule 	
	addRuleForToken(newTokenIndex, ++prodIndex);
	//token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex;
	token[newTokenIndex]->terminal=false;

	copyRule(token[tIndex]->production[ruleIndex], prodIndex);
	shiftRule(prodIndex, false, 1);
	/*
	do
	{
		
		//do copying
		production[prodIndex][curPos]=
			production[token[tIndex]->production[ruleIndex]][curPos+1];
		curPos++;
	//even the -1 at end is copied
	}while (production[token[tIndex]->production[ruleIndex]][curPos]!=-1);
	*/

	//in order to show an empty string, in case the string is "epsilon"

	
	/*
	if (curPos==1)
	{
		epsilonRule(prodIndex);
	}
	*/


	//replace x -> Aa with x -> Ax'
	production[token[tIndex]->production[ruleIndex]][1]=newTokenIndex;
	production[token[tIndex]->production[ruleIndex]][2]=-1;//end
	
	//doing: x' -> b
	//curPos=0;
	//prepare to add one new rule 
	//pointing new token to where old rule lies
	addRuleForToken(newTokenIndex, token[tIndex]->production[index]);
	/*
	token[newTokenIndex]->production[token[newTokenIndex]->count++]=
		token[tIndex]->production[index];
		*/

	shiftRule(token[tIndex]->production[index], false, 1);
	/*
	do 
	{
		//left-shift productions
		production[token[tIndex]->production[index]][curPos]=
			production[token[tIndex]->production[index]][curPos+1];
		curPos++;

	}while (production[token[tIndex]->production[index]][curPos]!=-1);
	*/
	//add epsilon
	/*
	if (curPos==1)
	{
		epsilonRule(token[tIndex]->production[index]);
	}
	*/
	//remove from token's production array the rule-index----"index"
	//shrink the array by one
	/*
	for (int i=index; i<token[tIndex]->count; i++)
	{
		token[tIndex]->production[i]=token[tIndex]->production[i+1];
	}
	token[tIndex]->count--;
	*/
	removeRuleFromToken(tIndex, index);
}

void Grammar::removeRuleFromToken(int tIndex, int ruleIndex)
{
	for (int i=ruleIndex; i<token[tIndex]->count; i++)
	{
		token[tIndex]->production[i]=token[tIndex]->production[i+1];
	}
	token[tIndex]->count--;
}


void Grammar::commonLeftFactor()
{
	int index=-1, tIndex=0, ruleIndex=0;
	bool flag;
	//whenever a newrule is done, restart!
	while (tIndex<tokenCount)
	{
		ruleIndex=0;
		flag=false;		
		while (ruleIndex<token[tIndex]->count)
		{			
			index=findCommonFactor(tIndex, ruleIndex);
			if (index!=-1)
			{
				doCommonFactor(tIndex, ruleIndex, index);
				//restart
				flag=true;
				break;
			}
			else
			{
				ruleIndex++;
			}
		}
		if (flag)
		{
			tIndex=0;		
		}
		else
		{
			tIndex++;
		}
	}
}

Token* Grammar::operator[](int index)
{
	if (index>=0&&index<tokenCount)
	{
		return token[index];
	}
	else
	{
		return NULL;
	}
}

void Grammar::print()
{
	for (int i=0; i<tokenCount; i++)
	{		
		if (!token[i]->terminal)
		{
			cout<<token[i]->name<<" ==> ";
			for (int j=0; j<token[i]->count; j++)
			{			
				printRule(token[i]->production[j]);	
				if (j!=token[i]->count-1)
				{
					cout<<" | ";
				}
			}
			cout<<"\n";
		}		
	}
}

file name: main.cpp (main)
#include <iostream>
#include "CFGReader.h"

using namespace std;

int main()
{
	CFGReader R;
	R.readFromFile("c:\\ruleTest.txt");
    	R.optimize();
	R.print();
	return 0;
}
 
The input is something like following:
Please note that all tokens must be separated by space, including "|" and "->".
A -> A a b | A a b c | c 



Here is the result:
A ==> c A0 A1
A0 ==> a b A0 | empty
A1 ==> a b c A0 A1 | empty
Press any key to continue










                                 back.gif (341 bytes)       up.gif (335 bytes)         next.gif (337 bytes)