CFG Reader(common-left-factor)
A. Second Edition
This is second edition of my CFG Reader which stands for Context-Free-Grammar Reader. In this edition, I made
an adjustment of structure of CFGRead class to make it a pure controller of a class Grammar which store and
represent CFG. What's more important is that I made the Grammar class capable to find and solve the common left
factor of grammar rules. For example: A -> test a b | test a | test a b a
In this rule, it is rather complicated to combine the common-left-factor which is "test a" and it is across 3
independent rules. Do you see the difficulty? The only easy way to solve it is to do it step-by-step. That is
to change grammar to be: A -> test A0 A0 -> a b | a
And the third rule remains: A -> test a b a
Now re-do the checking will find out "test" is still the common-left-factor and will be factored again!
Now it is something like: A -> test A1 A1 -> A0 | a b a
The "A0" remains same this time: A0 -> a b | a
My intention is to write a simple grammar reader that is to automatically update CFG. There are several jobs to
be considered:
1) How to store "tokens"?
2) How to connect "tokens" with their "grammar rules"?
3) How to change grammar to make it LL(1) grammar?
a) Combine common-left-factors.
b) Remove left-recursions.
c) Calculates First and Follow sets.
I finished the 1,2,3-a. The rest will be finished in next edition.
program -> stmt-sequence
stmt-sequence -> stmt-sequence ; statement | statement
statement -> if-stmt | repeat-stmt | assign-stmt | read-stmt | write-stmt
if-stmt -> if boolean-exp then stmt-sequence end
| if boolean-exp then stmt-sequence else stmt-sequence end
repeat-stmt -> repeat stmt-sequence until boolean-exp
assign-stmt -> identifier := integer-exp
read-stmt -> read identifier
write-stmt -> write integer-exp
boolean-exp -> integer-exp comparison-op integer-exp
comparison-op -> < | =
integer-exp -> integer-exp addop term | term
addop -> + | -
term -> term mulop factor | factor
mulop -> * | /
factor -> ( integer-exp ) | number | identifier
E.Further improvement
F.File listing
1. CFGReader.h
2. CFGReader.cpp
3. Grammar.h
4. Grammar.cpp
5. main.cpp (main)
file name: CFGReader.h
#include "Grammar.h"
class CFGReader
{
private:
char buffer[BufferLength];
FILE* stream;
void newRule(const char* str);
void readRule();
void addRule(const char* str);
int addToken(const char* str, bool isTerminal=true);
void printRule(int index);
public:
void readFromFile(const char* fileName);
void optimize();
void print();
};
file name: CFGReader.cpp
#include <iostream>
#include "CFGReader.h"
using namespace std;
Grammar grammar;
const char* deliStr=" |\n";
//this would only remove the immediate left recursion
//and should I expand it to remove all kinds of left-recursion?
void CFGReader::readFromFile(const char* fileName)
{
if ((stream=fopen(fileName, "r"))==NULL)
{
cout<<"cannot open file "<<fileName<<endl;
}
else
{
readRule();
}
}
void CFGReader::readRule()
{
char* ptr=buffer;
char* hold;
int count=0;
while (!feof(stream))
{
count=0;
fgets(buffer, BufferLength-1, stream);
ptr=buffer;
while ((ptr=strtok(ptr, " \n"))!=NULL)
{
if (strcmp(ptr, "|")!=0)
{
//test the first two token to see if old rule continues
if (count==0)
{
hold=ptr;
}
if (count==1)
{
if (strcmp("->", ptr)==0)
{
/*
curIndex=addToken(hold, false);//the index of cur token
grammar[curIndex]->terminal=false;
newRule();
*/
newRule(hold);
}
else
{
addRule(hold);
addRule(ptr);
}
}
if (count>=2)//always add
{
addRule(ptr);
}
count++;
}
else
{
//this is an alternative production rule
newRule(NULL);
}
ptr=NULL;
}
}
}
void CFGReader::newRule(const char* str)
{
grammar.newRule(str);
}
void CFGReader::optimize()
{
grammar.optimize();
}
void CFGReader::addRule(const char* str)
{
grammar.addRule(str);
}
int CFGReader::addToken(const char* str, bool isTerminal)
{
return grammar.addToken(str, isTerminal);
}
void CFGReader::printRule(int index)
{
grammar.printRule(index);
}
void CFGReader::print()
{
grammar.print();
}
file name: Grammar.h
const int BufferLength=256;
const int MaxTokenCount=50;
const int MaxRHSCount=8;
const int MaxRuleCount=50;
const int MaxTokenLength=10;
const int MaxProduction=10;
struct Token
{
bool terminal;
char* name;
int production[MaxProduction];//pointer to the production it gives
int count;
int firstCount;
int followCount;
int follow[MaxTokenCount];
int first[MaxTokenCount];
};
class Grammar
{
private:
Token* token[MaxTokenCount];
int tokenCount;
int curIndex;//the index of current token
int curPos;//the position at production rule
//MaxRuleCount>=MaxVariableCount!!!
int production[MaxRuleCount][MaxRHSCount];
int prodIndex;//pointing to current production, initialized to -1
int checkRecursion(int tokenIndex, int ruleIndex);
void replaceRule(int tokenIndex, int recurIndex[], int count);
void initialize();
void commonLeftFactor();
int findCommonFactor(int tokenIndex, int ruleIndex);
void removeLeftRecursion();
void doCommonFactor(int tokenIndex, int ruleIndex, int index);
int forgeToken(int index);//create a new variable
void epsilonRule(int ruleIndex);
public:
Grammar();
void optimize();
void printRule(int index);
void print();
void addRule(const char* str);
void newRule(const char* str);//this is an internal method, not suitable for outside
Token* operator[](int index);
int addToken(const char* str, bool isTerminal=true);
Token* createToken(const char* theName, bool isTerminal);
int tokenIndex(const char* str);
};
file name: Grammar.cpp
#include <iostream>
#include "Grammar.h"
using namespace std;
const char* emptyStr="empty";
int Grammar::forgeToken(int index)
{
char str[MaxTokenLength+2], ch;
int len=strlen(token[index]->name);
int temp=0, i=0;
strcpy(str, token[index]->name);
ch=str[len-1];//get last char of name
if (ch>='0'&&ch<'9')
{
str[len-1]=ch+i+1;
}
else
{
str[len]='0'+i;
str[len+1]='\0';
}
//I won't bother to check repetitation of name
while (tokenIndex(str)!=-1)
{
i++;
if (ch>='0'&&ch<'9')
{
str[len-1]=ch+i+1;
}
else
{
str[len]='0'+i;
str[len+1]='\0';
}
}
return addToken(str, false);//it is non-terminal for sure
}
void Grammar::addRule(const char* str)
{
int index;
index=addToken(str);
production[prodIndex][curPos++]=index;
production[prodIndex][curPos]=-1;//set end
}
//if the token is already there, it return the index
//otherwise, it add new node in token array
int Grammar::addToken(const char* str, bool isTerminal)
{
int index;
index=tokenIndex(str);
if (index==-1)
{
index=tokenCount;
}
else
{
return index;
}
token[index]=createToken(str, isTerminal);
tokenCount++;
return index;
}
void Grammar::newRule(const char* str)
{
if (str!=NULL)
{
curIndex=addToken(str, false);
}
//add one pointer
token[curIndex]->production[token[curIndex]->count++]=++prodIndex;
token[curIndex]->terminal=false;
curPos=0;//reset to restart;
}
Token* Grammar::createToken(const char* theName, bool isTerminal)
{
Token* ptr=new Token;
ptr->name=new char[strlen(theName)+1];
strcpy(ptr->name, theName);
ptr->terminal=isTerminal;
ptr->count=ptr->firstCount=ptr->followCount=0;
return ptr;
}
int Grammar::tokenIndex(const char* str)
{
for (int i=0; i<tokenCount; i++)
{
if (strcmp(str, token[i]->name)==0)
{
return i;
}
}
return -1;
}
int Grammar::checkRecursion(int tokenIndex, int ruleIndex)
{
for (int i=ruleIndex; i<token[tokenIndex]->count; i++)
{
//token[tokenIndex]->production[i]=ruleIndex
//production[ruleIndex][0] is the first left-recursion one
if (production[token[tokenIndex]->production[i]][0]==tokenIndex)
{
return i;
}
}
return -1;
}
void Grammar::printRule(int index)
{
int nodeIndex=0;
while (production[index][nodeIndex]!=-1)
{
cout<<token[production[index][nodeIndex]]->name<<" ";
nodeIndex++;
}
}
void Grammar::initialize()
{
tokenCount=curIndex=curPos=0;
prodIndex=-1;//in order to ++ blindly
}
Grammar::Grammar()
{
initialize();
}
void Grammar::removeLeftRecursion()
{
int index, count;
int recurIndex[MaxTokenCount];
for (int i=0; i<tokenCount; i++)
{
index=0;
count=0;
do
{
index =checkRecursion(i, index);
if (index!=-1)
{
recurIndex[count++]=index;
}
}while (index!=-1);
replaceRule(i, recurIndex, count);
}
}
void Grammar::replaceRule(int tokenIndex, int recurIndex[], int count)
{
// int newTokenIndex=forgeToken(tokenIndex);
}
//optimize sequence is first common-factor then remove left recursion
//therefore I don't have to check if for same variable if there will be
//more than one left-recursion
void Grammar::optimize()
{
commonLeftFactor();
// removeLeftRecursion();
}
int Grammar::findCommonFactor(int tokenIndex, int ruleIndex)
{
for (int i=ruleIndex+1; i<token[tokenIndex]->count; i++)
{
//if the two rule has the same first token
if (production[token[tokenIndex]->production[ruleIndex]][0]==
production[token[tokenIndex]->production[i]][0])
{
return i;
}
}
return -1;
}
void Grammar::epsilonRule(int ruleIndex)
{
production[ruleIndex][0]=addToken(emptyStr);
production[ruleIndex][1]=-1;
}
//sample: x -> Aa
// x -> Ab
//changed to: x -> Ax' //this is to change the old rule
// x' -> b //this is to change the old rule
// x' -> a //this is the new-added rule
void Grammar::doCommonFactor(int tokenIndex, int ruleIndex, int index)
{
int newTokenIndex=forgeToken(tokenIndex);//create a new variable
//move the second and after part to the new rule of new variable
//doing: x' -> a
curPos=0;
//prepare to add one new rule
token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex;
token[newTokenIndex]->terminal=false;
do
{
//do copying
production[prodIndex][curPos]=
production[token[tokenIndex]->production[ruleIndex]][curPos+1];
curPos++;
//even the -1 at end is copied
}while (production[token[tokenIndex]->production[ruleIndex]][curPos]!=-1);
//in order to show an empty string, in case the string is "epsilon"
if (curPos==1)
{
epsilonRule(prodIndex);
}
//replace x -> Aa with x -> Ax'
production[token[tokenIndex]->production[ruleIndex]][1]=newTokenIndex;
production[token[tokenIndex]->production[ruleIndex]][2]=-1;//end
//doing: x' -> b
curPos=0;
//prepare to add one new rule
//pointing new token to where old rule lies
token[newTokenIndex]->production[token[newTokenIndex]->count++]=
token[tokenIndex]->production[index];
do
{
//left-shift productions
production[token[tokenIndex]->production[index]][curPos]=
production[token[tokenIndex]->production[index]][curPos+1];
curPos++;
}while (production[token[tokenIndex]->production[index]][curPos]!=-1);
//add epsilon
if (curPos==1)
{
epsilonRule(token[tokenIndex]->production[index]);
}
//remove from token's production array the rule-index----"index"
//shrink the array by one
token[tokenIndex]->count--;
for (int i=index; i<token[tokenIndex]->count; i++)
{
token[tokenIndex]->production[i]=token[tokenIndex]->production[i+1];
}
}
void Grammar::commonLeftFactor()
{
int index=-1, tokenIndex=0, ruleIndex=0;
bool flag;
//whenever a newrule is done, restart!
while (tokenIndex<tokenCount)
{
ruleIndex=0;
flag=false;
while (ruleIndex<token[tokenIndex]->count)
{
index=findCommonFactor(tokenIndex, ruleIndex);
if (index!=-1)
{
doCommonFactor(tokenIndex, ruleIndex, index);
//restart
flag=true;
break;
}
else
{
ruleIndex++;
}
}
if (flag)
{
tokenIndex=0;
}
else
{
tokenIndex++;
}
}
}
Token* Grammar::operator[](int index)
{
if (index>=0&&index<tokenCount)
{
return token[index];
}
else
{
return NULL;
}
}
void Grammar::print()
{
for (int i=0; i<tokenCount; i++)
{
if (!token[i]->terminal)
{
cout<<token[i]->name<<" ==> ";
for (int j=0; j<token[i]->count; j++)
{
printRule(token[i]->production[j]);
if (j!=token[i]->count-1)
{
cout<<" | ";
}
}
cout<<"\n";
}
}
}
file name: main.cpp (main)
#include <iostream>
#include "CFGReader.h"
using namespace std;
int main()
{
CFGReader R;
R.readFromFile("c:\\ruleTest.txt");
R.optimize();
R.print();
return 0;
}
The input is something like following:
Please note that all tokens must be separated by space, including "|" and "->".
A -> test a b | test a | test a b a
B -> test A | test B
Here is the result:
A ==> test A1 B ==> test B0 A0 ==> a A2 A1 ==> A0 | a b a B0 ==> A | B A2 ==> b | empty Press any key to continue