// grammar.h see license.txt for copyright and terms of use // representation and algorithms for context-free grammars // Author: Scott McPeak, April 2000 // Unfortunately, representation and algorithm tend to get // mixed together. Separating them entirely is possible, // but syntactically inconvenient. So, instead, I try to // document the separation in comments. Specifically, // sections beginning with ---- representation ---- are data // for representation of the underlying concept, while // sections with ---- annotation ---- are data created by // algorithms manipulating the data. // Another measure is I've split all grammar-wide algorithm // stuff into GrammarAnalysis (gramanl.h). Things should // only be put into Grammar if they are directly related // to the grammar representation. (However, constitutent // objects like Production will continue to be a mix.) #ifndef __GRAMMAR_H #define __GRAMMAR_H #include // ostream #include "str.h" // string #include "objlist.h" // ObjList #include "sobjlist.h" // SObjList #include "util.h" // OSTREAM_OPERATOR, INTLOOP #include "locstr.h" // LocString, StringRef #include "strobjdict.h" // StringObjDict #include "owner.h" // Owner #include "asockind.h" // AssocKind class StrtokParse; // strtokp.h // fwds defined below class Symbol; class Terminal; class Nonterminal; class Production; class DottedProduction; class Grammar; // transitional definitions typedef StringObjDict LitCodeDict; typedef LocString LiteralCode; // everywhere in the Grammar specification we have a StringRef, it // refers to this string table extern StringTable grammarStringTable; // ---------------- Symbol -------------------- // either a nonterminal or terminal symbol class Symbol { // ------ representation ------ public: LocString const name; // symbol's name in grammar bool const isTerm; // true: terminal (only on right-hand sides of productions) // false: nonterminal (can appear on left-hand sides) bool const isEmptyString; // true only for the emptyString nonterminal StringRef type; // C type of semantic value StringRef dupParam; // name of parameter to 'dup' LocString dupCode; // code to duplicate a semantic value StringRef delParam; // param name; may be NULL to indicate not used LocString delCode; // code // ----------- annotation ------------ public: bool reachable; // computed by constructLRItemSets; true when nonterminal reachable from start symbol protected: // funcs virtual void internalPrintDDM(ostream &os) const; public: // funcs Symbol(LocString const &n, bool t, bool e = false); virtual ~Symbol(); Symbol(Flatten&); void xfer(Flatten &flat); // symmetric selectors bool isTerminal() const { return isTerm; } bool isNonterminal() const { return !isTerm; } // both terminals and nonterminals have ids; this gets the // id for whichever kind this object happens to be int getTermOrNontermIndex() const; // casting Terminal const &asTerminalC() const; // checks 'isTerminal' for cast safety Terminal &asTerminal() { return const_cast(asTerminalC()); } Nonterminal const &asNonterminalC() const; Nonterminal &asNonterminal() { return const_cast(asNonterminalC()); } // cast or NULL Terminal const *ifTerminalC() const; Terminal *ifTerminal() { return const_cast(ifTerminalC()); } Nonterminal const *ifNonterminalC() const; Nonterminal *ifNonterminal() { return const_cast(ifNonterminalC()); } // debugging // print as '$name: isTerminal=$isTerminal' (no newline) virtual void print(ostream &os) const; OSTREAM_OPERATOR(Symbol) // print 'token[type] name { dup.. del.. merge.. }' (with newlines) void printDDM(ostream &os) const; // true if any of the handlers were specified virtual bool anyDDM() const; virtual string toString() const { return string(name); } }; // I have several needs for serf lists of symbols, so let's use this for now typedef SObjList SymbolList; typedef SObjListIter SymbolListIter; typedef SObjListMutator SymbolListMutator; #define FOREACH_SYMBOL(list, iter) FOREACH_OBJLIST(Symbol, list, iter) #define MUTATE_EACH_SYMBOL(list, iter) MUTATE_EACH_OBJLIST(Symbol, list, iter) #define SFOREACH_SYMBOL(list, iter) SFOREACH_OBJLIST(Symbol, list, iter) #define SMUTATE_EACH_SYMBOL(list, iter) SMUTATE_EACH_OBJLIST(Symbol, list, iter) // format: "s1 s2 s3" string symbolSequenceToString(SymbolList const &list); // ---------------- Terminal -------------------- // something that only appears on the right-hand side of // productions, and is an element of the source language // NOTE: This is really a terminal *class*, in that it's possible // for several different tokens to be classified into the same // terminal class (e.g. "foo" and "bar" are both identifiers) class Terminal : public Symbol { // -------- representation --------- public: // data // whereas 'name' is the canonical name for the terminal class, // this field is an alias; for example, if the canonical name is // L2_EQUALEQUAL, the alias might be "=="; the alias should *not* // include actual double-quote characters // if the alias is "", there is no alias LocString alias; // parsgen-time conflict resolution: if a shift/reduce conflict // occurs between a production and a symbol, both with specified // precedence (not 0), then the one with the numerically higher // precedence will be used int precedence; // if, in the above scenario, the precedence values are the same, // then the associativity kind will be used to decide which to use AssocKind associativity; StringRef classifyParam; // name of parameter to 'classify' LocString classifyCode; // code to reclassify a token type // ------ annotation ------ public: // data // terminal class index - this terminal's id; -1 means unassigned int termIndex; protected: // funcs virtual void internalPrintDDM(ostream &os) const; public: // funcs Terminal(LocString const &name) // canonical name for terminal class : Symbol(name, true /*terminal*/), alias(), precedence(0), associativity(AK_NONASSOC), classifyParam(NULL), termIndex(-1) {} Terminal(Flatten &flat); void xfer(Flatten &flat); virtual void print(ostream &os) const; OSTREAM_OPERATOR(Terminal) virtual bool anyDDM() const; // return alias if defined, name otherwise virtual string toString(bool quoteAliases = false) const; }; typedef SObjList TerminalList; typedef SObjListIter TerminalListIter; #define FOREACH_TERMINAL(list, iter) FOREACH_OBJLIST(Terminal, list, iter) #define MUTATE_EACH_TERMINAL(list, iter) MUTATE_EACH_OBJLIST(Terminal, list, iter) #define SFOREACH_TERMINAL(list, iter) SFOREACH_OBJLIST(Terminal, list, iter) #define SMUTATE_EACH_TERMINAL(list, iter) SMUTATE_EACH_OBJLIST(Terminal, list, iter) // casting aggregates inline ObjList const &toObjList(ObjList const &list) { return reinterpret_cast< ObjListconst& >(list); } // format: "t1 t2 t3" string terminalSequenceToString(TerminalList const &list); // ----------------- TerminalSet ------------------- // used for the lookahead sets of LR items, and for the First() // sets of production RHSs class TerminalSet { private: // data unsigned char *bitmap; // (owner) bitmap of terminals, indexed by // terminal id; lsb of byte 0 is index 0 int bitmapLen; // # of bytes in 'bitmap' public: // data // printing customization: when non-NULL only print tokens if // it includes this token, and then *only* print this one static Terminal const *suppressExcept; private: // funcs void init(int numTerms); unsigned char *getByte(int terminalId) const; int getBit(int terminalId) const { return ((unsigned)terminalId % 8); } public: // funcs TerminalSet(int numTerms=0); // allocate new set, initially empty TerminalSet(TerminalSet const &obj); ~TerminalSet(); TerminalSet& operator= (TerminalSet const &obj) { copy(obj); return *this; } TerminalSet(Flatten&); void xfer(Flatten &flat); // call this to re-allocate at a new size; set is emptied void reset(int numTerms); // true when the # of symbols is 0; an unfinished state bool nullMap() const { return bitmap==NULL; } bool contains(int terminalId) const; // NOTE: can only compare dotted productions which have the // same number of symbols (assertion fail otherwise) bool isEqual(TerminalSet const &obj) const; void add(int terminalId); void remove(int terminalId); void clear(); void copy(TerminalSet const &obj); // lengths must be the same bool merge(TerminalSet const &obj); // union; returns true if merging changed set bool removeSet(TerminalSet const &obj); // intersect with complement; returns true if this changed set void print(ostream &os, Grammar const &g, char const *lead = ", ") const; }; // ---------------- Nonterminal -------------------- // something that can appear on the left-hand side of a production // (or, emptyString, since we classify that as a nonterminal also) class Nonterminal : public Symbol { // ---------- representation -------- public: StringRef mergeParam1; // param name for first alternative StringRef mergeParam2; // and 2nd alt LocString mergeCode; // code to resolve then StringRef keepParam; // name of parameter to 'keep' LocString keepCode; // code to decide whether to keep a reduction bool maximal; // if true, use maximal munch disambiguation SObjList subsets; // preferred subsets (for scannerless) protected: // funcs virtual void internalPrintDDM(ostream &os) const; public: // funcs Nonterminal(LocString const &name, bool isEmptyString=false); virtual ~Nonterminal(); Nonterminal(Flatten &flat); void xfer(Flatten &flat); void xferSerfs(Flatten &flat, Grammar &g); virtual void print(ostream &os, Grammar const *grammer = NULL) const; OSTREAM_OPERATOR(Nonterminal) virtual bool anyDDM() const; // ------ annotation ------ public: // data int ntIndex; // nonterminal index; see Grammar::computeWhatCanDeriveWhat bool cyclic; // true if this can derive itself in 1 or more steps TerminalSet first; // set of terminals that can be start of a string derived from 'this' TerminalSet follow; // set of terminals that can follow a string derived from 'this' Nonterminal *superset; // inverse of 'subsets' }; typedef SObjList NonterminalList; typedef SObjListIter NonterminalListIter; #define FOREACH_NONTERMINAL(list, iter) FOREACH_OBJLIST(Nonterminal, list, iter) #define MUTATE_EACH_NONTERMINAL(list, iter) MUTATE_EACH_OBJLIST(Nonterminal, list, iter) #define SFOREACH_NONTERMINAL(list, iter) SFOREACH_OBJLIST(Nonterminal, list, iter) #define SMUTATE_EACH_NONTERMINAL(list, iter) SMUTATE_EACH_OBJLIST(Nonterminal, list, iter) // casting aggregates inline ObjList const &toObjList(ObjList const &list) { return reinterpret_cast< ObjListconst& >(list); } // ---------------- Production -------------------- // a rewrite rule class Production { // ------ representation ------ public: // types class RHSElt { public: Symbol *sym; // (serf) rhs element symbol // tags applied to the symbols for purposes of unambiguous naming in // actions, and for self-commenting value as role indicators; an // empty tag ("") is allowed and means there is no tag LocString tag; // tag for this symbol; can be "" public: RHSElt(Symbol *s, LocString const &t) : sym(s), tag(t) {} ~RHSElt(); RHSElt(Flatten&); void xfer(Flatten &flat); void xferSerfs(Flatten &flat, Grammar &g); }; public: // data // fundamental context-free grammar (CFG) component Nonterminal * const left; // (serf) left hand side; must be nonterminal ObjList right; // right hand side; terminals & nonterminals int precedence; // precedence level for disambiguation (0 for none specified) TerminalSet *forbid; // (nullable owner) forbidden next tokens // user-supplied reduction action code LocString action; private: // funcs void computeDerived(); public: // funcs Production(Nonterminal *left, char const *leftTag); ~Production(); Production(Flatten &flat); void xfer(Flatten &flat); void xferSerfs(Flatten &flat, Grammar &g); // length *not* including emptySymbol, if present // UPDATE: I'm now disallowing emptySymbol from ever appearing in 'right' int rhsLength() const { return rhsLen; } // number of nonterminals on RHS int numRHSNonterminals() const; // true if the given symbol appears in 'right' bool rhsHasSymbol(Symbol const *sym) const; // retrieve the RHS as a list of symbols, rather than as a list of RHSElts void getRHSSymbols(SymbolList &output) const; // append a RHS symbol void append(Symbol *sym, LocString const &tag); // call this when production is built, so it can compute annotations // (this is called by GrammarAnalysis::initializeAuxData, from // inside runAnalyses) void finished(int numTerms); // find a symbol by tag; returns 1 for first RHS symbol, 2 for // second, etc.; returns -1 if the tag doesn't match anything int findTag(StringRef tag) const; // given an index as returned by 'findTaggedSymbol', translate that // back into a tag string symbolTag(int symbolIndex) const; // or translate a symbol index into a symbol Symbol const *symbolByIndexC(int symbolIndex) const; Symbol *symbolByIndex(int symbolIndex) { return const_cast(symbolByIndexC(symbolIndex)); } #if 0 // retrieve an item DottedProduction const *getDProdC(int dotPlace) const; DottedProduction *getDProd(int dotPlace) { return const_cast(getDProdC(dotPlace)); } #endif // 0 // add a terminal to the 'forbid' set void addForbid(Terminal *t, int totalNumTerminals); // print 'A -> B c D' (no newline) string toString(bool printType = true, bool printIndex = true) const; // this one prints 'B c D' for above example rule string rhsString(bool printTags = true, bool quoteAliases = false) const; void print(ostream &os) const; OSTREAM_OPERATOR(Production) // print entire input syntax, with newlines, e.g. // A -> B c D { return foo; } string toStringMore(bool printCode) const; // ------ annotation ------ private: // data int rhsLen; // right.count() public: // data int prodIndex; // unique production id TerminalSet firstSet; // First(RHS); computed by GrammarAnalysis::computeFirst }; typedef SObjList ProductionList; typedef SObjListIter ProductionListIter; #define FOREACH_PRODUCTION(list, iter) FOREACH_OBJLIST(Production, list, iter) #define MUTATE_EACH_PRODUCTION(list, iter) MUTATE_EACH_OBJLIST(Production, list, iter) #define SFOREACH_PRODUCTION(list, iter) SFOREACH_OBJLIST(Production, list, iter) #define SMUTATE_EACH_PRODUCTION(list, iter) SMUTATE_EACH_OBJLIST(Production, list, iter) typedef ObjList RHSEltList; typedef ObjListIter RHSEltListIter; typedef ObjListMutator RHSEltListMutator; // ---------------- Grammar -------------------- // represent a grammar: nonterminals, terminals, productions, and start-symbol class Grammar { // ------ representation ------ public: // data ObjList nonterminals; // (owner list) ObjList terminals; // (owner list) ObjList productions; // (owner list) Nonterminal *startSymbol; // (serf) a particular nonterminal // the special terminal for the empty string; does not appear in the // list of nonterminals or terminals for a grammar, but can be // referenced by productions, etc.; the decision to explicitly have // such a symbol, instead of letting it always be implicit, is // motivated by things like the derivability relation, where it's // nice to treat empty like any other symbol Nonterminal emptyString; // sections of verbatim code emitted into the interface file, before // the parser context class body ObjList verbatim; // name of the class into which the action functions are placed LocString actionClassName; // verbatim action class declaration, and additional codes from // extension modules to append to it (but see note of 11/13/04 // in grampar.cc) ObjList actionClasses; // code emitted into the implementation file at the end ObjList implVerbatim; // ---- declarative options ---- // name of the target language; nominally "C++" string targetLang; // when true, the default dup/del is what's expected for a // garbage-collected system: dup() is the identity function, // and del() is a no-op bool useGCDefaults; // when true, unspecified merge() functions abort() bool defaultMergeAborts; // expected numbers of various anomalies; -1 means no // expectation has been supplied; this informtion is used // to control what is reported after grammar analysis int expectedSR; // shift/reduce conflicts int expectedRR; // reduce/reduce conflicts int expectedUNRNonterms; // # unreachable nonterminals int expectedUNRTerms; // # unreachable terminals public: // funcs Grammar(); // set everything manually ~Grammar(); // read/write as binary file void xfer(Flatten &flat); // simple queries int numTerminals() const; int numNonterminals() const; // ---- building a grammar ---- // declare a new token exists, with name and optional alias; // return false if it's already declared bool declareToken(LocString const &symbolName, int code, LocString const &alias); // add a new production; the rhs arg list must be terminated with a NULL //void addProduction(Nonterminal *lhs, Symbol *rhs, ...); // add a pre-constructed production void addProduction(Production *prod); // ---------- outputting a grammar -------------- // print the list of symbols with type annotations void printSymbolTypes(ostream &os) const; // print the current list of productions void printProductions(ostream &os, bool printCode=true) const; // emit C++ code to construct this grammar later void emitSelfCC(ostream &os) const; // ---- whole-grammar stuff ---- // after adding all rules, check that all nonterminals have // at least one rule; also checks referential integrity // in actions and conditions; throw exception if there is a // problem void checkWellFormed() const; // output grammar in Bison's syntax // (coincidentally, when bison dumps its table with '-v', its table // dump syntax is similar to my input syntax) void printAsBison(ostream &os) const; // ---- symbol access ---- #define SYMBOL_ACCESS(Thing) \ /* retrieve, return NULL if not there */ \ Thing const *find##Thing##C(char const *name) const; \ Thing *find##Thing(char const *name) \ { return const_cast(find##Thing##C(name)); } \ \ /* retrieve, or create it if not already there */ \ Thing *getOrMake##Thing(LocString const &name); SYMBOL_ACCESS(Symbol) // findSymbolC, findSymbol, getOrMakeSymbol SYMBOL_ACCESS(Terminal) // findTerminal{C,}, getOrMakeTerminal SYMBOL_ACCESS(Nonterminal) // findNonterminal{C,}, getOrMakeNonterminal #undef SYMBOL_ACCESS // map a production to a unique index int getProductionIndex(Production const *prod) const; }; #endif // __GRAMMAR_H