// gramanl.h see license.txt for copyright and terms of use // grammar analysis module; separated from grammar.h to // reduce mixing of representation and algorithm; this // module should be entirely algorithm // Author: Scott McPeak, April 2000 // Updates: March 2002 // references: // // [ASU] Aho, Sethi Ullman. Compilers: Principles, // Techniques, and Tools. Addison-Wesley, // Reading, MA. 1986. Second printing (3/88). // [A classic reference for LR parsing.] #ifndef __GRAMANL_H #define __GRAMANL_H #include "grammar.h" // Grammar and friends #include "ohashtbl.h" // OwnerHashTable #include "okhashtbl.h" // OwnerKHashTable #include "okhasharr.h" // OwnerKHashArray #include "glrconfig.h" // SOURCELOC #include "parsetables.h" // ParseTables, GrowArray // forward decls class Bit2d; // bit2d.h class BitArray; // bitarray.h class EmitCode; // emitcode.h // this file class GrammarAnalysis; // ---------------- DottedProduction -------------------- // a production, with an indicator that says how much of this // production has been matched by some part of the input string // (exactly which part of the input depends on where this appears // in the algorithm's data structures) class DottedProduction { // ------ representation ------ private: // data Production const *prod; // (serf) the base production int dot; // 0 means it's before all RHS symbols, 1 means after first, etc. // -------- annotation ---------- private: // data // performance optimization: NULL if dot at end, or else pointer // to the symbol right after the dot Symbol *afterDot; public: // data // First of the sentential form that follows the dot; this set // is computed by GrammarAnalysis::computeDProdFirsts TerminalSet firstSet; // also computed by computeDProdFirsts, this is true if the // sentential form can derive epsilon (the empty string) bool canDeriveEmpty; // during item set closure, I need a way to map from dotted prods to // the items which use them; so rather than use a hash table, I'll // just annotate the dprods themselves with backpointers; these // backpointers *must* be maintained as NULL when there's no // association mutable class LRItem *backPointer; private: // funcs void init(); public: // funcs //DottedProduction(DottedProduction const &obj); // need the grammar passed during creation so we know how big // to make 'lookahead' //DottedProduction(GrammarAnalysis const &g); // for later filling-in //DottedProduction(/*GrammarAnalysis const &g,*/ Production *p, int d); DottedProduction(); // for creating arrays of them ~DottedProduction(); // no point to flattening these because they're easily re-computable #if 0 DottedProduction(Flatten&); void xfer(Flatten &flat); void xferSerfs(Flatten &flat, GrammarAnalysis &g); #endif // 0 // simple queries Production const *getProd() const { return prod; } int getDot() const { return dot; } bool isDotAtStart() const { return dot==0; } bool isDotAtEnd() const { return afterDot==NULL; } // no need for equality now, since all DPs with the same // prod/dot are shared //bool isEqual(DottedProduction const &obj) const; //bool operator== (DottedProduction const &obj) const; // call this to change prod and dot void setProdAndDot(Production const *p, int d); // dot must not be at the start (left edge) Symbol const *symbolBeforeDotC() const; Symbol *symbolBeforeDot() { return const_cast(symbolBeforeDotC()); } // dot must not be at the end (right edge) Symbol const *symbolAfterDotC() const { return afterDot; } Symbol *symbolAfterDot() { return const_cast(symbolAfterDotC()); } // print to cout as 'A -> B . c D' (no newline) void print(ostream &os/*, GrammarAnalysis const &g*/) const; OSTREAM_OPERATOR(DottedProduction) }; // lists of dotted productions typedef ObjList DProductionList; typedef ObjListIter DProductionListIter; typedef SObjList SDProductionList; typedef SObjListIter SDProductionListIter; #define FOREACH_DOTTEDPRODUCTION(list, iter) FOREACH_OBJLIST(DottedProduction, list, iter) #define MUTATE_EACH_DOTTEDPRODUCTION(list, iter) MUTATE_EACH_OBJLIST(DottedProduction, list, iter) #define SFOREACH_DOTTEDPRODUCTION(list, iter) SFOREACH_OBJLIST(DottedProduction, list, iter) #define SMUTATE_EACH_DOTTEDPRODUCTION(list, iter) SMUTATE_EACH_OBJLIST(DottedProduction, list, iter) // --------------- LRItem --------------- // a dotted production with a lookahead; whereas each production // has a fixed number of dotted versions of that production, there // can be lots of items, because of the differing lookahead sets // (I prefer the name "LRItem" to simply "Item" because the latter // easily collides with other uses) class LRItem { public: // data DottedProduction const *dprod; // (serf) production and dot position TerminalSet lookahead; // lookahead symbols public: // funcs LRItem(LRItem const &obj); ~LRItem(); // need 'numTerms' to tell how big to make 'lookahead' LRItem(int numTerms, DottedProduction const *dp); LRItem(Flatten&); void xfer(Flatten &flat); void xferSerfs(Flatten &flat, GrammarAnalysis &g); // comparison static int diff(LRItem const *a, LRItem const *b, void*); bool equalNoLA(LRItem const &obj) const { return dprod == obj.dprod; } // manipulate the lookahead set bool laContains(int terminalId) const { return lookahead.contains(terminalId); } void laAdd(int terminalId) { lookahead.add(terminalId); } void laRemove(int terminalId) { lookahead.remove(terminalId); } void laCopy(LRItem const &obj) { lookahead.copy(obj.lookahead); } bool laMerge(LRItem const &obj) // returns true if merging changed lookahead { return lookahead.merge(obj.lookahead); } bool laIsEqual(LRItem const &obj) const { return lookahead.isEqual(obj.lookahead); } // pass-thru queries into 'dprod' Production const *getProd() const { return dprod->getProd(); } int getDot() const { return dprod->getDot(); } bool isDotAtStart() const { return dprod->isDotAtStart(); } bool isDotAtEnd() const { return dprod->isDotAtEnd(); } Symbol const *symbolBeforeDotC() const { return dprod->symbolBeforeDotC(); } Symbol const *symbolAfterDotC() const { return dprod->symbolAfterDotC(); } int prodIndex() const { return getProd()->prodIndex; } // stuff for insertion into a hash table static unsigned hash(DottedProduction const *key); static DottedProduction const *dataToKey(LRItem *dp); static bool dpEqual(DottedProduction const *key1, DottedProduction const *key2); // true if this item is "A -> alpha * t beta" bool isExtendingShift(Nonterminal const *A, Terminal const *t) const; void print(ostream &os, GrammarAnalysis const &g) const; }; // ---------------- ItemSet ------------------- // a set of dotted productions, and the transitions between // item sets, as in LR(0) set-of-items construction class ItemSet { public: // intended to be read-only public // kernel items: the items that define the set; except for // the special case of the initial item in the initial state, // the kernel items are distinguished by having the dot *not* // at the left edge ObjList kernelItems; // nonkernel items: those derived as the closure of the kernel // items by expanding symbols to the right of dots; here I am // making the choice to materialize them, rather than derive // them on the spot as needed (and may change this decision) ObjList nonkernelItems; private: // data // transition function (where we go on shifts); NULL means no transition // Map : (Terminal id or Nonterminal id) -> ItemSet* ItemSet **termTransition; // (owner ptr to array of serf ptrs) ItemSet **nontermTransition; // (owner ptr to array of serf ptrs) // bounds for above int terms; int nonterms; // profiler reports I'm spending significant time rifling through // the items looking for those that have the dot at the end; so this // array will point to all such items LRItem const **dotsAtEnd; // (owner ptr to array of serf ptrs) int numDotsAtEnd; // number of elements in 'dotsAtEnd' // profiler also reports I'm still spending time comparing item sets; this // stores a CRC of the numerically sorted kernel item pointer addresses, // concatenated into a buffer of sufficient size unsigned long kernelItemsCRC; // need to store this, because I can't compute it once I throw // away the items Symbol const *stateSymbol; public: // data // numerical state id, should be unique among item sets // in a particular grammar's sets StateId id; // it's useful to have a BFS tree superimposed on the transition // graph; for example, it makes it easy to generate sample inputs // for each state. so we store the parent pointer; we can derive // child pointers by looking at all outgoing transitions, and // filtering for those whose targets' parent pointers equal 'this'. // the start state's parent is NULL, since it is the root of the // BFS tree ItemSet *BFSparent; // (serf) private: // funcs int bcheckTerm(int index) const; int bcheckNonterm(int index) const; ItemSet *&refTransition(Symbol const *sym); void allocateTransitionFunction(); Symbol const *computeStateSymbolC() const; void deleteNonReductions(ObjList &list); public: // funcs ItemSet(StateId id, int numTerms, int numNonterms); ~ItemSet(); ItemSet(Flatten&); void xfer(Flatten &flat); void xferSerfs(Flatten &flat, GrammarAnalysis &g); // ---- item queries ---- // the set of items names a symbol as the symbol used // to reach this state -- namely, the symbol that appears // to the left of a dot. this fn retrieves that symbol // (if all items have dots at left edge, returns NULL; this // would be true only for the initial state) Symbol const *getStateSymbolC() const { return stateSymbol; } // equality is defined as having the same items (basic set equality) bool operator== (ItemSet const &obj) const; // sometimes it's convenient to have all items mixed together // (CONSTNESS: allows modification of items...) void getAllItems(SObjList &dest, bool nonkernel=true) const; // used for sorting by id static int diffById(ItemSet const *left, ItemSet const *right, void*); // ---- transition queries ---- // query transition fn for an arbitrary symbol; returns // NULL if no transition is defined ItemSet const *transitionC(Symbol const *sym) const; ItemSet *transition(Symbol const *sym) { return const_cast(transitionC(sym)); } // alternate interface; also might return NULL ItemSet const *getTermTransition(int termId) const { return termTransition[bcheckTerm(termId)]; } ItemSet const *getNontermTransition(int nontermId) const { return nontermTransition[bcheckNonterm(nontermId)]; } // get the list of productions that are ready to reduce, given // that the next input symbol is 'lookahead' (i.e. in the follow // of a production's LHS); parsing=true means we are actually // parsing input, so certain tracing output is appropriate; // 'reductions' is a list of const Productions void getPossibleReductions(ProductionList &reductions, Terminal const *lookahead, bool parsing) const; // assuming this itemset has at least one reduction ready (an assertion // checks this), retrieve the first one Production const *getFirstReduction() const; // ---- item mutations ---- // add a kernel item; used while constructing the state void addKernelItem(LRItem * /*owner*/ item); // after adding all kernel items, call this void sortKernelItems(); // add a nonkernel item; used while computing closure; this // item must not already be in the item set void addNonkernelItem(LRItem * /*owner*/ item); // computes things derived from the item set lists: // dotsAtEnd, numDotsAtEnd, kernelItemsCRC, stateSymbol; // do this after adding things to the items lists void changedItems(); // a part of 'changedItems', this is used in a specialized way // during LR item set construction; it leaves 'this' in a somewhat // half-baked state (if changedItems is not also called), so some // care needs to be taken when using this directly void computeKernelCRC(GrowArray &array); // remove the reduce using 'prod' on lookahead 'sym; // calls 'changedItems' internally void removeReduce(Production const *prod, Terminal const *sym); // throw away information not needed during parsing void throwAwayItems(); // 'dest' has already been established to have the same kernel // items as 'this' -- so merge all the kernel lookahead items // of 'this' into 'dest'; return 'true' if any changes were made // to 'dest' bool mergeLookaheadsInto(ItemSet &dest) const; // true if this itemset has an item "A -> alpha * t beta", i.e. // one that would extend 'A' by shifting 't' bool hasExtendingShift(Nonterminal const *A, Terminal const *t) const; // ---- transition mutations ---- // set transition on 'sym' to be 'dest' void setTransition(Symbol const *sym, ItemSet *dest); // remove the the shift on 'sym' void removeShift(Terminal const *sym); // ------ hashtable stuff -------- static ItemSet const *dataToKey(ItemSet *data); static unsigned hash(ItemSet const *key); static bool equalKey(ItemSet const *key1, ItemSet const *key2); // ---- debugging ---- void writeGraph(ostream &os, GrammarAnalysis const &g) const; void print(ostream &os, GrammarAnalysis const &g, bool nonkernel=true) const; }; // ---------------------- GrammarAnalysis ------------------- class GrammarAnalysis : public Grammar { protected: // data // if entry i,j is true, then nonterminal i can derive nonterminal j // (this is a graph, represented (for now) as an adjacency matrix) enum { emptyStringIndex = 0 }; Bit2d *derivable; // (owner) // index the symbols on their integer ids Nonterminal **indexedNonterms; // (owner -> serfs) ntIndex -> Nonterminal Terminal **indexedTerms; // (owner -> serfs) termIndex -> Terminal // numNonterms==Grammar::numNonterminals(), numTerms==Grammar::numTerminals() int numNonterms; // length of 'indexedNonterms' array int numTerms; // " " terms " // during itemSetClosure, profiling reports we spend a lot of time // walking the list of productions looking for those that have a given // symbol on the LHS; so let's index produtions by LHS symbol index; // this array has 'numNonterms' elements, mapping each nonterminal to // the list of productions with that nonterminal on the LHS SObjList *productionsByLHS; // (owner ptr to array) // map of production x dotPosition -> DottedProduction; // each element of the 'dottedProds' array is a pointer to an // array of DottedProduction objects DottedProduction **dottedProds; // (owner ptr to array of owners) // index of productions by id Production **indexedProds; // (owner -> serfs) prodIndex -> Production int numProds; // length of 'dottedProds' // only true after initializeAuxData has been called bool initialized; // used to assign itemset ids while the item sets are being // initially constructed; later, they get renumbered into a // canonical order int nextItemSetId; // the LR parsing tables ObjList itemSets; // distinguished start state; NOTE: much of the grammar analysis // code currently assumes (and checks) that state 0 is the start // state, so if you want to do something different, that code might // need to be changed ItemSet *startState; // (serf) public: // data // true if any nonterminal can derive itself (with no extra symbols // surrounding it) in 1 or more steps bool cyclic; // symbol of interest; various diagnostics are printed when // certain things happen with it (e.g. the first application // is to print whenever something is added to this sym's // follow) Symbol const *symOfInterest; // incremented each time we encounter an error that we can recover from int errors; // parse tables ParseTables *tables; // (owner) private: // funcs // ---- analyis init ---- // call this after grammar is completely built void initializeAuxData(); void computeIndexedNonterms(); void computeIndexedTerms(); void computeProductionsByLHS(); void computeReachable(); void computeReachableDFS(Nonterminal *nt); void resetFirstFollow(); void computeDProdFirsts(); void computeSupersets(); // ---- dotted productions ---- void createDottedProductions(); void deleteDottedProductions(); DottedProduction const *getDProd(Production const *prod, int posn) const; DottedProduction *getDProd_nc(Production const *prod, int posn) { return const_cast(getDProd(prod, posn)); } // given a dprod, yield the one obtained by moving the dot one // place to the right DottedProduction const *nextDProd(DottedProduction const *dp) const #ifdef NDEBUG { return dp+1; } // take advantage of physical co-location #endif ; // debug version checks bounds // ---- derivability ---- // iteratively compute every pair A,B such that A can derive B void computeWhatCanDeriveWhat(); void initDerivableRelation(); // add a derivability relation; returns true if this makes a change bool addDerivable(Nonterminal const *left, Nonterminal const *right); bool addDerivable(int leftNtIndex, int rightNtIndex); // private derivability interface bool canDerive(int leftNtIndex, int rightNtIndex) const; bool sequenceCanDeriveEmpty(RHSEltList const &list) const; bool iterSeqCanDeriveEmpty(RHSEltListIter iter) const; // ---- First ---- void computeFirst(); //bool addFirst(Nonterminal *NT, Terminal *term); void firstOfSequence(TerminalSet &destList, RHSEltList const &sequence); void firstOfIterSeq(TerminalSet &destList, RHSEltListIter sym); // ---- Follow ---- void computeFollow(); //bool addFollow(Nonterminal *NT, Terminal *term); // ---- LR item sets ---- ItemSet *makeItemSet(); void disposeItemSet(ItemSet *is); void moveDotNoClosure(ItemSet const *source, Symbol const *symbol, ItemSet *dest, ObjList &unusedTail, GrowArray &array); ItemSet *findItemSetInList(ObjList &list, ItemSet const *itemSet); static bool itemSetsEqual(ItemSet const *is1, ItemSet const *is2); void constructLRItemSets(); void lrParse(char const *input); void handleShiftReduceConflict( bool &keepShift, bool &keepReduce, bool &dontWarn, ItemSet const *state, Production const *prod, Terminal const *sym); void resolveConflicts( ItemSet const *state, // parse state in which the actions are possible Terminal const *sym, // lookahead symbol for these actions ItemSet const *&shiftDest, // (inout) if non-NULL, the state to which we can shift ProductionList &reductions, // (inout) list of possible reductions bool allowAmbig, // if false, always return at most 1 action bool &printedConflictHeader, // (inout) true once we've printed the state header int &sr, int &rr); // (inout) counts of S/R and R/R conflicts, resp. void computeParseTables(bool allowAmbig); int subsetDirectiveResolution( ItemSet const *state, // parse state in which the actions are possible Terminal const *sym, // lookahead symbol for these actions ProductionList &reductions); // list to try to cut down void renumberStates(); static int renumberStatesDiff (ItemSet const *left, ItemSet const *right, void *vgramanl); static int arbitraryProductionOrder (Production const *left, Production const *right, void*); static int arbitraryRHSEltOrder (Production::RHSElt const *left, Production::RHSElt const *right, void*); void computeBFSTree(); // misc void computePredictiveParsingTable(); // non-const because have to add productions to lists void topologicalSort(NtIndex *order, int &nextOrdinal, NtIndex current, BitArray &seen); // the inverse of transition: map a target state to the symbol that // would transition to that state (from the given source state) Symbol const *inverseTransitionC(ItemSet const *source, ItemSet const *target) const; // sample input helpers void leftContext(SymbolList &output, ItemSet const *state) const; bool rewriteAsTerminals(TerminalList &output, SymbolList const &input) const; bool rewriteAsTerminalsHelper(TerminalList &output, SymbolList const &input, ProductionList &reductionStack) const; bool rewriteSingleNTAsTerminals(TerminalList &output, Nonterminal const *nonterminal, ProductionList &reductionStack) const; // let's try this .. it needs to access 'itemSets' friend void ItemSet::xferSerfs(Flatten &flat, GrammarAnalysis &g); void singleItemClosure(OwnerKHashTable &finished, ArrayStack &worklist, //OwnerKHashArray &workhash, LRItem const *item, TerminalSet &scratchSet); public: // funcs GrammarAnalysis(); ~GrammarAnalysis(); // access symbols by index Terminal const *getTerminal(int index) const; Nonterminal const *getNonterminal(int index) const; Production const *getProduction(int index) const; ItemSet const *getItemSet(int index) const; int numItemSets() const { return nextItemSetId; } // faster access to counts int numTerminals() const { return numTerms; } int numNonterminals() const { return numNonterms; } // binary read/write void xfer(Flatten &flat); // essentially, my 'main()' while experimenting void exampleGrammar(); // overrides base class to add a little bit of the // annotated info void printProductions(ostream &os, bool printCode=true) const; // print lots of stuff void printProductionsAndItems(ostream &os, bool printCode=true) const; // when grammar is built, this runs all analyses and stores // the results in this object's data fields; write the LR item // sets to the given file (or don't, if NULL) void runAnalyses(char const *setsFname); // print the item sets to a stream (optionally include nonkernel items) void printItemSets(ostream &os, bool nonkernel) const; // given a grammar, replace all of its actions with actions that // will build a straightforward parse tree using the facilities // of ptreenode.h; the rules will need the user to already have // done some necessary work in the verbatim preamble, such as // #including ptreenode.h void addTreebuildingActions(); // ---- grammar queries ---- bool canDerive(Nonterminal const *lhs, Nonterminal const *rhs) const; bool canDeriveEmpty(Nonterminal const *lhs) const; bool firstIncludes(Nonterminal const *NT, Terminal const *term) const; bool followIncludes(Nonterminal const *NT, Terminal const *term) const; // ---- sample inputs and contexts ---- string sampleInput(ItemSet const *state) const; string leftContextString(ItemSet const *state) const; // ---- moved out of private ---- void itemSetClosure(ItemSet &itemSet); DottedProduction const *getDProdIndex(int prodIndex, int posn) const; }; // in gramexpl.cc: interactive grammar experimentation system void grammarExplorer(GrammarAnalysis &g); #endif // __GRAMANL_H