/* agrammar.lex            see license.txt for copyright and terms of use
 * lexical analyzer for my AST input format */

/* ----------------- C definitions -------------------- */
%{

// pull in my declaration of the lexer class -- this defines
// the additional lexer state, some of which is used in the
// action rules below
#include "gramlex.h"

// pull in the bison-generated token codes
#include "agrampar.codes.h"

#include <string.h>     // strchr, strrchr

// for maintaining column count
//   TOKEN_START: remember where the current token begins
//   UPD_COL:     advance the column by the length of the matched text
//   TOK_UPD_COL: both of the above, for ordinary single-line tokens
#define TOKEN_START  tokenStartLoc = fileState.loc /* user ; */
#define UPD_COL      advCol(yyleng)  /* user ; */
#define TOK_UPD_COL  TOKEN_START; UPD_COL  /* user ; */

%}


/* -------------------- flex options ------------------ */
/* no wrapping is needed; setting this means we don't have to link with libfl.a */
%option noyywrap

/* don't use the default-echo rules */
%option nodefault

/* generate a c++ lexer */
%option c++

/* and I will define the class */
%option yyclass="GrammarLexer"


/* ------------------- definitions -------------------- */
/* any character, including newline */
ANY       (.|"\n")

/* any character except newline */
ANYBUTNL  .

/* starting character in a name */
LETTER    [a-zA-Z_]

/* starting character in a numeric literal */
DIGIT     [0-9]

/* double-quote */
DQUOTE    "\""

/* character that can appear in a quoted string */
/* (I currently don't have any backslash codes, but I want to
 * leave open that possibility, for now backslashes are illegal) */
STRCHR    [^\n\\\"]

/* whitespace that doesn't cross a line boundary */
SLWHITE   [ \t]


/* --------------- start conditions ------------------- */
/* C_COMMENT: inside a C-style comment
 * EMBED:     collecting embedded (verbatim) text
 * INITVAL:   just saw the '=' that ended an embedded declaration */
%x C_COMMENT
%x EMBED
%x INITVAL


/* ---------------------- rules ----------------------- */
%%

  /* -------- whitespace ------ */
"\n" {
  newLine();
}

[ \t\f\v]+ {
  UPD_COL;
}

  /* -------- comments -------- */
"/""*" {
  /* C-style comments */
  TOKEN_START;
  UPD_COL;
  BEGIN(C_COMMENT);
}

<C_COMMENT>{
  "*/" {
    /* end of comment */
    UPD_COL;
    BEGIN(INITIAL);
  }

  . {
    /* anything but slash-star or newline -- eat it */
    UPD_COL;
  }

  "\n" {
    newLine();
  }

  <<EOF>> {
    UPD_COL;      // <<EOF>> yyleng is 1!
    errorUnterminatedComment();
    return TOK_EOF;
  }
}

"//".*"\n" {
  /* C++-style comment -- eat it */
  TOKEN_START;
  advCol(yyleng-1);   // don't count the newline
  newLine();          // get it here
}

"//".* {
  /* C++-style comment that runs to end of file without a trailing
   * newline.  Longest-match means the rule above still wins whenever
   * a newline is present, so this only fires at EOF; without it the
   * slashes would be reported as illegal characters and the comment
   * text would be lexed as tokens. */
  TOKEN_START;
  UPD_COL;
}

  /* -------- punctuators, operators, keywords --------- */
  /* note: "{" and ")" are handled further down, because they can
   * start embedded code */
"}"                TOK_UPD_COL;  return TOK_RBRACE;
";"                TOK_UPD_COL;  return TOK_SEMICOLON;
"->"               TOK_UPD_COL;  return TOK_ARROW;
"("                TOK_UPD_COL;  return TOK_LPAREN;
","                TOK_UPD_COL;  return TOK_COMMA;

"<"                TOK_UPD_COL;  return TOK_LANGLE;
">"                TOK_UPD_COL;  return TOK_RANGLE;
"*"                TOK_UPD_COL;  return TOK_STAR;
"&"                TOK_UPD_COL;  return TOK_AMPERSAND;
"="                TOK_UPD_COL;  return TOK_EQUALS;
":"                TOK_UPD_COL;  return TOK_COLON;

"class"            TOK_UPD_COL;  return TOK_CLASS;
"option"           TOK_UPD_COL;  return TOK_OPTION;
"new"              TOK_UPD_COL;  return TOK_NEW;
"enum"             TOK_UPD_COL;  return TOK_ENUM;

  /* --------- embedded text --------- */
("public"|"protected"|"private"|"ctor"|"dtor"|"pure_virtual")("(")? {
  TOK_UPD_COL;

  if (prevToken==TOK_COLON || prevToken==TOK_COMMA) {
    // HACK: Normally, access control keywords introduce a verbatim
    // section.  But I want to also use them in the syntax for base
    // classes, to be similar to C++.  But that means that I have to
    // somehow distinguish those contexts.  As it happens, the
    // previous token can be used to make the distinction.  So, here
    // we are in that context, so don't do verbatim stuff.
    //
    // Of course, this is an awfully fragile approach.  I'd like to
    // redesign the verbatim-field syntax at some point to eliminate
    // this problem, but since I don't know what a good syntax might
    // be, I'll leave things alone for now.

    // a paren is not allowed in the base class context
    if (yytext[yyleng-1] == '(') {
      err("don't put a paren after a base class access control keyword");
    }
  }
  else {
    // the keyword introduces a verbatim section

    // is a paren included?
    if (yytext[yyleng-1] == '(') {
      // don't drop into embedded just yet; wait for the ')'
      embedStart = ')';

      // give the '(' back so it is lexed as its own token, and
      // undo the column advance that was made for it
      yyless(yyleng-1);
      advCol(-1);
    }
    else {
      BEGIN(EMBED);
    }

    embedded->reset();
    embedFinish = ';';
    allowInit = yytext[0]=='p';    // any of the keywords starting with 'p'
    embedMode = TOK_EMBEDDED_CODE;
  }

  // decode which keyword was matched: 'c' and 'd' are unique first
  // letters; the four 'p' keywords differ in their third letter
  return yytext[0]=='c'?   TOK_CTOR :
         yytext[0]=='d'?   TOK_DTOR :
         yytext[2] == 'b'? TOK_PUBLIC :
         yytext[2] == 'o'? TOK_PROTECTED :
         yytext[2] == 'i'? TOK_PRIVATE :
           /*[2] == 'r'*/  TOK_PURE_VIRTUAL ;
}

("verbatim"|"impl_verbatim") {
  TOK_UPD_COL;

  // need to see one more token before we begin embedded processing
  embedStart = '{';
  embedFinish = '}';
  allowInit = false;

  embedded->reset();
  embedMode = TOK_EMBEDDED_CODE;
  return yytext[0]=='v'? TOK_VERBATIM :
                         TOK_IMPL_VERBATIM ;
}

"custom" {
  TOK_UPD_COL;

  // as for "verbatim": the embedded text starts at the next '{'
  embedStart = '{';
  embedFinish = '}';
  allowInit = false;

  embedded->reset();
  embedMode = TOK_EMBEDDED_CODE;
  return TOK_CUSTOM;
}

  /* punctuation that can start embedded code */
("{"|")") {
  TOK_UPD_COL;
  if (yytext[0] == embedStart) {
    BEGIN(EMBED);
  }
  return yytext[0]=='{'? TOK_LBRACE : TOK_RPAREN;
}

  /* no TOKEN_START here; we'll use the tokenStartLoc that
   * was computed in the opening punctuation */
<EMBED>{
  /* no special significance to lexer */
  [^;}=\n]+ {
    UPD_COL;
    embedded->handle(yytext, yyleng, embedFinish);
  }

  "\n" {
    newLine();
    embedded->handle(yytext, yyleng, embedFinish);
  }

  /* possibly closing delimiter */
  ("}"|";"|"=") {
    UPD_COL;

    // we're done if we're at a zero nesting level and the
    // delimiter matches ...
    if (embedded->zeroNesting() && embedFinishMatches(yytext[0])) {
      // done
      BEGIN(INITIAL);

      if (yytext[0] == '=') {
        // switch to a special mode that will handle the '=' and
        // jump right back into embedded mode
        BEGIN(INITVAL);
      }
      else {
        // turn off embedded detection
        embedStart = 0;
      }

      // put back delimiter so parser will see it
      yyless(yyleng-1);
      advCol(-1);

      // in the abstract grammar we don't have embedded expressions
      embedded->exprOnly = false;

      // and similarly for the other flag
      embedded->isDeclaration = (embedFinish == ';');

      // caller can get text from embedded->text
      return embedMode;
    }
    else {
      // embedded delimiter, mostly ignore it
      embedded->handle(yytext, yyleng, embedFinish);
    }
  }
}

<INITVAL>{
  "=" {
    // yield the '=', switch back into embedded
    TOK_UPD_COL;
    BEGIN(EMBED);
    embedded->reset();
    allowInit = false;
    return TOK_EQUALS;
  }

  {ANY} {
    xfailure("somehow got a char other than '=' in INITVAL state");
  }
}

  /* -------- name literal --------- */
{LETTER}({LETTER}|{DIGIT})* {
  // get text from yytext and yyleng
  TOK_UPD_COL;
  return TOK_NAME;
}

  /* --------- integer literal --------- */
{DIGIT}+ {
  TOK_UPD_COL;
  return TOK_INTLIT;
}

  /* --------- illegal ------------- */
{ANY} {
  TOK_UPD_COL;
  errorIllegalCharacter(yytext[0]);
}


%%
/* -------------------- additional C code -------------------- */

/* true if 'code' is the token code used for embedded text */
bool isAGramlexEmbed(int code)
{
  return code == TOK_EMBEDDED_CODE;
}