// mlsstr.cc see license.txt for copyright and terms of use // code for mlsstr.h // based on ccsstr.cc #include "mlsstr.h" // this module #include "xassert.h" // xassert #include "exc.h" // xformat #include "strutil.h" // string, replace #include // cout #include // isspace MLSubstrate::MLSubstrate(ReportError *err) : EmbeddedLang(err) { reset(); } void MLSubstrate::reset(int initNest) { state = ST_NORMAL; delims.empty(); nestingBias = initNest; comNesting = 0; prev = 0; text.setlength(0); } MLSubstrate::~MLSubstrate() {} void MLSubstrate::handle(char const *str, int len, char finalDelim) { text.append(str, len); for (; len>0; len--,str++) { process_char_again: switch (state) { case ST_NORMAL: switch (*str) { case '{': case '(': case '[': if (!inComment()) { delims.push(*str); } break; case '}': case ')': case ']': if (inComment()) { if (prev == '*' && *str == ')') { comNesting--; } } else if (nesting() == 0) { err->reportError(stringc << "unexpected closing delimiter `" << *str << "' -- probably due to missing `" << finalDelim << "'"); } else { char o = delims.top(); char c = *str; if (!(( o=='{' && c=='}' ) || ( o=='(' && c==')' ) || ( o=='[' && c==']' ))) { err->reportError(stringc << "opening delimiter `" << o << "' does not match closing delimiter `" << c << "'"); } delims.pop(); } break; case '\"': state = ST_STRING; break; case '\'': if (isalnum(prev) || prev=='_' || prev=='\'') { // this is a prime on a variable name; stay in normal mode } else { state = ST_APOSTROPHE1; } break; case '*': if (prev == '(') { if (comNesting == 0) { // undo 'delims.push()' from the '(' xassert(nesting() > 0); delims.pop(); } comNesting++; // if the next char is ')', i.e. input was "(*)", do // not allow it to use this '*' to finish the comment prev = 0; continue; } break; } break; case ST_APOSTROPHE1: // While the OCaml manual does not specify how to disambiguate // between character literals and type variables, the ocaml // lexer (parsing/lexer.mll) uses (approximately) the // following rules: // // - If the input is (apostrophe, char, apostrophe), it is // a character literal. // - If the input is (apostrophe, backslash), it is the start // of a a character literal. // - Any other occurrence of apostrophe starts a type variable. if (*str == '\\') { state = ST_CHAR; } else { state = ST_APOSTROPHE2; } break; case ST_APOSTROPHE2: if (*str == '\'') { state = ST_NORMAL; // finishes the character literal } else { // whole thing is a type variable; but if *str is something // like ')' then we need to consider its effects on nesting state = ST_NORMAL; goto process_char_again; } break; case ST_STRING: case ST_CHAR: if (prev != '\\') { if ((state == ST_STRING && *str == '\"') || (state == ST_CHAR && *str == '\'')) { state = ST_NORMAL; } else if (*str == '\n') { // actually, ocaml allows unescaped newlines in string literals //err->reportError("unterminated string or char literal"); } } else { prev = 0; // the backslash cancels any specialness of *str continue; } break; #if 0 // old case ST_COMMENT: if (prev == '(' && *str == '*') { comNesting++; prev = 0; // like above continue; } else if (prev == '*' && *str == ')') { xassert(comNesting >= 0); if (comNesting == 0) { // done with comment state = ST_NORMAL; } else { // decrease nesting comNesting--; } } break; #endif // 0 default: xfailure("unknown state"); } prev = *str; } } bool MLSubstrate::zeroNesting() const { return state == ST_NORMAL && nesting() == 0 && !inComment(); } string MLSubstrate::getFuncBody() const { return text; } // 4/29/04: I have no idea if this is right or not.. this is the // definition from ccsstr.cc. string MLSubstrate::getDeclName() const { // go with the rather inelegant heuristic that the word // just before the first '(' is the function's name char const *start = text.c_str(); char const *p = start; // find first '(' while (*p && *p!='(') { p++; } if (!*p) { xformat("missing '('"); } if (p == start) { xformat("missing name"); } // skip backward past any whitespace before the '(' p--; while (p>=start && isspace(*p)) { p--; } if (p=start && (isalnum(*p) || *p=='_')) { p--; } p++; // move back to most recent legal char // done return substring(p, nameEnd-p); } // ------------------ test code ------------------- #ifdef TEST_MLSSTR #define ML MLSubstrate #define Test MLSubstrateTest // test code is put into a class just so that MLSubstrate // can grant it access to private fields class Test { public: void feed(ML &ml, char const *src, bool allowErrors = false); void silentFeed(ML &ml, char const *src); void test(char const *src, ML::State state, int nesting, int comNesting, char prev); void normal(char const *src, int nesting); void str(char const *src, int nesting, bool bs); void yes(char const *src); void no(char const *src); void name(char const *body, char const *n); void badname(char const *body); void bad(char const *body); int main(int argc, char *argv[]); }; #define min(a,b) ((a)<(b)?(a):(b)) void Test::feed(ML &ml, char const *src, bool allowErrors) { int origErrors = simpleReportError.errors; cout << "trying: " << src << endl; silentFeed(ml, src); if (!allowErrors && origErrors != simpleReportError.errors) { xfailure(stringc << "caused error: " << src); } } void Test::silentFeed(ML &ml, char const *src) { while (*src) { // feed it in 10 char increments, to test split processing too int len = min(strlen(src), 10); ml.handle(src, len, '}'); src += len; } } void Test::test(char const *src, ML::State state, int nesting, int comNesting, char prev) { ML ml; feed(ml, src); if (!( ml.state == state && ml.nesting() == nesting && ml.comNesting == comNesting && ml.prev == prev )) { xfailure(stringc << "failed on src: " << src); } } void Test::normal(char const *src, int nesting) { test(src, ML::ST_NORMAL, nesting, 0, src[strlen(src)-1]); } void Test::str(char const *src, int nesting, bool bs) { char prev = (bs? '\\' : src[strlen(src)-1]); test(src, ML::ST_STRING, nesting, 0, prev); // repeat the test with single-tick // // 2005-01-25: No, OCaml's character-literal rules do not treat // quote and apostrophe similarly. //string another = replace(src, "\"", "\'"); //test(another, ML::ST_CHAR, nesting, 0, prev); } void Test::yes(char const *src) { ML ml; feed(ml, src); xassert(ml.zeroNesting()); } void Test::no(char const *src) { ML ml; feed(ml, src); xassert(!ml.zeroNesting()); } void Test::name(char const *body, char const *n) { ML ml; feed(ml, body); xassert(ml.getDeclName().equals(n)); } void Test::badname(char const *body) { ML ml; feed(ml, body); try { ml.getDeclName(); xbase("got a name when it shoudn't have!"); } catch (...) {} } void Test::bad(char const *body) { int origErrors = simpleReportError.errors; ML ml; feed(ml, body, true /*allowErrors*/); if (origErrors == simpleReportError.errors) { xbase(stringc << "should have caused an error: " << body); } } int Test::main(int argc, char *argv[]) { if (argc >= 2) { // analyze the files passed as an argument, expecting them to be // complete caml source files, ending in normal mode with all // delimiters closed for (int i=1; i