// grampar.cc            see license.txt for copyright and terms of use
// additional C++ code for the grammar parser; in essence,
// build the grammar internal representation out of what
// the user supplies in a .gr file

#include "grampar.h"         // this module
#include "gramlex.h"         // GrammarLexer
#include "trace.h"           // tracing debug functions
#include "gramast.ast.gen.h" // grammar AST nodes
#include "grammar.h"         // Grammar, Production, etc.
#include "owner.h"           // Owner (redundant dependency; dot's layout is better with it though)
#include "syserr.h"          // xsyserror
#include "strutil.h"         // quoted
#include "grampar.tab.h"     // token constant codes, union YYSTYPE
#include "array.h"           // GrowArray
#include "mlsstr.h"          // MLSubstrate

#include <fstream.h>         // ifstream
#include <ctype.h>           // isspace, isalnum

#define LIT_STR(s) LocString(SL_INIT, grammarStringTable.add(s))


// ------------------------- Environment ------------------------
Environment::Environment(Grammar &G)
  : g(G),
    prevEnv(NULL),
    nontermDecls(),
    errorCount(0),
    errors(errorCount)
{}

Environment::Environment(Environment &prev)
  : g(prev.g),
    prevEnv(&prev),
    nontermDecls(prev.nontermDecls),
    errorCount(-1000),      // should never be used
    errors(prev.errors)     // copy parent's 'errors' reference
{}

Environment::~Environment()
{}


// -------------------- XASTParse --------------------
STATICDEF string XASTParse::
  constructMsg(LocString const &tok, rostring msg)
{
  if (tok.validLoc()) {
    if (tok.isNonNull()) {
      return stringc << tok.locString() << ": near " << tok
                     << ", " << msg;
    }
    else {
      return stringc << tok.locString() << ": " << msg;
    }
  }
  else {
    return string(msg);
  }
}

XASTParse::XASTParse(LocString const &tok, rostring m)
  : xBase(constructMsg(tok, m)),
    failToken(tok),
    message(m)
{}


XASTParse::XASTParse(XASTParse const &obj)
  : xBase(obj),
    DMEMB(failToken),
    DMEMB(message)
{}

XASTParse::~XASTParse()
{}


// -------------------- AST parser support ---------------------
// fwd-decl of parsing fns
void astParseGrammar(Grammar &g, GrammarAST *treeTop);
void astParseTerminals(Environment &env, TF_terminals const &terms);
void astParseDDM(Environment &env, Symbol *sym,
                 ASTList<SpecFunc> const &funcs);
void astParseNonterm(Environment &env, TF_nonterm const *nt);
void astParseProduction(Environment &env, Nonterminal *nonterm,
                        ProdDecl const *prod);


// really a static semantic error, more than a parse error..
void astParseError(LocString const &failToken, rostring msg)
{
  THROW(XASTParse(failToken, msg));
}

void astParseError(SourceLoc loc, rostring msg)
{
  LocString locstr(loc, NULL);
  THROW(XASTParse(locstr, msg));
}

void astParseError(rostring msg)
{
  LocString ls;   // no location info
  THROW(XASTParse(ls, msg));
}

// print the same message, but keep going anyway
void astParseErrorCont(Environment &env, LocString const &failToken,
                       rostring msg)
{
  XASTParse x(failToken, msg);
  cout << x.why() << endl;
  env.errors++;
}


// to put as the catch block; so far it's kind of ad-hoc where
// I actually put 'try' blocks..
#define CATCH_APPLY_CONTEXT(tok)        \
  catch (XASTParse &x) {                \
    /* leave unchanged */               \
    throw x;                            \
  }                                     \
  catch (xBase &x) {                    \
    /* add context */                   \
    astParseError(tok, x.why());        \
    throw 0;     /* silence warning */  \
  }


// ---------------------- AST "parser" --------------------------
// set the annotation pointers
void setAnnotations(GrammarAST *ast)
{
  // work through the toplevel forms
  FOREACH_ASTLIST_NC(TopForm, ast->forms, iter) {
    ASTSWITCH(TopForm, iter.data()) {
      ASTCASE(TF_terminals, t) {
        if (!ast->terms) {
          ast->terms = t;
        }
        else {
          // 12/07/04: Allow more than one 'terminals' section,
          // by simply concatenating them.  This is useful when
          // I want to merge two grammars together textually.
          ast->terms->decls.concat(t->decls);
          ast->terms->types.concat(t->types);
          ast->terms->prec.concat(t->prec);
        }
      }

      ASTNEXT(TF_nonterm, nt) {
        if (!ast->firstNT) {
          ast->firstNT = nt;
        }
      }

      ASTENDCASED
    }
  }

  if (!ast->terms) {
    astParseError("'Terminals' specification is missing");
  }
  if (!ast->firstNT) {
    astParseError("you have to have at least one nonterminal");
  }
}


LocString extractActionClassName(LocString const &body)
{
  // find start of first token
  char const *start = body.str;
  while (isspace(*start)) start++;

  // find end of first token
  char const *p = start;
  while (isspace(*p)) p++;
  while (isalnum(*p) || *p=='_') p++;
  
  // yield that, with the same source location
  return LocString(body.loc, grammarStringTable.add(
    substring(start, p-start).c_str()));
}


// handle TF_verbatim and TF_option
void astParseOptions(Grammar &g, GrammarAST *ast)
{
  // handle TF_verbatim and TF_option
  FOREACH_ASTLIST_NC(TopForm, ast->forms, iter) {
    ASTSWITCH(TopForm, iter.data()) {
      ASTCASE(TF_context, c) {
        // overwrite the context class name, and append to
        // its body verbatim list
        g.actionClassName = extractActionClassName(c->body);

        // 11/13/04: There is a subtle problem with keeping the body
        // from the base specification, when the following conditions
        // hold:
        //   - the base spec is compiled on its own (w/o the extension)
        //   - some translation unit "A" sees the resulting .gr.gen.h file
        //   - the extension spec is compiled
        //   - some translation unit "B" sees the resulting .gr.gen.h file
        //   - A and B are linked together in one executable
        // In that case, the context_class from the base will have an
        // inconsistent definition in A and B, since in A it will be
        // whatever the user wrote plus, the declarations for the
        // action functions, whereas in B it will be just what the
        // user wrote, since the action functions end up in the
        // extension context_class.
        //
        // What is even more subtle is the *manifestation* of this
        // problem, which is linking problems with vtables.  C++
        // compilers do not explicitly check that classes declared in
        // multiple translation units have identical declarations
        // (token for token), but they *do* of course rely on them
        // being so.  That reliance shows up in the decisions
        // regarding which module has the vtable, among other places.
        // So this problem did not show up immediately, and was only
        // revealed as initially mysterious portability problems
        // (since my development toolchain happend to be fairly
        // lenient w.r.t. vtable placement).
        //
        // Therefore the new policy is that context_classes from the
        // base are *not* emitted, and consequently it is impossible
        // to inherit from them in subsequent context_classes.  The
        // user must put data/functions that are meant to be shared
        // into a common base class that is *not* the context_class
        // of any grammar or extension.
        //
        // old:
        //g.actionClasses.append(new LocString(c->body));
        //
        // new:
        g.actionClasses.deleteAll();
        g.actionClasses.append(new LocString(c->body));
      }

      ASTNEXT(TF_verbatim, v) {
        if (v->isImpl) {
          g.implVerbatim.append(new LocString(v->code));
        }
        else {
          g.verbatim.append(new LocString(v->code));
        }
      }

      ASTNEXT(TF_option, op) {
        LocString const &name = op->name;
        int value = op->value;
        bool boolVal = !!value;

        if (name.equals("useGCDefaults")) {
          g.useGCDefaults = boolVal;
        }
        else if (name.equals("defaultMergeAborts")) {
          g.defaultMergeAborts = boolVal;
        }
        else if (name.equals("shift_reduce_conflicts")) {
          g.expectedSR = value;
        }
        else if (name.equals("reduce_reduce_conflicts")) {
          g.expectedRR = value;
        }
        else if (name.equals("unreachable_nonterminals")) {
          g.expectedUNRNonterms = value;
        }
        else if (name.equals("unreachable_terminals")) {
          g.expectedUNRTerms = value;
        }
        else if (name.equals("lang_OCaml")) {
          //g.targetLang = "OCaml";
          //
          // I'm retarded.. I need to know if we're parsing ocaml *before*
          // we actually parse it, otherwise I can't skip the embedded
          // action fragments properly!
          astParseError(name, "The `lang_OCaml' option has been replaced with "
                              "the `-ocaml' command-line switch.  Please use "
                              "that instead.  (Sorry for the inconvenience.)");
        }
        else if (name.equals("allow_continued_nonterminals")) {
          ast->allowContinuedNonterminals = boolVal;
        }
        else {
          astParseError(name, "unknown option name");
        }
      }

      ASTENDCASED
    }
  }
}


// map the grammar definition AST into a Grammar data structure
void astParseGrammar(Grammar &g, GrammarAST *ast)
{
  // default, empty environment
  Environment env(g);

  // handle TF_terminals
  astParseTerminals(env, *(ast->terms));

  // process all nonterminal declarations first, so while we're
  // looking at their bodies we can tell if one isn't declared
  {
    FOREACH_ASTLIST_NC(TopForm, ast->forms, iter) {
      if (!iter.data()->isTF_nonterm()) continue;
      TF_nonterm *nt = iter.data()->asTF_nonterm();

      // check for already declared
      if (env.nontermDecls.isMapped(nt->name)) {
        if (!ast->allowContinuedNonterminals) {
          astParseError(nt->name, "nonterminal already declared");
        }
        else {
          // check for consistent type
          if (!nt->type.equals(env.nontermDecls.queryf(nt->name)->type)) {
            astParseError(nt->name, "continued nonterminal with different type");
          }
          
          // just allow it; it seems the parser just iterates over
          // all the TF_nonterms, and will do the right thing
          continue;
        }
      }

      // make the Grammar object to represent the new nonterminal
      env.g.getOrMakeNonterminal(nt->name);

      // add this decl to our running list (in the original environment)
      //
      // 12/09/04: As far as I can tell, 'nontermDecls' is in fact not
      // used except for right here, to check whether a nonterminal
      // declaration is duplicated.
      env.nontermDecls.add(nt->name, nt);
    }
  }

  // process nonterminal bodies
  {
    FOREACH_ASTLIST(TopForm, ast->forms, iter) {
      if (!iter.data()->isTF_nonterm()) continue;
      TF_nonterm const *nt = iter.data()->asTF_nontermC();

      // new environment since it can contain a grouping construct
      // (at this very moment it actually can't because there is no syntax..)
      Environment newEnv(env);

      // parse it
      astParseNonterm(newEnv, nt);
    }
  }

  if (!g.actionClassName.str) {
    astParseError("you must specify a context class; for example:\n"
                  "  context_class Context : public UserActions {};\n");
  }

  if (env.errors) {
    astParseError("halting due to previously reported errors");
  }
}


// validate 'name'
Terminal *astParseToken(Environment &env, LocString const &name)
{
  Terminal *t = env.g.findTerminal(name);
  if (!t) {
    astParseError(name, "undeclared token");
  }
  return t;
}


// needed to ensure the GrowArray below has its values initialized
// to false when the array expands
class InitFalseBool {
public:
  bool b;
public:
  InitFalseBool() : b(false) {}
};


void astParseTerminals(Environment &env, TF_terminals const &terms)
{
  // basic declarations
  {
    int maxCode = 0;
    GrowArray<InitFalseBool> codeHasTerm(200);
    FOREACH_ASTLIST(TermDecl, terms.decls, iter) {
      TermDecl const &term = *(iter.data());

      // process the terminal declaration
      int code = term.code;
      StringRef name = term.name;
      trace("grampar") << "token: code=" << code
                       << ", name=" << name << endl;

      if (!env.g.declareToken(term.name, code, term.alias)) {
        astParseError(term.name, "token already declared");
      }

      // track what terminals have codes
      maxCode = max(code, maxCode);
      codeHasTerm.ensureIndexDoubler(code);
      codeHasTerm[code].b = true;
    }

    // fill in any gaps in the code space; this is required because
    // later analyses assume the terminal code space is dense
    SourceLoc dummyLoc(HERE_SOURCELOC);
    for (int i=0; i<maxCode; i++) {
      if (!codeHasTerm[i].b) {
        LocString dummy(dummyLoc, grammarStringTable.add(
          stringc << "__dummy_filler_token" << i));
        env.g.declareToken(dummy, i, dummy);
      }
    }
  }

  // type annotations
  {                  
    FOREACH_ASTLIST(TermType, terms.types, iter) {
      TermType const &type = *(iter.data());
      trace("grampar") << "token type: name=" << type.name
                       << ", type=" << type.type << endl;

      // look up the name
      Terminal *t = astParseToken(env, type.name);
      if (t->type) {
        astParseError(type.name, "this token already has a type");
      }

      // annotate with declared type
      t->type = type.type;

      // parse the dup/del/merge spec
      astParseDDM(env, t, type.funcs);
    }
  }

  // precedence specifications
  {
    FOREACH_ASTLIST(PrecSpec, terms.prec, iter) {
      PrecSpec const &spec = *(iter.data());

      FOREACH_ASTLIST(LocString, spec.tokens, tokIter) {
        LocString const &tokName = *(tokIter.data());
        trace("grampar") << "prec: " << toString(spec.kind)
                         << " " << spec.prec << " " << tokName;

        // look up the token
        Terminal *t = astParseToken(env, tokName);
        if (t->precedence) {
          astParseError(tokName,
            stringc << tokName << " already has a specified precedence");
        }

        if (spec.prec == 0) {
          // 0 means precedence isn't specified
          astParseError(tokName,
            "you can't use 0 as a precedence level, because that value "
            "is used internally to mean something else");
        }

        // apply spec
        t->precedence = spec.prec;
        t->associativity = spec.kind;
      }
    }
  }
}


void astParseDDM(Environment &env, Symbol *sym,
                 ASTList<SpecFunc> const &funcs)
{
  Terminal *term = sym->ifTerminal();
  Nonterminal *nonterm = sym->ifNonterminal();

  FOREACH_ASTLIST(SpecFunc, funcs, iter) {
    SpecFunc const &func = *(iter.data());
    int numFormals = func.formals.count();

    // decide what to do based on the name

    if (func.name.equals("dup")) {
      if (numFormals != 1) {
        astParseError(func.name, "'dup' function must have one formal parameter");
      }
      if (sym->dupParam) {
        astParseError(func.name, "duplicate 'dup' function");
      }
      sym->dupParam = func.nthFormal(0);
      sym->dupCode = func.code;
    }

    else if (func.name.equals("del")) {
      if (numFormals == 0) {
        // not specified is ok, since it means the 'del' function
        // doesn't use its parameter
        sym->delParam = NULL;
      }
      else if (numFormals == 1) {
        sym->delParam = func.nthFormal(0);
      }
      else {
        astParseError(func.name, "'del' function must have either zero or one formal parameters");
      }

      if (sym->delCode) {
        astParseError(func.name, "duplicate 'del' function");
      }
      sym->delCode = func.code;
    }

    else if (func.name.equals("merge")) {
      if (nonterm) {
        if (numFormals != 2) {
          astParseError(func.name, "'merge' function must have two formal parameters");
        }
        if (nonterm->mergeParam1) {
          astParseError(func.name, "duplicate 'merge' function");
        }
        nonterm->mergeParam1 = func.nthFormal(0);
        nonterm->mergeParam2 = func.nthFormal(1);
        nonterm->mergeCode = func.code;
      }
      else {
        astParseError(func.name, "'merge' can only be applied to nonterminals");
      }
    }

    else if (func.name.equals("keep")) {
      if (nonterm) {
        if (numFormals != 1) {
          astParseError(func.name, "'keep' function must have one formal parameter");
        }
        if (nonterm->keepParam) {
          astParseError(func.name, "duplicate 'keep' function");
        }
        nonterm->keepParam = func.nthFormal(0);
        nonterm->keepCode = func.code;
      }
      else {
        astParseError(func.name, "'keep' can only be applied to nonterminals");
      }
    }

    else if (func.name.equals("classify")) {
      if (term) {
        if (numFormals != 1) {
          astParseError(func.name, "'classify' function must have one formal parameter");
        }
        if (term->classifyParam) {
          astParseError(func.name, "duplicate 'classify' function");
        }
        term->classifyParam = func.nthFormal(0);
        term->classifyCode = func.code;
      }
      else {
        astParseError(func.name, "'classify' can only be applied to terminals");
      }
    }

    else if (func.name.equals("maximal")) {
      if (nonterm) {
        if (nonterm->maximal) {
          astParseError(func.name, "duplicate 'maximal' declaration");
        }
        nonterm->maximal = true;     // function body has no meaning
      }
      else {
        astParseError(func.name, "'maximal' can only be applied to nonterminals");
      }
    }

    else {
      astParseError(func.name,
        stringc << "unrecognized spec function \"" << func.name << "\"");
    }
  }
}


void addDefaultTypesActions(Grammar &g, GrammarAST *ast)
{
  // language defaults
  StringRef defaultType, defaultAction;
  if (g.targetLang.equals("OCaml")) {
    defaultType = grammarStringTable.add("unit");
    defaultAction = grammarStringTable.add("()");
  }
  else /*C*/ {
    defaultType = grammarStringTable.add("void");
    defaultAction = grammarStringTable.add("return;");
  }

  // hook to allow me to force defaults everywhere (this is useful
  // when I want to try a grammar written for one language using
  // another language's core)
  bool forceDefaults = tracingSys("forceDefaultActions");

  // iterate over nonterminals
  FOREACH_ASTLIST_NC(TopForm, ast->forms, iter) {
    if (!iter.data()->isTF_nonterm()) { continue; }
    TF_nonterm *nt = iter.data()->asTF_nonterm();

    // default type
    if (forceDefaults || nt->type.isNull()) {
      nt->type.str = defaultType;
    }

    // iterate over productions
    FOREACH_ASTLIST_NC(ProdDecl, nt->productions, iter2) {
      ProdDecl *pd = iter2.data();

      // default action
      if (forceDefaults || pd->actionCode.isNull()) {
        pd->actionCode.str = defaultAction;
      }
                          
      if (forceDefaults) {
        // clear RHSElt tags, since otherwise the lack of types
        // will provoke errors; and default actions don't refer to
        // the RHSElts anyway
        StringRef empty = grammarStringTable.add("");
        FOREACH_ASTLIST_NC(RHSElt, pd->rhs, iter3) {
          ASTSWITCH(RHSElt, iter3.data()) {
            ASTCASE(RH_name, n)
              n->tag.str = empty;

            ASTNEXT(RH_string, s)
              s->tag.str = empty;
            
            ASTENDCASED
          }
        }
      }
    }
  }
}


void synthesizeStartRule(Grammar &g, GrammarAST *ast)
{
  // get the first nonterminal; this is the user's start symbol
  TF_nonterm *firstNT = ast->firstNT;

  // find the name of the user's EOF token
  TermDecl const *eof = NULL;
  FOREACH_ASTLIST(TermDecl, ast->terms->decls, iter) {
    if (iter.data()->code == 0) {
      eof = iter.data();
      break;
    }
  }
  if (!eof) {
    astParseError("you have to have an EOF token, with code 0");
  }

  // build a start production
  RHSElt *rhs1 = new RH_name(LIT_STR("top").clone(), firstNT->name.clone());
  RHSElt *rhs2 = new RH_name(LIT_STR("").clone(), eof->name.clone());
  ASTList<RHSElt> *rhs = new ASTList<RHSElt>();
  rhs->append(rhs1);
  rhs->append(rhs2);
  char const *action = g.targetLang.equals("OCaml")? " top " :
                       firstNT->type.equals("void")? " return; " :
                                                     " return top; ";
  ProdDecl *startProd = new ProdDecl(SL_INIT, PDK_NEW, rhs, LIT_STR(action).clone());

  // build an even earlier start symbol
  TF_nonterm *earlyStartNT
    = new TF_nonterm(
        LIT_STR("__EarlyStartSymbol").clone(),   // name
        firstNT->type.clone(),                   // type
        NULL,                                    // empty list of functions
        new ASTList<ProdDecl>(startProd),        // productions
        NULL                                     // subsets
      );

  // put it into the AST
  ast->forms.prepend(earlyStartNT);
}


void astParseNonterm(Environment &env, TF_nonterm const *nt)
{
  LocString const &name = nt->name;

  // get the Grammar object that represents the nonterminal
  Nonterminal *nonterm = env.g.findNonterminal(name);
  xassert(nonterm);

  nonterm->type = nt->type;

  // iterate over the productions
  FOREACH_ASTLIST(ProdDecl, nt->productions, iter) {
    astParseProduction(env, nonterm, iter.data());
  }

  // parse dup/del/merge
  astParseDDM(env, nonterm, nt->funcs);
  
  // record subsets                       
  {
    FOREACH_ASTLIST(LocString, nt->subsets, iter) {
      LocString const *ls = iter.data();
      Nonterminal *sub = env.g.findNonterminal(*ls);
      if (!sub) {
        astParseError(*ls, "nonexistent nonterminal");
      }

      // note that, since context-free language inclusion is
      // undecidable (Hopcroft/Ullman), we can't actually check that
      // the given nonterminals really are in the subset relation
      nonterm->subsets.prepend(sub);
    }
  }
}


void astParseProduction(Environment &env, Nonterminal *nonterm,
                        ProdDecl const *prodDecl)
{
  // is this the special start symbol I inserted?
  bool synthesizedStart = nonterm->name.equals("__EarlyStartSymbol");

  // build a production; use 'this' as the tag for LHS elements
  Production *prod = new Production(nonterm, "this");

  // put the code into it
  prod->action = prodDecl->actionCode;

  // deal with RHS elements
  FOREACH_ASTLIST(RHSElt, prodDecl->rhs, iter) {
    RHSElt const *n = iter.data();
    LocString symName;
    LocString symTag;
    bool isString = false;
    bool isAnnotation = false;

    // pull various info out of the AST node
    ASTSWITCHC(RHSElt, n) {
      ASTCASEC(RH_name, tname) {
        symName = tname->name;
        symTag = tname->tag;
      }

      ASTNEXTC(RH_string, ts) {
        symName = ts->str;
        symTag = ts->tag;
        isString = true;
      }

      ASTNEXTC(RH_prec, p) {
        Terminal *t = astParseToken(env, p->tokName);
        if (!t) { break; }

        // apply the specified precedence
        prod->precedence = t->precedence;
        isAnnotation = true;
      }

      ASTNEXTC(RH_forbid, f) {
        Terminal *t = astParseToken(env, f->tokName);
        if (!t) { break; }

        prod->addForbid(t, env.g.numTerminals());
        isAnnotation = true;
      }

      ASTDEFAULTC {
        xfailure("bad RHSElt kind");
      }

      ASTENDCASEC
    }

    if (isAnnotation) {
      // code below is for things other than annotations
      continue;
    }

    // see which (if either) thing this name already is
    Terminal *term = env.g.findTerminal(symName);
    Nonterminal *nonterm = env.g.findNonterminal(symName);
    if (term && nonterm) {
      if (isString) {
        astParseError(symName, "token alias has same name as a nonterminal");
      }
      else {
        astParseError(symName, "token and nonterminal have same name");
      }
      nonterm = NULL;                // error recovery
    }

    // syntax rules
    if (isString  &&  !term) {
      astParseError(symName, "terminals must be declared");
    }

    if (!term && !nonterm) {
      astParseErrorCont(env, symName, "undeclared symbol");

      // synthesize one anyway so we can find more errors
      nonterm = env.g.getOrMakeNonterminal(symName);
    }

    if (term && term->termIndex==0 && !synthesizedStart) {
      astParseError(symName, "you cannot use the EOF token in your rules");
    }

    if (symTag.equals("loc")) {
      // bad because loc is the name of the automatically-propagated
      // source location information
      astParseErrorCont(env, symTag, "cannot use \"loc\" as a tag");
    }

    // whenever we see a terminal, copy its precedence spec to
    // the production; thus, the last symbol appearing in the
    // production will be the one that gives the precedence
    if (term) {
      prod->precedence = term->precedence;
    }

    // decide which symbol to put in the production
    Symbol *s;
    if (nonterm) {
      s = nonterm;            // could do these two with a bitwise OR
    }                         // if I were feeling extra clever today
    else {
      s = term;
    }

    if (s->isEmptyString) {
      // "empty" is a syntactic convenience; it doesn't get
      // added to the production
    }
    else {
      // add it to the production
      prod->append(s, symTag);
    }
  }

  // after constructing the production we need to do this
  // update: no we don't -- GrammarAnalysis takes care of it (and
  // complains if we do)
  //prod->finished();

  // add production to grammar
  env.g.addProduction(prod);
}


// ----------------------- parser support ---------------------
// Bison parser calls this to get a token
int grampar_yylex(YYSTYPE *lvalp, void *parseParam)
{
  ParseParams *par = (ParseParams*)parseParam;
  GrammarLexer &lexer = par->lexer;

  int code = lexer.yylexInc();

  try {
    // yield semantic values for some things
    // note that the yielded semantic value must be consistent with
    // what is declared for these token types in grampar.y
    switch (code) {
      case TOK_INTEGER:
        lvalp->num = lexer.integerLiteral;
        break;

      case TOK_STRING:
        lvalp->str = new LocString(lexer.curLoc(), lexer.stringLiteral);
        break;

      case TOK_NAME:
        lvalp->str = new LocString(lexer.curLoc(), lexer.curToken());
        break;

      case TOK_LIT_CODE:
        lvalp->str = new LocString(lexer.curLoc(), lexer.curFuncBody());
        break;

      case TOK_ARROW:
        lvalp->loc = lexer.curLoc();
        break;

      default:
        lvalp->str = NULL;        // any attempt to use will segfault
    }
  }
  catch (xBase &x) {
    // e.g. malformed fundecl
    cout << lexer.curLocStr() << ": " << x << endl;

    // optimistically try just skipping the bad token
    return grampar_yylex(lvalp, parseParam);
  }

  return code;
}


void grampar_yyerror(char const *message, void *parseParam)
{
  ParseParams *par = (ParseParams*)parseParam;
  cout << par->lexer.curLocStr() << ": " << message << endl;
}


// ---------------------- merging -----------------------
void mergeContext(GrammarAST *base, TF_context * /*owner*/ ext)
{
  // do simple append, since the grammar parser above knows how
  // to handle multiple context classes
  base->forms.append(ext);

  #if 0
  // find 'base' context
  TF_context *baseContext = NULL;
  FOREACH_ASTLIST_NC(TopForm, base->forms, iter) {
    if (iter.data()->isTF_context()) {
      baseContext = iter.data()->asTF_context();
      break;
    }
  }

  if (!baseContext) {
    // base does not have a context class, so 'ext' becomes it
    base->forms.append(ext);
  }

  else if (baseContext->name.str == ext->name.str) {
    // same name; I'd like to append the code to what's already
    // there, but that's tricky because the location won't
    // be right..
    astParseError(ext->name, "context append not implemented");
  }

  else {
    // different name, replace the old
    base->forms.removeItem(baseContext);
    delete baseContext;
    base->forms.append(ext);
  }
  #endif // 0
}


void mergeOption(GrammarAST *base, TF_option * /*owner*/ ext)
{                    
  // find option with the same name
  FOREACH_ASTLIST_NC(TopForm, base->forms, iter) {
    if (!iter.data()->isTF_option()) continue;
    TF_option *op = iter.data()->asTF_option();
    
    if (op->name.str == ext->name.str) {
      // replace the old value
      op->value = ext->value;
      delete ext;
      return;
    }
  }

  // otherwise, just add the new option
  base->forms.append(ext);
}


void mergeTerminals(GrammarAST *base, TF_terminals * /*owner*/ ext)
{
  FOREACH_ASTLIST_NC(TopForm, base->forms, iter) {
    if (iter.data()->isTF_terminals()) {
      TF_terminals *t = iter.data()->asTF_terminals();
      
      // there's no point to changing codes, so all the
      // TermDecls just get added (collisions are detected
      // later, during AST parsing)
      t->decls.concat(ext->decls);
      
      // in fact, I'll do the same for the others, even though
      // it might make sense to do some replacement; my immediate
      // needs don't include replacement at this level
      t->types.concat(ext->types);
      t->prec.concat(ext->prec);
      
      delete ext;
      return;
    }
  }
  
  // no TF_terminals in 'base'.. unusual, but easy to handle
  base->forms.append(ext);
}


void mergeSpecFunc(TF_nonterm *base, SpecFunc * /*owner*/ ext)
{
  // find an existing spec func with the same name
  FOREACH_ASTLIST_NC(SpecFunc, base->funcs, iter) {
    SpecFunc *f = iter.data();
    if (f->name.str == ext->name) {
      // replace the old code with the extension code
      base->funcs.removeItem(f);
      delete f;
      break;
    }
  }

  // just add it
  base->funcs.append(ext);
}


bool equalRHSElt(RHSElt const *elt1, RHSElt const *elt2)
{
  if (elt1->kind() != elt2->kind()) {
    return false;
  }

  // if the RHS names a terminal, this isn't perfect because one might
  // use an alias.. but I don't have the necessary information to detect
  // that, since I haven't yet computed the associated Symbols
  if (elt1->isRH_name()) {
    return elt1->asRH_nameC()->name.str == elt2->asRH_nameC()->name.str;
  }
  if (elt1->isRH_string()) {
    return elt1->asRH_stringC()->str.str == elt2->asRH_stringC()->str.str;
  }
  if (elt1->isRH_prec()) {
    // this means you can't change the precedence..
    return elt1->asRH_precC()->tokName.str == elt2->asRH_precC()->tokName.str;
  }

  xfailure("unknown RHSElt kind");
  return false;     // silence warning
}


bool equalRHS(ProdDecl const *prod1, ProdDecl const *prod2)
{
  if (prod1->rhs.count() != prod2->rhs.count()) {
    return false;
  }

  for (ASTListIter<RHSElt> iter1(prod1->rhs), iter2(prod2->rhs);
       !iter1.isDone(); iter1.adv(), iter2.adv()) {
    if (!equalRHSElt(iter1.data(), iter2.data())) {
      return false;
    }
  }
  return true;
}


void mergeProduction(TF_nonterm *base, ProdDecl *ext)
{
  bool found = false;

  // look for a production with an identical RHS
  FOREACH_ASTLIST_NC(ProdDecl, base->productions, iter) {
    ProdDecl *prod = iter.data();

    // check RHSs for equality
    if (equalRHS(prod, ext)) {
      found = true;

      if (ext->kind == PDK_NEW) {
        astParseError(ext->loc,
                      "production has the same RHS as an existing production; "
                      "if intent is to replace, use the 'replace' keyword");
        // not reached
      }

      // delete old
      base->productions.removeItem(prod);
      delete prod;

      if (ext->kind == PDK_DELETE) {
        delete ext;       // drop new on the floor, too
        return;
      }

      // will replace old with new
      xassert(ext->kind == PDK_REPLACE);
      break;
    }
  }

  if (ext->kind != PDK_NEW && !found) {
    astParseError(ext->loc,
                  "production marked with 'delete' or 'replace' does not match "
                  "any in the base specification");
  }

  // add the production
  base->productions.append(ext);
}


void mergeNonterminal(GrammarAST *base, TF_nonterm * /*owner*/ ext)
{
  // find an existing nonterminal with the same name
  TF_nonterm *exist = NULL;
  FOREACH_ASTLIST_NC(TopForm, base->forms, iter) {
    if (iter.data()->isTF_nonterm() &&
        iter.data()->asTF_nonterm()->name.str == ext->name) {
      exist = iter.data()->asTF_nonterm();
    }
  }

  if (!exist) {
    // no pre-existing, just append it
    base->forms.append(ext);
    return;
  }

  // make sure the types agree
  if (exist->type.str != ext->type) {
    astParseError(ext->type, "cannot redefine the type of a nonterminal");
  }

  // merge the spec funcs
  while (ext->funcs.isNotEmpty()) {
    mergeSpecFunc(exist, ext->funcs.removeFirst());
  }

  // merge the productions
  while (ext->productions.isNotEmpty()) {
    mergeProduction(exist, ext->productions.removeFirst());
  }

  delete ext;
}


void mergeGrammar(GrammarAST *base, GrammarAST *ext)
{
  // work through all the forms in 'ext', removing each
  // one; it will then either be added to 'base', or
  // discarded entirely
  while (ext->forms.isNotEmpty()) {
    TopForm *form = ext->forms.removeFirst();

    ASTSWITCH(TopForm, form) {
      ASTCASE(TF_context, c) {
        mergeContext(base, c);
      }

      ASTNEXT(TF_verbatim, v) {
        // verbatims simply accumulate
        base->forms.append(v);
      }

      ASTNEXT(TF_option, op) {
        mergeOption(base, op);
      }

      ASTNEXT(TF_terminals, t) {
        mergeTerminals(base, t);
      }

      ASTNEXT(TF_nonterm, n) {
        mergeNonterminal(base, n);
      }
      
      ASTDEFAULT {
        xfailure("doh");
      }
      
      ASTENDCASE
    }
  }
}


// ---------------- external interface -------------------
bool isGramlexEmbed(int code);     // defined in gramlex.lex

GrammarAST *parseGrammarFile(rostring origFname, bool useML)
{
  string fname = origFname;

  #ifndef NDEBUG
  if (tracingSys("yydebug")) {
    yydebug = true;    // this flag goes away when NDEBUG is specified..
  }
  #endif // NDEBUG

  // open input file
  Owner<ifstream> in;
  if (fname.empty()) {
    fname = "<stdin>";
  }
  else {
    in = new ifstream(fname.c_str());
    if (!*in) {
      xsyserror("open", stringc << "error opening input file " << fname);
    }
  }

  // choose embedded language              
  EmbeddedLang *embed = NULL;
  if (useML) {
    embed = new MLSubstrate;
  }
  
  // build lexer
  GrammarLexer lexer(isGramlexEmbed,
                     grammarStringTable,
                     fname.c_str(),
                     in.xfr(),
                     embed);
  if (embed) {
    // install the refined error reporter
    embed->err = &lexer.altReporter;
  }

  ParseParams params(lexer);

  traceProgress() << "parsing grammar source: " << fname << endl;
  int retval = grampar_yyparse(&params);
  if (retval==0 && lexer.errors==0) {
    GrammarAST *ret = params.treeTop;

    if (tracingSys("printGrammarAST")) {
      // print AST
      cout << "AST:\n";
      ret->debugPrint(cout, 2);
    }

    return ret;
  }
  else {
    xbase("parsing finished with an error");
    return NULL;     // silence warning
  }
}


void parseGrammarAST(Grammar &g, GrammarAST *treeTop)
{
  setAnnotations(treeTop);
  
  // look at TF_options before synthesizing start rule,
  // so we can know what language is the target
  astParseOptions(g, treeTop);

  // fill in default types and actions
  addDefaultTypesActions(g, treeTop);

  // synthesize a rule "TrueStart -> Start EOF"
  synthesizeStartRule(g, treeTop);

  // parse the AST into a Grammar
  traceProgress() << "parsing grammar AST..\n";
  astParseGrammar(g, treeTop);

  // then check grammar properties; throws exception
  // on failure
  traceProgress() << "beginning grammar analysis..\n";
  g.checkWellFormed();
}


void readGrammarFile(Grammar &g, rostring fname)
{
  // make sure the tree gets deleted
  Owner<GrammarAST> treeTop(parseGrammarFile(fname, false /*useML*/));

  parseGrammarAST(g, treeTop);

  treeTop.del();

  // hmm.. I'd like to restore this functionality...
  //if (ASTNode::nodeCount > 0) {
  //  cout << "leaked " << ASTNode::nodeCount << " AST nodes\n";
  //}
}


// ----------------------- test code -----------------------
#ifdef TEST_GRAMPAR

#include "bflatten.h"     // BFlatten
#include <stdlib.h>       // system

int main(int argc, char **argv)
{
  if (argc < 2) {
    cout << "usage: " << argv[0] << " [-tr flags] filename.gr\n";
    cout << "  interesting trace flags:\n";
    cout << "    keep-tmp      do not delete the temporary files\n";
    //cout << "    cat-grammar   print the ascii rep to the screen\n";
    return 0;
  }

  traceAddSys("progress");
  TRACE_ARGS();

  bool printCode = true;

  // read the file
  Grammar g1;
  readGrammarFile(g1, argv[1]);

  // and print the grammar
  char const g1Fname[] = "grammar.g1.tmp";
  traceProgress() << "printing initial grammar to " << g1Fname << "\n";
  {
    ofstream out(g1Fname);
    g1.printSymbolTypes(out);
    g1.printProductions(out, printCode);
  }

  //if (tracingSys("cat-grammar")) {
    system("cat grammar.g1.tmp");
  //}

  // before using 'xfer' we have to tell it about the string table
  flattenStrTable = &grammarStringTable;

  // write it to a binary file
  char const binFname[] = "grammar.bin.tmp";
  traceProgress() << "writing initial grammar to " << binFname << "\n";
  {
    BFlatten flat(binFname, false /*reading*/);
    g1.xfer(flat);
  }

  // read it back
  traceProgress() << "reading grammar from " << binFname << "\n";
  Grammar g2;
  {
    BFlatten flat(binFname, true /*reading*/);
    g2.xfer(flat);
  }

  // print that too
  char const g2Fname[] = "grammar.g2.tmp";
  traceProgress() << "printing just-read grammar to " << g2Fname << "\n";
  {
    ofstream out(g2Fname);
    g2.printSymbolTypes(out);
    g2.printProductions(out, printCode);
  }

  // compare the two written files
  int result = system(stringc << "diff " << g1Fname << " " << g2Fname);
  if (result != 0) {
    cout << "the two ascii representations differ!!\n";
    return 4;
  }

  // remove the temp files
  if (!tracingSys("keep-tmp")) {
    remove(g1Fname);
    remove(g2Fname);
    remove(binFname);
  }

  cout << "successfully parsed, printed, wrote, and read a grammar!\n";
  return 0;
}

#endif // TEST_GRAMPAR

