/* Objective Modula-2 Compiler (objm2c)
 *
 *  @file objm2_lexer.c
 *  Objective Modula-2 lexer implementation
 *
 *  Lexical analysis for Objective Modula-2 source files
 *
 *  Author: Benjamin Kowarsch
 *
 *  Copyright (C) 2009 Sunrise Telephone Systems KK. All rights reserved.
 *
 *  License:
 *
 *  Permission is hereby granted to review and test this software for the sole
 *  purpose of supporting the effort by the licensor to define and develop the
 *  Objective Modula-2 language. It is not permissible under any circumstances
 *  to  use the software  for the purpose  of creating derivative languages or 
 *  dialects.  This permission is valid until 31 December 2009, 24:00h GMT.
 *
 *  Future licensing:
 *
 *  The licensor undertakes to eventually release this software under a proper
 *  open source license  AFTER  the Objective Modula-2 language definition has
 *  been finalised and a conforming and working reference compiler completed.
 *  
 *  Version history:
 *
 *   2.00   2009-01-31   BK   new file from various spin-offs of v.1.x
 *          2009-03-25   BK   changes for new symbol type
 *          2009-05-18   BK   changes for finalised grammar of 2009-05-18
 *
 *  TO DO:
 *
 *  code review of _get_ident, _get_number, _get_c_comment and _get_m2_comment
 *  some code to set lexer->status may be missing in these functions
 */


// ---------------------------------------------------------------------------
// Standard library imports
// ---------------------------------------------------------------------------

#include <stdio.h>
#include <errno.h>
#include <stdlib.h>

// ---------------------------------------------------------------------------
// ObjM2 project imports
// ---------------------------------------------------------------------------

#include "ASCII.h"
#include "common_macros.h"
#include "objm2_build_params.h"
#include "objm2_hash.h"
#include "objm2_reserved_words.h"
#include "objm2_lexer.h"


// ---------------------------------------------------------------------------
// Lexeme buffer size
// ---------------------------------------------------------------------------
//
// Buffer size required to hold the largest type of lexeme
//
// Calculation makes adjustments for extra characters:
//
// o  Identifiers, one extra character for ":" in method signature component
// o  Numbers, one extra character for designators "B", "C", "H" and "U"
// o  Strings, two extra characters for delimiting quotes
// o  Comments, no extra characters
// o  All, one extra character to terminate buffer
//
// The capacity is the largest of the four adjusted limits plus one.


#define OBJM2_LEXER_LEX_BUFFER_CAPACITY \
   (MAX(MAX(OBJM2_MAX_IDENT_LENGTH + 1, OBJM2_MAX_NUM_LENGTH + 1), \
        MAX(OBJM2_MAX_STRING_LENGTH + 2, OBJM2_MAX_COMMENT_LENGTH)) + 1)


// ---------------------------------------------------------------------------
// Lexeme buffer type
// ---------------------------------------------------------------------------

// Array of OBJM2_LEXER_LEX_BUFFER_CAPACITY uchar_t elements.  The lexer state
// structure embeds one instance of this type as its lexeme buffer.
typedef uchar_t lexbuf_t[OBJM2_LEXER_LEX_BUFFER_CAPACITY];


// ---------------------------------------------------------------------------
// Lexer option flags
// ---------------------------------------------------------------------------

// One boolean per option bit accepted by objm2_new_lexer().  Each field is
// derived there from the corresponding OBJM2_LEXER_... flag of the <options>
// parameter and is converted back to that flag by objm2_lexer_options().

typedef /* objm2_lexer_options_t */ struct {
    // from OBJM2_LEXER_TRIM_LEADING_ZEROES_IN_LITERALS
    bool trim_leading_zeroes;
    // from OBJM2_LEXER_PRINTABLE_7BIT_ASCII_STRINGS_ONLY
    bool printable_7bit_ascii_strings;
    // from OBJM2_LEXER_DONT_RETURN_PRAGMA_TOKEN;
    // when set, getsym skips pragmas instead of returning TOKEN_PRAGMA
    bool skip_pragmas;
    // from OBJM2_LEXER_DONT_RETURN_C_COMMENT_TOKEN;
    // when set, getsym skips C comments instead of returning TOKEN_C_COMMENT
    bool skip_c_comments;
    // from OBJM2_LEXER_DONT_RETURN_M2_COMMENT_TOKEN;
    // when set, getsym skips M2 comments instead of returning TOKEN_M2_COMMENT
    bool skip_m2_comments;
    // from OBJM2_LEXER_DONT_RETURN_CPP_COMMENT_TOKEN;
    // when set, getsym skips "//" comments instead of returning
    // TOKEN_CPP_COMMENT
    bool skip_cpp_comments;
} objm2_lexer_options_t;


// ---------------------------------------------------------------------------
// Lexer state type definition
// ---------------------------------------------------------------------------

// Private state of one lexer instance.  The public handle objm2_lexer_t is
// cast to a pointer to this structure by the functions in this file.
//
// The structure ends in a flexible array member:  objm2_new_lexer() allocates
// sizeof(objm2_lexer_s) plus the length of the pathname plus one byte for its
// terminator,  and copies the pathname into <filename>.  For that reason the
// structure must only ever be allocated dynamically and <filename> must stay
// the last member.

typedef /* objm2_lexer_s */ struct {
    
    // symbol and status to return
    objm2_symbol_s sym;                 // struct defined in objm2_symbols.h
    objm2_lexer_status_t status;        // codes defined in objm2_lexer.h
    
    // counters
    file_pos_t current_pos;             // position of current character
    uint16_t paren_nesting_level;       // current parenthesis nesting level
    uint16_t bracket_nesting_level;     // current bracket nesting level
    uint16_t brace_nesting_level;       // current brace nesting level
    
    // flags
    bool seen_backquote;                // lexer has seen "`"
    bool seen_method;                   // lexer has seen "METHOD"
    bool seen_open_paren_since_method;  // lexer has seen "(" since "METHOD"
    bool end_of_file;                   // end-of-file has been reached
    
    // lexeme
    cardinal lexlen;                    // length of current lexeme
    lexbuf_t lexbuf;                    // lexeme buffer
    
    // configuration parameters
    objm2_lexer_options_t options;      // option flags
    objm2_kvs_table_t symtab;           // symbol table pointer
    FILE *sourcefile;                   // handle for source file
    
    // C99 flexible array member (replaces the GNU zero-length array idiom)
    uchar_t filename[];                 // path name of source file

} objm2_lexer_s;

// End-of-file predicates on a pointer to a lexer state structure.  The
// argument is parenthesised so that any pointer expression,  not only a
// plain identifier,  may be passed.
#define NOT_EOF(_lexer) ((_lexer)->end_of_file == false)
#define EOF_REACHED(_lexer) ((_lexer)->end_of_file == true)


// ==========================================================================
// P R I V A T E   F U N C T I O N   P R O T O T Y P E S
// ==========================================================================

static fmacro uchar_t _readchar(objm2_lexer_s *lexer);

static fmacro uchar_t _nextchar(objm2_lexer_s *lexer);

static fmacro uchar_t _get_ident(objm2_lexer_s *lexer);

static fmacro uchar_t _get_number(objm2_lexer_s *lexer);

static fmacro uchar_t _get_quoted_char_or_string(objm2_lexer_s *lexer);

static fmacro uchar_t _get_escaped_char(objm2_lexer_s *lexer);

static fmacro uchar_t _skip_c_comment(objm2_lexer_s *lexer);

static fmacro uchar_t _get_c_comment(objm2_lexer_s *lexer);

static fmacro uchar_t _skip_m2_comment(objm2_lexer_s *lexer);

static fmacro uchar_t _get_m2_comment(objm2_lexer_s *lexer);

static fmacro uchar_t _skip_cpp_comment(objm2_lexer_s *lexer);

static fmacro uchar_t _get_cpp_comment(objm2_lexer_s *lexer);


// ==========================================================================
// P U B L I C   F U N C T I O N   I M P L E M E N T A T I O N S
// ==========================================================================

// Shorthands for calling _readchar() and _nextchar() on the local variable
// <this_lexer>;  they can only be used inside a function that declares a
// variable of that name.  The dummy parameter <v> is never expanded,  it
// merely allows the call-like spelling readchar() and nextchar().
// objm2_lexer_getsym() uses readchar() to consume the current character and
// nextchar() to peek at the one that follows.
#define readchar(v) _readchar(this_lexer) /* v = void */
#define nextchar(v) _nextchar(this_lexer) /* v = void */

// --------------------------------------------------------------------------
// function:  objm2_new_lexer(infile, symtab, options, status)
// --------------------------------------------------------------------------
//
// Creates and returns a new objm2 lexer object associated with the specified 
// source file <infile>,  symbol table <symtab>  and  options <options>.  The
// status of the operation is passed back in <status>.
//
// Parameters:
//
// o  infile,  pathname of the source file,  opened here for reading
// o  symtab,  symbol table,  stored in the lexer but not otherwise touched
// o  options,  bitwise combination of OBJM2_LEXER_... option flags
// o  status,  receives the status code,  must not be NULL
//
// Status codes passed back:
//
// o  OBJM2_LEXER_STATUS_SUCCESS,  the lexer has been created
// o  OBJM2_LEXER_STATUS_INVALID_REFERENCE,  <infile> or <symtab> is NULL
// o  OBJM2_LEXER_STATUS_PATH_NAME_TOO_LONG,  <infile> is longer than
//    OBJM2_MAX_PATHNAME_LENGTH characters
// o  OBJM2_LEXER_STATUS_ERROR_OPENING_FILE,  fopen() failed
// o  OBJM2_LEXER_STATUS_UNABLE_TO_ALLOCATE,  malloc() failed
//
// Returns NULL if the lexer object could not be created.

objm2_lexer_t objm2_new_lexer(const char *infile,
                       objm2_kvs_table_t symtab,
                                uint16_t options,
                    objm2_lexer_status_t *status) {
    
    FILE *sourcefile;
    objm2_lexer_s *new_lexer;
    pathname_index_t i, len = 0;
    
    
    // assert pre-conditions
    
    if (infile == NULL) {
        *status = OBJM2_LEXER_STATUS_INVALID_REFERENCE;
        return NULL;
    } // end if

    if (symtab == NULL) {
        *status = OBJM2_LEXER_STATUS_INVALID_REFERENCE;
        return NULL;
    } // end if
    
    // get length of pathname,  scanning at most one character past the limit
    // so that an over-long or unterminated name is not read any further
    while ((infile[len] != 0) && (len <= OBJM2_MAX_PATHNAME_LENGTH))
        len++;
    
    if (len > OBJM2_MAX_PATHNAME_LENGTH) {
        *status = OBJM2_LEXER_STATUS_PATH_NAME_TOO_LONG;
        return NULL;
    } // end if
    
    // open the sourcefile
    sourcefile = fopen(infile, "r");
    
    if (sourcefile == NULL) {
        *status = OBJM2_LEXER_STATUS_ERROR_OPENING_FILE;
        return NULL;
    } // end if
    
    // allocate a new lexer object with room for the pathname and its
    // terminator in the trailing filename member
    new_lexer = malloc(sizeof(objm2_lexer_s) + len + 1);
    
    if (new_lexer == NULL) {
        // do not leak the file handle opened above
        fclose(sourcefile);
        *status = OBJM2_LEXER_STATUS_UNABLE_TO_ALLOCATE;
        return NULL;
    } // end if
        
    // initialise the new lexer object

    // initialise status and return values
    
    new_lexer->sym = OBJM2_SYMBOL_FIELDS_CLEARED;
    
    new_lexer->status = OBJM2_LEXER_STATUS_UNDEFINED;
    
    // initialise counters
    SET_FPOS(new_lexer->current_pos, 1, 1);
    new_lexer->paren_nesting_level = 0;
    new_lexer->bracket_nesting_level = 0;
    new_lexer->brace_nesting_level = 0;
    
    // initialise lexer flags
    new_lexer->seen_backquote = false;
    new_lexer->seen_method = false;
    new_lexer->seen_open_paren_since_method = false;
    new_lexer->end_of_file = false;
        
    // lexeme buffer
    new_lexer->lexlen = 0;
    new_lexer->lexbuf[0] = CSTRING_TERMINATOR;
    
    // copy option parameters
    //
    // Each masked bit is normalised to true or false before it is stored,
    // so that the flags hold exactly 0 or 1 regardless of how bool is
    // defined and comparisons against true remain valid.
    new_lexer->options.trim_leading_zeroes = 
        ((options & OBJM2_LEXER_TRIM_LEADING_ZEROES_IN_LITERALS) != 0);
    new_lexer->options.printable_7bit_ascii_strings = 
        ((options & OBJM2_LEXER_PRINTABLE_7BIT_ASCII_STRINGS_ONLY) != 0);
    new_lexer->options.skip_pragmas = 
        ((options & OBJM2_LEXER_DONT_RETURN_PRAGMA_TOKEN) != 0);
    new_lexer->options.skip_c_comments = 
        ((options & OBJM2_LEXER_DONT_RETURN_C_COMMENT_TOKEN) != 0);
    new_lexer->options.skip_m2_comments = 
        ((options & OBJM2_LEXER_DONT_RETURN_M2_COMMENT_TOKEN) != 0);
    new_lexer->options.skip_cpp_comments = 
        ((options & OBJM2_LEXER_DONT_RETURN_CPP_COMMENT_TOKEN) != 0);

    // copy symbol table pointer
    new_lexer->symtab = symtab;
    
    // copy file handle and pathname,  including the pathname's terminator
    new_lexer->sourcefile = sourcefile;
    for (i = 0; i <= len; i++)
        new_lexer->filename[i] = infile[i];
    
    // return the initialised lexer object
    *status = OBJM2_LEXER_STATUS_SUCCESS;
    return (objm2_lexer_t) new_lexer;
} // end objm2_new_lexer;


// ---------------------------------------------------------------------------
// function:  objm2_lexer_getsym(lexer, sym, status)
// ---------------------------------------------------------------------------
//
// Reads the current symbol from the sourcefile of lexer <lexer>,  returns its
// token  and passes the symbol back in <sym>.  The status of the operation is
// passed back in <status>.
//
// Whitespace,  tab  and  EOL characters  preceding a symbol are skipped.  A
// pragma or comment whose skip option is set  is skipped as well  and the
// next symbol is read instead,  unless the status after skipping is
// OBJM2_LEXER_STATUS_COMMENT_NESTING_LIMIT_REACHED,  in which case the
// function returns with that status.
//
// If an illegal character is encountered,  then TOKEN_ILLEGAL_CHARACTER is
// returned,  the character is recorded in the symbol as its offending
// character  and  <status> is set to OBJM2_LEXER_STATUS_ILLEGAL_CHARACTER.
//
// If <lexer> or <sym> is NULL,  then TOKEN_ILLEGAL_CHARACTER is returned,
// <sym> is left untouched  and  <status> is set to
// OBJM2_LEXER_STATUS_INVALID_REFERENCE.  <status> must not be NULL.

objm2_token_t objm2_lexer_getsym(objm2_lexer_t lexer,
                                objm2_symbol_t sym,
                          objm2_lexer_status_t *status) {
    
    bool ignore_token;
    register uchar_t ch;
    register objm2_lexer_s *this_lexer = (objm2_lexer_s *) lexer;
        
    // assert pre-conditions
    if ((lexer == NULL) || (sym == NULL)) {
        *status = OBJM2_LEXER_STATUS_INVALID_REFERENCE;
        return TOKEN_ILLEGAL_CHARACTER;
    } // end if
    
    // peek at the first character
    ch = nextchar();
    
    // lexer main loop,  repeated only while skipped tokens are encountered
    while(true) {
        ignore_token = false;
        
        // skip all whitespace, tab and EOL characters
        while ((NOT_EOF(this_lexer)) &&
               ((ch == WHITESPACE) || (ch == TAB) || (ch == EOL))) {
            // skip the current character
            readchar();
            // take a peek at the next one
            ch = nextchar();
        } // end while;
        
        // clear symbol key
        objm2_symbol_set_key(this_lexer->sym, 0);
        
        // clear symbol flags
        objm2_symbol_clear_all_flags(this_lexer->sym);
        
        // remember position at the start of the symbol
        objm2_symbol_set_line_counter(this_lexer->sym, 
                                      this_lexer->current_pos.line);
        objm2_symbol_set_col_counter(this_lexer->sym, 
                                      this_lexer->current_pos.col);
        
        // remember nesting levels at the start of the symbol
        objm2_symbol_set_paren_nesting_level(this_lexer->sym,
                                             this_lexer->paren_nesting_level);
        objm2_symbol_set_bracket_nesting_level(this_lexer->sym,
                                             this_lexer->bracket_nesting_level);
        objm2_symbol_set_brace_nesting_level(this_lexer->sym,
                                             this_lexer->brace_nesting_level);
                                                    
        // start optimistically
        this_lexer->status = OBJM2_LEXER_STATUS_SUCCESS;
        
        // check for end-of-file
        if (EOF_REACHED(this_lexer)) {
            // found end of file
            objm2_symbol_set_token(this_lexer->sym, TOKEN_EOF_MARKER);
        } // end eof check
                
        // check for identifier or reserved word
        else if ((ch == UNDERSCORE) || (ch == DOLLAR) || (IS_LETTER(ch))) {
            // found identifier or reserved word
            ch = _get_ident(this_lexer);
            // check for method signature context,
            // this flag is cleared by semicolon
            if (this_lexer->sym.token == TOKEN_METHOD)
                this_lexer->seen_method = true;
        } // end identifier/reserved word check
        
        // check for numeric literal
        else if (IS_DIGIT(ch)) {
            // found numeric literal
            ch = _get_number(this_lexer);
        } // end numeric literal check
        
        // check for quoted character or string literal
        //
        // Both operands must be comparisons:  an assignment here would be
        // true for every character and hide all of the branches below.
        else if ((ch == SINGLE_QUOTE) || (ch == DOUBLE_QUOTE)) {
            // found quoted string or character literal
            ch = _get_quoted_char_or_string(this_lexer);
        } // end quoted character or string check
                
        // check for message prefix
        else if (ch == BACK_QUOTE) {
            readchar();
            ch = nextchar();
            objm2_symbol_set_token(this_lexer->sym, TOKEN_MESSAGE_PREFIX);
            this_lexer->seen_backquote = true;
            // NB: flag will be cleared by semicolon
        } // end message prefix check
         
        // check for dot
        else if (ch == DOT) {
            readchar();
            ch = nextchar();
            
            // check for single dot
            if (ch != DOT) {
                // is single dot
                objm2_symbol_set_token(this_lexer->sym, TOKEN_DOT);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
            }
            else /* is double dot  */ {
                readchar();
                ch = nextchar();
                objm2_symbol_set_token(this_lexer->sym, TOKEN_DOT_DOT);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
            } // end dot dot check
        } // end dot check
         
        // check for comma
        else if (ch == COMMA) {
            readchar();
            ch = nextchar();
            objm2_symbol_set_token(this_lexer->sym, TOKEN_COMMA);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
        } // end comma check
         
        // check for colon
        else if (ch == COLON) {
            readchar();
            ch = nextchar();
            // check for assign operator
            if (ch == EQUAL_SIGN) {
                // is assign operator
                readchar();
                ch = nextchar();
                objm2_symbol_set_token(this_lexer->sym, TOKEN_ASSIGN_OP);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
                objm2_symbol_set_flag(this_lexer->sym, is_operator);
            }
            else /* is colon */ {
                objm2_symbol_set_token(this_lexer->sym, TOKEN_COLON);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
            } // end assign operator check
        } // end colon check
        
        // check for semicolon
        else if (ch == SEMICOLON) {
            readchar();
            ch = nextchar();
            // clear any open method signature or message context
            this_lexer->seen_method = false;
            this_lexer->seen_open_paren_since_method = false;
            this_lexer->seen_backquote = false;
            // set token and flag
            // NOTE(review): assumes the token for ";" is declared as
            // TOKEN_SEMICOLON,  following the naming of the other
            // punctuation tokens -- confirm against the token definitions
            objm2_symbol_set_token(this_lexer->sym, TOKEN_SEMICOLON);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
        } // end of semicolon check
        
        // check for opening parenthesis
        else if (ch == OPENING_PARENTHESIS) {
            readchar();
            ch = nextchar();
            // check for M2 comment
            if (ch != ASTERISK) {
                // is opening parenthesis
                this_lexer->paren_nesting_level++;
                
                // flag cleared by closing parenthesis or semicolon
                if (this_lexer->seen_method)
                    this_lexer->seen_open_paren_since_method = true;
                    
                objm2_symbol_set_token(this_lexer->sym,
                                       TOKEN_OPENING_PARENTHESIS);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
            }
            else /* is M2 comment */ {
                if (this_lexer->options.skip_m2_comments) {
                    ch = _skip_m2_comment(this_lexer);
                    ignore_token = true;
                }
                else /* comment is treated as a token */ {
                    ch = _get_m2_comment(this_lexer);
                    objm2_symbol_set_token(this_lexer->sym, TOKEN_M2_COMMENT);
                } // end if
            } // end Modula-2 comment check
        } // end opening parenthesis check
        
        // check for closing parenthesis
        else if (ch == CLOSING_PARENTHESIS) {
            readchar();
            ch = nextchar();
            // NOTE(review): the nesting counters are unsigned,  an unmatched
            // closing parenthesis,  bracket or brace wraps them around
            this_lexer->paren_nesting_level--;
            
            if (this_lexer->seen_method)
                this_lexer->seen_open_paren_since_method = false;
            
            objm2_symbol_set_token(this_lexer->sym, TOKEN_CLOSING_PARENTHESIS);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);

        } // end closing parenthesis check
        
        // check for opening bracket
        else if (ch == OPENING_BRACKET) {
            readchar();
            ch = nextchar();
            this_lexer->bracket_nesting_level++;
            objm2_symbol_set_token(this_lexer->sym, TOKEN_OPENING_BRACKET);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
        } // end opening bracket check

        // check for closing bracket
        else if (ch == CLOSING_BRACKET) {
            readchar();
            ch = nextchar();
            this_lexer->bracket_nesting_level--;
            objm2_symbol_set_token(this_lexer->sym, TOKEN_CLOSING_BRACKET);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
        } // end closing bracket check

        // check for opening brace
        else if (ch == OPENING_BRACE) {
            readchar();
            ch = nextchar();
            this_lexer->brace_nesting_level++;
            objm2_symbol_set_token(this_lexer->sym, TOKEN_OPENING_BRACE);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
        } // end opening brace check

        // check for closing brace
        else if (ch == CLOSING_BRACE) {
            readchar();
            ch = nextchar();
            this_lexer->brace_nesting_level--;
            objm2_symbol_set_token(this_lexer->sym, TOKEN_CLOSING_BRACE);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
        } // end closing brace check

        // check for logical-and operator
        else if (ch == AMPERSAND) {
            readchar();
            ch = nextchar();
            objm2_symbol_set_token(this_lexer->sym, TOKEN_LOGICAL_AND_OP);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
            objm2_symbol_set_flag(this_lexer->sym, is_operator);
            objm2_symbol_set_flag(this_lexer->sym, is_first_order_operator);
        } // end logical-and operator check

        // check for logical-not operator
        else if (ch == TILDE) {
            readchar();
            ch = nextchar();
            objm2_symbol_set_token(this_lexer->sym, TOKEN_LOGICAL_NOT_OP);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
            objm2_symbol_set_flag(this_lexer->sym, is_operator);
            objm2_symbol_set_flag(this_lexer->sym, is_unary_operator);
        } // end logical-not operator check

        // check for equal operator
        else if (ch == EQUAL_SIGN) {
            readchar();
            ch = nextchar();
            objm2_symbol_set_token(this_lexer->sym, TOKEN_EQUAL_OP);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
            objm2_symbol_set_flag(this_lexer->sym, is_operator);
            objm2_symbol_set_flag(this_lexer->sym, is_relational_operator);
        } // end equal operator check
        
        // check for not-equal operator
        else if (ch == NUMBER_SIGN) {
            readchar();
            ch = nextchar();
            objm2_symbol_set_token(this_lexer->sym, TOKEN_NOT_EQUAL_OP);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
            objm2_symbol_set_flag(this_lexer->sym, is_operator);
            objm2_symbol_set_flag(this_lexer->sym, is_relational_operator);
        } // end not-equal operator check

        // check for exclamation
        else if (ch == EXCLAMATION) {
            readchar();
            ch = nextchar();
            if (ch == EQUAL_SIGN) {
                // is not-equal operator
                readchar();
                ch = nextchar();
                objm2_symbol_set_token(this_lexer->sym, TOKEN_NOT_EQUAL_OP);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
                objm2_symbol_set_flag(this_lexer->sym, is_operator);
                objm2_symbol_set_flag(this_lexer->sym, is_relational_operator);
            }
            else /* is logical-not operator */ {
                objm2_symbol_set_token(this_lexer->sym, TOKEN_LOGICAL_NOT_OP);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
                objm2_symbol_set_flag(this_lexer->sym, is_operator);
                objm2_symbol_set_flag(this_lexer->sym, is_unary_operator);
            } // end if
        } // end exclamation check

        // check for greater-than operator
        else if (ch == GREATER_THAN) {
            readchar();
            ch = nextchar();
            if (ch == EQUAL_SIGN) {
                // is greater-or-equal operator
                readchar();
                ch = nextchar();
                objm2_symbol_set_token(this_lexer->sym,
                                       TOKEN_GREATER_OR_EQUAL_OP);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
                objm2_symbol_set_flag(this_lexer->sym, is_operator);
                objm2_symbol_set_flag(this_lexer->sym, is_relational_operator);
            }
            else /* is greater-than operator */ {
                objm2_symbol_set_token(this_lexer->sym, TOKEN_GREATER_OP);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
                objm2_symbol_set_flag(this_lexer->sym, is_operator);
                objm2_symbol_set_flag(this_lexer->sym, is_relational_operator);
            } // end if
        } // end greater-than operator check

        // check for less-than operator
        else if (ch == LESS_THAN) {
            readchar();
            ch = nextchar();
            if (ch == EQUAL_SIGN) {
                // is less-or-equal operator
                readchar();
                ch = nextchar();
                objm2_symbol_set_token(this_lexer->sym, TOKEN_LESS_OR_EQUAL_OP);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
                objm2_symbol_set_flag(this_lexer->sym, is_operator);
                objm2_symbol_set_flag(this_lexer->sym, is_relational_operator);
            }
            else if (ch == ASTERISK) {
                // found pragma,  the asterisk has not been consumed yet
                if (this_lexer->options.skip_pragmas) {
                    ch = _skip_pragma(this_lexer);
                    ignore_token = true;
                }
                else /* pragma is treated as a token */ {
                    ch = _get_pragma(this_lexer);
                    objm2_symbol_set_token(this_lexer->sym, TOKEN_PRAGMA);
                } // end if
            }
            else /* is less-than operator */ { 
                objm2_symbol_set_token(this_lexer->sym, TOKEN_LESS_OP);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
                objm2_symbol_set_flag(this_lexer->sym, is_operator);
                objm2_symbol_set_flag(this_lexer->sym, is_relational_operator);
            } // end if
        } // end less-than operator check

        // check for plus operator
        else if (ch == PLUS) {
            readchar();
            ch = nextchar();
            if (ch != PLUS) {
                // is single plus
                objm2_symbol_set_token(this_lexer->sym, TOKEN_PLUS_OP);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
                objm2_symbol_set_flag(this_lexer->sym, is_operator);
                objm2_symbol_set_flag(this_lexer->sym, is_second_order_operator);
            }
            else /* is plus-plus */ {
                readchar();
                ch = nextchar();
                objm2_symbol_set_token(this_lexer->sym, TOKEN_INCREMENT_OP);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
            } // end if
        } // end plus operator check

        // check for minus operator
        else if (ch == MINUS) {
            readchar();
            ch = nextchar();
            if (ch != MINUS) {
                // is single minus
                objm2_symbol_set_token(this_lexer->sym, TOKEN_MINUS_OP);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
                objm2_symbol_set_flag(this_lexer->sym, is_operator);
                objm2_symbol_set_flag(this_lexer->sym, is_second_order_operator);
            }
            else /* is minus-minus */ {
                readchar();
                ch = nextchar();
                objm2_symbol_set_token(this_lexer->sym, TOKEN_DECREMENT_OP);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
            } // end if
        } // end minus operator check

        // check for multiply operator
        else if (ch == ASTERISK) {
            readchar();
            ch = nextchar();
            objm2_symbol_set_token(this_lexer->sym, TOKEN_MULTIPLY_OP);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
            objm2_symbol_set_flag(this_lexer->sym, is_operator);
            objm2_symbol_set_flag(this_lexer->sym, is_first_order_operator);
        } // end multiply operator check

        // check for divide operator
        else if (ch == FORWARD_SLASH) {
            readchar();
            ch = nextchar();
            // check for C comment
            if (ch == ASTERISK) {
                // found C comment
                if (this_lexer->options.skip_c_comments) {
                    ch = _skip_c_comment(this_lexer);
                    ignore_token = true;
                }
                else /* comment is treated as a token */ {
                    ch = _get_c_comment(this_lexer);
                    objm2_symbol_set_token(this_lexer->sym, TOKEN_C_COMMENT);
                } // end if
            }
            
            // check for BCPL/C++ comment
            else if (ch == FORWARD_SLASH) {
                // found BCPL/C++ comment
                if (this_lexer->options.skip_cpp_comments) {
                    ch = _skip_cpp_comment(this_lexer);
                    ignore_token = true;
                }
                else /* comment is treated as a token */ {
                    ch = _get_cpp_comment(this_lexer);
                    objm2_symbol_set_token(this_lexer->sym, TOKEN_CPP_COMMENT);
                } // end if
            }
            
            else /* is divide operator */ {
                objm2_symbol_set_token(this_lexer->sym, TOKEN_DIVIDE_OP);
                objm2_symbol_set_flag(this_lexer->sym, is_terminal);
                objm2_symbol_set_flag(this_lexer->sym, is_operator);
                objm2_symbol_set_flag(this_lexer->sym, is_first_order_operator);
            } // end if
        } // end divide operator check

        // check for pointer operator 
        else if (ch == CARET) {
            readchar();
            ch = nextchar();
            objm2_symbol_set_token(this_lexer->sym, TOKEN_POINTER_OP);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
        } // end pointer operator check

        // check for vertical bar
        else if (ch == VERTICAL_BAR) {
            readchar();
            ch = nextchar();
            objm2_symbol_set_token(this_lexer->sym, TOKEN_VERTICAL_BAR);
            objm2_symbol_set_flag(this_lexer->sym, is_terminal);
        } // end vertical bar check

        // illegal characters
        else /* any other character */ {
            // consume the character and record it in the symbol
            ch = readchar();
            objm2_symbol_set_offending_char(this_lexer->sym, ch);
            ch = nextchar();
            objm2_symbol_set_token(this_lexer->sym, TOKEN_ILLEGAL_CHARACTER);
            this_lexer->status = OBJM2_LEXER_STATUS_ILLEGAL_CHARACTER;
        } // end if
        
        // leave the loop unless a pragma or comment was skipped;  a skipped
        // comment that hit the nesting limit is reported to the caller
        if ((ignore_token == false) ||
            (this_lexer->status ==
                OBJM2_LEXER_STATUS_COMMENT_NESTING_LIMIT_REACHED))
            break;
            
    } // end loop;
    
    // return sym, status and token
    *sym = this_lexer->sym;
    *status = this_lexer->status;
    return objm2_symbol_token(this_lexer->sym);
} // end objm2_lexer_getsym;


// ---------------------------------------------------------------------------
// function:  objm2_lexer_pathname(lexer, status)
// ---------------------------------------------------------------------------
//
// Passes back the pathname that was copied into lexer <lexer>  when it was
// created.  <status> receives OBJM2_LEXER_STATUS_SUCCESS on success  and
// OBJM2_LEXER_STATUS_INVALID_REFERENCE if <lexer> is NULL.
//
// The returned string is owned by the lexer.  NULL is returned if <lexer> is
// NULL.

const char *objm2_lexer_pathname(objm2_lexer_t lexer,
                          objm2_lexer_status_t *status) {
    
    // guard clause:  no lexer, no pathname
    if (lexer == NULL) {
        *status = OBJM2_LEXER_STATUS_INVALID_REFERENCE;
        return NULL;
    } // end if
    
    *status = OBJM2_LEXER_STATUS_SUCCESS;
    return (char *) ((objm2_lexer_s *) lexer)->filename;
    
} // end objm2_lexer_pathname;


// ---------------------------------------------------------------------------
// function:  objm2_lexer_symtab(lexer, status)
// ---------------------------------------------------------------------------
//
// Passes back the symbol table reference that was stored in lexer <lexer>
// when it was created.  <status> receives OBJM2_LEXER_STATUS_SUCCESS on
// success  and  OBJM2_LEXER_STATUS_INVALID_REFERENCE if <lexer> is NULL.
//
// NULL is returned if <lexer> is NULL.

objm2_kvs_table_t objm2_lexer_symtab(objm2_lexer_t lexer,
                              objm2_lexer_status_t *status) {
    
    // guard clause:  no lexer, no symbol table
    if (lexer == NULL) {
        *status = OBJM2_LEXER_STATUS_INVALID_REFERENCE;
        return NULL;
    } // end if
    
    *status = OBJM2_LEXER_STATUS_SUCCESS;
    return ((objm2_lexer_s *) lexer)->symtab;
    
} // end objm2_lexer_symtab;


// ---------------------------------------------------------------------------
// function:  objm2_lexer_options(lexer)
// ---------------------------------------------------------------------------
//
// Returns the  option set  for lexer <lexer>.  The status of the operation is
// passed back in <status>.
//
// Returns zero if <lexer> is not a valid lexer object.

cardinal objm2_lexer_options(objm2_lexer_t lexer,
                      objm2_lexer_status_t *status) {

    // Rebuilds the numeric option set from the individual option fields of
    // the lexer object by summing one OBJM2_LEXER_* constant per field that
    // is set.  A NULL handle gives INVALID_REFERENCE and a result of zero.
    objm2_lexer_s *self = (objm2_lexer_s *) lexer;
    cardinal option_set = 0;

    // assert pre-condition
    if (lexer == NULL) {
        *status = OBJM2_LEXER_STATUS_INVALID_REFERENCE;
        return 0;
    } // end if

    // literal handling options
    if (self->options.trim_leading_zeroes == true) {
        option_set += OBJM2_LEXER_TRIM_LEADING_ZEROES_IN_LITERALS;
    } // end if

    if (self->options.printable_7bit_ascii_strings == true) {
        option_set += OBJM2_LEXER_PRINTABLE_7BIT_ASCII_STRINGS_ONLY;
    } // end if

    // token suppression options
    if (self->options.skip_pragmas == true) {
        option_set += OBJM2_LEXER_DONT_RETURN_PRAGMA_TOKEN;
    } // end if

    if (self->options.skip_c_comments == true) {
        option_set += OBJM2_LEXER_DONT_RETURN_C_COMMENT_TOKEN;
    } // end if

    if (self->options.skip_m2_comments == true) {
        option_set += OBJM2_LEXER_DONT_RETURN_M2_COMMENT_TOKEN;
    } // end if

    if (self->options.skip_cpp_comments == true) {
        option_set += OBJM2_LEXER_DONT_RETURN_CPP_COMMENT_TOKEN;
    } // end if

    *status = OBJM2_LEXER_STATUS_SUCCESS;
    return option_set;
} // end objm2_lexer_options;


// ---------------------------------------------------------------------------
// function:  objm2_reset_lexer(lexer, status)
// ---------------------------------------------------------------------------
//
// Resets the lexer to its  initialisation status  and  closes its sourcefile.
// The  symbol table  used  by the lexer is  NOT  modified.  The status of the
// operation is passed back in <status>.

void objm2_reset_lexer(objm2_lexer_t lexer,
                objm2_lexer_status_t *status) {
    
    // Puts the lexer back into its initial state:  the sourcefile is closed,
    // the current symbol, status, position counters, nesting levels, context
    // flags and lexeme buffer are cleared.  The symbol table reference is
    // left untouched.  A NULL handle gives INVALID_REFERENCE.
    objm2_lexer_s *this_lexer = (objm2_lexer_s *) lexer;

    // assert pre-condition
    if (lexer == NULL) {
        *status = OBJM2_LEXER_STATUS_INVALID_REFERENCE;
        return;
    } // end if
    
    // close sourcefile, unless it has been closed already
    //
    // A FILE pointer becomes invalid once it has been passed to fclose();
    // handing it to fclose() or getc() again is undefined behaviour.  The
    // reference is therefore cleared, which also makes a repeated reset safe.
    //
    // NOTE: the original author remarked that this should probably be a
    // rewind instead of a close.
    if (this_lexer->sourcefile != NULL) {
        fclose(this_lexer->sourcefile);
        this_lexer->sourcefile = NULL;
    } // end if
    
    // initialise status and return values
    this_lexer->sym = OBJM2_SYMBOL_FIELDS_CLEARED;
    this_lexer->status = OBJM2_LEXER_STATUS_UNDEFINED;
    
    // initialise counters
    SET_FPOS(this_lexer->current_pos, 0, 0);
    this_lexer->paren_nesting_level = 0;
    this_lexer->bracket_nesting_level = 0;
    this_lexer->brace_nesting_level = 0;
    
    // initialise lexer flags
    this_lexer->seen_backquote = false;
    this_lexer->seen_method = false;
    this_lexer->seen_open_paren_since_method = false;
    this_lexer->end_of_file = false;
        
    // lexeme buffer
    this_lexer->lexlen = 0;
    this_lexer->lexbuf[0] = CSTRING_TERMINATOR;

    *status = OBJM2_LEXER_STATUS_SUCCESS;
    return;
} // end objm2_reset_lexer;


// ---------------------------------------------------------------------------
// function:  objm2_dispose_lexer(lexer, status)
// ---------------------------------------------------------------------------
//
// Disposes of lexer object <lexer>  and  closes its sourcefile if it is open. 
// The  symbol table  used  by  the lexer is  NOT  disposed of.  The status of
// the operation is passed back in <status>.

void objm2_dispose_lexer(objm2_lexer_t lexer,
                  objm2_lexer_status_t *status) {
    
    // Releases the lexer object.  The sourcefile is closed if the lexer still
    // holds a reference to one,  then the object itself is deallocated.  The
    // symbol table is not touched.  A NULL handle gives INVALID_REFERENCE.
    objm2_lexer_s *this_lexer = (objm2_lexer_s *) lexer;

    // assert pre-condition
    if (lexer == NULL) {
        *status = OBJM2_LEXER_STATUS_INVALID_REFERENCE;
        return;
    } // end if
    
    // close the sourcefile only if there is one to close,
    // passing a NULL stream to fclose() is undefined behaviour
    if (this_lexer->sourcefile != NULL) {
        fclose(this_lexer->sourcefile);
    } // end if
    
    free(this_lexer);
    
    *status = OBJM2_LEXER_STATUS_SUCCESS;
    return;
} // end objm2_dispose_lexer;

// discard any earlier definitions of the readchar/nextchar shorthands,
// both are defined anew for the private functions further down
#undef readchar
#undef nextchar


// ==========================================================================
// P R I V A T E   F U N C T I O N   I M P L E M E N T A T I O N S
// ==========================================================================

// ---------------------------------------------------------------------------
// macros for private functions
// ---------------------------------------------------------------------------

// readchar() and nextchar() forward to _readchar() and _nextchar(), passing
// whatever variable is called "lexer" in the invoking scope.  The macro
// parameter v is a placeholder only, it is written as an empty argument.
#define readchar(v) _readchar(lexer) /* v = void */
#define nextchar(v) _nextchar(lexer) /* v = void */

// escape sequences are honoured unless the lexer has been restricted to
// printable 7-bit ASCII strings; reads "lexer" from the invoking scope
#define ALLOW_ESCAPED_CHARS NOT(lexer->options.printable_7bit_ascii_strings)

// true if a method has been seen without an opening parenthesis since,
// or if inside brackets, or if a backquote has been seen.  The parameter
// is parenthesised so that any pointer expression may be passed in.
#define IS_METHOD_SIGNATURE_CONTEXT(lexer) \
    (((lexer)->seen_method && !((lexer)->seen_open_paren_since_method)) \
     || ((lexer)->bracket_nesting_level > 0) || ((lexer)->seen_backquote))

// ---------------------------------------------------------------------------
// private function:  _readchar(lexer)
// ---------------------------------------------------------------------------
//
// Reads one character from sourcefile and returns it.  The lexer's column
// counter is incremented.  Returns the linefeed character (LF) if LF or
// carriage return (CR) or CRLF is read.  If LF is returned, the lexer's
// column counter will be reset to 1 and its line counter will be incremented.
//
// pre-conditions:
//
//  o  lexer is an initialised lexer object
//
// post-conditions:
//
//  o  new current character is the character read (consumed)
//  o  new lookahead character is the character following the character read
//  o  position counters are updated accordingly
//
// return-value:
//
//  o  read (consumed) character is returned

static fmacro uchar_t _readchar(objm2_lexer_s *lexer) {

    // Consumes one character from the sourcefile and keeps the lexer's
    // position counters in step.  All three end-of-line styles (LF, CR and
    // CRLF) are reported as a single NEWLINE.  When no character could be
    // read, the end-of-file flag is set and zero is returned.
    register int c;

#ifndef PRIV_FUNCS_DONT_CHECK_NULL_PARAMS
    if (lexer == NULL) return (uchar_t)0;
#endif

    // read one character from source file
    c = getc(lexer->sourcefile);
        
    // handle LF style end-of-line
    if (c == ASCII_LF) {
        lexer->current_pos.col = 1;
        lexer->current_pos.line++;
    }
    // handle CRLF and CR style end-of-line
    else if (c == ASCII_CR) {
        lexer->current_pos.col = 1;
        lexer->current_pos.line++;
        
        // swallow the LF of a CRLF pair, put anything else back;
        // ungetc() leaves the stream untouched if c happens to be EOF
        c = getc(lexer->sourcefile);
        if (c != NEWLINE) {
            ungetc(c, lexer->sourcefile);
        } // end if
        c = NEWLINE;
    }
    // handle end-of-file and read errors
    else if (c == EOF) {
        // feof() and ferror() only promise a non-zero value when the
        // respective indicator is set, so they must not be compared to true.
        // A read error also ends the input, otherwise any loop waiting for
        // the end-of-file flag would never terminate.
        lexer->end_of_file = (feof(lexer->sourcefile) != 0) ||
                             (ferror(lexer->sourcefile) != 0);
        c = 0;
    }
    else /* any other characters */ {
        // increment column counter
        lexer->current_pos.col++;
    } // end if
    
    // return character
    return (uchar_t) c;
} // end _readchar


// ---------------------------------------------------------------------------
// private function:  _nextchar(lexer)
// ---------------------------------------------------------------------------
//
// Returns the next character in sourcefile without incrementing the file
// pointer and without changing the lexer's column and line counters.
//
// pre-conditions:
//
//  o  lexer is an initialised lexer object
//
// post-conditions:
//
//  o  position counters remain unchanged
//
// return-value:
//
//  o  lookahead character is returned

static fmacro uchar_t _nextchar(objm2_lexer_s *lexer) {

    // Peeks at the upcoming character by reading it and pushing it straight
    // back.  The end-of-file flag is refreshed on every call:  it is set when
    // the push-back fails and cleared otherwise.  Position counters are not
    // touched.
    //
    // NOTE(review): a carriage return is passed through as it is, it is not
    // turned into NEWLINE the way _readchar does it.
    int peeked;

#ifndef PRIV_FUNCS_DONT_CHECK_NULL_PARAMS
    if (lexer == NULL) return (uchar_t)0;
#endif

    peeked = getc(lexer->sourcefile);

    if (ungetc(peeked, lexer->sourcefile) == EOF) {
        // nothing could be pushed back, report end of input
        lexer->end_of_file = true;
        peeked = 0;
    }
    else /* character is back in the stream */ {
        lexer->end_of_file = false;
    } // end if

    return (uchar_t) peeked;
} // end _nextchar


// ---------------------------------------------------------------------------
// private function:  _get_ident(lexer)
// ---------------------------------------------------------------------------
//
// pre-conditions:
//
//  o  lexer is an initialised lexer object
//  o  current character is the character immediately before the identifier
//
// post-conditions:
//
//  o  identifier is copied into the lexeme buffer and terminated
//  o  if the length of the identifier found exceeds the maximum length of
//     identifiers, only the significant characters are copied into the buffer
//  o  position counters are updated accordingly
//  o  the symbol table hash value of the significant part of the identifier
//     found is stored in last_ident_hash
//
// return-value:
//
//  o  new lookahead character is returned
//  o  token value is stored in the lexer's current symbol (lexer->sym),
//     there is no separate token parameter

static fmacro uchar_t _get_ident(objm2_lexer_s *lexer) {

    // Scans an identifier into the lexeme buffer, computes its hash key,
    // classifies it and enters the lexeme into the symbol table.
    //
    // Classification:
    //  o  in a method signature context, a trailing colon is made part of
    //     the lexeme and the token becomes TOKEN_LABELED_IDENTIFIER
    //  o  otherwise, a lexeme without any non-uppercase character is looked
    //     up in the table of reserved words and the index found is the token
    //  o  otherwise, the token is TOKEN_IDENTIFIER
    //
    // NOTE(review): at most OBJM2_MAX_IDENT_LENGTH characters are consumed,
    // any further identifier characters are left in the input.  Whether the
    // caller deals with them cannot be seen from here.
    //
    // NOTE(review): in a method signature context no token is set unless a
    // colon follows.  Presumably the caller has preset it -- to be confirmed.
    uint32_t key = OBJM2_HASH_INITIAL;
    ident_index_t len = 0;
    uchar_t ch;
    bool all_uppercase = true;
    bool leading_underscore;
    bool in_method_signature;
    uint16_t word_index;

#ifndef PRIV_FUNCS_DONT_CHECK_NULL_PARAMS
    if (lexer == NULL) return (uchar_t)0;
#endif

    // the first character is taken unconditionally
    ch = readchar();
    lexer->lexbuf[len] = ch;
    len++;
    key = OBJM2_HASH_NEXT_CHAR(key, ch);

    // a single non-uppercase character rules out a reserved word
    if (IS_NOT_UPPERCASE(ch)) {
        all_uppercase = false;
    } // end if

    leading_underscore = (ch == UNDERSCORE);

    // the remaining characters are taken one by one, as long as the
    // lookahead is a letter, digit, "_" or "$" and there is room left
    for (ch = nextchar();
         (NOT_EOF(lexer)) &&
         ((IS_ALPHANUM(ch)) || (ch == UNDERSCORE) || (ch == DOLLAR)) &&
         (len < OBJM2_MAX_IDENT_LENGTH);
         ch = nextchar()) {

        // consume the lookahead character
        ch = readchar();

        // append it to the lexeme and fold it into the hash
        lexer->lexbuf[len] = ch;
        len++;
        key = OBJM2_HASH_NEXT_CHAR(key, ch);

        if (IS_NOT_UPPERCASE(ch)) {
            all_uppercase = false;
        } // end if

        // flag any "_" after the first character
        if (ch == UNDERSCORE) {
            objm2_symbol_set_flag(lexer->sym, has_one_or_more_underscores);
        } // end if

        // flag any "$" after the first character
        if (ch == DOLLAR) {
            objm2_symbol_set_flag(lexer->sym, has_one_or_more_dollar_signs);
        } // end if
    } // end for

    in_method_signature = IS_METHOD_SIGNATURE_CONTEXT(lexer);

    if (in_method_signature) {

        // private objc method names start with underscores
        if (leading_underscore) {
            objm2_symbol_set_flag(lexer->sym, may_collide_with_priv_objc_name);
        } // end if

        // labeled identifiers (keywords in objc lingo) end with a colon
        if (ch == COLON) {

            // the colon is part of the lexeme
            ch = readchar();
            lexer->lexbuf[len] = ch;
            len++;
            key = OBJM2_HASH_NEXT_CHAR(key, ch);

            // new lookahead
            ch = nextchar();

            objm2_symbol_set_token(lexer->sym, TOKEN_LABELED_IDENTIFIER);
        } // end if
    } // end method signature check

    // the lexeme is complete, close hash and buffer
    key = OBJM2_HASH_FINAL(key);
    lexer->lexbuf[len] = CSTRING_TERMINATOR;

    if (!in_method_signature) {
        if (all_uppercase) {

            // index into the table of reserved words doubles as token value
            word_index =
                objm2_index_for_reserved_word((char *)lexer->lexbuf, key);
            objm2_symbol_set_token(lexer->sym, word_index);

            if (word_index < TOKEN_IDENTIFIER) {
                objm2_symbol_set_flag(lexer->sym, is_reserved_word);
            } // end if
        }
        else /* user defined identifier */ {

            // TO DO: recognise built-in identifiers here, they are to get
            // TOKEN_IDENTIFIER together with the is_builtin_ident flag
            objm2_symbol_set_token(lexer->sym, TOKEN_IDENTIFIER);
        } // end if
    } // end classification

    objm2_symbol_set_key(lexer->sym, key);
    lexer->lexlen = len;

    // add the lexeme to the symbol table
    objm2_kvs_add_entry(lexer->symtab, key, lexer->lexbuf, 0);

    return ch;
} // end _get_ident


// ---------------------------------------------------------------------------
// private function:  _get_number(lexer)
// ---------------------------------------------------------------------------
//
// pre-conditions:
//
//  o  lexer is an initialised lexer object
//  o  current character is the character immediately before the literal
//
// post-conditions:  /// TO DO: reformat post-conditions
//
//  the number literal is copied into the lexer's lexeme buffer, the token
//  value is stored in the lexer's current symbol, the character following
//  the literal is returned as the function's result, position counters are
//  updated.
//
// return values:
//
//  o  new lookahead character is returned as function result
//  o  token value is stored in the lexer's current symbol (lexer->sym)

static fmacro uchar_t _get_number(objm2_lexer_s *lexer) {
    
    // Scans a numeric literal into the lexeme buffer, enters it into the
    // symbol table and classifies it.  The token stored in lexer->sym is one
    // of the real, octal, ASCII char, sedecimal, unicode char or decimal
    // literal tokens.  Malformed literals are flagged in lexer->sym and
    // reported through lexer->status.  The new lookahead is returned.
    //
    // Accepted shapes, as implemented below:
    //  o  digits and uppercase hex digits, optionally followed by H or U;
    //     a trailing B or C is taken in as a hex digit and interpreted
    //     afterwards as octal or character code designator
    //  o  decimal digits, a decimal point, optional further digits and an
    //     optional scale factor  E [+|-] digits
    //
    // NOTE(review): a DOT directly after the digits is always taken as a
    // decimal point.  Whether range syntax such as 1..10 is dealt with
    // elsewhere cannot be seen from here.
    register uint32_t hash = OBJM2_HASH_INITIAL;
    register num_index_t index = 0;
    register uchar_t ch;
    register uint8_t encountered_A_to_F = 0;
    register uint8_t encountered_8_or_9 = 0;
    bool decimal_point_found = false;
    bool scientific_format = false;
    num_index_t offset_to_1st_significant_digit = 0;
    num_index_t offset_to_decimal_point = 0;
    num_index_t significant_digits;
    uchar_t first_digit, final_char;

#ifndef PRIV_FUNCS_DONT_CHECK_NULL_PARAMS
    if (lexer == NULL) return (uchar_t)0;
#endif

    // get the first digit
    ch = readchar();
    first_digit = ch;
    lexer->lexbuf[index] = first_digit;
    hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
    index++;
    
    // the first digit takes part in the octal digit check as well,
    // otherwise literals like 8B or 9C would pass as octal numbers
    if ((IS_DIGIT(first_digit)) && (first_digit >= DIGIT_EIGHT))
        encountered_8_or_9++;
    
    // trim leading zeroes if lexer options say so
    // the first zero stays in the buffer, only the following ones are dropped
    if ((lexer->options.trim_leading_zeroes) && (first_digit == DIGIT_ZERO)) {
        while (nextchar() == DIGIT_ZERO)
            readchar();
    } // end if
    
    // peek at the next digit
    ch = nextchar();
        
    // get all digits until non-digit is found
    while ((IS_UPPERHEX(ch)) &&
           (index < OBJM2_MAX_NUM_LENGTH) && (NOT_EOF(lexer))) {
        // keep count of the digits that rule out certain number bases
        if (ch >= UPPERCASE_A)
            encountered_A_to_F++;
        else if (ch >= DIGIT_EIGHT)
            encountered_8_or_9++;
        ch = readchar();
        lexer->lexbuf[index] = ch;
        hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
        index++;
        ch = nextchar();
    } // end while
    
    if ((ch == UPPERCASE_H) || (ch == UPPERCASE_U)) {
        // designator found
        ch = readchar();
        lexer->lexbuf[index] = ch;
        hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
        index++;
        ch = nextchar();
    }
    else if ((ch == DOT) && (encountered_A_to_F == 0)) {
        // decimal point found
        decimal_point_found = true;
        offset_to_decimal_point = index;
        
        // get the decimal point
        ch = readchar();
        lexer->lexbuf[index] = ch;
        hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
        index++;
        ch = nextchar();
        
        // get any remaining decimal digits after the decimal point
        while ((IS_DIGIT(ch)) &&
               (index < OBJM2_MAX_NUM_LENGTH) && (NOT_EOF(lexer))) {
            ch = readchar();
            lexer->lexbuf[index] = ch;
            hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
            index++;
            ch = nextchar();
        } // end while
        
        if (ch == UPPERCASE_E) {
            // scientific notation
            scientific_format = true;
            
            // get the scale factor
            ch = readchar();
            lexer->lexbuf[index] = ch;
            hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
            index++;
            ch = nextchar();
                
            // optional sign of the exponent
            if ((ch == PLUS) || (ch == MINUS)) {
                ch = readchar();
                lexer->lexbuf[index] = ch;
                hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
                index++;
                ch = nextchar();
            } // end if
            
            if (IS_DIGIT(ch)) {
                // get remaining digits
                while ((IS_DIGIT(ch)) &&
                       (index < OBJM2_MAX_NUM_LENGTH) &&
                       (NOT_EOF(lexer))) {
                    ch = readchar();
                    lexer->lexbuf[index] = ch;
                    hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
                    index++;
                    ch = nextchar();
                } // end while
            } // end remaining digits
        } // end scientific notation
    } // end decimal point branch
        
    // no more chars to read
    hash = OBJM2_HASH_FINAL(hash);
    objm2_symbol_set_key(lexer->sym, hash);
    lexer->lexbuf[index] = CSTRING_TERMINATOR;
    
    // add the lexeme to the symbol table
    objm2_kvs_add_entry(lexer->symtab, hash, lexer->lexbuf, 0);

    // all digits read, number verification follows ...
    
    // index is at least 1 here because the first digit is always stored
    final_char = lexer->lexbuf[index - 1];
    
    // find first significant digit
    // the terminator written above stops the scan for an all-zero lexeme
    while (lexer->lexbuf[offset_to_1st_significant_digit] == DIGIT_ZERO)
        offset_to_1st_significant_digit++;
    
    // type checking
    if (scientific_format) {
        // is real number literal
        objm2_symbol_set_token(lexer->sym, TOKEN_REAL_NUMBER_LITERAL);
        
        // verify number format
        if (IS_DIGIT(final_char)) {
            if /* not exactly one digit before decimal point */
             (offset_to_decimal_point - offset_to_1st_significant_digit != 1) {
                // non-normalised scientific notation error
                lexer->status =
                    OBJM2_LEXER_STATUS_MALFORMED_SCIENTIFIC_NOTATION;
                objm2_symbol_set_flag(lexer->sym, is_malformed_literal);
            } // end if
        }
        else /* does not end with a digit */ {
            // missing digits at the end of the number
            lexer->status = OBJM2_LEXER_STATUS_MALFORMED_SCIENTIFIC_NOTATION;
            objm2_symbol_set_flag(lexer->sym, is_malformed_literal);
        } // end if
    }
    else if (decimal_point_found) {
        // is real number literal
        objm2_symbol_set_token(lexer->sym, TOKEN_REAL_NUMBER_LITERAL);
        
        // verify number format
        if (IS_NOT_DIGIT(final_char)) {
            // illegal digit
            lexer->status = OBJM2_LEXER_STATUS_MALFORMED_NUMERIC_LITERAL;
            objm2_symbol_set_flag(lexer->sym, is_malformed_literal);
        } // end if
    }
    else if ((encountered_A_to_F == 1) && (encountered_8_or_9 == 0) &&
             (final_char == UPPERCASE_B)) {
        // octal digits only, followed by the B designator
        objm2_symbol_set_token(lexer->sym, TOKEN_OCTAL_INTEGER_LITERAL);
    }
    else if ((encountered_A_to_F == 1) && (encountered_8_or_9 == 0) &&
             (final_char == UPPERCASE_C)) {
        // octal digits only, followed by the C designator
        objm2_symbol_set_token(lexer->sym, TOKEN_ASCII_CHAR_LITERAL);
    }
    else if (encountered_A_to_F > 0) {
        // is sedecimal integer
        objm2_symbol_set_token(lexer->sym, TOKEN_SEDECIMAL_INTEGER_LITERAL);
        
        // verify number format
        if (final_char != UPPERCASE_H) {
            // sedecimal number missing "H" designator error
            lexer->status = OBJM2_LEXER_STATUS_MALFORMED_NUMERIC_LITERAL;
            objm2_symbol_set_flag(lexer->sym, is_malformed_literal);
        } // end if
    }
    else /* all digits are decimal */ {
        // sedecimal, unichar or decimal integer
        if (final_char == UPPERCASE_H) {
            objm2_symbol_set_token(lexer->sym, TOKEN_SEDECIMAL_INTEGER_LITERAL);
        }
        else if (final_char == UPPERCASE_U) {
            objm2_symbol_set_token(lexer->sym, TOKEN_UNICODE_CHAR_LITERAL);
        }
        else /* final char is digit */ {
            objm2_symbol_set_token(lexer->sym, TOKEN_DECIMAL_INTEGER_LITERAL);
        } // end if
    } // end type checking
    
    
    // range checking, if lexer options say so
    // currently disabled, the option it depends on is commented out
    if (0 /* lexer->options.check_range_for_char_literals */) {
        significant_digits = index - offset_to_1st_significant_digit - 1;

        if ((final_char == UPPERCASE_C) &&
            ((significant_digits > 3) || 
            ((significant_digits == 3) && (first_digit > DIGIT_ONE)))) {
            // value out of range for ascii
            lexer->status = OBJM2_LEXER_STATUS_ASCII_OVERFLOW;
            objm2_symbol_set_flag(lexer->sym, is_malformed_literal);
        }
        else if ((final_char == UPPERCASE_U) && (significant_digits > 4)) {
            // value out of range for unichar
            lexer->status = OBJM2_LEXER_STATUS_UNICHAR_OVERFLOW;
            objm2_symbol_set_flag(lexer->sym, is_malformed_literal);
        } // end if
    } // end range checking
    
    objm2_symbol_set_flag(lexer->sym, is_literal);
    
    return ch;
} // end _get_number


// ---------------------------------------------------------------------------
// private function:  _get_quoted_char_or_string(lexer)
// ---------------------------------------------------------------------------
//
// pre-conditions:
//
//  o  lexer is an initialised lexer object
//  o  lookahead character is quotation mark at the beginning of the literal
//
// post-conditions:  /// TO DO: update post-conditions
//
//  the current character is the quotation mark at the end of the string
//  literal
//
//  the character or string has been copied to the lexeme buffer, including
//  the enclosing quotation marks
//
//  the illegal_chars_skipped flag is set if any illegal characters were
//  found, illegal characters are not copied
//
// return-value:
//
//  o  new lookahead character is returned

static fmacro uchar_t _get_quoted_char_or_string(objm2_lexer_s *lexer) {
    
    // Scans a quoted literal into the lexeme buffer, including its quotation
    // marks, enters it into the symbol table and stores key, token
    // (TOKEN_STRING_LITERAL) and flags in lexer->sym.  The character that
    // opens the literal is also the one that closes it.
    //
    // Scanning stops at the closing delimiter, at end-of-line, at end of
    // input or once more than OBJM2_MAX_STRING_LENGTH characters have been
    // copied.  Control characters are dropped, and so are non 7-bit
    // characters if the lexer is restricted to printable 7-bit ASCII; either
    // marks the literal as malformed.
    //
    // NOTE(review): lexer->lexlen is not updated here -- confirm whether
    // callers rely on it for string literals.
    register uint32_t hash = OBJM2_HASH_INITIAL;
    register uchar_t *lex = (uchar_t *) lexer->lexbuf;
    register uchar_t ch;
    register uchar_t terminating_ch;
    register string_index_t len = 0;
    bool illegal_chars_skipped = false;
    bool non_7bit_ascii_chars_copied = false;

#ifndef PRIV_FUNCS_DONT_CHECK_NULL_PARAMS
    if (lexer == NULL) return (uchar_t)0;
#endif
    
    // get first character
    ch = readchar();
    
    // first character terminates
    terminating_ch = ch;
    
    // copy to lexeme buffer
    *lex = ch; lex++;
    
    // update hash value
    hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
    
    // peek at the next character
    ch = nextchar();
    
    // read all characters up to delimiter, maximum length, linefeed or EOF
    while ((ch != terminating_ch) &&
           (len <= OBJM2_MAX_STRING_LENGTH) && (ch != EOL) && NOT_EOF(lexer)) {
        // consume next character
        ch = readchar();
        
        // character check
        if (IS_CONTROL(ch)) {
            // skip but remember
            illegal_chars_skipped = true;
        }
        else if ((ch == BACKSLASH) && ALLOW_ESCAPED_CHARS) {
            // copy the decoded character of the escape sequence
            ch = _get_escaped_char(lexer);
            *lex = ch; lex++; len++;
            hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
        }
        else if (IS_7BIT_ASCII(ch)) {
            // copy to lexeme buffer and update hash
            *lex = ch; lex++; len++;
            hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
        }
        else /* not control and not 7-bit */ {
            if (lexer->options.printable_7bit_ascii_strings) {
                // skip but remember
                illegal_chars_skipped = true;
            }
            else /* non 7-bit chars are allowed */ {
                // copy to lexeme buffer and update hash
                *lex = ch; lex++; len++;
                hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
                non_7bit_ascii_chars_copied = true;
            } // end if
        } // character check
        
        // peek at the next character
        ch = nextchar();
    } // end while
    
    // check if delimiter follows
    if (ch == terminating_ch) {
        // consume delimiter
        ch = readchar();
        
        // copy to lexeme buffer
        *lex = ch; lex++;

         // update hash value
        hash = OBJM2_HASH_NEXT_CHAR(hash, ch);
    }
    else if (len > OBJM2_MAX_STRING_LENGTH) {
        lexer->status = OBJM2_LEXER_STATUS_STRING_TOO_LONG;
        objm2_symbol_set_flag(lexer->sym, is_malformed_literal);
    }
    else if (EOF_REACHED(lexer)) {
        // input ended before the closing delimiter was found
        //
        // This used to require an end-of-line lookahead in addition, which
        // can never coincide with the end-of-file flag:  the lookahead is
        // zero whenever the flag is set.
        lexer->status = OBJM2_LEXER_STATUS_STRING_NOT_TERMINATED;
        objm2_symbol_set_flag(lexer->sym, is_malformed_literal);
    }
    else /* end-of-line within the literal */ {
        lexer->status = OBJM2_LEXER_STATUS_STRING_HAS_ILLEGAL_CHARS;
        objm2_symbol_set_flag(lexer->sym, is_malformed_literal);
    } // end if
    
    // no more characters to read
    *lex = CSTRING_TERMINATOR;
    hash = OBJM2_HASH_FINAL(hash);
    
    // add the lexeme to the symbol table
    objm2_kvs_add_entry(lexer->symtab, hash, lexer->lexbuf, 0);
    
    objm2_symbol_set_key(lexer->sym, hash);
    objm2_symbol_set_token(lexer->sym, TOKEN_STRING_LITERAL);
    objm2_symbol_set_flag(lexer->sym, is_literal);
    
    if (illegal_chars_skipped) {
        objm2_symbol_set_flag(lexer->sym, is_malformed_literal);
    } // end if
    
    if (non_7bit_ascii_chars_copied) {
        objm2_symbol_set_flag(lexer->sym, has_non_7bit_ascii_chars);
    } // end if
    
    return ch;
} // end _get_quoted_char_or_string


// ---------------------------------------------------------------------------
// private function:  _get_escaped_char(lexer)
// ---------------------------------------------------------------------------
//
// pre-conditions:
//
//  o  lexer is an initialised lexer object
//  o  current character is *assumed* to be backslash
//
// post-conditions:
//
//  if the assumed backslash starts an escape sequence ...
//
//  o  the current character is the last character in the escape sequence
//  o  the lookahead character is the character following the escape sequence
//  o  line and column counters are updated
//
// if the assumed backslash does not start an escape sequence ...
//
//  o  current character, lookahead character, line and column counters
//     remain unchanged
//
// return-value:
//
// if the assumed backslash starts an escape sequence ...
//
//  o  the escaped character is returned
//
// if the assumed backslash does not start an escape sequence ...
//
//  o  a backslash is returned

static fmacro uchar_t _get_escaped_char(objm2_lexer_s *lexer) {

    // Decodes the escape sequence whose backslash has just been consumed by
    // the caller.  The lookahead selects the result:
    //
    //   "  '   the quotation mark itself
    //   0      ASCII_NUL
    //   n      LINEFEED
    //   r      CARRIAGE_RETURN
    //   t      TAB
    //   \      BACKSLASH
    //
    // For any other lookahead nothing is consumed and a backslash is
    // returned.
    uchar_t decoded;

#ifndef PRIV_FUNCS_DONT_CHECK_NULL_PARAMS
    if (lexer == NULL) return (uchar_t)0;
#endif

    // decide by the lookahead, the backslash itself is only assumed
    switch (nextchar()) {
        case DOUBLE_QUOTE :
        case SINGLE_QUOTE :
            decoded = nextchar();
            break;
        case DIGIT_ZERO :
            decoded = ASCII_NUL;
            break;
        case LOWERCASE_N :
            decoded = LINEFEED;
            break;
        case LOWERCASE_R :
            decoded = CARRIAGE_RETURN;
            break;
        case LOWERCASE_T :
            decoded = TAB;
            break;
        case BACKSLASH :
            decoded = BACKSLASH;
            break;
        default :
            // not an escape sequence, leave the input untouched
            return BACKSLASH;
    } // end switch

    // consume the character that completed the escape sequence
    readchar();

    return decoded;
} // end _get_escaped_char


// ---------------------------------------------------------------------------
// private function:  _skip_c_comment(lexer)
// ---------------------------------------------------------------------------
//
// pre-condition:
//
//  o  lexer is an initialised lexer object
//  o  lookahead character is the asterisk following the forward slash
//     at the start of the comment
//
// post-conditions:
//
//  o  the lookahead character is the character following the forward slash
//     at the end of the comment
//  o  the lexer's line and column counters have been updated
//
// return-value:
//
//  o  new lookahead character is returned

static fmacro uchar_t _skip_c_comment(objm2_lexer_s *lexer) {

    // Discards a C style comment up to and including its closing delimiter,
    // or up to the end of input if the comment is never closed, and returns
    // the lookahead that follows.
    bool comment_closed = false;

#ifndef PRIV_FUNCS_DONT_CHECK_NULL_PARAMS
    if (lexer == NULL) return (uchar_t)0;
#endif

    // the asterisk of the opening delimiter is still pending, drop it
    readchar();

    // drop characters one at a time until the comment has been closed
    while (!comment_closed && NOT_EOF(lexer)) {
        if (readchar() == ASTERISK) {
            if (nextchar() == FORWARD_SLASH) {
                // drop the forward slash of the closing delimiter as well
                readchar();
                comment_closed = true;
            } // end if
        } // end if
    } // end while

    // lookahead is now the character after the comment
    return nextchar();
} // end _skip_c_comment


// ---------------------------------------------------------------------------
// private function:  _get_c_comment(lexer)
// ---------------------------------------------------------------------------
//
// pre-condition:
//
//  o  lexer is an initialised lexer object
//  o  lookahead character is the asterisk following the forward slash
//     at the start of the comment
//
// post-conditions:
//
//  lexbuf contains the comment without the comment delimiters and trimmed
//  of leading and trailing whitespace and tabs, the comment is truncated
//  if it exceeds the length of the lexeme buffer
//  
//  the current character is the forward slash at the end of the comment
//
// return-value:
//
//  o  new lookahead character is returned

static fmacro uchar_t _get_c_comment(objm2_lexer_s *lexer) {
    
    // Copies the text of a C style comment into the lexeme buffer, without
    // the delimiters and without leading or trailing whitespace and tabs.
    // The comment is always consumed up to and including its closing
    // delimiter (or up to the end of input);  text that does not fit into
    // the lexeme buffer is dropped.  The lookahead that follows the comment
    // is returned.
    //
    // NOTE(review): the terminator may be written at index
    // OBJM2_LEXER_LEX_BUFFER_CAPACITY.  This assumes that lexbuf has room
    // for one character more than the capacity -- to be confirmed.
    register uchar_t ch, nextch;
    register comment_index_t i = 0;
    
#ifndef PRIV_FUNCS_DONT_CHECK_NULL_PARAMS
    if (lexer == NULL) return (uchar_t)0;
#endif

    // skip the asterisk
    readchar();
    ch = nextchar();
    
    // skip any leading whitespace and tabs
    while ((NOT_EOF(lexer)) && ((ch == WHITESPACE) || (ch == TAB))) {
        // skip the current character
        readchar();
        // take a peek at the next one
        ch = nextchar();
    } // end while;
    
    // lookahead so far, this is the result if the input ends right here
    nextch = ch;
    
    // read all characters until end-of-file or end-of-comment is found
    while (NOT_EOF(lexer)) {
        ch = readchar();
        nextch = nextchar();
        
        if ((ch == ASTERISK) && (nextch == FORWARD_SLASH)) {
            // end-of-comment found, the asterisk has been consumed already,
            // so only the forward slash remains to be skipped
            readchar();
            nextch = nextchar();
            break;
        } // end if
        
        // copy the character as long as there is room, keep on reading
        // otherwise so that the rest of the comment is not taken for code
        if (i < OBJM2_LEXER_LEX_BUFFER_CAPACITY) {
            lexer->lexbuf[i] = ch;
            i++;
        } // end if
    } // end while
    
    lexer->lexbuf[i] = CSTRING_TERMINATOR;
    
    // trim lexeme buffer of any trailing whitespace and tabs
    while ((i > 0) &&
           ((lexer->lexbuf[i-1] == WHITESPACE) || (lexer->lexbuf[i-1] == TAB))) {
        lexer->lexbuf[i-1] = CSTRING_TERMINATOR;
        i--;
    } // end while
    
    return nextch;
} // end _get_c_comment

// ---------------------------------------------------------------------------
// private function:  _skip_m2_comment(lexer)
// ---------------------------------------------------------------------------
//
// pre-condition:
//
//  o  lexer is an initialised lexer object
//  o  current character is the asterisk following the opening parenthesis
//     at the start of the comment
//
// post-conditions:
//
//  o  the current character is the character following the closing
//     parenthesis at the end of the comment and the lexer's status
//     field has been set to OBJM2_LEXER_STATUS_SUCCESS
//  o  if the maximum comment nesting level has been exceeded, then the
//     remainder of the file has been skipped until EOF has been reached
//     and the lexer's status field has been set to
//     OBJM2_LEXER_STATUS_COMMENT_NESTING_LIMIT_REACHED
//  o  the lexer's line and column counters have been updated
//
// return-value:
//
//  o  new lookahead character is returned

// Skips a Modula-2 style comment, including any comments nested within it.
//
// On entry the lookahead character is the asterisk of the opening
// delimiter. Characters are consumed until the closing delimiter that
// matches the outermost opening delimiter has been consumed or end-of-file
// is reached.
//
// Both characters of every delimiter are consumed as a unit. The asterisk
// of an opening delimiter can therefore not be mistaken for the start of a
// closing delimiter, an opening delimiter immediately followed by a closing
// parenthesis does not close a comment.
//
// status outcomes:
//
//  o  OBJM2_LEXER_STATUS_SUCCESS if the comment was closed
//  o  OBJM2_LEXER_STATUS_COMMENT_NESTING_LIMIT_REACHED if more than
//     OBJM2_MAX_NESTED_COMMENTS comments were open at the same time,
//     in which case the remainder of the file has been skipped
//  o  left unchanged if end-of-file was reached inside the comment
//
// Returns the lookahead character following the comment, or zero if lexer
// is NULL and parameter checking is enabled.

static fmacro uchar_t _skip_m2_comment(objm2_lexer_s *lexer) {

    register uchar_t ch, nextch;
    cardinal open_comment_count = 1;
   
#ifndef PRIV_FUNCS_DONT_CHECK_NULL_PARAMS
    if (lexer == NULL) return (uchar_t)0;
#endif

    // consume the asterisk of the opening delimiter
    readchar();
    
    // skip all characters until end-of-file or end-of-comment is found
    while (NOT_EOF(lexer) && (open_comment_count > 0)) {
        ch = readchar();
        nextch = nextchar();
        if ((ch == OPENING_PARENTHESIS) && (nextch == ASTERISK)) {
            open_comment_count++;
            if (open_comment_count > OBJM2_MAX_NESTED_COMMENTS) {
                // skip to the end of the file
                while (NOT_EOF(lexer))
                    readchar();
                lexer->status =
                    OBJM2_LEXER_STATUS_COMMENT_NESTING_LIMIT_REACHED;
                break;
            } // end if
            // consume the asterisk of the nested opening delimiter
            readchar();
        }
        else if ((ch == ASTERISK) && (nextch == CLOSING_PARENTHESIS)) {
            open_comment_count--;
            // consume the parenthesis of the closing delimiter
            readchar();
        } // end if
    } // end while
    
    if (open_comment_count == 0)
        lexer->status = OBJM2_LEXER_STATUS_SUCCESS;
    
    // return the character following the comment
    return nextchar();
} // end _skip_m2_comment


// ---------------------------------------------------------------------------
// private function:  _get_m2_comment(lexer)
// ---------------------------------------------------------------------------
//
// pre-condition:
//
//  o  lexer is an initialised lexer object
//  o  current character is the asterisk following the opening parenthesis
//     at the start of the comment
//
// post-conditions:
//
//  lexbuf contains the comment without the comment delimiters and trimmed
//  of leading and trailing whitespace and tabs, the comment is truncated
//  if it exceeds the length of the lexeme buffer
//  
//  the current character is the character following the closing
//  parenthesis at the end of the comment
//
// return-value:
//
//  o  new lookahead character is returned

// Reads the body of a Modula-2 style comment into the lexeme buffer.
//
// On entry the lookahead character is the asterisk of the opening
// delimiter. Leading whitespace and tabs are skipped, then characters are
// copied to lexer->lexbuf until the closing delimiter that matches the
// outermost opening delimiter has been consumed or end-of-file is reached.
// The delimiters of nested comments are consumed but not copied. Once the
// buffer holds OBJM2_LEXER_LEX_BUFFER_CAPACITY characters, further
// characters are consumed but discarded. Trailing whitespace and tabs are
// trimmed from the stored text.
//
// Each character is consumed exactly once before it is examined, so no
// character is stored twice and the character following the comment is
// left in place as lookahead.
//
// status outcomes:
//
//  o  OBJM2_LEXER_STATUS_SUCCESS if the comment was closed and fit
//  o  OBJM2_LEXER_STATUS_COMMENT_TOO_LONG if the comment was closed but
//     had to be truncated
//  o  OBJM2_LEXER_STATUS_COMMENT_NESTING_LIMIT_REACHED if more than
//     OBJM2_MAX_NESTED_COMMENTS comments were open at the same time,
//     in which case the remainder of the file has been skipped
//  o  left unchanged if end-of-file was reached inside the comment
//
// Returns the lookahead character following the comment, or zero if lexer
// is NULL and parameter checking is enabled.

static fmacro uchar_t _get_m2_comment(objm2_lexer_s *lexer) {

    register comment_index_t i = 0;
    register uchar_t ch, nextch;
    cardinal open_comment_count = 1;
    int truncated = 0;
    
#ifndef PRIV_FUNCS_DONT_CHECK_NULL_PARAMS
    if (lexer == NULL) return (uchar_t)0;
#endif

    // consume the asterisk of the opening delimiter
    readchar();
    nextch = nextchar();
    
    // skip any leading whitespace and tabs
    while ((NOT_EOF(lexer)) &&
           ((nextch == WHITESPACE) || (nextch == TAB))) {
        // skip the current character
        readchar();
        // take a peek at the next one
        nextch = nextchar();
    } // end while

    // read all characters until end-of-file or end-of-comment is found
    while (NOT_EOF(lexer) && (open_comment_count > 0)) {
        ch = readchar();
        nextch = nextchar();
        if ((ch == OPENING_PARENTHESIS) && (nextch == ASTERISK)) {
            open_comment_count++;
            if (open_comment_count > OBJM2_MAX_NESTED_COMMENTS) {
                // report the error the same way _skip_m2_comment does
                // instead of terminating the process: skip to the end
                // of the file and flag the condition in the status field
                while (NOT_EOF(lexer))
                    readchar();
                lexer->status =
                    OBJM2_LEXER_STATUS_COMMENT_NESTING_LIMIT_REACHED;
                break;
            } // end if
            // consume the asterisk of the nested opening delimiter
            readchar();
        }
        else if ((ch == ASTERISK) && (nextch == CLOSING_PARENTHESIS)) {
            open_comment_count--;
            // consume the parenthesis of the closing delimiter only,
            // the character following it must remain the lookahead
            readchar();
        }
        else if (i < OBJM2_LEXER_LEX_BUFFER_CAPACITY) {
            // copy character to lexeme buffer
            lexer->lexbuf[i] = ch;
            i++;
        }
        else {
            // lexeme buffer is full, discard the character but keep on
            // reading so that the whole comment is consumed
            truncated = 1;
        } // end if
    } // end while
    
    // terminate the lexeme buffer
    // NOTE(review): i may equal OBJM2_LEXER_LEX_BUFFER_CAPACITY here, this
    // assumes lexbuf has room for capacity + 1 characters -- TODO confirm
    lexer->lexbuf[i] = CSTRING_TERMINATOR;
    
    // trim lexeme buffer of any trailing whitespace and tabs
    while ((i > 0) &&
        ((lexer->lexbuf[i-1] == WHITESPACE) || (lexer->lexbuf[i-1] == TAB))) {
        lexer->lexbuf[i-1] = CSTRING_TERMINATOR;
        i--;
    } // end while

    // report the outcome if the comment has been closed properly
    if (open_comment_count == 0) {
        if (truncated)
            lexer->status = OBJM2_LEXER_STATUS_COMMENT_TOO_LONG;
        else
            lexer->status = OBJM2_LEXER_STATUS_SUCCESS;
    } // end if

    // return the character following the comment
    return nextchar();
} // end _get_m2_comment


// ---------------------------------------------------------------------------
// private function:  _skip_cpp_comment(lexer)
// ---------------------------------------------------------------------------
//
// pre-condition:
//
//  o  lexer is an initialised lexer object
//  o  current character is the forward slash at the start of the comment
//
// post-conditions:
//
//  o  the new lookahead character is the character after the comment
//  o  the lexer's line and column counters have been updated
//  
// return-value:
//
//  o  the new lookahead character is returned

// Skips a C++ style comment up to and including its end-of-line marker.
//
// On entry the lookahead character is the second forward slash of the
// opening delimiter. Characters are consumed until end-of-file is reached
// or the lookahead character is the end-of-line marker, then one further
// character is consumed unconditionally.
//
// Returns the lookahead character after the comment, or zero if lexer is
// NULL and parameter checking is enabled.

static fmacro uchar_t _skip_cpp_comment(objm2_lexer_s *lexer) {

#ifndef PRIV_FUNCS_DONT_CHECK_NULL_PARAMS
    if (lexer == NULL) {
        return (uchar_t)0;
    } // end if
#endif

    // consume the second forward slash of the opening delimiter
    readchar();
    
    // consume the comment text, one character per iteration
    for (;;) {
        // nothing left to read
        if (!(NOT_EOF(lexer)))
            break;
        
        // end-of-line marker is next, the comment text ends here
        if (nextchar() == EOL)
            break;
        
        readchar();
    } // end for
    
    // consume the end-of-line marker
    readchar();
    
    // the lookahead is now the first character after the end-of-line marker
    return nextchar();
} // end _skip_cpp_comment


// ---------------------------------------------------------------------------
// private function:  _get_cpp_comment(lexer)
// ---------------------------------------------------------------------------
//
// pre-condition:
//
//  o  lexer is an initialised lexer object
//  o  current character is the forward slash at the start of the comment
//
// post-conditions:
//
//  o  comment is copied to lexeme buffer and stored in table symtab
//  o  any control characters in the comment have been ignored
//  o  if the comment did not start at the beginning of the line, then
//     both leading and trailing whitespace and tabs have been trimmed
//  o  if the comment started at the beginning of the line, then only
//     trailing whitespace and tabs have been trimmed
//  o  if the comment exceeded OBJM2_MAX_COMMENT_LENGTH, then it was
//     truncated at the maximum length
//  o  the comment's hash value has been copied to key field in the
//     lexer's sym field
//  o  the lexer's sym field's token field contains TOKEN_CPP_COMMENT
//  o  if truncation occurred, then the lexer's status field is set to
//     OBJM2_LEXER_STATUS_COMMENT_TOO_LONG and its sym field's flags
//     have the excess_chars_truncated flag set
//  o  if no truncation occurred, then the lexer's status field is set
//     to OBJM2_LEXER_STATUS_SUCCESS
//  o  the new lookahead character is the character after the comment
//  o  the lexer's line and column counters have been updated
//  
// return-value:
//
//  o  the new lookahead character is returned

// Reads a C++ style comment into the lexeme buffer, enters it into the
// lexer's symbol table and records its hash and token in lexer->sym.
//
// On entry the lookahead character is the second forward slash of the
// opening delimiter. If the comment did not start at the beginning of the
// line (column greater than 2), leading whitespace and tabs are skipped.
// Characters for which IS_NOT_CONTROL holds are copied until end-of-file,
// end-of-line or the length limit is reached. Trailing whitespace and tabs
// are trimmed. Any remaining characters up to and including the
// end-of-line marker are consumed.
//
// status outcomes:
//
//  o  OBJM2_LEXER_STATUS_SUCCESS if the comment fit
//  o  OBJM2_LEXER_STATUS_COMMENT_TOO_LONG if more than
//     OBJM2_MAX_COMMENT_LENGTH characters were found, in which case the
//     excess_chars_truncated flag is also set on lexer->sym
//
// Returns the lookahead character after the comment, or zero if lexer is
// NULL and parameter checking is enabled.

static fmacro uchar_t _get_cpp_comment(objm2_lexer_s *lexer) {

    register uint32_t hash;
    register uchar_t ch;
    register uchar_t *lex;
    register comment_index_t len = 0;
    
#ifndef PRIV_FUNCS_DONT_CHECK_NULL_PARAMS
    if (lexer == NULL) return (uchar_t)0;
#endif

    // only touch the lexer after the NULL check above
    lex = (uchar_t *)&lexer->lexbuf;

    // skip the second forward slash
    readchar();
    ch = nextchar();
    
    // success unless set otherwise in due course
    lexer->status = OBJM2_LEXER_STATUS_SUCCESS;
    
    // skip leading whitespace and tabs if this comment did not start the line
    if (lexer->current_pos.col > 2) {
        while ((NOT_EOF(lexer)) && ((ch == WHITESPACE) || (ch == TAB))) {
            // skip the current character
            readchar();
            // take a peek at the next one, this must be an assignment,
            // otherwise ch never changes and the loop never terminates
            ch = nextchar();
        } // end while
    } // end if
    
    // copy all characters until end-of-file or end-of-line is found,
    // one character more than the maximum is accepted on purpose so that
    // an overrun can be told apart from a comment of exactly maximum length
    while ((NOT_EOF(lexer)) &&
           (nextchar() != EOL) && (len <= OBJM2_MAX_COMMENT_LENGTH)) {
        ch = readchar();
        
        // copy to lexeme buffer except for control characters
        if (IS_NOT_CONTROL(ch)) {
            *lex = ch; lex++; len++;
        } // end if
        ch = nextchar();
    } // end while

    // check for maximum length overrun
    if (len > OBJM2_MAX_COMMENT_LENGTH) {
        // drop the one excess character that was accepted above
        lex--; len--;
        lexer->status = OBJM2_LEXER_STATUS_COMMENT_TOO_LONG;
        objm2_symbol_set_flag(lexer->sym, excess_chars_truncated);
    } // end if
    
    // trim any trailing whitespace and tabs
    while (len > 0) {
        lex--;
        if ((*lex != WHITESPACE) && (*lex != TAB)) {
            // step forward again to the position after the last character
            lex++;
            break;
        } // end if
        len--;
    } // while
    
    // terminate the lexeme
    *lex = CSTRING_TERMINATOR;
    
    // calculate the hash for the trimmed lexeme
    lex = (uchar_t *)&lexer->lexbuf;
    hash = OBJM2_HASH_INITIAL;
    while (*lex != ASCII_NUL) {
        hash = OBJM2_HASH_NEXT_CHAR(hash, *lex);
        lex++;
    } // end while
    hash = OBJM2_HASH_FINAL(hash);
    
    // add the lexeme to the symbol table
    objm2_kvs_add_entry(lexer->symtab, hash, lexer->lexbuf, 0);

    // pass hash and token
    objm2_symbol_set_key(lexer->sym, hash);
    objm2_symbol_set_token(lexer->sym, TOKEN_CPP_COMMENT);

    // skip any remaining characters in the file until end-of-line,
    // ch holds the lookahead character at this point
    while ((NOT_EOF(lexer)) && (ch != EOL)) {
        readchar();
        ch = nextchar();
    } // end while
    
    // skip past the end-of-line marker
    if (ch == EOL) {
        readchar();
        ch = nextchar();
    } // end if
            
    // return the lookahead character
    return ch;
} // end _get_cpp_comment


// END OF FILE