/* Objective Modula-2 Compiler (objm2c)
 *
 *  @file objm2_lexer.h
 *  Objective Modula-2 lexer interface
 *
 *  Lexical analysis for Objective Modula-2 source files
 *
 *  Author: Benjamin Kowarsch
 *
 *  Copyright (C) 2009 Sunrise Telephone Systems KK. All rights reserved.
 *
 *  License:
 *
 *  Permission is hereby granted to review and test this software for the sole
 *  purpose of supporting the effort by the licensor to define and develop the
 *  Objective Modula-2 language. It is not permissible under any circumstances
 *  to  use the software  for the purpose  of creating derivative languages or 
 *  dialects.  This permission is valid until 31 December 2009, 24:00h GMT.
 *
 *  Future licensing:
 *
 *  The licensor undertakes to eventually release this software under a proper
 *  open source license  AFTER  the Objective Modula-2 language definition has
 *  been finalised and a conforming and working reference compiler completed.
 *  
 *  Version history:
 *
 *   2.00   2009-01-31   BK   new file from various spin-offs of v.1.x
 *          2009-03-25   BK   introduced new symbol type
 */

// ---------------------------------------------------------------------------
// The lexer reads  symbols  from its input file  and returns tokens following
// the syntax and semantic rules below.
//
//  Maximum length of symbols:
//
//   Identifiers: defined by OBJM2_MAX_IDENT_LENGTH
//
//   Numeric literals: defined by OBJM2_MAX_NUM_LENGTH
//
//   String literals: defined by OBJM2_MAX_STRING_LENGTH
//
//   ASCII literals: 3 significant octal digits
//
//   UNICHAR literals: 4 significant sedecimal digits
//
//   Quoted character literals: 1 (printable) or 2 (escaped) characters
//
//   Comments: defined by OBJM2_MAX_COMMENT_LENGTH
//
//  Shortening of lexemes:
//
//   Identifiers:
//    Any identifier exceeding the maximum length for identifiers  as  defined
//    by OBJM2_MAX_IDENT_LENGTH  will be truncated to the maximum.  Characters
//    ignored in this way will not be copied into the lexer's lexeme buffer.
//
//   Numeric literals:
//    If the lexer option  OBJM2_LEXER_TRIM_LEADING_ZEROES_IN_LITERALS  is set
//    symbols representing numeric literals  will  be  trimmed  of any leading
//    zeroes in excess of one.  Leading zeroes ignored in this way will not be
//    copied into  the lexer's lexeme buffer and  therefore they do  not count
//    when determining the length of the numeric literal.
//
//    The lexer will return errors for any numeric literals which exceed their
//    respective maximum length,  numeric literals will not be truncated.
//
//   String literals:
//    The lexer will return errors  for any string literals which exceed their
//    maximum length  as defined by  OBJM2_MAX_STRING_LENGTH,  string literals
//    will not be truncated.
//
//   Comments:
//    Lexer options control whether comments are returned as tokens.  Comments
//    which  are returned as tokens  are subject to  trimming  of leading  and
//    trailing  whitespace and tabs,  except for comments  which  start at the
//    first column in the source file.  However,  single line comments  always
//    have trailing whitespace and tabs removed, regardless of position.
//
//    Comments which are returned as tokens are further  subject to truncation
//    if their length  exceeds the  maximum length for comments  as defined by
//    OBJM2_MAX_COMMENT_LENGTH.  If a  nested  Modula-2  comment  exceeds  the
//    maximum length while its current nesting level is not zero, then it will
//    be  truncated  such  that the lexeme buffer can hold the closing comment
//    delimiters required to close  all  open nested comments,  which are then
//    automatically appended by the lexer.
//
//    However,  if  a  nested comment  exceeds  the  maximum nesting limit  as
//    defined by OBJM2_MAX_NESTED_COMMENTS,  then  the lexer will  abort  with
//    a fatal error.
//
//  Range checking:
//
//   ASCII literals:
//    The lexer will return an error for any ASCII literals with a value which
//    is greater than octal 177 (decimal 127).
//
//   UNICHAR literals:
//    The lexer  will return  an error  for any UNICHAR literals  with a value
//    which is greater than sedecimal FFFF (decimal 65535).
//
//  Unrecognised symbols, whitespace, tab, end-of-line and end-of-file ...
//
//   Illegal characters:
//    For  any  symbol  found in the  input file  which  cannot  be  resolved
//    following the syntax defined below, the token  TOKEN_ILLEGAL_CHAR  will
//    be returned.
//
//   Whitespace and tab:
//     Whitespace and tab terminate any symbol other than comments and quoted
//     strings.  No token will be returned for whitespace or tab.
//
//   End-of-line marker:
//     End-of-line markers terminate  any  symbol  other  than  comments  and
//     quoted strings.  No token will be returned for an end-of-line marker.
//
//   End-of-file marker:
//     The token  TOKEN_EOF_MARKER  will be returned when the underlying file
//     system has signalled  that the end of the source file has been reached
//     at the previous attempt to read from the input file.
//
//  Context dependent symbols ...
//
//   Labeled identifiers:
//     A labeled identifier  can  only  occur  within a method declaration  or
//     within  a  message.  Within this context,  the lexer will treat a colon
//     followed  by an identifier  as part of the identifier  and return token
//     TOKEN_LABELED_IDENTIFIER.  Within  any  other  context,  the lexer will
//     not treat a colon as part of an identifier.
//
//
//  Terminal symbols are defined by the following EBNF syntax ...
//
//   (0) illegal-char =
//        any symbol which cannot be resolved following productions 1-52
//
//   (1) keyword =
//        "AND" | "ARRAY" | "BEGIN" | "BY" | "CASE" | "CONST" | "DEFINITION" |
//        "DIV" | "DO" | "ELSE" | "ELSIF" | "END" | "EXIT" | "EXPORT" |
//        "FOR" | "FROM" | "IF" | "IMPLEMENTATION" | "IMPORT" | "IN" |
//        "LOOP" | "MOD" | "MODULE" | "NOT" | "OF" | "OR" | "POINTER" |
//        "PROCEDURE" | "QUALIFIED" | "RECORD" | "REPEAT" | "RETURN" | "SET" |
//        "THEN" | "TO" | "TYPE" | "UNTIL" | "VAR" | "WHILE" | "WITH" |
//        "ADOPTS" | "AS" | "CLASS" | "EXTENDS" | "INSTANCE" | "METHOD" |
//        "PROTOCOL" | "RESTRICTED" | "SELF" | "SUBCLASSES"
//
//   (2) identifier =
//        ( "_" | "$" | letter ) { "_" | "$" | letter | digit }
//
//  (2a) labeled-identifier =
//        identifier ":"
//
//   (3) octal-integer-literal =
//        octal-digit { octal-digit } "B"
//
//   (4) decimal-integer-literal =
//        digit { digit }
//
//   (5) sedecimal-integer-literal =
//        digit { sedecimal-digit } "H"
//
//   (6) real-number-literal =
//        ( digit { digit } "." digit { digit } ) |
//        ( digit "." digit { digit } "E" [ "+" | "-" ] digit { digit } )
//
//   (7) octal-digit =
//        "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7"
//
//   (8) digit =
//        octal-digit | "8" | "9"
//
//   (9) non-decimal-digit =
//        "A" | "B" | "C" | "D" | "E" | "F"
//
//  (10) sedecimal-digit =
//        digit | non-decimal-digit
//
//  (11) 7-bit-ascii-character-code-literal =
//        { "0" } [ "1" ] [ octal-digit ] octal-digit "C"
//
//  (12) unichar-literal =
//        { "0" } ( digit | ( "0" non-decimal-digit ) )
//        [ sedecimal-digit ] [ sedecimal-digit ] [ sedecimal-digit ] "U"
//
//  (13) quoted-char-literal =
//        ( "'" character "'" ) | ( '"' character '"' )
//
//  (14) string-literal =
//        (( "'" { character } "'" ) | ( '"' { character } '"' ))
//        { "..." (( "'" { character } "'" ) | ( '"' { character } '"' )) }
//
//  (15) character =
//        printable-char | escaped-char
//
//  (16) printable-char =
//        " " | "!" | '"' | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" |
//        "+" | "," | "-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" |
//        "@" | "[" | "]" | "^" | "_" | "`" | "{" | "|" | "}" | "~" |
//        letter | digit
//
//  (17) escaped-char =
//        "\" ( "'" | '"' | "\" | "n" | "r" | "t" | "0" )
//
//  (18) assign-operator =
//        ":="
//
//  (19) message-prefix =
//        "`"
//
//  (20) logical-and-operator =
//        "&"
//
//  (21) logical-not-operator =
//        "~" | "!"
//
//  (22) equal-operator =
//        "="
//
//  (23) not-equal-operator =
//        "#" | "<>" | "!="
//
//  (24) greater-operator =
//        ">"
//
//  (25) greater-or-equal-operator =
//        ">="
//
//  (26) less-operator =
//        "<"
//
//  (27) less-or-equal-operator =
//        "<="
//
//  (28) plus-operator =
//        "+"
//
//  (29) increment-operator =
//        "++"
//
//  (30) minus-operator =
//        "-"
//
//  (31) decrement-operator =
//        "--"
//
//  (32) multiply-operator =
//        "*"
//
//  (33) divide-operator =
//        "/"
//
//  (34) pointer-operator =
//        "^"
//
//  (35) opening-parenthesis =
//        "("
//
//  (36) closing-parenthesis =
//        ")"
//
//  (37) opening-bracket =
//        "["
//
//  (38) closing-bracket =
//        "]"
//
//  (39) opening-brace =
//        "{"
//
//  (40) closing-brace =
//        "}"
//
//  (41) dot =
//        "."
//
//  (42) double-dot =
//        ".."
//
//  (43) comma =
//        ","
//
//  (44) colon =
//        ":"
//
//  (45) semicolon =
//        ";"
//
//  (46) vertical-bar =
//        "|"
//
//  (47) c-comment =
//        "/*" { printable-char | end-of-line-marker } "*/"
//
//  (48) m2-comment =
//        "(*" { printable-char | end-of-line-marker | { m2-comment } } "*)"
//
//  (49) cpp-comment =
//        "//" { printable-char } end-of-line-marker
//
//  (50) pragma =
//        "<*" ( letter { "_" letter | letter } ) "=" quoted-string "*>"
//
//  (51) end-of-line-marker =
//        [ ASCII-CR ] ASCII-LF
//
//  (52) end-of-file-marker =
//        system specific
//
//  Productions  1, 2, 2a, 12, 13, 14, 15, 17, 19, 21, 24, 29, 31, 47 and 49
//  either contain or represent ObjM2 language extensions.
// ---------------------------------------------------------------------------


#ifndef OBJM2_LEXER_H
#define OBJM2_LEXER_H


#include "common_types.h"
#include "objm2_symbols.h"
#include "objm2_key_value_storage.h"


// --------------------------------------------------------------------------
// Opaque lexer handle type
// --------------------------------------------------------------------------
//
// Handle type by which lexer objects are passed to and returned from the
// functions declared in this interface.
//
// The underlying type opaque_t is not declared in this file;  it is expected
// to be provided by one of the headers included above  (presumably
// common_types.h -- TODO confirm).
//
// WARNING: Objects of this opaque type should only be accessed through this
// public interface.  DO NOT EVER attempt to bypass the public interface.
//
// The internal data structure of this opaque type is HIDDEN  and  MAY CHANGE
// at any time WITHOUT NOTICE. Accessing the internal data structure directly
// other than through the  functions  in this public interface is  UNSAFE and
// may result in an inconsistent program state or a crash.

typedef opaque_t objm2_lexer_t;


// --------------------------------------------------------------------------
// Status codes
// --------------------------------------------------------------------------

// Type objm2_lexer_status_t:  status codes which the lexer functions pass
// back through their <status> parameter.  The numeric value of each code is
// spelled out so that it can be read off directly;  there is no code zero.

typedef enum {

    // --- general ----------------------------------------------------------

    // placeholder value, the only negative status code
    OBJM2_LEXER_STATUS_UNDEFINED                     = -1,

    // the operation completed without error
    OBJM2_LEXER_STATUS_SUCCESS                       =  1,

    // the lexer argument is not a valid pointer to a lexer object
    OBJM2_LEXER_STATUS_INVALID_REFERENCE             =  2,

    // --- opening the sourcefile -------------------------------------------

    // no sourcefile exists at the specified path
    OBJM2_LEXER_STATUS_FILE_NOT_FOUND                =  3,

    // permission to access the specified sourcefile was denied
    OBJM2_LEXER_STATUS_FILE_ACCESS_DENIED            =  4,

    // sourcefile could not be opened, the open file limit has been reached
    OBJM2_LEXER_STATUS_OPEN_FILE_LIMIT_REACHED       =  5,

    // the specified pathname is longer than the maximum pathname length
    OBJM2_LEXER_STATUS_PATH_NAME_TOO_LONG            =  6,

    // the specified pathname contains a circular reference
    OBJM2_LEXER_STATUS_LOOP_IN_PATHNAME              =  7,

    // opening the sourcefile failed for any other reason
    OBJM2_LEXER_STATUS_ERROR_OPENING_FILE            =  8,

    // --- resources --------------------------------------------------------

    // memory allocation failed
    OBJM2_LEXER_STATUS_UNABLE_TO_ALLOCATE            =  9,

    // --- lexical errors ---------------------------------------------------

    // an illegal character was found in the input
    OBJM2_LEXER_STATUS_ILLEGAL_CHARACTER             = 10,

    // real number in scientific notation is not normalised,
    // that is, it has more than one digit before the decimal point
    OBJM2_LEXER_STATUS_MALFORMED_SCIENTIFIC_NOTATION = 11,

    // literal designated as octal contains non-octal digits
    OBJM2_LEXER_STATUS_MALFORMED_OCTAL_LITERAL       = 12,

    // literal designated as ASCII code contains non-octal digits
    OBJM2_LEXER_STATUS_MALFORMED_ASCII_LITERAL       = 13,

    // ASCII code literal lies outside of the ASCII range (0..127)
    OBJM2_LEXER_STATUS_ASCII_OVERFLOW                = 14,

    // UNICHAR literal lies outside of the Unicode range (0..65535)
    OBJM2_LEXER_STATUS_UNICHAR_OVERFLOW              = 15,

    // numeric literal contains illegal characters or digits
    OBJM2_LEXER_STATUS_MALFORMED_NUMERIC_LITERAL     = 16,

    // numeric literal is longer than the maximum length
    OBJM2_LEXER_STATUS_NUMBER_TOO_LONG               = 17,

    // string literal is not terminated
    OBJM2_LEXER_STATUS_STRING_NOT_TERMINATED         = 18,

    // string literal is longer than the maximum length
    OBJM2_LEXER_STATUS_STRING_TOO_LONG               = 19,

    // string literal contains illegal characters
    OBJM2_LEXER_STATUS_STRING_HAS_ILLEGAL_CHARS      = 20,

    // nested M2 comment is nested deeper than the maximum nesting level
    OBJM2_LEXER_STATUS_COMMENT_NESTING_LIMIT_REACHED = 21,

    // comment is longer than the maximum length for comments
    OBJM2_LEXER_STATUS_COMMENT_TOO_LONG              = 22

} objm2_lexer_status_t;


// ---------------------------------------------------------------------------
// Lexer option identifiers
// ---------------------------------------------------------------------------

// Every single option occupies one bit,  options are combined into a bitset
// by OR-ing them together.

// numeric literals:  trim leading zeroes in excess of one
#define OBJM2_LEXER_TRIM_LEADING_ZEROES_IN_LITERALS     0x01

// NOTE(review): by its name, restricts string literals to printable 7-bit
// ASCII characters -- confirm against the lexer implementation
#define OBJM2_LEXER_PRINTABLE_7BIT_ASCII_STRINGS_ONLY   0x02

// NOTE(review): by its name, suppresses the token for pragmas -- confirm
#define OBJM2_LEXER_DONT_RETURN_PRAGMA_TOKEN            0x04

// comments of the respective kind are not returned as tokens
#define OBJM2_LEXER_DONT_RETURN_C_COMMENT_TOKEN         0x08
#define OBJM2_LEXER_DONT_RETURN_M2_COMMENT_TOKEN        0x10
#define OBJM2_LEXER_DONT_RETURN_CPP_COMMENT_TOKEN       0x20

// union of the three comment options above
#define OBJM2_LEXER_DONT_RETURN_ANY_COMMENT_TOKEN \
    (OBJM2_LEXER_DONT_RETURN_C_COMMENT_TOKEN | \
     OBJM2_LEXER_DONT_RETURN_M2_COMMENT_TOKEN | \
     OBJM2_LEXER_DONT_RETURN_CPP_COMMENT_TOKEN)

// union of all options defined above
#define OBJM2_LEXER_ALL_OPTIONS \
    (OBJM2_LEXER_TRIM_LEADING_ZEROES_IN_LITERALS | \
     OBJM2_LEXER_PRINTABLE_7BIT_ASCII_STRINGS_ONLY | \
     OBJM2_LEXER_DONT_RETURN_PRAGMA_TOKEN | \
     OBJM2_LEXER_DONT_RETURN_ANY_COMMENT_TOKEN)


// ---------------------------------------------------------------------------
// Macro:  OBJM2_LEXER_OPTION_IS_SET(option_to_test, option_flags)
// ---------------------------------------------------------------------------
//
// The  replacement expression  evaluates to  true (1)  if  <option_to_test>
// is set in bitset <option_flags>,  otherwise it evaluates to  false (0).
//
// If <option_to_test> is a mask of several option bits,  the result is true
// if  at least one  of those bits is set in <option_flags>.
//
// The result is normalised to 0 or 1  rather than being the raw masked bits
// so that it may safely be compared against,  or stored as,  a boolean.
// Both arguments are evaluated exactly once.

#define OBJM2_LEXER_OPTION_IS_SET(option_to_test, option_flags) \
    ((((option_to_test) & (option_flags)) != 0))


// --------------------------------------------------------------------------
// function:  objm2_new_lexer(infile, symtab, options, status)
// --------------------------------------------------------------------------
//
// Constructor.  Creates a new objm2 lexer object  which is associated with
// source file <infile>,  symbol table <symtab>  and  option set <options>.
//
// Parameters:
//   infile   source file to be associated with the new lexer
//   symtab   symbol table to be associated with the new lexer
//   options  options for the new lexer
//   status   passes back the status of the operation
//
// Returns the new lexer object,  or NULL if no lexer object could be created.

objm2_lexer_t objm2_new_lexer(
    const char *infile,
    objm2_kvs_table_t symtab,
    uint16_t options,
    objm2_lexer_status_t *status);


// ---------------------------------------------------------------------------
// function:  objm2_lexer_getsym(lexer, sym, status)
// ---------------------------------------------------------------------------
//
// Reads the current symbol from the sourcefile of lexer <lexer>.
//
// Parameters:
//   lexer   lexer whose sourcefile is to be read
//   sym     passes back the symbol which has been read
//   status  passes back the status of the operation
//
// Returns the token of the symbol which has been read.
//
// When an illegal character is encountered,  token TOKEN_ILLEGAL_CHARACTER
// is returned  and  the ASCII code of the offending character is passed back
// in the key field of the symbol.
//
// NOTE(review): the lexer description at the top of this file calls this
// token TOKEN_ILLEGAL_CHAR -- confirm the spelling in objm2_symbols.h.

objm2_token_t objm2_lexer_getsym(
    objm2_lexer_t lexer,
    objm2_symbol_t sym,
    objm2_lexer_status_t *status);


// ---------------------------------------------------------------------------
// function:  objm2_lexer_pathname(lexer, status)
// ---------------------------------------------------------------------------
//
// Accessor for the pathname of the sourcefile of lexer <lexer>.
//
// Parameters:
//   lexer   lexer whose sourcefile pathname is requested
//   status  passes back the status of the operation
//
// Returns the pathname,  or NULL if <lexer> is not a valid lexer object.

const char *objm2_lexer_pathname(
    objm2_lexer_t lexer,
    objm2_lexer_status_t *status);


// ---------------------------------------------------------------------------
// function:  objm2_lexer_symtab(lexer, status)
// ---------------------------------------------------------------------------
//
// Accessor for the symbol table which is used by lexer <lexer>.
//
// Parameters:
//   lexer   lexer whose symbol table is requested
//   status  passes back the status of the operation
//
// Returns a pointer to the symbol table,  or NULL  if <lexer> is not a valid
// lexer object.

objm2_kvs_table_t objm2_lexer_symtab(
    objm2_lexer_t lexer,
    objm2_lexer_status_t *status);


// ---------------------------------------------------------------------------
// function:  objm2_lexer_options(lexer, status)
// ---------------------------------------------------------------------------
//
// Accessor for the option set of lexer <lexer>.
//
// Parameters:
//   lexer   lexer whose option set is requested
//   status  passes back the status of the operation
//
// Returns the option set,  or zero if <lexer> is not a valid lexer object.
//
// NOTE(review): the option set is returned as type cardinal  whereas it is
// passed to objm2_new_lexer() as type uint16_t -- confirm that the two types
// are meant to differ.

cardinal objm2_lexer_options(
    objm2_lexer_t lexer,
    objm2_lexer_status_t *status);


// ---------------------------------------------------------------------------
// function:  objm2_reset_lexer(lexer, status)
// ---------------------------------------------------------------------------
//
// Puts lexer <lexer>  back into its initialisation status  and  closes  the
// sourcefile of the lexer.  The symbol table used by the lexer is left as it
// is,  it is NOT modified.
//
// Parameters:
//   lexer   lexer to be reset
//   status  passes back the status of the operation

void objm2_reset_lexer(
    objm2_lexer_t lexer,
    objm2_lexer_status_t *status);


// ---------------------------------------------------------------------------
// function:  objm2_dispose_lexer(lexer, status)
// ---------------------------------------------------------------------------
//
// Destructor.  Disposes of lexer object <lexer>.  If the sourcefile  of the
// lexer is open,  it is closed.  The symbol table  used by the lexer is left
// in place,  it is NOT disposed of.
//
// Parameters:
//   lexer   lexer object to be disposed of
//   status  passes back the status of the operation

void objm2_dispose_lexer(
    objm2_lexer_t lexer,
    objm2_lexer_status_t *status);


#endif /* OBJM2_LEXER_H */

// END OF FILE