/* WebSVN – tendra.SVN – /trunk/src/producers/common/parse/lex.c */

/*
 * Copyright (c) 2002-2005 The TenDRA Project <http://www.tendra.org/>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of The TenDRA Project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $Id$
 */
/*
                 Crown Copyright (c) 1997

    This TenDRA(r) Computer Program is subject to Copyright
    owned by the United Kingdom Secretary of State for Defence
    acting through the Defence Evaluation and Research Agency
    (DERA).  It is made available to Recipients with a
    royalty-free licence for its use, reproduction, transfer
    to other parties and amendment for any purpose not excluding
    product development provided that any such use et cetera
    shall be deemed to be acceptance of the following conditions:-

        (1) Its Recipients shall ensure that this Notice is
        reproduced upon any copies or amended versions of it;

        (2) Any amended version of it shall be clearly marked to
        show both the nature of and the organisation responsible
        for the relevant amendment or amendments;

        (3) Its onward transfer from a recipient to another
        party shall be deemed to be that party's acceptance of
        these conditions;

        (4) DERA gives no warranty or assurance as to its
        quality or suitability for any purpose and DERA accepts
        no liability whatsoever in relation to any use to which
        it may be put.
*/


#include "config.h"
#include <limits.h>
#if FS_MULTIBYTE
#include <locale.h>
#endif
#include "c_types.h"
#include "exp_ops.h"
#include "hashid_ops.h"
#include "id_ops.h"
#include "member_ops.h"
#include "str_ops.h"
#include "error.h"
#include "catalog.h"
#include "option.h"
#include "buffer.h"
#include "char.h"
#include "constant.h"
#include "file.h"
#include "dump.h"
#include "hash.h"
#include "lex.h"
#include "literal.h"
#include "macro.h"
#include "parse.h"
#include "pragma.h"
#include "preproc.h"
#include "print.h"
#include "syntax.h"
#include "ustring.h"
#include "xalloc.h"


/*
    PARSER OPTIONS

    These flags control the behaviour of the parser and determine whether
    such features as trigraphs and digraphs are allowed.  Three of them
    take their initial value from the compile-time LANGUAGE_CPP setting.
*/

/* Non-zero lets adjust_trigraph replace '??x' sequences */
int allow_trigraphs = 1;
/* NOTE(review): presumably enables digraph tokens; not read in this section */
int allow_digraphs = 1;
/* NOTE(review): presumably enables unicode escapes; not read in this section */
int allow_unicodes = LANGUAGE_CPP;
/* NOTE(review): presumably enables multibyte characters; not read here */
int allow_multibyte = 1;
/* Tested by START_CPP_COMMENT when recognising the start of a '//' comment */
int allow_cpp_comments = LANGUAGE_CPP;
/* Tested by read_newline and read_eof; switched by set_char_lookup */
int allow_dos_newline = 0;
/* NOTE(review): not read in this section; meaning to be confirmed */
int allow_extra_symbols = 0;
/* Selects whether init_keywords enters the ISO keywords as keywords or as
   reserved identifiers */
int allow_iso_keywords = LANGUAGE_CPP;
/* Non-zero lets skip_string and read_string continue past a newline inside
   a string literal (it is still reported) */
int allow_newline_strings = 0;
/* NOTE(review): not read in this section; meaning to be confirmed */
int analyse_comments = 1;
/* NOTE(review): presumably a limit on identifier length; not read here */
unsigned long max_id_length = 1024;


/*
    TABLE OF SYMBOLS AND KEYWORDS

    This table gives the mapping between lexical token numbers and the
    corresponding symbols and keywords.  It is derived from the list of
    tokens in symbols.h.
*/

CONST char *token_names[] = {
/* Each LEX_TOKEN use pulled in from symbols.h contributes its second
   argument as one table entry */
#define LEX_TOKEN(A, B, C)              (B),
#include "symbols.h"
#undef LEX_TOKEN
        /* The table is terminated by a null entry */
        NULL
};


/*
    TRANSLATE A LEXICAL TOKEN TO ITS PRIMARY FORM

    Each alternative spelling of a token (a lex_*_H2 value, covering the
    alternative ISO keywords and the digraphs) is mapped to the matching
    lex_*_H1 value.  Any other token number is returned unchanged.
*/

int
primary_form(int t)
{
        switch (t) {
        case lex_and_H2:
                return(lex_and_H1);
        case lex_and_Heq_H2:
                return(lex_and_Heq_H1);
        case lex_close_Hbrace_H2:
                return(lex_close_Hbrace_H1);
        case lex_close_Hsquare_H2:
                return(lex_close_Hsquare_H1);
        case lex_compl_H2:
                return(lex_compl_H1);
        case lex_hash_H2:
                return(lex_hash_H1);
        case lex_hash_Hhash_H2:
                return(lex_hash_Hhash_H1);
        case lex_logical_Hand_H2:
                return(lex_logical_Hand_H1);
        case lex_logical_Hor_H2:
                return(lex_logical_Hor_H1);
        case lex_not_H2:
                return(lex_not_H1);
        case lex_not_Heq_H2:
                return(lex_not_Heq_H1);
        case lex_open_Hbrace_H2:
                return(lex_open_Hbrace_H1);
        case lex_open_Hsquare_H2:
                return(lex_open_Hsquare_H1);
        case lex_or_H2:
                return(lex_or_H1);
        case lex_or_Heq_H2:
                return(lex_or_Heq_H1);
        case lex_xor_H2:
                return(lex_xor_H1);
        case lex_xor_Heq_H2:
                return(lex_xor_Heq_H1);
        default:
                /* Already in primary form */
                break;
        }
        return(t);
}


/*
    NORMALISE AND REPORT A DIGRAPH

    The token t is converted to its primary form using primary_form.  If
    this changes the token then the replacement is reported at the current
    location.  The primary form is returned in either case.
*/

int
get_digraph(int t)
{
        int primary = primary_form(t);
        if (primary == t) {
                /* Nothing was replaced, so there is nothing to report */
                return(t);
        }
        update_column();
        report(crt_loc, ERR_lex_digraph_replace(t, primary));
        return(primary);
}


/*
    ENTER A KEYWORD INTO THE IDENTIFIER TABLE

    This routine installs id as a meaning of the hash table entry nm.  If
    id is the null identifier then a keyword identifier is built first,
    carrying the lexical token number key.  Its tag is id_iso_keyword_tag
    for keys in the ISO keyword or symbol ranges, id_reserved_tag when key
    is lex_unknown (a reserved identifier) and id_keyword_tag otherwise.
    The cached meaning of nm is cleared and, if do_keyword is set, the
    identifier is passed to dump_declare.  Finally id is linked into the
    chain of meanings of nm immediately in front of the first dummy,
    keyword, ISO keyword or reserved entry, and is returned.
*/

IDENTIFIER
make_keyword(HASHID nm, int key, IDENTIFIER id)
{
        PTR(IDENTIFIER)ptr = hashid_id(nm);

        if (IS_NULL_id(id)) {
                /* Choose the identifier tag from the token number */
                unsigned tag;
                if ((key >= FIRST_ISO_KEYWORD && key <= LAST_ISO_KEYWORD) ||
                    (key >= FIRST_SYMBOL && key <= LAST_SYMBOL)) {
                        tag = id_iso_keyword_tag;
                } else if (key == lex_unknown) {
                        tag = id_reserved_tag;
                } else {
                        tag = id_keyword_tag;
                }

                /* Build the identifier and record its token number */
                MAKE_id_keyword_etc(tag, nm, dspec_none, NULL_nspace, crt_loc,
                                    id);
                COPY_ulong(id_no(id), (unsigned long)key);
        }
        COPY_id(hashid_cache(nm), NULL_id);
        if (do_keyword) {
                dump_declare(id, &crt_loc, 1);
        }

        /* Walk the chain of meanings until a keyword-like entry is found */
        for (;;) {
                IDENTIFIER pid = DEREF_id(ptr);
                switch (TAG_id(pid)) {
                case id_dummy_tag:
                case id_keyword_tag:
                case id_iso_keyword_tag:
                case id_reserved_tag:
                        /* Splice id in ahead of this entry */
                        COPY_id(id_alias(id), pid);
                        COPY_id(ptr, id);
                        return(id);
                default:
                        /* Step over any other kind of meaning */
                        ptr = id_alias(pid);
                        break;
                }
        }
        /* NOTREACHED */
}


/*
    INITIALISE KEYWORDS

    This routine initialises the hash table entries for the keywords.
*/

void
init_keywords(void)
{
        int key;

        /* Set up keyword entries */
        for (key = FIRST_KEYWORD; key <= LAST_KEYWORD; key++) {
                int ext = 0;
                string keyword = token_name(key);
                unsigned long h = hash(keyword);
                if (keyword[0] == char_less) {
                        ext = 1;
                }
                KEYWORD(key) = lookup_name(keyword, h, ext, key);
        }

        /* Bring the C keywords into scope */
        for (key = FIRST_C_KEYWORD; key <= LAST_C_KEYWORD; key++) {
                HASHID nm = KEYWORD(key);
                IGNORE make_keyword(nm, key, NULL_id);
        }

        /* Bring the C++ keywords into scope */
        for (key = FIRST_CPP_KEYWORD; key <= LAST_CPP_KEYWORD; key++) {
                HASHID nm = KEYWORD(key);
#if LANGUAGE_CPP
                IGNORE make_keyword(nm, key, NULL_id);
#else
                if (key != lex_wchar_Ht) {
                        IGNORE make_keyword(nm, lex_unknown, NULL_id);
                }
#endif
        }

        /* Bring the ISO alternative keywords into scope */
        for (key = FIRST_ISO_KEYWORD; key <= LAST_ISO_KEYWORD; key++) {
                HASHID nm = KEYWORD(key);
                if (allow_iso_keywords) {
                        IGNORE make_keyword(nm, key, NULL_id);
                } else {
                        IGNORE make_keyword(nm, lex_unknown, NULL_id);
                }
        }

        /* Find underlying dummy identifier for 'operator' */
        underlying_op = DEREF_id(hashid_id(KEYWORD(lex_operator)));
        underlying_op = underlying_id(underlying_op);
        return;
}


/*
    REPLACE A TRIGRAPH SEQUENCE

    This routine is called immediately after a question mark has been
    read.  If trigraphs are allowed and the next two characters complete
    a trigraph then they are consumed, the replacement is reported and
    the replacement character is returned.  Otherwise any characters
    read ahead are pushed back and '?' is returned.
*/

static int
adjust_trigraph(void)
{
        int second;
        int third;
        int repl;

        if (!allow_trigraphs) {
                return(char_question);
        }

        /* A trigraph needs a second question mark */
        second = next_char();
        if (second == char_end) {
                second = refill_char();
        }
        if (second != char_question) {
                unread_char(second);
                return(char_question);
        }

        /* The third character selects the replacement */
        third = next_char();
        if (third == char_end) {
                third = refill_char();
        }
        switch (third) {
        case char_close_round:
                /* '\?\?)' gives ']' */
                repl = char_close_square;
                break;
        case char_equal:
                /* '\?\?=' gives '#' */
                repl = char_hash;
                break;
        case char_exclaim:
                /* '\?\?!' gives '|' */
                repl = char_bar;
                break;
        case char_greater:
                /* '\?\?>' gives '}' */
                repl = char_close_brace;
                break;
        case char_less:
                /* '\?\?<' gives '{' */
                repl = char_open_brace;
                break;
        case char_minus:
                /* '\?\?-' gives '~' */
                repl = char_tilde;
                break;
        case char_open_round:
                /* '\?\?(' gives '[' */
                repl = char_open_square;
                break;
        case char_single_quote:
                /* '\?\?\'' gives '^' */
                repl = char_circum;
                break;
        case char_slash:
                /* '\?\?/' gives '\\' */
                repl = char_backslash;
                break;
        default:
                /* Not a trigraph: push back both look-ahead characters */
                unread_char(third);
                unread_char(char_question);
                return(char_question);
        }
        update_column();
        report(crt_loc, ERR_lex_trigraph_replace(third, repl));
        return(repl);
}


/*
    COMPLETE A DOS NEWLINE

    This routine is called after a carriage return has been read.  When
    DOS-like newlines are enabled and the next character is a newline
    then that newline is consumed and returned.  Otherwise the input is
    left unchanged and the carriage return is returned.
*/

static int
read_newline(void)
{
        int c;

        if (!allow_dos_newline) {
                return(char_return);
        }
        c = next_char();
        if (c == char_end) {
                c = refill_char();
        }
        if (c != char_newline) {
                /* A lone carriage return */
                unread_char(c);
                return(char_return);
        }
        return(c);
}


/*
    COMPLETE A DOS END OF FILE

    This routine is called after a terminate character (char_sub) has
    been read.  When DOS-like rules are enabled and the next character is
    the end of file then that character is consumed and returned.
    Otherwise the input is left unchanged and char_sub is returned.
*/

static int
read_eof(void)
{
        int c;

        if (!allow_dos_newline) {
                return(char_sub);
        }
        c = next_char();
        if (c == char_end) {
                c = refill_char();
        }
        if (c != char_eof) {
                /* The terminate character was not at the end of the file */
                unread_char(c);
                return(char_sub);
        }
        return(c);
}


/*
    READ ONE CHARACTER AFTER TRIGRAPH AND LINE SPLICING

    This routine returns the next input character once trigraphs have
    been replaced and any backslash-newline pairs have been removed; it
    corresponds to phases 1 and 2 of the phases of translation.  Each
    removed backslash-newline advances the current line and resets the
    column.
*/

static int
read_char(void)
{
        for (;;) {
                int after;
                int ch = next_char();
                if (ch == char_end) {
                        ch = refill_char();
                }
                if (ch == char_question) {
                        ch = adjust_trigraph();
                }
                if (ch != char_backslash) {
                        /* An ordinary character */
                        return(ch);
                }

                /* Examine what follows the backslash */
                after = next_char();
                if (after == char_end) {
                        after = refill_char();
                }
                if (after == char_return) {
                        after = read_newline();
                }
                if (after != char_newline) {
                        /* A plain backslash: restore the look-ahead */
                        unread_char(after);
                        return(char_backslash);
                }

                /* Escaped newline: account for the new line and go round */
                crt_loc.line++;
                crt_loc.column = 0;
                input_crt = input_posn;
        }
        /* NOTREACHED */
}


/*
    CHARACTER LOOK-UP TABLE

    This look-up table gives the various character types.  Note that the
    default look-up table is for ASCII, for other codesets the table
    needs to be rewritten.  The only really interesting points in the
    table itself are that newline has not been classified as a white-space
    and that character char_eof (-1) represents end of file.
*/

/* Single-bit masks, one per character property */
#define SPACE_M                 0x01
#define ALPHA_M                 0x02
#define DIGIT_M                 0x04
#define ALNUM_M                 0x08
#define PPDIG_M                 0x10
#define SYMBL_M                 0x20
#define NLINE_M                 0x40
#define LEGAL_M                 0x80

/* Character classes stored in the table, built from the masks above.
   Every class except ILLEG includes LEGAL_M. */
#define ILLEG                   0x00
#define LEGAL                   LEGAL_M
#define SPACE                   (SPACE_M | LEGAL_M)
#define ALPHA                   (ALPHA_M | ALNUM_M | PPDIG_M | LEGAL_M)
#define DIGIT                   (DIGIT_M | ALNUM_M | PPDIG_M | LEGAL_M)
#define SYMBL                   (SYMBL_M | LEGAL_M)
#define POINT                   (PPDIG_M | SYMBL_M | LEGAL_M)
#define NLINE                   (NLINE_M | LEGAL_M)

/* main_characters skips the leading EOF slot of the table, so that
   index -1 refers to that slot and index 0 to the first char.h entry */
#define main_characters         (characters + 1)
#define lookup_char(C)          ((int)main_characters[C])
/* Property tests on a looked-up class value T; each gives the raw masked
   bit (zero or the mask value), not a normalised 0/1 */
#define is_white(T)             ((T) & SPACE_M)
#define is_alpha(T)             ((T) & ALPHA_M)
#define is_digit(T)             ((T) & DIGIT_M)
#define is_alphanum(T)          ((T) & ALNUM_M)
#define is_ppdigit(T)           ((T) & PPDIG_M)
#define is_symbol(T)            ((T) & SYMBL_M)
#define is_newline(T)           ((T) & NLINE_M)
#define is_legal(T)             ((T) & LEGAL_M)

/* The table proper: one EOF slot, then the first CHAR_DATA argument of
   each entry in char.h, then a trailing illegal dummy slot */
static unsigned char characters[NO_CHAR + 2] = {
        LEGAL,                  /* EOF */
#define CHAR_DATA(A, B, C, D)   (A),
#include "char.h"
#undef CHAR_DATA
        ILLEG                   /* dummy */
};

/* Table from which set_char_lookup reads the class being copied.  It
   initially refers to the live table itself.
   NOTE(review): confirm whether it is re-pointed at a pristine copy
   elsewhere in the file. */
static unsigned char *copy_characters = main_characters;


/*
    COPY A CHARACTER CLASS

    This routine gives character a the look-up class which
    copy_characters holds for character b.  Nothing is done unless both
    characters lie in the range 0 to NO_CHAR - 1.  Carriage return is
    special: giving it the class of newline turns on the DOS-like rules
    for newline and end of file without altering the table, while giving
    it its own class turns those rules off again.
*/

void
set_char_lookup(int a, int b)
{
        unsigned char cls;

        if (a < 0 || a >= NO_CHAR || b < 0 || b >= NO_CHAR) {
                /* Out of range characters are ignored */
                return;
        }
        cls = copy_characters[b];
        if (a == char_return) {
                if (b == char_newline) {
                        /* Enable DOS-like newlines; table is untouched */
                        allow_dos_newline = 1;
                        return;
                } else if (b == char_return) {
                        /* Disable DOS-like newlines */
                        allow_dos_newline = 0;
                }
        }
        main_characters[a] = cls;
}


/*
    COPY A CHARACTER CLASS TO SEVERAL CHARACTERS

    The look-up class of the character given by the expression b is
    applied, using set_char_lookup, to the characters given by a.  If a
    is a string literal then every element of the string is set (for a
    multi-character string only those elements with a value less than
    NO_CHAR); otherwise a is treated as a character literal and is set
    provided its value is not char_illegal.
*/

void
set_character(EXP a, EXP b)
{
        STRING s;
        unsigned long n;
        string t;
        unsigned kind;
        int c = get_char_value(b);

        if (!IS_exp_string_lit(a)) {
                /* A single character */
                int d = get_char_value(a);
                if (d != char_illegal) {
                        set_char_lookup(d, c);
                }
                return;
        }

        /* Decompose the string literal */
        s = DEREF_str(exp_string_lit_str(a));
        n = DEREF_ulong(str_simple_len(s));
        t = DEREF_string(str_simple_text(s));
        kind = DEREF_unsigned(str_simple_kind(s));

        if (kind & STRING_MULTI) {
                /* Each element occupies MULTI_WIDTH units */
                for (; n != 0; n--) {
                        int ch = CHAR_SIMPLE;
                        unsigned long d = get_multi_char(t, &ch);
                        if (d < (unsigned long)NO_CHAR) {
                                set_char_lookup((int)d, c);
                        }
                        t += MULTI_WIDTH;
                }
        } else {
                /* Each element is a single unit */
                for (; n != 0; n--) {
                        set_char_lookup((int)*t, c);
                        t++;
                }
        }
}


/*
    TEST FOR A WHITE SPACE CHARACTER

    This routine returns 1 if the character a is classified as either a
    white space or a newline in the look-up table, and 0 otherwise.
    Values of NO_CHAR or more are never white space.
*/

int
is_white_char(unsigned long a)
{
        if (a < NO_CHAR) {
                int t = lookup_char(a);
                if (is_white(t)) {
                        return(1);
                }
                /* Newline is not in the white space class, so test it too */
                return(is_newline(t) ? 1 : 0);
        }
        return(0);
}


/*
    TEST FOR AN ALPHABETIC CHARACTER

    This routine returns a non-zero value (the masked class bit) if the
    character a is classified as alphabetic in the look-up table, and 0
    otherwise.  Values of NO_CHAR or more are never alphabetic.
*/

int
is_alpha_char(unsigned long a)
{
        if (a < NO_CHAR) {
                int t = lookup_char(a);
                return(is_alpha(t));
        }
        return(0);
}


/*
    TEST FOR A LEGAL CHARACTER

    This routine returns a non-zero value (the masked class bit) if the
    character a is classified as legal in the look-up table, and 0
    otherwise.  Values of NO_CHAR or more are never legal.
*/

int
is_legal_char(unsigned long a)
{
        if (a < NO_CHAR) {
                int t = lookup_char(a);
                return(is_legal(t));
        }
        return(0);
}


/*
    MATCH THE NEXT CHARACTER

    This routine reads the next character and compares it with a (which
    must not be newline).  On a match the character is consumed and 1 is
    returned.  Otherwise the character is pushed back, legal is set to
    indicate whether it is a legal character, and 0 is returned.  Note
    that legal is not written on a match.
*/

int
peek_char(int a, int *legal)
{
        int c = read_char();
        ASSERT(a != char_newline);
        if (c != a) {
                /* Mismatch: record legality and restore the character */
                *legal = is_legal_char((unsigned long)c);
                unread_char(c);
                return(0);
        }
        return(1);
}


/*
    TOKEN BUFFER

    This buffer is used by read_token to hold the values of identifiers,
    numbers and strings.
*/

BUFFER token_buff = NULL_buff;


/*
    TOKEN IDENTIFICATION MACROS

    These macros are used to identify the start or end of certain tokens
    such as comments and strings.
*/

#define START_COMMENT(A)        ((A) == char_asterix)
#define END_COMMENT(A, B)       ((A) == char_asterix && (B) == char_slash)
#define START_CPP_COMMENT(A)    ((A) == char_slash && allow_cpp_comments)
#define END_CPP_COMMENT(A)      ((A) == char_newline)
#define START_STRING(A)         ((A) == char_quote || (A) == char_single_quote)
#define END_STRING(A, Q)        ((A) == (Q))


/*
    END OF FILE FLAG

    Each source file should end in a newline character, which is not
    preceded by a backspace.  This flag is used to indicate whether the
    end of the present file has the correct form.
*/

static int good_eof = 0;


/*
    SCAN PAST A STRING OR CHARACTER LITERAL

    This routine is entered after the opening quote, q, of a string or
    character literal has been read and discards characters up to the
    matching unescaped quote.  A backslash always escapes the character
    which follows it.  Newlines are only tolerated (and counted, to be
    reported when the literal ends) if allow_newline_strings is set, the
    literal is not a character literal, and a preprocessing directive is
    not being read; otherwise the newline is pushed back and an error is
    reported.  An empty character literal is also reported.  The routine
    returns lex_string_Hlit if the closing quote is found and lex_eof if
    the literal is cut short by a newline or the end of the file.
*/

static int
skip_string(int q)
{
        LOCATION loc;
        unsigned nl = 0;
        int escaped = 0;
        int have_char = 0;
        int allow_nl = 0;

        /* Character literals and directives never span lines */
        if (q != char_single_quote && in_preproc_dir != 1) {
                allow_nl = allow_newline_strings;
        }
        update_column();
        loc = crt_loc;

        for (;;) {
                int c = read_char();

                if (!escaped && END_STRING(c, q)) {
                        /* Unescaped closing quote */
                        if (q == char_single_quote && !have_char) {
                                update_column();
                                report(crt_loc, ERR_lex_ccon_empty());
                        }
                        if (nl) {
                                report(loc, ERR_lex_string_nl(nl, nl));
                        }
                        return(lex_string_Hlit);
                }

                if (c == char_eof) {
                        /* File ended inside the literal; any newlines
                           counted so far are not reported */
                        report(loc, ERR_lex_phases_str_eof());
                        good_eof = 1;
                        return(lex_eof);
                }

                if (c == char_newline) {
                        if (!allow_nl) {
                                /* No newline has been counted on this
                                   path, so there is nothing else to
                                   report */
                                unread_char(c);
                                update_column();
                                report(crt_loc, ERR_lex_string_pp_nl());
                                return(lex_eof);
                        }
                        /* Count the newline and carry on */
                        crt_loc.line++;
                        crt_loc.column = 0;
                        input_crt = input_posn;
                        nl++;
                }

                /* A backslash escapes exactly one following character */
                escaped = (!escaped && c == char_backslash);
                if (!escaped) {
                        have_char = 1;
                }
        }
        /* NOTREACHED */
}


/*
    READ THE BODY OF A STRING

    This routine reads the body of a string or character literal or of a
    header name.  It is entered after the initial quote has been read.
    The corresponding close quote is passed in as q.  The esc argument
    indicates whether escape sequences are allowed (they are not in
    header names for example).  The string itself is built up in
    token_buff.  The routine returns lex_string_Hlit if the string
    terminates correctly and lex_eof otherwise.  It also sets
    token_buff.posn to point to the end of the string.
*/

int
read_string(int q, int esc)
{
        int c;
        int e = q;
        LOCATION loc;
        /* Buffer position just after the opening quote; only recorded (and
           only used) for character literals */
        long posn = -1;
        int escaped = 0;
        /* Number of newlines accepted inside the string */
        unsigned nl = 0;
        /* Set once at least one complete character has been read */
        int have_char = 0;
        string s = token_buff.start;
        string se = token_buff.end;
        int allow_nl = allow_newline_strings;
        update_column();
        if (e == char_single_quote) {
                /* Character literals never span lines; remember where the
                   body starts for error recovery */
                posn = tell_buffer(crt_buff_no);
                allow_nl = 0;
        } else if (in_preproc_dir == 1) {
                /* Nor do strings in preprocessing directives */
                allow_nl = 0;
        }
        /* Start of the literal, used for errors about the whole string */
        loc = crt_loc;

        /* Scan the string */
        for (;;) {
                c = read_char();
                if (END_STRING(c, e) && !escaped) {
                        /* Unescaped closing quote: the string is complete */
                        if (e == char_single_quote && !have_char) {
                                update_column();
                                report(crt_loc, ERR_lex_ccon_empty());
                        }
                        if (nl) {
                                report(loc, ERR_lex_string_nl(nl, nl));
                        }
                        /* Terminate the text; the quote is not stored */
                        token_buff.posn = s;
                        *s = 0;
                        return(lex_string_Hlit);
                }
                if (c == char_newline) {
                        if (allow_nl) {
                                /* Report newlines but continue */
                                crt_loc.line++;
                                crt_loc.column = 0;
                                input_crt = input_posn;
                                nl++;
                        } else {
                                /* Leave the newline to be read again and
                                   abandon the string */
                                unread_char(c);
                                update_column();
                                if (e == char_greater) {
                                        /* Header name */
                                        report(crt_loc,
                                               ERR_cpp_include_incompl());
                                } else {
                                        report(crt_loc, ERR_lex_string_pp_nl());
                                }
                                break;
                        }
                } else if (c == char_eof) {
                        /* File ended inside the string; clearing nl
                           suppresses the newline report below */
                        report(loc, ERR_lex_phases_str_eof());
                        good_eof = 1;
                        nl = 0;
                        break;
                }
                /* Store the character, growing the buffer when it fills */
                *s = (character)c;
                if (++s == se) {
                        s = extend_buffer(&token_buff, s);
                        se = token_buff.end;
                }
                if (escaped) {
                        escaped = 0;
                } else {
                        if (c == char_backslash) {
                                /* A backslash only escapes the following
                                   character if esc is non-zero */
                                escaped = esc;
                        }
                }
                if (!escaped)have_char = 1;
        }
        /* NOTE(review): nl looks to be always zero here - the newline break
           above only happens when allow_nl is zero, and the end of file
           break clears nl - so only the recovery branch appears reachable */
        if (nl) {
                /* Report newlines in string */
                report(loc, ERR_lex_string_nl(nl, nl));
        } else {
                /* Error recovery */
                if (e == char_single_quote && have_char) {
                        /* Go back to the start of the character literal body
                           and keep just its first character (with the
                           character after it if that first character is an
                           escaping backslash).
                           NOTE(review): the meaning of the last seek_buffer
                           argument is not visible here. */
                        seek_buffer(crt_buff_no, posn, 1);
                        crt_loc = loc;
                        s = token_buff.start;
                        c = read_char();
                        *(s++) = (character)c;
                        if (c == char_backslash && esc) {
                                c = read_char();
                                *(s++) = (character)c;
                        }
                }
        }
        /* Terminate whatever text was collected */
        token_buff.posn = s;
        *s = 0;
        return(lex_eof);
}


/*
    SKIP A C STYLE COMMENT

    This routine skips a C style comment, returning lex_ignore_token if
    the comment is terminated correctly and lex_eof otherwise.  It is
    entered after the first two characters comprising the comment start
    have been read.  If keep is true then the comment text is built up
    in token_buff, otherwise it is discarded.
*/

static int
skip_comment(int keep)
{
        /* c is the current character and lastc the one before it; together
           they detect the closing and any nested opening comment markers */
        int c = 0;
        int lastc;
        /* Write position and end of token_buff, or null when not keeping */
        string s, se;
        /* Start of the comment, used if the file ends inside it */
        LOCATION loc;
        update_column();
        loc = crt_loc;
        if (keep) {
                s = token_buff.start;
                se = token_buff.end;
        } else {
                s = NULL;
                se = NULL;
        }
        do {
                lastc = c;
                /* Re-entry point after an escaped newline, so that lastc is
                   left unchanged and the escaped newline is not stored */
read_label:
                /* Inlined version of read_char */
                c = next_char();
                if (c == char_end) {
                        c = refill_char();
                }
                if (c == char_question) {
                        c = adjust_trigraph();
                }
                if (c == char_backslash) {
                        c = next_char();
                        if (c == char_end) {
                                c = refill_char();
                        }
                        if (c == char_return) {
                                c = read_newline();
                        }
                        if (c == char_newline) {
                                /* Allow for escaped newlines */
                                crt_loc.line++;
                                crt_loc.column = 0;
                                input_crt = input_posn;
                                goto read_label;
                        }
                        /* A plain backslash: restore the look-ahead */
                        unread_char(c);
                        c = char_backslash;
                } else if (c == char_newline) {
                        /* New line characters */
                        crt_loc.line++;
                        crt_loc.column = 0;
                        input_crt = input_posn;
                        crt_line_changed = 1;
                        crt_spaces = 0;
                } else if (c == char_eof) {
                        /* End of file characters */
                        report(loc, ERR_lex_phases_comm_eof());
                        good_eof = 1;
                        if (s) {
                                /* Terminate the text collected so far */
                                token_buff.posn = s;
                                *s = 0;
                        }
                        return(lex_eof);
                } else if (c == char_asterix && lastc == char_slash) {
                        /* Nested comments */
                        update_column();
                        report(crt_loc, ERR_lex_comment_nest());
                }
                if (s) {
                        /* Store the character, growing the buffer when it
                           fills */
                        *s = (character)c;
                        if (++s == se) {
                                s = extend_buffer(&token_buff, s);
                                se = token_buff.end;
                        }
                }
        } while (!END_COMMENT(lastc, c));
        if (s) {
                /* Drop the two characters of the closing marker, which were
                   stored along with the comment text */
                s -= 2;
                token_buff.posn = s;
                *s = 0;
        }
        /* NOTE(review): presumably counts the comment as one space */
        crt_spaces++;
        return(lex_ignore_token);
}


/*
    SKIP A C++ STYLE COMMENT

    This routine skips a C++ style comment, returning lex_ignore_token
    if the comment terminates correctly and lex_eof otherwise.  It is
    entered after the first two characters comprising the comment start
    have been read.  The next token read after the comment will be the
    terminating newline.  If keep is true then the comment text is built
    up in token_buff, otherwise it is discarded.
*/

static int
skip_cpp_comment(int keep)
{
        int c;
        string s, se;
        if (keep) {
                s = token_buff.start;
                se = token_buff.end;
        } else {
                s = NULL;
                se = NULL;
        }
        do {
read_label:
                /* Inlined version of read_char */
                c = next_char();
                if (c == char_end) {
                        c = refill_char();
                }
                if (c == char_question) {
                        c = adjust_trigraph();
                }
                if (c == char_backslash) {
                        c = next_char();
                        if (c == char_end) {
                                c = refill_char();
                        }
                        if (c == char_return) {
                                c = read_newline();
                        }
                        if (c == char_newline) {
                                /* Allow for escaped newlines */
                                crt_loc.line++;
                                crt_loc.column = 0;
                                input_crt = input_posn;
                                goto read_label;
                        }
                        unread_char(c);
                        c = char_backslash;
                } else if (c == char_eof) {
                        /* End of file characters */
                        update_column();
                        report(crt_loc, ERR_lex_phases_comm_eof());
                        good_eof = 1;
                        if (s) {
                                token_buff.posn = s;
                                *s = 0;
                        }
                        return(lex_eof);
                }
                if (s) {
                        *s = (character)c;
                        if (++s == se) {
                                s = extend_buffer(&token_buff, s);
                                se = token_buff.end;
                        }
                }
        } while (!END_CPP_COMMENT(c));
        unread_char(c);
        if (s) {
                s -= 1;
                token_buff.posn = s;
                *s = 0;
        }
        crt_line_changed = 1;
        crt_spaces = 0;
        return(lex_ignore_token);
}


/*
    SKIP WHITE-SPACE CHARACTERS

    This routine reads past white-space, comments and escaped newlines.
    A newline is only skipped if nl is true; otherwise it ends the scan.
    The value returned describes what was skipped as a combination of:

        WHITE_SPACE             white-space characters or comments;
        WHITE_NEWLINE           a newline character;
        WHITE_ESC_NEWLINE       an escaped newline;

    where each newline discards whatever had been accumulated and leaves
    just WHITE_NEWLINE.  Trigraphs and escaped newlines are dealt with by
    hand.  The character which stops the scan is pushed back onto the
    input, except when a comment runs into the end of the file, or when
    nl is false and a C++ style comment has just been skipped, in which
    cases the routine returns at once.  Any non-empty run of white-space
    other than newlines is thus treated as if it were a single space (the
    C/C++ specification says that this is implementation-defined).
*/

unsigned long
skip_white(int nl)
{
        int ch, t;
        unsigned long white = 0;

        for (;;) {
                /* Fetch the next character */
                ch = next_char();
                if (ch == char_end) {
                        ch = refill_char();
                }
                if (ch == char_return) {
                        ch = read_newline();
                }
                if (ch == char_sub) {
                        ch = read_eof();
                }

                if (ch == char_newline) {
                        /* Newlines only count as white-space on request */
                        if (!nl) {
                                break;
                        }
                        white = WHITE_NEWLINE;
                        crt_loc.line++;
                        crt_loc.column = 0;
                        input_crt = input_posn;
                        crt_line_changed = 1;
                        crt_spaces = 0;
                        continue;
                }
                if (ch == char_space) {
                        /* Plain space */
                        white |= WHITE_SPACE;
                        crt_spaces++;
                        continue;
                }
                if (ch == char_tab) {
                        /* Tab: advance to the next multiple of tab_width */
                        unsigned long tab = tab_width;
                        white |= WHITE_SPACE;
                        crt_spaces = tab *(crt_spaces / tab + 1);
                        continue;
                }
                if (ch == char_eof) {
                        /* The end of file is only marked as good if
                         * nothing has followed the last newline */
                        if (white == WHITE_NEWLINE) {
                                good_eof = 1;
                        }
                        break;
                }

#if FS_EXTENDED_CHAR
                if (IS_EXTENDED(ch)) {
                        break;
                }
#endif
                t = lookup_char(ch);
                if (is_white(t)) {
                        /* Any other white-space character */
                        white |= WHITE_SPACE;
                        crt_spaces++;
                        continue;
                }

                /* A trigraph may stand for a slash or a backslash */
                if (ch == char_question) {
                        ch = adjust_trigraph();
                }

                if (ch == char_slash) {
                        /* Possible start of a comment */
                        int nxt = read_char();
                        if (START_COMMENT(nxt)) {
                                white |= WHITE_SPACE;
                                if (skip_comment(0) == lex_eof) {
                                        return white;
                                }
                        } else if (START_CPP_COMMENT(nxt)) {
                                white |= WHITE_SPACE;
                                if (skip_cpp_comment(0) == lex_eof) {
                                        return white;
                                }
                                if (!nl) {
                                        /* Stop at the comment's newline */
                                        return white;
                                }
                        } else {
                                /* Just a slash */
                                unread_char(nxt);
                                break;
                        }
                        continue;
                }

                if (ch == char_backslash) {
                        /* Possible escaped newline */
                        int nxt = next_char();
                        if (nxt == char_end) {
                                nxt = refill_char();
                        }
                        if (nxt == char_return) {
                                nxt = read_newline();
                        }
                        if (nxt != char_newline) {
                                /* Just a backslash */
                                unread_char(nxt);
                                break;
                        }
                        crt_loc.line++;
                        crt_loc.column = 0;
                        input_crt = input_posn;
                        white |= WHITE_ESC_NEWLINE;
                        continue;
                }

                /* Anything else is not white-space */
                break;
        }

        /* Give back the character which stopped the scan */
        unread_char(ch);
        return white;
}


/*
    PATCH UP WHITE-SPACE CHARACTERS

    Calling skip_white ( 1 ) can mess up the parser as regards spotting
    preprocessing directives and valid end of file markers.  This routine
    may be called with the return value of skip_white, sp, to push a
    stand-in for the skipped white-space back onto the input in order to
    get the parser back into the right state.  It does nothing unless sp
    includes WHITE_NEWLINE.  Otherwise the text which will be re-read is
    a newline followed by:

        up to crt_loc.column spaces     if sp includes WHITE_SPACE;
        an escaped newline              if sp includes WHITE_ESC_NEWLINE
                                        but not WHITE_SPACE;
        nothing                         otherwise.

    crt_loc.line is decremented once for each newline pushed back, and
    crt_loc.column and crt_spaces are reset to zero.

    Characters passed to unread_char are read back most recent first
    (read_token relies on this when it restores '..'), so each sequence
    has to be pushed in the reverse of the order in which it is to be
    read.
*/

void
patch_white(unsigned long sp)
{
        if (sp & WHITE_NEWLINE) {
                if (sp & WHITE_SPACE) {
                        /* Patch in a space after a newline */
                        unsigned long n;
                        update_column();
                        n = crt_loc.column;
                        while (n) {
                                unread_char(char_space);
                                if (input_posn <= input_start) {
                                        /* Presumably the start of the input
                                         * buffer has been reached - stop */
                                        break;
                                }
                                n--;
                        }
                } else if (sp & WHITE_ESC_NEWLINE) {
                        /* Patch in an escaped newline after a newline.
                         * The newline is pushed first so that, on
                         * re-reading, the backslash comes before it. */
                        unread_char(char_newline);
                        unread_char(char_backslash);
                        crt_loc.line--;
                }
                /* Patch in a newline (pushed last, so it is read first) */
                unread_char(char_newline);
                crt_loc.line--;
                crt_loc.column = 0;
                crt_spaces = 0;
        }
        return;
}


/*
    SKIP TO END OF LINE

    This routine skips to the end of the current line, clearing
    in_preproc_dir as it starts.  It returns 0 if only white-space
    characters (including comments, which are jumped over using
    skip_white) are encountered, and 1 if anything else is found.  A
    backslash which does not introduce an escaped newline counts as a
    non-white-space character.  String literals are stepped over using
    skip_string; if that reaches the end of the file the routine returns
    immediately.  If the end of the file is otherwise reached before a
    newline then an error is reported and good_eof is set.
*/

int
skip_to_end(void)
{
        int c;
        int res = 0;
        in_preproc_dir = 0;
        for (;;) {
                IGNORE skip_white(0);
read_label:
                /* Inlined version of read_char */
                c = next_char();
                if (c == char_end) {
                        c = refill_char();
                }
                if (c == char_question) {
                        c = adjust_trigraph();
                }
                if (c == char_backslash) {
                        c = next_char();
                        if (c == char_end) {
                                c = refill_char();
                        }
                        if (c == char_return) {
                                c = read_newline();
                        }
                        if (c == char_newline) {
                                /* Allow for escaped newlines; reading
                                 * resumes without another skip_white */
                                crt_loc.line++;
                                crt_loc.column = 0;
                                input_crt = input_posn;
                                goto read_label;
                        }
                        /* A stray backslash is not white-space, so the
                         * line can no longer be reported as blank */
                        unread_char(c);
                        res = 1;
                } else if (c == char_newline) {
                        /* New line characters */
                        crt_loc.line++;
                        crt_loc.column = 0;
                        input_crt = input_posn;
                        crt_line_changed = 1;
                        crt_spaces = 0;
                        return(res);
                } else if (START_STRING(c)) {
                        /* String literals */
                        res = 1;
                        c = skip_string(c);
                        if (c == lex_eof) {
                                return(res);
                        }
                } else if (c == char_eof) {
                        /* End of file characters */
                        break;
                } else {
                        /* Any other character is significant */
                        res = 1;
                }
        }
        /* The file ended before the line did */
        update_column();
        report(crt_loc, ERR_lex_phases_eof());
        good_eof = 1;
        return(res);
}


/*
    READ A UNICODE CHARACTER

    This routine reads a unicode character of the form '\uxxxx' or
    '\Uxxxxxxxx'.  The backslash and the character after it, c, have
    already been read by the caller.  If c is not 'u' or 'U', or if
    unicode characters are not allowed, then c is pushed back, pc is set
    to CHAR_NONE and zero is returned.  Otherwise up to four or eight
    alphanumeric characters are gathered and passed to eval_unicode,
    whose result is returned and which may adjust the character type in
    pc; any error it gives is reported at the current location.
*/

static unsigned long
read_unicode(int c, int *pc)
{
        character digits[10];
        string p = digits;
        ERROR err = NULL_err;
        unsigned long code;
        unsigned count = 0;
        unsigned want;

        /* Work out how many digits are expected */
        if (c == char_u && allow_unicodes) {
                /* Read '\uxxxx' */
                want = 4;
                *pc = CHAR_UNI4;
        } else if (c == char_U && allow_unicodes) {
                /* Read '\Uxxxxxxxx' */
                want = 8;
                *pc = CHAR_UNI8;
        } else {
                /* Not a unicode character after all */
                unread_char(c);
                *pc = CHAR_NONE;
                return 0;
        }

        /* Gather the digits, stopping early at anything unsuitable */
        while (count < want) {
                int t;
                int d = read_char();
                if (d == char_eof) {
                        break;
                }
#if FS_EXTENDED_CHAR
                if (IS_EXTENDED(d)) {
                        unread_char(d);
                        break;
                }
#endif
                t = lookup_char(d);
                if (!is_alphanum(t)) {
                        unread_char(d);
                        break;
                }
                digits[count] = (character)d;
                count++;
        }
        digits[count] = 0;

        /* Evaluate the digits, reporting any error found */
        code = eval_unicode(c, want, pc, &p, &err);
        if (!IS_NULL_err(err)) {
                update_column();
                report(crt_loc, err);
        }
        return code;
}


/*
    READ AN EXTENDED IDENTIFIER

    This routine reads the remainder of an extended identifier (one which
    includes a unicode character).  On entry the simple characters read
    so far are in the token buffer and the first unicode character is
    given by u, with character type ch.  Each unicode character is
    checked using unicode_alpha, with an error being reported if it is
    not suitable, and is then printed into the buffer.  Alphanumeric
    characters and further unicode characters are appended until
    something else is found, which is pushed back onto the input.  The
    result is the hash table entry for the completed name.
*/

static HASHID
read_extended_id(unsigned long u, int ch)
{
        BUFFER *bf = &token_buff;
        unsigned long h;
        string s;
        int c, t;

        for (;;) {
                int lookahead;

                /* Add the pending unicode character to the name */
                if (!unicode_alpha(u)) {
                        /* Report illegal identifiers */
                        update_column();
                        report(crt_loc, ERR_lex_name_extendid(u));
                }
                print_char(u, ch, 0, bf);

                /* Add any simple characters which follow it */
                for (;;) {
                        c = read_char();
#if FS_EXTENDED_CHAR
                        if (IS_EXTENDED(c)) {
                                break;
                        }
#endif
                        t = lookup_char(c);
                        if (!is_alphanum(t)) {
                                break;
                        }
                        bfputc(bf, c);
                }

                /* Only a backslash can continue the identifier */
                ch = CHAR_NONE;
                if (c != char_backslash) {
                        break;
                }
                lookahead = read_char();
                u = read_unicode(lookahead, &ch);
                if (ch == CHAR_NONE) {
                        /* The backslash did not start a unicode character */
                        break;
                }
        }

        /* Give back the terminating character and look up the name */
        unread_char(c);
        bfputc(bf, 0);
        s = bf->start;
        h = hash(s);
        return lookup_name(s, h, 1, lex_unknown);
}


/*
    HASH VALUE FOR IDENTIFIERS

    The hash value for identifiers is built up as the identifier is read.
    It is then stored in this variable.  The algorithm for calculating
    the hash value needs to be kept in step with the routine hash (it
    is checked by an assertion in lookup_name, so any errors should be
    caught quickly if in debug mode).
*/

HASHID token_hashid = NULL_hashid;


/*
    MAIN PASS ANALYSER

    This routine reads the next preprocessing token from the input file.
    It is designed for speed rather than elegance, hence the rather
    indiscriminate use of labels.  Trigraphs and escaped newlines
    involving the first character are processed by hand.  This routine
    corresponds to phase 3 of the phases of translation.  The position
    within the line is tracked by column - this is zero at the start of
    a line, positive if only white space has been read and negative
    otherwise.  preproc keeps track of the last preprocessing directive.
*/

int
read_token(void)
{
        int c, t;
        int column = -1;
        int preproc = lex_ignore_token;

        /* Read the next character */
start_label:
        c = next_char();
        if (c == char_end)c = refill_char();
restart_label:
#if FS_EXTENDED_CHAR
        if (IS_EXTENDED(c)) {
                goto unknown_label;
        }
#endif
        t = lookup_char(c);
        if (is_white(t)) {
                crt_spaces++;
                goto start_label;
        }
process_label:
        /* Process the next character */

        /* Check symbols and punctuation */
        if (is_symbol(t)) {
                switch (c) {

                case char_question: {
                        /* Deal with '?' and trigraphs */
                        c = adjust_trigraph();
                        if (c == char_question) return(lex_question);
                        goto restart_label;
                }

                case char_backslash: {
                        /* Deal with escaped newlines */
                        unsigned long u;
                        int ch = CHAR_NONE;
                        int nextc = next_char();
                        if (nextc == char_end)nextc = refill_char();
                        if (nextc == char_return)nextc = read_newline();
                        if (nextc == char_newline) {
                                crt_loc.line++;
                                crt_loc.column = 0;
                                input_crt = input_posn;
                                if (column == 0)column = 1;
                                goto start_label;
                        }

                        /* Check for unicode characters */
                        u = read_unicode(nextc, &ch);
                        if (ch != CHAR_NONE) {
                                token_buff.posn = token_buff.start;
                                token_hashid = read_extended_id(u, ch);
                                return(lex_identifier);
                        }
                        return(lex_backslash);
                }

                case char_hash:
                        /* Deal with '#' and '##' */
                        c = read_char();
                        if (c == char_hash) {
                                return(lex_hash_Hhash_H1);
                        }
                        unread_char(c);

                        /* Return with '#' if not at start of line */
                        if (column < 0 || no_preproc_dir) {
                                return(lex_hash_H1);
                        }

                        /* Deal with preprocessing directives */
preproc_label:  {
                        unsigned long sp = skip_white(0);
                        update_column();
                        if (column) {
                                report(crt_loc, ERR_cpp_indent());
                        }
                        if (sp & (WHITE_SPACE | WHITE_ESC_NEWLINE)) {
                                report(preproc_loc, ERR_cpp_indent_dir());
                        }
                        preproc = read_preproc_dir(1, preproc);
                        if (preproc < 0) {
                                goto start_line_label;
                        }
                        unread_char(char_newline);
                        crt_loc.line--;
                        crt_loc.column = 0;
                        return(preproc);
                }

                case char_percent:
                        /* Deal with '%', '%=', '%>', '%:' and '%:%:' */
                        c = read_char();
                        if (c == char_equal) {
                                return(lex_rem_Heq);
                        }
                        if (c == char_greater && allow_digraphs) {
                                return(lex_close_Hbrace_H2);
                        }
                        if (c == char_colon && allow_digraphs) {
                                /* Check for '%:' and '%:%:' */
                                c = read_char();
                                if (c == char_percent) {
                                        int nextc = read_char();
                                        if (nextc == char_colon) {
                                                return(lex_hash_Hhash_H2);
                                        }
                                        unread_char(nextc);
                                }
                                unread_char(c);

                                /* Return with '%:' if not at start of line */
                                if (column < 0 || no_preproc_dir) {
                                        return(lex_hash_H2);
                                }

                                /* Otherwise this is a preprocessing
                                 * directive */
                                IGNORE get_digraph(lex_hash_H2);
                                goto preproc_label;
                        }
                        unread_char(c);
                        return(lex_rem);

                case char_quote:
                        /* Deal with string literals */
                        IGNORE read_string(c, 1);
                        return(lex_string_Hlit);

                case char_single_quote:
                        /* Deal with character literals */
                        IGNORE read_string(c, 1);
                        return(lex_char_Hlit);

                case char_exclaim:
                        /* Deal with '!' and '!=' */
                        c = read_char();
                        if (c == char_equal) {
                                return(lex_not_Heq_H1);
                        }
                        unread_char(c);
                        return(lex_not_H1);

                case char_ampersand:
                        /* Deal with '&', '&&' and '&=' */
                        c = read_char();
                        if (c == char_ampersand) {
                                return(lex_logical_Hand_H1);
                        }
                        if (c == char_equal) {
                                return(lex_and_Heq_H1);
                        }
                        unread_char(c);
                        return(lex_and_H1);

                case char_asterix:
                        /* Deal with '*' and '*=' */
                        c = read_char();
                        if (c == char_equal) {
                                return(lex_star_Heq);
                        }
                        unread_char(c);
                        return(lex_star);

                case char_plus:
                        /* Deal with '+', '++' and '+=' */
                        c = read_char();
                        if (c == char_plus) {
                                return(lex_plus_Hplus);
                        }
                        if (c == char_equal) {
                                return(lex_plus_Heq);
                        }
                        if (c == char_question && allow_extra_symbols) {
                                return(lex_abs);
                        }
                        unread_char(c);
                        return(lex_plus);

                case char_minus:
                        /* Deal with '-', '--', '-=', '->' and '->*' */
                        c = read_char();
                        if (c == char_minus) {
                                return(lex_minus_Hminus);
                        }
                        if (c == char_equal) {
                                return(lex_minus_Heq);
                        }
                        if (c == char_greater) {
#if LANGUAGE_CPP
                                /* '->*' is only allowed in C++ */
                                c = read_char();
                                if (c == char_asterix) {
                                        return(lex_arrow_Hstar);
                                }
                                unread_char(c);
#endif
                                return(lex_arrow);
                        }
                        unread_char(c);
                        return(lex_minus);

                case char_dot:
                        /* Deal with '.', '...', '.*' and numbers */
                        c = read_char();
                        if (c == char_dot) {
                                c = read_char();
                                if (c == char_dot) {
                                        return(lex_ellipsis);
                                }
                                unread_char(c);
                                unread_char(char_dot);
                                return(lex_dot);
                        }
#if LANGUAGE_CPP
                        /* '.*' is only allowed in C++ */
                        if (c == char_asterix) {
                                return(lex_dot_Hstar);
                        }
#endif
#if FS_EXTENDED_CHAR
                        if (IS_EXTENDED(c)) {
                                unread_char(c);
                                return(lex_dot);
                        }
#endif
                        t = lookup_char(c);
                        if (is_digit(t)) {
                                /* Indicate a number with first digit '.' */
                                t = POINT;
                                goto number_label;
                        }
                        unread_char(c);
                        return(lex_dot);

                case char_slash:
                        /* Deal with '/', '/=' and comments */
                        c = read_char();
                        if (START_COMMENT(c)) {
                                int a = analyse_comments;
                                c = skip_comment(a);
                                if (c == lex_eof) {
                                        goto eof_label;
                                }
                                if (a) {
                                        c = lint_comment();
                                        if (c >= 0) return(c);
                                }
                                if (column == 0) {
                                        column = 1;
                                }
                                goto start_label;
                        }
                        if (START_CPP_COMMENT(c)) {
                                int a = analyse_comments;
                                c = skip_cpp_comment(a);
                                if (c == lex_eof) {
                                        goto eof_label;
                                }
                                if (a) {
                                        c = lint_comment();
                                        if (c >= 0) {
                                                return(c);
                                        }
                                }
                                IGNORE read_char();
                                goto newline_label;
                        }
                        if (c == char_equal) {
                                return(lex_div_Heq);
                        }
                        unread_char(c);
                        return(lex_div);

                case char_colon:
                        /* Deal with ':', '::' and ':>' */
                        c = read_char();
#if LANGUAGE_CPP
                        /* '::' is only allowed in C++ */
                        if (c == char_colon) {
                                return(lex_colon_Hcolon);
                        }
#endif
                        if (c == char_greater && allow_digraphs) {
                                return(lex_close_Hsquare_H2);
                        }
                        unread_char(c);
                        return(lex_colon);

                case char_less:
                        /* Deal with '<', '<=', '<<', '<<=', '<%', '<:' */
                        c = read_char();
                        if (c == char_equal) {
                                return(lex_less_Heq);
                        }
                        if (c == char_less) {
                                c = read_char();
                                if (c == char_equal) {
                                        return(lex_lshift_Heq);
                                }
                                unread_char(c);
                                return(lex_lshift);
                        }
                        if (c == char_percent && allow_digraphs) {
                                return(lex_open_Hbrace_H2);
                        }
                        if (c == char_colon && allow_digraphs) {
                                return(lex_open_Hsquare_H2);
                        }
                        if (c == char_question && allow_extra_symbols) {
                                return(lex_min);
                        }
                        unread_char(c);
                        return(lex_less);

                case char_equal:
                        /* Deal with '=' and '==' */
                        c = read_char();
                        switch (c) {
                        case char_equal:
                                return(lex_eq);
                        case char_ampersand:
                        case char_asterix:
                        case char_minus:
                        case char_plus:
                                update_column();
                                report(crt_loc, ERR_lex_op_old_assign(c, c));
                                break;
                        }
                        unread_char(c);
                        return(lex_assign);

                case char_greater:
                        /* Deal with '>', '>=', '>>' and '>>=' */
                        c = read_char();
                        if (c == char_equal) {
                                return(lex_greater_Heq);
                        }
                        if (c == char_greater) {
                                c = read_char();
                                if (c == char_equal) {
                                        return(lex_rshift_Heq);
                                }
                                unread_char(c);
                                return(lex_rshift);
                        }
                        if (c == char_question && allow_extra_symbols) {
                                return(lex_max);
                        }
                        unread_char(c);
                        return(lex_greater);

                case char_circum:
                        /* Deal with '^' and '^=' */
                        c = read_char();
                        if (c == char_equal) {
                                return(lex_xor_Heq_H1);
                        }
                        unread_char(c);
                        return(lex_xor_H1);

                case char_bar:
                        /* Deal with '|', '||' and '|=' */
                        c = read_char();
                        if (c == char_bar) {
                                return(lex_logical_Hor_H1);
                        }
                        if (c == char_equal) {
                                return(lex_or_Heq_H1);
                        }
                        unread_char(c);
                        return(lex_or_H1);

                case char_open_round:
                        /* Deal with '(' */
                        return(lex_open_Hround);

                case char_close_round:
                        /* Deal with ')' */
                        return(lex_close_Hround);

                case char_comma:
                        /* Deal with ',' */
                        return(lex_comma);

                case char_semicolon:
                        /* Deal with ';' */
                        return(lex_semicolon);

                case char_open_square:
                        /* Deal with '[' */
                        return(lex_open_Hsquare_H1);

                case char_close_square:
                        /* Deal with ']' */
                        return(lex_close_Hsquare_H1);

                case char_open_brace:
                        /* Deal with '{' */
                        return(lex_open_Hbrace_H1);

                case char_close_brace:
                        /* Deal with '}' */
                        return(lex_close_Hbrace_H1);

                case char_tilde:
                        /* Deal with '~' */
                        return(lex_compl_H1);

                default:
                        /* Anything else is an unknown character */
                        goto unknown_label;
                }
        }

        /* Read an identifier (calculating hash value on fly) */
        if (is_alpha(t)) {
                HASHID nm;
                LOCATION loc;
                BUFFER *bf = &token_buff;
                string s = bf->start;
                string se = bf->end;
                unsigned long h = (unsigned long)c;
                *(s++) = (character)c;

                /* Get the second character */
                update_column();
                loc = crt_loc;
                c = read_char();
#if FS_EXTENDED_CHAR
                t = (IS_EXTENDED(c)? ILLEG : lookup_char(c));
#else
                t = lookup_char(c);
#endif
                if (is_alphanum(t)) {
                        /* Scan the third and subsequent characters */
                        do {
                                h = HASH_POWER * h + (unsigned long)c;
                                *s = (character)c;
                                if (++s == se) {
                                        s = extend_buffer(bf, s);
                                        se = bf->end;
                                }
                                c = read_char();
#if FS_EXTENDED_CHAR
                                if (IS_EXTENDED(c)) {
                                        break;
                                }
#endif
                                t = lookup_char(c);
                        } while (is_alphanum(t));
                } else {
                        /* Allow for wide strings and characters */
                        if (h == char_L && is_symbol(t)) {
                                if (c == char_quote) {
                                        IGNORE read_string(c, 1);
                                        return(lex_wstring_Hlit);
                                }
                                if (c == char_single_quote) {
                                        IGNORE read_string(c, 1);
                                        return(lex_wchar_Hlit);
                                }
                        }
                        /* Identifier of length one */
                }
                if (c == char_backslash) {
                        /* Allow for extended identifiers */
                        int ch = CHAR_NONE;
                        int nextc = read_char();
                        unsigned long u = read_unicode(nextc, &ch);
                        if (ch != CHAR_NONE) {
                                bf->posn = s;
                                nm = read_extended_id(u, ch);
                                goto identifier_label;
                        }
                }
                unread_char(c);
                se = s;
                *se = 0;

                /* Look up the symbol in the hash table */
                h %= HASH_SIZE;
                s = bf->start;
                nm = lookup_name(s, h, 0, lex_unknown);
identifier_label:
                {
                        IDENTIFIER id = DEREF_id(hashid_id(nm));
                        while (!IS_id_dummy(id)) {
                                /* Scan to last hidden value */
                                id = DEREF_id(id_alias(id));
                        }
                        COPY_loc(id_loc(id), loc);
                }
                token_hashid = nm;
                return(lex_identifier);
        }

        /* Read the first token in a line */
        if (c == char_return) {
                c = read_newline();
        }
        if (c == char_newline) {
newline_label:
                /* Re-entry point after C++ style comments */
                crt_loc.line++;
                crt_loc.column = 0;
                input_crt = input_posn;
                crt_line_changed = 1;
                crt_spaces = 0;
                if (in_preproc_dir == 1) {
                        in_preproc_dir = 0;
                        return(lex_newline);
                }
start_line_label:
                /* Re-entry point after preprocessing directives */
                column = 0;
                for (;;) {
                        /* Step over any obvious spaces */
                        c = next_char();
                        if (c == char_end) {
                                c = refill_char();
                        }
                        if (c == char_return) {
                                c = read_newline();
                        }
                        if (c == char_sub) {
                                c = read_eof();
                        }
                        if (c == char_newline) {
                                crt_loc.line++;
                                crt_loc.column = 0;
                                input_crt = input_posn;
                                crt_line_changed = 1;
                                crt_spaces = 0;
                                column = 0;
                        } else if (c == char_eof) {
                                /* Check for end of file (should start line) */
                                if (column == 0) {
                                        good_eof = 1;
                                }
                                goto eof_label;
                        } else if (c == char_space) {
                                crt_spaces++;
                                column = 1;
                        } else if (c == char_tab) {
                                unsigned long tab = tab_width;
                                crt_spaces = tab *(crt_spaces / tab + 1);
                                column = 1;
                        } else {
#if FS_EXTENDED_CHAR
                                if (IS_EXTENDED(c)) {
                                        t = ILLEG;
                                        break;
                                }
#endif
                                t = lookup_char(c);
                                if (is_white(t)) {
                                        if (!is_newline(t)) {
                                                crt_spaces++;
                                                column = 1;
                                        }
                                } else {
                                        break;
                                }
                        }
                }
                /* c and t now hold the next character */
                goto process_label;
        }

        /* Read a pp-number */
        if (is_digit(t)) {
number_label:   {
                        int lastc;
                        BUFFER *bf = &token_buff;
                        string s = bf->start;
                        string se = bf->end;
                        if (t == POINT) {
                                /* t is set to POINT to indicate an initial
                                 * '.' */
                                *(s++) = char_dot;
                        }
digit_label:
                        /* Step over alphanumeric characters and '.' */
                        do {
                                *s = (character)c;
                                if (++s == se) {
                                        s = extend_buffer(bf, s);
                                        se = bf->end;
                                }
next_digit_label:
                                lastc = c;
                                c = read_char();
#if FS_EXTENDED_CHAR
                                if (IS_EXTENDED(c)) {
                                        break;
                                }
#endif
                                t = lookup_char(c);
                        } while (is_ppdigit(t));
                        if (c == char_plus || c == char_minus) {
                                /* Allow for [Ee][+-] */
                                if (lastc == char_e || lastc == char_E) {
                                        goto digit_label;
                                }
                        }
                        if (c == char_backslash) {
                                /* Allow for unicode characters */
                                int ch = CHAR_NONE;
                                int nextc = read_char();
                                unsigned long u = read_unicode(nextc, &ch);
                                if (ch != CHAR_NONE) {
                                        bf->posn = s;
                                        print_char(u, ch, 0, bf);
                                        s = bf->posn;
                                        se = bf->end;
                                        goto next_digit_label;
                                }
                        }
                        *s = 0;
                        unread_char(c);
                }
                return(lex_integer_Hlit);
        }

        /* End of file marker */
        if (c == char_sub) {
                c = read_eof();
        }
        if (c == char_eof) {
eof_label:
                if (in_preproc_dir != 0) {
                        return(lex_eof);
                }
                if (!good_eof) {
                        update_column();
                        report(crt_loc, ERR_lex_phases_eof());
                        good_eof = 1;
                }
                if (end_include(preproc)) {
                        /* Revert to previous file */
                        good_eof = 0;
                        preproc = lex_ignore_token;
                        goto start_line_label;
                }
                /* End of main file */
                return(lex_eof);
        }

        /* Unknown characters */
unknown_label:
        {
                string s = token_buff.start;
                add_multi_char(s, (unsigned long)c, CHAR_SIMPLE);
                return(lex_unknown);
        }
}


/*
    SET UP CHARACTER TABLES AND TOKEN BUFFER

    This routine prepares the character look-up tables and the token
    buffer used by the lexical analyser.  The tables main_characters,
    digit_values and escape_sequences are each passed through map_ascii,
    a newly allocated copy of the NO_CHAR entries of main_characters is
    recorded in copy_characters, and token_buff is initialised using
    extend_buffer.  When FS_MULTIBYTE is enabled and allow_multibyte is
    set, the LC_CTYPE locale is first taken from the environment.
*/

void
init_char(void)
{
        int n;
        unsigned char *table;

#if FS_MULTIBYTE
        /* Multibyte characters use the native locale */
        if (allow_multibyte) {
                IGNORE setlocale(LC_CTYPE, "");
        }
#endif

        /* Adjust the built-in tables for non-ASCII codesets */
        map_ascii(main_characters);
        map_ascii(digit_values);
        map_ascii(escape_sequences);

        /* Keep a separate copy of the main look-up table */
        table = xmalloc_nof(unsigned char, NO_CHAR);
        copy_characters = table;
        for (n = 0; n < NO_CHAR; n++) {
                table[n] = main_characters[n];
        }

        /* Initialise the token buffer */
        token_buff.posn = extend_buffer(&token_buff, token_buff.posn);
}


/*
    INITIALISE INPUT FILE READING

    This routine initialises the lexical analysis routines in preparation
    for parsing or preprocessing the current input file.  It selects
    buffer number zero, records the current location in preproc_loc,
    clears have_syntax_error, calls open_startup for the first start-up
    file and finally calls init_parser with NIL(PPTOKEN).  The order of
    the calls matters, since each step works on the shared lexer state
    left by the previous one.
*/

void
init_lex(void)
{
        /* Initialise file variables */
        crt_buff_no = 0;
        IGNORE init_buffer(crt_buff_no);
        start_preproc_if ();
        preproc_loc = crt_loc;
        have_syntax_error = 0;
        if (do_header) {
                /* Only performed when do_header is set; NIL(INCL_DIR) is
                 * passed as the directory argument */
                dump_start(&crt_loc, NIL(INCL_DIR));
        }

        /* Deal with first start-up file */
        open_startup();

        /* Force processing to start at the beginning of a line.  A newline
         * is pushed back onto the input; the lexer's newline handling
         * increments crt_loc.line when it reads one, so the line number is
         * decremented here, presumably to cancel that increment */
        unread_char(char_newline);
        crt_loc.line--;

        /* Initialise the parser */
        init_parser(NIL(PPTOKEN));
        return;
}


/*
    PARSE THE CURRENT INPUT FILE

    Main entry point for parsing the current input file.  The lexical
    analyser is set up by init_lex, ADVANCE_LEXER is invoked once before
    parsing begins, and parse_file is then called with NULL_type and
    dspec_none.
*/

void
process_file(void)
{
        /* Prepare the lexical analysis routines */
        init_lex();

        /* Advance the lexer once before handing over to the parser
         * (presumably to load the first token - see the macro definition) */
        ADVANCE_LEXER;

        /* Parse the whole file */
        parse_file(NULL_type, dspec_none);
}
/* Subversion Repositories tendra.SVN */

/* (root)/trunk/src/producers/common/parse/lex.c – Rev 7 */