From 52d3e2528e73a7208715f3683fa75cf9a05af930 Mon Sep 17 00:00:00 2001 From: "hyunghwan.chung" Date: Fri, 23 Sep 2016 08:56:25 +0000 Subject: [PATCH] changed the tokenizer not to read ahead before returning a token --- stix/lib/comp.c | 175 +++++++++++++++++++++++++++----------------- stix/lib/exec.c | 4 +- stix/lib/main.c | 26 ++++--- stix/lib/stix-prv.h | 4 +- 4 files changed, 129 insertions(+), 80 deletions(-) diff --git a/stix/lib/comp.c b/stix/lib/comp.c index 5cc51a3..4d6dce6 100644 --- a/stix/lib/comp.c +++ b/stix/lib/comp.c @@ -80,32 +80,32 @@ static struct voca_t stix_oow_t len; stix_ooch_t str[11]; } vocas[] = { - { 4, { 'b','y','t','e' } }, - { 9, { 'c','h','a','r','a','c','t','e','r' } }, - { 5, { 'c','l','a','s','s' } }, - { 9, { 'c','l','a','s','s','i','n','s','t' } }, - { 3, { 'd','c','l' } }, - { 7, { 'd','e','c','l','a','r','e' } }, + { 5, { '#','b','y','t','e' } }, + { 10, { '#','c','h','a','r','a','c','t','e','r' } }, + { 6, { '#','c','l','a','s','s' } }, + { 10, { '#','c','l','a','s','s','i','n','s','t' } }, + { 4, { '#','d','c','l' } }, + { 8, { '#','d','e','c','l','a','r','e' } }, { 6, { 'e','n','s','u','r','e', } }, { 9, { 'e','x','c','e','p','t','i','o','n' } }, - { 6, { 'e','x','t','e','n','d' } }, + { 7, { '#','e','x','t','e','n','d' } }, { 5, { 'f','a','l','s','e' } }, - { 8, { 'h','a','l','f','w','o','r','d' } }, - { 7, { 'i','n','c','l','u','d','e' } }, - { 6, { 'l','i','w','o','r','d' } }, - { 4, { 'm','a','i','n' } }, - { 6, { 'm','e','t','h','o','d' } }, - { 3, { 'm','t','h' } }, + { 9, { '#','h','a','l','f','w','o','r','d' } }, + { 8, { '#','i','n','c','l','u','d','e' } }, + { 7, { '#','l','i','w','o','r','d' } }, + { 5, { '#','m','a','i','n' } }, + { 7, { '#','m','e','t','h','o','d' } }, + { 4, { '#','m','t','h' } }, { 3, { 'n','i','l' } }, - { 7, { 'p','o','i','n','t','e','r' } }, - { 7, { 'p','o','o','l','d','i','c' } }, + { 8, { '#','p','o','i','n','t','e','r' } }, + { 8, { '#','p','o','o','l','d','i','c' } }, { 10, { 'p','r','i','m','i','t','i','v','e',':' } }, { 4, { 's','e','l','f' } }, { 5, { 's','u','p','e','r' } }, { 11, { 't','h','i','s','C','o','n','t','e','x','t' } }, { 11, { 't','h','i','s','P','r','o','c','e','s','s' } }, { 4, { 't','r','u','e' } }, - { 4, { 'w','o','r','d' } }, + { 5, { '#','w','o','r','d' } }, { 1, { '|' } }, { 1, { '>' } }, @@ -546,8 +546,8 @@ static stix_oop_t string_to_num (stix_t* stix, stix_oocs_t* str, int radixed) static STIX_INLINE int does_token_name_match (stix_t* stix, voca_id_t id) { - return stix->c->tok.name.len == vocas[id].len && - stix_equalchars(stix->c->tok.name.ptr, vocas[id].str, vocas[id].len); + return TOKEN_NAME_LEN(stix) == vocas[id].len && + stix_equalchars(TOKEN_NAME_PTR(stix), vocas[id].str, vocas[id].len); } static STIX_INLINE int is_token_symbol (stix_t* stix, voca_id_t id) @@ -679,7 +679,7 @@ static int get_char (stix_t* stix) return 1; /* indicate that a normal character has been read */ } -static int skip_spaces (stix_t* stix) +static STIX_INLINE int skip_spaces (stix_t* stix) { while (is_spacechar(stix->c->lxc.c)) GET_CHAR (stix); return 0; @@ -705,7 +705,7 @@ static int skip_comment (stix_t* stix) } while (c != '"'); - if (c == '"') GET_CHAR (stix); + if (c == '"') GET_CHAR (stix); /* keep the next character in lxc */ return 1; /* double-quoted comment */ } @@ -728,7 +728,7 @@ static int skip_comment (stix_t* stix) } else if (c == '\r' || c == '\n') { - GET_CHAR (stix); + GET_CHAR (stix); /* keep the first meaningful character in lxc */ break; } } @@ -737,7 +737,7 @@ static int skip_comment (stix_t* stix) return 1; /* single line comment led by ## or #! */ } - /* unget '#' */ + /* unget the leading '#' */ unget_char (stix, &stix->c->lxc); /* restore the previous state */ stix->c->lxc = lc; @@ -820,13 +820,20 @@ static int get_ident (stix_t* stix, stix_ooci_t char_read_ahead) while (is_identchar(c)); if (c == '.') goto read_more_seg; + else unget_char (stix, &stix->c->lxc); } else { unget_char (stix, &stix->c->lxc); - stix->c->lxc = period; + + /* unget the period itself */ + unget_char (stix, &period); } } + else + { + unget_char (stix, &stix->c->lxc); + } /* handle reserved words */ if (is_token_word(stix, VOCA_SELF)) @@ -935,6 +942,8 @@ static int get_numlit (stix_t* stix, int negated) SET_TOKEN_TYPE (stix, STIX_IOTOK_RADNUMLIT); } + unget_char (stix, &stix->c->lxc); + /* * TODO: handle floating point number */ @@ -957,7 +966,6 @@ static int get_charlit (stix_t* stix) SET_TOKEN_TYPE (stix, STIX_IOTOK_CHARLIT); ADD_TOKEN_CHAR(stix, c); - GET_CHAR (stix); return 0; } @@ -989,10 +997,13 @@ static int get_strlit (stix_t* stix) } while (c != '\''); + /* 'c' must be a single quote at this point*/ GET_CHAR_TO (stix, c); } - while (c == '\''); + while (c == '\''); /* if the next character is a single quote, + it becomes a literal single quote character. */ + unget_char (stix, &stix->c->lxc); return 0; } @@ -1088,8 +1099,6 @@ static int get_string (stix_t* stix, stix_ooch_t end_char, stix_ooch_t esc_char, if (escaped == 0 && c == end_char) { /* terminating quote */ - /*GET_CHAR_TO (stix, c);*/ - GET_CHAR (stix); break; } @@ -1171,21 +1180,25 @@ static int get_binsel (stix_t* stix) /* special case if a minus is followed by a digit immediately */ if (oc == '-' && is_digitchar(stix->c->lxc.c)) return get_numlit (stix, 1); +#if 1 /* up to 2 characters only */ if (is_binselchar (stix->c->lxc.c)) { ADD_TOKEN_CHAR (stix, stix->c->lxc.c); - GET_CHAR (stix); } - + else + { + unget_char (stix, &stix->c->lxc); + } +#else /* or up to any occurrences */ - /* while (is_binselchar(stix->c->lxc.c)) { ADD_TOKEN_CHAR (stix, c); GET_CHAR (stix); } - */ + unget_char (stix, &stix->c->lxc); +#endif SET_TOKEN_TYPE (stix, STIX_IOTOK_BINSEL); return 0; @@ -1197,9 +1210,13 @@ static int get_token (stix_t* stix) int n; retry: + GET_CHAR (stix); + do { - if (skip_spaces(stix) <= -1) return -1; + /* skip spaces */ + while (is_spacechar(stix->c->lxc.c)) GET_CHAR (stix); + /* the first character after the last space is in stix->c->lxc */ if ((n = skip_comment(stix)) <= -1) return -1; } while (n >= 1); @@ -1207,8 +1224,8 @@ retry: /* clear the token name, reset its location */ SET_TOKEN_TYPE (stix, STIX_IOTOK_EOF); /* is it correct? */ CLEAR_TOKEN_NAME (stix); + stix->c->tok.loc = stix->c->lxc.l; /* remember token location */ - stix->c->tok.loc = stix->c->lxc.l; c = stix->c->lxc.c; switch (c) @@ -1244,15 +1261,19 @@ retry: { SET_TOKEN_TYPE (stix, STIX_IOTOK_ASSIGN); ADD_TOKEN_CHAR (stix, c); - GET_CHAR (stix); + } + else + { + unget_char (stix, &stix->c->lxc); } break; case '^': SET_TOKEN_TYPE (stix, STIX_IOTOK_RETURN); ADD_TOKEN_CHAR(stix, c); - GET_CHAR_TO (stix, c); #if 0 + GET_CHAR_TO (stix, c); + /* TODO: support explicit block return */ if (c == '^') { @@ -1260,6 +1281,10 @@ retry: TOKEN_TYPE(stix) == STIX_IOTOK_BLKRET; ADD_TOKEN_CHAR (stix, c); } + else + { + unget_char (stix, &stix->c->lxc); + } #endif break; @@ -1289,28 +1314,24 @@ retry: goto single_char_token; case '#': - /* - * The hash sign is not the part of the token name. - * ADD_TOKEN_CHAR(stix, c); */ + ADD_TOKEN_CHAR(stix, c); GET_CHAR_TO (stix, c); switch (c) { case STIX_UCI_EOF: set_syntax_error (stix, STIX_SYNERR_HLTNT, LEXER_LOC(stix), STIX_NULL); return -1; - + case '(': /* #( */ ADD_TOKEN_CHAR(stix, c); SET_TOKEN_TYPE (stix, STIX_IOTOK_ARPAREN); - GET_CHAR (stix); break; case '[': /* #[ - byte array literal */ ADD_TOKEN_CHAR(stix, c); SET_TOKEN_TYPE (stix, STIX_IOTOK_BAPAREN); - GET_CHAR (stix); break; case '\'': @@ -1334,6 +1355,8 @@ retry: GET_CHAR_TO (stix, c); } while (is_binselchar(c)); + + unget_char (stix, &stix->c->lxc); } else if (is_leadidentchar(c)) { @@ -1370,6 +1393,10 @@ retry: return -1; } } + else + { + unget_char (stix, &stix->c->lxc); + } } else if (c == '.') { @@ -1393,13 +1420,18 @@ retry: while (is_identchar(c)); if (c == '.') goto read_more_seg; + else unget_char (stix, &stix->c->lxc); } else { unget_char (stix, &stix->c->lxc); - stix->c->lxc = period; + unget_char (stix, &period); } } + else + { + unget_char (stix, &stix->c->lxc); + } } else { @@ -1488,15 +1520,14 @@ retry: single_char_token: ADD_TOKEN_CHAR(stix, c); - GET_CHAR (stix); break; } -/*stix_logbfmt (stix, STIX_FMTOUT_COMPILER | STIX_FMTOUT_DEBUG, "TOKEN: [%.*S]\n", stix->c->tok.name.len, stix->c->tok.name.ptr);*/ +STIX_DEBUG2 (stix, "TOKEN: [%.*S]\n", (stix_ooi_t)stix->c->tok.name.len, stix->c->tok.name.ptr); + return 0; } - static void clear_io_names (stix_t* stix) { stix_iolink_t* cur; @@ -1561,14 +1592,14 @@ static int begin_include (stix_t* stix) goto oops; } - /* switch to theincludee's stream */ + /* switch to the includee's stream */ stix->c->curinp = arg; /* stix->c->depth.incl++; */ /* read in the first character in the included file. * so the next call to get_token() sees the character read * from this file. */ - if (get_char(stix) <= -1 || get_token(stix) <= -1) + if (get_token(stix) <= -1) { end_include (stix); /* i don't jump to oops since i've called @@ -1898,11 +1929,11 @@ static int add_string_literal (stix_t* stix, const stix_oocs_t* str, stix_oow_t* return add_literal (stix, lit, index); } -static int add_symbol_literal (stix_t* stix, const stix_oocs_t* str, stix_oow_t* index) +static int add_symbol_literal (stix_t* stix, const stix_oocs_t* str, int offset, stix_oow_t* index) { stix_oop_t tmp; - tmp = stix_makesymbol (stix, str->ptr, str->len); + tmp = stix_makesymbol (stix, str->ptr + offset, str->len - offset); if (!tmp) return -1; return add_literal (stix, tmp, index); @@ -2679,20 +2710,27 @@ static int compile_method_primitive (stix_t* stix) stix->c->mth.prim_no = prim_no; break; - + case STIX_IOTOK_SYMLIT: - prim_no = stix_getprimno (stix, TOKEN_NAME(stix)); + { + const stix_ooch_t* tptr; + stix_oow_t tlen; + + tptr = TOKEN_NAME_PTR(stix) + 1; + tlen = TOKEN_NAME_LEN(stix) - 1; + + prim_no = stix_getprimno (stix, tptr, tlen); if (prim_no <= -1) { const stix_ooch_t* us; /* the primitive is not found */ - us = stix_findoochar (stix->c->tok.name.ptr, stix->c->tok.name.len, '_'); - if (us > stix->c->tok.name.ptr && us < stix->c->tok.name.ptr + stix->c->tok.name.len - 1) + us = stix_findoochar (tptr, tlen, '_'); + if (us > tptr && us < tptr + tlen - 1) { stix_oow_t lit_idx; /* the symbol literal contains an underscore. - * and it is none of the first of the last character */ - if (add_symbol_literal(stix, TOKEN_NAME(stix), &lit_idx) >= 0 && + * and it is neither the first nor the last character */ + if (add_symbol_literal(stix, TOKEN_NAME(stix), 1, &lit_idx) >= 0 && STIX_OOI_IN_METHOD_PREAMBLE_INDEX_RANGE(lit_idx)) { stix->c->mth.prim_type = 2; /* named primitive */ @@ -2713,6 +2751,7 @@ static int compile_method_primitive (stix_t* stix) stix->c->mth.prim_type = 1; stix->c->mth.prim_no = prim_no; break; + } default: set_syntax_error (stix, STIX_SYNERR_INTEGER, TOKEN_LOC(stix), TOKEN_NAME(stix)); @@ -3290,19 +3329,22 @@ static int __read_array_literal (stix_t* stix, stix_oop_t* xlit) break; case STIX_IOTOK_STRLIT: - lit = stix_instantiate (stix, stix->_string, stix->c->tok.name.ptr, stix->c->tok.name.len); + lit = stix_instantiate (stix, stix->_string, TOKEN_NAME_PTR(stix), TOKEN_NAME_LEN(stix)); + break; + + case STIX_IOTOK_SYMLIT: + lit = stix_makesymbol (stix, TOKEN_NAME_PTR(stix) + 1, TOKEN_NAME_LEN(stix) - 1); break; case STIX_IOTOK_IDENT: case STIX_IOTOK_IDENT_DOTTED: case STIX_IOTOK_BINSEL: case STIX_IOTOK_KEYWORD: - case STIX_IOTOK_SYMLIT: case STIX_IOTOK_SELF: case STIX_IOTOK_SUPER: case STIX_IOTOK_THIS_CONTEXT: case STIX_IOTOK_THIS_PROCESS: - lit = stix_makesymbol (stix, stix->c->tok.name.ptr, stix->c->tok.name.len); + lit = stix_makesymbol (stix, TOKEN_NAME_PTR(stix), TOKEN_NAME_LEN(stix)); break; case STIX_IOTOK_NIL: @@ -3555,7 +3597,7 @@ static int compile_expression_primary (stix_t* stix, const stix_oocs_t* ident, c break; case STIX_IOTOK_SYMLIT: - if (add_symbol_literal(stix, TOKEN_NAME(stix), &index) <= -1 || + if (add_symbol_literal(stix, TOKEN_NAME(stix), 1, &index) <= -1 || emit_single_param_instruction(stix, BCODE_PUSH_LITERAL_0, index) <= -1) return -1; GET_TOKEN (stix); break; @@ -3649,7 +3691,7 @@ static int compile_unary_message (stix_t* stix, int to_super) do { - if (add_symbol_literal(stix, TOKEN_NAME(stix), &index) <= -1 || + if (add_symbol_literal(stix, TOKEN_NAME(stix), 0, &index) <= -1 || emit_double_param_instruction(stix, send_message_cmd[to_super], 0, index) <= -1) return -1; GET_TOKEN (stix); } @@ -3688,7 +3730,7 @@ static int compile_binary_message (stix_t* stix, int to_super) * to be free from reallocation risk for the recursive call * to compile_expression_primary(). */ binsel.ptr = &stix->c->mth.binsels.ptr[binsel_offset]; - if (add_symbol_literal(stix, &binsel, &index) <= -1 || + if (add_symbol_literal(stix, &binsel, 0, &index) <= -1 || emit_double_param_instruction(stix, send_message_cmd[to_super], 1, index) <= -1) goto oops; stix->c->mth.binsels.len = saved_binsels_len; @@ -3750,7 +3792,7 @@ static int compile_keyword_message (stix_t* stix, int to_super) kwsel.ptr = &stix->c->mth.kwsels.ptr[saved_kwsel_len]; kwsel.len = stix->c->mth.kwsels.len - saved_kwsel_len; - if (add_symbol_literal(stix, &kwsel, &index) <= -1 || + if (add_symbol_literal(stix, &kwsel, 0, &index) <= -1 || emit_double_param_instruction(stix, send_message_cmd[to_super], nargs, index) <= -1) goto oops; stix->c->mth.kwsels.len = saved_kwsel_len; @@ -4953,7 +4995,7 @@ static int __compile_pooldic_definition (stix_t* stix) while (TOKEN_TYPE(stix) == STIX_IOTOK_SYMLIT) { - lit = stix_makesymbol (stix, stix->c->tok.name.ptr, stix->c->tok.name.len); + lit = stix_makesymbol (stix, TOKEN_NAME_PTR(stix) + 1, TOKEN_NAME_LEN(stix) - 1); if (!lit || add_to_array_literal_buffer (stix, lit) <= -1) return -1; GET_TOKEN (stix); @@ -4986,12 +5028,12 @@ static int __compile_pooldic_definition (stix_t* stix) goto add_literal; case STIX_IOTOK_STRLIT: - lit = stix_instantiate (stix, stix->_string, stix->c->tok.name.ptr, stix->c->tok.name.len); + lit = stix_instantiate (stix, stix->_string, TOKEN_NAME_PTR(stix), TOKEN_NAME_LEN(stix)); if (!lit) return -1; goto add_literal; case STIX_IOTOK_SYMLIT: - lit = stix_makesymbol (stix, stix->c->tok.name.ptr, stix->c->tok.name.len); + lit = stix_makesymbol (stix, TOKEN_NAME_PTR(stix) + 1, TOKEN_NAME_LEN(stix) - 1); if (!lit) return -1; goto add_literal; @@ -5086,10 +5128,8 @@ static int compile_pooldic_definition (stix_t* stix) return n; } - static int compile_stream (stix_t* stix) { - GET_CHAR (stix); GET_TOKEN (stix); while (TOKEN_TYPE(stix) != STIX_IOTOK_EOF) @@ -5131,7 +5171,6 @@ static int compile_stream (stix_t* stix) } #endif - else { set_syntax_error(stix, STIX_SYNERR_DIRECTIVE, TOKEN_LOC(stix), TOKEN_NAME(stix)); diff --git a/stix/lib/exec.c b/stix/lib/exec.c index 2d6bdb3..d45b2fd 100644 --- a/stix/lib/exec.c +++ b/stix/lib/exec.c @@ -2747,13 +2747,13 @@ static prim_t primitives[] = }; -int stix_getprimno (stix_t* stix, const stix_oocs_t* name) +int stix_getprimno (stix_t* stix, const stix_ooch_t* ptr, stix_oow_t len) { int i; for (i = 0; i < STIX_COUNTOF(primitives); i++) { - if (stix_compucxbcstr(name->ptr, name->len, primitives[i].name) == 0) + if (stix_compucxbcstr(ptr, len, primitives[i].name) == 0) { return i; } diff --git a/stix/lib/main.c b/stix/lib/main.c index 7708e91..d89eab5 100644 --- a/stix/lib/main.c +++ b/stix/lib/main.c @@ -204,24 +204,31 @@ static STIX_INLINE stix_ooi_t close_input (stix_t* stix, stix_ioarg_t* arg) static STIX_INLINE stix_ooi_t read_input (stix_t* stix, stix_ioarg_t* arg) { - xtn_t* xtn = stix_getxtn(stix); + /*xtn_t* xtn = hcl_getxtn(hcl);*/ bb_t* bb; - stix_oow_t n, bcslen, ucslen, remlen; + stix_oow_t bcslen, ucslen, remlen; int x; + bb = (bb_t*)arg->handle; STIX_ASSERT (bb != STIX_NULL && bb->fp != STIX_NULL); - n = fread (&bb->buf[bb->len], STIX_SIZEOF(bb->buf[0]), STIX_COUNTOF(bb->buf) - bb->len, bb->fp); - if (n == 0) + do { - if (ferror((FILE*)bb->fp)) + x = fgetc (bb->fp); + if (x == EOF) { - stix_seterrnum (stix, STIX_EIOERR); - return -1; + if (ferror((FILE*)bb->fp)) + { + stix_seterrnum (stix, STIX_EIOERR); + return -1; + } + break; } - } - bb->len += n; + bb->buf[bb->len++] = x; + } + while (bb->len < STIX_COUNTOF(bb->buf) && x != '\r' && x != '\n'); + bcslen = bb->len; ucslen = STIX_COUNTOF(arg->buf); x = stix_utf8toucs (bb->buf, &bcslen, arg->buf, &ucslen); @@ -754,6 +761,7 @@ int main (int argc, char* argv[]) } } + printf ("COMPILE OK. STARTING EXECUTION ...\n"); xret = 0; g_stix = stix; setup_tick (); diff --git a/stix/lib/stix-prv.h b/stix/lib/stix-prv.h index 917ea97..a50ba21 100644 --- a/stix/lib/stix-prv.h +++ b/stix/lib/stix-prv.h @@ -1321,9 +1321,11 @@ STIX_EXPORT void stix_getsynerr ( /* ========================================================================= */ /* exec.c */ /* ========================================================================= */ + int stix_getprimno ( stix_t* stix, - const stix_oocs_t* name + const stix_ooch_t* ptr, + stix_oow_t len ); /* TODO: remove debugging functions */