fixed a couple of awk parsing bugs

- a token was lost in an expression like bash | xyz + 20 because of the way preget_token() was used.
- wrong redirection handling in an expression like print 1,2,3 > (4)
This commit is contained in:
hyung-hwan 2013-02-04 09:19:17 +00:00
parent f070058372
commit 12b8ad7cc9
3 changed files with 151 additions and 40 deletions

View File

@ -180,6 +180,13 @@ struct qse_awk_t
/* maximum number of local variables */ /* maximum number of local variables */
qse_size_t nlcls_max; qse_size_t nlcls_max;
/* some data to find if an expression is
* enclosed in parentheses or not.
* see parse_primary_lparen() and parse_print() in parse.c
*/
qse_size_t lparen_seq;
qse_size_t lparen_last_closed;
} parse; } parse;
/* source code management */ /* source code management */

View File

@ -194,7 +194,7 @@ static qse_awk_nde_t* parse_block_dc (
static qse_awk_nde_t* parse_statement ( static qse_awk_nde_t* parse_statement (
qse_awk_t* awk, const qse_awk_loc_t* xloc); qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_expr_dc ( static qse_awk_nde_t* parse_expr_withdc (
qse_awk_t* awk, const qse_awk_loc_t* xloc); qse_awk_t* awk, const qse_awk_loc_t* xloc);
static qse_awk_nde_t* parse_logical_or ( static qse_awk_nde_t* parse_logical_or (
@ -972,7 +972,7 @@ static int parse_progunit (qse_awk_t* awk)
awk->parse.id.block = PARSE_PATTERN; awk->parse.id.block = PARSE_PATTERN;
eloc = awk->tok.loc; eloc = awk->tok.loc;
ptn = parse_expr_dc (awk, &eloc); ptn = parse_expr_withdc (awk, &eloc);
if (ptn == QSE_NULL) return -1; if (ptn == QSE_NULL) return -1;
QSE_ASSERT (ptn->next == QSE_NULL); QSE_ASSERT (ptn->next == QSE_NULL);
@ -986,7 +986,7 @@ static int parse_progunit (qse_awk_t* awk)
} }
eloc = awk->tok.loc; eloc = awk->tok.loc;
ptn->next = parse_expr_dc (awk, &eloc); ptn->next = parse_expr_withdc (awk, &eloc);
if (ptn->next == QSE_NULL) if (ptn->next == QSE_NULL)
{ {
@ -2062,7 +2062,7 @@ static qse_awk_nde_t* parse_if (qse_awk_t* awk, const qse_awk_loc_t* xloc)
if (get_token(awk) <= -1) return QSE_NULL; if (get_token(awk) <= -1) return QSE_NULL;
eloc = awk->tok.loc; eloc = awk->tok.loc;
test = parse_expr_dc (awk, &eloc); test = parse_expr_withdc (awk, &eloc);
if (test == QSE_NULL) goto oops; if (test == QSE_NULL) goto oops;
if (!MATCH(awk,TOK_RPAREN)) if (!MATCH(awk,TOK_RPAREN))
@ -2135,7 +2135,7 @@ static qse_awk_nde_t* parse_while (qse_awk_t* awk, const qse_awk_loc_t* xloc)
if (get_token(awk) <= -1) goto oops; if (get_token(awk) <= -1) goto oops;
ploc = awk->tok.loc; ploc = awk->tok.loc;
test = parse_expr_dc (awk, &ploc); test = parse_expr_withdc (awk, &ploc);
if (test == QSE_NULL) goto oops; if (test == QSE_NULL) goto oops;
if (!MATCH(awk,TOK_RPAREN)) if (!MATCH(awk,TOK_RPAREN))
@ -2193,7 +2193,7 @@ static qse_awk_nde_t* parse_for (qse_awk_t* awk, const qse_awk_loc_t* xloc)
int no_foreach = MATCH(awk,TOK_LPAREN); int no_foreach = MATCH(awk,TOK_LPAREN);
ploc = awk->tok.loc; ploc = awk->tok.loc;
init = parse_expr_dc (awk, &ploc); init = parse_expr_withdc (awk, &ploc);
if (init == QSE_NULL) goto oops; if (init == QSE_NULL) goto oops;
if (!no_foreach && init->type == QSE_AWK_NDE_EXP_BIN && if (!no_foreach && init->type == QSE_AWK_NDE_EXP_BIN &&
@ -2247,7 +2247,7 @@ static qse_awk_nde_t* parse_for (qse_awk_t* awk, const qse_awk_loc_t* xloc)
if (!MATCH(awk,TOK_SEMICOLON)) if (!MATCH(awk,TOK_SEMICOLON))
{ {
ploc = awk->tok.loc; ploc = awk->tok.loc;
test = parse_expr_dc (awk, &ploc); test = parse_expr_withdc (awk, &ploc);
if (test == QSE_NULL) goto oops; if (test == QSE_NULL) goto oops;
if (!MATCH(awk,TOK_SEMICOLON)) if (!MATCH(awk,TOK_SEMICOLON))
@ -2268,7 +2268,7 @@ static qse_awk_nde_t* parse_for (qse_awk_t* awk, const qse_awk_loc_t* xloc)
{ {
{ {
qse_awk_loc_t eloc = awk->tok.loc; qse_awk_loc_t eloc = awk->tok.loc;
incr = parse_expr_dc (awk, &eloc); incr = parse_expr_withdc (awk, &eloc);
if (incr == QSE_NULL) goto oops; if (incr == QSE_NULL) goto oops;
} }
@ -2344,7 +2344,7 @@ static qse_awk_nde_t* parse_dowhile (qse_awk_t* awk, const qse_awk_loc_t* xloc)
if (get_token(awk) <= -1) goto oops; if (get_token(awk) <= -1) goto oops;
ploc = awk->tok.loc; ploc = awk->tok.loc;
test = parse_expr_dc (awk, &ploc); test = parse_expr_withdc (awk, &ploc);
if (test == QSE_NULL) goto oops; if (test == QSE_NULL) goto oops;
if (!MATCH(awk,TOK_RPAREN)) if (!MATCH(awk,TOK_RPAREN))
@ -2449,7 +2449,7 @@ static qse_awk_nde_t* parse_return (qse_awk_t* awk, const qse_awk_loc_t* xloc)
else else
{ {
qse_awk_loc_t eloc = awk->tok.loc; qse_awk_loc_t eloc = awk->tok.loc;
val = parse_expr_dc (awk, &eloc); val = parse_expr_withdc (awk, &eloc);
if (val == QSE_NULL) if (val == QSE_NULL)
{ {
QSE_AWK_FREE (awk, nde); QSE_AWK_FREE (awk, nde);
@ -2487,7 +2487,7 @@ static qse_awk_nde_t* parse_exit (qse_awk_t* awk, const qse_awk_loc_t* xloc)
else else
{ {
qse_awk_loc_t eloc = awk->tok.loc; qse_awk_loc_t eloc = awk->tok.loc;
val = parse_expr_dc (awk, &eloc); val = parse_expr_withdc (awk, &eloc);
if (val == QSE_NULL) if (val == QSE_NULL)
{ {
QSE_AWK_FREE (awk, nde); QSE_AWK_FREE (awk, nde);
@ -2648,6 +2648,17 @@ static qse_awk_nde_t* parse_print (qse_awk_t* awk, const qse_awk_loc_t* xloc)
qse_awk_nde_t* args_tail; qse_awk_nde_t* args_tail;
qse_awk_nde_t* tail_prev; qse_awk_nde_t* tail_prev;
int in_parens = 0, gm_in_parens = 0;
qse_size_t opening_lparen_seq;
if (MATCH(awk,TOK_LPAREN))
{
/* just remember the sequence number of the left
* parenthesis before calling parse_expr_withdc()
* that eventually calls parse_primary_lparen() */
opening_lparen_seq = awk->parse.lparen_seq;
in_parens = 1; /* maybe. not confirmed yet */
/* print and printf provide weird syntaxs. /* print and printf provide weird syntaxs.
* *
* 1. print 10, 20; * 1. print 10, 20;
@ -2655,11 +2666,13 @@ static qse_awk_nde_t* parse_print (qse_awk_t* awk, const qse_awk_loc_t* xloc)
* 3. print (10,20,30) in a; * 3. print (10,20,30) in a;
* 4. print ((10,20,30) in a); * 4. print ((10,20,30) in a);
* *
* Due the case 3, i can't consume LPAREN * Due to case 3, i can't consume LPAREN
* here and expect RPAREN later. * here and expect RPAREN later.
*/ */
}
eloc = awk->tok.loc; eloc = awk->tok.loc;
args = parse_expr_dc (awk, &eloc); args = parse_expr_withdc (awk, &eloc);
if (args == QSE_NULL) goto oops; if (args == QSE_NULL) goto oops;
args_tail = args; args_tail = args;
@ -2669,6 +2682,7 @@ static qse_awk_nde_t* parse_print (qse_awk_t* awk, const qse_awk_loc_t* xloc)
{ {
/* args->type == QSE_AWK_NDE_GRP when print (a, b, c) /* args->type == QSE_AWK_NDE_GRP when print (a, b, c)
* args->type != QSE_AWK_NDE_GRP when print a, b, c */ * args->type != QSE_AWK_NDE_GRP when print a, b, c */
qse_size_t group_opening_lparen_seq;
while (MATCH(awk,TOK_COMMA)) while (MATCH(awk,TOK_COMMA))
{ {
@ -2678,19 +2692,60 @@ static qse_awk_nde_t* parse_print (qse_awk_t* awk, const qse_awk_loc_t* xloc)
} }
while (MATCH(awk,TOK_NEWLINE)); while (MATCH(awk,TOK_NEWLINE));
/* if it's grouped, i must check if the last group member
* is enclosed in parentheses.
*
* i set the condition to false whenever i see
* a new group member. */
gm_in_parens = 0;
if (MATCH(awk,TOK_LPAREN))
{
group_opening_lparen_seq = awk->parse.lparen_seq;
gm_in_parens = 1; /* maybe */
}
eloc = awk->tok.loc; eloc = awk->tok.loc;
args_tail->next = parse_expr_dc (awk, &eloc); args_tail->next = parse_expr_withdc (awk, &eloc);
if (args_tail->next == QSE_NULL) goto oops; if (args_tail->next == QSE_NULL) goto oops;
tail_prev = args_tail; tail_prev = args_tail;
args_tail = args_tail->next; args_tail = args_tail->next;
if (gm_in_parens == 1 && awk->ptok.type == TOK_RPAREN &&
awk->parse.lparen_last_closed == group_opening_lparen_seq)
{
/* confirm that the last group seen so far
* is parenthesized */
gm_in_parens = 2;
}
} }
} }
/* print 1 > 2 would print 1 to the file named 2. /* print 1 > 2 would print 1 to the file named 2.
* print (1 > 2) would print (1 > 2) on the console */ * print (1 > 2) would print (1 > 2) on the console
if (awk->ptok.type != TOK_RPAREN && *
args_tail->type == QSE_AWK_NDE_EXP_BIN) * think of all these... there are many more possible combinations.
*
* print ((10,20,30) in a) > "x";
* print ((10,20,30) in a)
* print ((10,20,30) in a) > ("x");
* print ((10,20,30) in a) > (("x"));
* function abc() { return "abc"; } BEGIN { print (1 > abc()); }
* function abc() { return "abc"; } BEGIN { print 1 > abc(); }
* print 1, 2, 3 > 4;
* print (1, 2, 3) > 4;
* print ((1, 2, 3) > 4);
* print 1, 2, 3 > 4 + 5;
* print 1, 2, (3 > 4) > 5;
* print 1, 2, (3 > 4) > 5 + 6;
*/
if (in_parens == 1 && awk->ptok.type == TOK_RPAREN &&
awk->parse.lparen_last_closed == opening_lparen_seq)
{
in_parens = 2; /* confirmed */
}
if (in_parens != 2 && gm_in_parens != 2 && args_tail->type == QSE_AWK_NDE_EXP_BIN)
{ {
int i; int i;
qse_awk_nde_exp_t* ep = (qse_awk_nde_exp_t*)args_tail; qse_awk_nde_exp_t* ep = (qse_awk_nde_exp_t*)args_tail;
@ -2762,7 +2817,7 @@ static qse_awk_nde_t* parse_print (qse_awk_t* awk, const qse_awk_loc_t* xloc)
if (get_token(awk) <= -1) goto oops; if (get_token(awk) <= -1) goto oops;
eloc = awk->tok.loc; eloc = awk->tok.loc;
out = parse_expr_dc (awk, &eloc); out = parse_expr_withdc (awk, &eloc);
if (out == QSE_NULL) goto oops; if (out == QSE_NULL) goto oops;
} }
} }
@ -2887,11 +2942,11 @@ static qse_awk_nde_t* parse_statement_nb (
if (get_token(awk) <= -1) return QSE_NULL; if (get_token(awk) <= -1) return QSE_NULL;
nde = parse_print (awk, xloc); nde = parse_print (awk, xloc);
} }
else nde = parse_expr_dc (awk, xloc); else nde = parse_expr_withdc (awk, xloc);
} }
else else
{ {
nde = parse_expr_dc (awk, xloc); nde = parse_expr_withdc (awk, xloc);
} }
if (nde == QSE_NULL) return QSE_NULL; if (nde == QSE_NULL) return QSE_NULL;
@ -3033,7 +3088,7 @@ static qse_awk_nde_t* parse_expr_basic (
} }
eloc = awk->tok.loc; eloc = awk->tok.loc;
n1 = parse_expr_dc (awk, &eloc); n1 = parse_expr_withdc (awk, &eloc);
if (n1 == QSE_NULL) if (n1 == QSE_NULL)
{ {
qse_awk_clrpt (awk, nde); qse_awk_clrpt (awk, nde);
@ -3055,7 +3110,7 @@ static qse_awk_nde_t* parse_expr_basic (
} }
eloc = awk->tok.loc; eloc = awk->tok.loc;
n2 = parse_expr_dc (awk, &eloc); n2 = parse_expr_withdc (awk, &eloc);
if (n2 == QSE_NULL) if (n2 == QSE_NULL)
{ {
qse_awk_clrpt (awk, nde); qse_awk_clrpt (awk, nde);
@ -3120,7 +3175,7 @@ static qse_awk_nde_t* parse_expr (
{ {
qse_awk_loc_t eloc = awk->tok.loc; qse_awk_loc_t eloc = awk->tok.loc;
y = parse_expr_dc (awk, &eloc); y = parse_expr_withdc (awk, &eloc);
} }
if (y == QSE_NULL) if (y == QSE_NULL)
{ {
@ -3149,11 +3204,13 @@ static qse_awk_nde_t* parse_expr (
return (qse_awk_nde_t*)nde; return (qse_awk_nde_t*)nde;
} }
static qse_awk_nde_t* parse_expr_dc ( static qse_awk_nde_t* parse_expr_withdc (
qse_awk_t* awk, const qse_awk_loc_t* xloc) qse_awk_t* awk, const qse_awk_loc_t* xloc)
{ {
qse_awk_nde_t* nde; qse_awk_nde_t* nde;
/* perform depth check before parsing expression */
if (awk->opt.depth.s.expr_parse > 0 && if (awk->opt.depth.s.expr_parse > 0 &&
awk->parse.depth.expr >= awk->opt.depth.s.expr_parse) awk->parse.depth.expr >= awk->opt.depth.s.expr_parse)
{ {
@ -4338,13 +4395,16 @@ static qse_awk_nde_t* parse_primary_lparen (qse_awk_t* awk, const qse_awk_loc_t*
qse_awk_nde_t* nde; qse_awk_nde_t* nde;
qse_awk_nde_t* last; qse_awk_nde_t* last;
qse_awk_loc_t eloc; qse_awk_loc_t eloc;
qse_size_t opening_lparen_seq;
opening_lparen_seq = awk->parse.lparen_seq++;
/* eat up the left parenthesis */ /* eat up the left parenthesis */
if (get_token(awk) <= -1) return QSE_NULL; if (get_token(awk) <= -1) return QSE_NULL;
/* parse the sub-expression inside the parentheses */ /* parse the sub-expression inside the parentheses */
eloc = awk->tok.loc; eloc = awk->tok.loc;
nde = parse_expr_dc (awk, &eloc); nde = parse_expr_withdc (awk, &eloc);
if (nde == QSE_NULL) return QSE_NULL; if (nde == QSE_NULL) return QSE_NULL;
/* parse subsequent expressions separated by a comma, if any */ /* parse subsequent expressions separated by a comma, if any */
@ -4362,7 +4422,7 @@ static qse_awk_nde_t* parse_primary_lparen (qse_awk_t* awk, const qse_awk_loc_t*
while (MATCH(awk,TOK_NEWLINE)); while (MATCH(awk,TOK_NEWLINE));
eloc = awk->tok.loc; eloc = awk->tok.loc;
tmp = parse_expr_dc (awk, &eloc); tmp = parse_expr_withdc (awk, &eloc);
if (tmp == QSE_NULL) goto oops; if (tmp == QSE_NULL) goto oops;
QSE_ASSERT (tmp->next == QSE_NULL); QSE_ASSERT (tmp->next == QSE_NULL);
@ -4378,6 +4438,10 @@ static qse_awk_nde_t* parse_primary_lparen (qse_awk_t* awk, const qse_awk_loc_t*
goto oops; goto oops;
} }
/* remember the sequence number of the left parenthesis
* that has just been closed by the matching right parenthesis */
awk->parse.lparen_last_closed = opening_lparen_seq;
if (get_token(awk) <= -1) goto oops; if (get_token(awk) <= -1) goto oops;
/* check if it is a chained node */ /* check if it is a chained node */
@ -4492,7 +4556,7 @@ novar:
ploc = awk->tok.loc; ploc = awk->tok.loc;
/* TODO: is this correct? */ /* TODO: is this correct? */
/*nde->in = parse_expr_dc (awk, &ploc);*/ /*nde->in = parse_expr_withdc (awk, &ploc);*/
nde->in = parse_primary (awk, &ploc); nde->in = parse_primary (awk, &ploc);
if (nde->in == QSE_NULL) goto oops; if (nde->in == QSE_NULL) goto oops;
@ -5077,7 +5141,7 @@ static qse_awk_nde_t* parse_hashidx (
{ {
qse_awk_loc_t eloc = awk->tok.loc; qse_awk_loc_t eloc = awk->tok.loc;
tmp = parse_expr_dc (awk, &eloc); tmp = parse_expr_withdc (awk, &eloc);
} }
if (tmp == QSE_NULL) if (tmp == QSE_NULL)
{ {
@ -5236,7 +5300,7 @@ static qse_awk_nde_t* parse_fncall (
while (1) while (1)
{ {
eloc = awk->tok.loc; eloc = awk->tok.loc;
nde = parse_expr_dc (awk, &eloc); nde = parse_expr_withdc (awk, &eloc);
if (nde == QSE_NULL) goto oops; if (nde == QSE_NULL) goto oops;
if (head == QSE_NULL) head = nde; if (head == QSE_NULL) head = nde;
@ -5843,6 +5907,7 @@ static int get_symbols (qse_awk_t* awk, qse_cint_t c, qse_awk_tok_t* tok)
/* note that the loop below is not generaic enough. /* note that the loop below is not generaic enough.
* you must keep the operators strings in a particular order */ * you must keep the operators strings in a particular order */
for (p = ops; p->str != QSE_NULL; ) for (p = ops; p->str != QSE_NULL; )
{ {
if (p->opt == 0 || (awk->opt.trait & p->opt)) if (p->opt == 0 || (awk->opt.trait & p->opt))
@ -6089,7 +6154,46 @@ static int get_token (qse_awk_t* awk)
static int preget_token (qse_awk_t* awk) static int preget_token (qse_awk_t* awk)
{ {
/* LIMITATION: no more than one token can be pre-read in a row
without consumption. */
if (QSE_STR_LEN(awk->ntok.name) > 0)
{
/* you can't read more than 1 token in advance.
*
* if there is a token already read in, it is just
* retained.
*
* parsing an expression like '$0 | a' causes this
* function to be called before get_token() consumes the
* pre-read token.
*
* Because the expression like this
* print $1 | getline x;
* must be parsed as
* print $(1 | getline x);
* preget_token() is called from parse_primary().
*
* For the expression '$0 | $2',
* 1) parse_primary() calls parse_primary_positional() if $ is encountered.
* 2) parse_primary_positional() calls parse_primary() recursively for the positional part after $.
* 3) parse_primary() in #2 calls preget_token()
* 4) parse_primary() in #1 also calls preget_token().
*
* this block is reached because no token is consumed between #3 and #4.
*
* in short, it happens if getline doesn't follow | after the positional.
* $1 | $2
* $1 | abc + 20
*/
return 0;
}
else
{
/* if there is no token pre-read, we get a new
* token and place it to awk->ntok. */
return get_token_into (awk, &awk->ntok); return get_token_into (awk, &awk->ntok);
}
} }
static int classify_ident (qse_awk_t* awk, const qse_cstr_t* name) static int classify_ident (qse_awk_t* awk, const qse_cstr_t* name)

View File

@ -178,7 +178,7 @@ PROGS="
lang-049.awk!!!--newline=on -d- lang-049.awk!!!--newline=on -d-
columnate.awk!passwd.dat!!--newline=on -F: columnate.awk!passwd.dat!!--newline=on -F:
levenshtein-utests.awk!!!--newline=on --extrakws=on levenshtein-utests.awk!!!--newline=on
rcalc.awk!!!--newline=on -v target=89000 rcalc.awk!!!--newline=on -v target=89000
quicksort.awk!quicksort.dat!! quicksort.awk!quicksort.dat!!
quicksort2.awk!quicksort2.dat!!-vQSEAWK=\"${QSEAWK}\" -vSCRIPT_PATH=\"${SCRIPT_DIR}\" quicksort2.awk!quicksort2.dat!!-vQSEAWK=\"${QSEAWK}\" -vSCRIPT_PATH=\"${SCRIPT_DIR}\"