diff --git a/README.md b/README.md index 39e51736..fc33abe0 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ The library is stable, portable, and designed for projects that need a scripting - [Pragmas](#pragmas) - [@pragma entry](#pragma-entry) - [@pragma implicit](#pragma-implicit) - - [@pragma sriprecspc](#pragma-sriprecspc) + - [@pragma striprecspc](#pragma-striprecspc) - [@include and @include\_once](#include-and-include_once) - [Comments](#comments) - [Reserved Words](#reserved-words) @@ -364,7 +364,7 @@ This feature can be beneficial for catching potential variable misspellings or u If you don't want to enforce variable declarations, you can simply omit the `@pragma implicit off` directive or specify `@pragma implicit on`, and Hawk will behave like traditional awk, allowing implicit variable declarations. -### @pragma sriprecspc +### @pragma striprecspc The `@pragma striprecspc` directive in Hawk controls how the interpreter handles leading and trailing blank fields in input records when using a regular expression as the field separator (FS). 
diff --git a/lib/parse.c b/lib/parse.c index b7182105..baab5f8e 100644 --- a/lib/parse.c +++ b/lib/parse.c @@ -6559,7 +6559,17 @@ static int get_string ( } else { - ADD_TOKEN_UINT32(hawk, tok, c_acc); + if (digit_count == 1 && end_char == HAWK_T('/')) + { + /* inside a regular expression, it's likely a backreference */ + hawk_ooch_t oc = c_acc + HAWK_T('0'); + ADD_TOKEN_CHAR(hawk, tok, esc_char); + ADD_TOKEN_CHAR(hawk, tok, oc); + } + else + { + ADD_TOKEN_UINT32(hawk, tok, c_acc); + } escaped = 0; } } @@ -6600,7 +6610,6 @@ static int get_string ( } else { - if (digit_count == 0) { hawk_ooch_t ec; @@ -6619,7 +6628,7 @@ static int get_string ( else ADD_TOKEN_UINT32(hawk, tok, c_acc); escaped = 0; - /* carray on to handle the current character */ + /* carry on to handle the current character */ } } else if (escaped == 99) @@ -6671,10 +6680,11 @@ static int get_string ( else if (c == HAWK_T('b')) c = HAWK_T('\b'); else if (c == HAWK_T('v')) c = HAWK_T('\v'); else if (c == HAWK_T('a')) c = HAWK_T('\a'); - else if (c >= HAWK_T('0') && c <= HAWK_T('7') && end_char != HAWK_T('/')) + else if (c >= HAWK_T('0') && c <= HAWK_T('7')) { - /* i don't support the octal notation for a regular expression. - * it conflicts with the backreference notation between \1 and \7 inclusive. */ + /* treat it as an octal notation first and + * check if it's a backreference between \1 and \7 inclusive + * in the `if (escaped == 3)` block. 
*/ escaped = 3; digit_count = 1; c_acc = c - HAWK_T('0'); diff --git a/t/h-002.hawk b/t/h-002.hawk index dddc309b..2ee2c63d 100644 --- a/t/h-002.hawk +++ b/t/h-002.hawk @@ -557,10 +557,11 @@ function main() { - ## back reference in a regular expression - tap_ensure (("a2b" ~ /(a)\12b/), 0, @SCRIPTNAME, @SCRIPTLINE); - tap_ensure (("aa2b" ~ /(a)\12b/), 1, @SCRIPTNAME, @SCRIPTLINE); - tap_ensure (("aaa2b" ~ /(a)\12b/), 1, @SCRIPTNAME, @SCRIPTLINE); + ## back reference in a regular expression - write the digit that follows \1 as a + ## character class (\1[2]) so that \12 isn't read as an octal escape + tap_ensure (("a2b" ~ /(a)\1[2]b/), 0, @SCRIPTNAME, @SCRIPTLINE); + tap_ensure (("aa2b" ~ /(a)\1[2]b/), 1, @SCRIPTNAME, @SCRIPTLINE); + tap_ensure (("aaa2b" ~ /(a)\1[2]b/), 1, @SCRIPTNAME, @SCRIPTLINE); }