From 2c544ae383518ff7113b02f6ade5aba3f6343533 Mon Sep 17 00:00:00 2001 From: hyung-hwan Date: Sat, 27 Sep 2025 11:37:25 +0900 Subject: [PATCH] enhanced split() and splitting by FS to support the escape doubling scheme --- README.md | 8 ++++---- lib/misc-imp.h | 13 ++++++++++++- lib/rio.c | 1 - t/h-002.hawk | 14 ++++++++++++++ 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 47e1bd75..761645fc 100644 --- a/README.md +++ b/README.md @@ -832,7 +832,7 @@ You can pass fewer arguments than the number of declared parameters to a functio Here's an example to illustrate this behavior: ```awk -@function greet(name, greeting) { +function greet(name, greeting) { if (greeting == "") { greeting = "Hello" } @@ -908,8 +908,8 @@ In this example: |--------------|-------------| | CONVFMT | | | FILENAME | | -| FNR | File Number of Records, reset to 1 for each new input file | -| FS | Field Separator, specifies the character(s) that separate fields (columns) in an input record. Default is whitespace | +| FNR | File Number of Records; it is reset to 1 for each new input file | +| FS | Field Separator, specifies the character(s) that separate fields (columns) in an input record. The default is whitespace. If `FS` is a string that begins with a question mark (`?`) followed by the separator character and 3 more characters, those 3 characters define special quoting characters in this order: escaper, left quote, and right quote. | | IGNORECASE | | | NF | Number of Fields (columns) in the current input record | | NR | Number of Records processed so far | @@ -919,7 +919,7 @@ In this example: | OFS | | | ORS | | | RLENGTH | | -| RS | Record Separator, specifies the character(s) that separate input records (lines). Default is newline `"\n"` | +| RS | Record Separator, specifies the character(s) that separate input records (lines). 
The default is a newline `"\n"` | | RSTART | | | SCRIPTNAME | | | STRIPRECSPC | | diff --git a/lib/misc-imp.h b/lib/misc-imp.h index 41998813..87591ea6 100644 --- a/lib/misc-imp.h +++ b/lib/misc-imp.h @@ -30,6 +30,10 @@ char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, ch char_t* ts; /* token start */ char_t* tp; /* points to one char past the last token char */ char_t* xp; /* points to one char past the last effective char */ + int escape_doubling; + + /* to extract "abc""def" as abc"def */ + escape_doubling = (ec == lq && ec == rq); /* skip leading spaces */ while (p < end && is_xch_space(*p)) p++; @@ -48,8 +52,9 @@ char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, ch } else { - if (c == ec) + if (!escape_doubling && c == ec) { + /* normal escaping is never activated if escaping with two repeated characters is on */ escaped = 1; p++; } @@ -57,11 +62,17 @@ char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, ch { if (c == rq) { + if (escape_doubling && (p + 1) < end && *(p + 1) == rq) + { + p++; + goto not_rq; + } quoted = 0; p++; } else { + not_rq: *tp++ = c; xp = tp; p++; } } diff --git a/lib/rio.c b/lib/rio.c index 3bf2f81b..39dfb84e 100644 --- a/lib/rio.c +++ b/lib/rio.c @@ -683,7 +683,6 @@ int hawk_rtx_readio (hawk_rtx_t* rtx, hawk_in_type_t in_type, const hawk_ooch_t* return ret; } - int hawk_rtx_readiobytes (hawk_rtx_t* rtx, hawk_in_type_t in_type, const hawk_ooch_t* name, hawk_becs_t* buf) { hawk_rio_arg_t* p; diff --git a/t/h-002.hawk b/t/h-002.hawk index 8e6bd863..dddc309b 100644 --- a/t/h-002.hawk +++ b/t/h-002.hawk @@ -494,6 +494,20 @@ function main() tap_ensure (a[3] === @b"coke", 1, @SCRIPTNAME, @SCRIPTLINE); tap_ensure (a[4] === @b"dark,age", 1, @SCRIPTNAME, @SCRIPTLINE); + ## escape doubling scheme - useful for csv-like files + ## if escaper, left-quote, right-quote are the same, escape doubling scheme is turned on + tap_ensure (split(@b"sea of people, brandy, coke, 
\"\"\"dark\"\", age\"", a, "?,\"\"\""), 4, @SCRIPTNAME, @SCRIPTLINE); + tap_ensure (a[1] === @b"sea of people", 1, @SCRIPTNAME, @SCRIPTLINE); + tap_ensure (a[2] === @b"brandy", 1, @SCRIPTNAME, @SCRIPTLINE); + tap_ensure (a[3] === @b"coke", 1, @SCRIPTNAME, @SCRIPTLINE); + tap_ensure (a[4] === @b"\"dark\", age", 1, @SCRIPTNAME, @SCRIPTLINE); + + tap_ensure (split(@b"sea of people, brandy, coke, |||dark||, age|", a, "?,|||"), 4, @SCRIPTNAME, @SCRIPTLINE); + tap_ensure (a[1] === @b"sea of people", 1, @SCRIPTNAME, @SCRIPTLINE); + tap_ensure (a[2] === @b"brandy", 1, @SCRIPTNAME, @SCRIPTLINE); + tap_ensure (a[3] === @b"coke", 1, @SCRIPTNAME, @SCRIPTLINE); + tap_ensure (a[4] === @b"|dark|, age", 1, @SCRIPTNAME, @SCRIPTLINE); + tap_ensure (split("Here===Is=Some=====Data", a, ""), 23, @SCRIPTNAME, @SCRIPTLINE); tap_ensure (hawk::typename(a), "map", @SCRIPTNAME, @SCRIPTLINE); tap_ensure (str::splita("Here===Is=Some=====Data", a, ""), 23, @SCRIPTNAME, @SCRIPTLINE);