enhanced split() and splitting by FS to support the escape doubling scheme
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
2025-09-27 11:37:25 +09:00
parent 0ffe46992b
commit 2c544ae383
4 changed files with 30 additions and 6 deletions

View File

@ -832,7 +832,7 @@ You can pass fewer arguments than the number of declared parameters to a functio
Here's an example to illustrate this behavior: Here's an example to illustrate this behavior:
```awk ```awk
@function greet(name, greeting) { function greet(name, greeting) {
if (greeting == "") { if (greeting == "") {
greeting = "Hello" greeting = "Hello"
} }
@ -908,8 +908,8 @@ In this example:
|--------------|-------------| |--------------|-------------|
| CONVFMT | | | CONVFMT | |
| FILENAME | | | FILENAME | |
| FNR | File Number of Records, reset to 1 for each new input file | | FNR | File Number of Records; it is reset to 1 for each new input file |
| FS | Field Separator, specifies the character(s) that separate fields (columns) in an input record. Default is whitespace | | FS | Field Separator, specifies the character(s) that separate fields (columns) in an input record. The default is whitespace. If `FS` is a string that begins with a question mark (`?`) followed by 3 characters, the 3 characters define special quoting characters in this order: escaper, left quote, and right quote. |
| IGNORECASE | | | IGNORECASE | |
| NF | Number of Fields (columns) in the current input record | | NF | Number of Fields (columns) in the current input record |
| NR | Number of Records processed so far | | NR | Number of Records processed so far |
@ -919,7 +919,7 @@ In this example:
| OFS | | | OFS | |
| ORS | | | ORS | |
| RLENGTH | | | RLENGTH | |
| RS | Record Separator, specifies the character(s) that separate input records (lines). Default is newline `"\n"` | | RS | Record Separator, specifies the character(s) that separate input records (lines). The default is a newline `"\n"` |
| RSTART | | | RSTART | |
| SCRIPTNAME | | | SCRIPTNAME | |
| STRIPRECSPC | | | STRIPRECSPC | |

View File

@ -30,6 +30,10 @@ char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, ch
char_t* ts; /* token start */ char_t* ts; /* token start */
char_t* tp; /* points to one char past the last token char */ char_t* tp; /* points to one char past the last token char */
char_t* xp; /* points to one char past the last effective char */ char_t* xp; /* points to one char past the last effective char */
int escape_doubling;
/* to extract "abc""def" as abc"def */
escape_doubling = (ec == lq && ec == rq);
/* skip leading spaces */ /* skip leading spaces */
while (p < end && is_xch_space(*p)) p++; while (p < end && is_xch_space(*p)) p++;
@ -48,8 +52,9 @@ char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, ch
} }
else else
{ {
if (c == ec) if (!escape_doubling && c == ec)
{ {
/* normal escaping is never activated if escaping with two repeated characters is on */
escaped = 1; escaped = 1;
p++; p++;
} }
@ -57,11 +62,17 @@ char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, ch
{ {
if (c == rq) if (c == rq)
{ {
if (escape_doubling && (p + 1) < end && *(p + 1) == rq)
{
p++;
goto not_rq;
}
quoted = 0; quoted = 0;
p++; p++;
} }
else else
{ {
not_rq:
*tp++ = c; xp = tp; p++; *tp++ = c; xp = tp; p++;
} }
} }

View File

@ -683,7 +683,6 @@ int hawk_rtx_readio (hawk_rtx_t* rtx, hawk_in_type_t in_type, const hawk_ooch_t*
return ret; return ret;
} }
int hawk_rtx_readiobytes (hawk_rtx_t* rtx, hawk_in_type_t in_type, const hawk_ooch_t* name, hawk_becs_t* buf) int hawk_rtx_readiobytes (hawk_rtx_t* rtx, hawk_in_type_t in_type, const hawk_ooch_t* name, hawk_becs_t* buf)
{ {
hawk_rio_arg_t* p; hawk_rio_arg_t* p;

View File

@ -494,6 +494,20 @@ function main()
tap_ensure (a[3] === @b"coke", 1, @SCRIPTNAME, @SCRIPTLINE); tap_ensure (a[3] === @b"coke", 1, @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (a[4] === @b"dark,age", 1, @SCRIPTNAME, @SCRIPTLINE); tap_ensure (a[4] === @b"dark,age", 1, @SCRIPTNAME, @SCRIPTLINE);
## escape doubling scheme - useful for csv-like files
## if the escaper, left quote, and right quote are all the same character, the escape doubling scheme is turned on
tap_ensure (split(@b"sea of people, brandy, coke, \"\"\"dark\"\", age\"", a, "?,\"\"\""), 4, @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (a[1] === @b"sea of people", 1, @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (a[2] === @b"brandy", 1, @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (a[3] === @b"coke", 1, @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (a[4] === @b"\"dark\", age", 1, @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (split(@b"sea of people, brandy, coke, |||dark||, age|", a, "?,|||"), 4, @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (a[1] === @b"sea of people", 1, @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (a[2] === @b"brandy", 1, @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (a[3] === @b"coke", 1, @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (a[4] === @b"|dark|, age", 1, @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (split("Here===Is=Some=====Data", a, ""), 23, @SCRIPTNAME, @SCRIPTLINE); tap_ensure (split("Here===Is=Some=====Data", a, ""), 23, @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (hawk::typename(a), "map", @SCRIPTNAME, @SCRIPTLINE); tap_ensure (hawk::typename(a), "map", @SCRIPTNAME, @SCRIPTLINE);
tap_ensure (str::splita("Here===Is=Some=====Data", a, ""), 23, @SCRIPTNAME, @SCRIPTLINE); tap_ensure (str::splita("Here===Is=Some=====Data", a, ""), 23, @SCRIPTNAME, @SCRIPTLINE);