enhanced a special form FS to affect record reading
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
2025-10-01 23:48:42 +09:00
parent 9afed26820
commit 7cee04ba94
4 changed files with 95 additions and 63 deletions

View File

@ -909,7 +909,7 @@ In this example:
| CONVFMT | | | CONVFMT | |
| FILENAME | | | FILENAME | |
| FNR | File Number of Records; it is reset to 1 for each new input file | | FNR | File Number of Records; it is reset to 1 for each new input file |
| FS | Field Separator, specifies the character(s) that separate fields (columns) in an input record. The default is whitespace. If `FS` is a string that begins with a question mark(`?`) and 3 characters, the 3 characters define special quoting characters in this order: escaper, left quote and right quote. | | FS | Field Separator, specifies the character(s) that separate fields (columns) in an input record. The default is whitespace. |
| IGNORECASE | | | IGNORECASE | |
| NF | Number of Fields (columns) in the current input record | | NF | Number of Fields (columns) in the current input record |
| NR | Number of Records processed so far | | NR | Number of Records processed so far |
@ -926,6 +926,25 @@ In this example:
| STRIPSTRSPC | | | STRIPSTRSPC | |
| SUBSEP | | | SUBSEP | |
If `FS` is a string beginning with a question mark (`?`) followed by four characters, those characters define special quoting behavior in this order:
- Separator
- Escaper
- Left quote
- Right quote
When the escaper, left quote, and right quote are all the same character (for example, `?,"""`), write that character twice in a row inside a quoted field to represent a single literal occurrence of it.
In this specific case - when `FS` is in quoting form and the escaper, left quote, and right quote are identical - if `RS` is unset or set to `@nil`, then records may span multiple lines. This allows fields enclosed in quotes to contain embedded newlines.
```sh
$ echo -e 'the tiger, "pounced on\n""me"""' | hawk -v FS='?,"""' '{ for (i = 0; i <= NF; i++) print i, "[" $i "]"; }'
0 [the tiger, "pounced on
""me"""]
1 [the tiger]
2 [pounced on
"me"]
```
## Pipes ## Pipes
```awk ```awk

View File

@ -43,7 +43,7 @@ char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, ch
while (p < end) while (p < end)
{ {
char c = *p; char_t c = *p;
if (escaped) if (escaped)
{ {

View File

@ -370,10 +370,9 @@ int hawk_rtx_readio (hawk_rtx_t* rtx, hawk_in_type_t in_type, const hawk_ooch_t*
hawk_rio_arg_t* p; hawk_rio_arg_t* p;
hawk_rio_impl_t handler; hawk_rio_impl_t handler;
int ret; int ret;
#if 0
int esc_lq_rq; int esc_lq_rq;
int quoted;
hawk_ooch_t esc, lq, rq; hawk_ooch_t esc, lq, rq;
#endif
hawk_val_t* rs; hawk_val_t* rs;
hawk_oocs_t rrs; hawk_oocs_t rrs;
@ -418,18 +417,18 @@ int hawk_rtx_readio (hawk_rtx_t* rtx, hawk_in_type_t in_type, const hawk_ooch_t*
return -1; return -1;
} }
#if 0
/* RS set to @nil, FS set to a special string starting with ?, followed by esc lq rq */ /* RS set to @nil, FS set to a special string starting with ?, followed by esc lq rq */
esc_lq_rq = 0; esc_lq_rq = 0;
quoted = 0;
if (ffs.len == 5 && ffs.ptr[0] == '?' && !rrs.ptr) if (ffs.len == 5 && ffs.ptr[0] == '?' && !rrs.ptr)
{ {
esc_lq_rq = 1;
esc_lq_rq += (esc == lq && esc == rq);
esc = ffs.ptr[2]; esc = ffs.ptr[2];
lq = ffs.ptr[3]; lq = ffs.ptr[3];
rq = ffs.ptr[4]; rq = ffs.ptr[4];
esc_lq_rq = 1;
esc_lq_rq += (esc == lq && esc == rq);
} }
#endif
ret = 1; ret = 1;
@ -511,13 +510,6 @@ int hawk_rtx_readio (hawk_rtx_t* rtx, hawk_in_type_t in_type, const hawk_ooch_t*
p->in.pos = 0; p->in.pos = 0;
} }
#if 0
if (esc_lq_rq == 2)
{
/* TODO: */
}
else
#endif
if (rrs.ptr == HAWK_NULL) if (rrs.ptr == HAWK_NULL)
{ {
hawk_oow_t start_pos = p->in.pos; hawk_oow_t start_pos = p->in.pos;
@ -529,6 +521,27 @@ int hawk_rtx_readio (hawk_rtx_t* rtx, hawk_in_type_t in_type, const hawk_ooch_t*
c = p->in.u.buf[p->in.pos++]; c = p->in.u.buf[p->in.pos++];
end_pos = p->in.pos; end_pos = p->in.pos;
if (esc_lq_rq == 2)
{
/* if FS is something like [?,"""] and RS is @nil,
* it supports multi-line quoted values. */
if (quoted == 2)
{
quoted = (c == rq);
/* no continue here as c could be a new line */
}
else if (quoted == 1)
{
if (c == rq) quoted = 2;
continue;
}
else if (c == lq)
{
quoted = 1;
continue;
}
}
/* TODO: handle different line terminator */ /* TODO: handle different line terminator */
/* separate by a new line */ /* separate by a new line */
if (c == HAWK_T('\n')) if (c == HAWK_T('\n'))