From 2c544ae383518ff7113b02f6ade5aba3f6343533 Mon Sep 17 00:00:00 2001
From: hyung-hwan <hyunghwan.chung@gmail.com>
Date: Sat, 27 Sep 2025 11:37:25 +0900
Subject: [PATCH] enhanced split() and splitting by FS to support the escape
 doubling scheme

---
 README.md      |  8 ++++----
 lib/misc-imp.h | 13 ++++++++++++-
 lib/rio.c      |  1 -
 t/h-002.hawk   | 14 ++++++++++++++
 4 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 47e1bd75..761645fc 100644
--- a/README.md
+++ b/README.md
@@ -832,7 +832,7 @@ You can pass fewer arguments than the number of declared parameters to a functio
 Here's an example to illustrate this behavior:
 
 ```awk
-@function greet(name, greeting) {
+function greet(name, greeting) {
     if (greeting == "") {
         greeting = "Hello"
     }
@@ -908,8 +908,8 @@ In this example:
 |--------------|-------------|
 | CONVFMT      |             |
 | FILENAME     |             |
-| FNR          | File Number of Records, reset to 1 for each new input file |
-| FS           | Field Separator, specifies the character(s) that separate fields (columns) in an input record. Default is whitespace |
+| FNR          | File Number of Records. It is reset to 1 for each new input file |
+| FS           | Field Separator, specifies the character(s) that separate fields (columns) in an input record. The default is whitespace. If `FS` is a string that begins with a question mark (`?`) followed by the separator character and 3 more characters, those 3 characters define the special quoting characters in this order: escaper, left quote, and right quote. |
 | IGNORECASE   |             |
 | NF           | Number of Fields (columns) in the current input record |
 | NR           | Number of Records processed so far |
@@ -919,7 +919,7 @@ In this example:
 | OFS          |             |
 | ORS          |             |
 | RLENGTH      |             |
-| RS           | Record Separator, specifies the character(s) that separate input records (lines). Default is newline `"\n"` |
+| RS           | Record Separator, specifies the character(s) that separate input records (lines). The default is a newline `"\n"` |
 | RSTART       |             |
 | SCRIPTNAME   |             |
 | STRIPRECSPC  |             |
diff --git a/lib/misc-imp.h b/lib/misc-imp.h
index 41998813..87591ea6 100644
--- a/lib/misc-imp.h
+++ b/lib/misc-imp.h
@@ -30,6 +30,10 @@ char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, ch
 	char_t* ts; /* token start */
 	char_t* tp; /* points to one char past the last token char */
 	char_t* xp; /* points to one char past the last effective char */
+	int escape_doubling;
+
+	/* to extract "abc""def" as abc"def */
+	escape_doubling = (ec == lq && ec == rq);
 
 	/* skip leading spaces */
 	while (p < end && is_xch_space(*p)) p++;
@@ -48,8 +52,9 @@ char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, ch
 		}
 		else
 		{
-			if (c == ec)
+			if (!escape_doubling && c == ec)
 			{
+				/* normal escaping is never activated if escaping with two repeated characters is on */
 				escaped = 1;
 				p++;
 			}
@@ -57,11 +62,17 @@ char_t* split_xchars_to_fields (hawk_rtx_t* rtx, char_t* str, hawk_oow_t len, ch
 			{
 				if (c == rq)
 				{
+					if (escape_doubling && (p + 1) < end && *(p + 1) == rq)
+					{
+						p++;
+						goto not_rq;
+					}
 					quoted = 0;
 					p++;
 				}
 				else
 				{
+				not_rq:
 					*tp++ = c; xp = tp; p++;
 				}
 			}
diff --git a/lib/rio.c b/lib/rio.c
index 3bf2f81b..39dfb84e 100644
--- a/lib/rio.c
+++ b/lib/rio.c
@@ -683,7 +683,6 @@ int hawk_rtx_readio (hawk_rtx_t* rtx, hawk_in_type_t in_type, const hawk_ooch_t*
 	return ret;
 }
 
-
 int hawk_rtx_readiobytes (hawk_rtx_t* rtx, hawk_in_type_t in_type, const hawk_ooch_t* name, hawk_becs_t* buf)
 {
 	hawk_rio_arg_t* p;
diff --git a/t/h-002.hawk b/t/h-002.hawk
index 8e6bd863..dddc309b 100644
--- a/t/h-002.hawk
+++ b/t/h-002.hawk
@@ -494,6 +494,20 @@ function main()
 		tap_ensure (a[3] === @b"coke",                             1, @SCRIPTNAME, @SCRIPTLINE);
 		tap_ensure (a[4] === @b"dark,age",                         1, @SCRIPTNAME, @SCRIPTLINE);
 
+		## escape doubling scheme - useful for csv-like files
+		## if escaper, left-quote, right-quote are the same, escape doubling scheme is turned on
+		tap_ensure (split(@b"sea of people, brandy, coke, \"\"\"dark\"\", age\"", a, "?,\"\"\""), 4, @SCRIPTNAME, @SCRIPTLINE);
+		tap_ensure (a[1] === @b"sea of people",                    1, @SCRIPTNAME, @SCRIPTLINE);
+		tap_ensure (a[2] === @b"brandy",                           1, @SCRIPTNAME, @SCRIPTLINE);
+		tap_ensure (a[3] === @b"coke",                             1, @SCRIPTNAME, @SCRIPTLINE);
+		tap_ensure (a[4] === @b"\"dark\", age",                    1, @SCRIPTNAME, @SCRIPTLINE);
+
+		tap_ensure (split(@b"sea of people, brandy, coke, |||dark||, age|", a, "?,|||"), 4, @SCRIPTNAME, @SCRIPTLINE);
+		tap_ensure (a[1] === @b"sea of people",                    1, @SCRIPTNAME, @SCRIPTLINE);
+		tap_ensure (a[2] === @b"brandy",                           1, @SCRIPTNAME, @SCRIPTLINE);
+		tap_ensure (a[3] === @b"coke",                             1, @SCRIPTNAME, @SCRIPTLINE);
+		tap_ensure (a[4] === @b"|dark|, age",                      1, @SCRIPTNAME, @SCRIPTLINE);
+
 		tap_ensure (split("Here===Is=Some=====Data", a, ""),       23,     @SCRIPTNAME, @SCRIPTLINE);
 		tap_ensure (hawk::typename(a),                             "map",  @SCRIPTNAME, @SCRIPTLINE);
 		tap_ensure (str::splita("Here===Is=Some=====Data", a, ""),  23,    @SCRIPTNAME, @SCRIPTLINE);