From 4ab01872607f26588cac15a67f094777fb0394aa Mon Sep 17 00:00:00 2001
From: hyung-hwan
Date: Sun, 19 May 2024 17:09:31 +0900
Subject: [PATCH] more input stream handling code

- rename is_bytes to byte_oriented in hcl_io_cciarg_t and
  hcl_io_udiarg_t, including the Go and Pascal users of the field.
- replace hcl_io_udiarg_t.pos with internal bookkeeping: b.pos/b.len
  track how much of buf.c/buf.b has been consumed, and rsd holds the
  residue bytes of a multi-byte sequence split across two reads.
- get_udi_char() decodes characters from a byte-oriented stream with
  the character manager. an incomplete sequence at the end of the read
  buffer is moved to rsd via move_udi_residue_bytes() and completed
  after the next HCL_IO_READ_BYTES. EOF with pending residue bytes and
  invalid sequences are reported as HCL_EECERR.
- get_udi_char() and get_udi_byte() prepend a context message to the
  error set by the reader.
- add the getbyte primitive to builtin_prims.
- open_udi_stream() marks the stream byte-oriented.
- read_udi_stream() and read_udi_stream_bytes() fetch from the file only
  when the intermediate buffer is empty. read_udi_stream() falls back to
  a real fetch when the leftover bytes alone form an incomplete
  sequence (x == -3).

---
 go/cb.go    |   2 +-
 lib/hcl.h   |  22 +++++--
 lib/prim.c  | 179 ++++++++++++++++++++++++++++++++++++++++++++++------
 lib/read.c  |   4 +-
 lib/std.c   |  78 ++++++++++++++---------
 pas/hcl.pas |   4 +-
 6 files changed, 231 insertions(+), 58 deletions(-)

diff --git a/go/cb.go b/go/cb.go
index d385d14..c4c4c06 100644
--- a/go/cb.go
+++ b/go/cb.go
@@ -187,7 +187,7 @@ func hcl_go_cci_handler(c *C.hcl_t, cmd C.hcl_io_cmd_t, arg unsafe.Pointer) C.in
 			return -1
 		}
 
-		ioarg.is_bytes = 0
+		ioarg.byte_oriented = 0
 		if unsafe.Sizeof(buf[0]) == unsafe.Sizeof(dummy) {
 			C.memcpy(
 				unsafe.Pointer(&ioarg.buf[0]),
diff --git a/lib/hcl.h b/lib/hcl.h
index 385c22f..b9ffbd8 100644
--- a/lib/hcl.h
+++ b/lib/hcl.h
@@ -1268,7 +1268,7 @@ struct hcl_io_cciarg_t
 	 * the caller issues HCL_IO_READ_BYTES if it's set to non-zero, expecting bytes.
 	 * otherwise it issues HCL_IO_READ expecting characters.
 	 */
-	int is_bytes;
+	int byte_oriented;
 
 	/**
 	 * [OUT] place data here for #HCL_IO_READ or #HCL_IO_READ_BYTES
@@ -1326,7 +1326,7 @@ struct hcl_io_udiarg_t
 	/**
 	 * [OUT] indicates if HCL_IO_READ_BYTES is implemented
 	 */
-	int is_bytes;
+	int byte_oriented;
 
 	/**
 	 * [OUT] place data in c for #HCL_IO_READ and in d for #HCL_IO_READ_BYTES
@@ -1342,10 +1342,20 @@ struct hcl_io_udiarg_t
 	 */
 	hcl_oow_t xlen;
 
-	/**
-	 * Internal use only. Don't touch these.
-	 */
-	hcl_oow_t pos;
+
+	/*-----------------------------------------------------------------*/
+	/*----------- from here down, internal use only -------------------*/
+	struct
+	{
+		hcl_oow_t pos;
+		hcl_oow_t len;
+	} b; /* buffer(buf.c or buf.b) usage status */
+
+	struct
+	{
+		hcl_uint8_t buf[HCL_MBLEN_MAX];
+		hcl_oow_t len;
+	} rsd; /* residue bytes for HCL_IO_READ_BYTES */
 };
 
 typedef struct hcl_io_udoarg_t hcl_io_udoarg_t;
diff --git a/lib/prim.c b/lib/prim.c
index 1cb5119..e6918fd 100644
--- a/lib/prim.c
+++ b/lib/prim.c
@@ -215,42 +215,182 @@ static hcl_pfrc_t pf_sprintf (hcl_t* hcl, hcl_mod_t* mod, hcl_ooi_t nargs)
 
 /* ------------------------------------------------------------------------- */
 
+static hcl_oow_t move_udi_residue_bytes (hcl_io_udiarg_t* curinp)
+{
+	hcl_oow_t cpl;
+
+	cpl = HCL_COUNTOF(curinp->rsd.buf) - curinp->rsd.len;
+	if (cpl > 0)
+	{
+		hcl_oow_t avail;
+		avail = curinp->b.len - curinp->b.pos; /* available in the read buffer */
+		if (cpl > avail) cpl = avail;
+		HCL_MEMCPY(&curinp->rsd.buf[curinp->rsd.len], &curinp->buf.b[curinp->b.pos], cpl);
+		curinp->rsd.len += cpl;
+		curinp->b.pos += cpl; /* advance the position because the bytes moved to the residue buffer */
+	}
+	return curinp->rsd.len;
+}
 
 static int get_udi_char (hcl_t* hcl, hcl_ooch_t* ch)
 {
-	if (hcl->io.udi_arg.pos >= hcl->io.udi_arg.xlen)
-	{
-		hcl->io.udi_arg.pos = 0;
-		hcl->io.udi_arg.xlen = 0;
-		if (hcl->io.udi_rdr(hcl, HCL_IO_READ, &hcl->io.udi_arg) <= -1) return -1;
-		if (hcl->io.udi_arg.xlen <= 0) return 0; /* EOF */
-		hcl->io.udi_arg.is_bytes = 0;
-	}
+	hcl_io_udiarg_t* curinp;
+	hcl_ooch_t c;
+	hcl_oow_t taken;
+	int x;
 
-	*ch = hcl->io.udi_arg.buf.c[hcl->io.udi_arg.pos++];
+	curinp = &hcl->io.udi_arg;
+
+	#if defined(HCL_OOCH_IS_UCH)
+	if (curinp->byte_oriented)
+	{
+		hcl_cmgr_t* cmgr;
+		const hcl_uint8_t* inpptr;
+		hcl_oow_t inplen, n;
+
+		cmgr = HCL_CMGR(hcl);
+
+	start_over:
+		if (curinp->b.pos >= curinp->b.len)
+		{
+			x = hcl->io.udi_rdr(hcl, HCL_IO_READ_BYTES, curinp);
+			if (x <= -1)
+			{
+				const hcl_ooch_t* orgmsg = hcl_backuperrmsg(hcl);
+				hcl_seterrbfmt (hcl, HCL_ERRNUM(hcl), "unable to read bytes from input stream - %js", orgmsg);
+				return -1;
+			}
+
+			if (curinp->xlen <= 0)
+			{
+				/* got EOF from the input stream */
+				if (curinp->rsd.len > 0)
+				{
+					hcl_seterrbfmt (hcl, HCL_EECERR, "incomplete byte sequence in input stream");
+					return -1;
+				}
+				return 0;
+			}
+
+			curinp->b.pos = 0;
+			curinp->b.len = curinp->xlen;
+		}
+
+		if (curinp->rsd.len > 0)
+		{
+			/* there is data in the residue buffer. use the residue buffer to
+			 * locate a proper multi-byte sequence */
+			HCL_ASSERT (hcl, curinp->b.pos == 0);
+			inplen = move_udi_residue_bytes(curinp);
+			inpptr = &curinp->rsd.buf[0];
+		}
+		else
+		{
+			inplen = curinp->b.len - curinp->b.pos;
+			inpptr = &curinp->buf.b[curinp->b.pos];
+		}
+
+		n = cmgr->bctouc((const hcl_bch_t*)inpptr, inplen, &c);
+		if (n == 0) /* invalid sequence */
+		{
+			/* TODO: more accurate location of the invalid byte sequence */
+			hcl_seterrbfmt (hcl, HCL_EECERR, "invalid byte sequence in input stream");
+			return -1;
+		}
+		if (n > inplen) /* incomplete sequence */
+		{
+			HCL_ASSERT (hcl, curinp->rsd.len < HCL_COUNTOF(curinp->rsd.buf));
+			move_udi_residue_bytes (curinp);
+			goto start_over;
+		}
+
+		if (curinp->rsd.len > 0)
+		{
+			/* move_udi_residue_bytes() advanced curinp->b.pos without checking
+			 * the needed number of bytes to form a character. it must backoff by
+			 * the number of excessive bytes moved to the residue buffer */
+			curinp->b.pos -= curinp->rsd.len - n;
+			taken = 0; /* treat it as if no bytes are taken in this case */
+		}
+		else
+		{
+			taken = n;
+		}
+	}
+	else
+	{
+#endif
+		if (curinp->b.pos >= curinp->b.len)
+		{
+			x = hcl->io.udi_rdr(hcl, HCL_IO_READ, curinp);
+			if (x <= -1)
+			{
+				/* TODO: more accurate location of failure */
+				const hcl_ooch_t* orgmsg = hcl_backuperrmsg(hcl);
+				hcl_seterrbfmt (hcl, HCL_ERRNUM(hcl), "unable to read input stream - %js", orgmsg);
+				return -1;
+			}
+			if (curinp->xlen <= 0)
+			{
+				/* got EOF from the input stream */
+				return 0;
+			}
+
+			curinp->b.pos = 0;
+			curinp->b.len = curinp->xlen;
+		}
+
+		c = curinp->buf.c[curinp->b.pos];
+		taken = 1;
+#if defined(HCL_OOCH_IS_UCH)
+	}
+#endif
+
+
+	curinp->b.pos += taken;
+#if defined(HCL_OOCH_IS_UCH)
+	curinp->rsd.len = 0; /* clear up the residue byte buffer. needed for byte reading only */
+#endif
+
+	*ch = c;
 	return 1;
 }
 
 static int get_udi_byte (hcl_t* hcl, hcl_uint8_t* bt)
 {
+	hcl_io_udiarg_t* curinp;
+	int x;
+
 #if defined(HCL_OOCH_IS_UCH)
-	if (!hcl->io.udi_arg.is_bytes)
+	if (!hcl->io.udi_arg.byte_oriented)
 	{
-		hcl_seterrbfmt (hcl, HCL_EPERM, "prohibited byte-oriented input");
+/* TODO: convert characters to bytes? but do we know the original encoding? */
+		hcl_seterrbfmt (hcl, HCL_EPERM, "byte-oriented input prohibited on character-oriented stream");
 		return -1;
 	}
 #endif
 
-	if (hcl->io.udi_arg.pos >= hcl->io.udi_arg.xlen)
+	curinp = &hcl->io.udi_arg;
+	if (curinp->b.pos >= curinp->b.len)
 	{
-		hcl->io.udi_arg.pos = 0;
-		hcl->io.udi_arg.xlen = 0;
-		if (hcl->io.udi_rdr(hcl, HCL_IO_READ_BYTES, &hcl->io.udi_arg) <= -1) return -1;
-		if (hcl->io.udi_arg.xlen <= 0) return 0; /* EOF */
-		hcl->io.udi_arg.is_bytes = 1;
+		x = hcl->io.udi_rdr(hcl, HCL_IO_READ_BYTES, curinp);
+		if (x <= -1)
+		{
+			const hcl_ooch_t* orgmsg = hcl_backuperrmsg(hcl);
+			hcl_seterrbfmt (hcl, HCL_ERRNUM(hcl), "unable to read input stream - %js", orgmsg);
+			return -1;
+		}
+		if (curinp->xlen <= 0)
+		{
+			/* got EOF from the input stream */
+			return 0;
+		}
+
+		curinp->b.pos = 0;
+		curinp->b.len = curinp->xlen;
 	}
 
-	*bt = hcl->io.udi_arg.buf.b[hcl->io.udi_arg.pos++];
+	*bt = curinp->buf.b[curinp->b.pos++];
 	return 1;
 }
 
@@ -990,6 +1130,9 @@ static hcl_pfrc_t pf_object_new (hcl_t* hcl, hcl_mod_t* mod, hcl_ooi_t nargs)
 
 static pf_t builtin_prims[] =
 {
+	/* TODO: move these primitives to modules... */
+
+	{ 0, 0, pf_getbyte, 7, { 'g','e','t','b','y','t','e' } },
 	{ 0, 0, pf_getch, 5, { 'g','e','t','c','h' } },
 	{ 0, HCL_TYPE_MAX(hcl_oow_t), pf_log, 3, { 'l','o','g' } },
 	{ 1, HCL_TYPE_MAX(hcl_oow_t), pf_logf, 4, { 'l','o','g','f' } },
diff --git a/lib/read.c b/lib/read.c
index 9869862..5ae43a2 100644
--- a/lib/read.c
+++ b/lib/read.c
@@ -1109,7 +1109,7 @@ static int feed_begin_include (hcl_t* hcl)
 	arg->line = 1;
 	arg->colm = 1;
 	/*arg->nl = '\0';*/
-	/*arg->is_bytes = 0;*/
+	/*arg->byte_oriented = 0;*/
 	arg->includer = hcl->c->curinp;
 
 	if (hcl->c->cci_rdr(hcl, HCL_IO_OPEN, arg) <= -1)
@@ -3087,7 +3087,7 @@ static int feed_from_includee (hcl_t* hcl)
 		hcl_oow_t taken;
 
 #if defined(HCL_OOCH_IS_UCH)
-		if (curinp->is_bytes)
+		if (curinp->byte_oriented)
 		{
 			hcl_cmgr_t* cmgr;
 			const hcl_uint8_t* inpptr;
diff --git a/lib/std.c b/lib/std.c
index 9a956fb..d59aff4 100644
--- a/lib/std.c
+++ b/lib/std.c
@@ -3534,6 +3534,7 @@ static HCL_INLINE int open_udi_stream (hcl_t* hcl, hcl_io_udiarg_t* arg)
 		goto oops;
 	}
 
+	arg->byte_oriented = 1;
 	arg->handle = bb;
 	return 0;
 
@@ -3567,31 +3568,46 @@ static HCL_INLINE int read_udi_stream (hcl_t* hcl, hcl_io_udiarg_t* arg)
 	bb_t* bb;
 	hcl_oow_t bcslen, ucslen, remlen;
 	int x;
+#if defined(HCL_OOCH_IS_UCH)
+	int fetched = 0;
+#endif
 
 	bb = (bb_t*)arg->handle;
 	HCL_ASSERT (hcl, bb != HCL_NULL && bb->fp != HCL_NULL);
 
-	do
-	{
-		x = fgetc(bb->fp);
-		if (x == EOF)
-		{
-			if (ferror((FILE*)bb->fp))
-			{
-				hcl_seterrbfmtwithsyserr (hcl, 0, errno, "unable to read udi stream");
-				return -1;
-			}
-			break;
-		}
-		bb->buf[bb->len++] = x;
+	if (bb->len <= 0)
+	{
+#if defined(HCL_OOCH_IS_UCH)
+	real_fetch:
+		fetched = 1;
+#endif
+		do
+		{
+			x = fgetc(bb->fp);
+			if (x == EOF)
+			{
+				if (ferror((FILE*)bb->fp))
+				{
+					hcl_seterrbfmtwithsyserr (hcl, 0, errno, "unable to read udi stream");
+					return -1;
+				}
+				break;
+			}
+
+			bb->buf[bb->len++] = x;
+		}
+		while (bb->len < HCL_COUNTOF(bb->buf) && x != '\r' && x != '\n');
 	}
-	while (bb->len < HCL_COUNTOF(bb->buf) && x != '\r' && x != '\n');
 
 #if defined(HCL_OOCH_IS_UCH)
 	bcslen = bb->len;
 	ucslen = HCL_COUNTOF(arg->buf.c);
 	x = hcl_convbtooochars(hcl, bb->buf, &bcslen, arg->buf.c, &ucslen);
-	if (x <= -1 && ucslen <= 0) return -1;
+	if (x <= -1 && ucslen <= 0)
+	{
+		if (x == -3 && !fetched) goto real_fetch;
+		return -1;
+	}
 	/* if ucslen is greater than 0, i assume that some characters have been
 	 * converted properly. as the loop above reads an entire line if not too
 	 * large, the incomplete sequence error (x == -3) must happen after
@@ -3620,22 +3636,26 @@ static HCL_INLINE int read_udi_stream_bytes (hcl_t* hcl, hcl_io_udiarg_t* arg)
 
 	bb = (bb_t*)arg->handle;
 	HCL_ASSERT (hcl, bb != HCL_NULL && bb->fp != HCL_NULL);
 
-	do
-	{
-		x = fgetc(bb->fp);
-		if (x == EOF)
-		{
-			if (ferror((FILE*)bb->fp))
-			{
-				hcl_seterrbfmtwithsyserr (hcl, 0, errno, "unable to read udi stream");
-				return -1;
-			}
-			break;
-		}
-		bb->buf[bb->len++] = x;
+	if (bb->len <= 0)
+	{
+		do
+		{
+			x = fgetc(bb->fp);
+			if (x == EOF)
+			{
+				if (ferror((FILE*)bb->fp))
+				{
+					hcl_seterrbfmtwithsyserr (hcl, 0, errno, "unable to read udi stream");
+					return -1;
+				}
+				break;
+			}
+
+			bb->buf[bb->len++] = x;
+		}
+		while (bb->len < HCL_COUNTOF(bb->buf) && x != '\r' && x != '\n');
 	}
-	while (bb->len < HCL_COUNTOF(bb->buf) && x != '\r' && x != '\n');
 
 	bcslen = (bb->len < HCL_COUNTOF(arg->buf.b))? bb->len: HCL_COUNTOF(arg->buf.b);
 	ucslen = bcslen;
diff --git a/pas/hcl.pas b/pas/hcl.pas
index dd6ed6a..6575ee8 100644
--- a/pas/hcl.pas
+++ b/pas/hcl.pas
@@ -50,7 +50,7 @@ type
 	CciArg = record (* this record must follow the public part of hcl_io_cciarg_t in hcl.h *)
 		name: pwidechar;
 		handle: pointer;
-		is_bytes: integer;
+		byte_oriented: integer;
 		buf: array[0..(HCL_CCI_BUF_LEN - 1)] of widechar;
 		xlen: System.SizeUint;
 		includer: CciArgPtr;
@@ -296,7 +296,7 @@ begin
 
 				nf^.name := name;
 				arg^.handle := pointer(nf);
-				arg^.is_bytes := 1;
+				arg^.byte_oriented := 1;
 			end;
 			IO_CLOSE:
 			begin