qse/lib/cmn/tre.h

447 lines
13 KiB
C

/*
* $Id$
*
Copyright (c) 2006-2019 Chung, Hyung-Hwan. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
tre-internal.h - TRE internal definitions
This is the license, copyright notice, and disclaimer for TRE, a regex
matching package (library and tools) with support for approximate
matching.
Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _QSE_LIB_CMN_TRE_H_
#define _QSE_LIB_CMN_TRE_H_
/* TODO: MAKE TRE WORK LIKE GNU
PATTERN: \(.\{0,1\}\)\(~[^,]*\)\([0-9]\)\(\.*\),\([^;]*\)\(;\([^;]*\(\3[^;]*\)\).*X*\1\(.*\)\)
INPUT: ~02.,3~3;0123456789;9876543210
------------------------------------------------------
samples/cmn/tre01 gives the following output. this does not seem wrong, though.
SUBMATCH[7],[8],[9].
SUBMATCH[0] = [~02.,3~3;0123456789;9876543210]
SUBMATCH[1] = []
SUBMATCH[2] = [~0]
SUBMATCH[3] = [2]
SUBMATCH[4] = [.]
SUBMATCH[5] = [3~3]
SUBMATCH[6] = [;0123456789;9876543210]
SUBMATCH[7] = [012]
SUBMATCH[8] = [2]
SUBMATCH[9] = [3456789;9876543210
------------------------------------------------------
Using the GNU regcomp(),regexec(), the following
is printed.
#include <sys/types.h>
#include <regex.h>
#include <stdio.h>
int main (int argc, char* argv[])
{
regex_t tre;
regmatch_t mat[10];
int i;
regcomp (&tre, argv[1], 0);
regexec (&tre, argv[2], 10, mat, 0);
for (i = 0; i < 10; i++)
{
if (mat[i].rm_so == -1) break;
printf ("SUBMATCH[%u] = [%.*s]\n", i,
(int)(mat[i].rm_eo - mat[i].rm_so), &argv[2][mat[i].rm_so]);
}
regfree (&tre);
return 0;
}
SUBMATCH[0] = [~02.,3~3;0123456789;9876543210]
SUBMATCH[1] = []
SUBMATCH[2] = [~0]
SUBMATCH[3] = [2]
SUBMATCH[4] = [.]
SUBMATCH[5] = [3~3]
SUBMATCH[6] = [;0123456789;9876543210]
SUBMATCH[7] = [0123456789]
SUBMATCH[8] = [23456789]
SUBMATCH[9] = []
------------------------------------------------------
One more example here:
$ ./tre01 "\(x*\)ab\(\(c*\1\)\(.*\)\)" "abcdefg"
Match: YES
SUBMATCH[0] = [abcdefg]
SUBMATCH[1] = []
SUBMATCH[2] = [cdefg]
SUBMATCH[3] = []
SUBMATCH[4] = [cdefg]
$ ./reg "\(x*\)ab\(\(c*\1\)\(.*\)\)" "abcdefg"
SUBMATCH[0] = [abcdefg]
SUBMATCH[1] = []
SUBMATCH[2] = [cdefg]
SUBMATCH[3] = [c]
SUBMATCH[4] = [defg]
*/
#include <qse/cmn/tre.h>
#ifdef QSE_CHAR_IS_WCHAR
# define TRE_WCHAR
/*# define TRE_MULTIBYTE*/
/*# define TRE_MBSTATE*/
#endif
#define TRE_REGEX_T_FIELD value
#define assert QSE_ASSERT
#define NULL QSE_NULL
#include <qse/cmn/chr.h>
#include <qse/cmn/str.h>
#include <qse/cmn/pma.h>
#include "mem-prv.h"
#define tre_islower(c) QSE_ISLOWER(c)
#define tre_isupper(c) QSE_ISUPPER(c)
#define tre_isalpha(c) QSE_ISALPHA(c)
#define tre_isdigit(c) QSE_ISDIGIT(c)
#define tre_isxdigit(c) QSE_ISXDIGIT(c)
#define tre_isalnum(c) QSE_ISALNUM(c)
#define tre_isspace(c) QSE_ISSPACE(c)
#define tre_isprint(c) QSE_ISPRINT(c)
#define tre_isgraph(c) QSE_ISGRAPH(c)
#define tre_iscntrl(c) QSE_ISCNTRL(c)
#define tre_ispunct(c) QSE_ISPUNCT(c)
#define tre_isblank(c) QSE_ISBLANK(c)
#define tre_tolower(c) QSE_TOLOWER(c)
#define tre_toupper(c) QSE_TOUPPER(c)
#if defined(QSE_CHAR_IS_MCHAR) && (QSE_SIZEOF_MCHAR_T == QSE_SIZEOF_CHAR)
typedef unsigned char tre_char_t;
#else
typedef qse_char_t tre_char_t;
#endif
typedef qse_cint_t tre_cint_t;
#define size_t qse_size_t
#define regex_t qse_tre_t
#define regmatch_t qse_tre_match_t
#define reg_errcode_t qse_tre_errnum_t
#define REG_OK QSE_TRE_ENOERR
#define REG_ESPACE QSE_TRE_ENOMEM
#define REG_NOMATCH QSE_TRE_ENOMATCH
#define REG_BADPAT QSE_TRE_EBADPAT
#define REG_ECOLLATE QSE_TRE_ECOLLATE
#define REG_ECTYPE QSE_TRE_ECTYPE
#define REG_EESCAPE QSE_TRE_EESCAPE
#define REG_ESUBREG QSE_TRE_ESUBREG
#define REG_EBRACK QSE_TRE_EBRACK
#define REG_EPAREN QSE_TRE_EPAREN
#define REG_EBRACE QSE_TRE_EBRACE
#define REG_BADBR QSE_TRE_EBADBR
#define REG_ERANGE QSE_TRE_ERANGE
#define REG_BADRPT QSE_TRE_EBADRPT
/* The maximum number of iterations in a bound expression. */
#undef RE_DUP_MAX
#define RE_DUP_MAX 255
/* POSIX tre_regcomp() flags. */
#define REG_EXTENDED QSE_TRE_EXTENDED
#define REG_ICASE QSE_TRE_IGNORECASE
#define REG_NEWLINE QSE_TRE_NEWLINE
#define REG_NOSUB QSE_TRE_NOSUBREG
/* Extra tre_regcomp() flags. */
#define REG_LITERAL QSE_TRE_LITERAL
#define REG_RIGHT_ASSOC QSE_TRE_RIGHTASSOC
#define REG_UNGREEDY QSE_TRE_UNGREEDY
#define REG_NOBOUND QSE_TRE_NOBOUND
#define REG_NONSTDEXT QSE_TRE_NONSTDEXT
/* POSIX tre_regexec() flags. */
#define REG_NOTBOL QSE_TRE_NOTBOL
#define REG_NOTEOL QSE_TRE_NOTEOL
#define REG_BACKTRACKING_MATCHER QSE_TRE_BACKTRACKING
#define tre_strlen(c) qse_strlen(c)
typedef qse_pma_t* tre_mem_t;
#define tre_mem_new(mmgr) qse_pma_open(mmgr,0)
#define tre_mem_destroy(mem) qse_pma_close(mem)
#define tre_mem_alloc(mem,size) qse_pma_alloc(mem,size)
#define tre_mem_calloc(mem,size) qse_pma_calloc(mem,size)
#define xmalloc(mmgr,size) QSE_MMGR_ALLOC(mmgr,size)
#define xfree(mmgr,ptr) QSE_MMGR_FREE(mmgr,ptr)
#define xrealloc(mmgr,ptr,new_size) QSE_MMGR_REALLOC(mmgr, ptr, new_size)
/* tre-ast.h */
#define tre_ast_new_node qse_tre_astnewnode
#define tre_ast_new_literal qse_tre_astnewliteral
#define tre_ast_new_iter qse_tre_astnewiter
#define tre_ast_new_union qse_tre_astnewunion
#define tre_ast_new_catenation qse_tre_astnewcatenation
/* tre-parse.h */
#define tre_parse qse_tre_parse
/* tre-stack.h */
#define tre_stack_destroy qse_tre_stackfree
#define tre_stack_new qse_tre_stacknew
#define tre_stack_num_objects qse_tre_stacknumobjs
#define tre_stack_pop_int qse_tre_stackpopint
#define tre_stack_pop_voidptr qse_tre_stackpopvoidptr
#define tre_stack_push_int qse_tre_stackpushint
#define tre_stack_push_voidptr qse_tre_stackpushvoidptr
/* this tre.h */
#define tre_compile qse_tre_compile
#define tre_free qse_tre_free
#define tre_fill_pmatch qse_tre_fillpmatch
#define tre_tnfa_run_backtrack qse_tre_runbacktrack
#define tre_tnfa_run_parallel qse_tre_runparallel
#define tre_have_backrefs qse_tre_havebackrefs
/* Define the character types and functions. */
#ifdef TRE_WCHAR
# define TRE_CHAR_MAX QSE_TYPE_MAX(qse_wchar_t)
/*
# ifdef TRE_MULTIBYTE
# define TRE_MB_CUR_MAX (qse_getmbcurmax())
# else
# define TRE_MB_CUR_MAX 1
# endif
*/
#else /* !TRE_WCHAR */
# define TRE_CHAR_MAX 255
/*# define TRE_MB_CUR_MAX 1*/
#endif /* !TRE_WCHAR */
#define DPRINT(msg)
typedef qse_ctype_t tre_ctype_t;
#define tre_isctype(c,t) QSE_ISCTYPE(c,t)
typedef enum { STR_WIDE, STR_BYTE, STR_MBS } tre_str_type_t;
/* Returns number of bytes to add to (char *)ptr to make it
properly aligned for the type. */
#define ALIGN(ptr,type) \
((((qse_uintptr_t)ptr) % QSE_SIZEOF(type))? \
(QSE_SIZEOF(type) - (((qse_uintptr_t)ptr) % QSE_SIZEOF(type))): 0)
#undef MAX
#undef MIN
#define MAX(a, b) (((a) >= (b)) ? (a) : (b))
#define MIN(a, b) (((a) <= (b)) ? (a) : (b))
/* Define STRF to the correct printf formatter for strings. */
#ifdef TRE_WCHAR
#define STRF "ls"
#else /* !TRE_WCHAR */
#define STRF "s"
#endif /* !TRE_WCHAR */
/* TNFA transition type. A TNFA state is an array of transitions,
the terminator is a transition with NULL `state'. */
typedef struct tnfa_transition tre_tnfa_transition_t;
struct tnfa_transition
{
/* Range of accepted characters. */
/* QSE indicate that code_min .. code_max is not yet negated for ^ in a bracket */
int negate_range;
/* END QSE */
tre_cint_t code_min;
tre_cint_t code_max;
/* Pointer to the destination state. */
tre_tnfa_transition_t *state;
/* ID number of the destination state. */
int state_id;
/* -1 terminated array of tags (or NULL). */
int *tags;
/* Matching parameters settings (or NULL). */
int *params;
/* Assertion bitmap. */
int assertions;
/* Assertion parameters. */
union
{
/* Character class assertion. */
tre_ctype_t class;
/* Back reference assertion. */
int backref;
} u;
/* Negative character class assertions. */
tre_ctype_t *neg_classes;
};
/* Assertions. */
#define ASSERT_AT_BOL 1 /* Beginning of line. */
#define ASSERT_AT_EOL 2 /* End of line. */
#define ASSERT_CHAR_CLASS 4 /* Character class in `class'. */
#define ASSERT_CHAR_CLASS_NEG 8 /* Character classes in `neg_classes'. */
#define ASSERT_AT_BOW 16 /* Beginning of word. */
#define ASSERT_AT_EOW 32 /* End of word. */
#define ASSERT_AT_WB 64 /* Word boundary. */
#define ASSERT_AT_WB_NEG 128 /* Not a word boundary. */
#define ASSERT_BACKREF 256 /* A back reference in `backref'. */
#define ASSERT_LAST 256
/* Tag directions. */
typedef enum
{
TRE_TAG_MINIMIZE = 0,
TRE_TAG_MAXIMIZE = 1
} tre_tag_direction_t;
/* Parameters that can be changed dynamically while matching. */
typedef enum
{
TRE_PARAM_COST_INS = 0,
TRE_PARAM_COST_DEL = 1,
TRE_PARAM_COST_SUBST = 2,
TRE_PARAM_COST_MAX = 3,
TRE_PARAM_MAX_INS = 4,
TRE_PARAM_MAX_DEL = 5,
TRE_PARAM_MAX_SUBST = 6,
TRE_PARAM_MAX_ERR = 7,
TRE_PARAM_DEPTH = 8,
TRE_PARAM_LAST = 9
} tre_param_t;
/* Unset matching parameter */
#define TRE_PARAM_UNSET -1
/* Signifies the default matching parameter value. */
#define TRE_PARAM_DEFAULT -2
/* Instructions to compute submatch register values from tag values
after a successful match. */
struct tre_submatch_data
{
/* Tag that gives the value for rm_so (submatch start offset). */
int so_tag;
/* Tag that gives the value for rm_eo (submatch end offset). */
int eo_tag;
/* List of submatches this submatch is contained in. */
int *parents;
};
typedef struct tre_submatch_data tre_submatch_data_t;
/* TNFA definition. */
typedef struct tnfa tre_tnfa_t;
struct tnfa
{
tre_tnfa_transition_t *transitions;
unsigned int num_transitions;
tre_tnfa_transition_t *initial;
tre_tnfa_transition_t *final;
tre_submatch_data_t *submatch_data;
#if 0
char *firstpos_chars;
#endif
int first_char;
unsigned int num_submatches;
tre_tag_direction_t *tag_directions;
int *minimal_tags;
int num_tags;
int num_minimals;
int end_tag;
int num_states;
int cflags;
int have_backrefs;
int have_approx;
int params_depth;
};
int tre_compile (regex_t *preg, const tre_char_t *regex, size_t n, int cflags);
void tre_free (regex_t *preg);
void tre_fill_pmatch(
size_t nmatch, regmatch_t pmatch[], int cflags,
const tre_tnfa_t *tnfa, int *tags, int match_eo);
reg_errcode_t tre_tnfa_run_backtrack(
qse_mmgr_t* mmgr, const tre_tnfa_t *tnfa, const void *string,
int len, tre_str_type_t type, int *match_tags,
int eflags, int *match_end_ofs);
reg_errcode_t tre_tnfa_run_parallel(
qse_mmgr_t* mmgr, const tre_tnfa_t *tnfa, const void *string, int len,
tre_str_type_t type, int *match_tags, int eflags,
int *match_end_ofs);
#endif
/* EOF */