*** empty log message ***
This commit is contained in:
		
							
								
								
									
										241
									
								
								ase/awk/rex.c
									
									
									
									
									
								
							
							
						
						
									
										241
									
								
								ase/awk/rex.c
									
									
									
									
									
								
							| @ -1,5 +1,5 @@ | ||||
| /* | ||||
|  * $Id: rex.c,v 1.6 2006-07-20 03:41:00 bacon Exp $ | ||||
|  * $Id: rex.c,v 1.7 2006-07-20 16:21:54 bacon Exp $ | ||||
|  */ | ||||
|  | ||||
| #include <xp/awk/awk_i.h> | ||||
| @ -26,12 +26,13 @@ enum | ||||
|  | ||||
| enum | ||||
| { | ||||
| 	CMD_BOL_CHAR, | ||||
| 	CMD_EOL_CHAR, | ||||
| 	CMD_ORD_CHAR, | ||||
| 	CMD_BOL, | ||||
| 	CMD_EOL, | ||||
| 	CMD_ANY_CHAR, | ||||
| 	CMD_ORD_CHAR, | ||||
| 	CMD_CHAR_RANGE, | ||||
| 	CMD_CHAR_CLASS | ||||
| 	CMD_CHAR_CLASS, | ||||
| 	CMD_GROUP | ||||
| }; | ||||
|  | ||||
| enum | ||||
| @ -53,10 +54,10 @@ enum | ||||
|  | ||||
| struct __code | ||||
| { | ||||
| 	xp_byte_t cmd; | ||||
| 	//xp_byte_t cmd; | ||||
| 	int cmd; | ||||
| 	xp_size_t lbound; | ||||
| 	xp_size_t ubound; | ||||
| 	xp_char_t cc; /* optional */ | ||||
| }; | ||||
|  | ||||
| #define NEXT_CHAR(rex,level) \ | ||||
| @ -69,11 +70,15 @@ static int __compile_expression (xp_awk_rex_t* rex); | ||||
| static int __compile_branch (xp_awk_rex_t* rex); | ||||
| static int __compile_atom (xp_awk_rex_t* rex); | ||||
| static int __compile_charset (xp_awk_rex_t* rex); | ||||
| static int __compile_bound (xp_awk_rex_t* rex); | ||||
| static int __compile_range (xp_awk_rex_t* rex); | ||||
| static int __compile_bound (xp_awk_rex_t* rex, struct __code* cmd); | ||||
| static int __compile_range (xp_awk_rex_t* rex, struct __code* cmd); | ||||
| static int __next_char (xp_awk_rex_t* rex, int level); | ||||
| static int __add_code (xp_awk_rex_t* rex, void* data, xp_size_t len); | ||||
|  | ||||
| static const xp_byte_t* __print_expression (const xp_byte_t* p); | ||||
| static const xp_byte_t* __print_branch (const xp_byte_t* p); | ||||
| static const xp_byte_t* __print_atom (const xp_byte_t* p); | ||||
|  | ||||
| xp_awk_rex_t* xp_awk_rex_open (xp_awk_rex_t* rex) | ||||
| { | ||||
| 	if (rex == XP_NULL) | ||||
| @ -104,7 +109,6 @@ void xp_awk_rex_close (xp_awk_rex_t* rex) | ||||
|  | ||||
| int xp_awk_rex_compile (xp_awk_rex_t* rex, const xp_char_t* ptn, xp_size_t len) | ||||
| { | ||||
|  | ||||
| 	rex->ptn.ptr = ptn; | ||||
| 	rex->ptn.end = rex->ptn.ptr + len; | ||||
| 	rex->ptn.curp = rex->ptn.ptr; | ||||
| @ -136,9 +140,12 @@ xp_printf (XP_T("garbage after expression\n")); | ||||
| static int __compile_expression (xp_awk_rex_t* rex) | ||||
| { | ||||
| 	xp_size_t zero = 0; | ||||
| 	xp_size_t* nb, * el, * bl; | ||||
| 	xp_size_t* nb, * el; | ||||
| 	xp_size_t old_size; | ||||
| 	int n; | ||||
|  | ||||
| 	old_size = rex->code.size; | ||||
|  | ||||
| 	/* secure space for header and set the header fields to zero */ | ||||
| 	nb = (xp_size_t*)&rex->code.buf[rex->code.size]; | ||||
| 	ADD_CODE (rex, &zero, xp_sizeof(zero)); | ||||
| @ -147,7 +154,6 @@ static int __compile_expression (xp_awk_rex_t* rex) | ||||
| 	ADD_CODE (rex, &zero, xp_sizeof(zero)); | ||||
|  | ||||
| 	/* handle the first branch */ | ||||
| 	bl = (xp_size_t*)&rex->code.buf[rex->code.size]; | ||||
| 	n = __compile_branch (rex); | ||||
| 	if (n == -1) return -1; | ||||
| 	if (n == 0)  | ||||
| @ -157,7 +163,6 @@ static int __compile_expression (xp_awk_rex_t* rex) | ||||
| 	} | ||||
|  | ||||
| 	(*nb) += 1; | ||||
| 	(*el) += *bl + xp_sizeof(*bl); | ||||
|  | ||||
| 	/* handle subsequent branches if any */ | ||||
| 	while (rex->ptn.curc.type == __SPECIAL &&  | ||||
| @ -165,7 +170,6 @@ static int __compile_expression (xp_awk_rex_t* rex) | ||||
| 	{ | ||||
| 		NEXT_CHAR (rex, __TOP); | ||||
|  | ||||
| 		bl = (xp_size_t*)&rex->code.buf[rex->code.size]; | ||||
| 		n = __compile_branch(rex); | ||||
| 		if (n == -1) return -1; | ||||
| 		if (n == 0)  | ||||
| @ -179,26 +183,32 @@ static int __compile_expression (xp_awk_rex_t* rex) | ||||
| 		} | ||||
|  | ||||
| 		(*nb) += 1; | ||||
| 		(*el) += *bl + xp_sizeof(*bl); | ||||
| 	} | ||||
|  | ||||
| 	*el = rex->code.size - old_size; | ||||
| 	return 1; | ||||
| } | ||||
|  | ||||
| static int __compile_branch (xp_awk_rex_t* rex) | ||||
| { | ||||
| 	int n; | ||||
| 	xp_size_t* bl; | ||||
| 	xp_size_t* na, * bl; | ||||
| 	xp_size_t old_size; | ||||
| 	xp_size_t zero = 0; | ||||
| 	struct __code* cmd; | ||||
|  | ||||
| 	old_size = rex->code.size; | ||||
|  | ||||
| 	na = (xp_size_t*)&rex->code.buf[rex->code.size]; | ||||
| 	ADD_CODE (rex, &zero, xp_sizeof(zero)); | ||||
|  | ||||
| 	bl = (xp_size_t*)&rex->code.buf[rex->code.size]; | ||||
| 	ADD_CODE (rex, &zero, xp_sizeof(zero)); | ||||
|  | ||||
| 	while (1) | ||||
| 	{ | ||||
| 		cmd = (struct __code*)&rex->code.buf[rex->code.size]; | ||||
|  | ||||
| 		n = __compile_atom (rex); | ||||
| 		if (n == -1)  | ||||
| 		{ | ||||
| @ -208,7 +218,7 @@ static int __compile_branch (xp_awk_rex_t* rex) | ||||
|  | ||||
| 		if (n == 0) break; /* no atom */ | ||||
|  | ||||
| 		n = __compile_bound (rex); | ||||
| 		n = __compile_bound (rex, cmd); | ||||
| 		if (n == -1) | ||||
| 		{ | ||||
| 			rex->code.size = old_size; | ||||
| @ -217,25 +227,31 @@ static int __compile_branch (xp_awk_rex_t* rex) | ||||
|  | ||||
| 		/* n == 0  no bound character. just continue */ | ||||
| 		/* n == 1  bound has been applied by compile_bound */ | ||||
|  | ||||
| 		(*na) += 1; | ||||
| 	} | ||||
|  | ||||
| 	return 0; | ||||
| 	*bl = rex->code.size - old_size; | ||||
| 	return ((*na) == 0)? 0: 1; | ||||
| } | ||||
|  | ||||
| static int __compile_atom (xp_awk_rex_t* rex) | ||||
| { | ||||
| 	int n = 0; | ||||
| 	int n; | ||||
|  | ||||
| 	if (rex->ptn.curc.type == __EOF) | ||||
| 	{ | ||||
| 		/* no atom */ | ||||
| 		return 0; | ||||
| 	} | ||||
| 	else if (rex->ptn.curc.type == __SPECIAL) | ||||
| 	if (rex->ptn.curc.type == __EOF) return 0; | ||||
|  | ||||
| 	if (rex->ptn.curc.type == __SPECIAL) | ||||
| 	{ | ||||
| 		if (rex->ptn.curc.value == XP_T('(')) | ||||
| 		{ | ||||
| 			// GROUP | ||||
| 			struct __code tmp; | ||||
|  | ||||
| 			tmp.cmd = CMD_GROUP; | ||||
| 			tmp.lbound = 1; | ||||
| 			tmp.ubound = 1; | ||||
|  | ||||
| 			ADD_CODE (rex, &tmp, xp_sizeof(tmp)); | ||||
| 			NEXT_CHAR (rex, __TOP); | ||||
|  | ||||
| 			n = __compile_expression (rex); | ||||
| @ -254,7 +270,7 @@ static int __compile_atom (xp_awk_rex_t* rex) | ||||
| 		{ | ||||
| 			struct __code tmp; | ||||
|  | ||||
| 			tmp.cmd = CMD_BOL_CHAR; | ||||
| 			tmp.cmd = CMD_BOL; | ||||
| 			tmp.lbound = 1; | ||||
| 			tmp.ubound = 1; | ||||
|  | ||||
| @ -265,7 +281,7 @@ static int __compile_atom (xp_awk_rex_t* rex) | ||||
| 		{ | ||||
| 			struct __code tmp; | ||||
|  | ||||
| 			tmp.cmd = CMD_EOL_CHAR; | ||||
| 			tmp.cmd = CMD_EOL; | ||||
| 			tmp.lbound = 1; | ||||
| 			tmp.ubound = 1; | ||||
|  | ||||
| @ -287,30 +303,26 @@ static int __compile_atom (xp_awk_rex_t* rex) | ||||
| 		{ | ||||
| 			if (__compile_charset (rex) == -1) return -1; | ||||
| 		} | ||||
| 		else | ||||
| 		{ | ||||
| 			/*invalid special character....*/ | ||||
| 			return -1; | ||||
| 		} | ||||
| 		else return 0; | ||||
|  | ||||
| 		return 1; | ||||
| 	} | ||||
| 	else  | ||||
| 	{ | ||||
| 		/* normal characters */ | ||||
| 		struct __code tmp; | ||||
|  | ||||
| 		xp_assert (rex->ptn.curc.type == __NORMAL); | ||||
|  | ||||
| 		tmp.cmd = CMD_ORD_CHAR; | ||||
| 		tmp.lbound = 1; | ||||
| 		tmp.ubound = 1; | ||||
|  | ||||
| 		ADD_CODE (rex, &tmp, xp_sizeof(tmp)); | ||||
| 		ADD_CODE (rex, &rex->ptn.curc, xp_sizeof(rex->ptn.curc)); | ||||
| 		ADD_CODE (rex, &rex->ptn.curc.value, xp_sizeof(rex->ptn.curc.value)); | ||||
| 		NEXT_CHAR (rex, __TOP); | ||||
|  | ||||
| 		return 1; | ||||
| 	} | ||||
|  | ||||
| } | ||||
|  | ||||
| static int __compile_charset (xp_awk_rex_t* rex) | ||||
| @ -318,7 +330,7 @@ static int __compile_charset (xp_awk_rex_t* rex) | ||||
| 	return -1; | ||||
| } | ||||
|  | ||||
| static int __compile_bound (xp_awk_rex_t* rex) | ||||
| static int __compile_bound (xp_awk_rex_t* rex, struct __code* cmd) | ||||
| { | ||||
| 	if (rex->ptn.curc.type != __SPECIAL) return 0; | ||||
|  | ||||
| @ -326,28 +338,42 @@ static int __compile_bound (xp_awk_rex_t* rex) | ||||
| 	{ | ||||
| 		case XP_T('+'): | ||||
| 		{ | ||||
| 			//__apply_bound (1, MAX); | ||||
| 			cmd->lbound = 1; | ||||
| 			cmd->ubound = BOUND_MAX; | ||||
| 			NEXT_CHAR(rex, __TOP); | ||||
| 			return 1; | ||||
| 		} | ||||
|  | ||||
| 		case XP_T('*'): | ||||
| 		{ | ||||
| 			//__apply_bound (0, MAX); | ||||
| 			cmd->lbound = 0; | ||||
| 			cmd->ubound = BOUND_MAX; | ||||
| 			NEXT_CHAR(rex, __TOP); | ||||
| 			return 1; | ||||
| 		} | ||||
|  | ||||
| 		case XP_T('?'): | ||||
| 		{ | ||||
| 			//__apply_bound (0, 1); | ||||
| 			cmd->lbound = 0; | ||||
| 			cmd->ubound = 1; | ||||
| 			NEXT_CHAR(rex, __TOP); | ||||
| 			return 1; | ||||
| 		} | ||||
|  | ||||
| 		case XP_T('{'): | ||||
| 		{ | ||||
| 			if (__compile_range(rex) == -1) return -1; | ||||
| 			NEXT_CHAR (rex, __IN_RANGE); | ||||
|  | ||||
| 			if (__compile_range(rex, cmd) == -1) return -1; | ||||
|  | ||||
| 			if (rex->ptn.curc.type != __SPECIAL ||  | ||||
| 			    rex->ptn.curc.value != XP_T('}'))  | ||||
| 			{ | ||||
| 				// rex->errnum = XP_AWK_REX_ERBRACE | ||||
| 				return -1; | ||||
| 			} | ||||
|  | ||||
| 			NEXT_CHAR (rex, __TOP); | ||||
| 			return 1; | ||||
| 		} | ||||
| 	} | ||||
| @ -355,9 +381,40 @@ static int __compile_bound (xp_awk_rex_t* rex) | ||||
| 	return 0; | ||||
| } | ||||
|  | ||||
| static int __compile_range (xp_awk_rex_t* rex) | ||||
| static int __compile_range (xp_awk_rex_t* rex, struct __code* cmd) | ||||
| { | ||||
| 	return -1; | ||||
| 	xp_size_t bound; | ||||
|  | ||||
| // TODO: should allow white spaces in the range??? | ||||
| //  what if it is not in the right format? convert it to ordinary characters?? | ||||
| 	bound = 0; | ||||
| 	while (rex->ptn.curc.type == __NORMAL && | ||||
| 	       xp_isdigit(rex->ptn.curc.value)) | ||||
| 	{ | ||||
| 		bound = bound * 10 + rex->ptn.curc.value - XP_T('0'); | ||||
| 		NEXT_CHAR (rex, __IN_RANGE); | ||||
| 	} | ||||
|  | ||||
| 	cmd->lbound = bound; | ||||
|  | ||||
| 	if (rex->ptn.curc.type == __SPECIAL && | ||||
| 	    rex->ptn.curc.value == XP_T(','))  | ||||
| 	{ | ||||
| 		NEXT_CHAR (rex, __IN_RANGE); | ||||
|  | ||||
| 		bound = 0; | ||||
| 		while (rex->ptn.curc.type == __NORMAL && | ||||
| 		       xp_isdigit(rex->ptn.curc.value)) | ||||
| 		{ | ||||
| 			bound = bound * 10 + rex->ptn.curc.value - XP_T('0'); | ||||
| 			NEXT_CHAR (rex, __IN_RANGE); | ||||
| 		} | ||||
|  | ||||
| 		cmd->ubound = bound; | ||||
| 	} | ||||
| 	else cmd->ubound = BOUND_MAX; | ||||
|  | ||||
| 	return 0; | ||||
| } | ||||
|  | ||||
| static int __next_char (xp_awk_rex_t* rex, int level) | ||||
| @ -461,3 +518,101 @@ static int __add_code (xp_awk_rex_t* rex, void* data, xp_size_t len) | ||||
|  | ||||
| 	return 0; | ||||
| } | ||||
|  | ||||
| void xp_awk_rex_print (xp_awk_rex_t* rex) | ||||
| { | ||||
| 	const xp_byte_t* p; | ||||
| 	p = __print_expression (rex->code.buf); | ||||
| 	xp_printf (XP_T("\n")); | ||||
| 	xp_assert (p == rex->code.buf + rex->code.size); | ||||
| } | ||||
|  | ||||
| static const xp_byte_t* __print_expression (const xp_byte_t* p) | ||||
| { | ||||
| 	xp_size_t nb, el, i; | ||||
|  | ||||
| 	nb = *(xp_size_t*)p; p += xp_sizeof(nb); | ||||
| 	el = *(xp_size_t*)p; p += xp_sizeof(el); | ||||
| //xp_printf (XP_T("NA = %u, EL = %u\n"),  | ||||
| //	(unsigned int)nb, (unsigned int)el); | ||||
|  | ||||
| 	for (i = 0; i < nb; i++) | ||||
| 	{ | ||||
| 		if (i != 0) xp_printf (XP_T("|")); | ||||
| 		p = __print_branch (p); | ||||
| 	} | ||||
|  | ||||
| 	return p; | ||||
| } | ||||
|  | ||||
| static const xp_byte_t* __print_branch (const xp_byte_t* p) | ||||
| { | ||||
| 	xp_size_t na, bl, i; | ||||
|  | ||||
| 	na = *(xp_size_t*)p; p += xp_sizeof(na); | ||||
| 	bl = *(xp_size_t*)p; p += xp_sizeof(bl); | ||||
| //xp_printf (XP_T("NA = %u, BL = %u\n"),  | ||||
| //	(unsigned int) na, (unsigned int)bl); | ||||
|  | ||||
| 	for (i = 0; i < na; i++) | ||||
| 	{ | ||||
| 		p = __print_atom (p); | ||||
| 	} | ||||
|  | ||||
| 	return p; | ||||
| } | ||||
|  | ||||
| static const xp_byte_t* __print_atom (const xp_byte_t* p) | ||||
| { | ||||
| 	struct __code* cp = (struct __code*)p; | ||||
|  | ||||
| 	if (cp->cmd == CMD_BOL) | ||||
| 	{ | ||||
| 		xp_printf (XP_T("^")); | ||||
| 		p += xp_sizeof(*cp); | ||||
| 	} | ||||
| 	else if (cp->cmd == CMD_EOL) | ||||
| 	{ | ||||
| 		xp_printf (XP_T("$")); | ||||
| 		p += xp_sizeof(*cp); | ||||
| 	} | ||||
| 	else if (cp->cmd == CMD_ANY_CHAR)  | ||||
| 	{ | ||||
| 		xp_printf (XP_T(".")); | ||||
| 		p += xp_sizeof(*cp); | ||||
| 	} | ||||
| 	else if (cp->cmd == CMD_ORD_CHAR)  | ||||
| 	{ | ||||
| 		p += xp_sizeof(*cp); | ||||
| 		xp_printf (XP_T("%c"), *(xp_char_t*)p); | ||||
| 		p += xp_sizeof(xp_char_t); | ||||
| 	} | ||||
| 	else if (cp->cmd == CMD_GROUP) | ||||
| 	{ | ||||
| 		p += xp_sizeof(*cp); | ||||
| 		xp_printf (XP_T("(")); | ||||
| 		p = __print_expression (p); | ||||
| 		xp_printf (XP_T(")")); | ||||
| 	} | ||||
| 	else  | ||||
| 	{ | ||||
| xp_printf (XP_T("FUCK FUCK FUCK\n")); | ||||
| 	} | ||||
|  | ||||
| 	if (cp->lbound == 0 && cp->ubound == BOUND_MAX) | ||||
| 		xp_printf (XP_T("*")); | ||||
| 	else if (cp->lbound == 1 && cp->ubound == BOUND_MAX) | ||||
| 		xp_printf (XP_T("+")); | ||||
| 	else if (cp->lbound == 0 && cp->ubound == 1) | ||||
| 		xp_printf (XP_T("?")); | ||||
| 	else if (cp->lbound != 1 || cp->ubound != 1) | ||||
| 	{ | ||||
| 		xp_printf (XP_T("{%lu,%lu}"),  | ||||
| 			(unsigned long)cp->lbound, (unsigned long)cp->ubound); | ||||
| 	} | ||||
|  | ||||
|  | ||||
| 	return p; | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -1,5 +1,5 @@ | ||||
| /* | ||||
|  * $Id: rex.h,v 1.4 2006-07-20 03:41:00 bacon Exp $ | ||||
|  * $Id: rex.h,v 1.5 2006-07-20 16:21:54 bacon Exp $ | ||||
|  **/ | ||||
|  | ||||
| #ifndef _XP_AWK_REX_H_ | ||||
| @ -17,12 +17,13 @@ | ||||
|  * | ||||
|  * Compiled form of a regular expression: | ||||
|  * | ||||
|  *   | expression                                                       | | ||||
|  *   | header  | branch                     | branch         | branch   | | ||||
|  *   | nb | el | bl | cmd | arg | cmd | arg | bl | cmd | arg | bl | cmd | | ||||
|  *   | expression                                                                      | | ||||
|  *   | header  | branch                          | branch              | branch        | | ||||
|  *   | nb | el | na | bl | cmd | arg | cmd | arg | na | bl | cmd | arg | na | bl | cmd | | ||||
|  * | ||||
|  *   nb: the number of branches | ||||
|  *   el: the length of an expression excluding the length of nb and el | ||||
|  *   na: the number of atoms | ||||
|  *   bl: the length of a branch excluding the length of bl | ||||
|  *   cmd: The command and repetition info encoded together.  | ||||
|  *      Some commands require an argument to follow them but others don't. | ||||
| @ -67,6 +68,7 @@ extern "C" { | ||||
| xp_awk_rex_t* xp_awk_rex_open (xp_awk_rex_t* rex); | ||||
| void xp_awk_rex_close (xp_awk_rex_t* rex); | ||||
| int xp_awk_rex_compile (xp_awk_rex_t* rex, const xp_char_t* ptn, xp_size_t len); | ||||
| void xp_awk_rex_print (xp_awk_rex_t* rex); | ||||
|  | ||||
| #ifdef __cplusplus | ||||
| } | ||||
|  | ||||
| @ -23,7 +23,7 @@ int xp_main (int argc, const xp_char_t* argv[]) | ||||
| 	} | ||||
|  | ||||
|  | ||||
| 	ptn = XP_T("^he.llo"); | ||||
| 	ptn = XP_T("^he.llo(jo(in|kk)s|com)+h*e{1,40}abc"); | ||||
| 	if (xp_awk_rex_compile (rex, ptn, xp_strlen(ptn)) == -1) | ||||
| 	{ | ||||
| 		xp_printf (XP_T("cannot compile pattern...\n")); | ||||
| @ -31,6 +31,7 @@ int xp_main (int argc, const xp_char_t* argv[]) | ||||
| 		return -1; | ||||
| 	} | ||||
|  | ||||
| 	xp_awk_rex_print (rex); | ||||
| 	xp_awk_rex_close (rex); | ||||
| 	return 0; | ||||
| } | ||||
|  | ||||
		Reference in New Issue
	
	Block a user