From 983026c7747848795f45bd7112a7c21075d9e91b Mon Sep 17 00:00:00 2001
From: "hyunghwan.chung" <hyunghwan.chung@c9d47f8d-4fed-4a2f-8cad-691f5fbe3904>
Date: Sun, 24 Mar 2019 18:49:16 +0000
Subject: [PATCH] optimized the division a bit. removed moo_log2_for_pow2() and
 created similar macros local to bigint.c

---
 moo/kernel/test-001.moo |  10 +-
 moo/lib/bigint.c        | 226 ++++++++++++++++++++++++++++------------
 moo/lib/moo-cmn.h       |  51 ++++++++-
 moo/lib/moo-utl.h       |   6 --
 4 files changed, 218 insertions(+), 75 deletions(-)

diff --git a/moo/kernel/test-001.moo b/moo/kernel/test-001.moo
index 2acde35..e0016fa 100644
--- a/moo/kernel/test-001.moo
+++ b/moo/kernel/test-001.moo
@@ -250,10 +250,18 @@ extend MyObject
 			[ (-9p10.123 scale) = (-10.123000000 scale) ],
 			[ (+3p100.1 + 16rffff + +5p1.22 + -5p1.223) = 65635.09700 ],
 
-			## 80-84
+			## 90-94
 			[ (30p2123.12 asString) = '2123.120000000000000000000000000000' ],
 			[ (+30p2123.12 asString) = '2123.120000000000000000000000000000' ],
 			[ (-30p2123.12 asString) = '-2123.120000000000000000000000000000' ],
+			[ (811306333091350399588761 div: 16) = 50706645818209399974297 ],
+			[ (811306333091350399588761 rem: 16) = 9 ],
+
+			## 95-99
+			[ (811306333091350399588761 mod: 16) = 9 ],
+			[ (811306333091350399588761 div: 128) = 6338330727276174996787 ],
+			[ (811306333091350399588761 rem: 128) = 25 ],
+			[ (811306333091350399588761 mod: 128) = 25 ],
 
 			## =========================
 			[ 
diff --git a/moo/lib/bigint.c b/moo/lib/bigint.c
index 4468afb..a94e220 100644
--- a/moo/lib/bigint.c
+++ b/moo/lib/bigint.c
@@ -64,11 +64,11 @@
 #	error UNSUPPORTED LIW BIT SIZE
 #endif
 
-/*#define IS_POWER_OF_2(ui) (((ui) > 0) && (((ui) & (~(ui)+ 1)) == (ui)))*/
-#define IS_POWER_OF_2(ui) (((ui) > 0) && ((ui) & ((ui) - 1)) == 0)
-
 #define IS_SIGN_DIFF(x,y) (((x) ^ (y)) < 0)
 
+/*#define IS_POW2(ui) (((ui) > 0) && (((ui) & (~(ui)+ 1)) == (ui)))*/
+#define IS_POW2(ui) (((ui) > 0) && ((ui) & ((ui) - 1)) == 0)
+
 /* digit character array */
 static char* _digitc_array[] =
 {
@@ -76,13 +76,101 @@ static char* _digitc_array[] =
 	"0123456789abcdefghijklmnopqrstuvwxyz"
 };
 
-/* exponent table */
-static moo_uint8_t _exp_tab[] = 
+/* exponent table: _exp_tab[n - 1] holds log2(n) for a power of 2 n between 1 and 32 inclusive. */
+static moo_uint8_t _exp_tab[32] = 
 {
-	0, 0, 1, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0
+	0, 1, 0, 2, 0, 0, 0, 3,
+	0, 0, 0, 0, 0, 0, 0, 4,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 5
 };
 
+static const moo_uint8_t debruijn_32[32] = /* bit-position lookup table indexed by LOG2_FOR_POW2_32() */
+{
+	0, 1, 28, 2, 29, 14, 24, 3,
+	30, 22, 20, 15, 25, 17, 4, 8, 
+	31, 27, 13, 23, 21, 19, 16, 7,
+	26, 12, 18, 6, 11, 5, 10, 9
+};
+
+static const moo_uint8_t debruijn_64[64] = /* bit-position lookup table indexed by LOG2_FOR_POW2_64() */
+{
+	0, 1,  2, 53,  3,  7, 54, 27,
+	4, 38, 41,  8, 34, 55, 48, 28,
+	62,  5, 39, 46, 44, 42, 22,  9,
+	24, 35, 59, 56, 49, 18, 29, 11,
+	63, 52,  6, 26, 37, 40, 33, 47,
+	61, 45, 43, 21, 23, 58, 17, 10,
+	51, 25, 36, 32, 60, 20, 57, 16,
+	50, 31, 19, 15, 30, 14, 13, 12
+};
+
+#if defined(MOO_HAVE_UINT32_T) /* log2 of a 32-bit power of 2: multiply by the constant, keep the top 5 bits, look up debruijn_32 */
+#	define LOG2_FOR_POW2_32(x) (debruijn_32[(moo_uint32_t)((moo_uint32_t)(x) * 0x077CB531) >> 27])
+#endif
+
+#if defined(MOO_HAVE_UINT64_T) /* log2 of a 64-bit power of 2: multiply by the constant, keep the top 6 bits, look up debruijn_64 */
+#	define LOG2_FOR_POW2_64(x) (debruijn_64[(moo_uint64_t)((moo_uint64_t)(x) * 0x022fdd63cc95386d) >> 58])
+#endif
+
+static MOO_INLINE int get_pos_of_msb_set (moo_oow_t x)
+{
+	/* returns the position of the only bit set in x, that is, log2(x). the caller must ensure that x is a power of 2.
+	 * if x happens to be zero, the return value is undefined as each method used may give a different result. */
+#if defined(MOO_HAVE_BUILTIN_CTZLL) && (MOO_SIZEOF_OOW_T == MOO_SIZEOF_LONG_LONG)
+	return __builtin_ctzll(x); /* count the number of trailing zeros */
+#elif defined(MOO_HAVE_BUILTIN_CTZL) && (MOO_SIZEOF_OOW_T == MOO_SIZEOF_LONG)
+	return __builtin_ctzl(x); /* count the number of trailing zeros */
+#elif defined(MOO_HAVE_BUILTIN_CTZ) && (MOO_SIZEOF_OOW_T == MOO_SIZEOF_INT)
+	return __builtin_ctz(x); /* count the number of trailing zeros */
+#elif defined(__GNUC__) && (defined(__x86_64) || defined(__amd64) || defined(__i386) || defined(i386))
+	moo_oow_t pos;
+	/* use the Bit Scan Forward instruction */
+#if 1
+	__asm__ volatile (
+		"bsf %1,%0\n\t"
+		: "=r"(pos) /* output */
+		: "r"(x) /* input */
+	);
+#else
+	__asm__ volatile (
+		"bsf %[X],%[EXP]\n\t"
+		: [EXP]"=r"(pos) /* output */
+		: [X]"r"(x) /* input */
+	);
+#endif
+	return (int)pos;
+#elif defined(USE_UGLY_CODE) && defined(__GNUC__) && defined(__arm__) && (defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_8__))
+	moo_oow_t pos;
+
+	/* CLZ is available in ARMv5T and above. there is no instruction to
+	 * count trailing zeros or something similar. using RBIT with CLZ
+	 * would be good in ARMv6T2 and above to avoid further calculation
+	 * after CLZ */
+	__asm__ volatile (
+		"clz %0,%1\n\t"
+		: "=r"(pos) /* output */
+		: "r"(x) /* input */
+	);
+	return (int)((MOO_SIZEOF(pos) * 8) - pos - 1); 
+
+	/* TODO: PPC - use cntlz, cntlzw, cntlzd, SPARC - use lzcnt, MIPS clz */
+
+#else
+	int pos = 0;
+	while (x >>= 1) pos++; /* portable fallback: count how many right shifts it takes to clear x */
+	return pos;
+#endif
+}
+
+#if defined(MOO_HAVE_UINT32_T) && (MOO_SIZEOF_OOW_T == MOO_SIZEOF_UINT32_T)
+#	define LOG2_FOR_POW2(x) LOG2_FOR_POW2_32(x)
+#elif defined(MOO_HAVE_UINT64_T) && (MOO_SIZEOF_OOW_T == MOO_SIZEOF_UINT64_T)
+#	define LOG2_FOR_POW2(x) LOG2_FOR_POW2_64(x)
+#else /* no table-based macro matches the size of moo_oow_t: use the function instead */
+#	define LOG2_FOR_POW2(x) get_pos_of_msb_set(x)
+#endif /* LOG2_FOR_POW2(x): log2 of a moo_oow_t value that is a power of 2 */
+
 
 #if (MOO_SIZEOF_OOW_T == MOO_SIZEOF_INT) && defined(MOO_HAVE_BUILTIN_UADD_OVERFLOW)
 #	define oow_add_overflow(a,b,c)  __builtin_uadd_overflow(a,b,c)
@@ -1460,7 +1548,7 @@ static void divide_unsigned_array (moo_t* moo, const moo_liw_t* x, moo_oow_t xs,
 /* TODO: this function needs to be rewritten for performance improvement. 
  *       the binary long division is extremely slow for a big number */
 
-#if 0
+#if 1
 	/* Perform binary long division.
 	 * http://en.wikipedia.org/wiki/Division_algorithm
 	 * ---------------------------------------------------------------------
@@ -1501,7 +1589,7 @@ static void divide_unsigned_array (moo_t* moo, const moo_liw_t* x, moo_oow_t xs,
 		}
 	}
 #else
-
+	/* TODO: more efficient method */
 #endif
 }
 
@@ -1957,7 +2045,7 @@ moo_oop_t moo_divints (moo_t* moo, moo_oop_t x, moo_oop_t y, int modulo, moo_oop
 
 	if (MOO_OOP_IS_SMOOI(x) && MOO_OOP_IS_SMOOI(y))
 	{
-		moo_ooi_t xv, yv, q, r;
+		moo_ooi_t xv, yv, q, ri;
 
 		xv = MOO_OOP_TO_SMOOI(x);
 		yv = MOO_OOP_TO_SMOOI(y);
@@ -1995,8 +2083,8 @@ moo_oop_t moo_divints (moo_t* moo, moo_oop_t x, moo_oop_t y, int modulo, moo_oop
 		q = xv / yv;
 		MOO_ASSERT (moo, MOO_IN_SMOOI_RANGE(q));
 
-		r = xv - yv * q; /* xv % yv; */
-		if (r)
+		ri = xv - yv * q; /* xv % yv; */
+		if (ri)
 		{
 			if (modulo)
 			{
@@ -2012,13 +2100,13 @@ moo_oop_t moo_divints (moo_t* moo, moo_oop_t x, moo_oop_t y, int modulo, moo_oop
 
 				/* r must be floored. that is, it rounds away from zero 
 				 * and towards negative infinity */
-				if (IS_SIGN_DIFF(yv, r))
+				if (IS_SIGN_DIFF(yv, ri))
 				{
 					/* if the divisor has a different sign from r,
 					 * change the sign of r to the divisor's sign */
-					r += yv;
+					ri += yv;
 					--q;
-					MOO_ASSERT (moo, r && !IS_SIGN_DIFF(yv, r));
+					MOO_ASSERT (moo, ri && !IS_SIGN_DIFF(yv, ri));
 				}
 			}
 			else
@@ -2032,7 +2120,7 @@ moo_oop_t moo_divints (moo_t* moo, moo_oop_t x, moo_oop_t y, int modulo, moo_oop
 					 7      -3     -2       1
 					-7      -3      2      -1
 				 */
-				if (xv && IS_SIGN_DIFF(xv, r)) 
+				if (xv && IS_SIGN_DIFF(xv, ri)) 
 				{
 					/* if the dividend has a different sign from r,
 					 * change the sign of r to the dividend's sign.
@@ -2040,49 +2128,51 @@ moo_oop_t moo_divints (moo_t* moo, moo_oop_t x, moo_oop_t y, int modulo, moo_oop
 					 * the quotient and the remainder that don't need
 					 * any adjustment. however, there may be an esoteric
 					 * architecture. */
-					r -= yv;
+					ri -= yv;
 					++q;
-					MOO_ASSERT (moo, xv && !IS_SIGN_DIFF(xv, r));
+					MOO_ASSERT (moo, xv && !IS_SIGN_DIFF(xv, ri));
 				}
 			}
 		}
 
 		if (rem)
 		{
-			MOO_ASSERT (moo, MOO_IN_SMOOI_RANGE(r));
-			*rem = MOO_SMOOI_TO_OOP(r);
+			MOO_ASSERT (moo, MOO_IN_SMOOI_RANGE(ri));
+			*rem = MOO_SMOOI_TO_OOP(ri);
 		}
 
 		return MOO_SMOOI_TO_OOP((moo_ooi_t)q);
 	}
 	else 
 	{
+		moo_oop_t r;
+
 		if (MOO_OOP_IS_SMOOI(x))
 		{
-			moo_ooi_t v;
+			moo_ooi_t xv; /* value of the small-integer dividend x */
 
 			if (!is_bigint(moo,y)) goto oops_einval;
 
-			v = MOO_OOP_TO_SMOOI(x);
-			if (v == 0)
+			xv = MOO_OOP_TO_SMOOI(x);
+			if (xv == 0) /* a zero dividend yields a zero quotient and a zero remainder */
 			{
 				if (rem) *rem = MOO_SMOOI_TO_OOP(0);
 				return MOO_SMOOI_TO_OOP(0);
 			}
 
 			moo_pushvolat (moo, &y);
-			x = make_bigint_with_ooi(moo, v);
+			x = make_bigint_with_ooi(moo, xv); /* promote x to a big integer for the common division path */
 			moo_popvolat (moo);
 			if (!x) return MOO_NULL;
 		}
 		else if (MOO_OOP_IS_SMOOI(y))
 		{
-			moo_ooi_t v;
+			moo_ooi_t yv;
 
 			if (!is_bigint(moo,x)) goto oops_einval;
 
-			v = MOO_OOP_TO_SMOOI(y);
-			switch (v)
+			yv = MOO_OOP_TO_SMOOI(y);
+			switch (yv)
 			{
 				case 0: /* divide by 0 */
 					moo_seterrnum (moo, MOO_EDIVBY0);
@@ -2101,27 +2191,54 @@ moo_oop_t moo_divints (moo_t* moo, moo_oop_t x, moo_oop_t y, int modulo, moo_oop
 					if (rem) *rem = MOO_SMOOI_TO_OOP(0);
 					return z;
 
-#if 0
 				default:
-					if (IS_POWER_OF_2(v))
+					/* TODO: do division by shifting if both x & y are negative here */
+					if (yv > 0 && IS_POW2(yv) && !MOO_POINTER_IS_NBIGINT(moo, x))
 					{
+						moo_oow_t nshifts;
+
  						/* 
 						2**x = v
 						x = log2(v)
 						x is the number of shift to make */
-	TODO:
-	DO SHIFTING. how to get remainder..
-	if v is powerof2, do shifting???
+						nshifts = LOG2_FOR_POW2(yv);
 
-						z = clone_bigint_negated(moo, x, MOO_OBJ_GET_SIZE(x));
+						moo_pushvolat (moo, &x);
+						moo_pushvolat (moo, &y);
+						z = clone_bigint(moo, x, MOO_OBJ_GET_SIZE(x));
+						moo_popvolats (moo, 2);
 						if (!z) return MOO_NULL;
-						rshift_unsigned_array (z, MOO_OBJ_GET_SIZE(z), log(v)/log(2));
+
+						rshift_unsigned_array (MOO_OBJ_GET_LIWORD_SLOT(z), MOO_OBJ_GET_SIZE(z), nshifts);
+
+						moo_pushvolat (moo, &x);
+						moo_pushvolat (moo, &y);
+						z = normalize_bigint(moo, z);
+						moo_popvolats (moo, 2);
+						if (!z) return MOO_NULL;
+
+						if (rem) 
+						{
+							moo_pushvolat (moo, &x);
+							moo_pushvolat (moo, &z);
+							r = moo_mulints(moo, y, z);
+							moo_popvolats (moo, 2);
+							if (!r) return MOO_NULL;
+
+							moo_pushvolat (moo, &z);
+							r = moo_subints(moo, x, r);
+							moo_popvolat (moo);
+							if (!r) return MOO_NULL;
+
+							*rem = r;
+						}
+						return z;
 					}
-#endif
+					break;
 			}
 
 			moo_pushvolat (moo, &x);
-			y = make_bigint_with_ooi(moo, v);
+			y = make_bigint_with_ooi(moo, yv);
 			moo_popvolat (moo);
 			if (!y) return MOO_NULL;
 		}
@@ -3673,7 +3790,7 @@ moo_oop_t moo_strtoint (moo_t* moo, const moo_ooch_t* str, moo_oow_t len, int ra
 	hwlen = 0;
 	start = ptr; /* this is the real start */
 
-	if (IS_POWER_OF_2(radix))
+	if (IS_POW2(radix))
 	{
 		unsigned int exp;
 		unsigned int bitcnt;
@@ -3681,34 +3798,8 @@ moo_oop_t moo_strtoint (moo_t* moo, const moo_ooch_t* str, moo_oow_t len, int ra
 		/* get log2(radix) in a fast way under the fact that
 		 * radix is a power of 2. the exponent acquired is
 		 * the number of bits that a digit of the given radix takes up */
-	#if defined(MOO_HAVE_BUILTIN_CTZ)
-		exp = __builtin_ctz(radix);
-
-	#elif defined(__GNUC__) && (defined(__x86_64) || defined(__amd64) || defined(__i386) || defined(i386))
-		/* use the Bit Scan Forward instruction */
-		__asm__ volatile (
-			"bsf %1,%0\n\t"
-			: "=r"(exp) /* output */
-			: "r"(radix) /* input */
-		);
-
-	#elif defined(USE_UGLY_CODE) && defined(__GNUC__) && defined(__arm__) && (defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_8__))
-
-		/* CLZ is available in ARMv5T and above. there is no instruction to
-		 * count trailing zeros or something similar. using RBIT with CLZ
-		 * would be good in ARMv6T2 and above to avoid further calculation
-		 * afte CLZ */
-		__asm__ volatile (
-			"clz %0,%1\n\t"
-			: "=r"(exp) /* output */
-			: "r"(radix) /* input */
-		);
-		exp = (MOO_SIZEOF(exp) * 8) - exp - 1; 
-
-		/* TODO: PPC - use cntlz, cntlzw, cntlzd, SPARC - use lzcnt, MIPS clz */
-	#else
-		exp = _exp_tab[radix];
-	#endif
+		/*exp = LOG2_FOR_POW2(radix);*/
+		exp = _exp_tab[radix - 1];
 
 		/* bytes */
 		outlen = ((moo_oow_t)(end - str) * exp + 7) / 8; 
@@ -4214,13 +4305,14 @@ moo_oop_t moo_inttostr (moo_t* moo, moo_oop_t num, int flagged_radix)
 
 	as = MOO_OBJ_GET_SIZE(num);
 
-	if (IS_POWER_OF_2(radix))
+	if (IS_POW2(radix))
 	{
 		unsigned int exp, accbits;
 		moo_lidw_t acc;
 		moo_oow_t xpos;
 
-		exp = _exp_tab[radix];
+		/*exp = LOG2_FOR_POW2(radix);*/
+		exp = _exp_tab[radix - 1];
 		xlen = as * ((MOO_LIW_BITS + exp) / exp) + 1;
 		xpos = xlen;
 
diff --git a/moo/lib/moo-cmn.h b/moo/lib/moo-cmn.h
index 46a8a66..5b1121a 100644
--- a/moo/lib/moo-cmn.h
+++ b/moo/lib/moo-cmn.h
@@ -65,21 +65,29 @@
 #if defined(MOO_SIZEOF_CHAR) && (MOO_SIZEOF_CHAR == 1)
 #	define MOO_HAVE_UINT8_T
 #	define MOO_HAVE_INT8_T
+#	define MOO_SIZEOF_UINT8_T (MOO_SIZEOF_CHAR)
+#	define MOO_SIZEOF_INT8_T (MOO_SIZEOF_CHAR)
 	typedef unsigned char      moo_uint8_t;
 	typedef signed char        moo_int8_t;
 #elif defined(MOO_SIZEOF___INT8) && (MOO_SIZEOF___INT8 == 1)
 #	define MOO_HAVE_UINT8_T
 #	define MOO_HAVE_INT8_T
+#	define MOO_SIZEOF_UINT8_T (MOO_SIZEOF___INT8)
+#	define MOO_SIZEOF_INT8_T (MOO_SIZEOF___INT8)
 	typedef unsigned __int8    moo_uint8_t;
 	typedef signed __int8      moo_int8_t;
 #elif defined(MOO_SIZEOF___INT8_T) && (MOO_SIZEOF___INT8_T == 1)
 #	define MOO_HAVE_UINT8_T
 #	define MOO_HAVE_INT8_T
+#	define MOO_SIZEOF_UINT8_T (MOO_SIZEOF___INT8_T)
+#	define MOO_SIZEOF_INT8_T (MOO_SIZEOF___INT8_T)
 	typedef unsigned __int8_t  moo_uint8_t;
 	typedef signed __int8_t    moo_int8_t;
 #else
 #	define MOO_HAVE_UINT8_T
 #	define MOO_HAVE_INT8_T
+#	define MOO_SIZEOF_UINT8_T (1)
+#	define MOO_SIZEOF_INT8_T (1)
 	typedef unsigned char      moo_uint8_t;
 	typedef signed char        moo_int8_t;
 #endif
@@ -88,21 +96,29 @@
 #if defined(MOO_SIZEOF_SHORT) && (MOO_SIZEOF_SHORT == 2)
 #	define MOO_HAVE_UINT16_T
 #	define MOO_HAVE_INT16_T
+#	define MOO_SIZEOF_UINT16_T (MOO_SIZEOF_SHORT)
+#	define MOO_SIZEOF_INT16_T (MOO_SIZEOF_SHORT)
 	typedef unsigned short int  moo_uint16_t;
 	typedef signed short int    moo_int16_t;
 #elif defined(MOO_SIZEOF___INT16) && (MOO_SIZEOF___INT16 == 2)
 #	define MOO_HAVE_UINT16_T
 #	define MOO_HAVE_INT16_T
+#	define MOO_SIZEOF_UINT16_T (MOO_SIZEOF___INT16)
+#	define MOO_SIZEOF_INT16_T (MOO_SIZEOF___INT16)
 	typedef unsigned __int16    moo_uint16_t;
 	typedef signed __int16      moo_int16_t;
 #elif defined(MOO_SIZEOF___INT16_T) && (MOO_SIZEOF___INT16_T == 2)
 #	define MOO_HAVE_UINT16_T
 #	define MOO_HAVE_INT16_T
+#	define MOO_SIZEOF_UINT16_T (MOO_SIZEOF___INT16_T)
+#	define MOO_SIZEOF_INT16_T (MOO_SIZEOF___INT16_T)
 	typedef unsigned __int16_t  moo_uint16_t;
 	typedef signed __int16_t    moo_int16_t;
 #else
 #	define MOO_HAVE_UINT16_T
 #	define MOO_HAVE_INT16_T
+#	define MOO_SIZEOF_UINT16_T (2)
+#	define MOO_SIZEOF_INT16_T (2)
 	typedef unsigned short int  moo_uint16_t;
 	typedef signed short int    moo_int16_t;
 #endif
@@ -112,31 +128,43 @@
 #if defined(MOO_SIZEOF_INT) && (MOO_SIZEOF_INT == 4)
 #	define MOO_HAVE_UINT32_T
 #	define MOO_HAVE_INT32_T
+#	define MOO_SIZEOF_UINT32_T (MOO_SIZEOF_INT)
+#	define MOO_SIZEOF_INT32_T (MOO_SIZEOF_INT)
 	typedef unsigned int        moo_uint32_t;
 	typedef signed int          moo_int32_t;
 #elif defined(MOO_SIZEOF_LONG) && (MOO_SIZEOF_LONG == 4)
 #	define MOO_HAVE_UINT32_T
 #	define MOO_HAVE_INT32_T
+#	define MOO_SIZEOF_UINT32_T (MOO_SIZEOF_LONG)
+#	define MOO_SIZEOF_INT32_T (MOO_SIZEOF_LONG)
 	typedef unsigned long int   moo_uint32_t;
 	typedef signed long int     moo_int32_t;
 #elif defined(MOO_SIZEOF___INT32) && (MOO_SIZEOF___INT32 == 4)
 #	define MOO_HAVE_UINT32_T
 #	define MOO_HAVE_INT32_T
+#	define MOO_SIZEOF_UINT32_T (MOO_SIZEOF___INT32)
+#	define MOO_SIZEOF_INT32_T (MOO_SIZEOF___INT32)
 	typedef unsigned __int32    moo_uint32_t;
 	typedef signed __int32      moo_int32_t;
 #elif defined(MOO_SIZEOF___INT32_T) && (MOO_SIZEOF___INT32_T == 4)
 #	define MOO_HAVE_UINT32_T
 #	define MOO_HAVE_INT32_T
+#	define MOO_SIZEOF_UINT32_T (MOO_SIZEOF___INT32_T)
+#	define MOO_SIZEOF_INT32_T (MOO_SIZEOF___INT32_T)
 	typedef unsigned __int32_t  moo_uint32_t;
 	typedef signed __int32_t    moo_int32_t;
 #elif defined(__DOS__)
 #	define MOO_HAVE_UINT32_T
 #	define MOO_HAVE_INT32_T
+#	define MOO_SIZEOF_UINT32_T (4)
+#	define MOO_SIZEOF_INT32_T (4)
 	typedef unsigned long int   moo_uint32_t;
 	typedef signed long int     moo_int32_t;
 #else
 #	define MOO_HAVE_UINT32_T
 #	define MOO_HAVE_INT32_T
+#	define MOO_SIZEOF_UINT32_T (4)
+#	define MOO_SIZEOF_INT32_T (4)
 	typedef unsigned int        moo_uint32_t;
 	typedef signed int          moo_int32_t;
 #endif
@@ -145,26 +173,36 @@
 #if defined(MOO_SIZEOF_INT) && (MOO_SIZEOF_INT == 8)
 #	define MOO_HAVE_UINT64_T
 #	define MOO_HAVE_INT64_T
+#	define MOO_SIZEOF_UINT64_T (MOO_SIZEOF_INT)
+#	define MOO_SIZEOF_INT64_T (MOO_SIZEOF_INT)
 	typedef unsigned int        moo_uint64_t;
 	typedef signed int          moo_int64_t;
 #elif defined(MOO_SIZEOF_LONG) && (MOO_SIZEOF_LONG == 8)
 #	define MOO_HAVE_UINT64_T
 #	define MOO_HAVE_INT64_T
+#	define MOO_SIZEOF_UINT64_T (MOO_SIZEOF_LONG)
+#	define MOO_SIZEOF_INT64_T (MOO_SIZEOF_LONG)
 	typedef unsigned long int  moo_uint64_t;
 	typedef signed long int    moo_int64_t;
 #elif defined(MOO_SIZEOF_LONG_LONG) && (MOO_SIZEOF_LONG_LONG == 8)
 #	define MOO_HAVE_UINT64_T
 #	define MOO_HAVE_INT64_T
+#	define MOO_SIZEOF_UINT64_T (MOO_SIZEOF_LONG_LONG)
+#	define MOO_SIZEOF_INT64_T (MOO_SIZEOF_LONG_LONG)
 	typedef unsigned long long int  moo_uint64_t;
 	typedef signed long long int    moo_int64_t;
 #elif defined(MOO_SIZEOF___INT64) && (MOO_SIZEOF___INT64 == 8)
 #	define MOO_HAVE_UINT64_T
 #	define MOO_HAVE_INT64_T
+#	define MOO_SIZEOF_UINT64_T (MOO_SIZEOF___INT64)
+#	define MOO_SIZEOF_INT64_T (MOO_SIZEOF___INT64)
 	typedef unsigned __int64    moo_uint64_t;
 	typedef signed __int64      moo_int64_t;
 #elif defined(MOO_SIZEOF___INT64_T) && (MOO_SIZEOF___INT64_T == 8)
 #	define MOO_HAVE_UINT64_T
 #	define MOO_HAVE_INT64_T
+#	define MOO_SIZEOF_UINT64_T (MOO_SIZEOF___INT64_T)
+#	define MOO_SIZEOF_INT64_T (MOO_SIZEOF___INT64_T)
 	typedef unsigned __int64_t  moo_uint64_t;
 	typedef signed __int64_t    moo_int64_t;
 #else
@@ -175,26 +213,36 @@
 #if defined(MOO_SIZEOF_INT) && (MOO_SIZEOF_INT == 16)
 #	define MOO_HAVE_UINT128_T
 #	define MOO_HAVE_INT128_T
+#	define MOO_SIZEOF_UINT128_T (MOO_SIZEOF_INT)
+#	define MOO_SIZEOF_INT128_T (MOO_SIZEOF_INT)
 	typedef unsigned int        moo_uint128_t;
 	typedef signed int          moo_int128_t;
 #elif defined(MOO_SIZEOF_LONG) && (MOO_SIZEOF_LONG == 16)
 #	define MOO_HAVE_UINT128_T
 #	define MOO_HAVE_INT128_T
+#	define MOO_SIZEOF_UINT128_T (MOO_SIZEOF_LONG)
+#	define MOO_SIZEOF_INT128_T (MOO_SIZEOF_LONG)
 	typedef unsigned long int   moo_uint128_t;
 	typedef signed long int     moo_int128_t;
 #elif defined(MOO_SIZEOF_LONG_LONG) && (MOO_SIZEOF_LONG_LONG == 16)
 #	define MOO_HAVE_UINT128_T
 #	define MOO_HAVE_INT128_T
+#	define MOO_SIZEOF_UINT128_T (MOO_SIZEOF_LONG_LONG)
+#	define MOO_SIZEOF_INT128_T (MOO_SIZEOF_LONG_LONG)
 	typedef unsigned long long int moo_uint128_t;
 	typedef signed long long int   moo_int128_t;
 #elif defined(MOO_SIZEOF___INT128) && (MOO_SIZEOF___INT128 == 16)
 #	define MOO_HAVE_UINT128_T
 #	define MOO_HAVE_INT128_T
+#	define MOO_SIZEOF_UINT128_T (MOO_SIZEOF___INT128)
+#	define MOO_SIZEOF_INT128_T (MOO_SIZEOF___INT128)
 	typedef unsigned __int128    moo_uint128_t;
 	typedef signed __int128      moo_int128_t;
 #elif defined(MOO_SIZEOF___INT128_T) && (MOO_SIZEOF___INT128_T == 16)
 #	define MOO_HAVE_UINT128_T
 #	define MOO_HAVE_INT128_T
+#	define MOO_SIZEOF_UINT128_T (MOO_SIZEOF___INT128_T)
+#	define MOO_SIZEOF_INT128_T (MOO_SIZEOF___INT128_T)
 	#if defined(MOO_SIZEOF___UINT128_T) && (MOO_SIZEOF___UINT128_T == MOO_SIZEOF___INT128_T)
 	typedef __uint128_t  moo_uint128_t;
 	typedef __int128_t   moo_int128_t;
@@ -794,7 +842,6 @@ typedef struct moo_t moo_t;
 #endif
 */
 
-
 #if defined(__has_builtin) 
 	#if __has_builtin(__builtin_ctz)
 		#define MOO_HAVE_BUILTIN_CTZ
@@ -879,6 +926,8 @@ typedef struct moo_t moo_t;
 
 	#if (__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
 		#define MOO_HAVE_BUILTIN_CTZ
+		#define MOO_HAVE_BUILTIN_CTZL
+		#define MOO_HAVE_BUILTIN_CTZLL
 		#define MOO_HAVE_BUILTIN_EXPECT
 	#endif
 
diff --git a/moo/lib/moo-utl.h b/moo/lib/moo-utl.h
index 5b3b4cf..83c6b7c 100644
--- a/moo/lib/moo-utl.h
+++ b/moo/lib/moo-utl.h
@@ -640,12 +640,6 @@ MOO_EXPORT moo_oow_t moo_utf16_to_uc (
 
 /* ------------------------------------------------------------------------- */
 
-MOO_EXPORT int moo_log2_for_pow2 (
-	moo_oow_t pow2v
-);
-
-/* ------------------------------------------------------------------------- */
-
 #if defined(MOO_HAVE_UINT16_T)
 MOO_EXPORT moo_uint16_t moo_ntoh16 (
 	moo_uint16_t x