diff --git a/moo/kernel/test-001.moo b/moo/kernel/test-001.moo
index 594a639..25f7fd6 100644
--- a/moo/kernel/test-001.moo
+++ b/moo/kernel/test-001.moo
@@ -119,12 +119,13 @@ extend MyObject
                 ^(a value: 5) * (b value: 6). ## (12 * 5 + 22) * (99 * 6 + 4) => 49036
 	}
 
-	method(#class) testBigintDiv: divd_base divisor: divr_base
+	method(#class) testBigintDiv: divd_base divisor: divr_base count: count
 	{
-		| i q r divd divr |
+		| i q r divd divr ubound |
 
 		i := 1.
-		while (i < 2000)
+		ubound := i + count.
+		while (i < ubound)
 		{
 			divd := divd_base * i.
 			divr := divr_base * i.
@@ -375,8 +376,13 @@ extend MyObject
 			## 150-154
 			[ (8113063330913503995887611892379812731289731289312898971231 div: 8113063330913503995887611892379812731289731289312898971230) = 1 ],
 			[ (8113063330913503995887611892379812731289731289312898971231 rem: 8113063330913503995887611892379812731289731289312898971230) = 1 ],
-			[ self testBigintDiv: 100919283908998345873248972389472389791283789123712899089034258903482398198123912831 divisor: 129323482374892374238974238974238947328972389128387312892713891728391278 ],
-			[ self testBigintDiv: 234897230919283908998345873248972389472389791283789123712899089034258903482398198123912831 divisor: 129323482374892374238974238974238947328972389128387312892713891728391278 ],
+			[ self testBigintDiv: 100919283908998345873248972389472389791283789123712899089034258903482398198123912831 divisor: 129323482374892374238974238974238947328972389128387312892713891728391278 count: 2000 ],
+			[ self testBigintDiv: 234897230919283908998345873248972389472389791283789123712899089034258903482398198123912831 divisor: 12932348237489237423897423897423894732897238912838731289271389172839127 count: 2000 ],
+			[ self testBigintDiv: 16r1234567812345678123456781234567812345678123456781234567812345678123456781234567812345678 divisor: 16r12345678123456781234567812345678123456781234567812345678 count: 2000 ],
+			[ self testBigintDiv: 16r000089ab0000456700000123 divisor: 1 count: 2000 ],
+			[ self testBigintDiv: 16r000000030000000000008000 divisor: 16r000000010000000000002000 count: 2000 ],
+			[ self testBigintDiv: 16rfffffffe0000000080000000 divisor: 16r0000ffff0000000080000000 count: 12345 ],
+			[ self testBigintDiv: 16rfffffffffffffffe divisor: 16rffffffff count: 2000 ],
 
 
 			## =========================
diff --git a/moo/lib/bigint.c b/moo/lib/bigint.c
index e4a8449..18bc364 100644
--- a/moo/lib/bigint.c
+++ b/moo/lib/bigint.c
@@ -134,6 +134,41 @@ static const moo_uint8_t debruijn_64[64] =
 #	define LOG2_FOR_POW2_64(x) (debruijn_64[(moo_uint64_t)((moo_uint64_t)(x) * 0x022fdd63cc95386d) >> 58])
 #endif
 
+static MOO_INLINE int get_num_leading_zero_bits (moo_oow_t x)
+{
+#if defined(MOO_HAVE_BUILTIN_CLZLL) && (MOO_SIZEOF_OOW_T == MOO_SIZEOF_LONG_LONG)
+	return __builtin_clzll(x); /* count the number of leading zeros */
+#elif defined(MOO_HAVE_BUILTIN_CLZL) && (MOO_SIZEOF_OOW_T == MOO_SIZEOF_LONG)
+	return __builtin_clzl(x); /* count the number leading zeros */
+#elif defined(MOO_HAVE_BUILTIN_CLZ) && (MOO_SIZEOF_OOW_T == MOO_SIZEOF_INT)
+	return __builtin_clz(x); /* count the number of leading zeros */
+#elif defined(__GNUC__) && (defined(__x86_64) || defined(__amd64) || defined(__i386) || defined(i386))
+	/* bit scan reverse. not all x86 CPUs have LZCNT. */
+	moo_oow_t pos;
+	__asm__ volatile (
+		"bsr %1,%0\n\t"
+		: "=r"(pos) /* output */
+		: "r"(x) /* input */
+	);
+	return (int)((MOO_SIZEOF(pos) * 8) - pos);
+#elif defined(__GNUC__) && (defined(__arm__)
+	moo_oow_t count;
+	__asm__ volatile (
+		"clz %0,%1\n\t"
+		: "=r"(count) /* output */
+		: "r"(x) /* input */
+	);
+	return (int)count;
+#else
+	int count = 0;
+	while (x >= 0) 
+	{
+		x <<= 1;
+		count++;
+	}
+#endif
+}
+
 static MOO_INLINE int get_pos_of_msb_set (moo_oow_t x)
 {
 	/* the caller must ensure that x is power of 2. if x happens to be zero,
@@ -1802,20 +1837,17 @@ static void divide_unsigned_array2 (moo_t* moo, const moo_liw_t* x, moo_oow_t xs
 	for (i = 0; i < ys; i++) r[i] = y[i]; /* copy y to r */
 
 	y1 = r[ys - 1]; /* highest divisor word */
-	d = (y1 == MOO_TYPE_MAX(moo_liw_t)? ((moo_liw_t)1): ((moo_liw_t)(((moo_lidw_t)1 << MOO_LIW_BITS) / (y1 + 1))));
+
+
+	//d = (y1 == MOO_TYPE_MAX(moo_liw_t)? ((moo_liw_t)1): ((moo_liw_t)(((moo_lidw_t)1 << MOO_LIW_BITS) / (y1 + 1))));
+	d = (moo_liw_t)(((moo_lidw_t)1 << MOO_LIW_BITS) / ((moo_lidw_t)y1 + 1));
 	if (d > 1)
 	{
 		moo_lidw_t dw;
 		moo_liw_t carry;
 
-		/* q = q * d */
-		for (carry = 0, i = 0; i < xs; i++)
-		{
-			dw = ((moo_lidw_t)q[i] * d) + carry;
-			carry = (moo_liw_t)(dw >> MOO_LIW_BITS);
-			q[i] = (moo_liw_t)dw;
-		}
-		q[xs] = carry;
+		/* shift the divisor such that its high-order bit is on.
+		 * shift the dividend the same amount as the previous step */
 
 		/* r = r * d */
 		for (carry = 0, i = 0; i < ys; i++)
@@ -1825,6 +1857,15 @@ static void divide_unsigned_array2 (moo_t* moo, const moo_liw_t* x, moo_oow_t xs
 			r[i] = (moo_liw_t)dw;
 		}
 		MOO_ASSERT (moo, carry == 0);
+
+		/* q = q * d */
+		for (carry = 0, i = 0; i < xs; i++)
+		{
+			dw = ((moo_lidw_t)q[i] * d) + carry;
+			carry = (moo_liw_t)(dw >> MOO_LIW_BITS);
+			q[i] = (moo_liw_t)dw;
+		}
+		q[xs] = carry;
 	}
 
 	y1 = r[ys - 1];
@@ -1842,6 +1883,7 @@ static void divide_unsigned_array2 (moo_t* moo, const moo_liw_t* x, moo_oow_t xs
 		xhi = q[i];
 		xlo = q[i - 1];
 
+//printf ("AAAAAAAAAAAaa %llx %llx\n", (unsigned long long)xhi, (unsigned long long)y1);
 		if (xhi == y1) 
 		{
 			quo = MOO_TYPE_MAX(moo_liw_t);
@@ -1850,60 +1892,57 @@ static void divide_unsigned_array2 (moo_t* moo, const moo_liw_t* x, moo_oow_t xs
 		{
 			dw = ((moo_lidw_t)xhi << MOO_LIW_BITS) + xlo;
 			/* TODO: optimize it with ASM - no seperate / and % */
-			quo = (moo_liw_t)(dw / y1);
-			q[i] = (moo_liw_t)(dw % y1); 
+			quo = (moo_liw_t)(dw / y1); /* qhat */
+			q[i] = (moo_liw_t)(dw % y1);  /* rhat */
+
+MOO_ASSERT (moo, (dw / y1) == quo);
 		}
 
 		/* ---------------------------------------------------------- */
 		/*quo = adjust_for_over_estimate(y1, y2, quo, q[i], q[i - 2]);*/
-	{
-		moo_liw_t c, c2, y2q;
 
+//printf ("qhat == %llx rhat == %llx dw = %llx r[ys-1]  = %llx   /// %llx %llx\n", (unsigned long long)qhat, (unsigned long long)rhat, (unsigned long long)dw, (unsigned long long)r[ys-1], (unsigned long long)qq[j + ys], (unsigned long long)qq[j + ys - 1]);
+#if 0
+{
+moo_lidw_t rhat = q[i];
+	adjust_quotient:
+		if (quo > MOO_TYPE_MAX(moo_liw_t) || ((moo_lidw_t)quo * y2) > (((moo_lidw_t)rhat << MOO_LIW_BITS) + q[i - 2]))
+		{
+			quo = quo - 1;
+			rhat = rhat + y1;
+			if (rhat <= MOO_TYPE_MAX(moo_liw_t)) goto adjust_quotient;
+		}
+}
+#else
+	{
+		moo_liw_t c, c2, y2q, rhat;
+
+//printf ("XXXXXXXXXXXXXXXXXXXXXx %d %d\n", (int)xs, (int)i);
+rhat = q[i];
+adjust_quotient:
 		dw = ((moo_lidw_t)quo * y2);
 		c = (moo_liw_t)(dw >> MOO_LIW_BITS);
 		y2q = (moo_liw_t)dw;
-
-/*
-x1 => c
-x2 => y2q
-ujn2 => q[i - 2]
-rhat => q[i]
-
-for greaterThan(x1, x2, rhat, ujn2) 
-
-func greaterThan(x1, x2, y1, y2 Word) bool {
-	return x1 > y1 || x1 == y1 && x2 > y2
-}*/
-
-//printf ("XXXXXXXXXXXXXXXXXXXXXx %d %d\n", (int)xs, (int)i);
-		if (c > q[i] || (c == q[i] && y2q > q[i - 2])) 
+		if (c > rhat || (c == rhat && y2q > q[i - 2])) 
 		{
 //printf ("ADJUST...............%llu %llu %llu %llu\n", (unsigned long long)c,  (unsigned long long)q[i],  (unsigned long long)y2q,  (unsigned long long)q[i - 2]);
-			--quo; /* too large by 1 */
+			--quo; /* too large */
 
-			/*xpy(xi, y1, c2); add back divisor */
-			c2 = (moo_liw_t)(((moo_lidw_t)q[i] + y1) >> MOO_LIW_BITS);
-			if (c2 == 0) 
+			dw = (moo_lidw_t)q[i] + y1; /* add back the divisor */
+			if (dw <= MOO_TYPE_MAX(moo_liw_t)) /* if ((dw >> MOO_LIW_BITS) == 0) */
 			{
-				/*y2q = axpy(y2, q, 0, c);*/
-				dw = ((moo_lidw_t)quo * y2);
-				c = (moo_liw_t)(dw >> MOO_LIW_BITS);
-				y2q = (moo_liw_t)dw;
-				if (c > q[i] || (c == q[i] && y2q > q[i - 2])) 
-				{
-//printf ("ADJUST2...............%llu %llu %llu %llu\n", (unsigned long long)c,  (unsigned long long)q[i],  (unsigned long long)y2q,  (unsigned long long)q[i - 2]);
-					--quo; /* too large by 2 */
-				}
+				rhat = (moo_liw_t)dw;
+				goto adjust_quotient;
 			}
 		}
 	}
-
+#endif
 		/* ---------------------------------------------------------- */
 		b = calculate_remainder(moo, q, r, quo, i - ys, ys);
-		b = (moo_liw_t)((((moo_lidw_t)xhi - b) >> MOO_LIW_BITS) & 1);
+		b = (moo_liw_t)((((moo_lidw_t)xhi - b) >> MOO_LIW_BITS) & 1); /* is the sign bit set? */
 		if (b)
 		{
-			/* underflow */
+			/* yes. underflow */
 			b = adjust_for_underflow(q, r, i - ys, ys);
 			MOO_ASSERT (moo, b == 1);
 			q[i] = quo - 1;
@@ -1938,28 +1977,12 @@ func greaterThan(x1, x2, y1, y2 Word) bool {
 }
 
 
-static moo_oow_t nlz (moo_liw_t x) 
-{
-   moo_oow_t n;
-
-   if (x == 0) return(32);
-
-   n = 0;
-   if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
-   if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
-   if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
-   if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
-   if (x <= 0x7FFFFFFF) {n = n + 1;}
-
-   return n;
-}
-
 static void divide_unsigned_array3 (moo_t* moo, const moo_liw_t* x, moo_oow_t xs, const moo_liw_t* y, moo_oow_t ys, moo_liw_t* q, moo_liw_t* r)
 {
-	moo_oow_t i, j, s;
+	moo_oow_t s, i, j, g;
 	moo_lidw_t dw, qhat, rhat;
 	moo_lidi_t t, k;
-	moo_liw_t* qq;
+	moo_liw_t* qq, y1, y2;
 
 	/* the caller must ensure:
 	 *  - q can hold 'xs + 1' words and r can hold 'ys' words. 
@@ -2007,7 +2030,7 @@ for (i = 0; i < ys; i++) printf ("0x%llx, ", (unsigned long long)y[i]);
 printf ("};\n"); */
 //for (i = 0; i <= xs; i++) printf ("un %llx\n", (unsigned long long)q[i]);
 
-	s = nlz(y[ys - 1]);
+	s = get_num_leading_zero_bits(y[ys - 1]);
 	for (i = ys; i > 1; )
 	{
 		--i;
@@ -2028,79 +2051,62 @@ printf ("};\n"); */
 //for (i = 0; i < ys; i++) printf ("vn %llx\n", (unsigned long long)r[i]);
 //for (i = 0; i <= xs; i++) printf ("un %llx\n", (unsigned long long)qq[i]);
 
+	y1 = r[ys - 1];
+	y2 = r[ys - 2];
+
+/*
 	for (j = xs - ys + 1; j > 0; )
 	{
 		--j;
+*/
+
+
+	for (j = xs; j >= ys; --j)
+	{
+		g = j - ys; /* position where remainder begins in qq */
 
 		/* estimate */
-		dw = ((moo_lidw_t)qq[j + ys] << MOO_LIW_BITS) + qq[j + ys - 1];
-		qhat = dw / r[ys - 1];
-		rhat = dw - (qhat * r[ys - 1]);
+		dw = ((moo_lidw_t)qq[j] << MOO_LIW_BITS) + qq[j - 1];
+		qhat = dw / y1;
+		rhat = dw - (qhat * y1);
 
  //printf ("qhat == %llx rhat == %llx dw = %llx r[ys-1]  = %llx   /// %llx %llx\n", (unsigned long long)qhat, (unsigned long long)rhat, (unsigned long long)dw, (unsigned long long)r[ys-1], (unsigned long long)qq[j + ys], (unsigned long long)qq[j + ys - 1]);
 	adjust_quotient:
-		if (qhat > MOO_TYPE_MAX(moo_liw_t) || (qhat * r[ys - 2]) > ((rhat << MOO_LIW_BITS) + qq[j + ys - 2]))
+		if (qhat > MOO_TYPE_MAX(moo_liw_t) || (qhat * y2) > ((rhat << MOO_LIW_BITS) + qq[j - 2]))
 		{
 			qhat = qhat - 1;
-			rhat = rhat + r[ys - 1];
+			rhat = rhat + y1;
 			if (rhat <= MOO_TYPE_MAX(moo_liw_t)) goto adjust_quotient;
 		}
 
 //printf ("qhat == %llx rhat == %llx\n", (unsigned long long)qhat, (unsigned long long)rhat);
-#if 1
 		/* multiply and subtract */
 		for (k = 0, i = 0; i < ys; i++)
 		{
 			dw = qhat * r[i];
-			t = qq[j + i] - k - (dw & MOO_TYPE_MAX(moo_liw_t));
-			qq[j + i] = t;
+			t = qq[g + i] - k - (dw & MOO_TYPE_MAX(moo_liw_t));
+			qq[g + i] = t;
 			k = (dw >> MOO_LIW_BITS) - (t >> MOO_LIW_BITS);
 		}
-		t = qq[j + ys] - k;
-		qq[j + ys] = t;
+		MOO_ASSERT (moo, j == g + ys);
+		t = qq[j] - k;
+		qq[j] = t;
 
 		/* test remainder */
-		q[j] = qhat;
+		q[g] = qhat; /* store quotient */
 		if (t < 0)
 		{
-			q[j] = q[j] - 1;
+			q[g] = q[g] - 1; /* add back */
+
 			for (k = 0, i = 0; i < ys; i++)
 			{
-				t = (moo_lidw_t)qq[j + i] + r[i] + k;
-				qq[j + i] = (moo_liw_t)t;
+				t = (moo_lidw_t)qq[g + i] + r[i] + k;
+				qq[g + i] = (moo_liw_t)t;
 				k = t >> MOO_LIW_BITS;
 			}
-			qq[j + ys] += k;
+			MOO_ASSERT (moo, j == g + ys);
+			qq[j] += k;
 		}
-#else
-
-		for (k = 0, i = 0; i <= n; i++) 
-		{
-			p = qhat * (i == n ? 0 : v[i]);
-			k2 = (p >> 16);
-
-			d = u[j + i] - (uint16_t)p;
-			k2 += (d > u[j + i]);
-
-			u[j + i] = d - k;
-			k2 += (u[j + i] > d);
-
-			k = k2;
-		}
-
-		/* Test remainder. */
-		q[j] = qhat;
-		if (k != 0) {
-		    /* Add back. */
-		    q[j]--;
-		    k = 0;
-		    for (i = 0; i < n; i++) {
-				  t = u[j + i] + v[i] + k;
-				  u[j + i] = (uint16_t)t;
-				  k = t >> 16;
-		    }
-		 
-#endif
 	}
 
 	for (i = 0; i < ys - 1; i++)
@@ -2270,7 +2276,7 @@ static moo_oop_t divide_unsigned_integers (moo_t* moo, moo_oop_t x, moo_oop_t y,
 	if (!rr) return MOO_NULL;
 
 #if defined(USE_DIVIDE_UNSIGNED_ARRAY2)
-	divide_unsigned_array3 (moo,
+	divide_unsigned_array2 (moo,
 #else
 	divide_unsigned_array (moo,
 #endif
@@ -2897,7 +2903,6 @@ oops_einval:
 	return MOO_NULL;
 }
 
-
 moo_oop_t moo_negateint (moo_t* moo, moo_oop_t x)
 {
 	if (MOO_OOP_IS_SMOOI(x))
diff --git a/moo/lib/moo-cmn.h b/moo/lib/moo-cmn.h
index 5b1121a..82a7e0f 100644
--- a/moo/lib/moo-cmn.h
+++ b/moo/lib/moo-cmn.h
@@ -853,6 +853,16 @@ typedef struct moo_t moo_t;
 		#define MOO_HAVE_BUILTIN_CTZLL
 	#endif
 
+	#if __has_builtin(__builtin_clz)
+		#define MOO_HAVE_BUILTIN_CLZ
+	#endif
+	#if __has_builtin(__builtin_clzl)
+		#define MOO_HAVE_BUILTIN_CLZL
+	#endif
+	#if __has_builtin(__builtin_clzll)
+		#define MOO_HAVE_BUILTIN_CLZLL
+	#endif
+
 	#if __has_builtin(__builtin_uadd_overflow)
 		#define MOO_HAVE_BUILTIN_UADD_OVERFLOW 
 	#endif