~mpu/qbe

Simplify int mul/udiv/urem of 2^N into shl/shr/and. v1 PROPOSED

Roland Paterson-Jones: 1
 Simplify int mul/udiv/urem of 2^N into shl/shr/and.

 1 files changed, 58 insertions(+), 3 deletions(-)
I pushed a lightly edited version of this patch on master.

The only subtlety I noted is that it may call getcon() with
an unsigned long long that overflows int64_t (namely 2**63).
I think on all the platforms supported the implementation-
defined behavior will be the right one though.
Hi,

Following up on that I think we should not systematically
go to 'shl' for the multiplications. Shifts on amd64 are
annoying due to their register constraints, so we will
likely get better performance if we compile them to leas
and adds when possible.
There is a fun dynamic programming algorithm to write to
multiply by small constants.
Happy hacking.



          
          
          
          
Next
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~mpu/qbe/patches/53280/mbox | git am -3
Learn more about email & git

[PATCH] Simplify int mul/udiv/urem of 2^N into shl/shr/and. Export this patch

Passes the "standard" test suite.

(cproc bootstrap, hare[c] make test, roland units, linpack/coremark run)

However linpack benchmark is now notably slower. Coremark is ~2% faster.

As noticed before, linmark timing is dubious, and maybe my cheap (AMD) laptop
prefers mul to shl.
---
 simpl.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 3 deletions(-)

diff --git a/simpl.c b/simpl.c
index 7001301..7e10f2b 100644
--- a/simpl.c
+++ b/simpl.c
@@ -30,15 +30,52 @@ blit(Ref sd[2], int sz, Fn *fn)
		}
}

static int
isconbits(Fn *fn, Ref r)
{
	return rtype(r) == RCon && fn->con[r.val].type == CBits;
}

static int
ispow2(uint64_t v)
{
	return (v & (v - 1)) == 0;
}

static int LOG2_TAB64[64] = {
	63,  0, 58,  1, 59, 47, 53,  2,
	60, 39, 48, 27, 54, 33, 42,  3,
	61, 51, 37, 40, 49, 18, 28, 20,
	55, 30, 34, 11, 43, 14, 22,  4,
	62, 57, 46, 52, 38, 26, 32, 41,
	50, 36, 17, 19, 29, 10, 13, 21,
	56, 45, 25, 31, 35, 16,  9, 12,
	44, 24, 15,  8, 23,  7,  6,  5};

/* caveat: only works on 64-bit systems */
static int log2_64 (uint64_t value)
{
	assert(sizeof(unsigned long) == 8);
	value |= value >> 1;
	value |= value >> 2;
	value |= value >> 4;
	value |= value >> 8;
	value |= value >> 16;
	value |= value >> 32;
	return LOG2_TAB64[((value - (value >> 1))*0x07EDD5E59A4E28C2UL) >> 58];
}

static void
ins(Ins **pi, int *new, Blk *b, Fn *fn)
{
	ulong ni;
	Con *c;
	Ins *i;
	int shift;

	i = *pi;
	/* simplify more instructions here;
	 * copy 0 into xor, mul 2^n into shift,
	 * copy 0 into xor, mul 2^n into shift (DONE),
	 * bit rotations, ... */
	switch (i->op) {
	case Oblit1:
@@ -53,12 +90,30 @@ ins(Ins **pi, int *new, Blk *b, Fn *fn)
		}
		blit((i-1)->arg, rsval(i->arg[0]), fn);
		*pi = i-1;
		return; /* finished */
	case Omul:
	case Oudiv:
	case Ourem:
		if (!KBASE(i->cls))
		if (isconbits(fn, i->arg[1])) {
			c = &fn->con[i->arg[1].val];
			if (ispow2(c->bits.i)) {
				shift = log2_64(c->bits.i);
				if (i->op == Ourem) {
					i->op = Oand;
					i->arg[1] = getcon((1ul << shift)-1, fn);
				} else {
					i->op = i->op == Omul ? Oshl : Oshr;
					i->arg[1] = getcon(shift, fn);
				}
			}
		}
		break;
	default:
		if (*new)
			emiti(*i);
		break;
	}
	if (*new)
		emiti(*i);
}

void
-- 
2.34.1
Hi Roland,

Thanks for tackling this! I definitely want to merge something
like that before the next release; it's long overdue.