~mpu/qbe

This thread contains a patchset. You're looking at the original emails, but you may wish to use the patch review UI. Review patch

[PATCH] Global Value Numbering (GVN) / Global Code Motion (GCM) RFC 8

Details
Message ID
<20240801064225.302721-1-rolandpj@gmail.com>
DKIM signature
pass
Download raw message
Patch: +2113 -657
Changes since RFC 7:

Bug fixes:

- remove isbad4gcm() in GVN/GCM - it is unsound due to different state
    at GVN vs GCM time; replace with "reassociation" pass after GCM
- fix intra-blk use-before-def after GCM
- prevent GVN from deduping trapping instructions cos GCM will not
    move them
- remove cmp eq/ne identical arg copy detection for floating point, it
    is not valid for NaN
- fix cges/cged flagged as commutative in ops.h instead of cnes/cned
    respectively; just a typo

Minor features:

- copy detection handles cmp le/lt/ge/gt with identical args
- treat (integer) div/rem by non-zero constant as non-trapping
- eliminate add N/sub N pairs in copy detection
- maintain accurate tmp use in GVN; not strictly necessary but enables
    interim global state sanity checking
- "reassociation" of trivial constant offset load/store addresses, and
    cmp ops with point-of-use in pass after GCM
- normalise commutative op arg order - e.g. op con, tmp -> op tmp, con
    to simplify copy detection and GVN instruction dedup

Codebase:

- split out core copy detection and constant folding (back) out into
    copy.c, fold.c respectively; gvn.c was getting monolithic
- generic support for instruction moving in ins.c - used by GCM and
    reassoc
- new reassociation pass in reassoc.c
- other minor clean-up/refactor

Changes since RFC 6:

- More ext elimination in GVN by examination of def and use bit width
- elimination of redundant and mask by bit width examination
- Incorporation of Song's patch

Changes since RFC 5:

- avoidance of "bad" candidates for GVN/GCM - trivial address offset
    calculations, and comparisons
- more copy detection mostly around boolean values
- allow elimination of unused load, alloc, trapping instructions
- detection of trivial boolean v ? 1 : 0 phi patterns
- bug fix for (removal of) "chg" optimisation in ins recreation - it
    was missing removal of unused instructions in some cases

Testing - standard QBE, cproc, hare, harec, roland, coremark

    [but latest roland master is failing rust build]

Benchmark
- coremark is ~15%+ faster than master
- hare "HARETEST_INCLUDE='slow' make check" ~5% faster
---
 .gitignore             |   1 +
 Makefile               |   3 +-
 all.h                  |  57 +++-
 cfg.c                  |  43 +++
 copy.c                 | 485 ++++++++++++++++++++-------------
 fold.c                 | 347 ++----------------------
 gcm.c                  | 352 ++++++++++++++++++++++++
 gvn.c                  | 592 +++++++++++++++++++++++++++++++++++++++++
 ins.c                  | 197 ++++++++++++++
 main.c                 |  13 +-
 ops.h                  | 291 ++++++++++----------
 parse.c                |  14 +-
 reassoc.c              | 225 ++++++++++++++++
 ssa.c                  |   2 +-
 test/_gcm1.ssa         |  48 ++++
 test/_gcm2.ssa         |  43 +++
 test/_load-elim.ssa    |  17 ++
 test/gvn-live-dead.ssa |  19 ++
 util.c                 |  21 ++
 19 files changed, 2113 insertions(+), 657 deletions(-)
 create mode 100644 gcm.c
 create mode 100644 gvn.c
 create mode 100644 ins.c
 create mode 100644 reassoc.c
 create mode 100644 test/_gcm1.ssa
 create mode 100644 test/_gcm2.ssa
 create mode 100644 test/_load-elim.ssa
 create mode 100644 test/gvn-live-dead.ssa

diff --git a/.gitignore b/.gitignore
index afd08d7..09e4118 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@ qbe
config.h
.comfile
*.out
*~
diff --git a/Makefile b/Makefile
index 2482eb1..86a35a4 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,8 @@ PREFIX = /usr/local
BINDIR = $(PREFIX)/bin

COMMOBJ  = main.o util.o parse.o abi.o cfg.o mem.o ssa.o alias.o load.o \
           copy.o fold.o simpl.o live.o spill.o rega.o emit.o
           copy.o fold.o gvn.o gcm.o ins.o reassoc.o simpl.o live.o \
           spill.o rega.o emit.o
AMD64OBJ = amd64/targ.o amd64/sysv.o amd64/isel.o amd64/emit.o
ARM64OBJ = arm64/targ.o arm64/abi.o arm64/isel.o arm64/emit.o
RV64OBJ  = rv64/targ.o rv64/abi.o rv64/isel.o rv64/emit.o
diff --git a/all.h b/all.h
index 3479d27..546a123 100644
--- a/all.h
+++ b/all.h
@@ -191,6 +191,7 @@ enum {
#define INRANGE(x, l, u) ((unsigned)(x) - l <= u - l) /* linear in x */
#define isstore(o) INRANGE(o, Ostoreb, Ostored)
#define isload(o) INRANGE(o, Oloadsb, Oload)
#define isalloc(o) INRANGE(o, Oalloc4, Oalloc16)
#define isext(o) INRANGE(o, Oextsb, Oextuw)
#define ispar(o) INRANGE(o, Opar, Opare)
#define isarg(o) INRANGE(o, Oarg, Oargv)
@@ -214,8 +215,14 @@ struct Op {
	char *name;
	short argcls[2][4];
	uint canfold:1;
	uint hasid:1;
	uint idval:1; /* identity value 0/1 */
	uint hasid:1;     /* op identity value? */
	uint idval:1;     /* identity value 0/1 */
	uint commutes:1;  /* commutative op? */
	uint idemp:1;     /* idempotent op? */
	uint iscmpeq:1;   /* cmp eq/ne? */
	uint iscmplgte:1; /* cmp lt/gt/le/ge? */
	uint eqval:1;     /* 1 for eq; 0 for ne */
	uint ispinned:1;  /* GCM pinned op? */
};

struct Ins {
@@ -230,7 +237,8 @@ struct Phi {
	Ref *arg;
	Blk **blk;
	uint narg;
	int cls;
	short cls;
	uint visit:1;
	Phi *link;
};

@@ -251,6 +259,7 @@ struct Blk {

	Blk *idom;
	Blk *dom, *dlink;
	int domdpth;
	Blk **fron;
	uint nfron;

@@ -344,6 +353,7 @@ struct Tmp {
		Wuw
	} width;
	int visit;
	uint gcmbid;
};

struct Con {
@@ -448,6 +458,20 @@ struct Dat {
	char isstr;
};

typedef struct InsLoc InsLoc;

struct InsLoc {
	uint bid;
	uint insn;
};

typedef struct InsMov InsMov;

struct InsMov {
	InsLoc from;
	InsLoc to;
};

/* main.c */
extern Target T;
extern char debug['Z'+1];
@@ -488,10 +512,11 @@ int symeq(Sym, Sym);
Ref newcon(Con *, Fn *);
Ref getcon(int64_t, Fn *);
int addcon(Con *, Con *, int);
int isconbits(Fn *fn, Ref r, int64_t *v);
int istmpconbits(Fn *fn, Ins *i, int64_t *v);
void salloc(Ref, Ref, Fn *);
void dumpts(BSet *, Tmp *, FILE *);
void runmatch(uchar *, Num *, Ref, Ref *);

void bsinit(BSet *, uint);
void bszero(BSet *);
uint bscount(BSet *);
@@ -531,6 +556,8 @@ int sdom(Blk *, Blk *);
int dom(Blk *, Blk *);
void fillfron(Fn *);
void loopiter(Fn *, void (*)(Blk *, Blk *));
void filldomdpth(Fn *);
Blk *lca(Blk *, Blk *);
void fillloop(Fn *);
void simpljmp(Fn *);

@@ -550,15 +577,33 @@ int storesz(Ins *);
void loadopt(Fn *);

/* ssa.c */
void adduse(Tmp *, int, Blk *, ...);
void filluse(Fn *);
void ssa(Fn *);
void ssacheck(Fn *);

/* copy.c */
void copy(Fn *);
int iswu1(Fn *, Ref);
void narrowpars(Fn *);
Ref copyref(Fn *, Ins *);

/* fold.c */
void fold(Fn *);
Ref foldref(Fn *, Ins *);

/* gvn.c */
extern Ref con01[2];
void gvn(Fn *);

/* gcm.c */
int isfixed(Fn *, Ins *);
void gcm(Fn *);

/* ins.c */
void movins(Fn *, InsMov *, uint, int);
void fixub4d(Fn *);

/* reassoc.c */
void reassoc(Fn *);

/* simpl.c */
void simpl(Fn *);
diff --git a/cfg.c b/cfg.c
index 406c307..94d4d53 100644
--- a/cfg.c
+++ b/cfg.c
@@ -269,6 +269,49 @@ multloop(Blk *hd, Blk *b)
	b->loop *= 10;
}

void
filldomdpth(Fn *fn)
{
	Blk *b, *dom;
	uint dpth;

	for (b=fn->start; b; b=b->link)
		b->domdpth = -1;

	fn->start->domdpth = 0;

	for (b=fn->start; b; b=b->link) {
		if (b->domdpth != -1)
			continue;
		dpth = 1;
		for (dom = b->idom; dom->domdpth == -1; dom = dom->idom)
			dpth++;
		dpth += dom->domdpth;
		b->domdpth = dpth;
		for (dom = b->idom; dom->domdpth == -1; dom = dom->idom)
			dom->domdpth = --dpth;
	}
}

/* least common ancestor in dom tree */
Blk *
lca(Blk *b1, Blk *b2)
{
	if (!b1)
		return b2;
	if (!b2)
		return b1;
	while (b1->domdpth > b2->domdpth)
		b1 = b1->idom;
	while (b2->domdpth > b1->domdpth)
		b2 = b2->idom;
	while (b1 != b2) {
		b1 = b1->idom;
		b2 = b2->idom;
	}
	return b1;
}

void
fillloop(Fn *fn)
{
diff --git a/copy.c b/copy.c
index da3987f..d794aa9 100644
--- a/copy.c
+++ b/copy.c
@@ -1,217 +1,346 @@
#include "all.h"

static int
iscon(Ref r, int64_t bits, Fn *fn)
static uint
u64_wbits(uint64_t v)
{
	return rtype(r) == RCon
		&& fn->con[r.val].type == CBits
		&& fn->con[r.val].bits.i == bits;
	uint n;

	n = 0;
	if (v >> 32) { n += 32; v >>= 32; }
	if (v >> 16) { n += 16; v >>= 16; }
	if (v >>  8) { n +=  8; v >>=  8; }
	if (v >>  4) { n +=  4; v >>=  4; }
	if (v >>  2) { n +=  2; v >>=  2; }
	if (v >>  1) { n +=  1; v >>=  1; }
	return n+v;
}

static int
iscopy(Ins *i, Ref r, Fn *fn)
EXTSIGNED[] = { /*extsb*/1, /*extub*/0, /*extsh*/1, /*extuh*/0, /*extsw*/1, /*extuw*/0 };

static uint
EXTMAXW[] = { /*extsb*/7, /*extub*/8, /*extsh*/15, /*extuh*/16, /*extsw*/31, /*extuw*/32 };

static uint
EXTW[] = { /*extsb*/8, /*extub*/8, /*extsh*/16, /*extuh*/16, /*extsw*/32, /*extuw*/32 };

static uint
STW[] = { /*storeb*/8, /*storeh*/16, /*storew*/32, /*storel*/64, /*stores*/32, /*stored*/64 };

/* is the ref used only as a narrow value? */
static int
usewidthle(Fn *fn, Ref r, uint wbits)
{
	static bits extcpy[] = {
		[WFull] = 0,
		[Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
		[Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw),
		[Wsh] = BIT(Wsh) | BIT(Wsw),
		[Wuh] = BIT(Wuh) | BIT(Wuw),
		[Wsw] = BIT(Wsw),
		[Wuw] = BIT(Wuw),
	};
	Op *op;
	bits b;
	Tmp *t;
	Use *u;
	Phi *p;
	int b;
	Ins *i;
	Ref rc;
	int64_t v;

	if (i->op == Ocopy)
	if (isconbits(fn, r, &v))
	if (u64_wbits(v) <= wbits)
		return 1;
	op = &optab[i->op];
	if (op->hasid && KBASE(i->cls) == 0)
		return iscon(i->arg[1], op->idval, fn);
	if (!isext(i->op) || rtype(r) != RTmp)
	if (rtype(r) != RTmp)
		return 0;
	if (i->op == Oextsw || i->op == Oextuw)
	if (i->cls == Kw)
		return 1;

	t = &fn->tmp[r.val];
	assert(KBASE(t->cls) == 0);
	if (i->cls == Kl && t->cls == Kw)
	for (u = t->use; u < &t->use[t->nuse]; u++) {
		switch (u->type) {
		case UPhi:
			p = u->u.phi;
			if (p->visit)
				continue;
			p->visit = 1;
			b = usewidthle(fn, p->to, wbits);
			p->visit = 0;
			if (b)
				continue;
			break;
		case UIns:
			i = u->u.ins;
			assert(i != 0);
			if (i->op == Ocopy)
				if (usewidthle(fn, i->to, wbits))
					continue;
			if (isext(i->op)) {
				if (EXTW[i->op - Oextsb] <= wbits)
					continue;
				else
					if (usewidthle(fn, i->to, wbits))
						continue;;
			}
			if (i->op == Oand) {
				if (req(r, i->arg[0]))
					rc = i->arg[1];
				else {
					assert(req(r, i->arg[1]));
					rc = i->arg[0];
				}
				if (isconbits(fn, rc, &v))
				if (u64_wbits(v) <= wbits)
					continue;
				break;
			}
			if (isstore(i->op))
			if (req(r, i->arg[1]))
			if (STW[i->op - Ostoreb] > wbits)
				continue;
			break;
		default:
			break;
		}
		return 0;
	b = extcpy[t->width];
	return (BIT(Wsb + (i->op-Oextsb)) & b) != 0;
	}
	return 1;
}

static Ref
copyof(Ref r, Ref *cpy)
static Phi*
findphi(Fn *fn, uint bid, Ref to)
{
	if (rtype(r) == RTmp && !req(cpy[r.val], R))
		return cpy[r.val];
	return r;
	Phi *p;
	for (p = fn->rpo[bid]->phi; p; p = p->link)
		if (req(p->to, to))
			break;
	assert(p);
	return p;
}

/* detects a cluster of phis/copies redundant with 'r';
 * the algorithm is inspired by Section 3.2 of "Simple
 * and Efficient SSA Construction" by Braun M. et al.
 */
static void
phisimpl(Phi *p, Ref r, Ref *cpy, Use ***pstk, BSet *ts, BSet *as, Fn *fn)
static uint
uint_min(uint v1, uint v2)
{
	Use **stk, *u, *u1;
	uint nstk, a;
	int t;
	Ref r1;
	Phi *p0;

	bszero(ts);
	bszero(as);
	p0 = &(Phi){.narg = 0};
	stk = *pstk;
	nstk = 1;
	stk[0] = &(Use){.type = UPhi, .u.phi = p};
	while (nstk) {
		u = stk[--nstk];
		if (u->type == UIns && iscopy(u->u.ins, r, fn)) {
			p = p0;
			t = u->u.ins->to.val;
		}
		else if (u->type == UPhi) {
			p = u->u.phi;
			t = p->to.val;
		}
		else
			continue;
		if (bshas(ts, t))
			continue;
		bsset(ts, t);
		for (a=0; a<p->narg; a++) {
			r1 = copyof(p->arg[a], cpy);
			if (req(r1, r))
				continue;
			if (rtype(r1) != RTmp)
				return;
			bsset(as, r1.val);
	return v1 < v2 ? v1 : v2;
}

/* is the ref def a narrow value? */
static int
defwidthle(Fn *fn, Ref r, uint wbits)
{
	Tmp *t;
	Phi *p;
	Ins *i;
	uint n;
	int64_t v;
	int x;

	if (isconbits(fn, r, &v))
	if (u64_wbits(v) <= wbits)
		return 1;
	if (rtype(r) != RTmp)
		return 0;
	t = &fn->tmp[r.val];
	if (t->cls != Kw)
		return 0;
	i = t->def;
	if (i == 0) {
		/* phi def */
		p = findphi(fn, t->bid, r);
		if (p->visit)
			return 1;
		p->visit = 1;
		for (n = 0; n < p->narg; n++)
			if (!defwidthle(fn, p->arg[n], wbits)) {
				p->visit = 0;
				return 0;
			}
		p->visit = 0;
		return 1;
	}
	/* ins def */
	if (i->op == Ocopy)
                return defwidthle(fn, i->arg[0], wbits);
	if (i->op == Oshr || i->op == Osar) {
		if (isconbits(fn, i->arg[1], &v))
		if (0 < v && v <= 32) {
			if (i->op == Oshr && 32-v <= wbits)
				return 1;
			if (0 <= v && v < 32 && wbits < 32)
				return defwidthle(fn, i->arg[0], uint_min((i->op == Osar ? 31 : 32), wbits+v));
		}
		u = fn->tmp[t].use;
		u1 = &u[fn->tmp[t].nuse];
		vgrow(pstk, nstk+(u1-u));
		stk = *pstk;
		for (; u<u1; u++)
			stk[nstk++] = u;
		return defwidthle(fn, i->arg[0], wbits);
	}
	if (iscmp(i->op, &x, &x))
		return wbits >= 1;
	if (i->op == Oand)
		return defwidthle(fn, i->arg[0], wbits) || defwidthle(fn, i->arg[1], wbits);
	if (i->op == Oor || i->op == Oxor)
		return defwidthle(fn, i->arg[0], wbits) && defwidthle(fn, i->arg[1], wbits);
	if (isext(i->op)) {
		if (EXTSIGNED[i->op - Oextsb])
			return defwidthle(fn, i->arg[0], uint_min(wbits, EXTMAXW[i->op - Oextsb]));
		if (EXTW[i->op - Oextsb] <= wbits)
			return 1;
		return defwidthle(fn, i->arg[0], wbits);
	}
	bsdiff(as, ts);
	if (!bscount(as))
		for (t=0; bsiter(ts, &t); t++)
			cpy[t] = r;
	return 0;
}

static void
subst(Ref *pr, Ref *cpy)
/* is the ref a boolean - 0, 1 - value? */
int
iswu1(Fn *fn, Ref r)
{
	assert(rtype(*pr) != RTmp || !req(cpy[pr->val], R));
	*pr = copyof(*pr, cpy);
	return defwidthle(fn, r, 1);
}

/* requires use and dom, breaks use */
static int
isnarrowpar(Fn *fn, Ref r)
{
	Tmp *t;

	if (rtype(r) != RTmp)
		return 0;
	t = &fn->tmp[r.val];
	if (t->bid != fn->start->id || t->def == 0)
		return 0;
	return ispar(t->def->op);
}

/* Insert extub/extuh instructions in start for pars used only narrowly */
/* needs use; breaks use */
void
copy(Fn *fn)
narrowpars(Fn *fn)
{
	BSet ts[1], as[1];
	Use **stk;
	Phi *p, **pp;
	Ins *i;
	Blk *b;
	uint n, a, eq;
	Ref *cpy, r, r1;
	int t;

	bsinit(ts, fn->ntmp);
	bsinit(as, fn->ntmp);
	cpy = emalloc(fn->ntmp * sizeof cpy[0]);
	stk = vnew(10, sizeof stk[0], PHeap);

	/* 1. build the copy-of map */
	for (n=0; n<fn->nblk; n++) {
		b = fn->rpo[n];
		for (p=b->phi; p; p=p->link) {
			assert(rtype(p->to) == RTmp);
			if (!req(cpy[p->to.val], R))
				continue;
			eq = 0;
			r = R;
			for (a=0; a<p->narg; a++)
				if (p->blk[a]->id < n) {
					r1 = copyof(p->arg[a], cpy);
					if (req(r, R) || req(r, UNDEF))
						r = r1;
					if (req(r1, r) || req(r1, UNDEF))
						eq++;
				}
			assert(!req(r, R));
			if (rtype(r) == RTmp
			&& !dom(fn->rpo[fn->tmp[r.val].bid], b))
				cpy[p->to.val] = p->to;
			else if (eq == p->narg)
				cpy[p->to.val] = r;
			else {
				cpy[p->to.val] = p->to;
				phisimpl(p, r, cpy, &stk, ts, as, fn);
			}
		}
		for (i=b->ins; i<&b->ins[b->nins]; i++) {
			assert(rtype(i->to) <= RTmp);
			if (!req(cpy[i->to.val], R))
				continue;
			r = copyof(i->arg[0], cpy);
			if (iscopy(i, r, fn))
				cpy[i->to.val] = r;
			else
				cpy[i->to.val] = i->to;
	int loop;
	Ins *i, *ins;
	uint npar, nins;
	enum O extop;
	Ref r;

	/* only useful for functions with loops */
	loop = 0;
	for (b = fn->start; b; b = b->link)
		if (b->loop > 1) {
			loop = 1;
			break;
		}
	if (!loop)
		return;

	b = fn->start;
	npar = 0;

	for (i = b->ins; i < &b->ins[b->nins]; i++) {
		if (!ispar(i->op))
			break;
		npar++;
	}

	/* 2. remove redundant phis/copies
	 * and rewrite their uses */
	for (b=fn->start; b; b=b->link) {
		for (pp=&b->phi; (p=*pp);) {
			r = cpy[p->to.val];
			if (!req(r, p->to)) {
				*pp = p->link;
				continue;
			}
			for (a=0; a<p->narg; a++)
				subst(&p->arg[a], cpy);
			pp=&p->link;
		}
		for (i=b->ins; i<&b->ins[b->nins]; i++) {
			r = cpy[i->to.val];
			if (!req(r, i->to)) {
				*i = (Ins){.op = Onop};
				continue;
	if (npar == 0)
		return;

	nins = b->nins + npar;
	ins = alloc(nins * sizeof ins[0]);
	memcpy(ins, b->ins, npar * sizeof ins[0]);
	memcpy(ins + 2*npar, b->ins + npar, (b->nins - npar) * sizeof ins[0]);
	b->ins = ins;
	b->nins = nins;

	for (i = b->ins; i < &b->ins[b->nins]; i++) {
		if (!ispar(i->op))
			break;
		extop = Onop;
		if (i->cls == Kw)
			if (usewidthle(fn, i->to, 16)) {
				if (usewidthle(fn, i->to, 8))
					extop = Oextub;
				else
					extop = Oextuh;
			}
			subst(&i->arg[0], cpy);
			subst(&i->arg[1], cpy);
		if (extop == Onop) {
			*(i+npar) = (Ins) {.op = Onop};
		} else {
			r = newtmp("vw", i->cls, fn);
			*(i+npar) = (Ins) {.op = extop, .cls = i->cls, .to = i->to, .arg = {r}};
			i->to = r;
		}
		subst(&b->jmp.arg, cpy);
	}
}

	if (debug['C']) {
		fprintf(stderr, "\n> Copy information:");
		for (t=Tmp0; t<fn->ntmp; t++) {
			if (req(cpy[t], R)) {
				fprintf(stderr, "\n%10s not seen!",
					fn->tmp[t].name);
			}
			else if (!req(cpy[t], TMP(t))) {
				fprintf(stderr, "\n%10s copy of ",
					fn->tmp[t].name);
				printref(cpy[t], fn, stderr);
			}
/* used by GVN */
Ref
copyref(Fn *fn, Ins *i)
{
	static bits extcpy[] = {
		[WFull] = 0,
		[Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
		[Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw),
		[Wsh] = BIT(Wsh) | BIT(Wsw),
		[Wuh] = BIT(Wuh) | BIT(Wuw),
		[Wsw] = BIT(Wsw),
		[Wuw] = BIT(Wuw),
	};
	bits b;
	Tmp *t;
	Ins *i2;
	int64_t v, v1;

	if (i->op == Ocopy)
		return i->arg[0];

	if (optab[i->op].hasid)
	if (KBASE(i->cls) == 0) /* integer only */
	if (isconbits(fn, i->arg[1], &v))
	if (v == optab[i->op].idval)
	if (!optab[i->op].iscmpeq || iswu1(fn, i->arg[0]))
			return i->arg[0];

	if (optab[i->op].idemp)
	if (req(i->arg[0], i->arg[1]))
		return i->arg[1];

	if (optab[i->op].iscmpeq || optab[i->op].iscmplgte) {
		if (req(i->arg[0], i->arg[1]))
			return con01[optab[i->op].eqval];
		/* non-identical RCon's - assumes RCon's are deduped */
		if (optab[i->op].iscmpeq)
		if (rtype(i->arg[0]) == RCon && rtype(i->arg[1]) == RCon)
			return con01[1-optab[i->op].eqval];
	}

	/* redundant and mask */
	if (i->op == Oand)
	if (isconbits(fn, i->arg[1], &v))
	if (((v+1) & v) == 0) /* v == 2^N-1 */
	if (defwidthle(fn, i->arg[0], u64_wbits(v)))
		return i->arg[0];

	/* add/sub same conbits pair */
	if (i->op == Oadd || i->op == Osub)
	if (KBASE(i->cls) == 0)
	if (istmpconbits(fn, i, &v)) {
		i2 = fn->tmp[i->arg[0].val].def;
		if (i2 && (i2->op == Oadd || i2->op == Osub))
		if (istmpconbits(fn, i2, &v1))
		if (i->cls == fn->tmp[i2->arg[0].val].cls) { // TODO - check this, here and elsewhere
			if (i->op == i2->op) {
				if ((i->cls == Kw && (int32_t)v == -(int32_t)v1)
				    || (i->cls == Kl && v == -v1))
					return i2->arg[0];
			} else if (v == v1)
				return i2->arg[0];
		}
		fprintf(stderr, "\n\n> After copy elimination:\n");
		printfn(fn, stderr);
	}
	vfree(stk);
	free(cpy);

	if (!isext(i->op) || rtype(i->arg[0]) != RTmp)
		return R;
	if (i->op == Oextsw || i->op == Oextuw)
	if (i->cls == Kw)
		return i->arg[0];

	t = &fn->tmp[i->arg[0].val];
	assert(KBASE(t->cls) == 0);
	if (i->cls == Kl && t->cls == Kw)
		return R;
	b = extcpy[t->width];
	if ((BIT(Wsb + (i->op-Oextsb)) & b) != 0)
		return i->arg[0];

	if (!isnarrowpar(fn, i->arg[0]))
	if (usewidthle(fn, i->to, EXTW[i->op - Oextsb]))
		return i->arg[0];
	if (defwidthle(fn, i->arg[0], EXTMAXW[i->op - Oextsb]))
		return i->arg[0];

	return R;
}
diff --git a/fold.c b/fold.c
index 3ff0bc7..87e5824 100644
--- a/fold.c
+++ b/fold.c
@@ -1,22 +1,6 @@
#include "all.h"

enum {
	Bot = -1, /* lattice bottom */
	Top = 0,  /* lattice top (matches UNDEF) */
};

typedef struct Edge Edge;

struct Edge {
	int dest;
	int dead;
	Edge *work;
};

static int *val;
static Edge *flowrk, (*edge)[2];
static Use **usewrk;
static uint nuse;
/* boring folding code */

static int
iscon(Con *c, int w, uint64_t k)
@@ -29,304 +13,6 @@ iscon(Con *c, int w, uint64_t k)
		return (uint32_t)c->bits.i == (uint32_t)k;
}

static int
latval(Ref r)
{
	switch (rtype(r)) {
	case RTmp:
		return val[r.val];
	case RCon:
		return r.val;
	default:
		die("unreachable");
	}
}

static int
latmerge(int v, int m)
{
	return m == Top ? v : (v == Top || v == m) ? m : Bot;
}

static void
update(int t, int m, Fn *fn)
{
	Tmp *tmp;
	uint u;

	m = latmerge(val[t], m);
	if (m != val[t]) {
		tmp = &fn->tmp[t];
		for (u=0; u<tmp->nuse; u++) {
			vgrow(&usewrk, ++nuse);
			usewrk[nuse-1] = &tmp->use[u];
		}
		val[t] = m;
	}
}

static int
deadedge(int s, int d)
{
	Edge *e;

	e = edge[s];
	if (e[0].dest == d && !e[0].dead)
		return 0;
	if (e[1].dest == d && !e[1].dead)
		return 0;
	return 1;
}

static void
visitphi(Phi *p, int n, Fn *fn)
{
	int v;
	uint a;

	v = Top;
	for (a=0; a<p->narg; a++)
		if (!deadedge(p->blk[a]->id, n))
			v = latmerge(v, latval(p->arg[a]));
	update(p->to.val, v, fn);
}

static int opfold(int, int, Con *, Con *, Fn *);

static void
visitins(Ins *i, Fn *fn)
{
	int v, l, r;

	if (rtype(i->to) != RTmp)
		return;
	if (optab[i->op].canfold) {
		l = latval(i->arg[0]);
		if (!req(i->arg[1], R))
			r = latval(i->arg[1]);
		else
			r = CON_Z.val;
		if (l == Bot || r == Bot)
			v = Bot;
		else if (l == Top || r == Top)
			v = Top;
		else
			v = opfold(i->op, i->cls, &fn->con[l], &fn->con[r], fn);
	} else
		v = Bot;
	/* fprintf(stderr, "\nvisiting %s (%p)", optab[i->op].name, (void *)i); */
	update(i->to.val, v, fn);
}

static void
visitjmp(Blk *b, int n, Fn *fn)
{
	int l;

	switch (b->jmp.type) {
	case Jjnz:
		l = latval(b->jmp.arg);
		if (l == Bot) {
			edge[n][1].work = flowrk;
			edge[n][0].work = &edge[n][1];
			flowrk = &edge[n][0];
		}
		else if (iscon(&fn->con[l], 0, 0)) {
			assert(edge[n][0].dead);
			edge[n][1].work = flowrk;
			flowrk = &edge[n][1];
		}
		else {
			assert(edge[n][1].dead);
			edge[n][0].work = flowrk;
			flowrk = &edge[n][0];
		}
		break;
	case Jjmp:
		edge[n][0].work = flowrk;
		flowrk = &edge[n][0];
		break;
	case Jhlt:
		break;
	default:
		if (isret(b->jmp.type))
			break;
		die("unreachable");
	}
}

static void
initedge(Edge *e, Blk *s)
{
	if (s)
		e->dest = s->id;
	else
		e->dest = -1;
	e->dead = 1;
	e->work = 0;
}

static int
renref(Ref *r)
{
	int l;

	if (rtype(*r) == RTmp)
		if ((l=val[r->val]) != Bot) {
			*r = CON(l);
			return 1;
		}
	return 0;
}

/* require rpo, use, pred */
void
fold(Fn *fn)
{
	Edge *e, start;
	Use *u;
	Blk *b, **pb;
	Phi *p, **pp;
	Ins *i;
	int t, d;
	uint n, a;

	val = emalloc(fn->ntmp * sizeof val[0]);
	edge = emalloc(fn->nblk * sizeof edge[0]);
	usewrk = vnew(0, sizeof usewrk[0], PHeap);

	for (t=0; t<fn->ntmp; t++)
		val[t] = Top;
	for (n=0; n<fn->nblk; n++) {
		b = fn->rpo[n];
		b->visit = 0;
		initedge(&edge[n][0], b->s1);
		initedge(&edge[n][1], b->s2);
	}
	initedge(&start, fn->start);
	flowrk = &start;
	nuse = 0;

	/* 1. find out constants and dead cfg edges */
	for (;;) {
		e = flowrk;
		if (e) {
			flowrk = e->work;
			e->work = 0;
			if (e->dest == -1 || !e->dead)
				continue;
			e->dead = 0;
			n = e->dest;
			b = fn->rpo[n];
			for (p=b->phi; p; p=p->link)
				visitphi(p, n, fn);
			if (b->visit == 0) {
				for (i=b->ins; i<&b->ins[b->nins]; i++)
					visitins(i, fn);
				visitjmp(b, n, fn);
			}
			b->visit++;
			assert(b->jmp.type != Jjmp
				|| !edge[n][0].dead
				|| flowrk == &edge[n][0]);
		}
		else if (nuse) {
			u = usewrk[--nuse];
			n = u->bid;
			b = fn->rpo[n];
			if (b->visit == 0)
				continue;
			switch (u->type) {
			case UPhi:
				visitphi(u->u.phi, u->bid, fn);
				break;
			case UIns:
				visitins(u->u.ins, fn);
				break;
			case UJmp:
				visitjmp(b, n, fn);
				break;
			default:
				die("unreachable");
			}
		}
		else
			break;
	}

	if (debug['F']) {
		fprintf(stderr, "\n> SCCP findings:");
		for (t=Tmp0; t<fn->ntmp; t++) {
			if (val[t] == Bot)
				continue;
			fprintf(stderr, "\n%10s: ", fn->tmp[t].name);
			if (val[t] == Top)
				fprintf(stderr, "Top");
			else
				printref(CON(val[t]), fn, stderr);
		}
		fprintf(stderr, "\n dead code: ");
	}

	/* 2. trim dead code, replace constants */
	d = 0;
	for (pb=&fn->start; (b=*pb);) {
		if (b->visit == 0) {
			d = 1;
			if (debug['F'])
				fprintf(stderr, "%s ", b->name);
			edgedel(b, &b->s1);
			edgedel(b, &b->s2);
			*pb = b->link;
			continue;
		}
		for (pp=&b->phi; (p=*pp);)
			if (val[p->to.val] != Bot)
				*pp = p->link;
			else {
				for (a=0; a<p->narg; a++)
					if (!deadedge(p->blk[a]->id, b->id))
						renref(&p->arg[a]);
				pp = &p->link;
			}
		for (i=b->ins; i<&b->ins[b->nins]; i++)
			if (renref(&i->to))
				*i = (Ins){.op = Onop};
			else {
				for (n=0; n<2; n++)
					renref(&i->arg[n]);
				if (isstore(i->op))
				if (req(i->arg[0], UNDEF))
					*i = (Ins){.op = Onop};
			}
		renref(&b->jmp.arg);
		if (b->jmp.type == Jjnz && rtype(b->jmp.arg) == RCon) {
				if (iscon(&fn->con[b->jmp.arg.val], 0, 0)) {
					edgedel(b, &b->s1);
					b->s1 = b->s2;
					b->s2 = 0;
				} else
					edgedel(b, &b->s2);
				b->jmp.type = Jjmp;
				b->jmp.arg = R;
		}
		pb = &b->link;
	}

	if (debug['F']) {
		if (!d)
			fprintf(stderr, "(none)");
		fprintf(stderr, "\n\n> After constant folding:\n");
		printfn(fn, stderr);
	}

	free(val);
	free(edge);
	vfree(usewrk);
}

/* boring folding code */

static int
foldint(Con *res, int op, int w, Con *cl, Con *cr)
{
@@ -516,7 +202,7 @@ foldflt(Con *res, int op, int w, Con *cl, Con *cr)
	}
}

static int
static Ref
opfold(int op, int cls, Con *cl, Con *cr, Fn *fn)
{
	Ref r;
@@ -524,12 +210,37 @@ opfold(int op, int cls, Con *cl, Con *cr, Fn *fn)

	if (cls == Kw || cls == Kl) {
		if (foldint(&c, op, cls == Kl, cl, cr))
			return Bot;
			return R;
	} else
		foldflt(&c, op, cls == Kd, cl, cr);
	if (!KWIDE(cls))
		c.bits.i &= 0xffffffff;
	r = newcon(&c, fn);
	assert(!(cls == Ks || cls == Kd) || c.flt);
	return r.val;
	return r;
}

/* used by GVN */
Ref
foldref(Fn *fn, Ins *i)
{
	Ref rr;
	Con *cl, *cr;

	if (rtype(i->to) != RTmp)
		return R;
	if (optab[i->op].canfold) {
		if (rtype(i->arg[0]) != RCon)
			return R;
		cl = &fn->con[i->arg[0].val];
		rr = i->arg[1];
		if (req(rr, R))
		    rr = CON_Z;
		if (rtype(rr) != RCon)
			return R;
		cr = &fn->con[rr.val];

		return opfold(i->op, i->cls, cl, cr, fn);
	}
	return R;
}
diff --git a/gcm.c b/gcm.c
new file mode 100644
index 0000000..ca4dd4b
--- /dev/null
+++ b/gcm.c
@@ -0,0 +1,352 @@
#include "all.h"

#define NOBID (-1u)

/* ins can trap at runtime */
static int
istrapping(Fn *fn, Ins *i)
{
	int64_t v;

	if (KBASE(i->cls) == 0)
	if (INRANGE(i->op, Odiv, Ourem))
	if (!isconbits(fn, i->arg[1], &v) || v == 0)
		return 1;
	return 0;
}

/* fixed ins that can be eliminated if unused */
static int
canelim(Fn *fn, Ins *i)
{
	return isload(i->op) || isalloc(i->op) || istrapping(fn, i);
}

/* ins must stay in def blk */
int
isfixed(Fn *fn, Ins *i)
{
	return optab[i->op].ispinned || istrapping(fn, i);
}

static uint earlyins(Fn *, Blk *, Ins *);

/* return earlybid of ref def ins */
static uint
schedearly(Fn *fn, Ref r)
{
	Tmp *t;
	Blk *b;

	if (rtype(r) != RTmp)
		return 0 /* root/start */;

	t = &fn->tmp[r.val];
	if (t->gcmbid != NOBID)
		return t->gcmbid; /* already visited/visiting */

	b = fn->rpo[t->bid];
	if (t->def) {
		/* def is an ins */
		assert(b->ins <= t->def && t->def < &b->ins[b->nins]);
		t->gcmbid = 0; /* mark visiting root/start blk */
		t->gcmbid = earlyins(fn, b, t->def); /* schedule ins input defs */
	} else {
		/* def is a phi - it stays in its def blk */
		t->gcmbid = t->bid;
	}

	return t->gcmbid;
}

/* return deepest domdpth bid of arg defs */
static uint
earlyins(Fn *fn, Blk *b, Ins* i)
{
	uint earlybid, arg1earlybid;

	earlybid = schedearly(fn, i->arg[0]);
	assert(earlybid != NOBID);
	arg1earlybid = schedearly(fn, i->arg[1]);
	assert(arg1earlybid != NOBID);
	if (fn->rpo[earlybid]->domdpth < fn->rpo[arg1earlybid]->domdpth) {
		assert(dom(fn->rpo[earlybid], fn->rpo[arg1earlybid]));
		earlybid = arg1earlybid;
	}

	/* fixed ins remain in their defining blk */
	return isfixed(fn, i) ? b->id : earlybid;
}

static void
earlyblk(Fn *fn, uint bid)
{
	Blk *b;
	Phi *p;
	Ins *i;
	uint n;

	b = fn->rpo[bid];
	for (p = b->phi; p; p = p->link)
		for (n = 0; n < p->narg; n++)
			schedearly(fn, p->arg[n]);
	for (i = b->ins; i < &b->ins[b->nins]; i++)
		if (isfixed(fn, i))
			earlyins(fn, b, i);
	schedearly(fn, b->jmp.arg);
}

/* least common ancestor in dom tree */
static uint
lcabid(Fn *fn, uint bid1, uint bid2)
{
	Blk *b;

	if (bid1 == NOBID)
		return bid2;
	if (bid2 == NOBID)
		return bid1;

	b = lca(fn->rpo[bid1], fn->rpo[bid2]);
	assert(b);
	return b->id;
}

static uint
bestbid(Fn *fn, uint earlybid, uint latebid)
{
	Blk *curb, *earlyb, *bestb;

	if (latebid == NOBID)
		return NOBID; /* unused */

	assert(earlybid != NOBID);

	earlyb = fn->rpo[earlybid];
	bestb = curb = fn->rpo[latebid];
	assert(dom(earlyb, curb));

	while (curb != earlyb) {
		curb = curb->idom;
		if (curb->loop < bestb->loop)
			bestb = curb;
	}
	return bestb->id;
}

static uint lateins(Fn *, Blk *, Ins *, Ref r);
static uint latephi(Fn *, Phi *, Ref r);
static uint latejmp(Blk *, Ref r);

/* return lca bid of ref uses */
static uint
schedlate(Fn *fn, Ref r)
{
	Tmp *t;
	Blk *b;
	Use *u;
	uint earlybid;
	uint latebid;
	uint uselatebid;

	if (rtype(r) != RTmp)
		return NOBID;

	t = &fn->tmp[r.val];
	if (t->visit)
		return t->gcmbid; /* already visited/visiting */

	t->visit = 1; /* mark visiting/visited */
	earlybid = t->gcmbid;
	if (earlybid == NOBID)
		return NOBID; /* not used */
	t->gcmbid = t->bid; /* t->gcmbid is now latebid */

	latebid = NOBID;
	for (u = t->use; u < &t->use[t->nuse]; u++) {
		assert(u->bid < fn->nblk);
		b = fn->rpo[u->bid];
		switch (u->type) {
		case UXXX:
			die("unreachable");
			break;
		case UPhi:
			uselatebid = latephi(fn, u->u.phi, r);
			break;
		case UIns:
			uselatebid = lateins(fn, b, u->u.ins, r);
			break;
		case UJmp:
			uselatebid = latejmp(b, r);
			break;
		}
		latebid = lcabid(fn, latebid, uselatebid);
	}

	/* phis stay in their def blk */
	if (t->def) {
		/* allow elimination of unused load/alloc/trapping ins */
		if (latebid == NOBID && canelim(fn, t->def))
			t->gcmbid = NOBID;
		/* ... otherwise fixed ins stay in defining blk */
		else if(!isfixed(fn, t->def))
			t->gcmbid = bestbid(fn, earlybid, latebid);
	}

	return t->gcmbid;
}

/* return lca bid of uses, or NOBID if no active uses */
static uint
lateins(Fn *fn, Blk *b, Ins *i, Ref r)
{
	uint latebid;

	assert(i->op == Onop || req(i->arg[0], r) || req(i->arg[1], r));
	if (i->op == Onop)
		return NOBID; /* already eliminated */

	assert(b->ins <= i && i < &b->ins[b->nins]);

	latebid = schedlate(fn, i->to);
	/* allow elimination of unused load/alloc/trapping ins */
	if (latebid == NOBID)
	if (canelim(fn, i))
		return NOBID;
	/* ... otherwise fixed ins stay in defining blk */
	return isfixed(fn, i) ? b->id : latebid;
}

static uint
latephi(Fn *fn, Phi *p, Ref r)
{
	uint n;
	uint latebid;

	if (!p->narg)
		return NOBID; /* marked as unused */

	latebid = NOBID;
	for (n = 0; n < p->narg; n++)
		if (req(p->arg[n], r))
			latebid = lcabid(fn, latebid, p->blk[n]->id);

	assert(latebid != NOBID);
	return latebid;
}

static uint
latejmp(Blk *b, Ref r)
{
	if (req(b->jmp.arg, R))
		return NOBID;
	else {
		assert(req(b->jmp.arg, r));
		return b->id;
	}
}

static void
lateblk(Fn *fn, uint bid)
{
	Blk *b;
	Phi **pp;
	Ins *i;

	b = fn->rpo[bid];
	for (pp = &b->phi; *(pp);)
		if (schedlate(fn, (*pp)->to) == NOBID) {
			/* unused */
			(*pp)->narg = 0; /* mark unused */
			*pp = (*pp)->link; /* remove phi */
		} else
			pp = &(*pp)->link;

	for (i = b->ins; i < &b->ins[b->nins]; i++)
		if (isfixed(fn, i))
		    lateins(fn, b, i, i->arg[0]);
}

/* move instructions */
static void
sched(Fn *fn)
{
	Blk *b, *b2;
	Ins *i;
	Tmp *t;
	InsMov *im;
	uint nim;

	nim = 0;
	im = vnew(nim, sizeof im[0], PHeap);

	for (b = fn->start; b; b = b->link) {
		for (i = b->ins; i < &b->ins[b->nins]; i++) {
			if (rtype(i->to) != RTmp)
				continue;
			if (isfixed(fn, i) && !canelim(fn, i))
				continue;
			assert(rtype(i->to) == RTmp);
			t = &fn->tmp[i->to.val];
			if (t->gcmbid == NOBID)
				*i = (Ins){.op = Onop};
			else if (t->gcmbid != b->id) {
				b2 = fn->rpo[t->gcmbid];
				vgrow(&im, ++nim);
				im[nim-1] = (InsMov){
					.from = (InsLoc){
						.bid = b->id,
						.insn = i - b->ins,
					},
					.to = (InsLoc){
						.bid = t->gcmbid,
						.insn = b->id < t->gcmbid ? 0 : b2->nins,
					}
				};
			}
		}
	}

	movins(fn, im, nim, 1/*del*/);

	vfree(im);
}

static void
cleartmps(Fn *fn)
{
	Tmp *t;

	for (t=&fn->tmp[Tmp0]; t < &fn->tmp[fn->ntmp]; t++) {
		t->visit = 0;
		t->gcmbid = NOBID;
	}
}

/* https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf */
/* require use dom */
/* maintains rpo pred dom */
/* breaks use */
void
gcm(Fn *fn)
{
	uint bid;

	filldomdpth(fn);
	fillloop(fn);

	cleartmps(fn);
	for (bid=0; bid<fn->nblk; bid++)
		earlyblk(fn, bid);
	for (bid=0; bid<fn->nblk; bid++)
		lateblk(fn, bid);
	sched(fn);
	cleartmps(fn);
	filluse(fn);
	fixub4d(fn);

	if (debug['G']) {
		fprintf(stderr, "\n> After GCM:\n");
		printfn(fn, stderr);
	}
}
diff --git a/gvn.c b/gvn.c
new file mode 100644
index 0000000..3af3517
--- /dev/null
+++ b/gvn.c
@@ -0,0 +1,592 @@
#include "all.h"

/* blk unreachable? */
static int
isdead(Fn *fn, Blk *b) {
	if (b == fn->start)
		return 0;
	if (b->npred == 0)
		return 1;
	if (b->npred == 1 && b->pred[0] == b)
		return 1;
	return 0;
}

/* literal constants 0, 1 */
Ref con01[2];

static inline uint
mix(uint x0, uint x1)
{
	return x0 + 17*x1;
}

static inline uint
rhash(Ref r)
{
	return mix(r.type, r.val);
}

static uint
ihash(Ins *i)
{
	uint a0h, a1h, ah, h;

	a0h = rhash(i->arg[0]);
	a1h = rhash(i->arg[1]);
	ah = mix(a0h, a1h);
	h = mix(i->cls, i->op);
	h = mix(h, ah);

	return h;

}

static int
ieq(Ins *ia, Ins *ib)
{
	if (ia->cls == ib->cls)
	if (ia->op == ib->op)
	if (req(ia->arg[0], ib->arg[0]))
	if (req(ia->arg[1], ib->arg[1]))
		return 1;
	return 0;
}

static Ins** gvntbl;
static uint gvntbln;
static uint lookupn;
static uint proben;
static uint maxproben;

static Ins *
gvndup(Ins *i, int insert) {
	uint idx, n;
	Ins *i2;

	lookupn++;

	idx = ihash(i) % gvntbln;
	for (n = 1;; n++) {
		proben++;
		if (n > maxproben)
			maxproben = n;
		i2 = gvntbl[idx];
		if (!i2)
			break;
		if (ieq(i, i2))
			return i2;

		idx++;
		if (gvntbln <= idx)
			idx = 0;
	}
	/* not found */
	if (insert) {
		gvntbl[idx] = i;
	}
	return 0;
}

static void
replaceref(Ref *r, Ref r1, Ref r2)
{
	if (req(*r, r1))
		*r = r2;
}

static void
replacepuse(Phi *p, Ref r1, Ref r2)
{
	Ref *a;

	for (a = p->arg; a < &p->arg[p->narg]; a++)
		replaceref(a, r1, r2);
}

static void
replaceiuse(Ins *i, Ref r1, Ref r2)
{
	replaceref(&i->arg[0], r1, r2);
	replaceref(&i->arg[1], r1, r2);
}

static void
replacejuse(Blk* b, Ref r1, Ref r2)
{
	replaceref(&b->jmp.arg, r1, r2);
}

static void
replaceuse(Fn *fn, Use* u, Ref r1, Ref r2)
{
	Tmp *t2;
	Blk *b;

	t2 = rtype(r2) == RTmp ? &fn->tmp[r2.val] : 0;
	b = fn->rpo[u->bid];

	switch (u->type) {
	case UXXX:
		die("unreachable");
		break;
	case UPhi:
		replacepuse(u->u.phi, r1, r2);
		if (t2)
			adduse(t2, UPhi, b, u->u.phi);
		break;
	case UIns:
		replaceiuse(u->u.ins, r1, r2);
		if (t2)
			adduse(t2, UIns, b, u->u.ins);
		break;
	case UJmp:
		replacejuse(fn->rpo[u->bid], r1, r2);
		if (t2)
			adduse(t2, UJmp, b);
		break;
	}
}

static void
replaceuses(Fn *fn, Ref r1, Ref r2)
{
	Tmp *t1;
	Use *u;

	assert(rtype(r1) == RTmp);
	t1 = &fn->tmp[r1.val];

	for (u = t1->use; u < &t1->use[t1->nuse]; u++)
		replaceuse(fn, u, r1, r2);

	t1->nuse = 0;
}

static void
rmuse(Fn *fn, Blk *b, uint ty, Ins *i, Phi *p, Ref r, int strict)
{
	Tmp *t;
	Use *u;
	int found, rm;

	if (rtype(r) != RTmp)
		return;
	found = 0;
	t = &fn->tmp[r.val];
	for (u = t->use; u < &t->use[t->nuse];) {
		rm = 0;
		if (u->type == ty) {
			switch (ty) {
			case UXXX:
				die("unreachable");
				break;
			case UIns:
				assert(p == 0);
				assert(i);
				rm = u->u.ins == i;
				break;
			case UPhi:
				assert(i == 0);
				assert(p);
				rm = u->u.phi == p;
				break;
			case UJmp:
				assert(i == 0);
				assert(p == 0);
				rm = u->bid == b->id;
				break;
			default:
				die("unreachable");
				break;
			}
		}
		if (rm) {
			found++;
			assert(u < &t->use[t->nuse]);
			assert(u->bid == b->id);
			/* careful - deleting below iterator */
			memcpy(u, u+1, ((t->nuse) - ((u+1)-t->use)) * sizeof u[0]);
			t->nuse--;
		} else
			u++;
	}
	if (strict)
		assert(found);
}

/* sort phi args in blk pred order */
static void
phinorm(Blk *b, Phi *p)
{
	Blk *pred, *btmp;
	Ref atmp;
	uint n, n1;

	assert(b->npred == p->narg);
	/* bubble sort :) */
	for (n = 0; n < b->npred; n++) {
		pred = b->pred[n];
		if (p->blk[n] != pred) {
			for (n1 = n+1; n1 < b->npred; n1++)
				if (p->blk[n1] == pred)
					break;
			assert(n1 < b->npred);
			atmp = p->arg[n1];
			btmp = p->blk[n1];
			p->arg[n1] = p->arg[n];
			p->blk[n1] = p->blk[n];
			p->arg[n] = atmp;
			p->blk[n] = btmp;
		}
	}
}

static int
rcmp(Ref a, Ref b)
{
	if (rtype(a) != rtype(b))
		return rtype(a)-rtype(b);
	return a.val - b.val;
}

static int
phicmp(const void *a, const void *b)
{
	Phi *pa, *pb;
	uint n;
	int acmp;

	pa = *(Phi**)a;
	pb = *(Phi**)b;
	assert(pa->narg == pb->narg);
	for (n=0; n<pa->narg; n++) {
		assert(pa->blk[n] == pb->blk[n]);
		acmp = rcmp(pa->arg[n], pb->arg[n]);
		if (acmp != 0)
			return acmp;
	}
	return 0;
}

static int
phieq(Phi *pa, Phi *pb)
{
	uint n;

	assert(pa->narg == pb->narg);
	for (n=0; n<pa->narg; n++) {
		assert(pa->blk[n] == pb->blk[n]);
		if (!req(pa->arg[n], pb->arg[n]))
			return 0;
	}
	return 1;
}

static void
killphi(Fn *fn, Blk *b, Phi **pp, Ref r)
{
	Phi *p;
	uint n;

	p = *pp;
	replaceuses(fn, p->to, r);
	assert(b->npred == p->narg);
	for (n = 0; n < p->narg; n++)
		rmuse(fn, b, UPhi, 0, p, p->arg[n], 0/*!strict TODO dups*/);
	p->narg = 0; /* mark as unused - TODO should not be necessary with rmuse*/
	*pp = p->link;
}

static void
dedupphis(Fn *fn, Blk *b) {
	uint n, nphis;
	Blk *b2;
	Phi *p, **pp, **phis;

	/* phi "copy" - all args identical */
	for (pp = &b->phi; *pp;) {
		p = *pp;
		assert(p->narg == b->npred);
		for (n = 0; n < p->narg-1; n++) {
			if (!req(p->arg[n], p->arg[n+1]))
				goto Skip;
		}
		killphi(fn, b, pp, p->arg[0]);
		continue;
	Skip:;
		pp = &(*pp)->link;
	}

	/* phi bool 1/0 "copy" */
	/* TODO - triangle case */
	if (b->npred == 2)
	if (b->pred[0]->npred == 1)
	if (b->pred[1]->npred == 1)
	if (b->pred[0]->pred[0] == b->pred[1]->pred[0]) {
		b2 = b->pred[0]->pred[0];
		assert(b2->jmp.type == Jjnz);
		if (iswu1(fn, b2->jmp.arg)) {
			for (pp = &b->phi; *pp;) {
				p = *pp;
				assert(p->narg == 2);
				if (!req(p->arg[0], con01[p->blk[0] == b2->s1]))
					goto Skip2;
				if (!req(p->arg[1], con01[p->blk[0] == b2->s2]))
					goto Skip2;
				killphi(fn, b, pp, b2->jmp.arg);
				continue;
			Skip2:;
				pp = &(*pp)->link;
			}
		}
	}

	/* redundant phis */
	if (!b->phi || !b->phi->link)
		return;
	nphis = 0;
	for (p = b->phi; p; p = p->link) {
		nphis++;
		phinorm(b, p);
	}
	phis = emalloc(nphis * sizeof phis[0]);
	n = 0;
	for (p = b->phi; p; p = p->link)
		phis[n++] = p;
	qsort(phis, nphis, sizeof phis[0], phicmp);
	b->phi = phis[0];
	for (n = 0; n < nphis-1; n++)
		phis[n]->link = phis[n+1];
	phis[nphis-1]->link = 0;
	free(phis);
	for (pp = &b->phi; (*pp)->link;) {
		if (phieq(*pp, (*pp)->link)) {
			killphi(fn, b, pp, (*pp)->link->to);
			continue;
		}
		pp = &(*pp)->link;
	}
}

static void
normins(Fn *fn, Ins *i)
{
	uint n;
	int64_t v;
	Ref r;

	/* truncate constant bits to 32 bits for "s"/w" uses */
	for (n = 0; n < 2; n++) {
		if (!KWIDE(optab[i->op].argcls[n][i->cls]))
		if (isconbits(fn, i->arg[n], &v))
		if ((v & 0xffffffff) != v)
			i->arg[n] = getcon(v & 0xffffffff, fn);
	}
	/* order arg[0] <= arg[1] for commutative ops, preferring RTmp in arg[0] */
	if (optab[i->op].commutes)
	if (rcmp(i->arg[0], i->arg[1]) > 0) {
		r = i->arg[1];
		i->arg[1] = i->arg[0];
		i->arg[0] = r;
	}
}

static void
killins(Fn *fn, Blk *b, Ins *i, Ref r1, Ref r2)
{
		replaceuses(fn, r1, r2);
		rmuse(fn, b, UIns, i, 0, i->arg[0], 1/*strict*/);
		if (!req(i->arg[0], i->arg[1]))
			rmuse(fn, b, UIns, i, 0, i->arg[1], 1/*strict*/);
		*i = (Ins){.op = Onop};
}

static void
dedupi(Fn *fn, Blk *b, Ins *i)
{
	Ref r2;
	Ins *i2;

	if (i->op == Onop || i->op == Odbgloc)
		return;

	normins(fn, i);

	if (optab[i->op].ispinned)
		return;
	assert(rtype(i->to) == RTmp);

	/* effective copy? */
	r2 = copyref(fn, i);
	if (!req(r2, R)) {
		killins(fn, b, i, i->to, r2);
		return;
	}

	/* effective constant? */
	r2 = foldref(fn, i);
	if (!req(r2, R)) {
		killins(fn, b, i, i->to, r2);
		return;
	}

	/* do not dedup (trapping) ins that GCM will not move */
	if (isfixed(fn, i))
		return;

	/* duplicate? */
	i2 = gvndup(i, 1);
	if (i2) {
		killins(fn, b, i, i->to, i2->to);
		return;
	}
}

static void
dedupins(Fn *fn, Blk *b)
{
	Ins *i;

	for (i = b->ins; i < &b->ins[b->nins]; i++)
		dedupi(fn, b, i);
}

static void
dedupjmp(Fn *fn, Blk *b) {
	Blk *tmp;
	int64_t v;

	if (b->jmp.type != Jjnz)
		return;
	if (b->s1 != b->s2) {
		if (!isconbits(fn, b->jmp.arg, &v))
			return;
		if (v == 0) {
			tmp = b->s1;
			b->s1 = b->s2;
			b->s2 = tmp;
		}
	}
	/* we later move active ins out of dead blks */
	edgedel(b, &b->s2);
	b->jmp.type = Jjmp;
	rmuse(fn, b, UJmp, 0, 0, b->jmp.arg, 1/*strict*/);
	b->jmp.arg = R;
}

/* propagate dead blks */
static int
deadblk(Fn *fn, Blk *b) {
	if (!isdead(fn, b))
		return 0;
	if (b->s1)
		edgedel(b, &b->s1);
	if (b->s2)
		edgedel(b, &b->s2);
	b->jmp.type = Jhlt;
	rmuse(fn, b, UJmp, 0, 0, b->jmp.arg, 1/*strict*/);
	b->jmp.arg = R;
	return 1;
}

/* rebuild rpo pred use */
/* careful not to lose active ins */
static void
rebuildcfg(Fn *fn) {
	uint n, prevnblk, nins;
	Blk **prevrpo;
	Blk *b;
	Ins *i, *iv;

	prevnblk = fn->nblk;
	prevrpo = emalloc(prevnblk * sizeof prevrpo[0]);
	memcpy(prevrpo, fn->rpo, prevnblk * sizeof prevrpo[0]);

	fillrpo(fn);

	iv = 0;
	for (n=0; n<prevnblk; n++) {
		b = prevrpo[n];
		if (b->id == -1u) {
			/* blk unreachable after GVN */
			for (i = b->ins; i < &b->ins[b->nins]; i++)
				if (i->op != Onop)
				if (!optab[i->op].ispinned)
				if (gvndup(i, 0) == i) {
					/* (possibly) active ins - add to start blk */
					if (!iv) {
						nins = fn->start->nins;
						iv = vnew(nins, sizeof iv[0], PHeap);
						memcpy(iv, fn->start->ins, nins * sizeof iv[0]);
					}
					vgrow(&iv, ++nins);
					iv[nins-1] = *i;
				}
		}
	}
	if (iv) {
		idup(&fn->start->ins, iv, nins);
		fn->start->nins = nins;
		vfree(iv);
	}
	free(prevrpo);
}

/* https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf */
/* require rpo pred ssa use */
/* recreates rpo */
/* breaks pred use dom ssa (GCM fixes ssa) */
void
gvn(Fn *fn)
{
	uint n, nins;
	Blk *b;
	Phi *p;

	con01[0] = getcon(0, fn);
	con01[1] = getcon(1, fn);

	nins = 0;
	for (b=fn->start; b; b=b->link)
		for (p = b->phi; p; p = p->link)
			p->visit = 0;

	fillloop(fn);
	narrowpars(fn);
	filluse(fn);
	ssacheck(fn);

	for (b=fn->start; b; b=b->link)
		nins += b->nins;

	gvntbln = nins + nins/2;
	gvntbl = emalloc(gvntbln * sizeof gvntbl[0]);
	lookupn = 0;
	proben = 0;
	maxproben = 0;

	/* GVN */
	for (n=0; n<fn->nblk; n++) {
		b = fn->rpo[n];
		if (deadblk(fn, b))
			continue;
		dedupphis(fn, b);
		dedupins(fn, b);
		dedupjmp(fn, b);
	}

	rebuildcfg(fn);

	free(gvntbl);
	gvntbl = 0;
	gvntbln = 0;
	lookupn = 0;
	proben = 0;
	maxproben = 0;

	if (debug['G']) {
		fprintf(stderr, "\n> After GVN:\n");
		printfn(fn, stderr);
	}
}
diff --git a/ins.c b/ins.c
new file mode 100644
index 0000000..1e238c7
--- /dev/null
+++ b/ins.c
@@ -0,0 +1,197 @@
#include "all.h"

typedef struct FromLoc FromLoc;
struct FromLoc {
	InsLoc from;
	uint ton;
};

static int
loccmp(InsLoc *la, InsLoc *lb)
{
	if (la->bid != lb->bid)
		return (int)la->bid - (int)lb->bid;
	return la->insn - lb->insn;
}

static int
tocmp(const void *a, const void *b)
{
	InsMov *ma, *mb;

	ma = (InsMov*)a;
	mb = (InsMov*)b;
	return loccmp(&ma->to, &mb->to);
}

static int
fromcmp(const void *a, const void *b)
{
	FromLoc *fa, *fb;

	fa = (FromLoc*)a;
	fb = (FromLoc*)b;
	return loccmp(&fa->from, &fb->from);
}

static int
loceq(InsLoc *a, InsLoc *b)
{
	return a->bid == b->bid && a->insn == b->insn;
}

/* after, mov is sorted by to, and to.insn, from.insn are updated */
void
movins(Fn *fn, InsMov *mov, uint nmov, int del)
{
	Blk *b, *b2;
	uint bid, n, fromn, ton, ifromn, iton;
	InsMov *m;
	BSet to[1];
	FromLoc *from;
	uint *newbnins;
	Ins **newbins;

	qsort(mov, nmov, sizeof mov[0], tocmp);

	from = emalloc(nmov * sizeof from[0]);
	for (n = 0; n < nmov; n++) {
		from[n].from = mov[n].from;
		from[n].ton = n;
	}

	qsort(from, nmov, sizeof from[0], fromcmp);

	/* count new ins in each blk */
	bsinit(to, fn->nblk); /* no free for BSet? */
	newbnins = emalloc(fn->nblk * sizeof newbnins[0]);
	for (m = mov; m < &mov[nmov]; m++) {
		bsset(to, m->to.bid);
		newbnins[m->to.bid]++;
	}

	/* allocate new b->ins for blks with new ins */
	newbins = emalloc(fn->nblk * sizeof newbins[0]);
	for (bid = 0; bid < fn->nblk; bid++) {
		assert(bshas(to, bid) == (newbnins[bid] != 0));
		if (!bshas(to, bid))
			continue;
		b = fn->rpo[bid];
		newbnins[bid] += b->nins;
		newbins[bid] = alloc(newbnins[bid] * sizeof(Ins));
	}

	/* populate new b->ins */
	/* update mov to.insn, from insn */
	fromn = ton = 0;
	for (bid = 0; bid < fn->nblk; bid++) {
		assert(bshas(to, bid) == (newbnins[bid] != 0));
		if (!bshas(to, bid)) {
			while (fromn < nmov && from[fromn].from.bid == bid)
				fromn++;
			continue;
		}
		b = fn->rpo[bid];
		iton = 0;
		for (ifromn = 0; ifromn < b->nins; ifromn++) {
			/* insert new ins, update to */
			while (ton < nmov && loceq(&mov[ton].to, &(InsLoc){.bid = bid, .insn = ifromn})) {
				b2 = fn->rpo[mov[ton].from.bid];
				assert(mov[ton].from.insn < b2->nins);
				newbins[bid][iton++] = b2->ins[mov[ton].from.insn];
				mov[ton++].to.insn = iton-1;
			}
			/* update from */
			while (fromn < nmov && loceq(&from[fromn].from, &(InsLoc){.bid = bid, .insn = ifromn}))
				from[fromn++].from.insn = iton;
			/* copy original ins */
			newbins[bid][iton++] = b->ins[ifromn];
		}
		/* append new ins, update to */
		while (ton < nmov && mov[ton].to.bid == bid) {
			assert(mov[ton].to.insn == b->nins);
			b2 = fn->rpo[mov[ton].from.bid];
			assert(mov[ton].from.insn < b2->nins);
			newbins[bid][iton++] = b2->ins[mov[ton].from.insn];
			mov[ton++].to.insn = iton-1;
		}
		assert(ifromn == b->nins);
		assert(iton == newbnins[bid]);
	}
	assert(fromn == nmov);
	assert(ton == nmov);

	/* install new b->ins */
	for (bid = 0; bid < fn->nblk; bid++) {
		assert(bshas(to, bid) == (newbins[bid] != 0));
		if (!bshas(to, bid))
			continue;
		b = fn->rpo[bid];
		b->ins = newbins[bid];
		b->nins = newbnins[bid];
	}

	/* remove from ins, update mov from insn */
	for (fromn = 0; fromn < nmov; fromn++) {
		b = fn->rpo[from[fromn].from.bid];
		assert(from[fromn].from.insn < b->nins);
		if (del)
			b->ins[from[fromn].from.insn] = (Ins){.op = Onop};
		mov[from[fromn].ton].from.insn = from[fromn].from.insn;
	}

	free(from);
	free(newbins);
	free(newbnins);
}

static void
setdef(Fn *fn, Blk *b, Ins *i)
{
	assert(b->ins <= i && i < &b->ins[b->nins]);
	if (rtype(i->to) == RTmp) {
		assert(fn->tmp[i->to.val].bid == b->id);
		fn->tmp[i->to.val].def = i;
	}
}

/* shuffle b->ins to fixup use-before-def */
void
fixub4d(Fn *fn)
{
	Blk *b;
	Ins *i, *i2;
	Ins icpy;
	uint n;
	Tmp *t;
	int ub4d;

	for (b = fn->start; b; b = b->link)
		/* O(N^2) but rare in practice */
		for (i = b->ins; i < &b->ins[b->nins];) {
			ub4d = 0;
			for (n = 0; n < 2; n++)
				if (rtype(i->arg[n]) == RTmp) {
					t = &fn->tmp[i->arg[n].val];
					if (t->bid == b->id && t->def)
					if (i < t->def) {
						assert(b->ins <= t->def && t->def < &b->ins[b->nins]);
						ub4d = 1;
						i2 = t->def;
						break;
					}
				}
			if (!ub4d)
				goto Skip;
			icpy = *i2;
			for(; i2 > i; --i2) {
				*i2 = *(i2-1);
				setdef(fn, b, i2);
			}
			*i = icpy;
			setdef(fn, b, i);
			continue;
		Skip:
			i++;
		}
}
diff --git a/main.c b/main.c
index 5ecb4d0..b590363 100644
--- a/main.c
+++ b/main.c
@@ -73,10 +73,19 @@ func(Fn *fn)
	fillalias(fn);
	coalesce(fn);
	filluse(fn);
	filldom(fn);
	ssacheck(fn);
	copy(fn);
	gvn(fn);
	fillpreds(fn);
	filluse(fn);
	filldom(fn);
	gcm(fn);
	fillrpo(fn);
	fillpreds(fn);
	filluse(fn);
	reassoc(fn);
	filluse(fn);
	fold(fn);
	ssacheck(fn);
	T.abi1(fn);
	simpl(fn);
	fillpreds(fn);
diff --git a/ops.h b/ops.h
index beaa6f3..92a299f 100644
--- a/ops.h
+++ b/ops.h
@@ -6,188 +6,185 @@
	#define V(Imm)
#endif

#ifndef P
	#define P(CanFold, HasId, IdVal)
#ifndef F
#define F(CanFold, HasId, IdVal, Commutes, Idemp, IsCmpEq, IsCmpLgte, CmpEqVal, IsPinned)
#endif


#define T(a,b,c,d,e,f,g,h) {                          \
	{[Kw]=K##a, [Kl]=K##b, [Ks]=K##c, [Kd]=K##d}, \
	{[Kw]=K##e, [Kl]=K##f, [Ks]=K##g, [Kd]=K##h}  \
}


/*********************/
/* PUBLIC OPERATIONS */
/*********************/

/* Arithmetic and Bits */
O(add,     T(w,l,s,d, w,l,s,d), P(1,1,0)) X(2,1,0) V(1)
O(sub,     T(w,l,s,d, w,l,s,d), P(1,1,0)) X(2,1,0) V(0)
O(neg,     T(w,l,s,d, x,x,x,x), P(1,0,0)) X(1,1,0) V(0)
O(div,     T(w,l,s,d, w,l,s,d), P(1,1,1)) X(0,0,0) V(0)
O(rem,     T(w,l,e,e, w,l,e,e), P(1,0,0)) X(0,0,0) V(0)
O(udiv,    T(w,l,e,e, w,l,e,e), P(1,1,1)) X(0,0,0) V(0)
O(urem,    T(w,l,e,e, w,l,e,e), P(1,0,0)) X(0,0,0) V(0)
O(mul,     T(w,l,s,d, w,l,s,d), P(1,1,1)) X(2,0,0) V(0)
O(and,     T(w,l,e,e, w,l,e,e), P(1,0,0)) X(2,1,0) V(1)
O(or,      T(w,l,e,e, w,l,e,e), P(1,1,0)) X(2,1,0) V(1)
O(xor,     T(w,l,e,e, w,l,e,e), P(1,1,0)) X(2,1,0) V(1)
O(sar,     T(w,l,e,e, w,w,e,e), P(1,1,0)) X(1,1,0) V(1)
O(shr,     T(w,l,e,e, w,w,e,e), P(1,1,0)) X(1,1,0) V(1)
O(shl,     T(w,l,e,e, w,w,e,e), P(1,1,0)) X(1,1,0) V(1)
O(add,     T(w,l,s,d, w,l,s,d), F(1,1,0,1,0,0,0,0,0)) X(2,1,0) V(1)
O(sub,     T(w,l,s,d, w,l,s,d), F(1,1,0,0,0,0,0,0,0)) X(2,1,0) V(0)
O(neg,     T(w,l,s,d, x,x,x,x), F(1,0,0,0,0,0,0,0,0)) X(1,1,0) V(0)
O(div,     T(w,l,s,d, w,l,s,d), F(1,1,1,0,0,0,0,0,0)) X(0,0,0) V(0)
O(rem,     T(w,l,e,e, w,l,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
O(udiv,    T(w,l,e,e, w,l,e,e), F(1,1,1,0,0,0,0,0,0)) X(0,0,0) V(0)
O(urem,    T(w,l,e,e, w,l,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
O(mul,     T(w,l,s,d, w,l,s,d), F(1,1,1,1,0,0,0,0,0)) X(2,0,0) V(0)
O(and,     T(w,l,e,e, w,l,e,e), F(1,0,0,1,1,0,0,0,0)) X(2,1,0) V(1)
O(or,      T(w,l,e,e, w,l,e,e), F(1,1,0,1,1,0,0,0,0)) X(2,1,0) V(1)
O(xor,     T(w,l,e,e, w,l,e,e), F(1,1,0,1,0,0,0,0,0)) X(2,1,0) V(1)
O(sar,     T(w,l,e,e, w,w,e,e), F(1,1,0,0,0,0,0,0,0)) X(1,1,0) V(1)
O(shr,     T(w,l,e,e, w,w,e,e), F(1,1,0,0,0,0,0,0,0)) X(1,1,0) V(1)
O(shl,     T(w,l,e,e, w,w,e,e), F(1,1,0,0,0,0,0,0,0)) X(1,1,0) V(1)

/* Comparisons */
O(ceqw,    T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cnew,    T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
O(csgew,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
O(csgtw,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cslew,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
O(csltw,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(1)
O(cugew,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cugtw,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
O(culew,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cultw,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(1)

O(ceql,    T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cnel,    T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
O(csgel,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
O(csgtl,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cslel,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
O(csltl,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(1)
O(cugel,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cugtl,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
O(culel,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cultl,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(1)

O(ceqs,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cges,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cgts,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cles,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
O(clts,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cnes,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cos,     T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cuos,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)

O(ceqd,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cged,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cgtd,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cled,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cltd,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cned,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cod,     T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
O(cuod,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
O(ceqw,    T(w,w,e,e, w,w,e,e), F(1,1,1,1,0,1,0,1,0)) X(0,1,0) V(0)
O(cnew,    T(w,w,e,e, w,w,e,e), F(1,1,0,1,0,1,0,0,0)) X(0,1,0) V(0)
O(csgew,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
O(csgtw,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(0)
O(cslew,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
O(csltw,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(1)
O(cugew,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
O(cugtw,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(0)
O(culew,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
O(cultw,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(1)

O(ceql,    T(l,l,e,e, l,l,e,e), F(1,0,0,1,0,1,0,1,0)) X(0,1,0) V(0)
O(cnel,    T(l,l,e,e, l,l,e,e), F(1,0,0,1,0,1,0,0,0)) X(0,1,0) V(0)
O(csgel,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
O(csgtl,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(0)
O(cslel,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
O(csltl,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(1)
O(cugel,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
O(cugtl,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(0)
O(culel,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
O(cultl,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(1)

O(ceqs,    T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
O(cges,    T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
O(cgts,    T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
O(cles,    T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
O(clts,    T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
O(cnes,    T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
O(cos,     T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
O(cuos,    T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)

O(ceqd,    T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
O(cged,    T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
O(cgtd,    T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
O(cled,    T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
O(cltd,    T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
O(cned,    T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
O(cod,     T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
O(cuod,    T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)

/* Memory */
O(storeb,  T(w,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
O(storeh,  T(w,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
O(storew,  T(w,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
O(storel,  T(l,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
O(stores,  T(s,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
O(stored,  T(d,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)

O(loadsb,  T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(loadub,  T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(loadsh,  T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(loaduh,  T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(loadsw,  T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(loaduw,  T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(load,    T(m,m,m,m, x,x,x,x), P(0,0,0)) X(0,0,1) V(0)
O(storeb,  T(w,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
O(storeh,  T(w,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
O(storew,  T(w,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
O(storel,  T(l,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
O(stores,  T(s,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
O(stored,  T(d,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)

O(loadsb,  T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
O(loadub,  T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
O(loadsh,  T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
O(loaduh,  T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
O(loadsw,  T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
O(loaduw,  T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
O(load,    T(m,m,m,m, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)

/* Extensions and Truncations */
O(extsb,   T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
O(extub,   T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
O(extsh,   T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
O(extuh,   T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
O(extsw,   T(e,w,e,e, e,x,e,e), P(1,0,0)) X(0,0,1) V(0)
O(extuw,   T(e,w,e,e, e,x,e,e), P(1,0,0)) X(0,0,1) V(0)

O(exts,    T(e,e,e,s, e,e,e,x), P(1,0,0)) X(0,0,1) V(0)
O(truncd,  T(e,e,d,e, e,e,x,e), P(1,0,0)) X(0,0,1) V(0)
O(stosi,   T(s,s,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
O(stoui,   T(s,s,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
O(dtosi,   T(d,d,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
O(dtoui,   T(d,d,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
O(swtof,   T(e,e,w,w, e,e,x,x), P(1,0,0)) X(0,0,1) V(0)
O(uwtof,   T(e,e,w,w, e,e,x,x), P(1,0,0)) X(0,0,1) V(0)
O(sltof,   T(e,e,l,l, e,e,x,x), P(1,0,0)) X(0,0,1) V(0)
O(ultof,   T(e,e,l,l, e,e,x,x), P(1,0,0)) X(0,0,1) V(0)
O(cast,    T(s,d,w,l, x,x,x,x), P(1,0,0)) X(0,0,1) V(0)
O(extsb,   T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(extub,   T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(extsh,   T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(extuh,   T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(extsw,   T(e,w,e,e, e,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(extuw,   T(e,w,e,e, e,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)

O(exts,    T(e,e,e,s, e,e,e,x), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(truncd,  T(e,e,d,e, e,e,x,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(stosi,   T(s,s,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(stoui,   T(s,s,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(dtosi,   T(d,d,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(dtoui,   T(d,d,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(swtof,   T(e,e,w,w, e,e,x,x), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(uwtof,   T(e,e,w,w, e,e,x,x), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(sltof,   T(e,e,l,l, e,e,x,x), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(ultof,   T(e,e,l,l, e,e,x,x), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(cast,    T(s,d,w,l, x,x,x,x), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)

/* Stack Allocation */
O(alloc4,  T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
O(alloc8,  T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
O(alloc16, T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
O(alloc4,  T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(alloc8,  T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(alloc16, T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)

/* Variadic Function Helpers */
O(vaarg,   T(m,m,m,m, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(vastart, T(m,e,e,e, x,e,e,e), P(0,0,0)) X(0,0,0) V(0)
O(vaarg,   T(m,m,m,m, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(vastart, T(m,e,e,e, x,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)

O(copy,    T(w,l,s,d, x,x,x,x), P(0,0,0)) X(0,0,1) V(0)
O(copy,    T(w,l,s,d, x,x,x,x), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)

/* Debug */
O(dbgloc,  T(w,e,e,e, w,e,e,e), P(0,0,0)) X(0,0,1) V(0)
O(dbgloc,  T(w,e,e,e, w,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)

/****************************************/
/* INTERNAL OPERATIONS (keep nop first) */
/****************************************/

/* Miscellaneous and Architecture-Specific Operations */
O(nop,     T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,1) V(0)
O(addr,    T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(blit0,   T(m,e,e,e, m,e,e,e), P(0,0,0)) X(0,1,0) V(0)
O(blit1,   T(w,e,e,e, x,e,e,e), P(0,0,0)) X(0,1,0) V(0)
O(swap,    T(w,l,s,d, w,l,s,d), P(0,0,0)) X(1,0,0) V(0)
O(sign,    T(w,l,e,e, x,x,e,e), P(0,0,0)) X(0,0,0) V(0)
O(salloc,  T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
O(xidiv,   T(w,l,e,e, x,x,e,e), P(0,0,0)) X(1,0,0) V(0)
O(xdiv,    T(w,l,e,e, x,x,e,e), P(0,0,0)) X(1,0,0) V(0)
O(xcmp,    T(w,l,s,d, w,l,s,d), P(0,0,0)) X(1,1,0) V(0)
O(xtest,   T(w,l,e,e, w,l,e,e), P(0,0,0)) X(1,1,0) V(0)
O(acmp,    T(w,l,e,e, w,l,e,e), P(0,0,0)) X(0,0,0) V(0)
O(acmn,    T(w,l,e,e, w,l,e,e), P(0,0,0)) X(0,0,0) V(0)
O(afcmp,   T(e,e,s,d, e,e,s,d), P(0,0,0)) X(0,0,0) V(0)
O(reqz,    T(w,l,e,e, x,x,e,e), P(0,0,0)) X(0,0,0) V(0)
O(rnez,    T(w,l,e,e, x,x,e,e), P(0,0,0)) X(0,0,0) V(0)
O(nop,     T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(addr,    T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(blit0,   T(m,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,1,0) V(0)
O(blit1,   T(w,e,e,e, x,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,1,0) V(0)
O(swap,    T(w,l,s,d, w,l,s,d), F(0,0,0,0,0,0,0,0,0)) X(1,0,0) V(0)
O(sign,    T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
O(salloc,  T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
O(xidiv,   T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(1,0,0) V(0)
O(xdiv,    T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(1,0,0) V(0)
O(xcmp,    T(w,l,s,d, w,l,s,d), F(0,0,0,0,0,0,0,0,0)) X(1,1,0) V(0)
O(xtest,   T(w,l,e,e, w,l,e,e), F(0,0,0,0,0,0,0,0,0)) X(1,1,0) V(0)
O(acmp,    T(w,l,e,e, w,l,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
O(acmn,    T(w,l,e,e, w,l,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
O(afcmp,   T(e,e,s,d, e,e,s,d), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
O(reqz,    T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
O(rnez,    T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)

/* Arguments, Parameters, and Calls */
O(par,     T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(parsb,   T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(parub,   T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(parsh,   T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(paruh,   T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(parc,    T(e,x,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
O(pare,    T(e,x,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
O(arg,     T(w,l,s,d, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(argsb,   T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(argub,   T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(argsh,   T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(arguh,   T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(argc,    T(e,x,e,e, e,l,e,e), P(0,0,0)) X(0,0,0) V(0)
O(arge,    T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
O(argv,    T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(call,    T(m,m,m,m, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
O(par,     T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(parsb,   T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(parub,   T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(parsh,   T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(paruh,   T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(parc,    T(e,x,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(pare,    T(e,x,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(arg,     T(w,l,s,d, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(argsb,   T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(argub,   T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(argsh,   T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(arguh,   T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(argc,    T(e,x,e,e, e,l,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(arge,    T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(argv,    T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
O(call,    T(m,m,m,m, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)

/* Flags Setting */
O(flagieq,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagine,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagisge, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagisgt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagisle, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagislt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagiuge, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagiugt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagiule, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagiult, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagfeq,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagfge,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagfgt,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagfle,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagflt,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagfne,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagfo,   T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
O(flagfuo,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)

O(flagieq,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagine,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagisge, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagisgt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagisle, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagislt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagiuge, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagiugt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagiule, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagiult, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagfeq,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagfge,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagfgt,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagfle,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagflt,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagfne,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagfo,   T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
O(flagfuo,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)

#undef T
#undef X
diff --git a/parse.c b/parse.c
index a745779..5c6ea92 100644
--- a/parse.c
+++ b/parse.c
@@ -15,11 +15,17 @@ enum {
};

Op optab[NOp] = {
#undef P
#define P(cf, hi, id) .canfold = cf, .hasid = hi, .idval = id
#define O(op, t, p) [O##op]={.name = #op, .argcls = t, p},
#undef F
#define F(cf, hi, id, co, im, ic, lg, cv, pn)	\
	.canfold = cf, \
	.hasid = hi, .idval = id, \
	.commutes = co, \
	.idemp = im, \
	.iscmpeq = ic, .iscmplgte = lg, .eqval = cv,	\
	.ispinned = pn
#define O(op, k, flags) [O##op]={.name = #op, .argcls = k, flags},
	#include "ops.h"
#undef P
#undef F
};

typedef enum {
diff --git a/reassoc.c b/reassoc.c
new file mode 100644
index 0000000..4dac9a9
--- /dev/null
+++ b/reassoc.c
@@ -0,0 +1,225 @@
#include "all.h"

typedef int testfn(Fn *fn, Use *u, Ref r);

static int
alluses(Fn *fn, Ref r, testfn *test)
{
	Tmp *t;
	Use *u;

	assert(rtype(r) == RTmp);
	t = &fn->tmp[r.val];
	for (u = t->use; u < &t->use[t->nuse]; u++)
		if (!test(fn, u, r))
			return 0;
	return 1;
}

static int
singleuse(Fn *fn, Ref r)
{
	assert(rtype(r) == RTmp);
	return fn->tmp[r.val].nuse == 1;
}

static int
ldstuse(Fn *fn __attribute__((unused)), Use *u, Ref r)
{
	if (u->type == UIns) {
		if (isload(u->u.ins->op)) {
			assert(req(r, u->u.ins->arg[0]));
			return 1;
		}
		if (isstore(u->u.ins->op))
		if (req(r, u->u.ins->arg[1]))
			return 1;
		if (u->u.ins->op == Oblit0) {
			assert(req(r, u->u.ins->arg[0]) || req(r, u->u.ins->arg[1]));
			return 1;
		}
	}
	return 0;
}

static int
iskladdcon(Fn *fn, Ins *i)
{
	int64_t v;

	if (i->op == Oadd)
	if (i->cls == Kl)
	if (rtype(i->arg[0]) == RTmp)
	if (isconbits(fn, i->arg[1], &v))
		return 1;
	return 0;
}

static int
inplaceldstaddr(Fn *fn, Blk *b, Ins *i)
{
	Ins *i2;

	assert(b->ins <= i && i < &b->ins[b->nins]);
	if (singleuse(fn, i->to))
	if (i < &b->ins[b->nins-1]) {
		i2 = i+1;
		if (isload(i2->op))
		if (req(i->to, i2->arg[0]))
			return 1;
		if (isstore(i2->op))
		if (req(i->to, i2->arg[1]))
			return 1;
		// TODO blit0
	}
	return 0;
}

static int
isldstaddr(Fn *fn, Blk *b, Ins *i)
{
	if (iskladdcon(fn, i))
	if (alluses(fn, i->to, ldstuse))
	if (!inplaceldstaddr(fn, b, i))
		return 1;
	return 0;
}

static int
jnzuse(Fn *fn, Use *u, Ref r)
{
	if (u->type == UJmp)
	if (fn->rpo[u->bid]->jmp.type == Jjnz) {
		assert(req(r, fn->rpo[u->bid]->jmp.arg));
		return 1;
	}
	return 0;
}

static int
iscmp4jnz(Fn *fn, Blk *b, Ins *i)
{
	int x;

	if (iscmp(i->op, &x, &x))
	if (alluses(fn, i->to, jnzuse))
	if (!(singleuse(fn, i->to) && i-b->ins == b->nins-1))
		return 1;

	return 0;
}

static int
canreassoc(Fn *fn, Blk *b, Ins *i)
{
	assert(b->ins <= i && i < &b->ins[b->nins]);
	return isldstaddr(fn, b, i) || iscmp4jnz(fn, b, i);
}

static void
ireassoc(Fn *fn, Blk *b, Ins *i, uint *pnim, InsMov **pim)
{
	Blk *b2;
	Tmp *t;
	Use *u;

	assert(b->ins <= i && i < &b->ins[b->nins]);
	if (!canreassoc(fn, b, i))
		return;

	assert(rtype(i->to) == RTmp);
	t = &fn->tmp[i->to.val];
	for (u = t->use; u < &t->use[t->nuse]; u++) {
		assert(u->type == UJmp || u->type == UIns);
		b2 = fn->rpo[u->bid];
		vgrow(pim, ++(*pnim));
		(*pim)[(*pnim)-1] = (InsMov){
			.from = {.bid = b->id, .insn = i-b->ins},
			.to = {
				.bid = u->bid,
				.insn = u->type == UJmp ? b2->nins : u->u.ins-b2->ins
			}};
	}
}

static void
trealloc(Fn *fn, Blk *b, Ins *i)
{
	Ins *i2;
	Ref r;
	int x;

	r = newtmp("rea", i->cls, fn);
	if (iskladdcon(fn, i)) {
		assert(i < &b->ins[b->nins-1]);
		i2 = i+1;
		if (isload(i2->op)) {
			assert(req(i->to, i2->arg[0]));
			i2->arg[0] = r;
		} else if (isstore(i2->op)) {
			assert(req(i->to, i2->arg[1]));
			i2->arg[1] = r;
		} else {
			if (i2->op != Oblit0) {
				assert(i < &b->ins[b->nins-2]);
				i2 = i+2;
			}
			assert(i2->op == Oblit0);
			assert(req(i->to, i2->arg[0]) || req(i->to, i2->arg[1]));
			if (req(i->to, i2->arg[0]))
				i2->arg[0] = r;
			if (req(i->to, i2->arg[1]))
				i2->arg[1] = r;
		}
		i->to = r;
	} else if (iscmp(i->op, &x, &x)) {
		assert(b->jmp.type == Jjnz);
		assert(req(i->to, b->jmp.arg));
		i->to = r;
		b->jmp.arg = r;
	} else
		die("unreachable");
}

/* Redistribute trivial ops to point of use. */
/* Reduces register pressure. */
/* needs rpo, use; breaks use */
void
reassoc(Fn *fn)
{
	Blk *b;
	Ins *i;
	InsMov *im;
	uint n, nim;

	nim = 0;
	im = vnew(nim, sizeof im[0], PHeap);

	/* identify trivial ins */
	for (b = fn->start; b; b = b->link) {
		for (i = b->ins; i < &b->ins[b->nins]; i++)
			ireassoc(fn, b, i, &nim, &im);
	}

	/* duplicate trivial ins */
	movins(fn, im, nim, 0/*!del*/);

	/* create new tmps for dups */
	for (n = 0; n < nim; n++) {
		b = fn->rpo[im[n].to.bid];
		i = &b->ins[im[n].to.insn];
		trealloc(fn, b, i);
	}

	/* delete original ins */
	for (n = 0; n < nim; n++) {
		b = fn->rpo[im[n].from.bid];
		i = &b->ins[im[n].from.insn];
		*i = (Ins){.op = Onop};
	}

	if (debug['G']) {
		fprintf(stderr, "\n> After Reassociation:\n");
		printfn(fn, stderr);
	}
}
diff --git a/ssa.c b/ssa.c
index 929301e..4e3d2a5 100644
--- a/ssa.c
+++ b/ssa.c
@@ -1,7 +1,7 @@
#include "all.h"
#include <stdarg.h>

static void
void
adduse(Tmp *tmp, int ty, Blk *b, ...)
{
	Use *u;
diff --git a/test/_gcm1.ssa b/test/_gcm1.ssa
new file mode 100644
index 0000000..719cddb
--- /dev/null
+++ b/test/_gcm1.ssa
@@ -0,0 +1,48 @@
export
function w $ifmv(w %p1, w %p2, w %p3) {
@start
@entry
	%rt =w add %p2, %p3    # gcm moves to @true
	%rf =w sub %p2, %p3    # gcm moves to @false
	jnz %p1, @true, @false
@true
	%r =w copy %rt
	jmp @exit
@false
	%r =w copy %rf
	jmp @exit
@exit
	ret %r
}

export
function w $hoist1(w %p1, w %p2, w %p3) {
@start
@entry
	%n =w copy 0
	%i =w copy %p1
@loop
	%base =w add %p2, %p3   # gcm moves to @exit
	%i =w sub %i, 1
	%n =w add %n, 1
	jnz %i, @loop, @exit
@exit
	%r =w add %base, %n
	ret %r
}

export
function w $hoist2(w %p1, w %p2, w %p3) {
@start
@entry
	%n =w copy 0
	%i =w copy %p1
@loop
	%base =w add %p2, %p3   # gcm moves to @entry
	%i =w sub %i, 1
	%n =w add %n, %base
	jnz %i, @loop, @exit
@exit
	%r =w add %base, %n
	ret %r
}
diff --git a/test/_gcm2.ssa b/test/_gcm2.ssa
new file mode 100644
index 0000000..baeb4a4
--- /dev/null
+++ b/test/_gcm2.ssa
@@ -0,0 +1,43 @@
# Programs from "Global Code Motion Global Value Numbering" by Cliff Click
# https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf

# GCM program in Figure 1

function w $gcm_test(w %a){
@start
	%i.0 =w copy 0
@loop
	%i.1 =w phi @start %i.0, @loop %i.2
	%b =w add %a, 1                     # early schedule moves to @start
	%i.2 =w add %i.1, %b
	%c =w mul %i.2, 2                   #  late schedule moves to @end
	%x =w csltw %i.2, 10
	jnz %x, @loop, @end
@end
	ret %c
}

# GCM program in "Figure 3 x's definition does not dominate it's use"
#
# SSA contruction will insert phi instruction for "x" in @if_false
# preventing the "add" in @if_false from being moved to @if_true

function $gcm_test2 (w %a){
@start
	%f =w copy 1
	%x =w copy 0
	%s.0 =w copy 0
@loop
	%s.1 = w phi @start %s.0, @if_false %s.2
	jnz %a, @if, @end
@if
	jnz %f, @if_true, @if_false
@if_true
	%f =w copy 0
	%x =w add %x, 1
@if_false
	%s.2 =w add %s.1, %x
	jmp @loop
@end
	ret
}
diff --git a/test/_load-elim.ssa b/test/_load-elim.ssa
new file mode 100644
index 0000000..faae478
--- /dev/null
+++ b/test/_load-elim.ssa
@@ -0,0 +1,17 @@
# GCM can eliminate unused add/load instructions

export
function w $f(l %p, w %c) {
@start
        jnz %c, @true, @false
@true
	%p1 =l add %p, 4
	%v1 =w loaduw %p1
        jmp @end
@false
	%p2 =l add %p, 4
	%v2 =w loaduw %p2
        jmp @end
@end
        ret 0
}
diff --git a/test/gvn-live-dead.ssa b/test/gvn-live-dead.ssa
new file mode 100644
index 0000000..d47f05b
--- /dev/null
+++ b/test/gvn-live-dead.ssa
@@ -0,0 +1,19 @@
export
function w $test(w %p1, w %p2) {
@start
@entry
	%t1 =w copy 1
	jnz %t1, @live, @dead1
@live
	%t2 =w add %p1, %p2
	ret %t2
@dead1
	%t2 =w add %p1, %p2  # live ins in dead blk
@dead2
	jnz %t1, @live, @dead1
}

# >>> driver
# extern int test(int p1, int p2);
# int main() { return test(1, 2) != 3; }
# <<<
diff --git a/util.c b/util.c
index 2e4b4cc..d341720 100644
--- a/util.c
+++ b/util.c
@@ -417,6 +417,27 @@ addcon(Con *c0, Con *c1, int m)
	return 1;
}

int
isconbits(Fn *fn, Ref r, int64_t *v)
{
	Con *c;

	if (rtype(r) == RCon) {
		c = &fn->con[r.val];
		if (c->type == CBits) {
			*v = c->bits.i;
			return 1;
		}
	}
	return 0;
}

int
istmpconbits(Fn *fn, Ins *i, int64_t *v)
{
	return rtype(i->arg[0]) == RTmp && isconbits(fn, i->arg[1], v);
}

void
salloc(Ref rt, Ref rs, Fn *fn)
{
-- 
2.34.1
Reply to thread Export thread (mbox)