Changes since RFC 7:
Bug fixes:
- remove isbad4gcm() in GVN/GCM - it is unsound due to different state
at GVN vs GCM time; replace with "reassociation" pass after GCM
- fix intra-blk use-before-def after GCM
- prevent GVN from deduping trapping instructions cos GCM will not
move them
- remove cmp eq/ne identical arg copy detection for floating point, it
is not valid for NaN
- fix cges/cged flagged as commutative in ops.h instead of cnes/cned
respectively; just a typo
Minor features:
- copy detection handles cmp le/lt/ge/gt with identical args
- treat (integer) div/rem by non-zero constant as non-trapping
- eliminate add N/sub N pairs in copy detection
- maintain accurate tmp use in GVN; not strictly necessary but enables
interim global state sanity checking
- "reassociation" of trivial constant offset load/store addresses, and
cmp ops with point-of-use in pass after GCM
- normalise commutative op arg order - e.g. op con, tmp -> op tmp, con
to simplify copy detection and GVN instruction dedup
Codebase:
- split out core copy detection and constant folding (back) out into
copy.c, fold.c respectively; gvn.c was getting monolithic
- generic support for instruction moving in ins.c - used by GCM and
reassoc
- new reassociation pass in reassoc.c
- other minor clean-up/refactor
Changes since RFC 6:
- More ext elimination in GVN by examination of def and use bit width
- elimination of redundant and mask by bit width examination
- Incorporation of Song's patch
Changes since RFC 5:
- avoidance of "bad" candidates for GVN/GCM - trivial address offset
calculations, and comparisons
- more copy detection mostly around boolean values
- allow elimination of unused load, alloc, trapping instructions
- detection of trivial boolean v ? 1 : 0 phi patterns
- bug fix for (removal of) "chg" optimisation in ins recreation - it
was missing removal of unused instructions in some cases
Testing - standard QBE, cproc, hare, harec, roland, coremark
[but latest roland master is failing rust build]
Benchmark
- coremark is ~15%+ faster than master
- hare "HARETEST_INCLUDE='slow' make check" ~5% faster
---
.gitignore | 1 +
Makefile | 3 +-
all.h | 57 +++-
cfg.c | 43 +++
copy.c | 485 ++++++++++++++++++++-------------
fold.c | 347 ++----------------------
gcm.c | 352 ++++++++++++++++++++++++
gvn.c | 592 +++++++++++++++++++++++++++++++++++++++++
ins.c | 197 ++++++++++++++
main.c | 13 +-
ops.h | 291 ++++++++++----------
parse.c | 14 +-
reassoc.c | 225 ++++++++++++++++
ssa.c | 2 +-
test/_gcm1.ssa | 48 ++++
test/_gcm2.ssa | 43 +++
test/_load-elim.ssa | 17 ++
test/gvn-live-dead.ssa | 19 ++
util.c | 21 ++
19 files changed, 2113 insertions(+), 657 deletions(-)
create mode 100644 gcm.c
create mode 100644 gvn.c
create mode 100644 ins.c
create mode 100644 reassoc.c
create mode 100644 test/_gcm1.ssa
create mode 100644 test/_gcm2.ssa
create mode 100644 test/_load-elim.ssa
create mode 100644 test/gvn-live-dead.ssa
diff --git a/.gitignore b/.gitignore
index afd08d7..09e4118 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@ qbe
config.h
.comfile
*.out
+*~
diff --git a/Makefile b/Makefile
index 2482eb1..86a35a4 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,8 @@ PREFIX = /usr/local
BINDIR = $(PREFIX)/bin
COMMOBJ = main.o util.o parse.o abi.o cfg.o mem.o ssa.o alias.o load.o \
- copy.o fold.o simpl.o live.o spill.o rega.o emit.o
+ copy.o fold.o gvn.o gcm.o ins.o reassoc.o simpl.o live.o \
+ spill.o rega.o emit.o
AMD64OBJ = amd64/targ.o amd64/sysv.o amd64/isel.o amd64/emit.o
ARM64OBJ = arm64/targ.o arm64/abi.o arm64/isel.o arm64/emit.o
RV64OBJ = rv64/targ.o rv64/abi.o rv64/isel.o rv64/emit.o
diff --git a/all.h b/all.h
index 3479d27..546a123 100644
--- a/all.h
+++ b/all.h
@@ -191,6 +191,7 @@ enum {
#define INRANGE(x, l, u) ((unsigned)(x) - l <= u - l) /* linear in x */
#define isstore(o) INRANGE(o, Ostoreb, Ostored)
#define isload(o) INRANGE(o, Oloadsb, Oload)
+#define isalloc(o) INRANGE(o, Oalloc4, Oalloc16)
#define isext(o) INRANGE(o, Oextsb, Oextuw)
#define ispar(o) INRANGE(o, Opar, Opare)
#define isarg(o) INRANGE(o, Oarg, Oargv)
@@ -214,8 +215,14 @@ struct Op {
char *name;
short argcls[2][4];
uint canfold:1;
- uint hasid:1;
- uint idval:1; /* identity value 0/1 */
+ uint hasid:1; /* op identity value? */
+ uint idval:1; /* identity value 0/1 */
+ uint commutes:1; /* commutative op? */
+ uint idemp:1; /* idempotent op? */
+ uint iscmpeq:1; /* cmp eq/ne? */
+ uint iscmplgte:1; /* cmp lt/gt/le/ge? */
+ uint eqval:1; /* 1 for eq; 0 for ne */
+ uint ispinned:1; /* GCM pinned op? */
};
struct Ins {
@@ -230,7 +237,8 @@ struct Phi {
Ref *arg;
Blk **blk;
uint narg;
- int cls;
+ short cls;
+ uint visit:1;
Phi *link;
};
@@ -251,6 +259,7 @@ struct Blk {
Blk *idom;
Blk *dom, *dlink;
+ int domdpth;
Blk **fron;
uint nfron;
@@ -344,6 +353,7 @@ struct Tmp {
Wuw
} width;
int visit;
+ uint gcmbid;
};
struct Con {
@@ -448,6 +458,20 @@ struct Dat {
char isstr;
};
+typedef struct InsLoc InsLoc;
+
+struct InsLoc {
+ uint bid;
+ uint insn;
+};
+
+typedef struct InsMov InsMov;
+
+struct InsMov {
+ InsLoc from;
+ InsLoc to;
+};
+
/* main.c */
extern Target T;
extern char debug['Z'+1];
@@ -488,10 +512,11 @@ int symeq(Sym, Sym);
Ref newcon(Con *, Fn *);
Ref getcon(int64_t, Fn *);
int addcon(Con *, Con *, int);
+int isconbits(Fn *fn, Ref r, int64_t *v);
+int istmpconbits(Fn *fn, Ins *i, int64_t *v);
void salloc(Ref, Ref, Fn *);
void dumpts(BSet *, Tmp *, FILE *);
void runmatch(uchar *, Num *, Ref, Ref *);
-
void bsinit(BSet *, uint);
void bszero(BSet *);
uint bscount(BSet *);
@@ -531,6 +556,8 @@ int sdom(Blk *, Blk *);
int dom(Blk *, Blk *);
void fillfron(Fn *);
void loopiter(Fn *, void (*)(Blk *, Blk *));
+void filldomdpth(Fn *);
+Blk *lca(Blk *, Blk *);
void fillloop(Fn *);
void simpljmp(Fn *);
@@ -550,15 +577,33 @@ int storesz(Ins *);
void loadopt(Fn *);
/* ssa.c */
+void adduse(Tmp *, int, Blk *, ...);
void filluse(Fn *);
void ssa(Fn *);
void ssacheck(Fn *);
/* copy.c */
-void copy(Fn *);
+int iswu1(Fn *, Ref);
+void narrowpars(Fn *);
+Ref copyref(Fn *, Ins *);
/* fold.c */
-void fold(Fn *);
+Ref foldref(Fn *, Ins *);
+
+/* gvn.c */
+extern Ref con01[2];
+void gvn(Fn *);
+
+/* gcm.c */
+int isfixed(Fn *, Ins *);
+void gcm(Fn *);
+
+/* ins.c */
+void movins(Fn *, InsMov *, uint, int);
+void fixub4d(Fn *);
+
+/* reassoc.c */
+void reassoc(Fn *);
/* simpl.c */
void simpl(Fn *);
diff --git a/cfg.c b/cfg.c
index 406c307..94d4d53 100644
--- a/cfg.c
+++ b/cfg.c
@@ -269,6 +269,49 @@ multloop(Blk *hd, Blk *b)
b->loop *= 10;
}
+void
+filldomdpth(Fn *fn)
+{
+ Blk *b, *dom;
+ uint dpth;
+
+ for (b=fn->start; b; b=b->link)
+ b->domdpth = -1;
+
+ fn->start->domdpth = 0;
+
+ for (b=fn->start; b; b=b->link) {
+ if (b->domdpth != -1)
+ continue;
+ dpth = 1;
+ for (dom = b->idom; dom->domdpth == -1; dom = dom->idom)
+ dpth++;
+ dpth += dom->domdpth;
+ b->domdpth = dpth;
+ for (dom = b->idom; dom->domdpth == -1; dom = dom->idom)
+ dom->domdpth = --dpth;
+ }
+}
+
+/* least common ancestor in dom tree */
+Blk *
+lca(Blk *b1, Blk *b2)
+{
+ if (!b1)
+ return b2;
+ if (!b2)
+ return b1;
+ while (b1->domdpth > b2->domdpth)
+ b1 = b1->idom;
+ while (b2->domdpth > b1->domdpth)
+ b2 = b2->idom;
+ while (b1 != b2) {
+ b1 = b1->idom;
+ b2 = b2->idom;
+ }
+ return b1;
+}
+
void
fillloop(Fn *fn)
{
diff --git a/copy.c b/copy.c
index da3987f..d794aa9 100644
--- a/copy.c
+++ b/copy.c
@@ -1,217 +1,346 @@
#include "all.h"
-static int
-iscon(Ref r, int64_t bits, Fn *fn)
+static uint
+u64_wbits(uint64_t v)
{
- return rtype(r) == RCon
- && fn->con[r.val].type == CBits
- && fn->con[r.val].bits.i == bits;
+ uint n;
+
+ n = 0;
+ if (v >> 32) { n += 32; v >>= 32; }
+ if (v >> 16) { n += 16; v >>= 16; }
+ if (v >> 8) { n += 8; v >>= 8; }
+ if (v >> 4) { n += 4; v >>= 4; }
+ if (v >> 2) { n += 2; v >>= 2; }
+ if (v >> 1) { n += 1; v >>= 1; }
+ return n+v;
}
static int
-iscopy(Ins *i, Ref r, Fn *fn)
+EXTSIGNED[] = { /*extsb*/1, /*extub*/0, /*extsh*/1, /*extuh*/0, /*extsw*/1, /*extuw*/0 };
+
+static uint
+EXTMAXW[] = { /*extsb*/7, /*extub*/8, /*extsh*/15, /*extuh*/16, /*extsw*/31, /*extuw*/32 };
+
+static uint
+EXTW[] = { /*extsb*/8, /*extub*/8, /*extsh*/16, /*extuh*/16, /*extsw*/32, /*extuw*/32 };
+
+static uint
+STW[] = { /*storeb*/8, /*storeh*/16, /*storew*/32, /*storel*/64, /*stores*/32, /*stored*/64 };
+
+/* is the ref used only as a narrow value? */
+static int
+usewidthle(Fn *fn, Ref r, uint wbits)
{
- static bits extcpy[] = {
- [WFull] = 0,
- [Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
- [Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw),
- [Wsh] = BIT(Wsh) | BIT(Wsw),
- [Wuh] = BIT(Wuh) | BIT(Wuw),
- [Wsw] = BIT(Wsw),
- [Wuw] = BIT(Wuw),
- };
- Op *op;
- bits b;
Tmp *t;
+ Use *u;
+ Phi *p;
+ int b;
+ Ins *i;
+ Ref rc;
+ int64_t v;
- if (i->op == Ocopy)
+ if (isconbits(fn, r, &v))
+ if (u64_wbits(v) <= wbits)
return 1;
- op = &optab[i->op];
- if (op->hasid && KBASE(i->cls) == 0)
- return iscon(i->arg[1], op->idval, fn);
- if (!isext(i->op) || rtype(r) != RTmp)
+ if (rtype(r) != RTmp)
return 0;
- if (i->op == Oextsw || i->op == Oextuw)
- if (i->cls == Kw)
- return 1;
-
t = &fn->tmp[r.val];
- assert(KBASE(t->cls) == 0);
- if (i->cls == Kl && t->cls == Kw)
+ for (u = t->use; u < &t->use[t->nuse]; u++) {
+ switch (u->type) {
+ case UPhi:
+ p = u->u.phi;
+ if (p->visit)
+ continue;
+ p->visit = 1;
+ b = usewidthle(fn, p->to, wbits);
+ p->visit = 0;
+ if (b)
+ continue;
+ break;
+ case UIns:
+ i = u->u.ins;
+ assert(i != 0);
+ if (i->op == Ocopy)
+ if (usewidthle(fn, i->to, wbits))
+ continue;
+ if (isext(i->op)) {
+ if (EXTW[i->op - Oextsb] <= wbits)
+ continue;
+ else
+ if (usewidthle(fn, i->to, wbits))
+ continue;;
+ }
+ if (i->op == Oand) {
+ if (req(r, i->arg[0]))
+ rc = i->arg[1];
+ else {
+ assert(req(r, i->arg[1]));
+ rc = i->arg[0];
+ }
+ if (isconbits(fn, rc, &v))
+ if (u64_wbits(v) <= wbits)
+ continue;
+ break;
+ }
+ if (isstore(i->op))
+ if (req(r, i->arg[1]))
+ if (STW[i->op - Ostoreb] > wbits)
+ continue;
+ break;
+ default:
+ break;
+ }
return 0;
- b = extcpy[t->width];
- return (BIT(Wsb + (i->op-Oextsb)) & b) != 0;
+ }
+ return 1;
}
-static Ref
-copyof(Ref r, Ref *cpy)
+static Phi*
+findphi(Fn *fn, uint bid, Ref to)
{
- if (rtype(r) == RTmp && !req(cpy[r.val], R))
- return cpy[r.val];
- return r;
+ Phi *p;
+ for (p = fn->rpo[bid]->phi; p; p = p->link)
+ if (req(p->to, to))
+ break;
+ assert(p);
+ return p;
}
-/* detects a cluster of phis/copies redundant with 'r';
- * the algorithm is inspired by Section 3.2 of "Simple
- * and Efficient SSA Construction" by Braun M. et al.
- */
-static void
-phisimpl(Phi *p, Ref r, Ref *cpy, Use ***pstk, BSet *ts, BSet *as, Fn *fn)
+static uint
+uint_min(uint v1, uint v2)
{
- Use **stk, *u, *u1;
- uint nstk, a;
- int t;
- Ref r1;
- Phi *p0;
-
- bszero(ts);
- bszero(as);
- p0 = &(Phi){.narg = 0};
- stk = *pstk;
- nstk = 1;
- stk[0] = &(Use){.type = UPhi, .u.phi = p};
- while (nstk) {
- u = stk[--nstk];
- if (u->type == UIns && iscopy(u->u.ins, r, fn)) {
- p = p0;
- t = u->u.ins->to.val;
- }
- else if (u->type == UPhi) {
- p = u->u.phi;
- t = p->to.val;
- }
- else
- continue;
- if (bshas(ts, t))
- continue;
- bsset(ts, t);
- for (a=0; a<p->narg; a++) {
- r1 = copyof(p->arg[a], cpy);
- if (req(r1, r))
- continue;
- if (rtype(r1) != RTmp)
- return;
- bsset(as, r1.val);
+ return v1 < v2 ? v1 : v2;
+}
+
+/* is the ref def a narrow value? */
+static int
+defwidthle(Fn *fn, Ref r, uint wbits)
+{
+ Tmp *t;
+ Phi *p;
+ Ins *i;
+ uint n;
+ int64_t v;
+ int x;
+
+ if (isconbits(fn, r, &v))
+ if (u64_wbits(v) <= wbits)
+ return 1;
+ if (rtype(r) != RTmp)
+ return 0;
+ t = &fn->tmp[r.val];
+ if (t->cls != Kw)
+ return 0;
+ i = t->def;
+ if (i == 0) {
+ /* phi def */
+ p = findphi(fn, t->bid, r);
+ if (p->visit)
+ return 1;
+ p->visit = 1;
+ for (n = 0; n < p->narg; n++)
+ if (!defwidthle(fn, p->arg[n], wbits)) {
+ p->visit = 0;
+ return 0;
+ }
+ p->visit = 0;
+ return 1;
+ }
+ /* ins def */
+ if (i->op == Ocopy)
+ return defwidthle(fn, i->arg[0], wbits);
+ if (i->op == Oshr || i->op == Osar) {
+ if (isconbits(fn, i->arg[1], &v))
+ if (0 < v && v <= 32) {
+ if (i->op == Oshr && 32-v <= wbits)
+ return 1;
+ if (0 <= v && v < 32 && wbits < 32)
+ return defwidthle(fn, i->arg[0], uint_min((i->op == Osar ? 31 : 32), wbits+v));
}
- u = fn->tmp[t].use;
- u1 = &u[fn->tmp[t].nuse];
- vgrow(pstk, nstk+(u1-u));
- stk = *pstk;
- for (; u<u1; u++)
- stk[nstk++] = u;
+ return defwidthle(fn, i->arg[0], wbits);
+ }
+ if (iscmp(i->op, &x, &x))
+ return wbits >= 1;
+ if (i->op == Oand)
+ return defwidthle(fn, i->arg[0], wbits) || defwidthle(fn, i->arg[1], wbits);
+ if (i->op == Oor || i->op == Oxor)
+ return defwidthle(fn, i->arg[0], wbits) && defwidthle(fn, i->arg[1], wbits);
+ if (isext(i->op)) {
+ if (EXTSIGNED[i->op - Oextsb])
+ return defwidthle(fn, i->arg[0], uint_min(wbits, EXTMAXW[i->op - Oextsb]));
+ if (EXTW[i->op - Oextsb] <= wbits)
+ return 1;
+ return defwidthle(fn, i->arg[0], wbits);
}
- bsdiff(as, ts);
- if (!bscount(as))
- for (t=0; bsiter(ts, &t); t++)
- cpy[t] = r;
+ return 0;
}
-static void
-subst(Ref *pr, Ref *cpy)
+/* is the ref a boolean - 0, 1 - value? */
+int
+iswu1(Fn *fn, Ref r)
{
- assert(rtype(*pr) != RTmp || !req(cpy[pr->val], R));
- *pr = copyof(*pr, cpy);
+ return defwidthle(fn, r, 1);
}
-/* requires use and dom, breaks use */
+static int
+isnarrowpar(Fn *fn, Ref r)
+{
+ Tmp *t;
+
+ if (rtype(r) != RTmp)
+ return 0;
+ t = &fn->tmp[r.val];
+ if (t->bid != fn->start->id || t->def == 0)
+ return 0;
+ return ispar(t->def->op);
+}
+
+/* Insert extub/extuh instructions in start for pars used only narrowly */
+/* needs use; breaks use */
void
-copy(Fn *fn)
+narrowpars(Fn *fn)
{
- BSet ts[1], as[1];
- Use **stk;
- Phi *p, **pp;
- Ins *i;
Blk *b;
- uint n, a, eq;
- Ref *cpy, r, r1;
- int t;
-
- bsinit(ts, fn->ntmp);
- bsinit(as, fn->ntmp);
- cpy = emalloc(fn->ntmp * sizeof cpy[0]);
- stk = vnew(10, sizeof stk[0], PHeap);
-
- /* 1. build the copy-of map */
- for (n=0; n<fn->nblk; n++) {
- b = fn->rpo[n];
- for (p=b->phi; p; p=p->link) {
- assert(rtype(p->to) == RTmp);
- if (!req(cpy[p->to.val], R))
- continue;
- eq = 0;
- r = R;
- for (a=0; a<p->narg; a++)
- if (p->blk[a]->id < n) {
- r1 = copyof(p->arg[a], cpy);
- if (req(r, R) || req(r, UNDEF))
- r = r1;
- if (req(r1, r) || req(r1, UNDEF))
- eq++;
- }
- assert(!req(r, R));
- if (rtype(r) == RTmp
- && !dom(fn->rpo[fn->tmp[r.val].bid], b))
- cpy[p->to.val] = p->to;
- else if (eq == p->narg)
- cpy[p->to.val] = r;
- else {
- cpy[p->to.val] = p->to;
- phisimpl(p, r, cpy, &stk, ts, as, fn);
- }
- }
- for (i=b->ins; i<&b->ins[b->nins]; i++) {
- assert(rtype(i->to) <= RTmp);
- if (!req(cpy[i->to.val], R))
- continue;
- r = copyof(i->arg[0], cpy);
- if (iscopy(i, r, fn))
- cpy[i->to.val] = r;
- else
- cpy[i->to.val] = i->to;
+ int loop;
+ Ins *i, *ins;
+ uint npar, nins;
+ enum O extop;
+ Ref r;
+
+ /* only useful for functions with loops */
+ loop = 0;
+ for (b = fn->start; b; b = b->link)
+ if (b->loop > 1) {
+ loop = 1;
+ break;
}
+ if (!loop)
+ return;
+
+ b = fn->start;
+ npar = 0;
+
+ for (i = b->ins; i < &b->ins[b->nins]; i++) {
+ if (!ispar(i->op))
+ break;
+ npar++;
}
- /* 2. remove redundant phis/copies
- * and rewrite their uses */
- for (b=fn->start; b; b=b->link) {
- for (pp=&b->phi; (p=*pp);) {
- r = cpy[p->to.val];
- if (!req(r, p->to)) {
- *pp = p->link;
- continue;
- }
- for (a=0; a<p->narg; a++)
- subst(&p->arg[a], cpy);
- pp=&p->link;
- }
- for (i=b->ins; i<&b->ins[b->nins]; i++) {
- r = cpy[i->to.val];
- if (!req(r, i->to)) {
- *i = (Ins){.op = Onop};
- continue;
+ if (npar == 0)
+ return;
+
+ nins = b->nins + npar;
+ ins = alloc(nins * sizeof ins[0]);
+ memcpy(ins, b->ins, npar * sizeof ins[0]);
+ memcpy(ins + 2*npar, b->ins + npar, (b->nins - npar) * sizeof ins[0]);
+ b->ins = ins;
+ b->nins = nins;
+
+ for (i = b->ins; i < &b->ins[b->nins]; i++) {
+ if (!ispar(i->op))
+ break;
+ extop = Onop;
+ if (i->cls == Kw)
+ if (usewidthle(fn, i->to, 16)) {
+ if (usewidthle(fn, i->to, 8))
+ extop = Oextub;
+ else
+ extop = Oextuh;
}
- subst(&i->arg[0], cpy);
- subst(&i->arg[1], cpy);
+ if (extop == Onop) {
+ *(i+npar) = (Ins) {.op = Onop};
+ } else {
+ r = newtmp("vw", i->cls, fn);
+ *(i+npar) = (Ins) {.op = extop, .cls = i->cls, .to = i->to, .arg = {r}};
+ i->to = r;
}
- subst(&b->jmp.arg, cpy);
}
+}
- if (debug['C']) {
- fprintf(stderr, "\n> Copy information:");
- for (t=Tmp0; t<fn->ntmp; t++) {
- if (req(cpy[t], R)) {
- fprintf(stderr, "\n%10s not seen!",
- fn->tmp[t].name);
- }
- else if (!req(cpy[t], TMP(t))) {
- fprintf(stderr, "\n%10s copy of ",
- fn->tmp[t].name);
- printref(cpy[t], fn, stderr);
- }
+/* used by GVN */
+Ref
+copyref(Fn *fn, Ins *i)
+{
+ static bits extcpy[] = {
+ [WFull] = 0,
+ [Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
+ [Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw),
+ [Wsh] = BIT(Wsh) | BIT(Wsw),
+ [Wuh] = BIT(Wuh) | BIT(Wuw),
+ [Wsw] = BIT(Wsw),
+ [Wuw] = BIT(Wuw),
+ };
+ bits b;
+ Tmp *t;
+ Ins *i2;
+ int64_t v, v1;
+
+ if (i->op == Ocopy)
+ return i->arg[0];
+
+ if (optab[i->op].hasid)
+ if (KBASE(i->cls) == 0) /* integer only */
+ if (isconbits(fn, i->arg[1], &v))
+ if (v == optab[i->op].idval)
+ if (!optab[i->op].iscmpeq || iswu1(fn, i->arg[0]))
+ return i->arg[0];
+
+ if (optab[i->op].idemp)
+ if (req(i->arg[0], i->arg[1]))
+ return i->arg[1];
+
+ if (optab[i->op].iscmpeq || optab[i->op].iscmplgte) {
+ if (req(i->arg[0], i->arg[1]))
+ return con01[optab[i->op].eqval];
+ /* non-identical RCon's - assumes RCon's are deduped */
+ if (optab[i->op].iscmpeq)
+ if (rtype(i->arg[0]) == RCon && rtype(i->arg[1]) == RCon)
+ return con01[1-optab[i->op].eqval];
+ }
+
+ /* redundant and mask */
+ if (i->op == Oand)
+ if (isconbits(fn, i->arg[1], &v))
+ if (((v+1) & v) == 0) /* v == 2^N-1 */
+ if (defwidthle(fn, i->arg[0], u64_wbits(v)))
+ return i->arg[0];
+
+ /* add/sub same conbits pair */
+ if (i->op == Oadd || i->op == Osub)
+ if (KBASE(i->cls) == 0)
+ if (istmpconbits(fn, i, &v)) {
+ i2 = fn->tmp[i->arg[0].val].def;
+ if (i2 && (i2->op == Oadd || i2->op == Osub))
+ if (istmpconbits(fn, i2, &v1))
+ if (i->cls == fn->tmp[i2->arg[0].val].cls) { // TODO - check this, here and elsewhere
+ if (i->op == i2->op) {
+ if ((i->cls == Kw && (int32_t)v == -(int32_t)v1)
+ || (i->cls == Kl && v == -v1))
+ return i2->arg[0];
+ } else if (v == v1)
+ return i2->arg[0];
}
- fprintf(stderr, "\n\n> After copy elimination:\n");
- printfn(fn, stderr);
}
- vfree(stk);
- free(cpy);
+
+ if (!isext(i->op) || rtype(i->arg[0]) != RTmp)
+ return R;
+ if (i->op == Oextsw || i->op == Oextuw)
+ if (i->cls == Kw)
+ return i->arg[0];
+
+ t = &fn->tmp[i->arg[0].val];
+ assert(KBASE(t->cls) == 0);
+ if (i->cls == Kl && t->cls == Kw)
+ return R;
+ b = extcpy[t->width];
+ if ((BIT(Wsb + (i->op-Oextsb)) & b) != 0)
+ return i->arg[0];
+
+ if (!isnarrowpar(fn, i->arg[0]))
+ if (usewidthle(fn, i->to, EXTW[i->op - Oextsb]))
+ return i->arg[0];
+ if (defwidthle(fn, i->arg[0], EXTMAXW[i->op - Oextsb]))
+ return i->arg[0];
+
+ return R;
}
diff --git a/fold.c b/fold.c
index 3ff0bc7..87e5824 100644
--- a/fold.c
+++ b/fold.c
@@ -1,22 +1,6 @@
#include "all.h"
-enum {
- Bot = -1, /* lattice bottom */
- Top = 0, /* lattice top (matches UNDEF) */
-};
-
-typedef struct Edge Edge;
-
-struct Edge {
- int dest;
- int dead;
- Edge *work;
-};
-
-static int *val;
-static Edge *flowrk, (*edge)[2];
-static Use **usewrk;
-static uint nuse;
+/* boring folding code */
static int
iscon(Con *c, int w, uint64_t k)
@@ -29,304 +13,6 @@ iscon(Con *c, int w, uint64_t k)
return (uint32_t)c->bits.i == (uint32_t)k;
}
-static int
-latval(Ref r)
-{
- switch (rtype(r)) {
- case RTmp:
- return val[r.val];
- case RCon:
- return r.val;
- default:
- die("unreachable");
- }
-}
-
-static int
-latmerge(int v, int m)
-{
- return m == Top ? v : (v == Top || v == m) ? m : Bot;
-}
-
-static void
-update(int t, int m, Fn *fn)
-{
- Tmp *tmp;
- uint u;
-
- m = latmerge(val[t], m);
- if (m != val[t]) {
- tmp = &fn->tmp[t];
- for (u=0; u<tmp->nuse; u++) {
- vgrow(&usewrk, ++nuse);
- usewrk[nuse-1] = &tmp->use[u];
- }
- val[t] = m;
- }
-}
-
-static int
-deadedge(int s, int d)
-{
- Edge *e;
-
- e = edge[s];
- if (e[0].dest == d && !e[0].dead)
- return 0;
- if (e[1].dest == d && !e[1].dead)
- return 0;
- return 1;
-}
-
-static void
-visitphi(Phi *p, int n, Fn *fn)
-{
- int v;
- uint a;
-
- v = Top;
- for (a=0; a<p->narg; a++)
- if (!deadedge(p->blk[a]->id, n))
- v = latmerge(v, latval(p->arg[a]));
- update(p->to.val, v, fn);
-}
-
-static int opfold(int, int, Con *, Con *, Fn *);
-
-static void
-visitins(Ins *i, Fn *fn)
-{
- int v, l, r;
-
- if (rtype(i->to) != RTmp)
- return;
- if (optab[i->op].canfold) {
- l = latval(i->arg[0]);
- if (!req(i->arg[1], R))
- r = latval(i->arg[1]);
- else
- r = CON_Z.val;
- if (l == Bot || r == Bot)
- v = Bot;
- else if (l == Top || r == Top)
- v = Top;
- else
- v = opfold(i->op, i->cls, &fn->con[l], &fn->con[r], fn);
- } else
- v = Bot;
- /* fprintf(stderr, "\nvisiting %s (%p)", optab[i->op].name, (void *)i); */
- update(i->to.val, v, fn);
-}
-
-static void
-visitjmp(Blk *b, int n, Fn *fn)
-{
- int l;
-
- switch (b->jmp.type) {
- case Jjnz:
- l = latval(b->jmp.arg);
- if (l == Bot) {
- edge[n][1].work = flowrk;
- edge[n][0].work = &edge[n][1];
- flowrk = &edge[n][0];
- }
- else if (iscon(&fn->con[l], 0, 0)) {
- assert(edge[n][0].dead);
- edge[n][1].work = flowrk;
- flowrk = &edge[n][1];
- }
- else {
- assert(edge[n][1].dead);
- edge[n][0].work = flowrk;
- flowrk = &edge[n][0];
- }
- break;
- case Jjmp:
- edge[n][0].work = flowrk;
- flowrk = &edge[n][0];
- break;
- case Jhlt:
- break;
- default:
- if (isret(b->jmp.type))
- break;
- die("unreachable");
- }
-}
-
-static void
-initedge(Edge *e, Blk *s)
-{
- if (s)
- e->dest = s->id;
- else
- e->dest = -1;
- e->dead = 1;
- e->work = 0;
-}
-
-static int
-renref(Ref *r)
-{
- int l;
-
- if (rtype(*r) == RTmp)
- if ((l=val[r->val]) != Bot) {
- *r = CON(l);
- return 1;
- }
- return 0;
-}
-
-/* require rpo, use, pred */
-void
-fold(Fn *fn)
-{
- Edge *e, start;
- Use *u;
- Blk *b, **pb;
- Phi *p, **pp;
- Ins *i;
- int t, d;
- uint n, a;
-
- val = emalloc(fn->ntmp * sizeof val[0]);
- edge = emalloc(fn->nblk * sizeof edge[0]);
- usewrk = vnew(0, sizeof usewrk[0], PHeap);
-
- for (t=0; t<fn->ntmp; t++)
- val[t] = Top;
- for (n=0; n<fn->nblk; n++) {
- b = fn->rpo[n];
- b->visit = 0;
- initedge(&edge[n][0], b->s1);
- initedge(&edge[n][1], b->s2);
- }
- initedge(&start, fn->start);
- flowrk = &start;
- nuse = 0;
-
- /* 1. find out constants and dead cfg edges */
- for (;;) {
- e = flowrk;
- if (e) {
- flowrk = e->work;
- e->work = 0;
- if (e->dest == -1 || !e->dead)
- continue;
- e->dead = 0;
- n = e->dest;
- b = fn->rpo[n];
- for (p=b->phi; p; p=p->link)
- visitphi(p, n, fn);
- if (b->visit == 0) {
- for (i=b->ins; i<&b->ins[b->nins]; i++)
- visitins(i, fn);
- visitjmp(b, n, fn);
- }
- b->visit++;
- assert(b->jmp.type != Jjmp
- || !edge[n][0].dead
- || flowrk == &edge[n][0]);
- }
- else if (nuse) {
- u = usewrk[--nuse];
- n = u->bid;
- b = fn->rpo[n];
- if (b->visit == 0)
- continue;
- switch (u->type) {
- case UPhi:
- visitphi(u->u.phi, u->bid, fn);
- break;
- case UIns:
- visitins(u->u.ins, fn);
- break;
- case UJmp:
- visitjmp(b, n, fn);
- break;
- default:
- die("unreachable");
- }
- }
- else
- break;
- }
-
- if (debug['F']) {
- fprintf(stderr, "\n> SCCP findings:");
- for (t=Tmp0; t<fn->ntmp; t++) {
- if (val[t] == Bot)
- continue;
- fprintf(stderr, "\n%10s: ", fn->tmp[t].name);
- if (val[t] == Top)
- fprintf(stderr, "Top");
- else
- printref(CON(val[t]), fn, stderr);
- }
- fprintf(stderr, "\n dead code: ");
- }
-
- /* 2. trim dead code, replace constants */
- d = 0;
- for (pb=&fn->start; (b=*pb);) {
- if (b->visit == 0) {
- d = 1;
- if (debug['F'])
- fprintf(stderr, "%s ", b->name);
- edgedel(b, &b->s1);
- edgedel(b, &b->s2);
- *pb = b->link;
- continue;
- }
- for (pp=&b->phi; (p=*pp);)
- if (val[p->to.val] != Bot)
- *pp = p->link;
- else {
- for (a=0; a<p->narg; a++)
- if (!deadedge(p->blk[a]->id, b->id))
- renref(&p->arg[a]);
- pp = &p->link;
- }
- for (i=b->ins; i<&b->ins[b->nins]; i++)
- if (renref(&i->to))
- *i = (Ins){.op = Onop};
- else {
- for (n=0; n<2; n++)
- renref(&i->arg[n]);
- if (isstore(i->op))
- if (req(i->arg[0], UNDEF))
- *i = (Ins){.op = Onop};
- }
- renref(&b->jmp.arg);
- if (b->jmp.type == Jjnz && rtype(b->jmp.arg) == RCon) {
- if (iscon(&fn->con[b->jmp.arg.val], 0, 0)) {
- edgedel(b, &b->s1);
- b->s1 = b->s2;
- b->s2 = 0;
- } else
- edgedel(b, &b->s2);
- b->jmp.type = Jjmp;
- b->jmp.arg = R;
- }
- pb = &b->link;
- }
-
- if (debug['F']) {
- if (!d)
- fprintf(stderr, "(none)");
- fprintf(stderr, "\n\n> After constant folding:\n");
- printfn(fn, stderr);
- }
-
- free(val);
- free(edge);
- vfree(usewrk);
-}
-
-/* boring folding code */
-
static int
foldint(Con *res, int op, int w, Con *cl, Con *cr)
{
@@ -516,7 +202,7 @@ foldflt(Con *res, int op, int w, Con *cl, Con *cr)
}
}
-static int
+static Ref
opfold(int op, int cls, Con *cl, Con *cr, Fn *fn)
{
Ref r;
@@ -524,12 +210,37 @@ opfold(int op, int cls, Con *cl, Con *cr, Fn *fn)
if (cls == Kw || cls == Kl) {
if (foldint(&c, op, cls == Kl, cl, cr))
- return Bot;
+ return R;
} else
foldflt(&c, op, cls == Kd, cl, cr);
if (!KWIDE(cls))
c.bits.i &= 0xffffffff;
r = newcon(&c, fn);
assert(!(cls == Ks || cls == Kd) || c.flt);
- return r.val;
+ return r;
+}
+
+/* used by GVN */
+Ref
+foldref(Fn *fn, Ins *i)
+{
+ Ref rr;
+ Con *cl, *cr;
+
+ if (rtype(i->to) != RTmp)
+ return R;
+ if (optab[i->op].canfold) {
+ if (rtype(i->arg[0]) != RCon)
+ return R;
+ cl = &fn->con[i->arg[0].val];
+ rr = i->arg[1];
+ if (req(rr, R))
+ rr = CON_Z;
+ if (rtype(rr) != RCon)
+ return R;
+ cr = &fn->con[rr.val];
+
+ return opfold(i->op, i->cls, cl, cr, fn);
+ }
+ return R;
}
diff --git a/gcm.c b/gcm.c
new file mode 100644
index 0000000..ca4dd4b
--- /dev/null
+++ b/gcm.c
@@ -0,0 +1,352 @@
+#include "all.h"
+
+#define NOBID (-1u)
+
+/* ins can trap at runtime */
+static int
+istrapping(Fn *fn, Ins *i)
+{
+ int64_t v;
+
+ if (KBASE(i->cls) == 0)
+ if (INRANGE(i->op, Odiv, Ourem))
+ if (!isconbits(fn, i->arg[1], &v) || v == 0)
+ return 1;
+ return 0;
+}
+
+/* fixed ins that can be eliminated if unused */
+static int
+canelim(Fn *fn, Ins *i)
+{
+ return isload(i->op) || isalloc(i->op) || istrapping(fn, i);
+}
+
+/* ins must stay in def blk */
+int
+isfixed(Fn *fn, Ins *i)
+{
+ return optab[i->op].ispinned || istrapping(fn, i);
+}
+
+static uint earlyins(Fn *, Blk *, Ins *);
+
+/* return earlybid of ref def ins */
+static uint
+schedearly(Fn *fn, Ref r)
+{
+ Tmp *t;
+ Blk *b;
+
+ if (rtype(r) != RTmp)
+ return 0 /* root/start */;
+
+ t = &fn->tmp[r.val];
+ if (t->gcmbid != NOBID)
+ return t->gcmbid; /* already visited/visiting */
+
+ b = fn->rpo[t->bid];
+ if (t->def) {
+ /* def is an ins */
+ assert(b->ins <= t->def && t->def < &b->ins[b->nins]);
+ t->gcmbid = 0; /* mark visiting root/start blk */
+ t->gcmbid = earlyins(fn, b, t->def); /* schedule ins input defs */
+ } else {
+ /* def is a phi - it stays in its def blk */
+ t->gcmbid = t->bid;
+ }
+
+ return t->gcmbid;
+}
+
+/* return deepest domdpth bid of arg defs */
+static uint
+earlyins(Fn *fn, Blk *b, Ins* i)
+{
+ uint earlybid, arg1earlybid;
+
+ earlybid = schedearly(fn, i->arg[0]);
+ assert(earlybid != NOBID);
+ arg1earlybid = schedearly(fn, i->arg[1]);
+ assert(arg1earlybid != NOBID);
+ if (fn->rpo[earlybid]->domdpth < fn->rpo[arg1earlybid]->domdpth) {
+ assert(dom(fn->rpo[earlybid], fn->rpo[arg1earlybid]));
+ earlybid = arg1earlybid;
+ }
+
+ /* fixed ins remain in their defining blk */
+ return isfixed(fn, i) ? b->id : earlybid;
+}
+
+static void
+earlyblk(Fn *fn, uint bid)
+{
+ Blk *b;
+ Phi *p;
+ Ins *i;
+ uint n;
+
+ b = fn->rpo[bid];
+ for (p = b->phi; p; p = p->link)
+ for (n = 0; n < p->narg; n++)
+ schedearly(fn, p->arg[n]);
+ for (i = b->ins; i < &b->ins[b->nins]; i++)
+ if (isfixed(fn, i))
+ earlyins(fn, b, i);
+ schedearly(fn, b->jmp.arg);
+}
+
+/* least common ancestor in dom tree */
+static uint
+lcabid(Fn *fn, uint bid1, uint bid2)
+{
+ Blk *b;
+
+ if (bid1 == NOBID)
+ return bid2;
+ if (bid2 == NOBID)
+ return bid1;
+
+ b = lca(fn->rpo[bid1], fn->rpo[bid2]);
+ assert(b);
+ return b->id;
+}
+
+static uint
+bestbid(Fn *fn, uint earlybid, uint latebid)
+{
+ Blk *curb, *earlyb, *bestb;
+
+ if (latebid == NOBID)
+ return NOBID; /* unused */
+
+ assert(earlybid != NOBID);
+
+ earlyb = fn->rpo[earlybid];
+ bestb = curb = fn->rpo[latebid];
+ assert(dom(earlyb, curb));
+
+ while (curb != earlyb) {
+ curb = curb->idom;
+ if (curb->loop < bestb->loop)
+ bestb = curb;
+ }
+ return bestb->id;
+}
+
+static uint lateins(Fn *, Blk *, Ins *, Ref r);
+static uint latephi(Fn *, Phi *, Ref r);
+static uint latejmp(Blk *, Ref r);
+
+/* return lca bid of ref uses */
+static uint
+schedlate(Fn *fn, Ref r)
+{
+ Tmp *t;
+ Blk *b;
+ Use *u;
+ uint earlybid;
+ uint latebid;
+ uint uselatebid;
+
+ if (rtype(r) != RTmp)
+ return NOBID;
+
+ t = &fn->tmp[r.val];
+ if (t->visit)
+ return t->gcmbid; /* already visited/visiting */
+
+ t->visit = 1; /* mark visiting/visited */
+ earlybid = t->gcmbid;
+ if (earlybid == NOBID)
+ return NOBID; /* not used */
+ t->gcmbid = t->bid; /* t->gcmbid is now latebid */
+
+ latebid = NOBID;
+ for (u = t->use; u < &t->use[t->nuse]; u++) {
+ assert(u->bid < fn->nblk);
+ b = fn->rpo[u->bid];
+ switch (u->type) {
+ case UXXX:
+ die("unreachable");
+ break;
+ case UPhi:
+ uselatebid = latephi(fn, u->u.phi, r);
+ break;
+ case UIns:
+ uselatebid = lateins(fn, b, u->u.ins, r);
+ break;
+ case UJmp:
+ uselatebid = latejmp(b, r);
+ break;
+ }
+ latebid = lcabid(fn, latebid, uselatebid);
+ }
+
+ /* phis stay in their def blk */
+ if (t->def) {
+ /* allow elimination of unused load/alloc/trapping ins */
+ if (latebid == NOBID && canelim(fn, t->def))
+ t->gcmbid = NOBID;
+ /* ... otherwise fixed ins stay in defining blk */
+ else if(!isfixed(fn, t->def))
+ t->gcmbid = bestbid(fn, earlybid, latebid);
+ }
+
+ return t->gcmbid;
+}
+
+/* return lca bid of uses, or NOBID if no active uses */
+static uint
+lateins(Fn *fn, Blk *b, Ins *i, Ref r)
+{
+ uint latebid;
+
+ assert(i->op == Onop || req(i->arg[0], r) || req(i->arg[1], r));
+ if (i->op == Onop)
+ return NOBID; /* already eliminated */
+
+ assert(b->ins <= i && i < &b->ins[b->nins]);
+
+ latebid = schedlate(fn, i->to);
+ /* allow elimination of unused load/alloc/trapping ins */
+ if (latebid == NOBID)
+ if (canelim(fn, i))
+ return NOBID;
+ /* ... otherwise fixed ins stay in defining blk */
+ return isfixed(fn, i) ? b->id : latebid;
+}
+
+static uint
+latephi(Fn *fn, Phi *p, Ref r)
+{
+ uint n;
+ uint latebid;
+
+ if (!p->narg)
+ return NOBID; /* marked as unused */
+
+ latebid = NOBID;
+ for (n = 0; n < p->narg; n++)
+ if (req(p->arg[n], r))
+ latebid = lcabid(fn, latebid, p->blk[n]->id);
+
+ assert(latebid != NOBID);
+ return latebid;
+}
+
+static uint
+latejmp(Blk *b, Ref r)
+{
+ if (req(b->jmp.arg, R))
+ return NOBID;
+ else {
+ assert(req(b->jmp.arg, r));
+ return b->id;
+ }
+}
+
+static void
+lateblk(Fn *fn, uint bid)
+{
+ Blk *b;
+ Phi **pp;
+ Ins *i;
+
+ b = fn->rpo[bid];
+ for (pp = &b->phi; *(pp);)
+ if (schedlate(fn, (*pp)->to) == NOBID) {
+ /* unused */
+ (*pp)->narg = 0; /* mark unused */
+ *pp = (*pp)->link; /* remove phi */
+ } else
+ pp = &(*pp)->link;
+
+ for (i = b->ins; i < &b->ins[b->nins]; i++)
+ if (isfixed(fn, i))
+ lateins(fn, b, i, i->arg[0]);
+}
+
+/* move instructions */
+static void
+sched(Fn *fn)
+{
+ Blk *b, *b2;
+ Ins *i;
+ Tmp *t;
+ InsMov *im;
+ uint nim;
+
+ nim = 0;
+ im = vnew(nim, sizeof im[0], PHeap);
+
+ for (b = fn->start; b; b = b->link) {
+ for (i = b->ins; i < &b->ins[b->nins]; i++) {
+ if (rtype(i->to) != RTmp)
+ continue;
+ if (isfixed(fn, i) && !canelim(fn, i))
+ continue;
+ assert(rtype(i->to) == RTmp);
+ t = &fn->tmp[i->to.val];
+ if (t->gcmbid == NOBID)
+ *i = (Ins){.op = Onop};
+ else if (t->gcmbid != b->id) {
+ b2 = fn->rpo[t->gcmbid];
+ vgrow(&im, ++nim);
+ im[nim-1] = (InsMov){
+ .from = (InsLoc){
+ .bid = b->id,
+ .insn = i - b->ins,
+ },
+ .to = (InsLoc){
+ .bid = t->gcmbid,
+ .insn = b->id < t->gcmbid ? 0 : b2->nins,
+ }
+ };
+ }
+ }
+ }
+
+ movins(fn, im, nim, 1/*del*/);
+
+ vfree(im);
+}
+
+static void
+cleartmps(Fn *fn)
+{
+ Tmp *t;
+
+ for (t=&fn->tmp[Tmp0]; t < &fn->tmp[fn->ntmp]; t++) {
+ t->visit = 0;
+ t->gcmbid = NOBID;
+ }
+}
+
+/* https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf */
+/* require use dom */
+/* maintains rpo pred dom */
+/* breaks use */
+void
+gcm(Fn *fn)
+{
+ uint bid;
+
+ filldomdpth(fn);
+ fillloop(fn);
+
+ cleartmps(fn);
+ for (bid=0; bid<fn->nblk; bid++)
+ earlyblk(fn, bid);
+ for (bid=0; bid<fn->nblk; bid++)
+ lateblk(fn, bid);
+ sched(fn);
+ cleartmps(fn);
+ filluse(fn);
+ fixub4d(fn);
+
+ if (debug['G']) {
+ fprintf(stderr, "\n> After GCM:\n");
+ printfn(fn, stderr);
+ }
+}
diff --git a/gvn.c b/gvn.c
new file mode 100644
index 0000000..3af3517
--- /dev/null
+++ b/gvn.c
@@ -0,0 +1,592 @@
+#include "all.h"
+
+/* blk unreachable? */
+static int
+isdead(Fn *fn, Blk *b) {
+ if (b == fn->start)
+ return 0;
+ if (b->npred == 0)
+ return 1;
+ if (b->npred == 1 && b->pred[0] == b)
+ return 1;
+ return 0;
+}
+
+/* literal constants 0, 1 */
+Ref con01[2];
+
+static inline uint
+mix(uint x0, uint x1)
+{
+ return x0 + 17*x1;
+}
+
+static inline uint
+rhash(Ref r)
+{
+ return mix(r.type, r.val);
+}
+
+static uint
+ihash(Ins *i)
+{
+ uint a0h, a1h, ah, h;
+
+ a0h = rhash(i->arg[0]);
+ a1h = rhash(i->arg[1]);
+ ah = mix(a0h, a1h);
+ h = mix(i->cls, i->op);
+ h = mix(h, ah);
+
+ return h;
+
+}
+
+static int
+ieq(Ins *ia, Ins *ib)
+{
+ if (ia->cls == ib->cls)
+ if (ia->op == ib->op)
+ if (req(ia->arg[0], ib->arg[0]))
+ if (req(ia->arg[1], ib->arg[1]))
+ return 1;
+ return 0;
+}
+
+static Ins** gvntbl;
+static uint gvntbln;
+static uint lookupn;
+static uint proben;
+static uint maxproben;
+
+static Ins *
+gvndup(Ins *i, int insert) {
+ uint idx, n;
+ Ins *i2;
+
+ lookupn++;
+
+ idx = ihash(i) % gvntbln;
+ for (n = 1;; n++) {
+ proben++;
+ if (n > maxproben)
+ maxproben = n;
+ i2 = gvntbl[idx];
+ if (!i2)
+ break;
+ if (ieq(i, i2))
+ return i2;
+
+ idx++;
+ if (gvntbln <= idx)
+ idx = 0;
+ }
+ /* not found */
+ if (insert) {
+ gvntbl[idx] = i;
+ }
+ return 0;
+}
+
+static void
+replaceref(Ref *r, Ref r1, Ref r2)
+{
+ if (req(*r, r1))
+ *r = r2;
+}
+
+static void
+replacepuse(Phi *p, Ref r1, Ref r2)
+{
+ Ref *a;
+
+ for (a = p->arg; a < &p->arg[p->narg]; a++)
+ replaceref(a, r1, r2);
+}
+
+static void
+replaceiuse(Ins *i, Ref r1, Ref r2)
+{
+ replaceref(&i->arg[0], r1, r2);
+ replaceref(&i->arg[1], r1, r2);
+}
+
+static void
+replacejuse(Blk* b, Ref r1, Ref r2)
+{
+ replaceref(&b->jmp.arg, r1, r2);
+}
+
+static void
+replaceuse(Fn *fn, Use* u, Ref r1, Ref r2)
+{
+ Tmp *t2;
+ Blk *b;
+
+ t2 = rtype(r2) == RTmp ? &fn->tmp[r2.val] : 0;
+ b = fn->rpo[u->bid];
+
+ switch (u->type) {
+ case UXXX:
+ die("unreachable");
+ break;
+ case UPhi:
+ replacepuse(u->u.phi, r1, r2);
+ if (t2)
+ adduse(t2, UPhi, b, u->u.phi);
+ break;
+ case UIns:
+ replaceiuse(u->u.ins, r1, r2);
+ if (t2)
+ adduse(t2, UIns, b, u->u.ins);
+ break;
+ case UJmp:
+ replacejuse(fn->rpo[u->bid], r1, r2);
+ if (t2)
+ adduse(t2, UJmp, b);
+ break;
+ }
+}
+
+static void
+replaceuses(Fn *fn, Ref r1, Ref r2)
+{
+ Tmp *t1;
+ Use *u;
+
+ assert(rtype(r1) == RTmp);
+ t1 = &fn->tmp[r1.val];
+
+ for (u = t1->use; u < &t1->use[t1->nuse]; u++)
+ replaceuse(fn, u, r1, r2);
+
+ t1->nuse = 0;
+}
+
+static void
+rmuse(Fn *fn, Blk *b, uint ty, Ins *i, Phi *p, Ref r, int strict)
+{
+ Tmp *t;
+ Use *u;
+ int found, rm;
+
+ if (rtype(r) != RTmp)
+ return;
+ found = 0;
+ t = &fn->tmp[r.val];
+ for (u = t->use; u < &t->use[t->nuse];) {
+ rm = 0;
+ if (u->type == ty) {
+ switch (ty) {
+ case UXXX:
+ die("unreachable");
+ break;
+ case UIns:
+ assert(p == 0);
+ assert(i);
+ rm = u->u.ins == i;
+ break;
+ case UPhi:
+ assert(i == 0);
+ assert(p);
+ rm = u->u.phi == p;
+ break;
+ case UJmp:
+ assert(i == 0);
+ assert(p == 0);
+ rm = u->bid == b->id;
+ break;
+ default:
+ die("unreachable");
+ break;
+ }
+ }
+ if (rm) {
+ found++;
+ assert(u < &t->use[t->nuse]);
+ assert(u->bid == b->id);
+ /* careful - deleting below iterator */
+ memcpy(u, u+1, ((t->nuse) - ((u+1)-t->use)) * sizeof u[0]);
+ t->nuse--;
+ } else
+ u++;
+ }
+ if (strict)
+ assert(found);
+}
+
+/* sort phi args in blk pred order */
+static void
+phinorm(Blk *b, Phi *p)
+{
+ Blk *pred, *btmp;
+ Ref atmp;
+ uint n, n1;
+
+ assert(b->npred == p->narg);
+ /* bubble sort :) */
+ for (n = 0; n < b->npred; n++) {
+ pred = b->pred[n];
+ if (p->blk[n] != pred) {
+ for (n1 = n+1; n1 < b->npred; n1++)
+ if (p->blk[n1] == pred)
+ break;
+ assert(n1 < b->npred);
+ atmp = p->arg[n1];
+ btmp = p->blk[n1];
+ p->arg[n1] = p->arg[n];
+ p->blk[n1] = p->blk[n];
+ p->arg[n] = atmp;
+ p->blk[n] = btmp;
+ }
+ }
+}
+
+static int
+rcmp(Ref a, Ref b)
+{
+ if (rtype(a) != rtype(b))
+ return rtype(a)-rtype(b);
+ return a.val - b.val;
+}
+
+static int
+phicmp(const void *a, const void *b)
+{
+ Phi *pa, *pb;
+ uint n;
+ int acmp;
+
+ pa = *(Phi**)a;
+ pb = *(Phi**)b;
+ assert(pa->narg == pb->narg);
+ for (n=0; n<pa->narg; n++) {
+ assert(pa->blk[n] == pb->blk[n]);
+ acmp = rcmp(pa->arg[n], pb->arg[n]);
+ if (acmp != 0)
+ return acmp;
+ }
+ return 0;
+}
+
+static int
+phieq(Phi *pa, Phi *pb)
+{
+ uint n;
+
+ assert(pa->narg == pb->narg);
+ for (n=0; n<pa->narg; n++) {
+ assert(pa->blk[n] == pb->blk[n]);
+ if (!req(pa->arg[n], pb->arg[n]))
+ return 0;
+ }
+ return 1;
+}
+
+static void
+killphi(Fn *fn, Blk *b, Phi **pp, Ref r)
+{
+ Phi *p;
+ uint n;
+
+ p = *pp;
+ replaceuses(fn, p->to, r);
+ assert(b->npred == p->narg);
+ for (n = 0; n < p->narg; n++)
+ rmuse(fn, b, UPhi, 0, p, p->arg[n], 0/*!strict TODO dups*/);
+ p->narg = 0; /* mark as unused - TODO should not be necessary with rmuse*/
+ *pp = p->link;
+}
+
+static void
+dedupphis(Fn *fn, Blk *b) {
+ uint n, nphis;
+ Blk *b2;
+ Phi *p, **pp, **phis;
+
+ /* phi "copy" - all args identical */
+ for (pp = &b->phi; *pp;) {
+ p = *pp;
+ assert(p->narg == b->npred);
+ for (n = 0; n < p->narg-1; n++) {
+ if (!req(p->arg[n], p->arg[n+1]))
+ goto Skip;
+ }
+ killphi(fn, b, pp, p->arg[0]);
+ continue;
+ Skip:;
+ pp = &(*pp)->link;
+ }
+
+ /* phi bool 1/0 "copy" */
+ /* TODO - triangle case */
+ if (b->npred == 2)
+ if (b->pred[0]->npred == 1)
+ if (b->pred[1]->npred == 1)
+ if (b->pred[0]->pred[0] == b->pred[1]->pred[0]) {
+ b2 = b->pred[0]->pred[0];
+ assert(b2->jmp.type == Jjnz);
+ if (iswu1(fn, b2->jmp.arg)) {
+ for (pp = &b->phi; *pp;) {
+ p = *pp;
+ assert(p->narg == 2);
+ if (!req(p->arg[0], con01[p->blk[0] == b2->s1]))
+ goto Skip2;
+ if (!req(p->arg[1], con01[p->blk[0] == b2->s2]))
+ goto Skip2;
+ killphi(fn, b, pp, b2->jmp.arg);
+ continue;
+ Skip2:;
+ pp = &(*pp)->link;
+ }
+ }
+ }
+
+ /* redundant phis */
+ if (!b->phi || !b->phi->link)
+ return;
+ nphis = 0;
+ for (p = b->phi; p; p = p->link) {
+ nphis++;
+ phinorm(b, p);
+ }
+ phis = emalloc(nphis * sizeof phis[0]);
+ n = 0;
+ for (p = b->phi; p; p = p->link)
+ phis[n++] = p;
+ qsort(phis, nphis, sizeof phis[0], phicmp);
+ b->phi = phis[0];
+ for (n = 0; n < nphis-1; n++)
+ phis[n]->link = phis[n+1];
+ phis[nphis-1]->link = 0;
+ free(phis);
+ for (pp = &b->phi; (*pp)->link;) {
+ if (phieq(*pp, (*pp)->link)) {
+ killphi(fn, b, pp, (*pp)->link->to);
+ continue;
+ }
+ pp = &(*pp)->link;
+ }
+}
+
+static void
+normins(Fn *fn, Ins *i)
+{
+ uint n;
+ int64_t v;
+ Ref r;
+
+ /* truncate constant bits to 32 bits for "s"/w" uses */
+ for (n = 0; n < 2; n++) {
+ if (!KWIDE(optab[i->op].argcls[n][i->cls]))
+ if (isconbits(fn, i->arg[n], &v))
+ if ((v & 0xffffffff) != v)
+ i->arg[n] = getcon(v & 0xffffffff, fn);
+ }
+ /* order arg[0] <= arg[1] for commutative ops, preferring RTmp in arg[0] */
+ if (optab[i->op].commutes)
+ if (rcmp(i->arg[0], i->arg[1]) > 0) {
+ r = i->arg[1];
+ i->arg[1] = i->arg[0];
+ i->arg[0] = r;
+ }
+}
+
+static void
+killins(Fn *fn, Blk *b, Ins *i, Ref r1, Ref r2)
+{
+ replaceuses(fn, r1, r2);
+ rmuse(fn, b, UIns, i, 0, i->arg[0], 1/*strict*/);
+ if (!req(i->arg[0], i->arg[1]))
+ rmuse(fn, b, UIns, i, 0, i->arg[1], 1/*strict*/);
+ *i = (Ins){.op = Onop};
+}
+
+static void
+dedupi(Fn *fn, Blk *b, Ins *i)
+{
+ Ref r2;
+ Ins *i2;
+
+ if (i->op == Onop || i->op == Odbgloc)
+ return;
+
+ normins(fn, i);
+
+ if (optab[i->op].ispinned)
+ return;
+ assert(rtype(i->to) == RTmp);
+
+ /* effective copy? */
+ r2 = copyref(fn, i);
+ if (!req(r2, R)) {
+ killins(fn, b, i, i->to, r2);
+ return;
+ }
+
+ /* effective constant? */
+ r2 = foldref(fn, i);
+ if (!req(r2, R)) {
+ killins(fn, b, i, i->to, r2);
+ return;
+ }
+
+ /* do not dedup (trapping) ins that GCM will not move */
+ if (isfixed(fn, i))
+ return;
+
+ /* duplicate? */
+ i2 = gvndup(i, 1);
+ if (i2) {
+ killins(fn, b, i, i->to, i2->to);
+ return;
+ }
+}
+
+static void
+dedupins(Fn *fn, Blk *b)
+{
+ Ins *i;
+
+ for (i = b->ins; i < &b->ins[b->nins]; i++)
+ dedupi(fn, b, i);
+}
+
+static void
+dedupjmp(Fn *fn, Blk *b) {
+ Blk *tmp;
+ int64_t v;
+
+ if (b->jmp.type != Jjnz)
+ return;
+ if (b->s1 != b->s2) {
+ if (!isconbits(fn, b->jmp.arg, &v))
+ return;
+ if (v == 0) {
+ tmp = b->s1;
+ b->s1 = b->s2;
+ b->s2 = tmp;
+ }
+ }
+ /* we later move active ins out of dead blks */
+ edgedel(b, &b->s2);
+ b->jmp.type = Jjmp;
+ rmuse(fn, b, UJmp, 0, 0, b->jmp.arg, 1/*strict*/);
+ b->jmp.arg = R;
+}
+
+/* propagate dead blks */
+static int
+deadblk(Fn *fn, Blk *b) {
+ if (!isdead(fn, b))
+ return 0;
+ if (b->s1)
+ edgedel(b, &b->s1);
+ if (b->s2)
+ edgedel(b, &b->s2);
+ b->jmp.type = Jhlt;
+ rmuse(fn, b, UJmp, 0, 0, b->jmp.arg, 1/*strict*/);
+ b->jmp.arg = R;
+ return 1;
+}
+
+/* rebuild rpo pred use */
+/* careful not to lose active ins */
+static void
+rebuildcfg(Fn *fn) {
+ uint n, prevnblk, nins;
+ Blk **prevrpo;
+ Blk *b;
+ Ins *i, *iv;
+
+ prevnblk = fn->nblk;
+ prevrpo = emalloc(prevnblk * sizeof prevrpo[0]);
+ memcpy(prevrpo, fn->rpo, prevnblk * sizeof prevrpo[0]);
+
+ fillrpo(fn);
+
+ iv = 0;
+ for (n=0; n<prevnblk; n++) {
+ b = prevrpo[n];
+ if (b->id == -1u) {
+ /* blk unreachable after GVN */
+ for (i = b->ins; i < &b->ins[b->nins]; i++)
+ if (i->op != Onop)
+ if (!optab[i->op].ispinned)
+ if (gvndup(i, 0) == i) {
+ /* (possibly) active ins - add to start blk */
+ if (!iv) {
+ nins = fn->start->nins;
+ iv = vnew(nins, sizeof iv[0], PHeap);
+ memcpy(iv, fn->start->ins, nins * sizeof iv[0]);
+ }
+ vgrow(&iv, ++nins);
+ iv[nins-1] = *i;
+ }
+ }
+ }
+ if (iv) {
+ idup(&fn->start->ins, iv, nins);
+ fn->start->nins = nins;
+ vfree(iv);
+ }
+ free(prevrpo);
+}
+
+/* https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf */
+/* require rpo pred ssa use */
+/* recreates rpo */
+/* breaks pred use dom ssa (GCM fixes ssa) */
+void
+gvn(Fn *fn)
+{
+ uint n, nins;
+ Blk *b;
+ Phi *p;
+
+ con01[0] = getcon(0, fn);
+ con01[1] = getcon(1, fn);
+
+ nins = 0;
+ for (b=fn->start; b; b=b->link)
+ for (p = b->phi; p; p = p->link)
+ p->visit = 0;
+
+ fillloop(fn);
+ narrowpars(fn);
+ filluse(fn);
+ ssacheck(fn);
+
+ for (b=fn->start; b; b=b->link)
+ nins += b->nins;
+
+ gvntbln = nins + nins/2;
+ gvntbl = emalloc(gvntbln * sizeof gvntbl[0]);
+ lookupn = 0;
+ proben = 0;
+ maxproben = 0;
+
+ /* GVN */
+ for (n=0; n<fn->nblk; n++) {
+ b = fn->rpo[n];
+ if (deadblk(fn, b))
+ continue;
+ dedupphis(fn, b);
+ dedupins(fn, b);
+ dedupjmp(fn, b);
+ }
+
+ rebuildcfg(fn);
+
+ free(gvntbl);
+ gvntbl = 0;
+ gvntbln = 0;
+ lookupn = 0;
+ proben = 0;
+ maxproben = 0;
+
+ if (debug['G']) {
+ fprintf(stderr, "\n> After GVN:\n");
+ printfn(fn, stderr);
+ }
+}
diff --git a/ins.c b/ins.c
new file mode 100644
index 0000000..1e238c7
--- /dev/null
+++ b/ins.c
@@ -0,0 +1,197 @@
+#include "all.h"
+
+typedef struct FromLoc FromLoc;
+struct FromLoc {
+ InsLoc from;
+ uint ton;
+};
+
+static int
+loccmp(InsLoc *la, InsLoc *lb)
+{
+ if (la->bid != lb->bid)
+ return (int)la->bid - (int)lb->bid;
+ return la->insn - lb->insn;
+}
+
+static int
+tocmp(const void *a, const void *b)
+{
+ InsMov *ma, *mb;
+
+ ma = (InsMov*)a;
+ mb = (InsMov*)b;
+ return loccmp(&ma->to, &mb->to);
+}
+
+static int
+fromcmp(const void *a, const void *b)
+{
+ FromLoc *fa, *fb;
+
+ fa = (FromLoc*)a;
+ fb = (FromLoc*)b;
+ return loccmp(&fa->from, &fb->from);
+}
+
+static int
+loceq(InsLoc *a, InsLoc *b)
+{
+ return a->bid == b->bid && a->insn == b->insn;
+}
+
+/* after, mov is sorted by to, and to.insn, from.insn are updated */
+void
+movins(Fn *fn, InsMov *mov, uint nmov, int del)
+{
+ Blk *b, *b2;
+ uint bid, n, fromn, ton, ifromn, iton;
+ InsMov *m;
+ BSet to[1];
+ FromLoc *from;
+ uint *newbnins;
+ Ins **newbins;
+
+ qsort(mov, nmov, sizeof mov[0], tocmp);
+
+ from = emalloc(nmov * sizeof from[0]);
+ for (n = 0; n < nmov; n++) {
+ from[n].from = mov[n].from;
+ from[n].ton = n;
+ }
+
+ qsort(from, nmov, sizeof from[0], fromcmp);
+
+ /* count new ins in each blk */
+ bsinit(to, fn->nblk); /* no free for BSet? */
+ newbnins = emalloc(fn->nblk * sizeof newbnins[0]);
+ for (m = mov; m < &mov[nmov]; m++) {
+ bsset(to, m->to.bid);
+ newbnins[m->to.bid]++;
+ }
+
+ /* allocate new b->ins for blks with new ins */
+ newbins = emalloc(fn->nblk * sizeof newbins[0]);
+ for (bid = 0; bid < fn->nblk; bid++) {
+ assert(bshas(to, bid) == (newbnins[bid] != 0));
+ if (!bshas(to, bid))
+ continue;
+ b = fn->rpo[bid];
+ newbnins[bid] += b->nins;
+ newbins[bid] = alloc(newbnins[bid] * sizeof(Ins));
+ }
+
+ /* populate new b->ins */
+ /* update mov to.insn, from insn */
+ fromn = ton = 0;
+ for (bid = 0; bid < fn->nblk; bid++) {
+ assert(bshas(to, bid) == (newbnins[bid] != 0));
+ if (!bshas(to, bid)) {
+ while (fromn < nmov && from[fromn].from.bid == bid)
+ fromn++;
+ continue;
+ }
+ b = fn->rpo[bid];
+ iton = 0;
+ for (ifromn = 0; ifromn < b->nins; ifromn++) {
+ /* insert new ins, update to */
+ while (ton < nmov && loceq(&mov[ton].to, &(InsLoc){.bid = bid, .insn = ifromn})) {
+ b2 = fn->rpo[mov[ton].from.bid];
+ assert(mov[ton].from.insn < b2->nins);
+ newbins[bid][iton++] = b2->ins[mov[ton].from.insn];
+ mov[ton++].to.insn = iton-1;
+ }
+ /* update from */
+ while (fromn < nmov && loceq(&from[fromn].from, &(InsLoc){.bid = bid, .insn = ifromn}))
+ from[fromn++].from.insn = iton;
+ /* copy original ins */
+ newbins[bid][iton++] = b->ins[ifromn];
+ }
+ /* append new ins, update to */
+ while (ton < nmov && mov[ton].to.bid == bid) {
+ assert(mov[ton].to.insn == b->nins);
+ b2 = fn->rpo[mov[ton].from.bid];
+ assert(mov[ton].from.insn < b2->nins);
+ newbins[bid][iton++] = b2->ins[mov[ton].from.insn];
+ mov[ton++].to.insn = iton-1;
+ }
+ assert(ifromn == b->nins);
+ assert(iton == newbnins[bid]);
+ }
+ assert(fromn == nmov);
+ assert(ton == nmov);
+
+ /* install new b->ins */
+ for (bid = 0; bid < fn->nblk; bid++) {
+ assert(bshas(to, bid) == (newbins[bid] != 0));
+ if (!bshas(to, bid))
+ continue;
+ b = fn->rpo[bid];
+ b->ins = newbins[bid];
+ b->nins = newbnins[bid];
+ }
+
+ /* remove from ins, update mov from insn */
+ for (fromn = 0; fromn < nmov; fromn++) {
+ b = fn->rpo[from[fromn].from.bid];
+ assert(from[fromn].from.insn < b->nins);
+ if (del)
+ b->ins[from[fromn].from.insn] = (Ins){.op = Onop};
+ mov[from[fromn].ton].from.insn = from[fromn].from.insn;
+ }
+
+ free(from);
+ free(newbins);
+ free(newbnins);
+}
+
+static void
+setdef(Fn *fn, Blk *b, Ins *i)
+{
+ assert(b->ins <= i && i < &b->ins[b->nins]);
+ if (rtype(i->to) == RTmp) {
+ assert(fn->tmp[i->to.val].bid == b->id);
+ fn->tmp[i->to.val].def = i;
+ }
+}
+
+/* shuffle b->ins to fixup use-before-def */
+void
+fixub4d(Fn *fn)
+{
+ Blk *b;
+ Ins *i, *i2;
+ Ins icpy;
+ uint n;
+ Tmp *t;
+ int ub4d;
+
+ for (b = fn->start; b; b = b->link)
+ /* O(N^2) but rare in practice */
+ for (i = b->ins; i < &b->ins[b->nins];) {
+ ub4d = 0;
+ for (n = 0; n < 2; n++)
+ if (rtype(i->arg[n]) == RTmp) {
+ t = &fn->tmp[i->arg[n].val];
+ if (t->bid == b->id && t->def)
+ if (i < t->def) {
+ assert(b->ins <= t->def && t->def < &b->ins[b->nins]);
+ ub4d = 1;
+ i2 = t->def;
+ break;
+ }
+ }
+ if (!ub4d)
+ goto Skip;
+ icpy = *i2;
+ for(; i2 > i; --i2) {
+ *i2 = *(i2-1);
+ setdef(fn, b, i2);
+ }
+ *i = icpy;
+ setdef(fn, b, i);
+ continue;
+ Skip:
+ i++;
+ }
+}
diff --git a/main.c b/main.c
index 5ecb4d0..b590363 100644
--- a/main.c
+++ b/main.c
@@ -73,10 +73,19 @@ func(Fn *fn)
fillalias(fn);
coalesce(fn);
filluse(fn);
+ filldom(fn);
ssacheck(fn);
- copy(fn);
+ gvn(fn);
+ fillpreds(fn);
+ filluse(fn);
+ filldom(fn);
+ gcm(fn);
+ fillrpo(fn);
+ fillpreds(fn);
+ filluse(fn);
+ reassoc(fn);
filluse(fn);
- fold(fn);
+ ssacheck(fn);
T.abi1(fn);
simpl(fn);
fillpreds(fn);
diff --git a/ops.h b/ops.h
index beaa6f3..92a299f 100644
--- a/ops.h
+++ b/ops.h
@@ -6,188 +6,185 @@
#define V(Imm)
#endif
-#ifndef P
- #define P(CanFold, HasId, IdVal)
+#ifndef F
+#define F(CanFold, HasId, IdVal, Commutes, Idemp, IsCmpEq, IsCmpLgte, CmpEqVal, IsPinned)
#endif
-
#define T(a,b,c,d,e,f,g,h) { \
{[Kw]=K##a, [Kl]=K##b, [Ks]=K##c, [Kd]=K##d}, \
{[Kw]=K##e, [Kl]=K##f, [Ks]=K##g, [Kd]=K##h} \
}
-
/*********************/
/* PUBLIC OPERATIONS */
/*********************/
/* Arithmetic and Bits */
-O(add, T(w,l,s,d, w,l,s,d), P(1,1,0)) X(2,1,0) V(1)
-O(sub, T(w,l,s,d, w,l,s,d), P(1,1,0)) X(2,1,0) V(0)
-O(neg, T(w,l,s,d, x,x,x,x), P(1,0,0)) X(1,1,0) V(0)
-O(div, T(w,l,s,d, w,l,s,d), P(1,1,1)) X(0,0,0) V(0)
-O(rem, T(w,l,e,e, w,l,e,e), P(1,0,0)) X(0,0,0) V(0)
-O(udiv, T(w,l,e,e, w,l,e,e), P(1,1,1)) X(0,0,0) V(0)
-O(urem, T(w,l,e,e, w,l,e,e), P(1,0,0)) X(0,0,0) V(0)
-O(mul, T(w,l,s,d, w,l,s,d), P(1,1,1)) X(2,0,0) V(0)
-O(and, T(w,l,e,e, w,l,e,e), P(1,0,0)) X(2,1,0) V(1)
-O(or, T(w,l,e,e, w,l,e,e), P(1,1,0)) X(2,1,0) V(1)
-O(xor, T(w,l,e,e, w,l,e,e), P(1,1,0)) X(2,1,0) V(1)
-O(sar, T(w,l,e,e, w,w,e,e), P(1,1,0)) X(1,1,0) V(1)
-O(shr, T(w,l,e,e, w,w,e,e), P(1,1,0)) X(1,1,0) V(1)
-O(shl, T(w,l,e,e, w,w,e,e), P(1,1,0)) X(1,1,0) V(1)
+O(add, T(w,l,s,d, w,l,s,d), F(1,1,0,1,0,0,0,0,0)) X(2,1,0) V(1)
+O(sub, T(w,l,s,d, w,l,s,d), F(1,1,0,0,0,0,0,0,0)) X(2,1,0) V(0)
+O(neg, T(w,l,s,d, x,x,x,x), F(1,0,0,0,0,0,0,0,0)) X(1,1,0) V(0)
+O(div, T(w,l,s,d, w,l,s,d), F(1,1,1,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(rem, T(w,l,e,e, w,l,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(udiv, T(w,l,e,e, w,l,e,e), F(1,1,1,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(urem, T(w,l,e,e, w,l,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(mul, T(w,l,s,d, w,l,s,d), F(1,1,1,1,0,0,0,0,0)) X(2,0,0) V(0)
+O(and, T(w,l,e,e, w,l,e,e), F(1,0,0,1,1,0,0,0,0)) X(2,1,0) V(1)
+O(or, T(w,l,e,e, w,l,e,e), F(1,1,0,1,1,0,0,0,0)) X(2,1,0) V(1)
+O(xor, T(w,l,e,e, w,l,e,e), F(1,1,0,1,0,0,0,0,0)) X(2,1,0) V(1)
+O(sar, T(w,l,e,e, w,w,e,e), F(1,1,0,0,0,0,0,0,0)) X(1,1,0) V(1)
+O(shr, T(w,l,e,e, w,w,e,e), F(1,1,0,0,0,0,0,0,0)) X(1,1,0) V(1)
+O(shl, T(w,l,e,e, w,w,e,e), F(1,1,0,0,0,0,0,0,0)) X(1,1,0) V(1)
/* Comparisons */
-O(ceqw, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cnew, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(csgew, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(csgtw, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cslew, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(csltw, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(1)
-O(cugew, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cugtw, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(culew, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cultw, T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(1)
-
-O(ceql, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cnel, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(csgel, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(csgtl, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cslel, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(csltl, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(1)
-O(cugel, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cugtl, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(culel, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cultl, T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(1)
-
-O(ceqs, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cges, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cgts, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cles, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(clts, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cnes, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cos, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cuos, T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-
-O(ceqd, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cged, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cgtd, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cled, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cltd, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cned, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cod, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cuod, T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
+O(ceqw, T(w,w,e,e, w,w,e,e), F(1,1,1,1,0,1,0,1,0)) X(0,1,0) V(0)
+O(cnew, T(w,w,e,e, w,w,e,e), F(1,1,0,1,0,1,0,0,0)) X(0,1,0) V(0)
+O(csgew, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(csgtw, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(0)
+O(cslew, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(csltw, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(1)
+O(cugew, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(cugtw, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(0)
+O(culew, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(cultw, T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(1)
+
+O(ceql, T(l,l,e,e, l,l,e,e), F(1,0,0,1,0,1,0,1,0)) X(0,1,0) V(0)
+O(cnel, T(l,l,e,e, l,l,e,e), F(1,0,0,1,0,1,0,0,0)) X(0,1,0) V(0)
+O(csgel, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(csgtl, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(0)
+O(cslel, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(csltl, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(1)
+O(cugel, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(cugtl, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(0)
+O(culel, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(cultl, T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,1,0,0)) X(0,1,0) V(1)
+
+O(ceqs, T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
+O(cges, T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cgts, T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cles, T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(clts, T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cnes, T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
+O(cos, T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
+O(cuos, T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
+
+O(ceqd, T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
+O(cged, T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cgtd, T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cled, T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cltd, T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cned, T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
+O(cod, T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
+O(cuod, T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0)) X(0,1,0) V(0)
/* Memory */
-O(storeb, T(w,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(storeh, T(w,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(storew, T(w,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(storel, T(l,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(stores, T(s,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(stored, T(d,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
-
-O(loadsb, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(loadub, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(loadsh, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(loaduh, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(loadsw, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(loaduw, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(load, T(m,m,m,m, x,x,x,x), P(0,0,0)) X(0,0,1) V(0)
+O(storeb, T(w,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(storeh, T(w,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(storew, T(w,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(storel, T(l,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(stores, T(s,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(stored, T(d,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+
+O(loadsb, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(loadub, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(loadsh, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(loaduh, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(loadsw, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(loaduw, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(load, T(m,m,m,m, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
/* Extensions and Truncations */
-O(extsb, T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(extub, T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(extsh, T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(extuh, T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(extsw, T(e,w,e,e, e,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(extuw, T(e,w,e,e, e,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-
-O(exts, T(e,e,e,s, e,e,e,x), P(1,0,0)) X(0,0,1) V(0)
-O(truncd, T(e,e,d,e, e,e,x,e), P(1,0,0)) X(0,0,1) V(0)
-O(stosi, T(s,s,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(stoui, T(s,s,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(dtosi, T(d,d,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(dtoui, T(d,d,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(swtof, T(e,e,w,w, e,e,x,x), P(1,0,0)) X(0,0,1) V(0)
-O(uwtof, T(e,e,w,w, e,e,x,x), P(1,0,0)) X(0,0,1) V(0)
-O(sltof, T(e,e,l,l, e,e,x,x), P(1,0,0)) X(0,0,1) V(0)
-O(ultof, T(e,e,l,l, e,e,x,x), P(1,0,0)) X(0,0,1) V(0)
-O(cast, T(s,d,w,l, x,x,x,x), P(1,0,0)) X(0,0,1) V(0)
+O(extsb, T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(extub, T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(extsh, T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(extuh, T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(extsw, T(e,w,e,e, e,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(extuw, T(e,w,e,e, e,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+
+O(exts, T(e,e,e,s, e,e,e,x), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(truncd, T(e,e,d,e, e,e,x,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(stosi, T(s,s,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(stoui, T(s,s,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(dtosi, T(d,d,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(dtoui, T(d,d,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(swtof, T(e,e,w,w, e,e,x,x), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(uwtof, T(e,e,w,w, e,e,x,x), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(sltof, T(e,e,l,l, e,e,x,x), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(ultof, T(e,e,l,l, e,e,x,x), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(cast, T(s,d,w,l, x,x,x,x), F(1,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
/* Stack Allocation */
-O(alloc4, T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(alloc8, T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(alloc16, T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
+O(alloc4, T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(alloc8, T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(alloc16, T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
/* Variadic Function Helpers */
-O(vaarg, T(m,m,m,m, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(vastart, T(m,e,e,e, x,e,e,e), P(0,0,0)) X(0,0,0) V(0)
+O(vaarg, T(m,m,m,m, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(vastart, T(m,e,e,e, x,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
-O(copy, T(w,l,s,d, x,x,x,x), P(0,0,0)) X(0,0,1) V(0)
+O(copy, T(w,l,s,d, x,x,x,x), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
/* Debug */
-O(dbgloc, T(w,e,e,e, w,e,e,e), P(0,0,0)) X(0,0,1) V(0)
+O(dbgloc, T(w,e,e,e, w,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
/****************************************/
/* INTERNAL OPERATIONS (keep nop first) */
/****************************************/
/* Miscellaneous and Architecture-Specific Operations */
-O(nop, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,1) V(0)
-O(addr, T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(blit0, T(m,e,e,e, m,e,e,e), P(0,0,0)) X(0,1,0) V(0)
-O(blit1, T(w,e,e,e, x,e,e,e), P(0,0,0)) X(0,1,0) V(0)
-O(swap, T(w,l,s,d, w,l,s,d), P(0,0,0)) X(1,0,0) V(0)
-O(sign, T(w,l,e,e, x,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(salloc, T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(xidiv, T(w,l,e,e, x,x,e,e), P(0,0,0)) X(1,0,0) V(0)
-O(xdiv, T(w,l,e,e, x,x,e,e), P(0,0,0)) X(1,0,0) V(0)
-O(xcmp, T(w,l,s,d, w,l,s,d), P(0,0,0)) X(1,1,0) V(0)
-O(xtest, T(w,l,e,e, w,l,e,e), P(0,0,0)) X(1,1,0) V(0)
-O(acmp, T(w,l,e,e, w,l,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(acmn, T(w,l,e,e, w,l,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(afcmp, T(e,e,s,d, e,e,s,d), P(0,0,0)) X(0,0,0) V(0)
-O(reqz, T(w,l,e,e, x,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(rnez, T(w,l,e,e, x,x,e,e), P(0,0,0)) X(0,0,0) V(0)
+O(nop, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(addr, T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(blit0, T(m,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,1,0) V(0)
+O(blit1, T(w,e,e,e, x,e,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,1,0) V(0)
+O(swap, T(w,l,s,d, w,l,s,d), F(0,0,0,0,0,0,0,0,0)) X(1,0,0) V(0)
+O(sign, T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(salloc, T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(xidiv, T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(1,0,0) V(0)
+O(xdiv, T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(1,0,0) V(0)
+O(xcmp, T(w,l,s,d, w,l,s,d), F(0,0,0,0,0,0,0,0,0)) X(1,1,0) V(0)
+O(xtest, T(w,l,e,e, w,l,e,e), F(0,0,0,0,0,0,0,0,0)) X(1,1,0) V(0)
+O(acmp, T(w,l,e,e, w,l,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(acmn, T(w,l,e,e, w,l,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(afcmp, T(e,e,s,d, e,e,s,d), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(reqz, T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(rnez, T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
/* Arguments, Parameters, and Calls */
-O(par, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(parsb, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(parub, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(parsh, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(paruh, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(parc, T(e,x,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(pare, T(e,x,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(arg, T(w,l,s,d, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(argsb, T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(argub, T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(argsh, T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(arguh, T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(argc, T(e,x,e,e, e,l,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(arge, T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(argv, T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(call, T(m,m,m,m, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
+O(par, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(parsb, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(parub, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(parsh, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(paruh, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(parc, T(e,x,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(pare, T(e,x,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(arg, T(w,l,s,d, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(argsb, T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(argub, T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(argsh, T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(arguh, T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(argc, T(e,x,e,e, e,l,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(arge, T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(argv, T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(call, T(m,m,m,m, x,x,x,x), F(0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
/* Flags Setting */
-O(flagieq, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagine, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagisge, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagisgt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagisle, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagislt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagiuge, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagiugt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagiule, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagiult, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfeq, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfge, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfgt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfle, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagflt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfne, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfo, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfuo, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-
+O(flagieq, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagine, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagisge, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagisgt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagisle, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagislt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagiuge, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagiugt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagiule, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagiult, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfeq, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfge, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfgt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfle, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagflt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfne, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfo, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfuo, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
#undef T
#undef X
diff --git a/parse.c b/parse.c
index a745779..5c6ea92 100644
--- a/parse.c
+++ b/parse.c
@@ -15,11 +15,17 @@ enum {
};
Op optab[NOp] = {
-#undef P
-#define P(cf, hi, id) .canfold = cf, .hasid = hi, .idval = id
-#define O(op, t, p) [O##op]={.name = #op, .argcls = t, p},
+#undef F
+#define F(cf, hi, id, co, im, ic, lg, cv, pn) \
+ .canfold = cf, \
+ .hasid = hi, .idval = id, \
+ .commutes = co, \
+ .idemp = im, \
+ .iscmpeq = ic, .iscmplgte = lg, .eqval = cv, \
+ .ispinned = pn
+#define O(op, k, flags) [O##op]={.name = #op, .argcls = k, flags},
#include "ops.h"
-#undef P
+#undef F
};
typedef enum {
diff --git a/reassoc.c b/reassoc.c
new file mode 100644
index 0000000..4dac9a9
--- /dev/null
+++ b/reassoc.c
@@ -0,0 +1,225 @@
+#include "all.h"
+
+typedef int testfn(Fn *fn, Use *u, Ref r);
+
+static int
+alluses(Fn *fn, Ref r, testfn *test)
+{
+ Tmp *t;
+ Use *u;
+
+ assert(rtype(r) == RTmp);
+ t = &fn->tmp[r.val];
+ for (u = t->use; u < &t->use[t->nuse]; u++)
+ if (!test(fn, u, r))
+ return 0;
+ return 1;
+}
+
+static int
+singleuse(Fn *fn, Ref r)
+{
+ assert(rtype(r) == RTmp);
+ return fn->tmp[r.val].nuse == 1;
+}
+
+static int
+ldstuse(Fn *fn __attribute__((unused)), Use *u, Ref r)
+{
+ if (u->type == UIns) {
+ if (isload(u->u.ins->op)) {
+ assert(req(r, u->u.ins->arg[0]));
+ return 1;
+ }
+ if (isstore(u->u.ins->op))
+ if (req(r, u->u.ins->arg[1]))
+ return 1;
+ if (u->u.ins->op == Oblit0) {
+ assert(req(r, u->u.ins->arg[0]) || req(r, u->u.ins->arg[1]));
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int
+iskladdcon(Fn *fn, Ins *i)
+{
+ int64_t v;
+
+ if (i->op == Oadd)
+ if (i->cls == Kl)
+ if (rtype(i->arg[0]) == RTmp)
+ if (isconbits(fn, i->arg[1], &v))
+ return 1;
+ return 0;
+}
+
+static int
+inplaceldstaddr(Fn *fn, Blk *b, Ins *i)
+{
+ Ins *i2;
+
+ assert(b->ins <= i && i < &b->ins[b->nins]);
+ if (singleuse(fn, i->to))
+ if (i < &b->ins[b->nins-1]) {
+ i2 = i+1;
+ if (isload(i2->op))
+ if (req(i->to, i2->arg[0]))
+ return 1;
+ if (isstore(i2->op))
+ if (req(i->to, i2->arg[1]))
+ return 1;
+ // TODO blit0
+ }
+ return 0;
+}
+
+static int
+isldstaddr(Fn *fn, Blk *b, Ins *i)
+{
+ if (iskladdcon(fn, i))
+ if (alluses(fn, i->to, ldstuse))
+ if (!inplaceldstaddr(fn, b, i))
+ return 1;
+ return 0;
+}
+
+static int
+jnzuse(Fn *fn, Use *u, Ref r)
+{
+ if (u->type == UJmp)
+ if (fn->rpo[u->bid]->jmp.type == Jjnz) {
+ assert(req(r, fn->rpo[u->bid]->jmp.arg));
+ return 1;
+ }
+ return 0;
+}
+
+static int
+iscmp4jnz(Fn *fn, Blk *b, Ins *i)
+{
+ int x;
+
+ if (iscmp(i->op, &x, &x))
+ if (alluses(fn, i->to, jnzuse))
+ if (!(singleuse(fn, i->to) && i-b->ins == b->nins-1))
+ return 1;
+
+ return 0;
+}
+
+static int
+canreassoc(Fn *fn, Blk *b, Ins *i)
+{
+ assert(b->ins <= i && i < &b->ins[b->nins]);
+ return isldstaddr(fn, b, i) || iscmp4jnz(fn, b, i);
+}
+
+static void
+ireassoc(Fn *fn, Blk *b, Ins *i, uint *pnim, InsMov **pim)
+{
+ Blk *b2;
+ Tmp *t;
+ Use *u;
+
+ assert(b->ins <= i && i < &b->ins[b->nins]);
+ if (!canreassoc(fn, b, i))
+ return;
+
+ assert(rtype(i->to) == RTmp);
+ t = &fn->tmp[i->to.val];
+ for (u = t->use; u < &t->use[t->nuse]; u++) {
+ assert(u->type == UJmp || u->type == UIns);
+ b2 = fn->rpo[u->bid];
+ vgrow(pim, ++(*pnim));
+ (*pim)[(*pnim)-1] = (InsMov){
+ .from = {.bid = b->id, .insn = i-b->ins},
+ .to = {
+ .bid = u->bid,
+ .insn = u->type == UJmp ? b2->nins : u->u.ins-b2->ins
+ }};
+ }
+}
+
+static void
+trealloc(Fn *fn, Blk *b, Ins *i)
+{
+ Ins *i2;
+ Ref r;
+ int x;
+
+ r = newtmp("rea", i->cls, fn);
+ if (iskladdcon(fn, i)) {
+ assert(i < &b->ins[b->nins-1]);
+ i2 = i+1;
+ if (isload(i2->op)) {
+ assert(req(i->to, i2->arg[0]));
+ i2->arg[0] = r;
+ } else if (isstore(i2->op)) {
+ assert(req(i->to, i2->arg[1]));
+ i2->arg[1] = r;
+ } else {
+ if (i2->op != Oblit0) {
+ assert(i < &b->ins[b->nins-2]);
+ i2 = i+2;
+ }
+ assert(i2->op == Oblit0);
+ assert(req(i->to, i2->arg[0]) || req(i->to, i2->arg[1]));
+ if (req(i->to, i2->arg[0]))
+ i2->arg[0] = r;
+ if (req(i->to, i2->arg[1]))
+ i2->arg[1] = r;
+ }
+ i->to = r;
+ } else if (iscmp(i->op, &x, &x)) {
+ assert(b->jmp.type == Jjnz);
+ assert(req(i->to, b->jmp.arg));
+ i->to = r;
+ b->jmp.arg = r;
+ } else
+ die("unreachable");
+}
+
+/* Redistribute trivial ops to point of use. */
+/* Reduces register pressure. */
+/* needs rpo, use; breaks use */
+void
+reassoc(Fn *fn)
+{
+ Blk *b;
+ Ins *i;
+ InsMov *im;
+ uint n, nim;
+
+ nim = 0;
+ im = vnew(nim, sizeof im[0], PHeap);
+
+ /* identify trivial ins */
+ for (b = fn->start; b; b = b->link) {
+ for (i = b->ins; i < &b->ins[b->nins]; i++)
+ ireassoc(fn, b, i, &nim, &im);
+ }
+
+ /* duplicate trivial ins */
+ movins(fn, im, nim, 0/*!del*/);
+
+ /* create new tmps for dups */
+ for (n = 0; n < nim; n++) {
+ b = fn->rpo[im[n].to.bid];
+ i = &b->ins[im[n].to.insn];
+ trealloc(fn, b, i);
+ }
+
+ /* delete original ins */
+ for (n = 0; n < nim; n++) {
+ b = fn->rpo[im[n].from.bid];
+ i = &b->ins[im[n].from.insn];
+ *i = (Ins){.op = Onop};
+ }
+
+ if (debug['G']) {
+ fprintf(stderr, "\n> After Reassociation:\n");
+ printfn(fn, stderr);
+ }
+}
diff --git a/ssa.c b/ssa.c
index 929301e..4e3d2a5 100644
--- a/ssa.c
+++ b/ssa.c
@@ -1,7 +1,7 @@
#include "all.h"
#include <stdarg.h>
-static void
+void
adduse(Tmp *tmp, int ty, Blk *b, ...)
{
Use *u;
diff --git a/test/_gcm1.ssa b/test/_gcm1.ssa
new file mode 100644
index 0000000..719cddb
--- /dev/null
+++ b/test/_gcm1.ssa
@@ -0,0 +1,48 @@
+export
+function w $ifmv(w %p1, w %p2, w %p3) {
+@start
+@entry
+ %rt =w add %p2, %p3 # gcm moves to @true
+ %rf =w sub %p2, %p3 # gcm moves to @false
+ jnz %p1, @true, @false
+@true
+ %r =w copy %rt
+ jmp @exit
+@false
+ %r =w copy %rf
+ jmp @exit
+@exit
+ ret %r
+}
+
+export
+function w $hoist1(w %p1, w %p2, w %p3) {
+@start
+@entry
+ %n =w copy 0
+ %i =w copy %p1
+@loop
+ %base =w add %p2, %p3 # gcm moves to @exit
+ %i =w sub %i, 1
+ %n =w add %n, 1
+ jnz %i, @loop, @exit
+@exit
+ %r =w add %base, %n
+ ret %r
+}
+
+export
+function w $hoist2(w %p1, w %p2, w %p3) {
+@start
+@entry
+ %n =w copy 0
+ %i =w copy %p1
+@loop
+ %base =w add %p2, %p3 # gcm moves to @entry
+ %i =w sub %i, 1
+ %n =w add %n, %base
+ jnz %i, @loop, @exit
+@exit
+ %r =w add %base, %n
+ ret %r
+}
diff --git a/test/_gcm2.ssa b/test/_gcm2.ssa
new file mode 100644
index 0000000..baeb4a4
--- /dev/null
+++ b/test/_gcm2.ssa
@@ -0,0 +1,43 @@
+# Programs from "Global Code Motion Global Value Numbering" by Cliff Click
+# https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf
+
+# GCM program in Figure 1
+
+function w $gcm_test(w %a){
+@start
+ %i.0 =w copy 0
+@loop
+ %i.1 =w phi @start %i.0, @loop %i.2
+ %b =w add %a, 1 # early schedule moves to @start
+ %i.2 =w add %i.1, %b
+ %c =w mul %i.2, 2 # late schedule moves to @end
+ %x =w csltw %i.2, 10
+ jnz %x, @loop, @end
+@end
+ ret %c
+}
+
+# GCM program in "Figure 3 x's definition does not dominate it's use"
+#
+# SSA contruction will insert phi instruction for "x" in @if_false
+# preventing the "add" in @if_false from being moved to @if_true
+
+function $gcm_test2 (w %a){
+@start
+ %f =w copy 1
+ %x =w copy 0
+ %s.0 =w copy 0
+@loop
+ %s.1 = w phi @start %s.0, @if_false %s.2
+ jnz %a, @if, @end
+@if
+ jnz %f, @if_true, @if_false
+@if_true
+ %f =w copy 0
+ %x =w add %x, 1
+@if_false
+ %s.2 =w add %s.1, %x
+ jmp @loop
+@end
+ ret
+}
diff --git a/test/_load-elim.ssa b/test/_load-elim.ssa
new file mode 100644
index 0000000..faae478
--- /dev/null
+++ b/test/_load-elim.ssa
@@ -0,0 +1,17 @@
+# GCM can eliminate unused add/load instructions
+
+export
+function w $f(l %p, w %c) {
+@start
+ jnz %c, @true, @false
+@true
+ %p1 =l add %p, 4
+ %v1 =w loaduw %p1
+ jmp @end
+@false
+ %p2 =l add %p, 4
+ %v2 =w loaduw %p2
+ jmp @end
+@end
+ ret 0
+}
diff --git a/test/gvn-live-dead.ssa b/test/gvn-live-dead.ssa
new file mode 100644
index 0000000..d47f05b
--- /dev/null
+++ b/test/gvn-live-dead.ssa
@@ -0,0 +1,19 @@
+export
+function w $test(w %p1, w %p2) {
+@start
+@entry
+ %t1 =w copy 1
+ jnz %t1, @live, @dead1
+@live
+ %t2 =w add %p1, %p2
+ ret %t2
+@dead1
+ %t2 =w add %p1, %p2 # live ins in dead blk
+@dead2
+ jnz %t1, @live, @dead1
+}
+
+# >>> driver
+# extern int test(int p1, int p2);
+# int main() { return test(1, 2) != 3; }
+# <<<
diff --git a/util.c b/util.c
index 2e4b4cc..d341720 100644
--- a/util.c
+++ b/util.c
@@ -417,6 +417,27 @@ addcon(Con *c0, Con *c1, int m)
return 1;
}
+int
+isconbits(Fn *fn, Ref r, int64_t *v)
+{
+ Con *c;
+
+ if (rtype(r) == RCon) {
+ c = &fn->con[r.val];
+ if (c->type == CBits) {
+ *v = c->bits.i;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int
+istmpconbits(Fn *fn, Ins *i, int64_t *v)
+{
+ return rtype(i->arg[0]) == RTmp && isconbits(fn, i->arg[1], v);
+}
+
void
salloc(Ref rt, Ref rs, Fn *fn)
{
--
2.34.1