Authentication-Results: mail-b.sr.ht; dkim=pass header.d=c9x.me header.i=@c9x.me; dkim=pass header.d=messagingengine.com header.i=@messagingengine.com Received: from out2-smtp.messagingengine.com (out2-smtp.messagingengine.com [66.111.4.26]) by mail-b.sr.ht (Postfix) with ESMTPS id BDAE111EE06 for <~mpu/qbe@lists.sr.ht>; Sun, 20 Nov 2022 21:28:36 +0000 (UTC) Received: from compute2.internal (compute2.nyi.internal [10.202.2.46]) by mailout.nyi.internal (Postfix) with ESMTP id 7B07E5C00C7; Sun, 20 Nov 2022 16:28:36 -0500 (EST) Received: from mailfrontend1 ([10.202.2.162]) by compute2.internal (MEProxy); Sun, 20 Nov 2022 16:28:36 -0500 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=c9x.me; h=cc:cc :content-transfer-encoding:date:date:from:from:in-reply-to :message-id:mime-version:reply-to:sender:subject:subject:to:to; s=fm2; t=1668979716; x=1669066116; bh=vXbg/oOMW2v63j8WwLXd7ojmk z5tGYxBJli67SfzJY0=; b=Z7DoC2pYS4BEELhauuvNepzBjXj2YZD3QPb/watwC rLop/jyLxLLDLUoGFeTc4CxiAMTEuROoaoeMkyILAKmkQHyhmc0JxhSLs5QCNaSB GoZmoIRcBQuCaxI+pkFqkxvRSh5ViYiuMaSkGpfYcNbnA2+Z+HCV6PQYgwRleIzo 9VVFXfb/t9h0P5W+xlBUSdkuLKAuuDdoLDcIavLx1rLcQgRKeJpM55aus9BoCq9D HDOKGVeki5r0qZ8fvio0X/Yg4vNAaV76O+cxUEuvLy0HKWjjVQRpgLtu3wasHWIy MR3YS4MKt61Vm2zwqdsb4qXtsAl8jtZ2iV3aYWbbx4F9A== DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d= messagingengine.com; h=cc:cc:content-transfer-encoding:date:date :feedback-id:feedback-id:from:from:in-reply-to:message-id :mime-version:reply-to:sender:subject:subject:to:to:x-me-proxy :x-me-proxy:x-me-sender:x-me-sender:x-sasl-enc; s=fm1; t= 1668979716; x=1669066116; bh=vXbg/oOMW2v63j8WwLXd7ojmkz5tGYxBJli 67SfzJY0=; b=H274kmaGA658V/AhxJDcvioNU3MFnGKTSoJktZWGcZZI96KJZaA rAMzBVGJkHo1lyvf/gLLMvMnPDqUKrdMGCFgpyiC4YMmseSfT91wmoWbNZ4r/Y+Z C5XS/EDK4hYk8aGXq1V7ml44xw1Z1x3cODOSjVSruGXkBf7cIfvGv+911NMrtr6+ 6oMS79be0b1ZT5sr9F652phxdTaSfJc53Z7wWpIan6rZEnuKEBm7AjktCBxzSp+R soE3VLRwbBHVK5J49nJ4sNNoCfDEogjG1JOgDXqUHAKsmNV9EVZ5nlMqaiFR6sd+ el618irhk8d/BMr/TbMjks7hETalhA+USRg== X-ME-Sender: X-ME-Received: X-ME-Proxy-Cause: gggruggvucftvghtrhhoucdtuddrgedvgedrheeggdduvdejucetufdoteggodetrfdotf fvucfrrhhofhhilhgvmecuhfgrshhtofgrihhlpdfqfgfvpdfurfetoffkrfgpnffqhgen uceurghilhhouhhtmecufedttdenucenucfjughrpefhvfevufffkffoggfgsedtkeertd ertddtnecuhfhrohhmpefsuhgvnhhtihhnucevrghrsghonhhnvggruhiguceoqhhuvghn thhinhestgelgidrmhgvqeenucggtffrrghtthgvrhhnpedvvdfgudeugffggfetudefte ffuedvffevudeliefhteehleduhfdvuefgkeejhfenucevlhhushhtvghrufhiiigvpedt necurfgrrhgrmhepmhgrihhlfhhrohhmpehquhgvnhhtihhnsegtleigrdhmvg X-ME-Proxy: Feedback-ID: i52e14683:Fastmail Received: by mail.messagingengine.com (Postfix) with ESMTPA; Sun, 20 Nov 2022 16:28:36 -0500 (EST) Received: by nuc.c9x.me (Postfix, from userid 1000) id 9AB97400F9; Sun, 20 Nov 2022 22:28:33 +0100 (CET) From: Quentin Carbonneaux To: ~mpu/qbe@lists.sr.ht Cc: quentin@c9x.me Subject: [PATCH] new slot coalescing pass Date: Sun, 20 Nov 2022 22:28:20 +0100 Message-Id: <20221120212820.2043203-1-quentin@c9x.me> X-Mailer: git-send-email 2.38.1 MIME-Version: 1.0 Content-Transfer-Encoding: 8bit This pass limits stack usage when many small aggregates are allocated on the stack. A fast liveness analysis figures out which slots interfere and the pass then fuses slots that do not interfere. The pass also kills stack slots that are only ever assigned. On the hare stdlib test suite, this fusion pass managed to reduce the total eligible slot bytes count by 84%. The slots considered for fusion must not escape and not exceed 64 bytes in size. --- I'm going to push this patch on master but I'm sharing it on the mailing list so that people get a chance to look at it. If you have remarks or spot bugs & oddities, please let me know! Thanks. all.h | 3 +- main.c | 5 +- mem.c | 303 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 306 insertions(+), 5 deletions(-) diff --git a/all.h b/all.h index 8dd4276..92c02ee 100644 --- a/all.h +++ b/all.h @@ -514,7 +514,8 @@ void fillloop(Fn *); void simpljmp(Fn *); /* mem.c */ -void memopt(Fn *); +void promote(Fn *); +void coalesce(Fn *); /* alias.c */ void fillalias(Fn *); diff --git a/main.c b/main.c index 8959cd9..3fcfd7f 100644 --- a/main.c +++ b/main.c @@ -62,7 +62,7 @@ func(Fn *fn) fillrpo(fn); fillpreds(fn); filluse(fn); - memopt(fn); + promote(fn); filluse(fn); ssa(fn); filluse(fn); @@ -70,6 +70,9 @@ func(Fn *fn) fillalias(fn); loadopt(fn); filluse(fn); + fillalias(fn); + coalesce(fn); + filluse(fn); ssacheck(fn); copy(fn); filluse(fn); diff --git a/mem.c b/mem.c index f20f378..6cbb801 100644 --- a/mem.c +++ b/mem.c @@ -1,8 +1,11 @@ #include "all.h" +typedef struct Range Range; +typedef struct Slot Slot; + /* require use, maintains use counts */ void -memopt(Fn *fn) +promote(Fn *fn) { Blk *b; Ins *i, *l; @@ -22,7 +25,7 @@ memopt(Fn *fn) goto Skip; k = -1; s = -1; - for (u=t->use; u != &t->use[t->nuse]; u++) { + for (u=t->use; u<&t->use[t->nuse]; u++) { if (u->type != UIns) goto Skip; l = u->u.ins; @@ -82,7 +85,301 @@ memopt(Fn *fn) Skip:; } if (debug['M']) { - fprintf(stderr, "\n> After memory optimization:\n"); + fprintf(stderr, "\n> After slot promotion:\n"); printfn(fn, stderr); } } + +/* [a, b) with 0 <= a */ +struct Range { + int a, b; +}; + +struct Slot { + int t; + int sz; + bits m; + bits l; + Range r; + Slot *s; +}; + +static inline int +rin(Range r, int n) +{ + return r.a <= n && n < r.b; +} + +static inline int +rovlap(Range r0, Range r1) +{ + return r0.b && r1.b && r0.a < r1.b && r1.a < r0.b; +} + +static void +radd(Range *r, int n) +{ + if (!r->b) + *r = (Range){n, n+1}; + else if (n < r->a) + r->a = n; + else if (n >= r->b) + r->b = n+1; +} + +static int +slot(Slot **ps, int64_t *off, Ref r, Fn *fn, Slot *sl) +{ + Alias a; + Tmp *t; + + getalias(&a, r, fn); + if (a.type != ALoc) + return 0; + t = &fn->tmp[a.base]; + if (t->visit < 0) + return 0; + *off = a.offset; + *ps = &sl[t->visit]; + return 1; +} + +static void +memr(Ref r, bits x, int ip, Fn *fn, Slot *sl) +{ + int64_t off; + Slot *s; + + if (slot(&s, &off, r, fn, sl)) { + s->l |= x << off; + s->l &= s->m; + if (s->l) + radd(&s->r, ip); + } +} + +static void +memw(Ref r, bits x, int ip, Fn *fn, Slot *sl) +{ + int64_t off; + Slot *s; + + if (slot(&s, &off, r, fn, sl)) { + if (s->l) + radd(&s->r, ip); + s->l &= ~(x << off); + } +} + +static int +scmp(const void *pa, const void *pb) +{ + Slot *a, *b; + + a = (Slot *)pa, b = (Slot *)pb; + if (a->sz != b->sz) + return b->sz - a->sz; + return a->r.a - b->r.a; +} + +static void +maxrpo(Blk *hd, Blk *b) +{ + if (hd->loop < (int)b->id) + hd->loop = b->id; +} + +void +coalesce(Fn *fn) +{ + Range r, *br; + Slot *s, *s0, *sl; + Blk *b, **ps, *succ[3]; + Ins *i; + Use *u; + Tmp *t; + Ref *arg; + bits x; + int n, m, nsl, ip, *kill; + uint total, freed, fused; + + /* minimize the stack usage + * by coalescing slots + */ + nsl = 0; + sl = vnew(0, sizeof sl[0], PHeap); + for (n=Tmp0; nntmp; n++) { + t = &fn->tmp[n]; + t->visit = -1; + if (t->alias.type == ALoc) + if (t->alias.slot == &t->alias) + if (t->alias.u.loc.sz != -1) { + t->visit = nsl; + vgrow(&sl, ++nsl); + s = &sl[nsl-1]; + s->t = n; + s->sz = t->alias.u.loc.sz; + s->m = t->alias.u.loc.m; + s->s = 0; + } + } + + /* one-pass liveness analysis */ + for (b=fn->start; b; b=b->link) + b->loop = -1; + loopiter(fn, maxrpo); + br = vnew(fn->nblk, sizeof br[0], PHeap); + ip = INT_MAX - 1; + for (n=fn->nblk-1; n>=0; n--) { + b = fn->rpo[n]; + succ[0] = b->s1; + succ[1] = b->s2; + succ[2] = 0; + br[n].b = ip--; + for (s=sl; s<&sl[nsl]; s++) { + s->l = 0; + for (ps=succ; *ps; ps++) { + m = (*ps)->id; + if (m > n && rin(s->r, br[m].a)) { + s->l = s->m; + radd(&s->r, ip); + } + } + } + for (i=&b->ins[b->nins]; i!=b->ins;) { + arg = (--i)->arg; + if (i->op == Oargc) { + memr(arg[1], -1, --ip, fn, sl); + } + if (isload(i->op)) { + x = BIT(loadsz(i)) - 1; + memr(arg[0], x, --ip, fn, sl); + } + if (isstore(i->op)) { + x = BIT(storesz(i)) - 1; + memw(arg[1], x, ip--, fn, sl); + } + } + for (s=sl; s<&sl[nsl]; s++) + if (s->l) { + radd(&s->r, ip); + if (b->loop != -1) { + assert(b->loop > n); + radd(&s->r, br[b->loop].b - 1); + } + } + br[n].a = ip; + } + vfree(br); + + /* kill slots with an empty live range */ + total = 0; + freed = 0; + kill = vnew(0, sizeof kill[0], PHeap); + n = 0; + for (s=s0=sl; s<&sl[nsl]; s++) { + total += s->sz; + if (!s->r.b) { + vgrow(&kill, ++n); + kill[n-1] = s->t; + freed += s->sz; + } else + *s0++ = *s; + } + nsl = s0-sl; + if (debug['M']) { + fputs("\n> Slot coalescing:\n", stderr); + if (n) { + fputs("\tDEAD", stderr); + for (m=0; mtmp[kill[m]].name); + fputc('\n', stderr); + } + } + while (n--) { + t = &fn->tmp[kill[n]]; + assert(t->ndef == 1 && t->ins); + *t->ins = (Ins){.op = Onop}; + for (u=t->use; u<&t->use[t->nuse]; u++) { + assert(u->type == UIns); + i = u->u.ins; + if (!req(i->to, R)) { + assert(rtype(i->to) == RTmp); + vgrow(&kill, ++n); + kill[n-1] = i->to.val; + } else + *i = (Ins){.op = Onop}; + } + } + vfree(kill); + + /* fuse slots by decreasing size */ + qsort(sl, nsl, sizeof *sl, scmp); + fused = 0; + for (n=0; ns) + continue; + s0->s = s0; + r = s0->r; + for (s=s0+1; s<&sl[nsl]; s++) { + if (s->s || !s->r.b) + goto Skip; + if (rovlap(r, s->r)) + /* O(n^2) can be approximated + * by 'goto Skip;' if need be + */ + for (m=n; &sl[m]r)) + goto Skip; + radd(&r, s->r.a); + radd(&r, s->r.b - 1); + s->s = s0; + fused += s->sz; + Skip:; + } + } + + /* substitute fused slots */ + for (s=sl; s<&sl[nsl]; s++) { + if (s->s == s) + continue; + t = &fn->tmp[s->t]; + assert(t->ndef == 1 && t->ins); + *t->ins = (Ins){.op = Onop}; + for (u=t->use; u<&t->use[t->nuse]; u++) { + assert(u->type == UIns); + arg = u->u.ins->arg; + for (n=0; n<2; n++) + if (req(arg[n], TMP(s->t))) + arg[n] = TMP(s->s->t); + } + } + + if (debug['M']) { + for (s0=sl; s0<&sl[nsl]; s0++) { + if (s0->s != s0) + continue; + fprintf(stderr, "\tLOCL (% 3db) %s:", + s0->sz, fn->tmp[s0->t].name); + for (s=s0; s<&sl[nsl]; s++) { + if (s->s != s0) + continue; + fprintf(stderr, " %%%s", fn->tmp[s->t].name); + if (s->r.b) + fprintf(stderr, "[%d,%d)", + s->r.a-ip, s->r.b-ip); + else + fputs("{}", stderr); + } + fputc('\n', stderr); + } + fprintf(stderr, "\tSUMS %u/%u/%u (freed/fused/total)\n\n", + freed, fused, total); + printfn(fn, stderr); + } + + vfree(sl); +} -- 2.38.1