~mpu/qbe

Rudimentary implementation of Windows ABI v1 PROPOSED

Finxx: 1
 Rudimentary implementation of Windows ABI

 7 files changed, 41 insertions(+), 6 deletions(-)
Thanks a lot! That is really cool. I can publicize it on
the website and include it in a branch on the official
git repository. I would like to keep master with features
that are either fully supported, or close to that.

About how to best use git with email, I suggest you
take a look at Drew's nice tutorial[1].

Thanks again for your contribution!


[1]: https://git-send-email.io/
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~mpu/qbe/patches/49766/mbox | git am -3
Learn more about email & git

[PATCH] Rudimentary implementation of Windows ABI Export this patch

This is a patch to implement Windows' x64 ABI. It works with the
example on QBE's intro page, but I haven't tested anything else. I
figured basic Windows support is better than none at all. Compiles
with MSYS2 UCRT64, and probably also Cygwin.

Patch (generated with git diff, is that the correct way? I've never
done this before, sorry.)

diff --git a/Makefile b/Makefile
index f5e8a76..0a715f5 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ BINDIR = $(PREFIX)/bin

COMMOBJ  = main.o util.o parse.o abi.o cfg.o mem.o ssa.o alias.o load.o \
           copy.o fold.o simpl.o live.o spill.o rega.o emit.o
AMD64OBJ = amd64/targ.o amd64/sysv.o amd64/isel.o amd64/emit.o
AMD64OBJ = amd64/targ.o amd64/sysv.o amd64/isel.o amd64/emit.o amd64/winabi.o
ARM64OBJ = arm64/targ.o arm64/abi.o arm64/isel.o arm64/emit.o
RV64OBJ  = rv64/targ.o rv64/abi.o rv64/isel.o rv64/emit.o
OBJ      = $(COMMOBJ) $(AMD64OBJ) $(ARM64OBJ) $(RV64OBJ)
diff --git a/all.h b/all.h
index e421b9c..ca9769f 100644
--- a/all.h
+++ b/all.h
@@ -43,6 +43,7 @@ enum {
struct Target {
   char name[16];
   char apple;
    char windows;
   int gpr0;   /* first general purpose reg */
   int ngpr;
   int fpr0;   /* first floating point reg */
@@ -573,4 +574,5 @@ void emitdbgloc(uint, uint, FILE *);
int stashbits(void *, int);
void elf_emitfnfin(char *, FILE *);
void elf_emitfin(FILE *);
void win_emitfin(FILE *);
void macho_emitfin(FILE *);
diff --git a/amd64/all.h b/amd64/all.h
index 3a2db0e..d1a28c2 100644
--- a/amd64/all.h
+++ b/amd64/all.h
@@ -63,6 +63,13 @@ bits amd64_sysv_retregs(Ref, int[2]);
bits amd64_sysv_argregs(Ref, int[2]);
void amd64_sysv_abi(Fn *);

/* winabi.c (abi) */
extern int amd64_winabi_rsave[];
extern int amd64_winabi_rclob[];
bits amd64_winabi_retregs(Ref, int[2]);
bits amd64_winabi_argregs(Ref, int[2]);
void amd64_winabi_abi(Fn *);

/* isel.c */
void amd64_isel(Fn *);

diff --git a/amd64/emit.c b/amd64/emit.c
index 51d1a5c..7cfaa61 100644
--- a/amd64/emit.c
+++ b/amd64/emit.c
@@ -651,6 +651,6 @@ amd64_emitfn(Fn *fn, FILE *f)
       }
   }
   id0 += fn->nblk;
   if (!T.apple)
   if (!T.apple && !T.windows)
       elf_emitfnfin(fn->name, f);
}
diff --git a/amd64/targ.c b/amd64/targ.c
index fba9144..12d364a 100644
--- a/amd64/targ.c
+++ b/amd64/targ.c
@@ -19,13 +19,9 @@ amd64_memargs(int op)
   .nfpr = NFPR, \
   .rglob = BIT(RBP) | BIT(RSP), \
   .nrglob = 2, \
   .rsave = amd64_sysv_rsave, \
   .nrsave = {NGPS, NFPS}, \
   .retregs = amd64_sysv_retregs, \
   .argregs = amd64_sysv_argregs, \
   .memargs = amd64_memargs, \
   .abi0 = elimsb, \
   .abi1 = amd64_sysv_abi, \
   .isel = amd64_isel, \
   .emitfn = amd64_emitfn, \

@@ -33,6 +29,10 @@ Target T_amd64_sysv = {
   .name = "amd64_sysv",
   .emitfin = elf_emitfin,
   .asloc = ".L",
   .abi1 = amd64_sysv_abi,
   .retregs = amd64_sysv_retregs,
   .argregs = amd64_sysv_argregs,
   .rsave = amd64_sysv_rsave,
   AMD64_COMMON
};

@@ -42,5 +42,21 @@ Target T_amd64_apple = {
   .emitfin = macho_emitfin,
   .asloc = "L",
   .assym = "_",
   .abi1 = amd64_sysv_abi,
   .retregs = amd64_sysv_retregs,
   .argregs = amd64_sysv_argregs,
   .rsave = amd64_sysv_rsave,
   AMD64_COMMON
};

/* Target description for amd64 using the Windows ABI lowering from
 * amd64/winabi.c.  Fields not listed here come from AMD64_COMMON.
 */
Target T_amd64_win = {
    .name = "amd64_win",
    .windows = 1, /* tested by amd64_emitfn to skip elf_emitfnfin */
    .emitfin = win_emitfin,
    .asloc = ".L",
   .abi1 = amd64_winabi_abi,
   .retregs = amd64_winabi_retregs,
   .argregs = amd64_winabi_argregs,
   .rsave = amd64_winabi_rsave,
    AMD64_COMMON
};
diff --git a/emit.c b/emit.c
index 490628e..2f2cd1d 100644
--- a/emit.c
+++ b/emit.c
@@ -189,6 +189,14 @@ elf_emitfin(FILE *f)
   fprintf(f, ".section .note.GNU-stack,\"\",@progbits\n");
}

/* Finish the output file for the Windows target by delegating to
 * emitfin() with ".rodata" as each of the three section names.
 */
void
win_emitfin(FILE *f)
{
    static char *rosec[3] = {
        ".rodata",
        ".rodata",
        ".rodata",
    };

    emitfin(f, rosec);
}

void
elf_emitfnfin(char *fn, FILE *f)
{
diff --git a/main.c b/main.c
index 5ecb4d0..bd1bc41 100644
--- a/main.c
+++ b/main.c
@@ -20,6 +20,7 @@ char debug['Z'+1] = {

extern Target T_amd64_sysv;
extern Target T_amd64_apple;
extern Target T_amd64_win;
extern Target T_arm64;
extern Target T_arm64_apple;
extern Target T_rv64;
@@ -27,6 +28,7 @@ extern Target T_rv64;
static Target *tlist[] = {
   &T_amd64_sysv,
   &T_amd64_apple,
    &T_amd64_win,
   &T_arm64,
   &T_arm64_apple,
   &T_rv64,
amd64/winabi.c:
#include "all.h"

typedef struct AClass AClass;
typedef struct RAlloc RAlloc;

/* Passing classification of one argument, parameter or return value. */
struct AClass {
    Typ *type;   /* aggregate type that was classified (set by typclass) */
    int inmem;   /* 0: in registers, 1: aggregate in memory,
                  * 2: scalar argument on the stack (set by argsclass) */
    int align;   /* log2 of the alignment; argsclass uses 3 for scalars */
    uint size;   /* size in bytes after rounding up in typclass; 8 for scalars */
    int cls[2];  /* class (Kx, Kl, Kd, ...) of each 8-byte half */
    Ref ref[2];  /* temporaries holding each half while moving it
                  * between registers and memory (selcall, selpar) */
};

/* Singly linked list of pending stack-allocation instructions.  selcall
 * pushes one entry per aggregate-returning call and amd64_winabi_abi
 * emits the whole list while processing the start block.
 */
struct RAlloc {
    Ins i;        /* the Oalloc instruction to emit */
    RAlloc *link; /* next pending allocation, or 0 */
};

/* Walk the fields of aggregate t, which starts at byte offset s inside
 * the value being classified, and update the class of every 8-byte
 * half recorded in a->cls.  Each union alternative is walked from the
 * same starting offset.  An integer field forces its half to Kl; a
 * floating-point field sets it to Kd only when the half is still
 * unclassified (Kx); padding leaves the class alone.
 */
static void
classify(AClass *a, Typ *t, uint s)
{
    Field *fld;
    int *half;
    uint alt, off;

    alt = 0;
    while (alt < t->nunion) {
        off = s;
        for (fld = t->fields[alt]; fld->type != FEnd; fld++) {
            assert(off <= 16);
            half = &a->cls[off/8];
            switch (fld->type) {
            case FEnd:
                die("unreachable");
            case FPad:
                /* padding never changes the class */
                off += fld->len;
                break;
            case Fs:
            case Fd:
                if (*half == Kx)
                    *half = Kd;
                off += fld->len;
                break;
            case Fb:
            case Fh:
            case Fw:
            case Fl:
                *half = Kl;
                off += fld->len;
                break;
            case FTyp:
                /* nested aggregate: classify it in place */
                classify(a, &typ[fld->len], off);
                off += typ[fld->len].size;
                break;
            }
        }
        alt++;
    }
}

/* Fill a with the classification of aggregate type t: its rounded
 * size, its alignment, whether it has to live in memory, and, when it
 * does not, the class of each 8-byte half.
 */
static void
typclass(AClass *a, Typ *t)
{
    uint bytes, rounded;

    /* Sizes are rounded up to a multiple of the alignment, and to a
     * multiple of 8 at least, which also makes it easy to load and
     * store the structure through registers.
     */
    bytes = 1u << t->align;
    if (bytes < 8)
        bytes = 8;
    rounded = (t->size + bytes-1) & -bytes;

    a->type = t;
    a->align = t->align;
    a->size = rounded;

    /* opaque, empty, or larger than two 8-byte halves: in memory */
    a->inmem = (t->isdark || rounded > 16 || rounded == 0);
    if (a->inmem)
        return;

    a->cls[0] = a->cls[1] = Kx;
    classify(a, t, 0);
}

/* Choose the return register for each 8-byte half of aret and store it
 * in reg[].  Integer halves take RAX then RDX, floating-point halves
 * take XMM0 then XMM0+1.  The result counts the registers used: 1 is
 * added per integer register and 1<<2 per floating-point register.
 */
static int
retr(Ref reg[2], AClass *aret)
{
    static int table[2][2] = {
        {RAX, RDX},
        {XMM0, XMM0+1},
    };
    int used[2] = {0, 0};
    int half, base, count;

    count = 0;
    half = 0;
    while ((uint)half*8 < aret->size) {
        base = KBASE(aret->cls[half]);
        reg[half] = TMP(table[base][used[base]]);
        used[base]++;
        count += 1 << (2 * base);
        half++;
    }
    return count;
}

/* Lower the return jump of block b.  A valued return (aggregate Jretc
 * or a scalar return) becomes Jret0: the value is moved into the return
 * registers and b->jmp.arg is replaced by a CALL() descriptor whose low
 * bits count the registers used (1 per integer register, 1<<2 per
 * floating-point register).  Other jumps are left untouched.
 *
 * NOTE(review): the emit() calls appear to be written in reverse of
 * execution order (the buffer is reset to &insb[NIns] by the caller
 * and consumed from curi) - confirm against emit().
 */
static void
selret(Blk *b, Fn *fn)
{
    int j, k, ca;
    Ref r, r0, reg[2];
    AClass aret;

    j = b->jmp.type;

    /* not a return, or a return without a value: nothing to lower */
    if (!isret(j) || j == Jret0)
        return;

    r0 = b->jmp.arg;
    b->jmp.type = Jret0;

    if (j == Jretc) {
        /* aggregate return; r0 is used as the address of the value */
        typclass(&aret, &typ[fn->retty]);
        if (aret.inmem) {
            /* blit the value into the buffer named by fn->retr
             * (set up in selpar) and return that address in RAX
             */
            assert(rtype(fn->retr) == RTmp);
            emit(Ocopy, Kl, TMP(RAX), fn->retr, R);
            emit(Oblit1, 0, R, INT(aret.type->size), R);
            emit(Oblit0, 0, R, r0, fn->retr);
            ca = 1;
        } else {
            /* load each 8-byte half into the register picked by retr */
            ca = retr(reg, &aret);
            if (aret.size > 8) {
                r = newtmp("abi", Kl, fn);
                emit(Oload, Kl, reg[1], r, R);
                emit(Oadd, Kl, r, r0, getcon(8, fn));
            }
            emit(Oload, Kl, reg[0], r0, R);
        }
    } else {
        /* scalar return: the class is encoded in the jump type */
        k = j - Jretw;
        if (KBASE(k) == 0) {
            emit(Ocopy, k, TMP(RAX), r0, R);
            ca = 1;
        } else {
            emit(Ocopy, k, TMP(XMM0), r0, R);
            ca = 1 << 2;
        }
    }

    b->jmp.arg = CALL(ca);
}

/* Classify the call arguments (op == Oarg) or function parameters
 * (op == Opar) held in the instructions [i0, i1), writing one AClass
 * per instruction into ac.
 *
 * aret, when non-null, is the classification of the aggregate return
 * type; an in-memory return consumes one integer register for the
 * hidden result pointer.  *env receives the environment reference of
 * an env argument/parameter, if there is one.
 *
 * Returns the register usage packed for CALL(): the number of integer
 * registers used starting at bit 4, the number of SSE registers used
 * starting at bit 8, and bit 12 set for variadic or env calls.
 *
 * Only four integer registers are handed out because
 * amd64_winabi_rsave lists exactly four (RCX, RDX, R8, R9) and both
 * rarg() and amd64_winabi_argregs() index that array with the counts
 * computed here; a limit of six would read its -1 terminator and then
 * past its end.  SSE arguments are likewise limited to XMM0-XMM3.
 *
 * NOTE(review): integer and SSE registers are still counted
 * independently, whereas the Microsoft x64 convention assigns the four
 * register slots by argument position - confirm and rework if mixed
 * int/float signatures must interoperate with other compilers.
 */
static int
argsclass(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret, Ref *env)
{
    enum { NArgInt = 4, NArgSse = 4 };
    int varc, envc, nint, ni, nsse, ns, n, *pn;
    AClass *a;
    Ins *i;

    nint = NArgInt;
    if (aret && aret->inmem)
        nint--; /* hidden argument */
    nsse = NArgSse;
    varc = 0;
    envc = 0;
    for (i=i0, a=ac; i<i1; i++, a++)
        switch (i->op - op + Oarg) {
        case Oarg:
            /* scalar: takes a register of its class if one is left,
             * otherwise goes on the stack (inmem == 2)
             */
            if (KBASE(i->cls) == 0)
                pn = &nint;
            else
                pn = &nsse;
            if (*pn > 0) {
                --*pn;
                a->inmem = 0;
            } else
                a->inmem = 2;
            a->align = 3;
            a->size = 8;
            a->cls[0] = i->cls;
            break;
        case Oargc:
            /* aggregate: stays in registers only if every 8-byte
             * half can get one, otherwise it is passed in memory
             */
            n = i->arg[0].val;
            typclass(a, &typ[n]);
            if (a->inmem)
                continue;
            ni = ns = 0;
            for (n=0; (uint)n*8<a->size; n++)
                if (KBASE(a->cls[n]) == 0)
                    ni++;
                else
                    ns++;
            if (nint >= ni && nsse >= ns) {
                nint -= ni;
                nsse -= ns;
            } else
                a->inmem = 1;
            break;
        case Oarge:
            envc = 1;
            if (op == Opar)
                *env = i->to;
            else
                *env = i->arg[0];
            break;
        case Oargv:
            varc = 1;
            break;
        default:
            die("unreachable");
        }

    if (varc && envc)
        err("winabi does not support variadic env calls");

    return ((varc|envc) << 12) | ((NArgInt-nint) << 4) | ((NArgSse-nsse) << 8);
}

/* Integer argument registers in assignment order, terminated by -1.
 * rarg() and amd64_winabi_argregs() index this array by argument
 * count, so callers must never ask for more than its four registers.
 * T_amd64_win also installs it as the target's .rsave list.
 * NOTE(review): AMD64_COMMON sets .nrsave = {NGPS, NFPS}; confirm
 * those counts are valid for a list this short.
 */
int amd64_winabi_rsave[] = {
    RCX, RDX, R8, R9, -1
};
/* NOTE(review): presumably the callee-saved registers for this ABI,
 * terminated by -1; nothing in this file reads the array - verify its
 * users and its contents against the Windows x64 register rules.
 */
int amd64_winabi_rclob[] = {RBX, R12, R13, R14, R15, -1};

/* Decode the return-register part of a CALL() descriptor.
 *
 * r must be an RCall reference; bits 0-1 of r.val hold the number of
 * integer return registers and bits 2-3 the number of floating-point
 * return registers, as produced by retr(), selret() and selcall().
 * When p is non-null, p[0] and p[1] receive those two counts.
 *
 * Returns the set of registers that carry the return value.  The set
 * mirrors retr(): integer values use RAX then RDX, floating-point
 * values use XMM0 then XMM0+1.  A void return (both counts zero)
 * yields an empty set, so the result always agrees with the counts
 * reported through p.
 */
bits
amd64_winabi_retregs(Ref r, int p[2])
{
    bits b;
    int ni, nf;

    assert(rtype(r) == RCall);
    b = 0;
    ni = r.val & 3;
    nf = (r.val >> 2) & 3;
    if (ni >= 1)
        b |= BIT(RAX);
    if (ni >= 2)
        b |= BIT(RDX);
    if (nf >= 1)
        b |= BIT(XMM0);
    if (nf >= 2)
        b |= BIT(XMM0+1);
    if (p) {
        p[0] = ni;
        p[1] = nf;
    }
    return b;
}

/* Decode the argument-register part of a CALL() descriptor.
 *
 * r must be an RCall reference.  Bits 4-7 of r.val give the number of
 * integer argument registers, bits 8-11 the number of SSE argument
 * registers, and bit 12 says whether RAX is used as well.  When p is
 * non-null, p[0] receives the integer count (RAX included) and p[1]
 * the SSE count.  Returns the set of registers used.
 */
bits
amd64_winabi_argregs(Ref r, int p[2])
{
    bits regs;
    int k, nint, nsse, rax;

    assert(rtype(r) == RCall);
    nint = (r.val >> 4) & 15;
    nsse = (r.val >> 8) & 15;
    rax = (r.val >> 12) & 1;

    regs = 0;
    k = 0;
    while (k < nint) {
        regs |= BIT(amd64_winabi_rsave[k]);
        k++;
    }
    k = 0;
    while (k < nsse) {
        regs |= BIT(XMM0+k);
        k++;
    }
    if (rax)
        regs |= BIT(RAX);

    if (p) {
        p[0] = nint + rax;
        p[1] = nsse;
    }
    return regs;
}

/* Return the next free argument register for a value of class ty and
 * advance the matching counter: *ni indexes amd64_winabi_rsave for
 * integer classes, *ns counts up from XMM0 for floating-point ones.
 */
static Ref
rarg(int ty, int *ni, int *ns)
{
    int reg;

    if (KBASE(ty) != 0) {
        reg = XMM0 + *ns;
        *ns += 1;
    } else {
        reg = amd64_winabi_rsave[*ni];
        *ni += 1;
    }
    return TMP(reg);
}

/* Lower one call.  [i0, i1) are its argument instructions and i1 is
 * the Ocall itself.  Arguments are classified with argsclass, register
 * arguments are copied or loaded into the registers chosen by rarg,
 * memory arguments are written into a freshly reserved stack area, and
 * the result is fetched from RAX/XMM0 or, for aggregates, through a
 * return pad whose allocation is pushed on *rap for the caller to emit.
 *
 * NOTE(review): the emit() calls appear to be written in reverse of
 * execution order (result handling first, argument setup last) -
 * confirm against emit().
 * NOTE(review): only the space needed by memory arguments is
 * reserved; the Microsoft x64 convention also expects a 32-byte home
 * area below the arguments - confirm whether that is handled elsewhere.
 */
static void
selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
{
    Ins *i;
    AClass *ac, *a, aret;
    int ca, ni, ns, al;
    uint stk, off;
    Ref r, r1, r2, reg[2], env;
    RAlloc *ra;

    env = R;
    ac = alloc((i1-i0) * sizeof ac[0]);

    /* a non-empty i1->arg[1] names the aggregate return type */
    if (!req(i1->arg[1], R)) {
        assert(rtype(i1->arg[1]) == RType);
        typclass(&aret, &typ[i1->arg[1].val]);
        ca = argsclass(i0, i1, ac, Oarg, &aret, &env);
    } else
        ca = argsclass(i0, i1, ac, Oarg, 0, &env);

    /* total size of the memory arguments, walking them last to
     * first; the padding steps work on sizes that are multiples of 8
     */
    for (stk=0, a=&ac[i1-i0]; a>ac;)
        if ((--a)->inmem) {
            if (a->align > 4)
                err("win abi requires alignments of 16 or less");
            stk += a->size;
            if (a->align == 4)
                stk += stk & 15;
        }
    stk += stk & 15;
    if (stk) {
        /* negative size: pairs with the Osalloc of +stk emitted at
         * the end of this function
         */
        r = getcon(-(int64_t)stk, fn);
        emit(Osalloc, Kl, R, r, R);
    }

    if (!req(i1->arg[1], R)) {
        if (aret.inmem) {
            /* get the return location from rax
             * it saves one callee-save reg */
            r1 = newtmp("abi", Kl, fn);
            emit(Ocopy, Kl, i1->to, TMP(RAX), R);
            ca += 1;
        } else {
            /* todo, may read out of bounds.
             * gcc did this up until 5.2, but
             * this should still be fixed.
             */
            /* store each returned half into the pad at i1->to */
            if (aret.size > 8) {
                r = newtmp("abi", Kl, fn);
                aret.ref[1] = newtmp("abi", aret.cls[1], fn);
                emit(Ostorel, 0, R, aret.ref[1], r);
                emit(Oadd, Kl, r, i1->to, getcon(8, fn));
            }
            aret.ref[0] = newtmp("abi", aret.cls[0], fn);
            emit(Ostorel, 0, R, aret.ref[0], i1->to);
            ca += retr(reg, &aret);
            if (aret.size > 8)
                emit(Ocopy, aret.cls[1], aret.ref[1], reg[1], R);
            emit(Ocopy, aret.cls[0], aret.ref[0], reg[0], R);
            r1 = i1->to;
        }
        /* allocate return pad */
        ra = alloc(sizeof *ra);
        /* specific to NAlign == 3 */
        al = aret.align >= 2 ? aret.align - 2 : 0;
        ra->i = (Ins){Oalloc+al, Kl, r1, {getcon(aret.size, fn)}};
        ra->link = (*rap);
        *rap = ra;
    } else {
        /* scalar result: copy it out of RAX or XMM0 */
        ra = 0;
        if (KBASE(i1->cls) == 0) {
            emit(Ocopy, i1->cls, i1->to, TMP(RAX), R);
            ca += 1;
        } else {
            emit(Ocopy, i1->cls, i1->to, TMP(XMM0), R);
            ca += 1 << 2;
        }
    }

    emit(Ocall, i1->cls, R, i1->arg[0], CALL(ca));

    /* RAX carries the environment, or for variadic calls the
     * number of SSE registers used
     */
    if (!req(R, env))
        emit(Ocopy, Kl, TMP(RAX), env, R);
    else if ((ca >> 12) & 1) /* vararg call */
        emit(Ocopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);

    ni = ns = 0;
    if (ra && aret.inmem)
        emit(Ocopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */

    /* register arguments */
    for (i=i0, a=ac; i<i1; i++, a++) {
        if (i->op >= Oarge || a->inmem)
            continue;
        r1 = rarg(a->cls[0], &ni, &ns);
        if (i->op == Oargc) {
            /* aggregate: load each 8-byte half from i->arg[1] */
            if (a->size > 8) {
                r2 = rarg(a->cls[1], &ni, &ns);
                r = newtmp("abi", Kl, fn);
                emit(Oload, a->cls[1], r2, r, R);
                emit(Oadd, Kl, r, i->arg[1], getcon(8, fn));
            }
            emit(Oload, a->cls[0], r1, i->arg[1], R);
        } else
            emit(Ocopy, i->cls, r1, i->arg[0], R);
    }

    if (!stk)
        return;

    /* memory arguments: r is the base of the reserved area and off
     * the running offset of the next argument inside it
     */
    r = newtmp("abi", Kl, fn);
    for (i=i0, a=ac, off=0; i<i1; i++, a++) {
        if (i->op >= Oarge || !a->inmem)
            continue;
        r1 = newtmp("abi", Kl, fn);
        if (i->op == Oargc) {
            if (a->align == 4)
                off += off & 15;
            emit(Oblit1, 0, R, INT(a->type->size), R);
            emit(Oblit0, 0, R, i->arg[1], r1);
        } else
            emit(Ostorel, 0, R, i->arg[0], r1);
        emit(Oadd, Kl, r1, r, getcon(off, fn));
        off += a->size;
    }
    emit(Osalloc, Kl, r, getcon(stk, fn), R);
}

/* Lower the parameter instructions [i0, i1) of fn.  Register
 * parameters are copied out of the registers chosen by rarg, aggregate
 * parameters passed in registers are spilled to a fresh stack
 * allocation, and memory parameters are bound to stack slots.  Also
 * sets fn->reg to the argument registers in use and, for an in-memory
 * aggregate return, fn->retr to a temporary holding the hidden result
 * pointer.
 *
 * Returns the argsclass descriptor with the final slot counter
 * (scaled by 4) or-ed in from bit 12 upward; selvastart reads it back.
 *
 * NOTE(review): the emit() calls appear to be written in reverse of
 * execution order - confirm against emit().
 */
static int
selpar(Fn *fn, Ins *i0, Ins *i1)
{
    AClass *ac, *a, aret;
    Ins *i;
    int ni, ns, s, al, fa;
    Ref r, env;

    env = R;
    ac = alloc((i1-i0) * sizeof ac[0]);
    curi = &insb[NIns]; /* start from an empty instruction buffer */
    ni = ns = 0;

    if (fn->retty >= 0) {
        typclass(&aret, &typ[fn->retty]);
        fa = argsclass(i0, i1, ac, Opar, &aret, &env);
    } else
        fa = argsclass(i0, i1, ac, Opar, 0, &env);
    fn->reg = amd64_winabi_argregs(CALL(fa), 0);

    /* aggregates received in registers: allocate stack space for
     * i->to and store both halves (held in a->ref[]) into it
     */
    for (i=i0, a=ac; i<i1; i++, a++) {
        if (i->op != Oparc || a->inmem)
            continue;
        if (a->size > 8) {
            r = newtmp("abi", Kl, fn);
            a->ref[1] = newtmp("abi", Kl, fn);
            emit(Ostorel, 0, R, a->ref[1], r);
            emit(Oadd, Kl, r, i->to, getcon(8, fn));
        }
        a->ref[0] = newtmp("abi", Kl, fn);
        emit(Ostorel, 0, R, a->ref[0], i->to);
        /* specific to NAlign == 3 */
        al = a->align >= 2 ? a->align - 2 : 0;
        emit(Oalloc+al, Kl, i->to, getcon(a->size, fn), R);
    }

    /* the hidden result pointer takes the first integer register */
    if (fn->retty >= 0 && aret.inmem) {
        r = newtmp("abi", Kl, fn);
        emit(Ocopy, Kl, r, rarg(Kl, &ni, &ns), R);
        fn->retr = r;
    }

    /* s counts 4-byte units and starts at 4; it advances by
     * a->size/4 per in-memory aggregate and by 2 per stack scalar
     */
    for (i=i0, a=ac, s=4; i<i1; i++, a++) {
        switch (a->inmem) {
        case 1:
            /* aggregate in memory: bind the temporary to its slot */
            if (a->align > 4)
                err("win abi requires alignments of 16 or less");
            if (a->align == 4)
                s = (s+3) & -4;
            fn->tmp[i->to.val].slot = -s;
            s += a->size / 4;
            continue;
        case 2:
            /* scalar on the stack: load it from its slot */
            emit(Oload, i->cls, i->to, SLOT(-s), R);
            s += 2;
            continue;
        }
        if (i->op == Opare)
            continue;
        r = rarg(a->cls[0], &ni, &ns);
        if (i->op == Oparc) {
            emit(Ocopy, a->cls[0], a->ref[0], r, R);
            if (a->size > 8) {
                r = rarg(a->cls[1], &ni, &ns);
                emit(Ocopy, a->cls[1], a->ref[1], r, R);
            }
        } else
            emit(Ocopy, i->cls, i->to, r, R);
    }

    /* the environment parameter arrives in RAX */
    if (!req(R, env))
        emit(Ocopy, Kl, env, TMP(RAX), R);

    return fa | (s*4)<<12;
}

/* Create a new block holding the instructions currently pending in the
 * emission buffer, reset that buffer, and link the new block right
 * after b.  The new block is named "<b->name>.<n>" where n is b's
 * incremented visit counter, and it inherits b's loop value.
 */
static Blk *
split(Fn *fn, Blk *b)
{
    Blk *fresh;

    fn->nblk += 1;
    fresh = newblk();

    /* move the pending instructions into the new block */
    fresh->nins = &insb[NIns] - curi;
    idup(&fresh->ins, curi, fresh->nins);
    curi = &insb[NIns];

    b->visit += 1;
    fresh->visit = b->visit;
    strf(fresh->name, "%s.%d", b->name, b->visit);
    fresh->loop = b->loop;

    /* splice it into the block list after b */
    fresh->link = b->link;
    b->link = fresh;
    return fresh;
}

/* In every phi of block b, replace the predecessor entry bp with bp1.
 * Each phi is asserted to contain bp among its narg entries.
 */
static void
chpred(Blk *b, Blk *bp, Blk *bp1)
{
    Phi *phi;
    uint k;

    phi = b->phi;
    while (phi) {
        k = 0;
        while (phi->blk[k] != bp) {
            assert(k+1<phi->narg);
            k++;
        }
        phi->blk[k] = bp1;
        phi = phi->link;
    }
}

/* Lower the Ovaarg instruction i found in block b.  The block is split
 * into b, a register path (breg), a stack path (bstk) and a join block
 * (b0) that loads the value through a phi of both locations.  The
 * diagram below shows the resulting code in execution order.
 *
 * The va_list at ap is read as: a 32-bit integer-register offset at
 * ap+0, a 32-bit floating-point-register offset at ap+4, a stack
 * pointer at ap+8, and a register-save-area pointer at ap+16.
 * NOTE(review): this matches the System V va_list layout - confirm it
 * is what the Windows target intends.
 */
static void
selvaarg(Fn *fn, Blk *b, Ins *i)
{
    Ref loc, lreg, lstk, nr, r0, r1, c4, c8, c16, c, ap;
    Blk *b0, *bstk, *breg;
    int isint;

    c4 = getcon(4, fn);
    c8 = getcon(8, fn);
    c16 = getcon(16, fn);
    ap = i->arg[0];
    isint = KBASE(i->cls) == 0;

    /* @b [...]
           r0 =l add ap, (0 or 4)
           nr =l loadsw r0
           r1 =w cultw nr, (48 or 176)
           jnz r1, @breg, @bstk
       @breg
           r0 =l add ap, 16
           r1 =l loadl r0
           lreg =l add r1, nr
           r0 =w add nr, (8 or 16)
           r1 =l add ap, (0 or 4)
           storew r0, r1
       @bstk
           r0 =l add ap, 8
           lstk =l loadl r0
           r1 =l add lstk, 8
           storel r1, r0
       @b0
           %loc =l phi @breg %lreg, @bstk %lstk
           i->to =(i->cls) load %loc
    */

    /* @b0: takes over b's jump and successors, so the phis of the
     * successors must name b0 instead of b
     */
    loc = newtmp("abi", Kl, fn);
    emit(Oload, i->cls, i->to, loc, R);
    b0 = split(fn, b);
    b0->jmp = b->jmp;
    b0->s1 = b->s1;
    b0->s2 = b->s2;
    if (b->s1)
        chpred(b->s1, b, b0);
    if (b->s2 && b->s2 != b->s1)
        chpred(b->s2, b, b0);

    /* @breg: fetch from the register save area and bump the offset */
    lreg = newtmp("abi", Kl, fn);
    nr = newtmp("abi", Kl, fn);
    r0 = newtmp("abi", Kw, fn);
    r1 = newtmp("abi", Kl, fn);
    emit(Ostorew, Kw, R, r0, r1);
    emit(Oadd, Kl, r1, ap, isint ? CON_Z : c4);
    emit(Oadd, Kw, r0, nr, isint ? c8 : c16);
    r0 = newtmp("abi", Kl, fn);
    r1 = newtmp("abi", Kl, fn);
    emit(Oadd, Kl, lreg, r1, nr);
    emit(Oload, Kl, r1, r0, R);
    emit(Oadd, Kl, r0, ap, c16);
    breg = split(fn, b);
    breg->jmp.type = Jjmp;
    breg->s1 = b0;

    /* @bstk: fetch from the stack area and advance its pointer by 8 */
    lstk = newtmp("abi", Kl, fn);
    r0 = newtmp("abi", Kl, fn);
    r1 = newtmp("abi", Kl, fn);
    emit(Ostorel, Kw, R, r1, r0);
    emit(Oadd, Kl, r1, lstk, c8);
    emit(Oload, Kl, lstk, r0, R);
    emit(Oadd, Kl, r0, ap, c8);
    bstk = split(fn, b);
    bstk->jmp.type = Jjmp;
    bstk->s1 = b0;

    /* phi in @b0 selecting the location computed on either path */
    b0->phi = alloc(sizeof *b0->phi);
    *b0->phi = (Phi){
        .cls = Kl, .to = loc,
        .narg = 2,
        .blk = vnew(2, sizeof b0->phi->blk[0], PFn),
        .arg = vnew(2, sizeof b0->phi->arg[0], PFn),
    };
    b0->phi->blk[0] = bstk;
    b0->phi->blk[1] = breg;
    b0->phi->arg[0] = lstk;
    b0->phi->arg[1] = lreg;

    /* tail of @b: branch on whether the offset is still below the
     * end of its register class (48 for integers, 176 for floats)
     */
    r0 = newtmp("abi", Kl, fn);
    r1 = newtmp("abi", Kw, fn);
    b->jmp.type = Jjnz;
    b->jmp.arg = r1;
    b->s1 = breg;
    b->s2 = bstk;
    c = getcon(isint ? 48 : 176, fn);
    emit(Ocmpw+Ciult, Kw, r1, nr, c);
    emit(Oloadsw, Kl, nr, r0, R);
    emit(Oadd, Kl, r0, ap, isint ? CON_Z : c4);
}

/* Lower Ovastart: initialise the va_list at address ap from fa, the
 * descriptor returned by selpar.  In execution order the stores are:
 *   [ap+0]  = gp, 8 bytes per integer register already used
 *   [ap+4]  = fp, 48 plus 16 bytes per SSE register already used
 *   [ap+8]  = RBP + sp, where sp is fa >> 12
 *   [ap+16] = RBP - 176
 * NOTE(review): these offsets match the System V va_list and register
 * save area - confirm they are what the Windows target intends.
 * NOTE(review): the emit() calls appear to be written in reverse of
 * execution order - confirm against emit().
 */
static void
selvastart(Fn *fn, int fa, Ref ap)
{
    Ref r0, r1;
    int gp, fp, sp;

    gp = ((fa >> 4) & 15) * 8;
    fp = 48 + ((fa >> 8) & 15) * 16;
    sp = fa >> 12;
    /* [ap+16] = RBP - 176 */
    r0 = newtmp("abi", Kl, fn);
    r1 = newtmp("abi", Kl, fn);
    emit(Ostorel, Kw, R, r1, r0);
    emit(Oadd, Kl, r1, TMP(RBP), getcon(-176, fn));
    emit(Oadd, Kl, r0, ap, getcon(16, fn));
    /* [ap+8] = RBP + sp */
    r0 = newtmp("abi", Kl, fn);
    r1 = newtmp("abi", Kl, fn);
    emit(Ostorel, Kw, R, r1, r0);
    emit(Oadd, Kl, r1, TMP(RBP), getcon(sp, fn));
    emit(Oadd, Kl, r0, ap, getcon(8, fn));
    /* [ap+4] = fp and [ap+0] = gp */
    r0 = newtmp("abi", Kl, fn);
    emit(Ostorew, Kw, R, getcon(fp, fn), r0);
    emit(Oadd, Kl, r0, ap, getcon(4, fn));
    emit(Ostorew, Kw, R, getcon(gp, fn), ap);
}

/* ABI lowering pass for the Windows amd64 target (installed as .abi1
 * of T_amd64_win).  It lowers the parameters of the start block, then
 * rewrites calls, returns, vastart and vaarg in every block.  The
 * start block is processed last so that the return-pad allocations
 * collected by selcall can be emitted into it.
 */
void
amd64_winabi_abi(Fn *fn)
{
    Blk *b;
    Ins *i, *i0, *ip;
    RAlloc *ral;
    int n, fa;

    /* visit doubles as the split counter used by split() and marks
     * blocks created during this pass so they are not lowered again
     */
    for (b=fn->start; b; b=b->link)
        b->visit = 0;

    /* lower parameters */
    for (b=fn->start, i=b->ins; i<&b->ins[b->nins]; i++)
        if (!ispar(i->op))
            break;
    fa = selpar(fn, b->ins, i);
    /* new start block body: the instructions produced by selpar
     * followed by the original non-parameter instructions
     */
    n = b->nins - (i - b->ins) + (&insb[NIns] - curi);
    i0 = alloc(n * sizeof(Ins));
    ip = icpy(ip = i0, curi, &insb[NIns] - curi);
    ip = icpy(ip, i, &b->ins[b->nins] - i);
    b->nins = n;
    b->ins = i0;

    /* lower calls, returns, and vararg instructions */
    ral = 0;
    b = fn->start;
    do {
        if (!(b = b->link))
            b = fn->start; /* do it last */
        if (b->visit)
            continue;
        curi = &insb[NIns];
        selret(b, fn);
        /* walk the block backwards, re-emitting each instruction */
        for (i=&b->ins[b->nins]; i!=b->ins;)
            switch ((--i)->op) {
            default:
                emiti(*i);
                break;
            case Ocall:
                /* gather the run of argument instructions that
                 * directly precedes the call
                 */
                for (i0=i; i0>b->ins; i0--)
                    if (!isarg((i0-1)->op))
                        break;
                selcall(fn, i0, i, &ral);
                i = i0;
                break;
            case Ovastart:
                selvastart(fn, fa, i->arg[0]);
                break;
            case Ovaarg:
                selvaarg(fn, b, i);
                break;
            case Oarg:
            case Oargc:
                /* arguments are consumed together with their call */
                die("unreachable");
            }
        /* emit the pending return-pad allocations in the start block */
        if (b == fn->start)
            for (; ral; ral=ral->link)
                emiti(ral->i);
        b->nins = &insb[NIns] - curi;
        idup(&b->ins, curi, b->nins);
    } while (b != fn->start);

    if (debug['A']) {
        fprintf(stderr, "\n> After ABI lowering:\n");
        printfn(fn, stderr);
    }
}