~eliasnaur/gio-patches

gio: Multiple optimizations v1 PROPOSED

This is implemented on top of https://lists.sr.ht/~eliasnaur/gio-
patches/patches/33494 change.

This contains bunch of tiny optimizations for functions that are in the
hot path.

Egon Elbre (5):
  f32: add Affine2D.IsOffset and GetOffset
  internal/ops: use lookup table for NumRefs
  internal/ops: avoid some bounds checks in decode
  internal/ops: use single table for OpType
  internal/ops: optimize Decode

 f32/affine.go          |  11 ++++
 gpu/gpu.go             |  10 +---
 internal/ops/ops.go    | 129 ++++++++++++++++++++---------------------
 internal/ops/reader.go |  45 ++++++--------
 4 files changed, 92 insertions(+), 103 deletions(-)

-- 
2.34.2
#792821 apple.yml failed
#792822 freebsd.yml failed
#792823 linux.yml failed
#792824 openbsd.yml failed
I assume this is for performance, right? If so, why isn't the Go compiler clever
enough to see inline Elem?

On Sat Jul 2, 2022 at 2:11 PM CEST, ~egonelbre wrote:
Next
Thanks, merged.

Elias
Thanks, merged.

Elias
Thanks. Merged with a single nit.

Elias

On Sat Jul 2, 2022 at 2:49 PM CEST, ~egonelbre wrote:
Next
gio/patches: FAILED in 3m21s

[Multiple optimizations][0] from [~egonelbre][1]

[0]: https://lists.sr.ht/~eliasnaur/gio-patches/patches/33500
[1]: mailto:egonelbre@gmail.com

✗ #792823 FAILED gio/patches/linux.yml   https://builds.sr.ht/~eliasnaur/job/792823
✗ #792821 FAILED gio/patches/apple.yml   https://builds.sr.ht/~eliasnaur/job/792821
✗ #792824 FAILED gio/patches/openbsd.yml https://builds.sr.ht/~eliasnaur/job/792824
✗ #792822 FAILED gio/patches/freebsd.yml https://builds.sr.ht/~eliasnaur/job/792822
Merged, although unfortunate. Is there a Go issue for tracking this kind of
inefficiency?

Elias
Not that I know of, I was mainly focused on making the compiled output
nicer at the moment.

+ Egon


+ Egon

On Sat, Jul 2, 2022 at 9:19 PM Elias Naur <mail@eliasnaur.com> wrote:
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~eliasnaur/gio-patches/patches/33500/mbox | git am -3
Learn more about email & git

[PATCH gio 1/5] f32: add Affine2D.IsOffset and GetOffset Export this patch

From: Egon Elbre <egonelbre@gmail.com>

Signed-off-by: Egon Elbre <egonelbre@gmail.com>
---
 f32/affine.go | 11 +++++++++++
 gpu/gpu.go    | 10 ++--------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/f32/affine.go b/f32/affine.go
index b056f720..c1bb9b9b 100644
--- a/f32/affine.go
+++ b/f32/affine.go
@@ -121,6 +121,17 @@ func (a *Affine2D) Split() (srs Affine2D, offset Point) {
	}, Point{X: a.c, Y: a.f}
}

// IsOffset returns whether the transform only contains an offset.
func (a *Affine2D) IsOffset() bool {
	return a.a == 0 && a.b == 0 && a.d == 0 && a.e == 0
}

// GetOffset returns the offset from the affine transform.
// It does not take into account any scaling, shearing and rotation.
func (a *Affine2D) GetOffset() Point {
	return Point{X: a.c, Y: a.f}
}

func (a Affine2D) scale(factor Point) Affine2D {
	return Affine2D{
		(a.a+1)*factor.X - 1, a.b * factor.X, a.c * factor.X,
diff --git a/gpu/gpu.go b/gpu/gpu.go
index 0977415b..83135416 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -1338,10 +1338,9 @@ func decodeToOutlineQuads(qs *quadSplitter, tr f32.Affine2D, pathData []byte) {

// create GPU vertices for transformed r, find the bounds and establish texture transform.
func (d *drawOps) boundsForTransformedRect(r f32.Rectangle, tr f32.Affine2D) (aux []byte, bnd f32.Rectangle, ptr f32.Affine2D) {
	if isPureOffset(tr) {
	if tr.IsOffset() {
		// fast-path to allow blitting of pure rectangles
		_, _, ox, _, _, oy := tr.Elems()
		off := f32.Pt(ox, oy)
		off := tr.GetOffset()
		bnd.Min = r.Min.Add(off)
		bnd.Max = r.Max.Add(off)
		return
@@ -1392,8 +1391,3 @@ func (d *drawOps) boundsForTransformedRect(r f32.Rectangle, tr f32.Affine2D) (au

	return
}

func isPureOffset(t f32.Affine2D) bool {
	a, b, _, d, e, _ := t.Elems()
	return a == 1 && b == 0 && d == 0 && e == 1
}
-- 
2.34.2
I assume this is for performance, right? If so, why isn't the Go compiler clever
enough to see inline Elem?

On Sat Jul 2, 2022 at 2:11 PM CEST, ~egonelbre wrote:

[PATCH gio 2/5] internal/ops: use lookup table for NumRefs Export this patch

From: Egon Elbre <egonelbre@gmail.com>

Signed-off-by: Egon Elbre <egonelbre@gmail.com>
---
 internal/ops/ops.go | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/internal/ops/ops.go b/internal/ops/ops.go
index 2d4d1138..3691c272 100644
--- a/internal/ops/ops.go
+++ b/internal/ops/ops.go
@@ -426,17 +426,28 @@ func (t OpType) Size() int {
	return int(opSize[t])
}

var opNumRefs = [0x100]byte{
	TypeKeyFocus:       1,
	TypePointerInput:   1,
	TypeProfile:        1,
	TypeCall:           1,
	TypeClipboardRead:  1,
	TypeClipboardWrite: 1,
	TypeSemanticLabel:  1,
	TypeSemanticDesc:   1,
	TypeSelection:      1,

	TypeKeyInput: 2,
	TypeImage:    2,
	TypeSource:   2,
	TypeTarget:   2,
	TypeSnippet:  2,

	TypeOffer: 3,
}

func (t OpType) NumRefs() int {
	switch t {
	case TypeKeyFocus, TypePointerInput, TypeProfile, TypeCall, TypeClipboardRead, TypeClipboardWrite, TypeSemanticLabel, TypeSemanticDesc, TypeSelection:
		return 1
	case TypeKeyInput, TypeImage, TypeSource, TypeTarget, TypeSnippet:
		return 2
	case TypeOffer:
		return 3
	default:
		return 0
	}
	return int(opNumRefs[t])
}

func (t OpType) String() string {
-- 
2.34.2
Thanks, merged.

Elias

[PATCH gio 3/5] internal/ops: avoid some bounds checks in decode Export this patch

From: Egon Elbre <egonelbre@gmail.com>

Signed-off-by: Egon Elbre <egonelbre@gmail.com>
---
 internal/ops/ops.go    | 3 ++-
 internal/ops/reader.go | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/internal/ops/ops.go b/internal/ops/ops.go
index 3691c272..c68a51ee 100644
--- a/internal/ops/ops.go
+++ b/internal/ops/ops.go
@@ -163,9 +163,10 @@ const (
)

func (op *ClipOp) Decode(data []byte) {
	if OpType(data[0]) != TypeClip {
	if len(data) < TypeClipLen || OpType(data[0]) != TypeClip {
		panic("invalid op")
	}
	data = data[:TypeClipLen]
	bo := binary.LittleEndian
	r := image.Rectangle{
		Min: image.Point{
diff --git a/internal/ops/reader.go b/internal/ops/reader.go
index 63ec9cc3..b7f0f006 100644
--- a/internal/ops/reader.go
+++ b/internal/ops/reader.go
@@ -160,7 +160,7 @@ func (r *Reader) Decode() (EncodedOp, bool) {
}

func (op *opMacroDef) decode(data []byte) {
	if OpType(data[0]) != TypeMacro {
	if len(data) < TypeMacroLen || OpType(data[0]) != TypeMacro {
		panic("invalid op")
	}
	bo := binary.LittleEndian
@@ -176,11 +176,11 @@ func (op *opMacroDef) decode(data []byte) {
}

func (m *macroOp) decode(data []byte, refs []interface{}) {
	if OpType(data[0]) != TypeCall {
	if len(data) < TypeCallLen || len(refs) < 1 || OpType(data[0]) != TypeCall {
		panic("invalid op")
	}
	data = data[:TypeCallLen]
	bo := binary.LittleEndian
	data = data[:TypeCallLen]
	*m = macroOp{
		ops: refs[0].(*Ops),
		start: PC{
-- 
2.34.2
Thanks, merged.

Elias

[PATCH gio 4/5] internal/ops: use single table for OpType Export this patch

From: Egon Elbre <egonelbre@gmail.com>

Size and NumRefs are always used together, so consolidate info to
a single table to avoid two separate lookups.

Signed-off-by: Egon Elbre <egonelbre@gmail.com>
---
 internal/ops/ops.go    | 116 +++++++++++++++++++----------------------
 internal/ops/reader.go |  12 ++---
 2 files changed, 60 insertions(+), 68 deletions(-)

diff --git a/internal/ops/ops.go b/internal/ops/ops.go
index c68a51ee..a998f1cf 100644
--- a/internal/ops/ops.go
+++ b/internal/ops/ops.go
@@ -381,74 +381,66 @@ func DecodeLoad(data []byte) int {
	return int(bo.Uint32(data[1:]))
}

var opSize = [0x100]byte{
	TypeMacro:            TypeMacroLen,
	TypeCall:             TypeCallLen,
	TypeDefer:            TypeDeferLen,
	TypePushTransform:    TypePushTransformLen,
	TypeTransform:        TypeTransformLen,
	TypePopTransform:     TypePopTransformLen,
	TypeInvalidate:       TypeRedrawLen,
	TypeImage:            TypeImageLen,
	TypePaint:            TypePaintLen,
	TypeColor:            TypeColorLen,
	TypeLinearGradient:   TypeLinearGradientLen,
	TypePass:             TypePassLen,
	TypePopPass:          TypePopPassLen,
	TypePointerInput:     TypePointerInputLen,
	TypeClipboardRead:    TypeClipboardReadLen,
	TypeClipboardWrite:   TypeClipboardWriteLen,
	TypeSource:           TypeSourceLen,
	TypeTarget:           TypeTargetLen,
	TypeOffer:            TypeOfferLen,
	TypeKeyInput:         TypeKeyInputLen,
	TypeKeyFocus:         TypeKeyFocusLen,
	TypeKeySoftKeyboard:  TypeKeySoftKeyboardLen,
	TypeSave:             TypeSaveLen,
	TypeLoad:             TypeLoadLen,
	TypeAux:              TypeAuxLen,
	TypeClip:             TypeClipLen,
	TypePopClip:          TypePopClipLen,
	TypeProfile:          TypeProfileLen,
	TypeCursor:           TypeCursorLen,
	TypePath:             TypePathLen,
	TypeStroke:           TypeStrokeLen,
	TypeSemanticLabel:    TypeSemanticLabelLen,
	TypeSemanticDesc:     TypeSemanticDescLen,
	TypeSemanticClass:    TypeSemanticClassLen,
	TypeSemanticSelected: TypeSemanticSelectedLen,
	TypeSemanticDisabled: TypeSemanticDisabledLen,
	TypeSnippet:          TypeSnippetLen,
	TypeSelection:        TypeSelectionLen,
	TypeActionInput:      TypeActionInputLen,
type opProp struct {
	Size    byte
	NumRefs byte
}

var opProps = [0x100]opProp{
	TypeMacro:            {Size: TypeMacroLen, NumRefs: 0},
	TypeCall:             {Size: TypeCallLen, NumRefs: 1},
	TypeDefer:            {Size: TypeDeferLen, NumRefs: 0},
	TypePushTransform:    {Size: TypePushTransformLen, NumRefs: 0},
	TypeTransform:        {Size: TypeTransformLen, NumRefs: 0},
	TypePopTransform:     {Size: TypePopTransformLen, NumRefs: 0},
	TypeInvalidate:       {Size: TypeRedrawLen, NumRefs: 0},
	TypeImage:            {Size: TypeImageLen, NumRefs: 2},
	TypePaint:            {Size: TypePaintLen, NumRefs: 0},
	TypeColor:            {Size: TypeColorLen, NumRefs: 0},
	TypeLinearGradient:   {Size: TypeLinearGradientLen, NumRefs: 0},
	TypePass:             {Size: TypePassLen, NumRefs: 0},
	TypePopPass:          {Size: TypePopPassLen, NumRefs: 0},
	TypePointerInput:     {Size: TypePointerInputLen, NumRefs: 1},
	TypeClipboardRead:    {Size: TypeClipboardReadLen, NumRefs: 1},
	TypeClipboardWrite:   {Size: TypeClipboardWriteLen, NumRefs: 1},
	TypeSource:           {Size: TypeSourceLen, NumRefs: 2},
	TypeTarget:           {Size: TypeTargetLen, NumRefs: 2},
	TypeOffer:            {Size: TypeOfferLen, NumRefs: 3},
	TypeKeyInput:         {Size: TypeKeyInputLen, NumRefs: 2},
	TypeKeyFocus:         {Size: TypeKeyFocusLen, NumRefs: 1},
	TypeKeySoftKeyboard:  {Size: TypeKeySoftKeyboardLen, NumRefs: 0},
	TypeSave:             {Size: TypeSaveLen, NumRefs: 0},
	TypeLoad:             {Size: TypeLoadLen, NumRefs: 0},
	TypeAux:              {Size: TypeAuxLen, NumRefs: 0},
	TypeClip:             {Size: TypeClipLen, NumRefs: 0},
	TypePopClip:          {Size: TypePopClipLen, NumRefs: 0},
	TypeProfile:          {Size: TypeProfileLen, NumRefs: 1},
	TypeCursor:           {Size: TypeCursorLen, NumRefs: 0},
	TypePath:             {Size: TypePathLen, NumRefs: 0},
	TypeStroke:           {Size: TypeStrokeLen, NumRefs: 0},
	TypeSemanticLabel:    {Size: TypeSemanticLabelLen, NumRefs: 1},
	TypeSemanticDesc:     {Size: TypeSemanticDescLen, NumRefs: 1},
	TypeSemanticClass:    {Size: TypeSemanticClassLen, NumRefs: 0},
	TypeSemanticSelected: {Size: TypeSemanticSelectedLen, NumRefs: 0},
	TypeSemanticDisabled: {Size: TypeSemanticDisabledLen, NumRefs: 0},
	TypeSnippet:          {Size: TypeSnippetLen, NumRefs: 2},
	TypeSelection:        {Size: TypeSelectionLen, NumRefs: 1},
	TypeActionInput:      {Size: TypeActionInputLen, NumRefs: 0},
}

var opSize = [0x100]byte{}

func (t OpType) props() (size, numRefs int) {
	v := opProps[t]
	return int(v.Size), int(v.NumRefs)
}

func (t OpType) Size() int {
	return int(opSize[t])
}

var opNumRefs = [0x100]byte{
	TypeKeyFocus:       1,
	TypePointerInput:   1,
	TypeProfile:        1,
	TypeCall:           1,
	TypeClipboardRead:  1,
	TypeClipboardWrite: 1,
	TypeSemanticLabel:  1,
	TypeSemanticDesc:   1,
	TypeSelection:      1,

	TypeKeyInput: 2,
	TypeImage:    2,
	TypeSource:   2,
	TypeTarget:   2,
	TypeSnippet:  2,

	TypeOffer: 3,
	return int(opProps[t].Size)
}

func (t OpType) NumRefs() int {
	return int(opNumRefs[t])
	return int(opProps[t].NumRefs)
}

func (t OpType) String() string {
diff --git a/internal/ops/reader.go b/internal/ops/reader.go
index b7f0f006..fbfc7f70 100644
--- a/internal/ops/reader.go
+++ b/internal/ops/reader.go
@@ -54,9 +54,10 @@ type opMacroDef struct {
}

func (pc PC) Add(op OpType) PC {
	size, numRefs := op.props()
	return PC{
		data: pc.data + op.Size(),
		refs: pc.refs + op.NumRefs(),
		data: pc.data + size,
		refs: pc.refs + numRefs,
	}
}

@@ -104,8 +105,7 @@ func (r *Reader) Decode() (EncodedOp, bool) {
		}
		key := Key{ops: r.ops, pc: r.pc.data, version: r.ops.version}
		t := OpType(data[0])
		n := t.Size()
		nrefs := t.NumRefs()
		n, nrefs := t.props()
		data = data[:n]
		refs = refs[r.pc.refs:]
		refs = refs[:nrefs]
@@ -125,10 +125,10 @@ func (r *Reader) Decode() (EncodedOp, bool) {
			if deferring {
				deferring = false
				// Copy macro for deferred execution.
				if t.NumRefs() != 1 {
				if nrefs != 1 {
					panic("internal error: unexpected number of macro refs")
				}
				deferData := Write1(&r.deferOps, t.Size(), refs[0])
				deferData := Write1(&r.deferOps, n, refs[0])
				copy(deferData, data)
				r.pc.data += n
				r.pc.refs += nrefs
-- 
2.34.2
Thanks. Merged with a single nit.

Elias

On Sat Jul 2, 2022 at 2:49 PM CEST, ~egonelbre wrote:

[PATCH gio 5/5] internal/ops: optimize Decode Export this patch

From: Egon Elbre <egonelbre@gmail.com>

Using clean struct creation creates a lot of temporary variables in
assembly. Inline the assignments, which generates less code.

Signed-off-by: Egon Elbre <egonelbre@gmail.com>
---
 internal/ops/ops.go    | 21 ++++++---------------
 internal/ops/reader.go | 27 ++++++++-------------------
 2 files changed, 14 insertions(+), 34 deletions(-)

diff --git a/internal/ops/ops.go b/internal/ops/ops.go
index a998f1cf..edaac6e7 100644
--- a/internal/ops/ops.go
+++ b/internal/ops/ops.go
@@ -168,21 +168,12 @@ func (op *ClipOp) Decode(data []byte) {
	}
	data = data[:TypeClipLen]
	bo := binary.LittleEndian
	r := image.Rectangle{
		Min: image.Point{
			X: int(int32(bo.Uint32(data[1:]))),
			Y: int(int32(bo.Uint32(data[5:]))),
		},
		Max: image.Point{
			X: int(int32(bo.Uint32(data[9:]))),
			Y: int(int32(bo.Uint32(data[13:]))),
		},
	}
	*op = ClipOp{
		Bounds:  r,
		Outline: data[17] == 1,
		Shape:   Shape(data[18]),
	}
	op.Bounds.Min.X = int(int32(bo.Uint32(data[1:])))
	op.Bounds.Min.Y = int(int32(bo.Uint32(data[5:])))
	op.Bounds.Max.X = int(int32(bo.Uint32(data[9:])))
	op.Bounds.Max.Y = int(int32(bo.Uint32(data[13:])))
	op.Outline = data[17] == 1
	op.Shape = Shape(data[18])
}

func Reset(o *Ops) {
diff --git a/internal/ops/reader.go b/internal/ops/reader.go
index fbfc7f70..7e212dfb 100644
--- a/internal/ops/reader.go
+++ b/internal/ops/reader.go
@@ -165,14 +165,8 @@ func (op *opMacroDef) decode(data []byte) {
	}
	bo := binary.LittleEndian
	data = data[:TypeMacroLen]
	dataIdx := int(int32(bo.Uint32(data[1:])))
	refsIdx := int(int32(bo.Uint32(data[5:])))
	*op = opMacroDef{
		endpc: PC{
			data: dataIdx,
			refs: refsIdx,
		},
	}
	op.endpc.data = int(int32(bo.Uint32(data[1:])))
	op.endpc.refs = int(int32(bo.Uint32(data[5:])))
}

func (m *macroOp) decode(data []byte, refs []interface{}) {
@@ -181,15 +175,10 @@ func (m *macroOp) decode(data []byte, refs []interface{}) {
	}
	bo := binary.LittleEndian
	data = data[:TypeCallLen]
	*m = macroOp{
		ops: refs[0].(*Ops),
		start: PC{
			data: int(int32(bo.Uint32(data[1:]))),
			refs: int(int32(bo.Uint32(data[5:]))),
		},
		end: PC{
			data: int(int32(bo.Uint32(data[9:]))),
			refs: int(int32(bo.Uint32(data[13:]))),
		},
	}

	m.ops = refs[0].(*Ops)
	m.start.data = int(int32(bo.Uint32(data[1:])))
	m.start.refs = int(int32(bo.Uint32(data[5:])))
	m.end.data = int(int32(bo.Uint32(data[9:])))
	m.end.refs = int(int32(bo.Uint32(data[13:])))
}
-- 
2.34.2
gio/patches: FAILED in 3m21s

[Multiple optimizations][0] from [~egonelbre][1]

[0]: https://lists.sr.ht/~eliasnaur/gio-patches/patches/33500
[1]: mailto:egonelbre@gmail.com

✗ #792823 FAILED gio/patches/linux.yml   https://builds.sr.ht/~eliasnaur/job/792823
✗ #792821 FAILED gio/patches/apple.yml   https://builds.sr.ht/~eliasnaur/job/792821
✗ #792824 FAILED gio/patches/openbsd.yml https://builds.sr.ht/~eliasnaur/job/792824
✗ #792822 FAILED gio/patches/freebsd.yml https://builds.sr.ht/~eliasnaur/job/792822
Merged, although unfortunate. Is there a Go issue for tracking this kind of
inefficiency?

Elias