~emersion/public-inbox

[PATCH tlstunnel] Add config reloading

minus
Message-ID: <20201211102310.2790341-1-minus@mnus.de>
Patch: +115 -13
Instead of updating the configuration, we configure a new Server instance and
then migrate Listeners that still exist to it. Open client connections are
left completely untouched.

Closes https://todo.sr.ht/~emersion/tlstunnel/1
---
This basically works, but there are two TODOs that I'd like feedback on:
The listener migration is not concurrency-safe, but putting a lock
around it doesn't seem like a nice solution to me.
The other one is the shutdown procedure. It'd be nice if shutdown waited
for active connections to finish.

 cmd/tlstunnel/main.go | 37 +++++++++++++++---
 server.go             | 89 +++++++++++++++++++++++++++++++++++++++----
 tlstunnel.1.scd       |  2 +
 3 files changed, 115 insertions(+), 13 deletions(-)

diff --git a/cmd/tlstunnel/main.go b/cmd/tlstunnel/main.go
index f4ba7ef..1723f77 100644
--- a/cmd/tlstunnel/main.go
+++ b/cmd/tlstunnel/main.go
@@ -3,6 +3,9 @@ package main
 import (
 	"flag"
 	"log"
+	"os"
+	"os/signal"
+	"syscall"
 
 	"git.sr.ht/~emersion/go-scfg"
 	"git.sr.ht/~emersion/tlstunnel"
@@ -15,10 +18,7 @@ var (
 	certDataPath = ""
 )
 
-func main() {
-	flag.StringVar(&configPath, "config", configPath, "path to configuration file")
-	flag.Parse()
-
+func newServer() *tlstunnel.Server {
 	cfg, err := scfg.Load(configPath)
 	if err != nil {
 		log.Fatalf("failed to load config file: %v", err)
@@ -49,10 +49,37 @@ func main() {
 	if err := srv.Load(cfg); err != nil {
 		log.Fatal(err)
 	}
+	return srv
+}
+
+func main() {
+	flag.StringVar(&configPath, "config", configPath, "path to configuration file")
+	flag.Parse()
+
+	srv := newServer()
+
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)
 
 	if err := srv.Start(); err != nil {
 		log.Fatal(err)
 	}
 
-	select {}
+	for {
+		sig := <-sigCh
+		switch sig {
+		case syscall.SIGINT:
+		case syscall.SIGTERM:
+			srv.Stop()
+			return
+		case syscall.SIGHUP:
+			log.Print("caught SIGHUP, reloading config")
+			newSrv := newServer()
+			err := newSrv.TakeOver(srv)
+			if err != nil {
+				log.Printf("reload failed: %v", err)
+			}
+			srv = newSrv
+		}
+	}
 }
diff --git a/server.go b/server.go
index c9afe32..c663d92 100644
--- a/server.go
+++ b/server.go
@@ -24,6 +24,9 @@ type Server struct {
 
 	ACMEManager *certmagic.ACMEManager
 	ACMEConfig  *certmagic.Config
+
+	ctx    context.Context
+	cancel context.CancelFunc
 }
 
 func NewServer() *Server {
@@ -57,22 +60,83 @@ func (srv *Server) RegisterListener(addr string) *Listener {
 	return ln
 }
 
-func (srv *Server) Start() error {
+func (srv *Server) startACME() error {
+	srv.ctx, srv.cancel = context.WithCancel(context.Background())
+
 	for _, cert := range srv.UnmanagedCerts {
 		if err := srv.ACMEConfig.CacheUnmanagedTLSCertificate(cert, nil); err != nil {
 			return err
 		}
 	}
 
-	if err := srv.ACMEConfig.ManageAsync(context.Background(), srv.ManagedNames); err != nil {
+	if err := srv.ACMEConfig.ManageAsync(srv.ctx, srv.ManagedNames); err != nil {
 		return fmt.Errorf("failed to manage TLS certificates: %v", err)
 	}
 
+	return nil
+}
+
+func (srv *Server) Start() error {
+	if err := srv.startACME(); err != nil {
+		return err
+	}
+
+	for _, ln := range srv.Listeners {
+		if err := ln.Start(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (srv *Server) Stop() {
+	// Stop ACME
+	srv.cancel()
+	// TODO: clean cached unmanaged certs
 	for _, ln := range srv.Listeners {
+		ln.Stop()
+	}
+}
+
+// TakeOver starts the server but takes over existing listeners from an old
+// Server instance. The old instance keeps running unchanged if TakeOver
+// returns an error.
+func (srv *Server) TakeOver(old *Server) error {
+	// Try to start new listeners
+	for addr, ln := range srv.Listeners {
+		if _, ok := old.Listeners[addr]; ok {
+			continue
+		}
 		if err := ln.Start(); err != nil {
+			for _, ln2 := range srv.Listeners {
+				ln2.Stop()
+			}
 			return err
 		}
 	}
+
+	// Restart ACME
+	old.cancel()
+	if err := srv.startACME(); err != nil {
+		for _, ln2 := range srv.Listeners {
+			ln2.Stop()
+		}
+		return err
+	}
+	// TODO: clean cached unmanaged certs
+
+	// Take over existing listeners and terminate old ones
+	for addr, oldLn := range old.Listeners {
+		if ln, ok := srv.Listeners[addr]; ok {
+			// TODO: lock this swap
+			oldLn.Frontends = ln.Frontends
+			oldLn.Server = srv
+			srv.Listeners[addr] = oldLn
+		} else {
+			oldLn.Stop()
+		}
+	}
+
 	return nil
 }
 
@@ -80,6 +144,7 @@ type Listener struct {
 	Address   string
 	Server    *Server
 	Frontends map[string]*Frontend // indexed by server name
+	netLn     net.Listener
 }
 
 func newListener(srv *Server, addr string) *Listener {
@@ -99,14 +164,15 @@ func (ln *Listener) RegisterFrontend(name string, fe *Frontend) error {
 }
 
 func (ln *Listener) Start() error {
-	netLn, err := net.Listen("tcp", ln.Address)
+	var err error
+	ln.netLn, err = net.Listen("tcp", ln.Address)
 	if err != nil {
 		return err
 	}
 	log.Printf("listening on %q", ln.Address)
 
 	go func() {
-		if err := ln.serve(netLn); err != nil {
+		if err := ln.serve(); err != nil {
 			log.Fatalf("listener %q: %v", ln.Address, err)
 		}
 	}()
@@ -114,10 +180,17 @@ func (ln *Listener) Start() error {
 	return nil
 }
 
-func (ln *Listener) serve(netLn net.Listener) error {
+func (ln *Listener) Stop() {
+	ln.netLn.Close()
+	// TODO: wait for connections to have terminated?
+}
+
+func (ln *Listener) serve() error {
 	for {
-		conn, err := netLn.Accept()
-		if err != nil {
+		conn, err := ln.netLn.Accept()
+		if err != nil && strings.Contains(err.Error(), "use of closed network connection") {
+			return nil
+		} else if err != nil {
 			return fmt.Errorf("failed to accept connection: %v", err)
 		}
 
@@ -265,7 +338,7 @@ func authorityTLV(name string) proxyproto.TLV {
 
 func alpnTLV(proto string) proxyproto.TLV {
 	return proxyproto.TLV{
-		Type: proxyproto.PP2_TYPE_ALPN,
+		Type:  proxyproto.PP2_TYPE_ALPN,
 		Value: []byte(proto),
 	}
 }
diff --git a/tlstunnel.1.scd b/tlstunnel.1.scd
index 30ee269..b4c409a 100644
--- a/tlstunnel.1.scd
+++ b/tlstunnel.1.scd
@@ -27,6 +27,8 @@ The config file has one directive per line. Directives have a name, followed
 by parameters separated by space characters. Directives may have children in
 blocks delimited by "{" and "}". Lines beginning with "#" are comments.
 
+tlstunnel will reload the config file when it receives the HUP signal.
+
 Example:
 
 ```
-- 
2.29.2

Simon Ser
Message-ID: <N9zqKJxbreT4kSqdY0Ee0aWI_bXJe1LuXU0i1hhN-9p2WGRlUHPPwo7kK6P3V0h8_DvIaASCIMmTAjVVuEVSzDezlzJRO8nEUQFZh9TUO2I=@emersion.fr>
In-Reply-To: <20201211102310.2790341-1-minus@mnus.de>
> Instead of updating the configuration, we configure a new Server instance and
> then migrate Listeners that still exist to it. Open client connections are
> left completely untouched.

This approach looks good to me. Thanks for your patch, it sounds pretty
close to being mergeable!

> The listener migration is not concurrency-safe, but putting a lock
> around it doesn't seem like a nice solution to me.

Hm. It doesn't seem like using a channel would be a lot nicer. This problem
makes me think of [1], but not sure it'd be a lot nicer than a mutex.

[1]: https://golang.org/pkg/sync/atomic/#example_Value_config
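
For what it's worth, a minimal sketch of how atomic.Value could be used for
the swap, assuming it only needs to replace the routing table (field and
method names are illustrative, not from the patch):

    package tlstunnel

    import "sync/atomic"

    type Frontend struct{ /* ... */ }

    // Sketch only: keep the Frontends map in an atomic.Value so a reload
    // can swap it without taking a lock in the accept path.
    type Listener struct {
        Address   string
        frontends atomic.Value // always holds a map[string]*Frontend
    }

    // Frontends returns the current routing table.
    func (ln *Listener) Frontends() map[string]*Frontend {
        fes, _ := ln.frontends.Load().(map[string]*Frontend)
        return fes
    }

    // SetFrontends swaps in a new routing table, e.g. during TakeOver.
    // Readers observe either the old map or the new one, never a mix.
    func (ln *Listener) SetFrontends(fes map[string]*Frontend) {
        ln.frontends.Store(fes)
    }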

> The other one is the shutdown procedure. It'd be nice if shutdown waited
> for active connections to finish.

I can see where you're coming from, but I wonder if it really makes sense. For
some use-cases (e.g. HTTP, SMTP, maybe IMAP), you don't want to interrupt an
open connection, and connections aren't long-lived. For some other use-cases
(WebSockets, IMAP with IDLE, IRC) connections are long-lived and
clients/servers won't close them in a timely manner.

Is there a way to design shutdown so that both use-cases are accounted for?
Maybe wait for opened connections on the first SIGINT, and force-close on the
second one? Maybe have a timeout? Something else? In any case, I'm fine with
deferring this for now.
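
If we do pick it up later, I imagine something like a two-stage shutdown. A
rough sketch only: Shutdown and Close are hypothetical methods modelled on
net/http.Server's, tlstunnel doesn't have them today.

    // First SIGINT/SIGTERM: stop accepting and wait for open connections,
    // bounded by a timeout. A second signal gives up waiting.
    func waitForShutdown(srv *tlstunnel.Server) {
        sigCh := make(chan os.Signal, 1)
        signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)

        <-sigCh // first signal

        ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
        defer cancel()
        go func() {
            <-sigCh // second signal: stop waiting
            cancel()
        }()

        if err := srv.Shutdown(ctx); err != nil {
            srv.Close() // force-close whatever is still open
        }
    }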

>  cmd/tlstunnel/main.go | 37 +++++++++++++++---
>  server.go             | 89 +++++++++++++++++++++++++++++++++++++++----
>  tlstunnel.1.scd       |  2 +
>  3 files changed, 115 insertions(+), 13 deletions(-)
>
> diff --git a/cmd/tlstunnel/main.go b/cmd/tlstunnel/main.go
> index f4ba7ef..1723f77 100644
> --- a/cmd/tlstunnel/main.go
> +++ b/cmd/tlstunnel/main.go
> @@ -3,6 +3,9 @@ package main
>  import (
>  	"flag"
>  	"log"
> +	"os"
> +	"os/signal"
> +	"syscall"
>
>  	"git.sr.ht/~emersion/go-scfg"
>  	"git.sr.ht/~emersion/tlstunnel"
> @@ -15,10 +18,7 @@ var (
>  	certDataPath = ""
>  )
>
> -func main() {
> -	flag.StringVar(&configPath, "config", configPath, "path to configuration file")
> -	flag.Parse()
> -
> +func newServer() *tlstunnel.Server {
>  	cfg, err := scfg.Load(configPath)
>  	if err != nil {
>  		log.Fatalf("failed to load config file: %v", err)
> @@ -49,10 +49,37 @@ func main() {
>  	if err := srv.Load(cfg); err != nil {
>  		log.Fatal(err)
>  	}
> +	return srv
> +}
> +
> +func main() {
> +	flag.StringVar(&configPath, "config", configPath, "path to configuration file")
> +	flag.Parse()
> +
> +	srv := newServer()
> +
> +	sigCh := make(chan os.Signal, 1)
> +	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)
>
>  	if err := srv.Start(); err != nil {
>  		log.Fatal(err)
>  	}
>
> -	select {}
> +	for {
> +		sig := <-sigCh

This can be simplified to:

    for sig := range sigCh {

> +		switch sig {
> +		case syscall.SIGINT:
> +		case syscall.SIGTERM:
> +			srv.Stop()
> +			return
> +		case syscall.SIGHUP:
> +			log.Print("caught SIGHUP, reloading config")
> +			newSrv := newServer()

Maybe we can add a TODO here to make config file parsing failures non-fatal.
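
Something along these lines, perhaps (sketch only: it assumes newServer is
changed to return an error instead of calling log.Fatal, and elides the setup
code the patch doesn't show):

    func newServer() (*tlstunnel.Server, error) {
        cfg, err := scfg.Load(configPath)
        if err != nil {
            return nil, fmt.Errorf("failed to load config file: %v", err)
        }
        srv := tlstunnel.NewServer()
        // ... remaining setup as before ...
        if err := srv.Load(cfg); err != nil {
            return nil, err
        }
        return srv, nil
    }

and then in the SIGHUP case:

    case syscall.SIGHUP:
        log.Print("caught SIGHUP, reloading config")
        newSrv, err := newServer()
        if err != nil {
            log.Printf("reload failed: %v", err)
            continue
        }
        if err := newSrv.TakeOver(srv); err != nil {
            log.Printf("reload failed: %v", err)
            continue
        }
        srv = newSrv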

> +			err := newSrv.TakeOver(srv)
> +			if err != nil {
> +				log.Printf("reload failed: %v", err)
> +			}
> +			srv = newSrv
> +		}
> +	}
>  }
> diff --git a/server.go b/server.go
> index c9afe32..c663d92 100644
> --- a/server.go
> +++ b/server.go
> @@ -24,6 +24,9 @@ type Server struct {
>
>  	ACMEManager *certmagic.ACMEManager
>  	ACMEConfig  *certmagic.Config
> +
> +	ctx    context.Context
> +	cancel context.CancelFunc
>  }
>
>  func NewServer() *Server {
> @@ -57,22 +60,83 @@ func (srv *Server) RegisterListener(addr string) *Listener {
>  	return ln
>  }
>
> -func (srv *Server) Start() error {
> +func (srv *Server) startACME() error {
> +	srv.ctx, srv.cancel = context.WithCancel(context.Background())

Nit: we don't need to store srv.ctx in the Server struct, yet.

>  	for _, cert := range srv.UnmanagedCerts {
>  		if err := srv.ACMEConfig.CacheUnmanagedTLSCertificate(cert, nil); err != nil {
>  			return err
>  		}
>  	}
>
> -	if err := srv.ACMEConfig.ManageAsync(context.Background(), srv.ManagedNames); err != nil {
> +	if err := srv.ACMEConfig.ManageAsync(srv.ctx, srv.ManagedNames); err != nil {
>  		return fmt.Errorf("failed to manage TLS certificates: %v", err)
>  	}
>
> +	return nil
> +}
> +
> +func (srv *Server) Start() error {
> +	if err := srv.startACME(); err != nil {
> +		return err
> +	}
> +
> +	for _, ln := range srv.Listeners {
> +		if err := ln.Start(); err != nil {
> +			return err
> +		}
> +	}
> +	return nil
> +}
> +
> +func (srv *Server) Stop() {
> +	// Stop ACME
> +	srv.cancel()
> +	// TODO: clean cached unmanaged certs
>  	for _, ln := range srv.Listeners {
> +		ln.Stop()
> +	}
> +}
> +
> +// TakeOver starts the server but takes over existing listeners from an old
> +// Server instance. The old instance keeps running unchanged if TakeOver
> +// returns an error.
> +func (srv *Server) TakeOver(old *Server) error {
> +	// Try to start new listeners
> +	for addr, ln := range srv.Listeners {
> +		if _, ok := old.Listeners[addr]; ok {
> +			continue
> +		}
>  		if err := ln.Start(); err != nil {
> +			for _, ln2 := range srv.Listeners {
> +				ln2.Stop()
> +			}
>  			return err
>  		}
>  	}
> +
> +	// Restart ACME
> +	old.cancel()
> +	if err := srv.startACME(); err != nil {
> +		for _, ln2 := range srv.Listeners {
> +			ln2.Stop()
> +		}
> +		return err
> +	}
> +	// TODO: clean cached unmanaged certs
> +
> +	// Take over existing listeners and terminate old ones
> +	for addr, oldLn := range old.Listeners {
> +		if ln, ok := srv.Listeners[addr]; ok {
> +			// TODO: lock this swap
> +			oldLn.Frontends = ln.Frontends
> +			oldLn.Server = srv
> +			srv.Listeners[addr] = oldLn
> +		} else {
> +			oldLn.Stop()
> +		}
> +	}
> +
>  	return nil
>  }
>
> @@ -80,6 +144,7 @@ type Listener struct {
>  	Address   string
>  	Server    *Server
>  	Frontends map[string]*Frontend // indexed by server name
> +	netLn     net.Listener
>  }
>
>  func newListener(srv *Server, addr string) *Listener {
> @@ -99,14 +164,15 @@ func (ln *Listener) RegisterFrontend(name string, fe *Frontend) error {
>  }
>
>  func (ln *Listener) Start() error {
> -	netLn, err := net.Listen("tcp", ln.Address)
> +	var err error
> +	ln.netLn, err = net.Listen("tcp", ln.Address)
>  	if err != nil {
>  		return err
>  	}
>  	log.Printf("listening on %q", ln.Address)
>
>  	go func() {
> -		if err := ln.serve(netLn); err != nil {
> +		if err := ln.serve(); err != nil {
>  			log.Fatalf("listener %q: %v", ln.Address, err)
>  		}
>  	}()
> @@ -114,10 +180,17 @@ func (ln *Listener) Start() error {
>  	return nil
>  }
>
> -func (ln *Listener) serve(netLn net.Listener) error {
> +func (ln *Listener) Stop() {
> +	ln.netLn.Close()
> +	// TODO: wait for connections to have terminated?
> +}
> +
> +func (ln *Listener) serve() error {
>  	for {
> -		conn, err := netLn.Accept()
> -		if err != nil {
> +		conn, err := ln.netLn.Accept()
> +		if err != nil && strings.Contains(err.Error(), "use of closed network connection") {
> +			return nil

Hm. I wonder why we get this error?

minus
Message-ID: <4dd0ad70-8167-dcb4-da86-16f37f097c28@mnus.de>
In-Reply-To: <N9zqKJxbreT4kSqdY0Ee0aWI_bXJe1LuXU0i1hhN-9p2WGRlUHPPwo7kK6P3V0h8_DvIaASCIMmTAjVVuEVSzDezlzJRO8nEUQFZh9TUO2I=@emersion.fr>
On 14/12/2020 16.19, Simon Ser wrote:
>> The listener migration is not concurrency-safe, but putting a lock
>> around it doesn't seem like a nice solution to me.
> Hm. It doesn't seem like using a channel would be a lot nicer. This problem
> makes me think of [1], but not sure it'd be a lot nicer than a mutex.
>
> [1]: https://golang.org/pkg/sync/atomic/#example_Value_config
Oh, I thought atomic.Value used a mutex internally (it doesn't) so I 
didn't even look at it. I'll check those two options out.
>
>> The other one is the shutdown procedure. It'd be nice if shutdown waited
>> for active connections to finish.
> I can see where you're coming from, but I wonder if it really makes sense. For
> some use-cases (e.g. HTTP, SMTP, maybe IMAP), you don't want to interrupt an
> open connection, and connections aren't long-lived. For some other use-cases
> (WebSockets, IMAP with IDLE, IRC) connections are long-lived and
> clients/servers won't close them in a timely manner.
>
> Is there a way to design shutdown so that both use-cases are accounted for?
> Maybe wait for opened connections on the first SIGINT, and force-close on the
> second one? Maybe have a timeout? Something else? In any case, I'm fine with
> deferring this for now.
When you're stopping the daemon, new connections won't work anyway, so you
probably don't mind killing open connections either. And for restarting,
waiting would actually be counterproductive, since you want the new instance
to start up as fast as possible. Probably best to leave it as it is, then.
> Maybe we can add a TODO here to make config file parsing failures non-fatal.
>
Oh wow, I didn't test that well enough. It was actually my intention to do 
that; I've already done so locally and will send it with v2.
>
>> +		conn, err := ln.netLn.Accept()
>> +		if err != nil && strings.Contains(err.Error(), "use of closed network connection") {
>> +			return nil
> Hm. I wonder why we get this error?

That error occurs after netLn.Close() has been called. There 
unfortunately is no other way to interrupt Accept() and it does not 
return a properly typed error. Will add a comment.

Simon Ser
Message-ID: <PSdY77-T6d1pWFupOdiUU0-b4jY38plxRfOMnqbi9LA6mx8oIJwJfxGJrhP69TxPYVd2HawsT6Ytx4TQu6MLrmeGIoReKxNnk9mjOEWAA3k=@emersion.fr>
In-Reply-To: <4dd0ad70-8167-dcb4-da86-16f37f097c28@mnus.de>
On Monday, December 14th, 2020 at 5:01 PM, minus <minus@mnus.de> wrote:

> >> +		conn, err := ln.netLn.Accept()
> >> +		if err != nil && strings.Contains(err.Error(), "use of closed network connection") {
> >> +			return nil
> > Hm. I wonder why we get this error?
>
> That error occurs after netLn.Close() has been called. There
> unfortunately is no other way to interrupt Accept() and it does not
> return a properly typed error. Will add a comment.

Maybe we could check if the server context has been closed here, and ignore the
error if that's the case?

    select {
    case <-srv.ctx.Done():
        // Ignore error
    default:
        // Report error
    }
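
That is, roughly (a sketch, assuming the serve loop can reach a non-nil
server context via ln.Server):

    conn, err := ln.netLn.Accept()
    if err != nil {
        select {
        case <-ln.Server.ctx.Done():
            // The listener was closed on purpose (Stop/TakeOver).
            return nil
        default:
            return fmt.Errorf("failed to accept connection: %v", err)
        }
    }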

Simon Ser
Message-ID: <UdXTS3hTDTIIrpxzvfJQVfAiCHn5XknsgtZJs0z29ID-6Mt6GkqCpiWGS8caykKOZ3mIj6VQqAwWtMaYG1wTYeZv3abGLb1lazKfreqcEOs=@emersion.fr>
In-Reply-To: <PSdY77-T6d1pWFupOdiUU0-b4jY38plxRfOMnqbi9LA6mx8oIJwJfxGJrhP69TxPYVd2HawsT6Ytx4TQu6MLrmeGIoReKxNnk9mjOEWAA3k=@emersion.fr>
On Monday, December 14th, 2020 at 5:21 PM, Simon Ser <contact@emersion.fr> wrote:

> On Monday, December 14th, 2020 at 5:01 PM, minus <minus@mnus.de> wrote:
>
> > >> +		conn, err := ln.netLn.Accept()
> > >> +		if err != nil && strings.Contains(err.Error(), "use of closed network connection") {
> > >> +			return nil
> > > Hm. I wonder why we get this error?
> >
> > That error occurs after netLn.Close() has been called. There
> > unfortunately is no other way to interrupt Accept() and it does not
> > return a properly typed error. Will add a comment.
>
> Maybe we could check if the server context has been closed here, and ignore the
> error if that's the case?
>
>     select {
>     case <-srv.ctx.Done():
>         // Ignore error
>     default:
>         // Report error
>     }

Go 1.16 will have net.ErrClosed for this purpose, so we'll be able to replace
that with a proper check.
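
For reference, the accept loop could then become something like this
(handleConn stands in for whatever the loop already does with the
connection):

    for {
        conn, err := ln.netLn.Accept()
        if errors.Is(err, net.ErrClosed) {
            // The listener was closed by Stop(); not an error.
            return nil
        } else if err != nil {
            return fmt.Errorf("failed to accept connection: %v", err)
        }
        go ln.handleConn(conn)
    }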