Files
Benson Wong 9be9a87fa0
Linux CI / run-tests (push) Failing after 14m58s
Windows CI / run-tests (push) Has been cancelled
internal/process: improve windows shutdown behaviour (#808)
Add Windows specific shutdown code paths so stopping of child processes
is more reliable:

- stopping llama-swap won't leave behind any child processes it created
- uses Job Objects on Windows so the whole llama-swap process tree is closed by
the OS
- add procCtx to baseRouter. It replaces shutdownCtx as a signal for
managing lifetime state.
- shutdownCtx is only used by the router to stop handling new requests
during shutdown
- improve debug logging to make it easier to trace source of issues

Fixes #804
Updates #807
2026-06-01 00:45:30 -07:00

329 lines
9.1 KiB
Go

package main
import (
"context"
"errors"
"flag"
"fmt"
"log/slog"
"net/http"
"os"
"os/signal"
"path/filepath"
"strings"
"sync"
"syscall"
"time"
"github.com/mostlygeek/llama-swap/internal/config"
"github.com/mostlygeek/llama-swap/internal/event"
"github.com/mostlygeek/llama-swap/internal/logmon"
"github.com/mostlygeek/llama-swap/internal/perf"
"github.com/mostlygeek/llama-swap/internal/process"
"github.com/mostlygeek/llama-swap/internal/server"
"github.com/mostlygeek/llama-swap/internal/shared"
"github.com/mostlygeek/llama-swap/internal/watcher"
)
// Build metadata. main prints these for -version and hands them to the server
// as server.BuildInfo. The literals below are placeholder defaults.
// NOTE(review): presumably overridden at build time (e.g. -ldflags "-X ...") —
// confirm against the build scripts.
var (
version = "0"
commit = "abcd1234"
date = "unknown"
)
// shutdownTimeout is the graceful-shutdown budget used by main. It is passed to
// the old server's Shutdown during a config reload, and on SIGINT/SIGTERM it is
// the single deadline shared by the HTTP server and router shutdown phases. The
// forced-exit backstop fires 5 seconds after this budget has elapsed.
const shutdownTimeout = 30 * time.Second
// logTimeFormats translates a cfg.LogTimeFormat name into the matching layout
// constant from the time package. Looking up a name that is not listed here
// (including the empty string) returns "", which means no timestamp prefix.
var logTimeFormats = map[string]string{
	// Classic date-and-time layouts.
	"ansic":    time.ANSIC,
	"unixdate": time.UnixDate,
	"rubydate": time.RubyDate,

	// RFC-defined layouts.
	"rfc822":      time.RFC822,
	"rfc822z":     time.RFC822Z,
	"rfc850":      time.RFC850,
	"rfc1123":     time.RFC1123,
	"rfc1123z":    time.RFC1123Z,
	"rfc3339":     time.RFC3339,
	"rfc3339nano": time.RFC3339Nano,

	// Clock-only and stamp layouts.
	"kitchen":    time.Kitchen,
	"stamp":      time.Stamp,
	"stampmilli": time.StampMilli,
	"stampmicro": time.StampMicro,
	"stampnano":  time.StampNano,
}
// main is the llama-swap entry point. It parses command-line flags, loads the
// config, wires up the loggers and the optional performance monitor, and serves
// HTTP (or HTTPS) through a handler that always dispatches to the currently
// active server so the config can be hot-reloaded. SIGHUP and the optional
// config watcher trigger a reload; SIGINT/SIGTERM run a graceful shutdown whose
// two phases share a single shutdownTimeout deadline.
func main() {
flagConfig := flag.String("config", "", "path to config file (required)")
flagListen := flag.String("listen", "", "listen address (default :8080 or :8443 for TLS)")
flagCertFile := flag.String("tls-cert-file", "", "TLS certificate file")
flagKeyFile := flag.String("tls-key-file", "", "TLS key file")
flagVersion := flag.Bool("version", false, "show version and exit")
flagWatchConfig := flag.Bool("watch-config", false, "reload config on file change")
flag.Parse()
// -version is handled before any validation, so it works without -config.
if *flagVersion {
fmt.Printf("version: %s (%s), built at %s\n", version, commit, date)
os.Exit(0)
}
if *flagConfig == "" {
slog.Error("-config is required")
os.Exit(1)
}
// Either TLS flag switches TLS on, but supplying only one of the pair is a
// usage error, so past this check useTLS implies both files were given.
useTLS := *flagCertFile != "" || *flagKeyFile != ""
if (*flagCertFile != "" && *flagKeyFile == "") || (*flagCertFile == "" && *flagKeyFile != "") {
slog.Error("both -tls-cert-file and -tls-key-file must be provided for TLS")
os.Exit(1)
}
// An explicit -listen wins; otherwise pick the default port for the scheme.
listenAddr := *flagListen
if listenAddr == "" {
if useTLS {
listenAddr = ":8443"
} else {
listenAddr = ":8080"
}
}
configPath := *flagConfig
cfg, err := config.LoadConfig(configPath)
if err != nil {
slog.Error("failed to load config", "path", configPath, "error", err)
os.Exit(1)
}
// Loggers are wired per cfg.LogToStdout: proxy/upstream feed muxLog, which
// owns the combined history served by /logs. They outlive config reloads,
// so a LogToStdout change requires a restart to take effect.
muxLog, proxyLog, upstreamLog := server.NewLoggers(cfg.LogToStdout)
if len(cfg.Profiles) > 0 {
proxyLog.Warn("Profile functionality has been removed in favor of Groups. See the README for more information.")
}
// applyLogSettings pushes the config's log level and timestamp layout onto
// the proxy and upstream loggers. The level defaults to info unless the
// trimmed, lower-cased value is "debug", "warn" or "error"; a time format
// name missing from logTimeFormats resolves to "". It runs at startup and
// again after every successful reload.
applyLogSettings := func(cfg config.Config) {
level := logmon.LevelInfo
switch strings.ToLower(strings.TrimSpace(cfg.LogLevel)) {
case "debug":
level = logmon.LevelDebug
case "warn":
level = logmon.LevelWarn
case "error":
level = logmon.LevelError
}
timeFormat := logTimeFormats[strings.ToLower(strings.TrimSpace(cfg.LogTimeFormat))]
for _, lg := range []*logmon.Monitor{proxyLog, upstreamLog} {
lg.SetLogLevel(level)
lg.SetLogTimeFormat(timeFormat)
}
}
applyLogSettings(cfg)
proxyLog.Debugf("PID: %d", os.Getpid())
// On Windows, bind the process tree to a Job Object so every upstream
// process is reaped when llama-swap exits — even on a forced kill. No-op
// elsewhere. Non-fatal: a failure just falls back to per-process teardown.
if err := process.SetupTreeCleanup(); err != nil {
proxyLog.Warnf("failed to set up process tree cleanup: %v", err)
}
// perfMon outlives config reloads; its config is updated in place.
// It stays nil when monitoring is disabled in the initial config, which is
// why later uses are nil-checked; a reload never creates it.
var perfMon *perf.Monitor
if !cfg.Performance.Disabled {
perfMon, err = perf.New(cfg.Performance, proxyLog)
if err != nil {
slog.Error("failed to create performance monitor", "error", err)
os.Exit(1)
}
perfMon.Start()
} else {
proxyLog.Info("performance monitoring is disabled")
}
buildInfo := server.BuildInfo{Version: version, Commit: commit, Date: date}
initialSrv, err := server.New(cfg, muxLog, proxyLog, upstreamLog, perfMon, buildInfo)
if err != nil {
slog.Error("failed to create server", "error", err)
os.Exit(1)
}
// activeSrv is swapped atomically during hot reload.
// activeMu guards every read and write of activeSrv.
var activeMu sync.RWMutex
activeSrv := initialSrv
// The handler looks up activeSrv under the read lock on every request and
// releases the lock before serving, so a reload only has to swap the
// pointer and a request keeps using the server it picked up.
httpServer := &http.Server{
Addr: listenAddr,
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
activeMu.RLock()
srv := activeSrv
activeMu.RUnlock()
srv.ServeHTTP(w, r)
}),
}
// reload guards against overlapping reloads triggered by concurrent signals
// or file-watcher callbacks.
// A reload requested while another is still running is dropped, not queued.
var reloading bool
var reloadMu sync.Mutex
reload := func() {
reloadMu.Lock()
if reloading {
reloadMu.Unlock()
return
}
reloading = true
reloadMu.Unlock()
// Clear the in-progress flag on every exit path, including early returns.
defer func() {
reloadMu.Lock()
reloading = false
reloadMu.Unlock()
}()
proxyLog.Info("reloading configuration")
// A config that fails to load leaves the current server in place.
newCfg, err := config.LoadConfig(configPath)
if err != nil {
proxyLog.Warnf("failed to reload config: %v", err)
return
}
if len(newCfg.Profiles) > 0 {
proxyLog.Warn("Profile functionality has been removed in favor of Groups. See the README for more information.")
}
// The perf monitor config is updated before the new server is built, so the
// update sticks even if server.New fails below.
if perfMon != nil {
perfMon.UpdateConfig(newCfg.Performance)
}
newSrv, err := server.New(newCfg, muxLog, proxyLog, upstreamLog, perfMon, buildInfo)
if err != nil {
proxyLog.Warnf("failed to build new server during reload: %v", err)
return
}
// Swap first so new requests are routed to newSrv, then shut the old
// server down outside the lock.
activeMu.Lock()
old := activeSrv
activeSrv = newSrv
activeMu.Unlock()
applyLogSettings(newCfg)
if err := old.Shutdown(shutdownTimeout); err != nil {
proxyLog.Warnf("error shutting down old server during reload: %v", err)
}
// Notify UI after a short delay so it can refresh model state.
time.AfterFunc(3*time.Second, func() {
event.Emit(shared.ConfigFileChangedEvent{State: shared.ReloadingStateEnd})
})
proxyLog.Info("configuration reloaded")
}
// watcherCtx controls the config watcher's lifetime: it is cancelled when a
// shutdown signal arrives and, via the defer, when main returns.
watcherCtx, watcherCancel := context.WithCancel(context.Background())
defer watcherCancel()
if *flagWatchConfig {
absConfigPath, err := filepath.Abs(configPath)
if err != nil {
slog.Error("watch-config: failed to resolve config path", "error", err)
os.Exit(1)
}
// NOTE(review): the message hard-codes "2s" while the watcher uses
// configwatcher.DefaultInterval — confirm the two agree.
proxyLog.Info("watching configuration for changes (poll-based, 2s interval)")
// NOTE(review): the identifier is configwatcher but the import path ends in
// internal/watcher; presumably that package declares itself as
// configwatcher — confirm.
go func() {
(&configwatcher.Watcher{
Path: absConfigPath,
Interval: configwatcher.DefaultInterval,
OnChange: reload,
}).Run(watcherCtx)
}()
}
// signal.Notify does not block when sending, so the channel is buffered to
// avoid losing a signal that arrives while the handler goroutine is busy.
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)
// Serve in the background. http.ErrServerClosed is what a server returns
// after Shutdown, so it is not an error here; anything else is fatal.
go func() {
var startErr error
if useTLS {
proxyLog.Infof("llama-swap listening with TLS on https://%s", listenAddr)
startErr = httpServer.ListenAndServeTLS(*flagCertFile, *flagKeyFile)
} else {
proxyLog.Infof("llama-swap listening on http://%s", listenAddr)
startErr = httpServer.ListenAndServe()
}
if startErr != nil && !errors.Is(startErr, http.ErrServerClosed) {
slog.Error("http server error", "error", startErr)
os.Exit(1)
}
}()
// exitChan is closed by the signal goroutine once shutdown has finished;
// main blocks on it below.
exitChan := make(chan struct{})
go func() {
for {
sig := <-sigChan
switch sig {
case syscall.SIGHUP:
proxyLog.Info("received SIGHUP, reloading config")
// Reload in its own goroutine so this loop keeps receiving signals.
go reload()
case syscall.SIGINT, syscall.SIGTERM:
// Shutdown runs inline: the loop stops reading sigChan and this
// goroutine returns at the end of the case.
proxyLog.Infof("received signal %v, shutting down", sig)
watcherCancel()
// Backstop against a stalled shutdown: force the process to
// exit once the whole graceful sequence has had its full budget.
// On Windows the Job Object reaps upstream processes on exit, so
// a forced exit still cleans up rather than orphaning children.
// NOTE(review): the warning prints shutdownTimeout although this
// goroutine actually waits shutdownTimeout + 5s.
go func() {
time.Sleep(shutdownTimeout + 5*time.Second)
proxyLog.Warnf("graceful shutdown exceeded %v, forcing exit", shutdownTimeout)
os.Exit(1)
}()
// Snapshot the active server once; the rest of the sequence uses it.
activeMu.RLock()
srv := activeSrv
activeMu.RUnlock()
// Close long-lived SSE streams first so httpServer.Shutdown can
// drain without blocking on them for the full timeout.
srv.CloseStreams()
// Both phases share a single deadline so total shutdown is
// bounded by shutdownTimeout rather than 2x it.
deadline := time.Now().Add(shutdownTimeout)
shutdownCtx, cancel := context.WithDeadline(context.Background(), deadline)
// Runs when this goroutine returns at the end of the case.
defer cancel()
if err := httpServer.Shutdown(shutdownCtx); err != nil {
proxyLog.Warnf("http server shutdown error: %v", err)
}
// Clamp the remaining budget to a small positive value: a
// non-positive timeout makes the router fall back to its own
// healthCheckTimeout, which would defeat the shared deadline.
remaining := time.Until(deadline)
if remaining <= 0 {
remaining = time.Millisecond
}
if err := srv.Shutdown(remaining); err != nil {
proxyLog.Warnf("router shutdown error: %v", err)
}
if perfMon != nil {
perfMon.Stop()
}
// Release main, which is blocked on exitChan.
close(exitChan)
return
}
}
}()
// Block until the signal goroutine has completed the shutdown sequence.
<-exitChan
proxyLog.Info("shutdown complete")
}