mirror of
https://github.com/mostlygeek/llama-swap.git
synced 2026-06-09 06:46:34 +02:00
02e015fa49
This is a huge backend change that essentially started with rewriting the concurrency handling for processes and blew up into a refactor of the entire application. In short, these are the improvements: **Better state and life cycle management:** Life cycle management of processes has always been the trickiest part of the code. Juggling mutex locks between multiple locations to reduce race conditions was complex. Too complex for my feeble brain to build a simple mental model around as llama-swap gained more features. All of that has been refactored. Most of the locks are gone, replaced with a single run() that owns all state changes. There is now one place to start from to understand and extend routing logic. The improved life cycle management makes it easier to implement more complex swap optimization strategies in the future like #727. **Collation of requests:** llama-swap previously handled requests and swapping in the order they came in. For example, requests for models in this order ABCABC would result in 5 swaps. Now those requests are handled in this order AABBCC. The result is less time waiting for swaps under a high-churn request queue. This fixes #588 #612. A possible future enhancement is to support a starvation parameter so a swap can be forced when models have been waiting too long. **Shared base implementation for groups and swap matrix:** During the refactor it became clear that much of the swapping logic was shared between these two implementations. That is not surprising considering the swap matrix was added many moons after groups. Now they share a common base and their specific swap strategies are implemented behind the swapPlanner interface. Requests for bespoke or specific swapping scenarios are a common theme in the issues. Now users can implement whatever bespoke and weird swapping strategy they want in their own fork. Just ask your agent of choice to implement swapPlanner.
I'll still remain more conservative about what actually lands in core llama-swap and will continue to evaluate PRs on whether the changes are good for everyone or just for one specific use case. **AI / Agentic Disclosure:** I paid very close attention to the low level swap concurrency design and implementation. It's important to keep that essential part reliable, boring and free of surprises. Backwards compatibility was also maintained, even the one-way non-exclusive group model loading behaviour that people have rightly pointed out to be a weird design decision. With the underlying swap core done, the web server, API and UI sitting on top were largely ported over with Claude Code and Opus 4.7 in multiple phases. If you're curious, I kept the changes in docs/newrouter-todo.md. I did several passes to make sure things weren't left behind. However, even frontier LLMs at the time of this PR still make small decisions that don't make a lot of sense. They get shit wrong all the time, just in small, subtle ways. That said, there are likely to be some new bugs introduced with this massive refactor. I'm fairly confident that there are no major architectural flaws that would cause goal seeking agents to make dumb, ugly code decisions. For a little while the legacy llama-swap will be available under cmd/legacy/llama-swap. The plan is to eventually delete that entry point as well as the proxy package. On a bit of a personal note, this PR is exciting and a bit sad for me. I hand wrote much of the original code and this PR ultimately replaces much of it. While the old code served as a good reference for the agent to implement the new stuff, it is still a bit sad to eventually delete it all.
331 lines
9.4 KiB
Go
331 lines
9.4 KiB
Go
package proxy
|
|
|
|
import (
|
|
"fmt"
|
|
"net/http"
|
|
"slices"
|
|
"sort"
|
|
"sync"
|
|
|
|
"github.com/mostlygeek/llama-swap/internal/config"
|
|
"github.com/mostlygeek/llama-swap/internal/logmon"
|
|
)
|
|
|
|
// MatrixSolver contains pure swap-decision logic with no Process dependencies.
|
|
// It is safe for concurrent reads after construction.
|
|
type MatrixSolver struct {
|
|
expandedSets []config.ExpandedSet // all valid model combinations
|
|
evictCosts map[string]int // real model name -> eviction cost (default 1)
|
|
modelToSets map[string][]int // model name -> indices into expandedSets
|
|
}
|
|
|
|
// NewMatrixSolver builds a solver from expanded sets and eviction costs.
|
|
func NewMatrixSolver(expandedSets []config.ExpandedSet, evictCosts map[string]int) *MatrixSolver {
|
|
modelToSets := make(map[string][]int)
|
|
for i, es := range expandedSets {
|
|
for _, model := range es.Models {
|
|
modelToSets[model] = append(modelToSets[model], i)
|
|
}
|
|
}
|
|
|
|
return &MatrixSolver{
|
|
expandedSets: expandedSets,
|
|
evictCosts: evictCosts,
|
|
modelToSets: modelToSets,
|
|
}
|
|
}
|
|
|
|
// SolveResult is the outcome of a MatrixSolver decision.
type SolveResult struct {
	// Evict lists the running models that must be stopped.
	Evict []string

	// TargetSet is the chosen set of models (informational).
	TargetSet []string

	// SetName is the name of the chosen set.
	SetName string

	// DSL is the original DSL expression of the chosen set.
	DSL string

	// TotalCost is the total eviction cost of the decision.
	TotalCost int
}
|
|
|
|
// Solve determines which models to evict when a model is requested.
|
|
//
|
|
// Algorithm:
|
|
// 1. If requestedModel is already running, no eviction needed.
|
|
// 2. Find all sets containing requestedModel.
|
|
// 3. If no sets found, the model runs alone; evict all running models.
|
|
// 4. For each candidate set, compute cost = sum of evict_costs for running
|
|
// models NOT in that set.
|
|
// 5. Pick lowest cost. Ties broken by definition order (index in expandedSets).
|
|
// 6. Return models to evict and the chosen set.
|
|
func (s *MatrixSolver) Solve(requestedModel string, runningModels []string) (SolveResult, error) {
|
|
// If already running, nothing to do (but fill in set info for logging)
|
|
if slices.Contains(runningModels, requestedModel) {
|
|
setName, dsl := s.findMatchingSet(requestedModel, runningModels)
|
|
return SolveResult{
|
|
TargetSet: runningModels,
|
|
SetName: setName,
|
|
DSL: dsl,
|
|
}, nil
|
|
}
|
|
|
|
candidateIndices := s.modelToSets[requestedModel]
|
|
|
|
// Model not in any set: runs alone, evict everything
|
|
if len(candidateIndices) == 0 {
|
|
evict := make([]string, len(runningModels))
|
|
copy(evict, runningModels)
|
|
return SolveResult{
|
|
Evict: evict,
|
|
TargetSet: []string{requestedModel},
|
|
}, nil
|
|
}
|
|
|
|
// Find the cheapest candidate set
|
|
bestCost := -1
|
|
bestIdx := -1
|
|
|
|
for _, idx := range candidateIndices {
|
|
setModels := s.expandedSets[idx].Models
|
|
cost := 0
|
|
for _, running := range runningModels {
|
|
if !slices.Contains(setModels, running) {
|
|
cost += s.evictCost(running)
|
|
}
|
|
}
|
|
|
|
if bestCost < 0 || cost < bestCost || (cost == bestCost && idx < bestIdx) {
|
|
bestCost = cost
|
|
bestIdx = idx
|
|
}
|
|
}
|
|
|
|
// Determine which running models to evict
|
|
chosen := s.expandedSets[bestIdx]
|
|
var evict []string
|
|
for _, running := range runningModels {
|
|
if !slices.Contains(chosen.Models, running) {
|
|
evict = append(evict, running)
|
|
}
|
|
}
|
|
|
|
return SolveResult{
|
|
Evict: evict,
|
|
TargetSet: chosen.Models,
|
|
SetName: chosen.SetName,
|
|
DSL: chosen.DSL,
|
|
TotalCost: bestCost,
|
|
}, nil
|
|
}
|
|
|
|
// findMatchingSet finds the expanded set that contains all running models.
|
|
// Returns the set name and DSL, or empty strings if no match.
|
|
func (s *MatrixSolver) findMatchingSet(requestedModel string, runningModels []string) (string, string) {
|
|
for _, idx := range s.modelToSets[requestedModel] {
|
|
set := s.expandedSets[idx]
|
|
allInSet := true
|
|
for _, m := range runningModels {
|
|
if !slices.Contains(set.Models, m) {
|
|
allInSet = false
|
|
break
|
|
}
|
|
}
|
|
if allInSet {
|
|
return set.SetName, set.DSL
|
|
}
|
|
}
|
|
return "", ""
|
|
}
|
|
|
|
func (s *MatrixSolver) evictCost(model string) int {
|
|
if cost, ok := s.evictCosts[model]; ok {
|
|
return cost
|
|
}
|
|
return 1
|
|
}
|
|
|
|
// Matrix manages processes using solver-based swap logic.
|
|
type Matrix struct {
|
|
sync.Mutex
|
|
solver *MatrixSolver
|
|
processes map[string]*Process // all processes keyed by real model name
|
|
config config.Config
|
|
proxyLogger *logmon.Monitor
|
|
upstreamLogger *logmon.Monitor
|
|
|
|
// inflight tracks ProxyRequest calls that have released m.Lock but may
|
|
// not yet have incremented Process.inFlightRequests. A concurrent
|
|
// request that needs to evict models waits for inflight to drain under
|
|
// m.Lock before stopping anything. Without this, a request that
|
|
// released m.Lock but has not yet reached Process.inFlightRequests.Add(1)
|
|
// races with Stop()'s Wait() and can be killed mid-request.
|
|
inflight sync.WaitGroup
|
|
|
|
// testDelayFastPath is a test-only hook invoked in the no-eviction path
|
|
// after m.Lock is released but before the request is dispatched to
|
|
// Process.ProxyRequest. Tests use it to park a request at the exact
|
|
// race window to deterministically reproduce the race.
|
|
testDelayFastPath func()
|
|
}
|
|
|
|
// NewMatrix creates a Matrix from config. It creates a Process for every
|
|
// model defined in the config (any model can run alone even if not in a set).
|
|
func NewMatrix(cfg config.Config, proxyLogger, upstreamLogger *logmon.Monitor) *Matrix {
|
|
processes := make(map[string]*Process)
|
|
for modelID, modelConfig := range cfg.Models {
|
|
processLogger := logmon.NewWriter(upstreamLogger)
|
|
process := NewProcess(modelID, cfg.HealthCheckTimeout, modelConfig, processLogger, proxyLogger)
|
|
processes[modelID] = process
|
|
}
|
|
|
|
evictCosts := cfg.Matrix.ResolvedEvictCosts()
|
|
|
|
return &Matrix{
|
|
solver: NewMatrixSolver(cfg.ExpandedSets, evictCosts),
|
|
processes: processes,
|
|
config: cfg,
|
|
proxyLogger: proxyLogger,
|
|
upstreamLogger: upstreamLogger,
|
|
}
|
|
}
|
|
|
|
// ProxyRequest handles the swap logic and proxies the request to the model.
|
|
func (m *Matrix) ProxyRequest(modelID string, w http.ResponseWriter, r *http.Request) error {
|
|
process, ok := m.processes[modelID]
|
|
if !ok {
|
|
return fmt.Errorf("model %s not found in matrix", modelID)
|
|
}
|
|
|
|
m.Lock()
|
|
running := m.runningModels()
|
|
result, err := m.solver.Solve(modelID, running)
|
|
if err != nil {
|
|
m.Unlock()
|
|
return fmt.Errorf("matrix solver error: %w", err)
|
|
}
|
|
|
|
// Log solver decision
|
|
if len(result.Evict) > 0 {
|
|
m.proxyLogger.Infof("Matrix: model=%s set=%s dsl=%q evict=%v target=%v cost=%d",
|
|
modelID, result.SetName, result.DSL, result.Evict, result.TargetSet, result.TotalCost)
|
|
} else if len(running) == 0 {
|
|
m.proxyLogger.Infof("Matrix: model=%s starting (no models running)", modelID)
|
|
} else {
|
|
m.proxyLogger.Debugf("Matrix: model=%s already running in set=%s dsl=%q", modelID, result.SetName, result.DSL)
|
|
}
|
|
|
|
// Evict models that need to be stopped
|
|
if len(result.Evict) > 0 {
|
|
// Wait for any in-flight ProxyRequest calls to register on their
|
|
// Process before stopping anything. Without this, a request that
|
|
// released m.Lock but has not yet incremented
|
|
// Process.inFlightRequests races with Stop() and can be killed
|
|
// mid-request.
|
|
m.inflight.Wait()
|
|
|
|
var wg sync.WaitGroup
|
|
for _, evictModel := range result.Evict {
|
|
if p, exists := m.processes[evictModel]; exists {
|
|
wg.Add(1)
|
|
go func(p *Process) {
|
|
defer wg.Done()
|
|
p.Stop()
|
|
}(p)
|
|
}
|
|
}
|
|
wg.Wait()
|
|
}
|
|
|
|
// Register this request in inflight before releasing m.Lock so a
|
|
// concurrent eviction will wait for it to complete.
|
|
m.inflight.Add(1)
|
|
defer m.inflight.Done()
|
|
isFastPath := len(result.Evict) == 0
|
|
m.Unlock()
|
|
|
|
if isFastPath && m.testDelayFastPath != nil {
|
|
m.testDelayFastPath()
|
|
}
|
|
|
|
// Proxy the request (Process handles on-demand start)
|
|
process.ProxyRequest(w, r)
|
|
return nil
|
|
}
|
|
|
|
// StopProcesses stops all running processes.
|
|
func (m *Matrix) StopProcesses(strategy StopStrategy) {
|
|
m.Lock()
|
|
defer m.Unlock()
|
|
|
|
var wg sync.WaitGroup
|
|
for _, process := range m.processes {
|
|
wg.Add(1)
|
|
go func(p *Process) {
|
|
defer wg.Done()
|
|
switch strategy {
|
|
case StopImmediately:
|
|
p.StopImmediately()
|
|
default:
|
|
p.Stop()
|
|
}
|
|
}(process)
|
|
}
|
|
wg.Wait()
|
|
}
|
|
|
|
// StopProcess stops a single process by model ID.
|
|
func (m *Matrix) StopProcess(modelID string, strategy StopStrategy) error {
|
|
process, ok := m.processes[modelID]
|
|
if !ok {
|
|
return fmt.Errorf("process not found for %s", modelID)
|
|
}
|
|
|
|
switch strategy {
|
|
case StopImmediately:
|
|
process.StopImmediately()
|
|
default:
|
|
process.Stop()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Shutdown shuts down all processes.
|
|
func (m *Matrix) Shutdown() {
|
|
var wg sync.WaitGroup
|
|
for _, process := range m.processes {
|
|
wg.Add(1)
|
|
go func(p *Process) {
|
|
defer wg.Done()
|
|
p.Shutdown()
|
|
}(process)
|
|
}
|
|
wg.Wait()
|
|
}
|
|
|
|
// RunningModels returns model names currently in an active (non-stopped) state.
|
|
func (m *Matrix) RunningModels() []string {
|
|
m.Lock()
|
|
defer m.Unlock()
|
|
return m.runningModels()
|
|
}
|
|
|
|
// runningModels returns running model names (caller must hold lock).
|
|
func (m *Matrix) runningModels() []string {
|
|
var running []string
|
|
for id, process := range m.processes {
|
|
if process.CurrentState() != StateStopped && process.CurrentState() != StateShutdown {
|
|
running = append(running, id)
|
|
}
|
|
}
|
|
sort.Strings(running)
|
|
return running
|
|
}
|
|
|
|
// GetProcess returns the Process for a model.
|
|
func (m *Matrix) GetProcess(modelID string) (*Process, bool) {
|
|
p, ok := m.processes[modelID]
|
|
return p, ok
|
|
}
|
|
|
|
// HasModel returns true if the model is managed by this matrix.
|
|
func (m *Matrix) HasModel(modelID string) bool {
|
|
_, ok := m.processes[modelID]
|
|
return ok
|
|
}
|