proxy: support timings for /infill from llama-server (#510)

fixes: #463
This commit is contained in:
Benson Wong
2026-02-07 17:16:27 -08:00
committed by GitHub
parent b5fde8eb6d
commit 8d6d949ec3
2 changed files with 77 additions and 1 deletions
+8 -1
View File
@@ -240,7 +240,6 @@ func (mp *metricsMonitor) wrapHandler(
return nil
}
}
if strings.Contains(recorder.Header().Get("Content-Type"), "text/event-stream") {
if parsed, err := processStreamingResponse(modelID, recorder.StartTime(), body); err != nil {
mp.logger.Warnf("error processing streaming response: %v, path=%s, recording minimal metrics", err, request.URL.Path)
@@ -253,6 +252,14 @@ func (mp *metricsMonitor) wrapHandler(
usage := parsed.Get("usage")
timings := parsed.Get("timings")
// extract timings for infill - response is an array, timings are in the last element
// see #463
if strings.HasPrefix(request.URL.Path, "/infill") {
if arr := parsed.Array(); len(arr) > 0 {
timings = arr[len(arr)-1].Get("timings")
}
}
if usage.Exists() || timings.Exists() {
if parsedMetrics, err := parseMetrics(modelID, recorder.StartTime(), usage, timings); err != nil {
mp.logger.Warnf("error parsing metrics: %v, path=%s, recording minimal metrics", err, request.URL.Path)
+69
View File
@@ -384,6 +384,75 @@ data: [DONE]
assert.Equal(t, 0, metrics[0].InputTokens)
assert.Equal(t, 0, metrics[0].OutputTokens)
})
t.Run("infill request extracts timings from last array element", func(t *testing.T) {
mm := newMetricsMonitor(testLogger, 10, 0)
// Infill response is an array with timings in the last element
responseBody := `[
{"content": "first chunk"},
{"content": "second chunk"},
{"content": "final", "timings": {
"prompt_n": 150,
"predicted_n": 75,
"prompt_per_second": 200.5,
"predicted_per_second": 35.5,
"prompt_ms": 600.0,
"predicted_ms": 1800.0,
"cache_n": 30
}}
]`
nextHandler := func(modelID string, w http.ResponseWriter, r *http.Request) error {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
w.Write([]byte(responseBody))
return nil
}
req := httptest.NewRequest("POST", "/infill", nil)
rec := httptest.NewRecorder()
ginCtx, _ := gin.CreateTestContext(rec)
err := mm.wrapHandler("test-model", ginCtx.Writer, req, nextHandler)
assert.NoError(t, err)
metrics := mm.getMetrics()
assert.Equal(t, 1, len(metrics))
assert.Equal(t, "test-model", metrics[0].Model)
assert.Equal(t, 150, metrics[0].InputTokens)
assert.Equal(t, 75, metrics[0].OutputTokens)
assert.Equal(t, 30, metrics[0].CachedTokens)
assert.Equal(t, 200.5, metrics[0].PromptPerSecond)
assert.Equal(t, 35.5, metrics[0].TokensPerSecond)
assert.Equal(t, 2400, metrics[0].DurationMs) // 600 + 1800
})
t.Run("infill request with empty array records minimal metrics", func(t *testing.T) {
mm := newMetricsMonitor(testLogger, 10, 0)
responseBody := `[]`
nextHandler := func(modelID string, w http.ResponseWriter, r *http.Request) error {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
w.Write([]byte(responseBody))
return nil
}
req := httptest.NewRequest("POST", "/infill", nil)
rec := httptest.NewRecorder()
ginCtx, _ := gin.CreateTestContext(rec)
err := mm.wrapHandler("test-model", ginCtx.Writer, req, nextHandler)
assert.NoError(t, err)
metrics := mm.getMetrics()
assert.Equal(t, 1, len(metrics))
assert.Equal(t, "test-model", metrics[0].Model)
assert.Equal(t, 0, metrics[0].InputTokens)
assert.Equal(t, 0, metrics[0].OutputTokens)
})
}
func TestMetricsMonitor_ResponseBodyCopier(t *testing.T) {