Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added
- Add `--ping-interval` flag to override server-specified ping interval for improved job pickup performance [#3469](https://github.com/buildkite/agent/pull/3469) (@jasonwbarnett)

### Changed
- Change ping interval logging from debug to info level for better visibility [#3469](https://github.com/buildkite/agent/pull/3469) (@jasonwbarnett)

## [v3.108.0](https://github.com/buildkite/agent/tree/v3.108.0) (2025-10-02)
[Full Changelog](https://github.com/buildkite/agent/compare/v3.107.2...v3.108.0)

Expand Down
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,18 @@ Agents page within Buildkite, and a build path. For example:
buildkite-agent start --token=<your token> --build-path=/tmp/buildkite-builds
```

### Performance Optimization

By default, agents poll for jobs every 10-20 seconds (server-specified interval plus random jitter). For performance-sensitive workloads like dynamic pipelines, you can reduce job pickup latency:

```bash
# Faster job pickup (5-10 seconds instead of 10-20 seconds)
# Integer values only, minimum value is 2 seconds
buildkite-agent start --token=<your token> --build-path=/tmp/buildkite-builds --ping-interval=5
```

See the [agent documentation](docs/agent-start.md#ping-interval) for more details.

### Telemetry

By default, the agent sends some information back to the Buildkite mothership on
Expand Down
1 change: 1 addition & 0 deletions agent/agent_configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ type AgentConfiguration struct {
DisconnectAfterJob bool
DisconnectAfterIdleTimeout int
DisconnectAfterUptime int
PingInterval int
CancelGracePeriod int
SignalGracePeriod time.Duration
EnableJobLogTmpfile bool
Expand Down
20 changes: 19 additions & 1 deletion agent/agent_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,24 @@ func (a *AgentWorker) getCurrentJobID() string {
return a.currentJobID
}

// determinePingInterval returns the interval at which the worker pings for
// work. A non-zero PingInterval in the agent configuration overrides the
// interval supplied by the server in the registration response. Overrides
// below 2 seconds (including negative values) are raised to 2 seconds and a
// warning is logged; accepted overrides are logged at info level.
func (a *AgentWorker) determinePingInterval() time.Duration {
	const minOverrideSeconds = 2

	override := a.agentConfiguration.PingInterval
	switch {
	case override == 0:
		// No override configured: use the server-specified ping interval.
		return time.Duration(a.agent.PingInterval) * time.Second

	case override < minOverrideSeconds:
		// Too small (or negative): fall back to the minimum override.
		a.logger.Warn("Ping interval override %ds is below minimum of 2s, using 2s instead", override)
		return minOverrideSeconds * time.Second

	default:
		interval := time.Duration(override) * time.Second
		a.logger.Info("Using ping interval override: %ds", int(interval.Seconds()))
		return interval
	}
}

type errUnrecoverable struct {
action string
response *api.Response
Expand Down Expand Up @@ -317,7 +335,7 @@ func (a *AgentWorker) runPingLoop(ctx context.Context, idleMonitor *IdleMonitor)
disconnectAfterIdleTimeout := time.Second * time.Duration(a.agentConfiguration.DisconnectAfterIdleTimeout)

// Create the ticker
pingInterval := time.Second * time.Duration(a.agent.PingInterval)
pingInterval := a.determinePingInterval()
pingTicker := time.NewTicker(pingInterval)
defer pingTicker.Stop()

Expand Down
104 changes: 104 additions & 0 deletions agent/agent_worker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -913,6 +913,110 @@ func TestAgentWorker_SetRequestHeadersDuringRegistration(t *testing.T) {
}
}

// TestAgentWorker_PingIntervalValidation checks determinePingInterval for each
// combination of configured override and server-provided interval: the
// returned duration, whether the "below minimum" warning is logged, and
// whether the "override in use" message is logged.
func TestAgentWorker_PingIntervalValidation(t *testing.T) {
	const (
		warningFragment  = "is below minimum of 2s"
		overrideFragment = "Using ping interval override"
	)

	tests := []struct {
		name                   string
		configuredPingInterval int
		serverPingInterval     int
		expectedInterval       time.Duration
		expectWarning          bool
		expectOverrideLog      bool
	}{
		{
			name:                   "uses server interval when override is 0",
			configuredPingInterval: 0,
			serverPingInterval:     10,
			expectedInterval:       10 * time.Second,
			expectWarning:          false,
			expectOverrideLog:      false,
		},
		{
			name:                   "uses override when valid (5s)",
			configuredPingInterval: 5,
			serverPingInterval:     10,
			expectedInterval:       5 * time.Second,
			expectWarning:          false,
			expectOverrideLog:      true,
		},
		{
			name:                   "uses override when valid (2s minimum)",
			configuredPingInterval: 2,
			serverPingInterval:     10,
			expectedInterval:       2 * time.Second,
			expectWarning:          false,
			expectOverrideLog:      true,
		},
		{
			name:                   "clamps 1s to 2s with warning",
			configuredPingInterval: 1,
			serverPingInterval:     10,
			expectedInterval:       2 * time.Second,
			expectWarning:          true,
			expectOverrideLog:      false,
		},
		{
			name:                   "clamps negative values to 2s with warning",
			configuredPingInterval: -5,
			serverPingInterval:     10,
			expectedInterval:       2 * time.Second,
			expectWarning:          true,
			expectOverrideLog:      false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Capture everything the worker logs so the messages can be inspected.
			// The local is named l so it does not shadow the logger package.
			logOutput := &testLogCapture{}
			l := logger.NewConsoleLogger(logger.NewTextPrinter(logOutput), func(int) {})

			worker := &AgentWorker{
				logger: l,
				agent: &api.AgentRegisterResponse{
					PingInterval: tt.serverPingInterval,
				},
				agentConfiguration: AgentConfiguration{
					PingInterval: tt.configuredPingInterval,
				},
			}

			actualInterval := worker.determinePingInterval()

			// Verify the returned interval
			assert.Equal(t, tt.expectedInterval, actualInterval, "ping interval should match expected")

			logged := logOutput.String()

			// Verify warning log
			if tt.expectWarning {
				assert.Contains(t, logged, warningFragment, "should log warning for values below 2s")
			} else {
				assert.NotContains(t, logged, warningFragment, "should not log warning for valid values")
			}

			// Verify override log, including its absence when no valid
			// override is in effect (server interval used, or value clamped).
			if tt.expectOverrideLog {
				assert.Contains(t, logged, overrideFragment, "should log override usage")
			} else {
				assert.NotContains(t, logged, overrideFragment, "should not log override usage")
			}
		})
	}
}

// testLogCapture is an in-memory sink for log output: every Write appends to
// an internal byte slice, and String returns everything written so far.
type testLogCapture struct {
	output []byte
}

// Write appends p to the captured output. It reports len(p) bytes written and
// never returns an error.
func (c *testLogCapture) Write(p []byte) (int, error) {
	written := len(p)
	c.output = append(c.output, p...)
	return written, nil
}

// String returns the captured output as a string.
func (c *testLogCapture) String() string {
	captured := string(c.output)
	return captured
}

func TestAgentWorker_UpdateRequestHeadersDuringPing(t *testing.T) {
t.Parallel()

Expand Down
8 changes: 8 additions & 0 deletions clicommand/agent_start.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ type AgentStartConfig struct {
DisconnectAfterJob bool `cli:"disconnect-after-job"`
DisconnectAfterIdleTimeout int `cli:"disconnect-after-idle-timeout"`
DisconnectAfterUptime int `cli:"disconnect-after-uptime"`
PingInterval int `cli:"ping-interval"`
CancelGracePeriod int `cli:"cancel-grace-period"`
SignalGracePeriodSeconds int `cli:"signal-grace-period-seconds"`
ReflectExitStatus bool `cli:"reflect-exit-status"`
Expand Down Expand Up @@ -382,6 +383,12 @@ var AgentStartCommand = cli.Command{
Usage: "The maximum uptime in seconds before the agent stops accepting new jobs and shuts down after any running jobs complete. The default of 0 means no timeout",
EnvVar: "BUILDKITE_AGENT_DISCONNECT_AFTER_UPTIME",
},
cli.IntFlag{
Name: "ping-interval",
Value: 0,
Usage: "Override the server-specified ping interval in seconds (integer values only). The default of 0 uses the server-provided interval. Minimum value is 2 seconds",
EnvVar: "BUILDKITE_AGENT_PING_INTERVAL",
},
cancelGracePeriodFlag,
cli.BoolFlag{
Name: "enable-job-log-tmpfile",
Expand Down Expand Up @@ -1044,6 +1051,7 @@ var AgentStartCommand = cli.Command{
DisconnectAfterJob: cfg.DisconnectAfterJob,
DisconnectAfterIdleTimeout: cfg.DisconnectAfterIdleTimeout,
DisconnectAfterUptime: cfg.DisconnectAfterUptime,
PingInterval: cfg.PingInterval,
CancelGracePeriod: cfg.CancelGracePeriod,
SignalGracePeriod: signalGracePeriod,
EnableJobLogTmpfile: cfg.EnableJobLogTmpfile,
Expand Down
43 changes: 43 additions & 0 deletions clicommand/agent_start_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"runtime"
"testing"

"github.com/buildkite/agent/v3/agent"
"github.com/buildkite/agent/v3/core"
"github.com/buildkite/agent/v3/logger"
"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -43,6 +44,48 @@ func writeAgentHook(t *testing.T, dir, hookName, msg string) string {
return filepath
}

// TestAgentStartConfig_PingInterval checks that a PingInterval set on
// AgentStartConfig reads back unchanged and survives being copied into an
// agent.AgentConfiguration. The copy is done by hand here; the test does not
// run the start command itself.
func TestAgentStartConfig_PingInterval(t *testing.T) {
	type testCase struct {
		name           string
		pingInterval   int
		expectedResult int
	}

	cases := []testCase{
		{name: "default ping interval (0)", pingInterval: 0, expectedResult: 0},
		{name: "custom ping interval (5)", pingInterval: 5, expectedResult: 5},
		{name: "minimum ping interval (2)", pingInterval: 2, expectedResult: 2},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			startCfg := AgentStartConfig{PingInterval: tc.pingInterval}

			// The value stored on the start config must be the one supplied.
			assert.Equal(t, tc.expectedResult, startCfg.PingInterval, "AgentStartConfig.PingInterval should match input")

			// Mirror the field copy the start command performs when it builds
			// the agent configuration.
			agentCfg := agent.AgentConfiguration{PingInterval: startCfg.PingInterval}

			assert.Equal(t, tc.expectedResult, agentCfg.PingInterval, "AgentConfiguration.PingInterval should match AgentStartConfig")
		})
	}
}

func TestAgentStartupHook(t *testing.T) {
t.Parallel()

Expand Down
2 changes: 1 addition & 1 deletion core/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ func (c *Client) Register(ctx context.Context, req api.AgentRegisterRequest) (*a
c.Logger.Info("Successfully registered agent \"%s\" with tags [%s]", registered.Name,
strings.Join(registered.Tags, ", "))

c.Logger.Debug("Ping interval: %ds", registered.PingInterval)
c.Logger.Info("Ping interval: %ds", registered.PingInterval)
c.Logger.Debug("Job status interval: %ds", registered.JobStatusInterval)
c.Logger.Debug("Heartbeat interval: %ds", registered.HeartbeatInterval)

Expand Down
24 changes: 24 additions & 0 deletions docs/agent-start.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,30 @@ After connecting, `AgentWorker` runs two main goroutines: one periodically
calls `Heartbeat`, the other more frequently calls `Ping`. `Ping` is how the
worker discovers work from the API.

## Ping Interval

The agent polls for jobs using a ping interval specified by the Buildkite server
during agent registration (typically 10 seconds). To prevent thundering herd
problems, each ping is delayed by random jitter (0 to ping-interval seconds), so
consecutive pings are 10-20 seconds apart and a newly scheduled job may wait up
to 20 seconds to be picked up with default settings.

For performance-sensitive workloads (like dynamic pipelines), you can override
the server-specified interval:

```bash
# Override to ping every 5 seconds (plus 0-5s jitter = 5-10s total)
# Only integer values are supported (e.g., 2, 5, 10), not decimals
buildkite-agent start --ping-interval 5

# Or via environment variable
export BUILDKITE_AGENT_PING_INTERVAL=5
buildkite-agent start
```

Setting `--ping-interval 0` or omitting it uses the server-provided interval.
Values below 2 seconds are automatically clamped to 2 seconds with a warning.
Float values like `2.5` are not supported and will cause an error.

Once a job has been accepted, the `AgentWorker` fires up a `JobRunner` to run
it. Each `JobRunner` starts several goroutines that handle various tasks:

Expand Down