Go Optimization Skill

This skill provides expert guidance on Go performance optimization, covering profiling, benchmarking, memory management, and runtime tuning for building high-performance applications.

When to Use

Activate this skill when:

  • Profiling application performance
  • Optimizing CPU-intensive operations
  • Reducing memory allocations
  • Tuning garbage collection
  • Writing benchmarks
  • Analyzing performance bottlenecks
  • Optimizing hot paths
  • Reducing lock contention

Profiling

CPU Profiling

import (
    "log"
    "os"
    "runtime/pprof"
)

func main() {
    // Start CPU profiling
    f, err := os.Create("cpu.prof")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    if err := pprof.StartCPUProfile(f); err != nil {
        log.Fatal(err)
    }
    defer pprof.StopCPUProfile()

    // Your code here
    runApplication()
}

// Analyze:
// go tool pprof cpu.prof
// (pprof) top10
// (pprof) list functionName
// (pprof) web

Memory Profiling

import (
    "log"
    "os"
    "runtime"
    "runtime/pprof"
)

func writeMemProfile(filename string) {
    f, err := os.Create(filename)
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    runtime.GC() // Force GC before snapshot
    if err := pprof.WriteHeapProfile(f); err != nil {
        log.Fatal(err)
    }
}

// Analyze:
// go tool pprof -alloc_space mem.prof
// go tool pprof -inuse_space mem.prof

HTTP Profiling

import (
    "log"
    "net/http"
    _ "net/http/pprof" // registers the /debug/pprof handlers
)

func main() {
    // Enable pprof endpoints
    go func() {
        log.Println(http.ListenAndServe("localhost:6060", nil))
    }()

    // Your application
    runServer()
}

// Access profiles:
// http://localhost:6060/debug/pprof/
// go tool pprof "http://localhost:6060/debug/pprof/profile?seconds=30"
// go tool pprof http://localhost:6060/debug/pprof/heap
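
Block and mutex contention profiles are off by default; opt in early in main, before load arrives:

import "runtime"

runtime.SetBlockProfileRate(1)     // record every blocking event
runtime.SetMutexProfileFraction(1) // record every mutex contention event

// Then fetch:
// go tool pprof http://localhost:6060/debug/pprof/block
// go tool pprof http://localhost:6060/debug/pprof/mutex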

Execution Tracing

import (
    "log"
    "os"
    "runtime/trace"
)

func main() {
    f, err := os.Create("trace.out")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    if err := trace.Start(f); err != nil {
        log.Fatal(err)
    }
    defer trace.Stop()

    // Your code
    runApplication()
}

// View trace:
// go tool trace trace.out

Benchmarking

Basic Benchmarks

func BenchmarkStringConcat(b *testing.B) {
    a, sep, c := "hello", " ", "world"
    for i := 0; i < b.N; i++ {
        _ = a + sep + c // variables prevent compile-time constant folding
    }
}

func BenchmarkStringBuilder(b *testing.B) {
    for i := 0; i < b.N; i++ {
        var sb strings.Builder
        sb.WriteString("hello")
        sb.WriteString(" ")
        sb.WriteString("world")
        _ = sb.String()
    }
}

// Run: go test -bench=. -benchmem
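
When a benchmark needs expensive setup, exclude it from the measurement with b.ResetTimer (loadLargeFixture and process here are hypothetical placeholders for your own setup and workload):

func BenchmarkProcess(b *testing.B) {
    data := loadLargeFixture() // setup cost excluded below
    b.ResetTimer()             // discard time spent so far
    for i := 0; i < b.N; i++ {
        process(data)
    }
}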

Sub-benchmarks

func BenchmarkEncode(b *testing.B) {
    data := generateTestData()

    b.Run("JSON", func(b *testing.B) {
        b.ReportAllocs()
        for i := 0; i < b.N; i++ {
            json.Marshal(data)
        }
    })

    b.Run("MessagePack", func(b *testing.B) {
        b.ReportAllocs()
        for i := 0; i < b.N; i++ {
            msgpack.Marshal(data) // third-party codec, e.g. github.com/vmihailenco/msgpack/v5
        }
    })
}

Parallel Benchmarks

func BenchmarkConcurrentAccess(b *testing.B) {
    cache := NewCache()

    b.RunParallel(func(pb *testing.PB) {
        for pb.Next() {
            cache.Get("key")
        }
    })
}

Benchmark Comparison

# Run benchmarks several times and save results
# (benchstat needs multiple samples per benchmark for statistics)
go test -bench=. -benchmem -count=10 > old.txt

# Make optimizations

# Run again and compare
go test -bench=. -benchmem -count=10 > new.txt
benchstat old.txt new.txt

Memory Optimization

Escape Analysis

// Check what escapes to heap
// go build -gcflags="-m" main.go

// ✅ GOOD: Stack allocation
func stackAlloc() int {
    x := 42
    return x
}

// ❌ BAD: Heap escape
func heapEscape() *int {
    x := 42
    return &x // x escapes to heap
}

// ✅ GOOD: Interface without allocation
func noAlloc(w io.Writer, data []byte) {
    w.Write(data)
}

// ❌ BAD: Interface causes allocation
func withAlloc() io.Writer {
    var b bytes.Buffer
    return &b // &b escapes
}
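
For reference, the -m diagnostics look like the following (file names and line numbers here are illustrative):

// ./main.go:9:2: moved to heap: x
// ./main.go:16:9: &b escapes to heap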

Pre-allocation

// ❌ BAD: Growing slice
func badAppend(n int) []int {
    var result []int
    for i := 0; i < n; i++ {
        result = append(result, i) // Multiple allocations
    }
    return result
}

// ✅ GOOD: Pre-allocate
func goodAppend(n int) []int {
    result := make([]int, 0, n) // Single allocation
    for i := 0; i < n; i++ {
        result = append(result, i)
    }
    return result
}

// ✅ GOOD: Known length
func knownLength(n int) []int {
    result := make([]int, n)
    for i := 0; i < n; i++ {
        result[i] = i
    }
    return result
}

// ❌ BAD: String concatenation
func badConcat(strs []string) string {
    result := ""
    for _, s := range strs {
        result += s // New allocation each time
    }
    return result
}

// ✅ GOOD: strings.Builder
func goodConcat(strs []string) string {
    total := 0
    for _, s := range strs {
        total += len(s)
    }
    var sb strings.Builder
    sb.Grow(total) // reserve the exact capacity in one allocation
    for _, s := range strs {
        sb.WriteString(s)
    }
    return sb.String()
}

sync.Pool

var bufferPool = sync.Pool{
    New: func() interface{} {
        return new(bytes.Buffer)
    },
}

func processData(data []byte) []byte {
    // Get buffer from pool
    buf := bufferPool.Get().(*bytes.Buffer)
    buf.Reset()
    defer bufferPool.Put(buf)

    // Use buffer
    buf.Write(data)
    // Process...

    // Copy before returning: buf.Bytes() aliases the pooled buffer's
    // memory, which will be overwritten once the buffer is reused
    out := make([]byte, buf.Len())
    copy(out, buf.Bytes())
    return out
}

// String builder pool
var sbPool = sync.Pool{
    New: func() interface{} {
        return &strings.Builder{}
    },
}

func buildString(parts []string) string {
    sb := sbPool.Get().(*strings.Builder)
    sb.Reset()
    defer sbPool.Put(sb)

    for _, part := range parts {
        sb.WriteString(part)
    }
    return sb.String() // safe: the next user's Reset drops this buffer rather than reusing it
}

Zero-Copy Techniques

// Use byte slices instead of strings
func parseHeader(header []byte) (key, value []byte) {
    i := bytes.IndexByte(header, ':')
    if i < 0 {
        return nil, nil
    }
    return header[:i], header[i+1:]
}

// Reuse buffers
type Parser struct {
    buf []byte
}

func (p *Parser) Parse(data []byte) error {
    p.buf = p.buf[:0] // Reset length, keep capacity
    p.buf = append(p.buf, data...)
    // Process p.buf...
    return nil
}

// Direct writing
func writeResponse(w io.Writer, data interface{}) error {
    enc := json.NewEncoder(w) // Write directly to w
    return enc.Encode(data)
}

Garbage Collection Tuning

GC Control

import "runtime/debug"

// Adjust GC target percentage
debug.SetGCPercent(100) // Default
// Higher = less frequent GC, more memory
// Lower = more frequent GC, less memory

// Force GC (use sparingly!)
runtime.GC()

// Monitor GC stats
var stats runtime.MemStats
runtime.ReadMemStats(&stats)
fmt.Printf("Alloc = %v MB\n", stats.Alloc/1024/1024)
fmt.Printf("TotalAlloc = %v MB\n", stats.TotalAlloc/1024/1024)
fmt.Printf("Sys = %v MB\n", stats.Sys/1024/1024)
fmt.Printf("NumGC = %v\n", stats.NumGC)

GOGC Environment Variable

# Default (100%)
GOGC=100 ./myapp

# More aggressive GC (uses less memory)
GOGC=50 ./myapp

# Less frequent GC (uses more memory)
GOGC=200 ./myapp

# Disable GC (for debugging)
GOGC=off ./myapp

Concurrency Optimization

Reduce Lock Contention

// ❌ BAD: Single lock
type BadCache struct {
    mu    sync.Mutex
    items map[string]interface{}
}

// ✅ GOOD: RWMutex
type GoodCache struct {
    mu    sync.RWMutex
    items map[string]interface{}
}

func (c *GoodCache) Get(key string) interface{} {
    c.mu.RLock()
    defer c.mu.RUnlock()
    return c.items[key]
}

// ✅ BETTER: Sharded locks
type ShardedCache struct {
    shards [256]*shard
}

type shard struct {
    mu    sync.RWMutex
    items map[string]interface{}
}

func (c *ShardedCache) Get(key string) interface{} {
    shard := c.getShard(key)
    shard.mu.RLock()
    defer shard.mu.RUnlock()
    return shard.items[key]
}
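
The cache above assumes a getShard helper. A minimal sketch using FNV-1a from hash/fnv (any cheap, well-distributed string hash works):

func (c *ShardedCache) getShard(key string) *shard {
    h := fnv.New32a()
    h.Write([]byte(key)) // hash the key bytes
    return c.shards[h.Sum32()%uint32(len(c.shards))]
}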

Channel Buffering

// Unbuffered: every send blocks until a receiver is ready
ch := make(chan int)

// Buffered: sends proceed until the buffer fills,
// decoupling producer and consumer
ch = make(chan int, 100)

// The optimal buffer size depends on:
// - Producer/consumer rates
// - Memory constraints
// - Latency requirements
// Benchmark candidates on your workload (see the sketch below).
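
A minimal sketch for comparing buffer sizes; the send/receive bodies are placeholders, so substitute something representative of your workload:

func benchmarkChannel(b *testing.B, size int) {
    ch := make(chan int, size)
    go func() {
        for i := 0; i < b.N; i++ {
            ch <- i
        }
        close(ch)
    }()
    for range ch { // drain until the producer closes
    }
}

func BenchmarkChanUnbuffered(b *testing.B)  { benchmarkChannel(b, 0) }
func BenchmarkChanBuffered100(b *testing.B) { benchmarkChannel(b, 100) }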

Atomic Operations

import "sync/atomic"

type Counter struct {
    value int64
}

func (c *Counter) Increment() {
    atomic.AddInt64(&c.value, 1)
}

func (c *Counter) Value() int64 {
    return atomic.LoadInt64(&c.value)
}

// ✅ Faster than mutex for simple operations
// ❌ Limited to basic types and operations
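
On Go 1.19+, the typed atomics are preferred: they cannot be accessed non-atomically by accident and avoid 64-bit alignment pitfalls on 32-bit platforms. The same counter with atomic.Int64:

type AtomicCounter struct {
    value atomic.Int64
}

func (c *AtomicCounter) Increment() {
    c.value.Add(1)
}

func (c *AtomicCounter) Value() int64 {
    return c.value.Load()
}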

Algorithmic Optimization

Map Pre-sizing

// ❌ BAD: Growing map
func badMap(items []Item) map[string]Item {
    m := make(map[string]Item)
    for _, item := range items {
        m[item.ID] = item
    }
    return m
}

// ✅ GOOD: Pre-sized map
func goodMap(items []Item) map[string]Item {
    m := make(map[string]Item, len(items))
    for _, item := range items {
        m[item.ID] = item
    }
    return m
}

Avoid Unnecessary Work

// ❌ BAD: Repeated computation
func process(items []Item) {
    for _, item := range items {
        if isValid(item) {
            result := expensiveComputation(item)
            if result > threshold {
                handleResult(result)
            }
        }
    }
}

// ✅ GOOD: Early returns
func process(items []Item) {
    for _, item := range items {
        if !isValid(item) {
            continue // Skip early
        }
        result := expensiveComputation(item)
        if result <= threshold {
            continue // Skip early
        }
        handleResult(result)
    }
}

// ✅ BETTER: Fast path
func process(items []Item) {
    for _, item := range items {
        // Fast path for common case
        if item.IsSimple() {
            handleSimple(item)
            continue
        }
        // Slow path for complex case
        handleComplex(item)
    }
}

Runtime Tuning

GOMAXPROCS

import "runtime"

// Set number of OS threads
runtime.GOMAXPROCS(runtime.NumCPU())

// For CPU-bound: NumCPU
// For I/O-bound: NumCPU * 2 or more
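
In a container with a CPU quota, NumCPU reports the host's CPU count, which can cause scheduler thrashing. One widely used fix is the third-party automaxprocs package, which sets GOMAXPROCS from the cgroup quota on import:

import _ "go.uber.org/automaxprocs"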

Environment Variables

# Max OS threads
GOMAXPROCS=8 ./myapp

# GC aggressiveness
GOGC=100 ./myapp

# Memory limit (Go 1.19+)
GOMEMLIMIT=4GiB ./myapp

# Trace execution
GODEBUG=gctrace=1 ./myapp
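
The memory limit can also be set from code via runtime/debug (Go 1.19+); it is a soft limit the GC works to stay under:

import "runtime/debug"

debug.SetMemoryLimit(4 << 30) // 4 GiB soft memory limit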

Performance Patterns

Inline Functions

// The compiler inlines small functions automatically; there is no
// //go:inline directive (only //go:noinline, which disables inlining)
func add(a, b int) int {
    return a + b
}

// Check inlining decisions:
// go build -gcflags="-m" .
// Keep hot-path functions small and simple so they stay inlinable

Avoid Interface Allocations

// ❌ BAD: Interface allocation
func badPrint(value interface{}) {
    fmt.Println(value) // value escapes
}

// ✅ GOOD: Type-specific functions
func printInt(value int) {
    fmt.Println(value)
}

func printString(value string) {
    fmt.Println(value)
}

Batch Operations

// ❌ BAD: Individual operations
for _, item := range items {
    db.Insert(item) // N database calls
}

// ✅ GOOD: Batch operations
db.BatchInsert(items) // 1 database call
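
db.BatchInsert above is a placeholder. A sketch of what batching might look like with database/sql, assuming a hypothetical items(id, name) table and matching Item fields:

import (
    "database/sql"
    "strings"
)

func batchInsert(db *sql.DB, items []Item) error {
    if len(items) == 0 {
        return nil
    }
    var (
        sb   strings.Builder
        args []interface{}
    )
    sb.WriteString("INSERT INTO items (id, name) VALUES ")
    for i, item := range items {
        if i > 0 {
            sb.WriteString(", ")
        }
        sb.WriteString("(?, ?)")
        args = append(args, item.ID, item.Name)
    }
    _, err := db.Exec(sb.String(), args...) // one round trip
    return err
}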

Best Practices

  1. Profile before optimizing - Measure, don't guess
  2. Focus on hot paths - Optimize the 20% that matters
  3. Reduce allocations - Reuse objects, pre-allocate
  4. Use appropriate data structures - Map vs slice vs array
  5. Minimize lock contention - Use RWMutex, sharding
  6. Benchmark changes - Use benchstat for comparisons
  7. Test with race detector - go test -race
  8. Monitor in production - Use profiling endpoints
  9. Balance readability and performance - Don't over-optimize
  10. Use PGO - Profile-guided optimization (Go 1.20+)

Profile-Guided Optimization (PGO)

# 1. Deploy a normal build and collect a CPU profile from a
#    representative workload, e.g. via net/http/pprof:
curl -o default.pgo "http://localhost:6060/debug/pprof/profile?seconds=30"

# 2. Place default.pgo in the main package directory and rebuild.
#    Since Go 1.21, -pgo=auto is the default and picks it up:
go build -o myapp-optimized

# Or point at a profile explicitly:
go build -pgo=default.pgo -o myapp-optimized

# Typical improvement: around 2-7% (per the Go documentation)

Resources

Additional resources in:

  • assets/examples/ - Performance optimization examples
  • assets/benchmarks/ - Benchmark templates
  • references/ - Links to profiling guides and performance papers