ajhahn.de
← eeco
Go 171 lines
package queue

import (
	"errors"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"
)

// LockName is the file name of the presence-based lock written next to
// queue.md while Append is mid-flight. acquireLock joins it onto the
// state directory to form the full lock path. The scheme is portable
// (no flock / fcntl) and recovers from a crashed eeco via the staleness
// check in acquireLock.
const LockName = ".queue.lock"

// ErrLocked is returned by Append when the queue lock is already held
// by another live eeco invocation in the same workspace. Callers that
// must persist their item propagate it so the top-level CLI surfaces a
// non-zero exit with a clear message; "fire-and-forget" sites already
// discard via `_ = queue.Append(...)`. The lock guarantees the queue
// file is never partially written: a contender either holds the lock
// and writes cleanly, or fails up-front without touching queue.md.
//
// Within this file it originates from acquireLock, which returns it
// when the lock file exists and is not judged stale, and again when the
// single re-claim attempt after removing a stale lock does not succeed.
// It is a sentinel value, so callers can match it with errors.Is.
var ErrLocked = errors.New("queue is locked by another eeco invocation")

// staleLockAge is the conservative window after which a lock left over
// by a crashed run is considered abandoned. Append itself is
// millisecond-scale, so any healthy run releases the lock long before
// this; only a hard crash mid-write reaches the takeover path.
// lockIsStale measures it against the lock file's modification time.
const staleLockAge = 5 * time.Minute

// acquireLock takes <stateDir>/.queue.lock for the duration of a single
// Append and hands back a release closure for the caller to defer.
//
// The returned closure is never nil: when the lock is held it deletes
// the lock file via removeLock; on every failure path it is a no-op, so
// deferring it unconditionally is safe.
//
// Outcomes:
//   - the lock file could be created: the lock is held, err is nil;
//   - the lock exists and lockIsStale does not consider it stale:
//     ErrLocked;
//   - the lock exists and is stale: it is removed and the claim is
//     attempted exactly one more time; failing that second claim also
//     yields ErrLocked;
//   - errors from tryClaim are propagated unchanged, and a failure to
//     remove the stale file is wrapped with "queue.lock: remove stale".
//
// NOTE(review): the staleness check, the removal and the re-claim are
// separate filesystem operations. Two contenders that both judge the
// same lock stale could interleave so that the later Remove deletes the
// earlier contender's freshly created lock — worth confirming whether
// that window matters for callers.
func acquireLock(stateDir string) (release func(), err error) {
	lockPath := filepath.Join(stateDir, LockName)
	noop := func() {}
	unlock := func() { removeLock(lockPath) }

	held, err := tryClaim(lockPath)
	switch {
	case err != nil:
		return noop, err
	case held:
		return unlock, nil
	case !lockIsStale(lockPath):
		return noop, ErrLocked
	}

	// The existing lock is stale: clear it out. Finding it already gone
	// is fine — somebody else removed it first.
	rmErr := os.Remove(lockPath)
	if rmErr != nil && !errors.Is(rmErr, fs.ErrNotExist) {
		return noop, fmt.Errorf("queue.lock: remove stale: %w", rmErr)
	}

	// Single retry only; losing it means another contender got in first.
	held, err = tryClaim(lockPath)
	if err != nil {
		return noop, err
	}
	if held {
		return unlock, nil
	}
	return noop, ErrLocked
}

// tryClaim attempts the exclusive (O_CREATE|O_EXCL) creation of the
// lock file and, when that succeeds, stamps it via writeLockContents.
//
// Results:
//
//	(true, nil)  — the file was created; the lock is now held;
//	(false, nil) — the file already exists (real contention), or a
//	               pending-delete condition did not clear within the
//	               retry window; the caller should go on to the
//	               staleness check;
//	(false, err) — any other I/O failure, wrapped with "queue.lock:".
//
// The retry exists for Windows: a lock file in "delete pending" state —
// just released by one process while another still has a read handle
// open from its staleness probe — makes the exclusive create fail with
// an access-denied error instead of "already exists". That state clears
// as soon as the stray handle closes, so the create is retried every
// 20ms for up to 2s. Which errors count is decided by isPendingDelete
// (defined outside this view; documented as always false on Unix, where
// the loop therefore runs once).
func tryClaim(lockPath string) (bool, error) {
	const (
		retryWindow = 2 * time.Second
		retryPause  = 20 * time.Millisecond
	)
	giveUpAt := time.Now().Add(retryWindow)

	for {
		f, err := os.OpenFile(lockPath, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0o644)
		switch {
		case err == nil:
			// Contents are informational; a failed write or close does
			// not undo the claim.
			writeLockContents(f)
			_ = f.Close()
			return true, nil
		case errors.Is(err, fs.ErrExist):
			return false, nil
		case !isPendingDelete(err):
			return false, fmt.Errorf("queue.lock: %w", err)
		case !time.Now().Before(giveUpAt):
			// Still pending-delete after the window: report contention
			// rather than an error.
			return false, nil
		}
		time.Sleep(retryPause)
	}
}

// removeLock deletes the lock file at path on a best-effort basis and
// reports nothing to the caller.
//
// A removal that succeeds, or that finds the file already gone, ends
// the call immediately. Any other error is retried every 20ms until a
// 2s deadline passes, after which the function gives up silently.
//
// The retry targets the transient sharing violation Windows raises when
// a contender's staleness probe (readLockPID's os.ReadFile) has the
// file open at the moment of deletion: Go opens files without
// FILE_SHARE_DELETE there, so the delete is refused until that handle
// closes. Without retrying, the lock file would be orphaned and the
// queue would stay locked until staleLockAge elapses.
func removeLock(path string) {
	giveUpAt := time.Now().Add(2 * time.Second)

	// stillPresent makes one removal attempt and reports whether the
	// file is (as far as we can tell) still there.
	stillPresent := func() bool {
		err := os.Remove(path)
		return err != nil && !errors.Is(err, fs.ErrNotExist)
	}

	for stillPresent() && !time.Now().After(giveUpAt) {
		time.Sleep(20 * time.Millisecond)
	}
}

// writeLockContents stamps an open lock file with two lines: the
// current process ID as "pid=<n>" and the current UTC time in RFC 3339
// form as "time=<stamp>". The write is best-effort — its error is
// deliberately discarded, and the file is neither synced nor closed
// here.
func writeLockContents(f *os.File) {
	owner := os.Getpid()
	stamp := time.Now().UTC().Format(time.RFC3339)
	_, _ = fmt.Fprintf(f, "pid=%d\ntime=%s\n", owner, stamp)
}

// lockIsStale reports whether the lock file at path may be reclaimed.
//
// It answers true in two cases: the file's modification time is more
// than staleLockAge in the past, or the PID recorded in the file is
// reported as not alive by processAlive (defined outside this view;
// described as a Unix-only check). It answers false whenever the file
// cannot be stat'ed or no usable PID can be read from it — when in
// doubt the lock is treated as live.
func lockIsStale(path string) bool {
	st, err := os.Stat(path)
	if err != nil {
		return false
	}
	if age := time.Since(st.ModTime()); age > staleLockAge {
		return true
	}
	if owner, ok := readLockPID(path); ok {
		return !processAlive(owner)
	}
	return false
}

// readLockPID extracts the owner PID recorded in the lock file at path.
//
// The file is scanned line by line for the first line that, once
// surrounding whitespace is trimmed, begins with "pid="; the remainder
// of that line is trimmed and parsed as a base-10 integer. The result
// is (pid, true) only when that value parses and is positive. A read
// failure, a file without a pid line, a malformed value, or a
// non-positive PID all give (0, false). Only the first pid line is
// consulted — a bad first entry is not rescued by a later one.
func readLockPID(path string) (int, bool) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return 0, false
	}

	const key = "pid="
	for _, line := range strings.Split(string(raw), "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, key) {
			continue
		}
		n, convErr := strconv.Atoi(strings.TrimSpace(line[len(key):]))
		if convErr == nil && n > 0 {
			return n, true
		}
		return 0, false
	}
	return 0, false
}