ajhahn.de
← eeco
Go 272 lines
// Package gates runs cross-cutting policy gates that compose multiple
// scans over a project tree. Today the only gate is check-attribution,
// which combines a tracked-file scan (delegated to
// internal/workflow.Detector — the same primitive comment-hygiene uses)
// with a commit-body scan applying a stricter, trailer-anchored pattern
// set. The package is consumed by the eeco gates CLI verb; it depends
// on git being on PATH for the commit-body scan and the tracked-files
// enumeration.
package gates

import (
	"bytes"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"strings"

	"github.com/ajhahnde/eeco/internal/workflow"
)

// Pattern fragments are assembled at runtime so this source stays
// self-clean for eeco's own comment-hygiene scan (Constraint 3 —
// mirrors the discipline in internal/workflow/attribution.go and
// internal/hooks/commitmsg.go).
//
// Each value is a regular-expression fragment rather than a complete
// pattern; strictTrailerPatterns concatenates them with anchors and
// flags.
var (
	// gateCoAuthored matches the co-author trailer key, accepting an
	// upper- or lower-case first letter for each of its three words.
	gateCoAuthored = "[Cc]o-" + "[Aa]uthored-" + "[Bb]y"
	// gateGenVerb matches the verb of the generated-with signature with
	// an upper- or lower-case initial.
	gateGenVerb    = "[Gg]enerated"
	// gateRobotEmoji is the regexp escape for the single code point
	// U+1F916, written as an escape so the literal character never
	// appears in this file.
	gateRobotEmoji = "\\x{1F916}"
)

// strictTrailerPatterns is the commit-body pattern set — same shape as
// internal/hooks/commitmsg.go. Trailer-anchored Co-Authored-By rules so
// a docs commit subject like "remove the Co-Authored-By trailer" does
// not false-fire, plus the robot-emoji Generated-with signature.
//
// The first three patterns carry the (?im) flags: matching is
// case-insensitive and ^ anchors at the start of any line, so the
// trailer key must begin a line and be followed by a colon. The fourth
// pattern has no flags and is not line-anchored; it requires the emoji
// and the verb to sit on one line with at most 20 characters between
// them.
//
// NOTE(review): every line the third pattern matches is also matched
// by the second (its needle contains the second's needle), so the
// third never changes the outcome; it appears to be kept for parity
// with the hook's pattern list — confirm before removing.
var strictTrailerPatterns = []*regexp.Regexp{
	regexp.MustCompile(`(?im)^` + gateCoAuthored + `:.*claude`),
	regexp.MustCompile(`(?im)^` + gateCoAuthored + `:.*anthropic`),
	regexp.MustCompile(`(?im)^` + gateCoAuthored + `:.*noreply@anthropic`),
	regexp.MustCompile(gateRobotEmoji + `[^\n]{0,20}` + gateGenVerb),
}

// textExtensions is the default extension allowlist for the file scan
// — same set the existing scripts/check_comment_hygiene.sh in
// downstream consumers uses, extended with the Go-side extensions a
// Go project carries.
//
// Keys include the leading dot, matching what filepath.Ext returns.
// The allowlist is consulted only when the file set comes from the
// tracked-files enumeration; explicitly supplied Options.Paths are not
// filtered through it.
//
// NOTE(review): ".S" is the only upper-case key. A lookup that
// lower-cases the extension first can never hit it, so the lookup
// helper must also try the extension as written for this entry to
// have any effect.
var textExtensions = map[string]bool{
	".md":   true,
	".sh":   true,
	".go":   true,
	".zig":  true,
	".S":    true,
	".inc":  true,
	".zon":  true,
	".yml":  true,
	".yaml": true,
	".txt":  true,
	".ld":   true,
	".json": true,
	".toml": true,
}

// Options governs CheckAttribution scope. Zero value scans nothing —
// callers must set at least one of ScanFiles / ScanCommits.
type Options struct {
	// Paths overrides the default tracked-files enumeration when set.
	// Each entry is repo-relative. Entries are scanned as given: the
	// text-extension allowlist applies only to the default enumeration,
	// not to an explicit list. Ignored unless ScanFiles is set.
	Paths []string
	// Range is the commit-body git range (e.g. "origin/main..HEAD"). An
	// empty value selects the default: origin/main..HEAD when
	// resolvable, otherwise HEAD~10..HEAD with a notice. Ignored unless
	// ScanCommits is set.
	Range string
	// ScanFiles enables the tracked-tree file scan.
	ScanFiles bool
	// ScanCommits enables the commit-body scan.
	ScanCommits bool
	// Excludes are additional repo-relative paths to skip during the
	// file scan; the gate's own source is already excluded. Entries are
	// converted to forward slashes and compared for exact equality with
	// each scanned path — no globbing or directory-prefix matching.
	Excludes []string
}

// Finding is one policy hit. Path/Line/Excerpt set for file hits;
// Commit/Line/Excerpt set for commit-body hits.
type Finding struct {
	// Path is the repo-relative path of the offending file; empty for
	// commit-body hits.
	Path    string
	// Line is the 1-based line of the hit: a line of the file for file
	// hits, a line of the commit message for commit-body hits.
	Line    int
	// Commit is the commit hash truncated to at most seven characters;
	// empty for file hits.
	Commit  string
	// Excerpt is the offending text: the whole source line (trailing
	// CR/LF removed) for file hits, or just the span the pattern
	// matched for commit-body hits.
	Excerpt string
}

// Result groups findings with non-fatal notices the caller should
// surface to stderr (for example the HEAD~10 range fallback).
type Result struct {
	// Findings lists file hits first (when the file scan ran), followed
	// by commit-body hits. Nil when nothing was found.
	Findings []Finding
	// Notices are human-readable messages produced by the commit-body
	// scan; they do not indicate a policy violation. Nil when there is
	// nothing to report.
	Notices  []string
}

// CheckAttribution executes the scans selected in opts against workdir,
// which must be a git repository.
//
// The file scan (opts.ScanFiles) runs first, then the commit-body scan
// (opts.ScanCommits); their findings are concatenated in that order.
// With neither flag set nothing is scanned and the zero Result is
// returned.
//
// The error is non-nil only for infrastructure failures (workdir is not
// a repo, git is unavailable). Policy hits are never reported as an
// error: a run that finds violations returns them in Result with a nil
// error, and a clean run returns Result{} with both slices nil, so
// callers can tell "ran and found things" from "could not run". When a
// later scan fails, findings already gathered by an earlier scan are
// still present in the returned Result.
func CheckAttribution(workdir string, opts Options) (Result, error) {
	var report Result

	if opts.ScanFiles {
		fileHits, err := scanFiles(workdir, opts)
		if err != nil {
			return report, err
		}
		report.Findings = append(report.Findings, fileHits...)
	}

	if !opts.ScanCommits {
		return report, nil
	}

	commitHits, notes, err := scanCommits(workdir, opts)
	if err != nil {
		return report, err
	}
	report.Findings = append(report.Findings, commitHits...)
	report.Notices = append(report.Notices, notes...)
	return report, nil
}

// scanFiles runs the workflow detector over the selected files under
// workdir and returns one Finding per detector hit.
//
// The file set is opts.Paths when non-empty, used as given. Otherwise
// it is every path reported by `git ls-files` whose extension passes
// isTextExtension. Paths listed in opts.Excludes, and this gate's own
// source file, are skipped. Files that cannot be read, and files with
// a NUL byte in their first 8000 bytes, are skipped as well.
//
// The error is non-nil only when the tracked files cannot be listed or
// the detector cannot be built.
func scanFiles(workdir string, opts Options) ([]Finding, error) {
	paths := opts.Paths
	if len(paths) == 0 {
		// -z yields NUL-terminated, unquoted paths. Without it git
		// C-quotes names containing non-ASCII or special bytes, and the
		// quoted form neither passes the extension filter nor names a
		// real file, so such files would silently escape the scan.
		out, err := runGit(workdir, "ls-files", "-z")
		if err != nil {
			return nil, fmt.Errorf("list tracked files: %w", err)
		}
		for p := range strings.SplitSeq(out, "\x00") {
			// The final terminator leaves one empty element; names are
			// otherwise taken verbatim (no whitespace trimming).
			if p == "" || !isTextExtension(p) {
				continue
			}
			paths = append(paths, p)
		}
	}

	excluded := make(map[string]bool, len(opts.Excludes)+1)
	excluded["internal/gates/attribution.go"] = true
	for _, e := range opts.Excludes {
		excluded[filepath.ToSlash(e)] = true
	}

	det, err := workflow.NewDetector(nil)
	if err != nil {
		return nil, fmt.Errorf("build detector: %w", err)
	}

	var findings []Finding
	for _, rel := range paths {
		// Exclude keys are slash-normalised, so normalise the lookup
		// key too; otherwise a caller-supplied path written with OS
		// separators would never match its exclude entry.
		if excluded[filepath.ToSlash(rel)] {
			continue
		}
		b, err := os.ReadFile(filepath.Join(workdir, rel))
		if err != nil {
			// Best effort: an unreadable entry (for example a tracked
			// file missing from the working tree) is skipped rather
			// than failing the whole gate.
			continue
		}
		// Cheap binary sniff so a JSON-like blob with a NUL skips.
		if bytes.IndexByte(b[:min(len(b), 8000)], 0) != -1 {
			continue
		}
		for _, hit := range det.Scan(rel, string(b)) {
			findings = append(findings, Finding{
				Path:    rel,
				Line:    hit.Line,
				Excerpt: readLine(b, hit.Line),
			})
		}
	}
	return findings, nil
}

// scanCommits checks the message of every commit in the selected range
// against strictTrailerPatterns and returns at most one Finding per
// commit, together with any notices the caller should surface.
//
// The range is opts.Range when set. Otherwise it is origin/main..HEAD
// if origin/main resolves, else HEAD~10..HEAD with a notice naming the
// fallback. If the range cannot be enumerated the scan is skipped
// rather than failed, and a notice explains why unless the fallback
// notice already covers it.
//
// The returned error is currently always nil; it is part of the
// signature so infrastructure failures can be reported later without
// changing callers.
func scanCommits(workdir string, opts Options) ([]Finding, []string, error) {
	var notices []string
	rng := opts.Range
	if rng == "" {
		if _, err := runGit(workdir, "rev-parse", "--verify", "--quiet", "origin/main"); err == nil {
			rng = "origin/main..HEAD"
		} else {
			rng = "HEAD~10..HEAD"
			notices = append(notices, "origin/main not resolvable; commit-body scan range falls back to "+rng)
		}
	}

	out, err := runGit(workdir, "rev-list", rng)
	if err != nil {
		// Empty range (e.g. HEAD has no ancestor for HEAD~10..HEAD in a
		// shallow repo): treat as no commits, not an infrastructure
		// failure. In the fallback case the notice already names the
		// range. For an explicit or origin/main range nothing has been
		// reported yet, so say why the scan was skipped — otherwise a
		// mistyped range would make the gate pass silently.
		if len(notices) == 0 {
			notices = append(notices, "commit-body scan skipped: "+err.Error())
		}
		return nil, notices, nil
	}

	var findings []Finding
	for sha := range strings.FieldsSeq(out) {
		body, err := runGit(workdir, "log", "-1", "--format=%B", sha)
		if err != nil {
			// Best effort: a commit whose message cannot be read is
			// skipped rather than aborting the remaining commits.
			continue
		}
		for _, p := range strictTrailerPatterns {
			loc := p.FindStringIndex(body)
			if loc == nil {
				continue
			}
			findings = append(findings, Finding{
				Commit: shortSHA(sha),
				// 1-based line of the match within the commit message.
				Line:    strings.Count(body[:loc[0]], "\n") + 1,
				Excerpt: strings.TrimRight(body[loc[0]:loc[1]], "\r\n"),
			})
			break // one hit per commit is enough — keep reports terse
		}
	}
	return findings, notices, nil
}

// runGit runs `git args...` with workdir as the working directory and
// returns everything the command wrote to stdout.
//
// On failure the returned error starts with "git <args>: " and wraps
// the underlying error, so callers can still use errors.As /
// errors.Is on it (for example to reach the *exec.ExitError and its
// exit code). When git ran and exited non-zero with a message on
// stderr, that message is included; stdout is discarded on failure.
func runGit(workdir string, args ...string) (string, error) {
	cmd := exec.Command("git", args...)
	cmd.Dir = workdir
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		argv := strings.Join(args, " ")
		var exitErr *exec.ExitError
		// Quote stderr only when git ran and actually said something;
		// quiet invocations exit non-zero with empty stderr, which
		// would otherwise leave a dangling "git ...: " message.
		if msg := strings.TrimSpace(stderr.String()); msg != "" && errors.As(err, &exitErr) {
			return "", fmt.Errorf("git %s: %s: %w", argv, msg, err)
		}
		return "", fmt.Errorf("git %s: %w", argv, err)
	}
	return stdout.String(), nil
}

// shortSHA abbreviates a commit hash to its first seven bytes for
// display. Inputs shorter than seven bytes are returned unchanged.
func shortSHA(sha string) string {
	const abbrevLen = 7
	if len(sha) < abbrevLen {
		return sha
	}
	return sha[:abbrevLen]
}

// isTextExtension reports whether path's extension is in the
// textExtensions allowlist.
//
// The extension is tried exactly as written first, then lower-cased.
// The exact attempt is what lets case-sensitive keys such as ".S"
// match: lower-casing alone would turn it into ".s", which is not in
// the table. The lower-cased attempt keeps variants such as
// "README.MD" matching.
func isTextExtension(path string) bool {
	ext := filepath.Ext(path)
	if textExtensions[ext] {
		return true
	}
	return textExtensions[strings.ToLower(ext)]
}

// readLine extracts line n of b, counting from 1 and splitting on '\n'.
// Any carriage returns at the end of the line are removed. The text
// after the last newline counts as a line even when it is empty. A
// non-positive n, or an n past the last line, yields "".
func readLine(b []byte, n int) string {
	if n < 1 {
		return ""
	}
	lineNo, lineStart := 1, 0
	// Walk one position past the final byte so that end-of-input acts
	// as the terminator of the last line.
	for pos := 0; pos <= len(b); pos++ {
		if pos < len(b) && b[pos] != '\n' {
			continue
		}
		if lineNo == n {
			return strings.TrimRight(string(b[lineStart:pos]), "\r")
		}
		lineNo++
		lineStart = pos + 1
	}
	return ""
}