ajhahn.de
← eeco
Go 355 lines
package workflow

import (
	"os"
	"os/exec"
	"path/filepath"
	"strings"
)

// CommitGuardResult is the outcome of inspecting a candidate shell
// command for a `git commit` that would carry AI attribution. IsCommit
// is true when a real `git commit` segment was detected — token-based,
// never substring, so `echo "git commit"` does not qualify. Findings is
// the union of attribution findings across the assembled commit message,
// the staged diff, and the raw command string; an empty Findings means
// allow.
type CommitGuardResult struct {
	// IsCommit reports whether at least one command segment was
	// recognized as a `git commit` invocation.
	IsCommit bool
	// Findings accumulates detector matches in scan order: commit
	// message(s) first, then the staged diff, then the raw command. It
	// stays empty when the command is not a commit.
	Findings []Finding
}

// stagedDiff returns the staged diff for the repo rooted at cwd. It is a
// package var so the test suite can stub it; the default shells out to
// git and is degrade-open — any error yields "" so the guard never
// blocks on infrastructure trouble (not a repo, git missing, …).
//
// The diff is requested with --no-color and --no-ext-diff so the text
// handed to the detector is git's plain unified diff regardless of user
// configuration: color.ui=always would otherwise interleave ANSI escape
// sequences with the content, and diff.external / GIT_EXTERNAL_DIFF
// would replace the output with whatever an external tool prints.
var stagedDiff = func(cwd string) string {
	cmd := exec.Command("git", "-C", cwd, "diff", "--cached", "--no-color", "--no-ext-diff")
	out, err := cmd.Output()
	if err != nil {
		// Degrade-open: an unusable repo or missing git means "nothing to scan".
		return ""
	}
	return string(out)
}

// ScanCommitGuard examines command — the complete Bash command string of
// a Claude Code PreToolUse tool call — and, when it contains a pending
// `git commit`, scans everything that commit would carry for AI
// attribution. cwd is the hook's working directory (the repository the
// commit targets). det is the shared attribution detector, the same one
// behind leak-guard and the pre-write scanner, so the patterns live in
// one place.
//
// The contract is degrade-open: Findings only ever holds positive
// detector matches. Anything uncertain — the command is not a commit,
// the message cannot be resolved statically (heredoc / command
// substitution), a file cannot be read, git is unavailable — produces no
// findings, so a harness session never gets wedged. The git pre-commit
// hook and CI leak-guard remain the hard gates; this guard reliably
// covers the common case of an inline `-m` trailer.
func ScanCommitGuard(det *Detector, command, cwd string) CommitGuardResult {
	result := CommitGuardResult{}

	// Pass 1: every segment that is a real `git commit` contributes its
	// reconstructed message, when one can be resolved.
	for _, segment := range commandSegments(command) {
		if isGitCommit(segment) {
			result.IsCommit = true
			message := assembleMessage(segment, cwd)
			if message == "" {
				continue
			}
			result.Findings = append(result.Findings, det.Scan("commit message", message)...)
		}
	}

	if result.IsCommit {
		// Pass 2: the staged diff, which catches a non-line-anchored
		// generated-by line added to a file rather than to the message.
		staged := stagedDiff(cwd)
		if staged != "" {
			result.Findings = append(result.Findings, det.Scan("staged diff", staged)...)
		}
		// Pass 3 (belt): the raw command string, which catches a trailer
		// or generated-by line embedded with a real newline.
		result.Findings = append(result.Findings, det.Scan("command", command)...)
	}
	return result
}

// --- shell command lexing -------------------------------------------

// tokKind distinguishes a word from a command separator.
type tokKind int

const (
	// tokWord is a shell word after quote removal.
	tokWord tokKind = iota
	// tokSep is an unquoted command separator; its text is empty.
	tokSep
)

// token is one lexed unit: a word with its unquoted text, or a
// separator.
type token struct {
	kind tokKind
	text string
}

// commandSegments splits a shell command into independent segments at
// unquoted separators (&&, ||, ;, |, &, newline) and returns each
// segment as its quote-aware word list. Empty segments (two separators
// in a row, a leading or trailing separator) are dropped. It is
// best-effort: a construct it cannot statically resolve (command
// substitution, heredoc) degrades to literal text, which the detector
// then finds nothing in — allow.
func commandSegments(command string) [][]string {
	toks := lex(command)
	var segs [][]string
	var cur []string
	for _, t := range toks {
		if t.kind == tokSep {
			if len(cur) > 0 {
				segs = append(segs, cur)
				cur = nil
			}
			continue
		}
		cur = append(cur, t.text)
	}
	if len(cur) > 0 {
		segs = append(segs, cur)
	}
	return segs
}

// lex tokenizes s into words and separators, honoring single quotes,
// double quotes (with the standard backslash escapes), and backslash
// escaping. Adjacent quoted and unquoted runs concatenate into one word,
// matching shell word-splitting. Operator characters inside quotes are
// literal, so a `;` or `&&` inside an `-m "…"` value never splits.
//
// A backslash-newline pair (line continuation) is removed without
// starting a word, as the shell does. This matters for the usual
// multi-line layout
//
//	git commit -m \
//	    "message"
//
// where treating the continuation as the start of a word would emit an
// empty word between `-m` and the message, and the empty word would be
// taken as the message.
//
// An unterminated quote runs to the end of the input.
func lex(s string) []token {
	var toks []token
	var buf strings.Builder
	// hasWord records that a word is in progress even while buf is still
	// empty, so an empty quoted string ("" or '') yields an empty word.
	hasWord := false
	flush := func() {
		if hasWord {
			toks = append(toks, token{tokWord, buf.String()})
			buf.Reset()
			hasWord = false
		}
	}
	emitSep := func() {
		flush()
		toks = append(toks, token{kind: tokSep})
	}
	i, n := 0, len(s)
	for i < n {
		c := s[i]
		switch c {
		case '\n', ';':
			emitSep()
			i++
		case ' ', '\t', '\r':
			// Unquoted blanks end the current word but separate nothing.
			flush()
			i++
		case '&':
			// `&` and `&&` both end the segment; one separator either way.
			emitSep()
			if i+1 < n && s[i+1] == '&' {
				i += 2
			} else {
				i++
			}
		case '|':
			// `|` and `||` both end the segment; one separator either way.
			emitSep()
			if i+1 < n && s[i+1] == '|' {
				i += 2
			} else {
				i++
			}
		case '\'':
			// Single quotes: every byte up to the closing quote is literal.
			hasWord = true
			i++
			for i < n && s[i] != '\'' {
				buf.WriteByte(s[i])
				i++
			}
			if i < n {
				i++ // closing quote
			}
		case '"':
			// Double quotes: a backslash escapes only ", \, ` and $, and
			// backslash-newline is dropped; any other backslash is kept
			// literally together with the byte that follows it.
			hasWord = true
			i++
			for i < n && s[i] != '"' {
				if s[i] == '\\' && i+1 < n {
					switch s[i+1] {
					case '"', '\\', '`', '$':
						buf.WriteByte(s[i+1])
						i += 2
						continue
					case '\n':
						i += 2 // line continuation
						continue
					}
				}
				buf.WriteByte(s[i])
				i++
			}
			if i < n {
				i++ // closing quote
			}
		case '\\':
			if i+1 < n && s[i+1] == '\n' {
				// Line continuation: contributes nothing and must not
				// start a word, otherwise indentation on the next line
				// would flush a spurious empty word.
				i += 2
				continue
			}
			hasWord = true
			if i+1 < n {
				buf.WriteByte(s[i+1]) // escaped byte is literal
				i += 2
			} else {
				i++ // lone trailing backslash
			}
		default:
			hasWord = true
			buf.WriteByte(c)
			i++
		}
	}
	flush()
	return toks
}

// --- git commit detection -------------------------------------------

// isGitCommit reports whether the word list of one segment invokes
// `git commit`. Leading NAME=VALUE env assignments are skipped, the
// program token must be git (bare or a path ending in /git), and git's
// global options are then walked up to the subcommand, which must be
// exactly "commit". A bare `--` ahead of any subcommand, or any other
// subcommand, disqualifies — so `git status` and `git log -m commit`
// never fire.
func isGitCommit(words []string) bool {
	rest := words
	for len(rest) > 0 && isEnvAssign(rest[0]) {
		rest = rest[1:]
	}
	if len(rest) == 0 || !isGitProg(rest[0]) {
		return false
	}
	rest = rest[1:] // drop the git program token

	for len(rest) > 0 {
		arg := rest[0]
		switch {
		case arg == "--":
			// End of options reached without a subcommand.
			return false
		case !strings.HasPrefix(arg, "-"):
			// The first bare token is the subcommand.
			return arg == "commit"
		}

		// A plain flag or an --opt=val token occupies one word; the
		// global options below also swallow the word that follows.
		skip := 1
		switch arg {
		case "-C", "-c", "--git-dir", "--work-tree",
			"--namespace", "--exec-path", "--super-prefix",
			"--config-env":
			skip = 2
		}
		if skip >= len(rest) {
			return false
		}
		rest = rest[skip:]
	}
	return false
}

// isGitProg reports whether tok names the git program: either the bare
// word "git" or a slash-separated path whose final element is "git".
func isGitProg(tok string) bool {
	dir, endsInGit := strings.CutSuffix(tok, "git")
	if !endsInGit {
		return false
	}
	return dir == "" || strings.HasSuffix(dir, "/")
}

// isEnvAssign reports whether w has the shape NAME=VALUE with NAME a
// shell identifier (letters, digits and underscores, not starting with a
// digit), e.g. GIT_AUTHOR_NAME=x in front of `git …`. Only the text
// before the first '=' is inspected; VALUE may be anything, including
// empty.
func isEnvAssign(w string) bool {
	name, _, found := strings.Cut(w, "=")
	if !found || name == "" {
		return false
	}
	for idx := 0; idx < len(name); idx++ {
		ch := name[idx]
		letter := ch == '_' || ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z')
		digit := '0' <= ch && ch <= '9'
		if letter || (digit && idx > 0) {
			continue
		}
		return false
	}
	return true
}

// --- commit message assembly ----------------------------------------

// assembleMessage reconstructs the text a `git commit` segment would
// commit as its message. -m/--message values join with a blank line (how
// git forms paragraphs), so an attribution trailer lands at line-start
// for the line-anchored detector pattern. It falls back to -F/--file
// (read relative to cwd) and then to <cwd>/.git/COMMIT_EDITMSG. An
// unresolved or unreadable source returns "" (degrade-open).
//
// Short options are read the way git's option parser reads them: flags
// may be bundled (-am), and the first flag that takes a value consumes
// either the text stuck to it (-mMSG, -amMSG, -FPATH) or, when nothing
// is stuck, the following word (-m MSG, -am MSG, -aF PATH). A single
// leading '=' on a stuck value is dropped so -m=MSG still puts MSG at
// line-start.
//
// words is the full word list of the segment (including `git` and
// `commit`); cwd is the directory relative paths are resolved against.
func assembleMessage(words []string, cwd string) string {
	var parts []string
	var filePath string
	for i := 0; i < len(words); i++ {
		w := words[i]
		var kind byte // 'm': message text, 'F': message file
		var val string
		needNext := false // the value is the following word
		switch {
		case w == "--message" || clusterEndsInM(w):
			kind, needNext = 'm', true
		case w == "--file":
			kind, needNext = 'F', true
		case strings.HasPrefix(w, "--message="):
			kind, val = 'm', w[len("--message="):]
		case strings.HasPrefix(w, "--file="):
			kind, val = 'F', w[len("--file="):]
		default:
			flag, stuck, ok := shortOptArg(w)
			if !ok || (flag != 'm' && flag != 'F') {
				continue // not a message source
			}
			kind = flag
			if stuck == "" {
				needNext = true // -m MSG, -F PATH, -aF PATH
			} else {
				val = strings.TrimPrefix(stuck, "=") // -mMSG, -m=MSG, -amMSG, -FPATH
			}
		}
		if needNext {
			if i+1 >= len(words) {
				continue // option given without its value: nothing to scan
			}
			i++
			val = words[i]
		}
		if kind == 'm' {
			parts = append(parts, val)
		} else {
			filePath = val
		}
	}
	if len(parts) > 0 {
		return strings.Join(parts, "\n\n")
	}
	if filePath != "" {
		if !filepath.IsAbs(filePath) {
			filePath = filepath.Join(cwd, filePath)
		}
		b, err := os.ReadFile(filePath)
		if err != nil {
			return "" // unreadable -F source: degrade-open
		}
		return string(b)
	}
	if b, err := os.ReadFile(filepath.Join(cwd, ".git", "COMMIT_EDITMSG")); err == nil {
		return string(b)
	}
	return ""
}

// shortOptArg scans w, a single-dash short-option word such as -m, -am,
// -amMSG or -FPATH, for the first flag that takes a value. It returns
// that flag and whatever text is stuck to it (possibly empty). ok is
// false when w is not a single-dash option word, when a non-letter
// appears before any value-taking flag, or when no flag in w takes a
// value.
//
// The value-taking flags recognized are those of `git commit` and git's
// global options: m (message), F (file), C and c (reuse/reedit message,
// or global -C <path> / -c <name>=<value>), t (template), S (gpg-sign
// key id) and u (untracked-files mode). Everything after such a flag
// belongs to its value, so scanning stops there — which is what keeps
// -Smykey from being mistaken for a bundle containing -m.
func shortOptArg(w string) (flag byte, stuck string, ok bool) {
	if len(w) < 2 || w[0] != '-' || w[1] == '-' {
		return 0, "", false
	}
	for j := 1; j < len(w); j++ {
		c := w[j]
		switch {
		case strings.IndexByte("mFCctSu", c) >= 0:
			return c, w[j+1:], true
		case (c < 'a' || c > 'z') && (c < 'A' || c > 'Z'):
			return 0, "", false
		}
	}
	return 0, "", false
}

// clusterEndsInM reports whether w is a combined short-flag cluster whose
// last flag is m (so it consumes the next token as the message), e.g.
// -am or -sm. Pure -m (length 2) is handled separately. The m must be
// the first value-taking flag in the word: -mfixm is -m with the stuck
// value "fixm", not a cluster, and -Sm is -S with the key id "m".
func clusterEndsInM(w string) bool {
	if len(w) < 3 {
		return false
	}
	flag, stuck, ok := shortOptArg(w)
	return ok && flag == 'm' && stuck == ""
}