Go 355 lines
package workflow
import (
"os"
"os/exec"
"path/filepath"
"strings"
)
// CommitGuardResult is the outcome of inspecting a candidate shell
// command for a `git commit` that would carry AI attribution. It is the
// return value of ScanCommitGuard; an empty Findings means allow.
type CommitGuardResult struct {
// IsCommit is true when a real `git commit` segment was detected. The
// check is token-based, never substring, so `echo "git commit"` does
// not qualify.
IsCommit bool
// Findings is the union of attribution findings across the assembled
// commit message, the staged diff, and the raw command string. It is
// only populated when IsCommit is true.
Findings []Finding
}
// stagedDiff yields the output of `git -C <cwd> diff --cached`, i.e. the
// staged diff of the repository at cwd. It is declared as a package
// variable rather than a function so tests can swap in a stub.
//
// The default implementation is degrade-open: whenever running git
// fails (cwd is not a repo, git is not installed, …) it reports "" so
// the guard never blocks because of infrastructure trouble.
var stagedDiff = func(cwd string) string {
	diff, err := exec.Command("git", "-C", cwd, "diff", "--cached").Output()
	if err == nil {
		return string(diff)
	}
	return ""
}
// ScanCommitGuard checks command — the full Bash command string from a
// Claude Code PreToolUse tool call — for a pending `git commit` and, if
// one is present, scans the text that commit would carry for AI
// attribution.
//
// cwd is the hook's working directory (the repo the commit targets).
// det is the shared attribution detector that also backs leak-guard and
// the pre-write scanner, so the patterns stay one source of truth.
//
// Three texts are scanned, in this order, once a commit segment has been
// seen: the assembled message of every `git commit` segment, the staged
// diff, and the raw command string. When no segment is a `git commit`
// the zero CommitGuardResult is returned and nothing else is scanned.
//
// Degrade-open by contract: Findings is populated only on a positive
// detector match. Any parse/infra uncertainty — not a commit, a message
// that cannot be statically resolved (heredoc / command substitution),
// an unreadable file, git unavailable — yields no findings, so a harness
// session is never wedged. Defense-in-depth keeps the git pre-commit
// hook and CI leak-guard as the hard gates; this guard covers the common
// case (an inline `-m` trailer) reliably.
func ScanCommitGuard(det *Detector, command, cwd string) CommitGuardResult {
	var findings []Finding
	sawCommit := false

	for _, segment := range commandSegments(command) {
		if !isGitCommit(segment) {
			continue
		}
		sawCommit = true
		message := assembleMessage(segment, cwd)
		if message == "" {
			// Unresolvable or absent message: nothing to scan here.
			continue
		}
		findings = append(findings, det.Scan("commit message", message)...)
	}

	if !sawCommit {
		return CommitGuardResult{}
	}

	// The staged diff catches a non-line-anchored generated-by line that
	// was added to a file rather than to the message.
	diff := stagedDiff(cwd)
	if diff != "" {
		findings = append(findings, det.Scan("staged diff", diff)...)
	}

	// Belt: the raw command string catches a trailer or generated-by line
	// embedded with a real newline inside the command.
	findings = append(findings, det.Scan("command", command)...)

	return CommitGuardResult{IsCommit: true, Findings: findings}
}
// --- shell command lexing -------------------------------------------
// Lexer token model: a token is either a shell word or a command
// separator.
type (
	// tokKind tells a word token apart from a separator token.
	tokKind int

	// token is one lexed unit. text carries the word's unquoted content
	// and is left empty for separators.
	token struct {
		kind tokKind
		text string
	}
)

const (
	// tokWord marks an ordinary shell word; it is the zero value.
	tokWord tokKind = 0
	// tokSep marks an unquoted command separator.
	tokSep tokKind = 1
)
// commandSegments breaks a shell command into independent segments at
// unquoted separators (&&, ||, ;, |, &, newline). Each segment is
// returned as its quote-aware list of words; segments that would be
// empty (for example between two adjacent separators) are dropped.
//
// The split is best-effort: a construct that cannot be resolved
// statically (command substitution, heredoc) degrades to literal text,
// in which the detector then finds nothing — allow.
func commandSegments(command string) [][]string {
	var (
		segments [][]string
		words    []string
	)
	// closeSegment files the words gathered so far as one segment and
	// starts a fresh one; it is a no-op when nothing was gathered.
	closeSegment := func() {
		if len(words) == 0 {
			return
		}
		segments = append(segments, words)
		words = nil
	}

	for _, tk := range lex(command) {
		switch tk.kind {
		case tokSep:
			closeSegment()
		default:
			words = append(words, tk.text)
		}
	}
	closeSegment()
	return segments
}
// lex tokenizes s into words and separators, honoring single quotes,
// double quotes (with the standard backslash escapes), and backslash
// escaping. Adjacent quoted and unquoted runs concatenate into one word,
// matching shell word-splitting. Operator characters inside quotes are
// literal, so a `;` or `&&` inside an `-m "…"` value never splits.
//
// Separators are newline, `;`, `&`, `&&`, `|` and `||`; each yields one
// tokSep token. Unquoted space, tab and carriage return end the current
// word without producing a token.
//
// A backslash-newline is a line continuation and is removed entirely:
// it neither ends nor starts a word, so `git \<newline> commit` lexes to
// exactly the words "git" and "commit". An empty quoted string ('' or
// "") does produce an empty word, as in the shell.
func lex(s string) []token {
	var toks []token
	var buf strings.Builder
	// hasWord is true once the current word has begun, even when buf is
	// still empty (e.g. after an opening quote).
	hasWord := false
	flush := func() {
		if hasWord {
			toks = append(toks, token{tokWord, buf.String()})
			buf.Reset()
			hasWord = false
		}
	}
	emitSep := func() {
		flush()
		toks = append(toks, token{kind: tokSep})
	}

	i, n := 0, len(s)
	for i < n {
		c := s[i]
		switch c {
		case '\n', ';':
			emitSep()
			i++
		case ' ', '\t', '\r':
			flush()
			i++
		case '&', '|':
			// A doubled operator (&& or ||) is a single separator.
			emitSep()
			if i+1 < n && s[i+1] == c {
				i += 2
			} else {
				i++
			}
		case '\'':
			// Single quotes: everything up to the closing quote is literal.
			hasWord = true
			i++
			for i < n && s[i] != '\'' {
				buf.WriteByte(s[i])
				i++
			}
			if i < n {
				i++ // closing quote
			}
		case '"':
			hasWord = true
			i++
			for i < n && s[i] != '"' {
				if s[i] == '\\' && i+1 < n {
					switch s[i+1] {
					case '"', '\\', '`', '$':
						// Only these characters are escapable inside
						// double quotes; the backslash is dropped.
						buf.WriteByte(s[i+1])
						i += 2
						continue
					case '\n':
						i += 2 // line continuation
						continue
					}
				}
				// Any other backslash stays in the word as-is.
				buf.WriteByte(s[i])
				i++
			}
			if i < n {
				i++ // closing quote
			}
		case '\\':
			if i+1 < n && s[i+1] == '\n' {
				// Line continuation: drop both bytes and leave hasWord
				// alone, so a continuation between words does not
				// create an empty word.
				i += 2
				continue
			}
			hasWord = true
			if i+1 < n {
				buf.WriteByte(s[i+1])
				i += 2
			} else {
				i++ // trailing lone backslash
			}
		default:
			hasWord = true
			buf.WriteByte(c)
			i++
		}
	}
	flush()
	return toks
}
// --- git commit detection -------------------------------------------
// isGitCommit reports whether the word list of one command segment
// invokes `git commit`.
//
// Leading NAME=VALUE env assignments are skipped, and the program token
// that follows must be git (bare, or a path ending in /git). After that
// git's global options are walked until the first bare token, which is
// the subcommand and must be exactly "commit". A bare `--` before any
// subcommand, or any other subcommand, disqualifies — so `git status`
// and `git log -m commit` never fire.
func isGitCommit(words []string) bool {
	rest := words
	for len(rest) > 0 && isEnvAssign(rest[0]) {
		rest = rest[1:]
	}
	if len(rest) == 0 || !isGitProg(rest[0]) {
		return false
	}

	for pos := 1; pos < len(rest); pos++ {
		arg := rest[pos]
		if !strings.HasPrefix(arg, "-") {
			// The first bare token is the subcommand.
			return arg == "commit"
		}
		switch arg {
		case "--":
			// End of options without a subcommand.
			return false
		case "-C", "-c", "--git-dir", "--work-tree",
			"--namespace", "--exec-path", "--super-prefix",
			"--config-env":
			// These global options take the following token as their
			// value, so step over it as well.
			pos++
		}
		// Anything else is a plain flag or a single-token --opt=val.
	}
	return false
}
// isGitProg reports whether tok names the git program: either the bare
// word "git" or any path whose final component is git (it ends in
// "/git").
func isGitProg(tok string) bool {
	if tok == "git" {
		return true
	}
	return strings.HasSuffix(tok, "/git")
}
// isEnvAssign reports whether w has the shape of a leading NAME=VALUE
// env assignment, e.g. GIT_AUTHOR_NAME=x before `git …`. NAME is the
// text before the first '=' and must be a non-empty shell identifier:
// letters, digits and underscores, not starting with a digit. VALUE is
// not inspected.
func isEnvAssign(w string) bool {
	eq := strings.IndexByte(w, '=')
	if eq < 1 {
		// No '=' at all, or an empty NAME.
		return false
	}
	name := w[:eq]
	for pos := 0; pos < len(name); pos++ {
		ch := name[pos]
		letterOrUnderscore := ch == '_' ||
			('A' <= ch && ch <= 'Z') ||
			('a' <= ch && ch <= 'z')
		digit := '0' <= ch && ch <= '9'
		if letterOrUnderscore || (digit && pos > 0) {
			continue
		}
		return false
	}
	return true
}
// --- commit message assembly ----------------------------------------
// assembleMessage reconstructs the text a `git commit` segment would
// commit as its message. words is the segment's word list and cwd the
// directory relative paths are resolved against.
//
// Sources, in priority order:
//
//  1. -m/--message values, joined with a blank line (how git forms
//     paragraphs) so an attribution trailer lands at line-start for the
//     line-anchored detector pattern. Recognized spellings are `-m MSG`,
//     `--message MSG`, `--message=MSG`, the attached `-mMSG` / `-m=MSG`,
//     and a short cluster ending in m such as `-am MSG`.
//  2. The file named by -F/--file (`-F PATH`, `--file PATH`,
//     `--file=PATH`, attached `-FPATH`); the last one given wins.
//  3. <cwd>/.git/COMMIT_EDITMSG.
//
// Any word that starts with -m or -F is treated as the attached form,
// and that check runs before the cluster check: -m and -F take a value,
// so whatever follows them in the same word is that value and never
// further flags. This keeps `-mconfirm` from being read as a cluster
// that swallows the following token.
//
// An unresolved or unreadable source returns "" (degrade-open).
func assembleMessage(words []string, cwd string) string {
	var parts []string
	var filePath string
	for i := 0; i < len(words); i++ {
		w := words[i]
		switch {
		case w == "-m" || w == "--message":
			if i+1 < len(words) {
				parts = append(parts, words[i+1])
				i++
			}
		case strings.HasPrefix(w, "--message="):
			parts = append(parts, w[len("--message="):])
		case w == "-F" || w == "--file":
			if i+1 < len(words) {
				filePath = words[i+1]
				i++
			}
		case strings.HasPrefix(w, "--file="):
			filePath = w[len("--file="):]
		case strings.HasPrefix(w, "-m"):
			// -mMSG / -m=MSG; bare -m was handled above, so w[2:] is
			// never empty here.
			parts = append(parts, strings.TrimPrefix(w[2:], "="))
		case strings.HasPrefix(w, "-F"):
			// -FPATH / -F=PATH; bare -F was handled above.
			filePath = strings.TrimPrefix(w[2:], "=")
		case clusterEndsInM(w):
			// A short cluster like -am / -sm: the trailing m takes the
			// next token as the message.
			if i+1 < len(words) {
				parts = append(parts, words[i+1])
				i++
			}
		}
	}

	if len(parts) > 0 {
		return strings.Join(parts, "\n\n")
	}

	if filePath != "" {
		if !filepath.IsAbs(filePath) {
			filePath = filepath.Join(cwd, filePath)
		}
		b, err := os.ReadFile(filePath)
		if err != nil {
			return "" // unreadable message file: degrade-open
		}
		return string(b)
	}

	b, err := os.ReadFile(filepath.Join(cwd, ".git", "COMMIT_EDITMSG"))
	if err != nil {
		return ""
	}
	return string(b)
}
// clusterEndsInM reports whether w looks like a combined short-flag
// cluster whose final flag is m — e.g. -am or -sm — meaning the next
// token is the message. To qualify, w must be a single dash followed by
// at least two ASCII letters, the last of which is a lowercase m. A
// plain -m (length 2) and any --long option are rejected.
func clusterEndsInM(w string) bool {
	last := len(w) - 1
	if last < 2 || w[0] != '-' || w[last] != 'm' {
		return false
	}
	// Every flag before the trailing m must be an ASCII letter; this also
	// rejects a second dash at w[1].
	for _, b := range []byte(w[1:last]) {
		lower := 'a' <= b && b <= 'z'
		upper := 'A' <= b && b <= 'Z'
		if !lower && !upper {
			return false
		}
	}
	return true
}