ajhahn.de
← eeco
Go 105 lines
package workflow

import (
	"bufio"
	"fmt"
	"regexp"
	"strings"
)

// Detector finds AI-attribution fingerprints in text. It backs both the
// comment-hygiene and leak-guard workflows and enforces Constraint 3
// ("no AI attribution") — which eeco applies to its own repository.
//
// Self-clean by construction: the sensitive trigger literals are
// assembled from fragments rather than written out, so this source file
// contains no contiguous attribution string for the detector to flag
// when it scans eeco's own tracked tree. The trailer rule is
// line-anchored so a prose mention of the trailer's name (for example
// in documentation) is not a false positive — only an actual trailer
// line is.
//
// Build a Detector with NewDetector; the zero value has no patterns and
// therefore never reports a finding.
type Detector struct {
	// patterns holds the rules in evaluation order: the built-in
	// denylist first, then any operator-supplied patterns. Order matters
	// because Scan reports only the first rule that matches a line.
	patterns []namedPattern
}

// namedPattern pairs a compiled rule with the label that is reported
// when the rule matches.
type namedPattern struct {
	what string         // label copied into Finding.Msg on a match
	re   *regexp.Regexp // compiled rule, evaluated against one line at a time
}

// Fragment assembly: each value below is a regular-expression fragment,
// not a finished pattern. Keeping the fragments split means the full
// trigger token never appears verbatim in tracked source; NewDetector
// joins them into the final expressions.
var (
	// coAuthored matches the trailer key; only the first letter of each
	// hyphen-separated word may vary in case.
	coAuthored = "[Cc]o-" + "[Aa]uthored-" + "[Bb]y"
	// genVerb matches the verb shared by the two non-trailer rules, with
	// either case for its first letter.
	genVerb    = "[Gg]enerated"
	// Tool tokens are word-bounded and case-scoped on purpose: a global
	// (?i) would let the bare letters "ai" inside ordinary prose (for
	// example "fair") trip the gate, which would make it untrustworthy
	// (Constraint 5). Generic words like "model" are excluded for the
	// same reason; operators add project-specific tokens via config.
	// Acronyms are matched in upper case only; the other tokens accept
	// either case for their first letter.
	assistanten = `\b(?:[Aa]ssistant|[Aa]gent|[Cc]opilot|[Bb]ot|AI|CLI|LLM)\b`
	// robotEmoji is the regexp escape for code point U+1F916. The doubled
	// backslash hands the regexp engine the escape sequence itself, so
	// the glyph is not written as a literal here.
	robotEmoji  = "\\x{1F916}" // U+1F916; not written as a literal glyph here.
)

// NewDetector returns a Detector loaded with the built-in denylist
// followed by one rule per entry of extra, in the order given. Each
// entry of extra is compiled as a regular expression and reported under
// the label "configured pattern".
//
// A malformed entry aborts construction: the returned Detector is nil
// and the error names the entry's index and source text and wraps the
// compile error, so a typo in configuration is loud rather than silent.
func NewDetector(extra []string) (*Detector, error) {
	// Built-in rules. Their sources are fixed, so MustCompile cannot
	// panic on operator input.
	//
	// Trailer rule: anchored to the start of a line so that a prose or
	// backticked mention of the trailer's name is not flagged.
	trailerRe := regexp.MustCompile(`(?m)^\s*` + coAuthored + `:\s*\S`)
	// Verb, then "with" or "by", then a tool token at most 40
	// characters further along the same line.
	toolRe := regexp.MustCompile(genVerb + ` (?:[Ww]ith|[Bb]y) [^\n]{0,40}?` + assistanten)
	// Robot emoji followed by the verb at most 20 characters later on
	// the same line.
	emojiRe := regexp.MustCompile(`(?m)` + robotEmoji + `[^\n]{0,20}` + genVerb)

	rules := make([]namedPattern, 0, 3+len(extra))
	rules = append(rules,
		namedPattern{what: "co-authored-by trailer", re: trailerRe},
		namedPattern{what: "generated-by attribution", re: toolRe},
		namedPattern{what: "robot-emoji attribution", re: emojiRe},
	)

	for idx, src := range extra {
		compiled, err := regexp.Compile(src)
		if err != nil {
			return nil, fmt.Errorf("attribution_pattern[%d] %q: %w", idx, src, err)
		}
		rules = append(rules, namedPattern{what: "configured pattern", re: compiled})
	}
	return &Detector{patterns: rules}, nil
}

// Scan returns one Finding per matching line of content, in line order.
// path is recorded on each Finding for reporting; it is not inspected. A
// line that trips several patterns is reported once (first match wins)
// to keep reports terse. Line numbers are 1-based. The result is nil
// when nothing matches.
//
// Lines end at '\n'; the terminator and at most one preceding '\r' are
// stripped before matching, and a final line without a terminator is
// still scanned.
//
// Lines are read without a length cap. A bufio.Scanner stops with
// ErrTooLong at the first line longer than its buffer limit, and without
// an error return Scan could only have dropped that line and everything
// after it — letting attribution text behind one very long line (for
// example a minified asset) pass the gate unnoticed.
func (d *Detector) Scan(path, content string) []Finding {
	var out []Finding
	r := bufio.NewReaderSize(strings.NewReader(content), 64*1024)
	for ln := 1; ; ln++ {
		// ReadString returns whatever it read together with a non-nil
		// error when the input ends before the delimiter. The source is
		// an in-memory string, so the only possible error is end of
		// input.
		raw, err := r.ReadString('\n')
		if raw != "" {
			line := strings.TrimSuffix(strings.TrimSuffix(raw, "\n"), "\r")
			for _, p := range d.patterns {
				// The trailer pattern is intentionally
				// multi-line-anchored; evaluating it per line keeps the
				// anchor meaningful and the line number exact.
				if p.re.MatchString(line) {
					out = append(out, Finding{Path: path, Line: ln, Msg: p.what})
					break
				}
			}
		}
		if err != nil {
			return out
		}
	}
}

// ScanResponse adapts the detector to ai.ResponseScanner. It scans an AI
// response body under the fixed path label "ai-response" and returns one
// "line N: <what>" description per flagged line, or nil when the body is
// clean. The signature matches ai.ResponseScanner without importing
// internal/ai, so the cmd / tui layer can wire d.ScanResponse with no
// import cycle.
func (d *Detector) ScanResponse(text string) []string {
	hits := d.Scan("ai-response", text)
	if len(hits) == 0 {
		return nil
	}
	descs := make([]string, len(hits))
	for i := range hits {
		descs[i] = fmt.Sprintf("line %d: %s", hits[i].Line, hits[i].Msg)
	}
	return descs
}