Go 105 lines
package workflow
import (
"bufio"
"fmt"
"regexp"
"strings"
)
// Detector scans text for AI-attribution fingerprints. The comment-hygiene
// and leak-guard workflows both rely on it, and it is what enforces
// Constraint 3 ("no AI attribution"), a rule eeco also applies to its own
// repository.
//
// The detector must not flag its own source. The sensitive trigger
// literals are therefore joined from separate fragments, so this file holds
// no contiguous attribution string when eeco scans its own tracked tree.
// The trailer rule is anchored to the start of a line: naming the trailer
// in prose (documentation, for instance) is not a hit, only a real trailer
// line is.
type Detector struct {
	// patterns is the ordered rule list. Scan tries the rules in this
	// order and reports the first one that matches a line.
	patterns []namedPattern
}
// namedPattern pairs one compiled rule with the label that is reported
// when the rule matches.
type namedPattern struct {
	what string         // label copied into Finding.Msg on a match
	re   *regexp.Regexp // compiled expression tested against each line
}
// Pattern fragments. Every sensitive token is spelled as separate pieces
// so the complete trigger never appears verbatim in tracked source. They
// are constants: the denylist building blocks are fixed and must not be
// reassigned.
const (
	coAuthored = "[Cc]o-" + "[Aa]uthored-" + "[Bb]y"
	genVerb    = "[Gg]enerated"

	// assistanten lists the tool tokens. The word boundaries stop a token
	// from matching inside a longer word (the "ai" in "fair"), and the
	// acronyms are upper-case only instead of relying on a global (?i), so
	// a lower-case "ai" standing alone in ordinary prose does not trip the
	// gate either. False positives of that kind would make the gate
	// untrustworthy (Constraint 5). Generic words such as "model" are left
	// out for the same reason; operators supply project-specific patterns
	// through the extra argument of NewDetector.
	assistanten = `\b(?:[Aa]ssistant|[Aa]gent|[Cc]opilot|[Bb]ot|AI|CLI|LLM)\b`

	// robotEmoji is U+1F916 written as a regexp escape, so the glyph
	// itself is not present in this file.
	robotEmoji = `\x{1F916}`
)
// NewDetector returns a Detector holding the built-in denylist followed by
// one rule per entry of extra. Each extra entry is compiled as a regular
// expression. The first entry that fails to compile aborts construction
// with an error naming its index and text, so a mistyped pattern is loud
// rather than silently ignored.
func NewDetector(extra []string) (*Detector, error) {
	// Built-in rules, in the order they are tried.
	rules := []namedPattern{
		// A real trailer line. The start-of-line anchor keeps a prose or
		// backticked mention of the trailer name from being flagged.
		{what: "co-authored-by trailer", re: regexp.MustCompile(`(?m)^\s*` + coAuthored + `:\s*\S`)},
		// The "<verb> with/by <AI-ish tool>" co-marketing line.
		{what: "generated-by attribution", re: regexp.MustCompile(genVerb + ` (?:[Ww]ith|[Bb]y) [^\n]{0,40}?` + assistanten)},
		// A line where the robot emoji is followed shortly by the verb.
		{what: "robot-emoji attribution", re: regexp.MustCompile(`(?m)` + robotEmoji + `[^\n]{0,20}` + genVerb)},
	}

	// Operator-supplied rules are appended after the defaults.
	for idx := range extra {
		src := extra[idx]
		compiled, err := regexp.Compile(src)
		if err != nil {
			return nil, fmt.Errorf("attribution_pattern[%d] %q: %w", idx, src, err)
		}
		rules = append(rules, namedPattern{what: "configured pattern", re: compiled})
	}
	return &Detector{patterns: rules}, nil
}
// Scan checks content one line at a time and returns a Finding for every
// line that matches a pattern, or nil when no line does. path is copied
// into each Finding for reporting and is never inspected. Patterns are
// tried in order and a line is reported once, under the first pattern that
// matches, which keeps reports terse. Line numbers start at 1.
//
// Lines of any length are scanned. Content is read with ReadString rather
// than a bufio.Scanner because a Scanner stops at the first line longer
// than its token limit, and the lines after it would go unchecked without
// any signal to the caller.
func (d *Detector) Scan(path, content string) []Finding {
	var out []Finding
	r := bufio.NewReader(strings.NewReader(content))
	ln := 0
	for {
		raw, err := r.ReadString('\n')
		if raw != "" {
			ln++
			// Strip the terminator ("\n" or "\r\n"). A final line without
			// a newline arrives together with io.EOF and is still checked.
			line := strings.TrimSuffix(strings.TrimSuffix(raw, "\n"), "\r")
			for _, p := range d.patterns {
				// Matching a single line at a time keeps the trailer
				// rule's line anchor meaningful and the reported line
				// number exact.
				if p.re.MatchString(line) {
					out = append(out, Finding{Path: path, Line: ln, Msg: p.what})
					break
				}
			}
		}
		if err != nil {
			// End of input: nothing further to read.
			return out
		}
	}
}
// ScanResponse adapts the detector to ai.ResponseScanner. It scans the body
// of an AI response and returns one "line N: <rule>" description per
// flagged line, or nil when the body is clean. The signature matches
// ai.ResponseScanner without this package importing internal/ai, so the
// cmd / tui layer can wire in d.ScanResponse with no import cycle.
func (d *Detector) ScanResponse(text string) []string {
	hits := d.Scan("ai-response", text)
	n := len(hits)
	if n == 0 {
		return nil
	}

	lines := make([]string, n)
	for i := range hits {
		lines[i] = fmt.Sprintf("line %d: %s", hits[i].Line, hits[i].Msg)
	}
	return lines
}