Go 71 lines
package workflow
import (
"bytes"
"io/fs"
"os"
"path/filepath"
"strings"
)
// maxScanBytes is the largest file size, in bytes, that walkText will
// read. Anything bigger is skipped before it is opened: attribution
// fingerprints live in source and docs, not in large generated blobs.
const maxScanBytes = 4 * 1024 * 1024 // 4 MiB
// walkText visits every regular text file beneath root and calls
// fn(rel, content) for it, where rel is the slash-separated path of the
// file relative to root and content is the whole file as a string.
//
// Directories named ".git" or workspaceName are pruned wherever they
// appear below root; root itself is always descended into. The following
// are skipped silently: entries that are not regular files, files whose
// size exceeds maxScanBytes, files that cannot be stat'ed or read, files
// isText rejects, and paths filepath.Rel cannot express relative to root.
//
// An error that filepath.WalkDir passes to the callback is returned from
// the callback unchanged, as is whatever fn returns.
func walkText(root, workspaceName string, fn func(rel, content string) error) error {
	visit := func(path string, de fs.DirEntry, err error) error {
		if err != nil {
			return err
		}

		if de.IsDir() {
			// Only directories below root are candidates for pruning.
			if path != root {
				switch de.Name() {
				case ".git", workspaceName:
					return filepath.SkipDir
				}
			}
			return nil
		}

		if !de.Type().IsRegular() {
			return nil
		}

		// Check the size first so an oversized file is never loaded.
		info, statErr := de.Info()
		if statErr != nil {
			return nil
		}
		if info.Size() > maxScanBytes {
			return nil
		}

		data, readErr := os.ReadFile(path)
		if readErr != nil {
			return nil
		}
		if !isText(data) {
			return nil
		}

		rel, relErr := filepath.Rel(root, path)
		if relErr != nil {
			return nil
		}
		return fn(filepath.ToSlash(rel), string(data))
	}

	return filepath.WalkDir(root, visit)
}
// isText reports whether b looks like text. Only the first 8000 bytes
// are inspected; a NUL byte anywhere in that prefix marks b as binary.
// Empty input counts as text. Cheap and good enough for source/doc trees.
func isText(b []byte) bool {
	const sniffLen = 8000

	head := b
	if len(head) > sniffLen {
		head = head[:sniffLen]
	}
	return bytes.IndexByte(head, 0) == -1
}
// splitLines breaks content at every "\n" and strips one trailing "\r"
// from each piece, so a CRLF file yields the same line text as an LF
// one. The result always has one more element than content has
// newlines: empty input yields a single empty line, and content ending
// in a newline yields a final empty line.
func splitLines(content string) []string {
	lines := make([]string, 0, strings.Count(content, "\n")+1)
	rest := content
	for {
		line, tail, found := strings.Cut(rest, "\n")
		lines = append(lines, strings.TrimSuffix(line, "\r"))
		if !found {
			return lines
		}
		rest = tail
	}
}