eeco/internal/workflow/scan.go

Go 71 lines
package workflow

import (
	"bytes"
	"io/fs"
	"os"
	"path/filepath"
	"strings"
)

// maxScanBytes is the upper bound, in bytes, on files the text scanners
// are willing to read. Anything bigger is skipped as if it were
// non-text, since attribution fingerprints are expected in source and
// documentation rather than in large generated blobs.
const maxScanBytes = 4 * 1024 * 1024 // 4 MiB

// walkText traverses the tree rooted at root and invokes
// fn(relPath, content) once per regular text file it finds. relPath is
// relative to root and always uses forward slashes.
//
// Any directory named ".git" or workspaceName (other than root itself)
// is pruned, so the gitignored workspace never takes part in the scan
// (engine output must never gate the tracked tree).
//
// Silently skipped: non-regular files, files whose info cannot be
// obtained, files larger than maxScanBytes, files that cannot be read,
// and files isText rejects as binary. An error handed in by WalkDir
// itself is returned as-is, as is whatever fn returns.
func walkText(root, workspaceName string, fn func(rel, content string) error) error {
	visit := func(p string, entry fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}

		if entry.IsDir() {
			base := entry.Name()
			// root is always descended into, whatever it is called.
			if p != root && (base == ".git" || base == workspaceName) {
				return filepath.SkipDir
			}
			return nil
		}

		// Symlinks, devices, sockets, etc. are not scanned.
		if !entry.Type().IsRegular() {
			return nil
		}
		if st, err := entry.Info(); err != nil || st.Size() > maxScanBytes {
			return nil
		}

		data, err := os.ReadFile(p)
		if err != nil {
			return nil
		}
		if !isText(data) {
			return nil
		}

		relPath, err := filepath.Rel(root, p)
		if err != nil {
			return nil
		}
		return fn(filepath.ToSlash(relPath), string(data))
	}
	return filepath.WalkDir(root, visit)
}

// isText is a cheap binary sniff: b counts as text unless a NUL byte
// appears within its first 8000 bytes. Bytes beyond that window are
// not inspected. Good enough for source/doc trees.
func isText(b []byte) bool {
	head := b
	if len(head) > 8000 {
		head = head[:8000]
	}
	return bytes.IndexByte(head, 0) < 0
}

// splitLines breaks content on "\n" and strips one trailing "\r" from
// each piece, so CRLF and LF files yield identical line text. The
// result always has at least one element; content ending in a newline
// produces a final empty line.
func splitLines(content string) []string {
	out := make([]string, 0, strings.Count(content, "\n")+1)
	rest := content
	for {
		line, tail, more := strings.Cut(rest, "\n")
		out = append(out, strings.TrimSuffix(line, "\r"))
		if !more {
			return out
		}
		rest = tail
	}
}
raw view on GitHub →