ajhahn.de
← eeco
Go 404 lines
package docs

import (
	"bytes"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// Marker spellings for `eeco docs compact`. Fixed in slice 1; a future
// slice can introduce a config knob for custom markers if a user needs
// it.
//
// scanArchiveRegions compares each whitespace-trimmed line outside a
// code fence against these strings for exact equality, so a marker only
// counts when it is the sole content of its line.
const (
	startMarker = "<!-- eeco:archive:start -->" // opens an archivable region
	endMarker   = "<!-- eeco:archive:end -->"   // closes the region opened by startMarker
)

// CompactRegion records one region that was (or would be, in dry-run)
// moved to the archive. Line numbers are 1-based and inclusive.
//
// In marker mode the region covers the start marker line through the end
// marker line. In heading mode it covers the first archived heading line
// through the last line before the boundary heading (or EOF) that
// terminates the run.
type CompactRegion struct {
	StartLine int // first line of the region
	EndLine   int // last line of the region (inclusive)
}

// CompactReport summarises a compact run. It is returned in both the
// dry-run and write paths so a CLI caller can render the same summary
// either way. On an error return the report is populated only as far as
// the run got before failing.
type CompactReport struct {
	Source        string          // source path as passed by the caller
	Archive       string          // archive path as passed by the caller
	Regions       []CompactRegion // regions moved, or that would move in dry-run
	ArchiveExists bool            // archive was already present before any write
	DryRun        bool            // echo of the caller's dryRun flag
}

// Compact runs the shared compact engine in marker mode: every region of
// source bracketed by `<!-- eeco:archive:start -->` and
// `<!-- eeco:archive:end -->` is moved into archive, and a pointer stub
// is left in its place in source. With dryRun set nothing is written and
// the report names the regions that would move.
//
// Region discovery is delegated to scanArchiveRegions; every other step
// lives in compact.
func Compact(source, archive string, dryRun bool) (CompactReport, error) {
	// scanArchiveRegions already has the finder signature compact expects,
	// so it is handed over directly.
	return compact(source, archive, dryRun, scanArchiveRegions)
}

// CompactKeepLast runs the shared compact engine in heading mode. prefix
// is a heading-line prefix such as "## Snapshot"; the `#` run at its
// start fixes the section level. The keepLast topmost matching sections
// (the file is newest-first) stay in source and every older matching
// section is moved into archive.
//
// Only region discovery differs from Compact: it is delegated to
// scanHeadingRegions, which also refuses a source that still carries
// explicit archive markers because the two modes are mutually exclusive.
func CompactKeepLast(source, archive string, dryRun bool, prefix string, keepLast int) (CompactReport, error) {
	// Bind prefix and keepLast so the finder has the single-argument
	// shape the engine calls.
	findByHeading := func(raw []byte) ([]CompactRegion, error) {
		return scanHeadingRegions(raw, prefix, keepLast)
	}
	return compact(source, archive, dryRun, findByHeading)
}

// compact is the engine shared by Compact (marker mode) and
// CompactKeepLast (heading mode). The two modes differ only in find, the
// callback that discovers which regions to move; reading the source,
// probing the archive, the dry-run / no-region early return, the split,
// the archive append and the source rewrite are common to both.
//
// Both paths are expected to be absolute and are trusted as given: the
// repo-relativity check belongs in the CLI layer, where the repo root is
// known.
//
// Behaviour:
//   - Markers / headings inside fenced code blocks (``` or ~~~) are ignored.
//   - Unmatched, nested, or out-of-order markers return an error.
//   - With dryRun=true nothing is written; the report still lists every
//     region that would move.
//   - A run that discovers no regions is an idempotent no-op (empty
//     Regions, nothing written).
//   - The archive is created on the first run and appended to afterwards;
//     the appended bytes are the cut regions in order, each preceded by a
//     one-line provenance header.
//   - The source-doc trailing newline is preserved exactly.
//
// The provenance header carries no date or wall-clock content, so the
// byte output is reproducible across runs.
//
// The archive is written before the source is rewritten; an error from
// either step is returned wrapped, together with the report as populated
// so far.
func compact(source, archive string, dryRun bool, find func([]byte) ([]CompactRegion, error)) (CompactReport, error) {
	report := CompactReport{Source: source, Archive: archive, DryRun: dryRun}

	content, readErr := os.ReadFile(source)
	if readErr != nil {
		return report, fmt.Errorf("read source: %w", readErr)
	}

	found, findErr := find(content)
	if findErr != nil {
		return report, findErr
	}
	report.Regions = found

	// A missing archive is the normal first-run state; any other stat
	// failure is reported.
	switch _, statErr := os.Stat(archive); {
	case statErr == nil:
		report.ArchiveExists = true
	case !errors.Is(statErr, os.ErrNotExist):
		return report, fmt.Errorf("stat archive: %w", statErr)
	}

	if dryRun || len(found) == 0 {
		return report, nil
	}

	// The stub points at the archive relative to the source's directory so
	// a reader can follow it; fall back to the bare file name when no
	// relative path can be computed.
	pointer, err := filepath.Rel(filepath.Dir(source), archive)
	if err != nil {
		pointer = filepath.Base(archive)
	}
	pointer = filepath.ToSlash(pointer)

	// The archive header names the source by basename only: short, and it
	// does not leak the operator's local layout.
	addition, rewritten := splitRegions(content, found, filepath.Base(source), pointer)

	if err := appendArchive(archive, addition, !report.ArchiveExists); err != nil {
		return report, fmt.Errorf("write archive: %w", err)
	}
	if err := os.WriteFile(source, rewritten, 0o644); err != nil {
		return report, fmt.Errorf("rewrite source: %w", err)
	}
	return report, nil
}

// scanArchiveRegions walks src line by line and returns every paired
// start/end marker region, with 1-based inclusive line numbers that cover
// the start marker line through the end marker line.
//
// A line counts as a marker only when, after trimming surrounding
// whitespace, it equals startMarker or endMarker exactly. Markers inside
// a fenced code block are ignored. Fences are tracked by delimiter
// character and length: a fence opens on a run of three or more '`' or
// '~' (a backtick opener may not contain another backtick later on the
// line, which rules out inline code spans) and closes only on a run of
// the same character, at least as long, followed by nothing but spaces
// or tabs. Any amount of leading whitespace before a fence is tolerated.
//
// Errors: a start without a matching end, an end without an open start,
// or a second start before the first end.
func scanArchiveRegions(src []byte) ([]CompactRegion, error) {
	// fenceRun reports the length of the run of identical '`' or '~'
	// bytes that begins s, or 0 when s starts with neither.
	fenceRun := func(s string) int {
		if s == "" || (s[0] != '`' && s[0] != '~') {
			return 0
		}
		n := 1
		for n < len(s) && s[n] == s[0] {
			n++
		}
		return n
	}

	var regions []CompactRegion
	openStart := 0 // 1-based line number of the open start marker; 0 = no open start

	// fenceChar is the delimiter of the currently open fence (0 when
	// outside any fence); fenceLen is the length of its opening run.
	var fenceChar byte
	fenceLen := 0

	for i, line := range splitLinesKeepEOL(src) {
		lineNo := i + 1
		trimmed := strings.TrimRight(line, "\r\n")
		stripped := strings.TrimLeft(trimmed, " \t")

		if run := fenceRun(stripped); run >= 3 {
			rest := stripped[run:]
			if fenceChar == 0 {
				// Opening fence. "```x``` text" is inline code, not a fence.
				if stripped[0] == '~' || !strings.Contains(rest, "`") {
					fenceChar, fenceLen = stripped[0], run
					continue
				}
			} else if stripped[0] == fenceChar && run >= fenceLen && strings.Trim(rest, " \t") == "" {
				// Closing fence: same delimiter, long enough, no info string.
				fenceChar, fenceLen = 0, 0
				continue
			}
			// Otherwise the line is fence content (or plain text) and is
			// handled by the checks below.
		}
		if fenceChar != 0 {
			continue
		}

		switch strings.TrimSpace(trimmed) {
		case startMarker:
			if openStart != 0 {
				return nil, fmt.Errorf("compact line %d: nested start marker (previous still open at line %d)", lineNo, openStart)
			}
			openStart = lineNo
		case endMarker:
			if openStart == 0 {
				return nil, fmt.Errorf("compact line %d: end marker with no matching start", lineNo)
			}
			regions = append(regions, CompactRegion{StartLine: openStart, EndLine: lineNo})
			openStart = 0
		}
	}
	if openStart != 0 {
		return nil, fmt.Errorf("compact line %d: start marker with no matching end", openStart)
	}
	return regions, nil
}

// headingSection is one matched heading-mode section: the 1-based line of
// the matched heading and the 1-based line of the boundary that
// terminates it (exclusive — the section spans [start, end)). At EOF the
// terminating boundary is len(lines)+1.
//
// Because end is exclusive, two sections are adjacent exactly when the
// first one's end equals the second one's start; scanHeadingRegions uses
// that equality to coalesce contiguous sections into one region.
type headingSection struct {
	start int // 1-based line of the matched heading
	end   int // 1-based line of the terminating boundary (exclusive)
}

// headingLevel reports the ATX-heading level of line, i.e. the length of
// its leading `#` run, or 0 when line is not a heading.
//
// Trailing CR/LF and leading spaces or tabs are ignored. The `#` run must
// be followed by a space or by the end of the line; anything else (for
// example "#hashtag") is not a heading. No upper bound is placed on the
// level.
func headingLevel(line string) int {
	text := strings.TrimRight(line, "\r\n")
	text = strings.TrimLeft(text, " \t")

	// Length of the leading '#' run.
	hashes := len(text) - len(strings.TrimLeft(text, "#"))

	switch {
	case hashes == 0:
		return 0
	case hashes == len(text), text[hashes] == ' ':
		// Bare "###" or "### title".
		return hashes
	default:
		return 0
	}
}

// scanHeadingRegions discovers archivable regions by heading rather than
// by explicit markers. prefix is a heading-line prefix such as
// "## Snapshot"; its `#` run fixes the section level L. A *matched*
// section opens at a heading of exactly level L whose trimmed text has
// the given prefix, and runs until the next *boundary* heading (any
// heading of level <= L) or EOF — so a section can never swallow a later
// same-or-higher heading such as a live "## Next session" tail. The
// keepLast topmost matched sections (the file is newest-first) are kept;
// everything older is archivable, and adjacent archivable sections
// coalesce into one CompactRegion per contiguous run.
//
// Headings inside fenced code blocks are ignored. Fences are tracked by
// delimiter character and length: a fence opens on a run of three or
// more '`' or '~' (a backtick opener may not contain another backtick
// later on the line, which rules out inline code spans) and closes only
// on a run of the same character, at least as long, followed by nothing
// but spaces or tabs.
//
// Errors:
//   - prefix is not a heading (headingLevel reports 0);
//   - keepLast is negative;
//   - the source contains explicit archive markers, well-formed or not —
//     heading mode and marker mode are mutually exclusive.
//
// When keepLast covers every matched section the result is nil, nil.
func scanHeadingRegions(src []byte, prefix string, keepLast int) ([]CompactRegion, error) {
	level := headingLevel(prefix)
	if level == 0 {
		return nil, fmt.Errorf("compact: --heading %q is not a markdown heading (expected a leading '#' run, e.g. \"## Snapshot\")", prefix)
	}
	if keepLast < 0 {
		return nil, fmt.Errorf("compact: --keep-last must be >= 0 (got %d)", keepLast)
	}
	// Any explicit archive markers (a complete pair, or even a malformed
	// unmatched/nested one) mean the source is set up for marker mode;
	// refuse rather than silently mix the two schemes. Inline prose
	// mentions are unaffected — scanArchiveRegions only matches standalone
	// marker lines.
	if markers, err := scanArchiveRegions(src); err != nil || len(markers) > 0 {
		return nil, errors.New("source contains explicit archive markers; remove them or drop --keep-last")
	}
	wantPrefix := strings.TrimSpace(prefix)

	// fenceRun reports the length of the run of identical '`' or '~'
	// bytes that begins s, or 0 when s starts with neither.
	fenceRun := func(s string) int {
		if s == "" || (s[0] != '`' && s[0] != '~') {
			return 0
		}
		n := 1
		for n < len(s) && s[n] == s[0] {
			n++
		}
		return n
	}

	lines := splitLinesKeepEOL(src)
	var matched []headingSection
	openStart := 0 // 1-based line of the currently open matched section; 0 = none

	// fenceChar is the delimiter of the currently open fence (0 when
	// outside any fence); fenceLen is the length of its opening run.
	var fenceChar byte
	fenceLen := 0

	for i, line := range lines {
		lineNo := i + 1
		stripped := strings.TrimLeft(strings.TrimRight(line, "\r\n"), " \t")

		if run := fenceRun(stripped); run >= 3 {
			rest := stripped[run:]
			if fenceChar == 0 {
				// Opening fence. "```x``` text" is inline code, not a fence.
				if stripped[0] == '~' || !strings.Contains(rest, "`") {
					fenceChar, fenceLen = stripped[0], run
					continue
				}
			} else if stripped[0] == fenceChar && run >= fenceLen && strings.Trim(rest, " \t") == "" {
				// Closing fence: same delimiter, long enough, no info string.
				fenceChar, fenceLen = 0, 0
				continue
			}
		}
		if fenceChar != 0 {
			continue
		}

		lvl := headingLevel(line)
		if lvl == 0 || lvl > level {
			continue // body line (a deeper heading does not split the section)
		}
		// A boundary heading (lvl <= level) closes any open matched section.
		if openStart != 0 {
			matched = append(matched, headingSection{start: openStart, end: lineNo})
			openStart = 0
		}
		// The boundary is itself a new matched section only when it is at
		// exactly level L and carries the prefix.
		if lvl == level && strings.HasPrefix(strings.TrimSpace(line), wantPrefix) {
			openStart = lineNo
		}
	}
	if openStart != 0 {
		matched = append(matched, headingSection{start: openStart, end: len(lines) + 1})
	}

	if keepLast >= len(matched) {
		return nil, nil // nothing older than the kept window — idempotent no-op
	}
	archivable := matched[keepLast:] // newest-on-top: keep the first keepLast

	// Coalesce adjacent archivable sections (section_i.end == the next
	// section's start) into maximal contiguous runs; each run is one
	// region whose EndLine is the last line before its terminating
	// boundary.
	var regions []CompactRegion
	for i := 0; i < len(archivable); {
		runStart := archivable[i].start
		runEnd := archivable[i].end
		j := i + 1
		for j < len(archivable) && archivable[j].start == runEnd {
			runEnd = archivable[j].end
			j++
		}
		regions = append(regions, CompactRegion{StartLine: runStart, EndLine: runEnd - 1})
		i = j
	}
	return regions, nil
}

// splitRegions partitions src into (archiveAddition, sourceRewrite) using
// the pre-validated regions, which must be in ascending order,
// non-overlapping, and within the line count of src.
//
// Each cut region (its lines exactly as they appear in src) is appended
// to archiveAddition after a one-line provenance header naming sourceTag,
// and is followed by a blank line so consecutive blocks stay separated.
// In sourceRewrite the same region is replaced by a single-line pointer
// stub naming stubTarget. Lines outside every region are copied through
// unchanged.
//
// Generated lines use the newline style that dominates src. The stub
// takes a line ending only when the line it replaces had one: a region
// that ends on an unterminated final line leaves sourceRewrite
// unterminated too, so the source's trailing-newline state is preserved
// exactly.
func splitRegions(src []byte, regions []CompactRegion, sourceTag, stubTarget string) (archiveAddition, sourceRewrite []byte) {
	lines := splitLinesKeepEOL(src)
	newline := dominantNewline(lines)
	stub := fmt.Sprintf("> _archived to `%s` (eeco docs compact)._", stubTarget)

	var archive, out bytes.Buffer
	cursor := 0 // index of the next source line not yet copied or cut
	for _, r := range regions {
		// Copy the untouched lines that precede this region.
		for ; cursor < r.StartLine-1; cursor++ {
			out.WriteString(lines[cursor])
		}

		// Only the final line of the file can lack a line ending.
		terminated := strings.HasSuffix(lines[r.EndLine-1], "\n")

		out.WriteString(stub)
		if terminated {
			out.WriteString(newline)
		}

		archive.WriteString("<!-- archived from ")
		archive.WriteString(sourceTag)
		archive.WriteString(" -->")
		archive.WriteString(newline)
		for _, cut := range lines[r.StartLine-1 : r.EndLine] {
			archive.WriteString(cut)
		}
		// Guarantee a blank line between consecutive archive blocks: one
		// newline when the cut content already ended with one, otherwise
		// two.
		if !terminated {
			archive.WriteString(newline)
		}
		archive.WriteString(newline)

		cursor = r.EndLine
	}
	// Copy whatever follows the last region.
	for ; cursor < len(lines); cursor++ {
		out.WriteString(lines[cursor])
	}

	return archive.Bytes(), out.Bytes()
}

// dominantNewline returns the line ending that terminates the most
// entries of lines: "\r\n" when CRLF-terminated lines strictly outnumber
// LF-only ones, "\n" otherwise. Unterminated entries are not counted, so
// ties and input without any line ending fall back to "\n".
func dominantNewline(lines []string) string {
	var crlfCount, lfCount int
	for _, l := range lines {
		if !strings.HasSuffix(l, "\n") {
			continue // unterminated line: casts no vote
		}
		if strings.HasSuffix(l, "\r\n") {
			crlfCount++
		} else {
			lfCount++
		}
	}
	if crlfCount > lfCount {
		return "\r\n"
	}
	return "\n"
}

// splitLinesKeepEOL cuts src after every '\n' and returns the pieces as
// strings, each still carrying its own line ending (LF, or CRLF since the
// '\r' precedes the cut). A final line without a terminator is returned
// as-is. Empty input yields a nil slice; concatenating the result always
// reproduces src.
func splitLinesKeepEOL(src []byte) []string {
	var out []string
	rest := src
	for len(rest) > 0 {
		// cut is one past the next '\n', or the end of the input when no
		// '\n' remains.
		cut := bytes.IndexByte(rest, '\n') + 1
		if cut == 0 {
			cut = len(rest)
		}
		out = append(out, string(rest[:cut]))
		rest = rest[cut:]
	}
	return out
}

// appendArchive writes content to the archive file.
//
// With createNew set, the parent directories are created as needed and
// the file is written with content alone. Otherwise the existing file is
// read and rewritten as its prior bytes, a separator, then content. The
// separator leaves exactly one blank line between old and new content
// when the old content ends with a newline, first terminates the old
// content when it does not, and is empty when the existing file is
// empty — so successive runs do not glue blocks together visually.
//
// Any error from creating directories, reading, or writing is returned
// unwrapped.
func appendArchive(archive string, content []byte, createNew bool) error {
	payload := content
	if createNew {
		if err := os.MkdirAll(filepath.Dir(archive), 0o755); err != nil {
			return err
		}
	} else {
		prior, err := os.ReadFile(archive)
		if err != nil {
			return err
		}
		var sep []byte
		switch {
		case len(prior) == 0:
			// Nothing to separate from.
		case bytes.HasSuffix(prior, []byte("\n")):
			sep = []byte("\n")
		default:
			sep = []byte("\n\n")
		}
		payload = bytes.Join([][]byte{prior, sep, content}, nil)
	}
	return os.WriteFile(archive, payload, 0o644)
}