eeco/internal/ask/ask.go

Go 359 lines
// Package ask answers a free-form question about a project with a
// deterministic, no-AI-spend, ranked set of pointers: the matching
// memory facts first (eeco's curated topic→file map) and then the
// best-matching code locations as path:line references.
//
// It is the engine behind `eeco ask`. Where `eeco go` (package brief)
// gives a one-shot project overview, `eeco ask` is the interactive
// counterpart: a fast, precise pointer into the codebase for any
// assistant, beyond the static brief.
//
// The package only reads — the resolved config, the memory store, and
// the repository's tracked files — and writes nothing. It calls no AI
// provider: relevance is a simple word-overlap score, the same
// tokenisation the memory store uses for fact selection. The output
// carries no timestamp and every list is in a stable sort order, so a
// given question over a given tree always produces the same answer.
package ask

import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"

	"github.com/ajhahnde/eeco/internal/config"
	"github.com/ajhahnde/eeco/internal/gitx"
	"github.com/ajhahnde/eeco/internal/memory"
)

// Tunables that bound how much one ask run returns and how much it reads.
const (
	// DefaultLimit is the number of code locations Search returns when
	// the caller passes a non-positive limit.
	DefaultLimit = 10

	// maxFileBytes is the largest file, in bytes (256 KiB), that ask will
	// scan. Anything bigger is skipped: such files are almost always
	// generated or vendored, and scanning them would blow the time budget
	// of a command that is meant to feel instant.
	maxFileBytes = 256 << 10

	// snippetCap is the maximum length of a code-line snippet in the
	// output, so that one long minified line cannot dominate the answer.
	snippetCap = 160
)

// tokenSplit matches a run of one or more characters that are NOT word
// characters, where a word character is a Unicode letter (\p{L}), a
// Unicode number (\p{N}), or an underscore. tokenize splits its input on
// these runs, so what remains between matches are the words.
//
// Tokenisation mirrors internal/memory/select.go: lowercase, split on
// non-word runs, dedupe. A private copy lives here rather than widening
// the memory package's surface — the two are tiny, independent, and ask
// layers its own scoring on top.
var tokenSplit = regexp.MustCompile(`[^\p{L}\p{N}_]+`)

// Result is the structured answer to one question: the matching memory
// facts and the matching code locations, each ranked. It is the data
// behind `eeco ask`, independent of how it is rendered — Render turns it
// into Markdown, RenderJSON into a JSON object, so the two always
// describe the same answer.
//
// Fields:
//   - Question is the question as Search stores it, with leading and
//     trailing whitespace trimmed.
//   - Memory holds the matching memory facts, best match first.
//   - Code holds the matching code locations, best match first.
//
// Search always sets both slice fields to non-nil values so the JSON
// form renders an empty list rather than null. The JSON keys are fixed
// by the struct tags and the key order by the field order.
type Result struct {
	Question string      `json:"question"`
	Memory   []MemoryHit `json:"memory"`
	Code     []CodeHit   `json:"code"`
}

// MemoryHit is one memory fact whose name, description, or body shares a
// word with the question.
//
// Fields:
//   - Name and Description are copied from the fact unchanged.
//   - Ref is the repo-relative file the fact points at; it is empty when
//     the fact carries none.
//   - Score is the count of distinct question terms the fact matched. It
//     is always at least 1: facts that match nothing are not reported.
type MemoryHit struct {
	Name        string `json:"name"`
	Description string `json:"description"`
	Ref         string `json:"ref"`
	Score       int    `json:"score"`
}

// CodeHit is one line in a scanned source file that shares a word with
// the question.
//
// Fields:
//   - Path is the file's repo-relative, slash-separated path.
//   - Line is the 1-based line number within that file.
//   - Text is the line with surrounding whitespace trimmed and its
//     length capped (see snippet).
//   - Score is the count of distinct question terms the line matched. It
//     is always at least 1: lines that match nothing are not reported.
type CodeHit struct {
	Path  string `json:"path"`
	Line  int    `json:"line"`
	Text  string `json:"text"`
	Score int    `json:"score"`
}

// Search answers question for cfg by ranking memory facts and code lines
// on how many distinct question words each one contains.
//
// limit caps the number of code locations in the answer; a non-positive
// limit selects DefaultLimit. Memory facts are not capped — every
// matching fact is returned. The memory store is consulted only when the
// workspace is initialised; otherwise the code search still runs, so
// `eeco ask` stays useful in any git repo.
//
// A question that yields no usable terms, or that matches nothing, is
// not an error: the Result simply carries empty (non-nil) lists. A
// non-nil error means cfg was nil or a real I/O fault occurred while
// reading the store or walking the tree; the returned Result is then the
// zero value.
func Search(cfg *config.Config, question string, limit int) (Result, error) {
	if cfg == nil {
		return Result{}, errors.New("ask.Search: nil config")
	}
	if limit < 1 {
		limit = DefaultLimit
	}

	// Start from empty, non-nil lists so the JSON form never shows null.
	answer := Result{
		Question: strings.TrimSpace(question),
		Memory:   []MemoryHit{},
		Code:     []CodeHit{},
	}

	words := tokenize(question)
	if len(words) == 0 {
		// Nothing to search for: blank input or only one-byte tokens.
		return answer, nil
	}

	facts, err := searchMemory(cfg, words)
	if err != nil {
		return Result{}, err
	}
	locations, err := searchCode(cfg, words, limit)
	if err != nil {
		return Result{}, err
	}

	answer.Memory, answer.Code = facts, locations
	return answer, nil
}

// searchMemory ranks the memory facts against the question terms. A
// fact's score is the number of distinct terms that occur anywhere in
// its name, description, or body; disabled facts and facts scoring zero
// are left out.
//
// The store is only read: memory.Select is deliberately not used because
// it would bump last_used and re-save. When the workspace is not
// initialised the store is not opened and an empty, non-nil list is
// returned.
//
// The result is ordered by score descending, then name ascending, so the
// order is stable from run to run.
func searchMemory(cfg *config.Config, terms map[string]struct{}) ([]MemoryHit, error) {
	matched := []MemoryHit{}
	if !config.IsInitialized(cfg) {
		return matched, nil
	}

	store, err := memory.Open(cfg)
	if err != nil {
		return nil, fmt.Errorf("ask: open memory: %w", err)
	}
	facts, err := store.LoadAll()
	if err != nil {
		return nil, fmt.Errorf("ask: load memory: %w", err)
	}

	for _, fact := range facts {
		if fact.Disabled {
			continue
		}
		searchable := fact.Name + " " + fact.Description + " " + fact.Body
		if n := overlapCount(terms, tokenize(searchable)); n > 0 {
			matched = append(matched, MemoryHit{
				Name:        fact.Name,
				Description: fact.Description,
				Ref:         fact.Ref,
				Score:       n,
			})
		}
	}

	sort.SliceStable(matched, func(a, b int) bool {
		left, right := matched[a], matched[b]
		if left.Score == right.Score {
			return left.Name < right.Name
		}
		return left.Score > right.Score
	})
	return matched, nil
}

// searchCode scans every candidate text file and scores each line by the
// number of distinct question terms it contains. The top limit lines are
// returned, ranked by score descending, then path ascending, then line
// ascending — a fully deterministic order.
//
// Files are filtered before they are read: anything that cannot be
// stat'ed, is not a regular file, or is larger than maxFileBytes is
// skipped without opening it. That keeps the size cap an actual time
// bound (a huge generated file is never pulled into memory) and stops a
// FIFO or device node found by the directory-walk fallback from blocking
// the command. Files containing a NUL byte are treated as binary and
// skipped as well.
//
// The returned slice is always non-nil on success. An error is returned
// only when the file list itself cannot be collected; problems with an
// individual file merely drop that file from the answer.
func searchCode(cfg *config.Config, terms map[string]struct{}, limit int) ([]CodeHit, error) {
	files, err := collectFiles(cfg)
	if err != nil {
		return nil, err
	}
	hits := []CodeHit{}
	for _, rel := range files {
		full := filepath.Join(cfg.RepoRoot, filepath.FromSlash(rel))

		// Decide from metadata first so oversized and special files
		// cost a stat call, not a full read. os.Stat follows symlinks,
		// so a tracked symlink to a regular file is still searched.
		info, err := os.Stat(full)
		if err != nil || !info.Mode().IsRegular() || info.Size() > maxFileBytes {
			continue
		}

		data, err := os.ReadFile(full)
		if err != nil {
			// A file listed by git but unreadable now (a race, a broken
			// symlink) is skipped, not fatal: the answer degrades rather
			// than aborting.
			continue
		}
		// Re-check the size in case the file grew between the stat and
		// the read, and drop anything that looks binary.
		if len(data) > maxFileBytes || bytes.IndexByte(data, 0) >= 0 {
			continue
		}

		for i, raw := range strings.Split(string(data), "\n") {
			score := overlapCount(terms, tokenize(raw))
			if score == 0 {
				continue
			}
			hits = append(hits, CodeHit{
				Path:  rel,
				Line:  i + 1, // lines are reported 1-based
				Text:  snippet(raw),
				Score: score,
			})
		}
	}
	sort.SliceStable(hits, func(i, j int) bool {
		if hits[i].Score != hits[j].Score {
			return hits[i].Score > hits[j].Score
		}
		if hits[i].Path != hits[j].Path {
			return hits[i].Path < hits[j].Path
		}
		return hits[i].Line < hits[j].Line
	})
	if len(hits) > limit {
		hits = hits[:limit]
	}
	return hits, nil
}

// collectFiles returns the candidate files to search as sorted,
// repo-relative, slash-separated paths.
//
// Git's tracked set is preferred, which keeps build artifacts and other
// untracked clutter out of the search; tracked paths whose first segment
// is the eeco workspace are dropped. When git reports an error or no
// tracked files at all, the function falls back to a recursive walk of
// the repository root that does not descend into any directory named
// ".git" or named after the workspace (the same two-branch strategy the
// brief uses for the top-level listing).
//
// Only a failure of the fallback walk is reported as an error.
func collectFiles(cfg *config.Config) ([]string, error) {
	tracked, gitErr := gitx.TrackedFiles(cfg.RepoRoot)
	if gitErr == nil && len(tracked) > 0 {
		kept := make([]string, 0, len(tracked))
		for _, p := range tracked {
			if top, _, _ := strings.Cut(p, "/"); top != cfg.WorkspaceName {
				kept = append(kept, p)
			}
		}
		sort.Strings(kept)
		return kept, nil
	}

	var found []string
	visit := func(path string, entry fs.DirEntry, walkErr error) error {
		switch {
		case walkErr != nil:
			return walkErr
		case !entry.IsDir():
			rel, err := filepath.Rel(cfg.RepoRoot, path)
			if err != nil {
				return err
			}
			found = append(found, filepath.ToSlash(rel))
			return nil
		case path == cfg.RepoRoot:
			// The root itself is never pruned, whatever it is called.
			return nil
		case entry.Name() == ".git" || entry.Name() == cfg.WorkspaceName:
			return filepath.SkipDir
		default:
			return nil
		}
	}
	if err := filepath.WalkDir(cfg.RepoRoot, visit); err != nil {
		return nil, fmt.Errorf("ask: walk repo: %w", err)
	}
	sort.Strings(found)
	return found, nil
}

// Render formats a Result as the Markdown answer: a heading quoting the
// question, then a "Memory" section and a "Code" section.
//
// A fact with a Ref is shown as its description pointing at that file; a
// fact without one is shown as name and description. Each code hit is
// shown as `path:line` followed by the line text. A section with no
// entries says so explicitly, and when both are empty the whole answer
// collapses to a single guidance line instead of two empty sections.
func Render(r Result) string {
	var out strings.Builder
	fmt.Fprintf(&out, "# eeco ask: %q\n\n", r.Question)

	noFacts, noCode := len(r.Memory) == 0, len(r.Code) == 0
	if noFacts && noCode {
		out.WriteString("No matches — try different terms, or run `eeco go` for the project brief.\n")
		return out.String()
	}

	out.WriteString("## Memory\n\n")
	if noFacts {
		out.WriteString("No matching facts.\n")
	}
	for _, fact := range r.Memory {
		if fact.Ref == "" {
			fmt.Fprintf(&out, "- %s — %s\n", fact.Name, fact.Description)
			continue
		}
		fmt.Fprintf(&out, "- %s → `%s`\n", fact.Description, fact.Ref)
	}

	out.WriteString("\n## Code\n\n")
	if noCode {
		out.WriteString("No matching code.\n")
	}
	for _, loc := range r.Code {
		fmt.Fprintf(&out, "- `%s:%d`  %s\n", loc.Path, loc.Line, loc.Text)
	}
	return out.String()
}

// RenderJSON formats a Result as an indented JSON object followed by a
// newline — the machine-readable counterpart to Render. The three
// top-level keys (question, memory, code) are frozen. The arrays appear
// as [] rather than null as long as the slices are non-nil, which Search
// guarantees.
//
// It streams through a json.Encoder with a two-space indent; the encoder
// terminates the document with the trailing newline itself.
func RenderJSON(r Result) (string, error) {
	var doc strings.Builder
	enc := json.NewEncoder(&doc)
	enc.SetIndent("", "  ")
	if err := enc.Encode(r); err != nil {
		return "", fmt.Errorf("ask: marshal json: %w", err)
	}
	return doc.String(), nil
}

// snippet trims surrounding whitespace from a code line and caps its
// length so one very long line cannot dominate the answer.
//
// A line of at most snippetCap bytes is returned as is (after trimming).
// A longer line is cut and suffixed with "…". The cut is placed on the
// last rune boundary at or before snippetCap bytes, so a multi-byte
// UTF-8 character is never split in half — a raw byte slice would emit
// invalid UTF-8, which shows up as mojibake in the Markdown output and
// as U+FFFD in the JSON output. For ASCII text the cut lands exactly at
// snippetCap bytes.
func snippet(line string) string {
	s := strings.TrimSpace(line)
	if len(s) <= snippetCap {
		return s
	}
	// Ranging over a string yields the byte offset at which each rune
	// starts; remember the last such offset that still fits in the cap.
	cut := 0
	for i := range s {
		if i > snippetCap {
			break
		}
		cut = i
	}
	return s[:cut] + "…"
}

// tokenize turns s into its set of distinct search words: the text is
// lowercased, split wherever tokenSplit matches (runs of non-word
// characters), and every piece at least two bytes long is kept. Empty
// pieces and one-byte tokens are discarded — single characters carry
// little signal and inflate code-search noise. The returned set is never
// nil, even for empty input.
func tokenize(s string) map[string]struct{} {
	words := make(map[string]struct{})
	lowered := strings.ToLower(s)
	for _, piece := range tokenSplit.Split(lowered, -1) {
		if len(piece) >= 2 {
			words[piece] = struct{}{}
		}
	}
	return words
}

// overlapCount reports how many distinct words occur in both terms and
// hay. It iterates over whichever set is smaller and probes the larger
// one, so the cost is proportional to the smaller set. A nil or empty
// set on either side gives 0.
func overlapCount(terms, hay map[string]struct{}) int {
	small, big := terms, hay
	if len(small) > len(big) {
		small, big = big, small
	}
	shared := 0
	for word := range small {
		if _, found := big[word]; found {
			shared++
		}
	}
	return shared
}
raw view on GitHub →