eeco/internal/memory/select.go

Go 73 lines
package memory

import (
	"regexp"
	"sort"
	"strings"
)

// tokenSplit matches a run of one or more characters that are not a
// Unicode letter (\p{L}), a Unicode number (\p{N}) or an underscore. It is
// a Unicode-aware inverse of \w; Go's own \w is ASCII-only, which is why
// the classes are spelled out. Combining marks (\p{M}) are not in the set,
// so they act as separators too.
//
// Tokenisation is intentionally simple: lowercase, split on non-word
// characters, dedupe. No stopword list — the store is small enough that
// recall matters more than precision at this stage.
var tokenSplit = regexp.MustCompile(`[^\p{L}\p{N}_]+`)

// Select returns the facts whose name or description shares at least one
// token with task, sorted by name. Facts with Disabled set are skipped;
// that is the only field checked, so pinned facts participate in selection
// like any other and explicit knowledge still surfaces.
//
// A task that yields no tokens (empty, or separators only) selects nothing:
// Select returns (nil, nil) without loading the store.
//
// Each selected fact has LastUsed set to the store clock's current time,
// converted to UTC and truncated to the start of the day, and is then
// written back with Save. If a Save fails, Select stops at that fact and
// returns the error together with the full selection; facts after the
// failing one have been neither modified nor saved.
func (s *Store) Select(task string) ([]*Fact, error) {
	// Tokenise before touching the store: a task without any usable term
	// cannot match a fact, so loading everything would be wasted work.
	terms := tokenize(task)
	if len(terms) == 0 {
		return nil, nil
	}

	facts, err := s.LoadAll()
	if err != nil {
		return nil, err
	}

	var out []*Fact
	for _, f := range facts {
		if f.Disabled {
			continue
		}
		if overlap(terms, tokenize(f.Name+" "+f.Description)) {
			out = append(out, f)
		}
	}
	// Stable sort: facts that happen to share a name keep the relative
	// order LoadAll returned them in instead of an arbitrary one.
	sort.SliceStable(out, func(i, j int) bool { return out[i].Name < out[j].Name })

	// day is 24 hours in nanoseconds, the unit of time.Duration. Truncate
	// rounds down relative to the zero time (midnight UTC), so on a UTC
	// value the result is midnight UTC of the current day, i.e. date-only.
	const day = 24 * 60 * 60 * 1e9
	today := s.Now().UTC().Truncate(day)
	for _, f := range out {
		f.LastUsed = today
		if err := s.Save(f); err != nil {
			// Deliberately keep returning the selection alongside the
			// error, as callers may already rely on it.
			return out, err
		}
	}
	return out, nil
}

// tokenize lower-cases s, cuts it wherever tokenSplit matches and returns
// the resulting pieces as a set. Empty pieces (produced by leading or
// trailing separators, or by an empty input) are dropped. The returned map
// is never nil, even when it holds no tokens.
func tokenize(s string) map[string]struct{} {
	words := tokenSplit.Split(strings.ToLower(s), -1)

	set := make(map[string]struct{})
	for _, w := range words {
		if w != "" {
			set[w] = struct{}{}
		}
	}
	return set
}

// overlap reports whether the sets a and b have at least one key in
// common. It walks whichever set is smaller and probes the other, so the
// cost is bounded by the size of the smaller set. Nil or empty sets never
// overlap with anything.
func overlap(a, b map[string]struct{}) bool {
	// Make a the smaller of the two so the loop below is as short as possible.
	if len(a) > len(b) {
		a, b = b, a
	}
	for word := range a {
		if _, found := b[word]; found {
			return true
		}
	}
	return false
}
raw view on GitHub →