ajhahn.de
← eeco
Go 405 lines
package projecttype

import (
	"context"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"slices"
	"sort"
	"strings"
)

// DefaultThreshold is the deterministic-confidence floor at or above
// which Detect accepts the marker-scan result without prompting. It is
// the value used whenever Options.Threshold is zero or negative, and it
// backs the init_detection_threshold config key.
const DefaultThreshold = 0.7

// minAIConfidence is the floor below which an AI-fallback classification
// is rejected: the result is re-offered to the operator (when a Prompter
// is available) or degraded to generic. The comparison is strict, so a
// reported confidence of exactly 0.5 is still accepted.
const minAIConfidence = 0.5

// Source records which pipeline layer produced a Result.
type Source string

// The Source values Detect can report:
//
//   - SourceMarker: the marker/convention scan picked the category,
//     either at or above the threshold or as the best guess of a
//     non-interactive run.
//   - SourceFlag: the caller forced the category via Options.Forced.
//   - SourceInteractive: the operator picked the category through a
//     Prompter.
//   - SourceAI: the AI layer classified the project with sufficient
//     confidence.
//   - SourceFallback: nothing else resolved, so the generic category was
//     used.
const (
	SourceMarker      Source = "marker-scan"
	SourceFlag        Source = "type-flag"
	SourceInteractive Source = "interactive-prompt"
	SourceAI          Source = "ai-fallback"
	SourceFallback    Source = "generic-fallback"
)

// Result is the outcome of Detect. Source names the pipeline layer that
// produced it; Justification carries the model's trimmed explanation for
// an AI result or the reason for a generic fallback, and is empty for
// every other source.
type Result struct {
	Category Category
	// Confidence is the marker-scan confidence in [0,1] for a
	// deterministic result. An operator pick or a forced --type is 1.0; an
	// AI result carries the model's reported confidence clamped to [0,1];
	// a generic fallback is 0.
	Confidence float64
	// Dirs is the knowledge-directory set to scaffold: the catalog dirs
	// for Category, plus any AI-proposed deviations when Source is
	// SourceAI.
	Dirs          []string
	Source        Source
	Justification string
}

// Prompter asks the operator to resolve an ambiguous detection. A nil
// Prompter makes Detect non-interactive (layer 3 is skipped).
type Prompter interface {
	// Pick presents the candidate categories best-first and the catalog
	// (for descriptions and the generic escape) and returns the operator's
	// choice. candidates may be empty when the scan found no signal at
	// all. When describe is true the operator asked to describe the
	// project freely; freeText carries that description and Detect routes
	// to the AI layer. A non-nil error aborts detection.
	//
	// Pick is also called to re-offer candidates after a low-confidence AI
	// result. In that second call a describe request or a choice the
	// catalog does not know resolves to generic instead of starting
	// another AI pass, and freeText is ignored.
	Pick(candidates []Category, cat *Catalog) (choice Category, describe bool, freeText string, err error)
}

// AIFunc runs one gated AI pass and returns the model's raw text. A nil
// AIFunc means no AI fallback is available and Detect degrades to
// generic where the pipeline would otherwise call it. A returned error,
// or text that does not contain a usable JSON classification, likewise
// degrades to generic rather than failing detection.
type AIFunc func(ctx context.Context, prompt string) (string, error)

// Options configures one Detect call.
type Options struct {
	// RepoRoot is the directory the deterministic layers scan. When it is
	// empty the scan is skipped and no category scores.
	RepoRoot string
	// Threshold overrides DefaultThreshold when > 0; zero or a negative
	// value selects DefaultThreshold.
	Threshold float64
	// Forced short-circuits the whole pipeline with an operator-supplied
	// --type value. An unknown value is an error. It is checked before
	// ForceAI, so it wins when both are set.
	Forced Category
	// ForceAI routes straight to the AI layer (the --ai flag), skipping
	// the deterministic accept and the interactive prompt.
	ForceAI bool
	// Prompter resolves ambiguity interactively; nil disables layer 3.
	Prompter Prompter
	// AI runs the layer-4 fallback; nil disables layer 4.
	AI AIFunc
}

// threshold returns the acceptance floor for the marker scan: the
// caller's Threshold when it is positive, DefaultThreshold otherwise.
func (o Options) threshold() float64 {
	floor := DefaultThreshold
	if o.Threshold > 0 {
		floor = o.Threshold
	}
	return floor
}

// Detect resolves the project category for opt.RepoRoot and returns it
// together with the knowledge dirs to scaffold. The layers run in this
// order:
//
//  1. opt.Forced wins outright (an unknown value is an error).
//  2. opt.ForceAI hands the decision straight to the AI layer.
//  3. The marker/convention scan is accepted when its confidence reaches
//     the threshold.
//  4. Otherwise the Prompter, when present, asks the operator, who may
//     pick a category or describe the project for the AI layer.
//  5. Without a Prompter the best scan guess is accepted as-is, and a
//     tree with no signal at all resolves to generic.
//
// An unclassifiable tree is never an error. Errors are reserved for a nil
// catalog, an unknown forced or operator-chosen category, and a Prompter
// failure.
func Detect(ctx context.Context, cat *Catalog, opt Options) (Result, error) {
	if cat == nil {
		return Result{}, fmt.Errorf("nil catalog")
	}

	// settled builds the Result for a category resolved without the AI
	// layer; every such result scaffolds the plain catalog dirs.
	settled := func(c Category, conf float64, via Source) Result {
		return Result{
			Category:   c,
			Confidence: conf,
			Dirs:       cat.DirsFor(c),
			Source:     via,
		}
	}

	switch {
	case opt.Forced != "":
		if cat.Has(opt.Forced) {
			return settled(opt.Forced, 1.0, SourceFlag), nil
		}
		return Result{}, fmt.Errorf("unknown project type %q", opt.Forced)
	case opt.ForceAI:
		return aiLayer(ctx, cat, opt, "")
	}

	scores := scoreRepo(opt.RepoRoot)
	best, runnerUp := topTwo(scores)
	conf := confidence(scores[best], scores[runnerUp])
	guessed := best != ""

	if guessed && conf >= opt.threshold() {
		return settled(best, conf, SourceMarker), nil
	}

	// Non-interactive: accept the best deterministic guess if there is
	// one, otherwise fall back to generic.
	if opt.Prompter == nil {
		if guessed {
			return settled(best, conf, SourceMarker), nil
		}
		return genericResult(cat, "no marker or convention identified the project"), nil
	}

	choice, describe, freeText, err := opt.Prompter.Pick(rankedCandidates(scores), cat)
	switch {
	case err != nil:
		return Result{}, err
	case describe:
		return aiLayer(ctx, cat, opt, freeText)
	case !cat.Has(choice):
		return Result{}, fmt.Errorf("operator chose unknown project type %q", choice)
	}
	return settled(choice, 1.0, SourceInteractive), nil
}

// aiLayer asks the configured AIFunc to classify the project, optionally
// seeded with the operator's free-text description. It never fails on a
// bad AI outcome; each of these resolves to generic instead:
//
//   - no AIFunc is wired,
//   - the prompt cannot be built or the call returns an error,
//   - the reply has no parseable classification or names a category the
//     catalog does not know,
//   - the reported confidence is below minAIConfidence and the operator
//     cannot be asked, or does not settle on a known category.
//
// The only error returned is one raised by the Prompter during the
// low-confidence re-prompt.
func aiLayer(ctx context.Context, cat *Catalog, opt Options, desc string) (Result, error) {
	if opt.AI == nil {
		return genericResult(cat, "AI fallback not configured"), nil
	}

	prompt, err := buildDetectPrompt(cat, topLevelEntries(opt.RepoRoot), desc)
	if err != nil {
		return genericResult(cat, "AI fallback prompt build failed: "+err.Error()), nil
	}
	reply, err := opt.AI(ctx, prompt)
	if err != nil {
		return genericResult(cat, "AI fallback unavailable: "+err.Error()), nil
	}

	verdict, ok := parseAIDetect(reply)
	chosen := Category(verdict.Category)
	if !ok || !cat.Has(chosen) {
		return genericResult(cat, "AI fallback returned no usable classification"), nil
	}

	if verdict.Confidence < minAIConfidence {
		if opt.Prompter == nil {
			return genericResult(cat, "AI fallback confidence too low"), nil
		}
		// The free text of a second "describe" request is dropped on
		// purpose: the re-prompt never triggers another AI pass.
		choice, describe, _, perr := opt.Prompter.Pick(topThree(cat, verdict), cat)
		if perr != nil {
			return Result{}, perr
		}
		if describe || !cat.Has(choice) {
			return genericResult(cat, "AI fallback confidence too low"), nil
		}
		return Result{
			Category:   choice,
			Confidence: 1.0,
			Dirs:       cat.DirsFor(choice),
			Source:     SourceInteractive,
		}, nil
	}

	// Confident AI result: catalog dirs first, then the model's extras.
	return Result{
		Category:      chosen,
		Confidence:    clamp01(verdict.Confidence),
		Dirs:          mergeDirs(cat.DirsFor(chosen), verdict.Dirs),
		Source:        SourceAI,
		Justification: strings.TrimSpace(verdict.Justification),
	}, nil
}

// genericResult builds the terminal fallback: the Generic category with
// its catalog dirs, zero confidence, and why recorded as the
// justification.
func genericResult(cat *Catalog, why string) Result {
	var res Result
	res.Category = Generic
	res.Confidence = 0
	res.Dirs = cat.DirsFor(Generic)
	res.Source = SourceFallback
	res.Justification = why
	return res
}

// scoreRepo accumulates per-category votes from the marker-file scan
// (layer 1) and the conventional-directory scan (layer 2). An empty
// repoRoot yields an empty, non-nil score map.
//
// The rules are walked in sorted key order rather than map order. Float
// addition is not associative, so summing the same votes in a different
// order can change the last bit of a total; with randomized map
// iteration that could flip a tie-break or a threshold comparison
// between two runs over the same tree.
func scoreRepo(repoRoot string) map[Category]float64 {
	scores := make(map[Category]float64)
	if repoRoot == "" {
		return scores
	}

	markers := make([]string, 0, len(markerRules))
	for marker := range markerRules {
		markers = append(markers, marker)
	}
	slices.Sort(markers)
	for _, marker := range markers {
		if !rootHas(repoRoot, marker) {
			continue
		}
		for _, v := range markerRules[marker] {
			scores[v.cat] += v.weight
		}
	}

	dirs := make([]string, 0, len(signalRules))
	for dir := range signalRules {
		dirs = append(dirs, dir)
	}
	slices.Sort(dirs)
	for _, dir := range dirs {
		if !rootHasDir(repoRoot, dir) {
			continue
		}
		for _, v := range signalRules[dir] {
			scores[v.cat] += v.weight
		}
	}
	return scores
}

// confidence measures how clearly the leader beats the runner-up as
// top / (top + second). A lone scorer gets 1.0 and the value falls
// towards 0.5 as the runner-up catches up. Only the two best scores
// count, so a long tail of weak partial matches cannot dilute a clear
// winner. A non-positive top score yields 0.
func confidence(top, second float64) float64 {
	if top <= 0 {
		return 0
	}
	contested := top + second
	return top / contested
}

// topTwo returns the two categories with the highest positive scores.
// Categories are visited in name order and only a strictly greater score
// displaces a holder, so ties go to the alphabetically first name.
// Either result is empty when fewer categories have a positive score.
func topTwo(scores map[Category]float64) (top, second Category) {
	var bestScore, nextScore float64
	for _, name := range sortedCats(scores) {
		score := scores[name]
		if score > bestScore {
			// The previous leader is demoted to runner-up.
			second, nextScore = top, bestScore
			top, bestScore = name, score
			continue
		}
		if score > nextScore {
			second, nextScore = name, score
		}
	}
	return top, second
}

// rankedCandidates lists every category with a positive score, best
// first. Equal scores keep name order because the input is name-sorted
// and the sort is stable. The result is empty but non-nil when nothing
// scored.
func rankedCandidates(scores map[Category]float64) []Category {
	names := sortedCats(scores)
	ranked := make([]Category, 0, len(names))
	for _, name := range names {
		if scores[name] > 0 {
			ranked = append(ranked, name)
		}
	}
	sort.SliceStable(ranked, func(a, b int) bool {
		return scores[ranked[a]] > scores[ranked[b]]
	})
	return ranked
}

// sortedCats lists the keys of scores in ascending name order, giving
// callers a walk order (and therefore tie-breaks) that does not depend
// on Go's randomized map iteration.
func sortedCats(scores map[Category]float64) []Category {
	names := make([]Category, len(scores))
	next := 0
	for name := range scores {
		names[next] = name
		next++
	}
	slices.Sort(names)
	return names
}

// rootHas reports whether marker exists under repoRoot. A marker with no
// glob metacharacters ('*', '?', '[') is checked with a plain stat; one
// with metacharacters is treated as a filepath.Match pattern.
//
// Patterns are matched against directory entries instead of globbing the
// joined path. Globbing would also interpret metacharacters that occur
// in repoRoot itself, so a repository checked out at e.g. "proj[1]"
// could never match any wildcard marker. Unreadable directories and
// malformed patterns report false.
func rootHas(repoRoot, marker string) bool {
	if !strings.ContainsAny(marker, "*?[") {
		_, err := os.Stat(filepath.Join(repoRoot, marker))
		return err == nil
	}

	sub, pattern := filepath.Split(marker)
	if strings.ContainsAny(sub, "*?[") {
		// Wildcards in a directory component need a real glob walk; keep
		// the joined-path glob for that (rare) shape of marker.
		matches, err := filepath.Glob(filepath.Join(repoRoot, marker))
		return err == nil && len(matches) > 0
	}

	dir := filepath.Join(repoRoot, sub)
	if dir == "" {
		// An empty root with a bare pattern means the working directory,
		// as it did when the pattern was globbed directly.
		dir = "."
	}
	entries, err := os.ReadDir(dir)
	if err != nil {
		return false
	}
	for _, entry := range entries {
		ok, err := filepath.Match(pattern, entry.Name())
		if err != nil {
			// A malformed pattern can never match anything.
			return false
		}
		if ok {
			return true
		}
	}
	return false
}

// rootHasDir reports whether name exists under repoRoot and is a
// directory. os.Stat follows symlinks, so a link to a directory counts.
// Any stat error reports false.
func rootHasDir(repoRoot, name string) bool {
	target := filepath.Join(repoRoot, name)
	info, err := os.Stat(target)
	if err != nil {
		return false
	}
	return info.IsDir()
}

// topLevelEntries lists the names directly under repoRoot, sorted, with
// a trailing "/" on directories and any entry named ".git" left out. It
// returns nil when the directory cannot be read or has nothing to list.
func topLevelEntries(repoRoot string) []string {
	entries, err := os.ReadDir(repoRoot)
	if err != nil {
		return nil
	}
	var listing []string
	for _, entry := range entries {
		label := entry.Name()
		switch {
		case label == ".git":
			continue
		case entry.IsDir():
			label += "/"
		}
		listing = append(listing, label)
	}
	// The "/" suffix changes the relative order of names, so sort after
	// decorating instead of relying on the order ReadDir returned.
	sort.Strings(listing)
	return listing
}

// clamp01 limits v to the closed interval [0, 1]; values already inside
// the interval are returned unchanged.
func clamp01(v float64) float64 {
	if v < 0 {
		return 0
	}
	if v > 1 {
		return 1
	}
	return v
}

// mergeDirs returns base followed by every entry of extra that is not
// already present, in first-seen order. Entries of extra are trimmed of
// surrounding whitespace and dropped when blank; entries of base are
// copied through untouched. The result is always a fresh, non-nil slice.
// It backs the AI layer's bounded "propose deviations to the dir-set"
// affordance.
func mergeDirs(base, extra []string) []string {
	merged := make([]string, 0, len(base)+len(extra))
	merged = append(merged, base...)

	present := make(map[string]struct{}, len(base))
	for _, dir := range base {
		present[dir] = struct{}{}
	}

	for _, candidate := range extra {
		dir := strings.TrimSpace(candidate)
		if dir == "" {
			continue
		}
		if _, known := present[dir]; !known {
			present[dir] = struct{}{}
			merged = append(merged, dir)
		}
	}
	return merged
}

// aiDetect is the JSON object expected in the AI layer's reply. Category
// is the proposed category name and Confidence the model's self-reported
// confidence. Deviations carries dirs proposed in addition to Dirs;
// parseAIDetect folds them into Dirs after decoding.
type aiDetect struct {
	Category      string   `json:"category"`
	Confidence    float64  `json:"confidence"`
	Dirs          []string `json:"dirs"`
	Justification string   `json:"justification"`
	Deviations    []string `json:"deviations"`
}

// parseAIDetect extracts the first JSON object from the model's text
// (which may wrap it in prose or a code fence) and unmarshals it. The
// second result is false when the text contains no '{' or the object
// starting there does not decode into aiDetect.
//
// Decoding starts at the first '{' and stops at the end of that one JSON
// value. Whatever follows it (a closing code fence, more prose, even
// text containing further braces) is ignored rather than being allowed
// to invalidate an otherwise well-formed reply. Deviations are merged
// into Dirs before returning.
func parseAIDetect(raw string) (aiDetect, bool) {
	start := strings.IndexByte(raw, '{')
	if start < 0 {
		return aiDetect{}, false
	}
	var d aiDetect
	dec := json.NewDecoder(strings.NewReader(raw[start:]))
	if err := dec.Decode(&d); err != nil {
		return aiDetect{}, false
	}
	d.Dirs = mergeDirs(d.Dirs, d.Deviations)
	return d, true
}

// topThree builds the shortlist for the operator re-prompt after a
// low-confidence AI result: the AI's category first when the catalog
// knows it, then further catalog categories in catalog order until three
// are collected. Generic is never added as a filler, and the first pick
// is not repeated.
func topThree(cat *Catalog, d aiDetect) []Category {
	const limit = 3

	var picks []Category
	if aiPick := Category(d.Category); cat.Has(aiPick) {
		picks = append(picks, aiPick)
	}
	for _, candidate := range cat.Categories() {
		if len(picks) >= limit {
			break
		}
		if candidate == Generic {
			continue
		}
		if len(picks) > 0 && candidate == picks[0] {
			continue
		}
		picks = append(picks, candidate)
	}
	return picks
}