Add cmd/ entrypoints, fix auth, deploy to K8s
- cmd/indexer/main.go: CLI with full/repo/webhook/search subcommands. Clones repos via the Gitea API, walks files, and indexes them into MeiliSearch. Webhook HTTP server on :8080 for real-time push reindexing.
- cmd/mcp-server/main.go: MCP stdio server wiring the meili + mcp packages.
- internal/gitea/client.go: Use the Authorization header instead of the ?token= query param (required by the current Gitea API).
- k8s/indexer-cronjob.yaml: Remove embedded secret (foot-gun), pin image to v1.0.1, add imagePullPolicy: IfNotPresent.
- .gitignore: Anchor binary patterns to root so cmd/ dirs aren't ignored.

Deployed: 1,003 documents from 39 repos indexed in 83s. Global Gitea webhook configured for real-time reindexing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
466
cmd/indexer/main.go
Normal file
466
cmd/indexer/main.go
Normal file
@@ -0,0 +1,466 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"crypto/hmac"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode/utf8"
|
||||
|
||||
"gitea.rspworks.tech/rpert/gitea-search/internal/gitea"
|
||||
"gitea.rspworks.tech/rpert/gitea-search/internal/meili"
|
||||
)
|
||||
|
||||
// version is the indexer's version string.
// NOTE(review): nothing in the visible part of this file references it — confirm
// it is still needed and kept in sync with the released image tag.
const version = "1.0.0"
|
||||
|
||||
// skipDirs is the set of directory names that are pruned while walking a
// cloned repository: a directory whose base name is a key here is not
// descended into. Lookups for any other name yield false.
var skipDirs = func() map[string]bool {
	names := []string{
		".git",
		"node_modules",
		"vendor",
		"__pycache__",
		".venv",
		".tox",
		"dist",
		"build",
	}
	set := make(map[string]bool, len(names))
	for _, name := range names {
		set[name] = true
	}
	return set
}()
|
||||
|
||||
// skipExtensions is the set of lower-case file extensions (leading dot
// included) whose files are never indexed: images, archives, fonts, native
// binaries, office documents, audio/video, and lock files.
//
// The compound ".min.js" / ".min.css" keys cannot be hit by a lookup keyed on
// filepath.Ext, which only yields the final suffix (".js" / ".css"); minified
// assets are therefore matched separately by file-name suffix.
var skipExtensions = func() map[string]bool {
	groups := [][]string{
		{".png", ".jpg", ".jpeg", ".gif", ".svg"},
		{".ico", ".webp", ".bmp", ".tiff"},
		{".zip", ".tar", ".gz", ".bz2", ".xz"},
		{".rar", ".7z"},
		{".woff", ".woff2", ".ttf", ".eot"},
		{".exe", ".dll", ".so", ".dylib"},
		{".pdf", ".doc", ".docx"},
		{".mp3", ".mp4", ".wav", ".flac"},
		{".lock"},
		{".min.js", ".min.css"},
	}
	set := make(map[string]bool)
	for _, group := range groups {
		for _, ext := range group {
			set[ext] = true
		}
	}
	return set
}()
|
||||
|
||||
// maxFileSize is the largest file size, in bytes, that is still indexed;
// files whose reported size exceeds it are skipped.
const maxFileSize = 50 * 1024 // 50 KiB
|
||||
|
||||
func main() {
|
||||
if len(os.Args) < 2 {
|
||||
fmt.Fprintf(os.Stderr, "Usage: indexer <command> [args]\n")
|
||||
fmt.Fprintf(os.Stderr, "Commands:\n")
|
||||
fmt.Fprintf(os.Stderr, " full Full re-index of all repos\n")
|
||||
fmt.Fprintf(os.Stderr, " repo <owner/name> Re-index a single repo\n")
|
||||
fmt.Fprintf(os.Stderr, " webhook Start webhook HTTP server\n")
|
||||
fmt.Fprintf(os.Stderr, " search <query> CLI search for testing\n")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
switch os.Args[1] {
|
||||
case "full":
|
||||
cmdFull()
|
||||
case "repo":
|
||||
if len(os.Args) < 3 {
|
||||
log.Fatal("Usage: indexer repo <owner/name>")
|
||||
}
|
||||
cmdRepo(os.Args[2])
|
||||
case "webhook":
|
||||
cmdWebhook()
|
||||
case "search":
|
||||
if len(os.Args) < 3 {
|
||||
log.Fatal("Usage: indexer search <query> [--repo=X] [--type=X] [--limit=N]")
|
||||
}
|
||||
cmdSearch(os.Args[2:])
|
||||
default:
|
||||
log.Fatalf("Unknown command: %s", os.Args[1])
|
||||
}
|
||||
}
|
||||
|
||||
func newClients() (*gitea.Client, *meili.Client) {
|
||||
giteaURL := envOr("GITEA_URL", "https://gitea.rspworks.tech")
|
||||
giteaToken := os.Getenv("GITEA_TOKEN")
|
||||
if giteaToken == "" {
|
||||
log.Fatal("GITEA_TOKEN is required")
|
||||
}
|
||||
|
||||
meiliURL := envOr("MEILI_URL", "http://localhost:7700")
|
||||
meiliKey := os.Getenv("MEILI_KEY")
|
||||
indexName := envOr("INDEX_NAME", "gitea-code")
|
||||
|
||||
gc := gitea.NewClient(giteaURL, giteaToken)
|
||||
|
||||
mc, err := meili.NewClient(meiliURL, meiliKey, indexName)
|
||||
if err != nil {
|
||||
log.Fatalf("connecting to MeiliSearch: %v", err)
|
||||
}
|
||||
|
||||
return gc, mc
|
||||
}
|
||||
|
||||
// cmdFull clones all repos and indexes everything.
|
||||
func cmdFull() {
|
||||
gc, mc := newClients()
|
||||
|
||||
log.Println("Fetching repo list from Gitea...")
|
||||
repos, err := gc.ListAllRepos()
|
||||
if err != nil {
|
||||
log.Fatalf("listing repos: %v", err)
|
||||
}
|
||||
log.Printf("Found %d repos", len(repos))
|
||||
|
||||
// Clear existing index for full reindex
|
||||
log.Println("Clearing existing index...")
|
||||
if err := mc.DeleteAll(); err != nil {
|
||||
log.Fatalf("clearing index: %v", err)
|
||||
}
|
||||
|
||||
tmpBase, err := os.MkdirTemp("", "gitea-indexer-*")
|
||||
if err != nil {
|
||||
log.Fatalf("creating temp dir: %v", err)
|
||||
}
|
||||
defer os.RemoveAll(tmpBase)
|
||||
|
||||
var totalDocs int
|
||||
for _, repo := range repos {
|
||||
docs, err := indexRepo(gc, repo, tmpBase)
|
||||
if err != nil {
|
||||
log.Printf("ERROR indexing %s: %v", repo.FullName, err)
|
||||
continue
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
log.Printf(" %s: no indexable files", repo.FullName)
|
||||
continue
|
||||
}
|
||||
|
||||
if err := mc.IndexDocuments(docs); err != nil {
|
||||
log.Printf("ERROR pushing %s to MeiliSearch: %v", repo.FullName, err)
|
||||
continue
|
||||
}
|
||||
totalDocs += len(docs)
|
||||
log.Printf(" %s: indexed %d files", repo.FullName, len(docs))
|
||||
}
|
||||
|
||||
log.Printf("Done. Total: %d documents from %d repos", totalDocs, len(repos))
|
||||
}
|
||||
|
||||
// cmdRepo re-indexes a single repo.
|
||||
func cmdRepo(fullName string) {
|
||||
gc, mc := newClients()
|
||||
|
||||
repo, err := gc.GetRepo(fullName)
|
||||
if err != nil {
|
||||
log.Fatalf("fetching repo %s: %v", fullName, err)
|
||||
}
|
||||
|
||||
// Delete existing docs for this repo
|
||||
log.Printf("Deleting existing documents for %s...", fullName)
|
||||
if err := mc.DeleteByRepo(fullName); err != nil {
|
||||
log.Fatalf("deleting documents: %v", err)
|
||||
}
|
||||
|
||||
tmpBase, err := os.MkdirTemp("", "gitea-indexer-*")
|
||||
if err != nil {
|
||||
log.Fatalf("creating temp dir: %v", err)
|
||||
}
|
||||
defer os.RemoveAll(tmpBase)
|
||||
|
||||
docs, err := indexRepo(gc, *repo, tmpBase)
|
||||
if err != nil {
|
||||
log.Fatalf("indexing %s: %v", fullName, err)
|
||||
}
|
||||
|
||||
if len(docs) == 0 {
|
||||
log.Printf("%s: no indexable files", fullName)
|
||||
return
|
||||
}
|
||||
|
||||
if err := mc.IndexDocuments(docs); err != nil {
|
||||
log.Fatalf("pushing to MeiliSearch: %v", err)
|
||||
}
|
||||
log.Printf("Indexed %d files from %s", len(docs), fullName)
|
||||
}
|
||||
|
||||
// cmdWebhook starts an HTTP server for Gitea push webhooks.
|
||||
func cmdWebhook() {
|
||||
gc, mc := newClients()
|
||||
webhookSecret := os.Getenv("WEBHOOK_SECRET")
|
||||
|
||||
mux := http.NewServeMux()
|
||||
|
||||
mux.HandleFunc("/webhook", func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(r.Body)
|
||||
if err != nil {
|
||||
http.Error(w, "error reading body", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Validate HMAC signature if secret is configured
|
||||
if webhookSecret != "" {
|
||||
sig := r.Header.Get("X-Gitea-Signature")
|
||||
if !validateSignature(body, sig, webhookSecret) {
|
||||
log.Printf("Invalid webhook signature")
|
||||
http.Error(w, "invalid signature", http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
var payload struct {
|
||||
Repository struct {
|
||||
FullName string `json:"full_name"`
|
||||
} `json:"repository"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &payload); err != nil {
|
||||
log.Printf("Error parsing webhook payload: %v", err)
|
||||
http.Error(w, "invalid payload", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
repoName := payload.Repository.FullName
|
||||
if repoName == "" {
|
||||
http.Error(w, "missing repository name", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
log.Printf("Webhook: re-indexing %s", repoName)
|
||||
w.WriteHeader(http.StatusAccepted)
|
||||
fmt.Fprintf(w, "accepted: %s\n", repoName)
|
||||
|
||||
// Re-index in background
|
||||
go func() {
|
||||
repo, err := gc.GetRepo(repoName)
|
||||
if err != nil {
|
||||
log.Printf("ERROR fetching %s: %v", repoName, err)
|
||||
return
|
||||
}
|
||||
|
||||
if err := mc.DeleteByRepo(repoName); err != nil {
|
||||
log.Printf("ERROR deleting docs for %s: %v", repoName, err)
|
||||
return
|
||||
}
|
||||
|
||||
tmpBase, err := os.MkdirTemp("", "gitea-indexer-*")
|
||||
if err != nil {
|
||||
log.Printf("ERROR creating temp dir: %v", err)
|
||||
return
|
||||
}
|
||||
defer os.RemoveAll(tmpBase)
|
||||
|
||||
docs, err := indexRepo(gc, *repo, tmpBase)
|
||||
if err != nil {
|
||||
log.Printf("ERROR indexing %s: %v", repoName, err)
|
||||
return
|
||||
}
|
||||
|
||||
if err := mc.IndexDocuments(docs); err != nil {
|
||||
log.Printf("ERROR pushing %s: %v", repoName, err)
|
||||
return
|
||||
}
|
||||
log.Printf("Webhook: re-indexed %s (%d files)", repoName, len(docs))
|
||||
}()
|
||||
})
|
||||
|
||||
mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
fmt.Fprint(w, "ok")
|
||||
})
|
||||
|
||||
addr := envOr("LISTEN_ADDR", ":8080")
|
||||
log.Printf("Webhook server listening on %s", addr)
|
||||
if err := http.ListenAndServe(addr, mux); err != nil {
|
||||
log.Fatalf("server error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// cmdSearch runs a CLI search for testing.
|
||||
func cmdSearch(args []string) {
|
||||
_, mc := newClients()
|
||||
|
||||
query := args[0]
|
||||
var repo, filetype string
|
||||
var limit int64 = 10
|
||||
|
||||
for _, arg := range args[1:] {
|
||||
switch {
|
||||
case strings.HasPrefix(arg, "--repo="):
|
||||
repo = strings.TrimPrefix(arg, "--repo=")
|
||||
case strings.HasPrefix(arg, "--type="):
|
||||
filetype = strings.TrimPrefix(arg, "--type=")
|
||||
case strings.HasPrefix(arg, "--limit="):
|
||||
fmt.Sscanf(strings.TrimPrefix(arg, "--limit="), "%d", &limit)
|
||||
}
|
||||
}
|
||||
|
||||
results, err := mc.Search(query, repo, filetype, limit)
|
||||
if err != nil {
|
||||
log.Fatalf("search error: %v", err)
|
||||
}
|
||||
|
||||
if len(results) == 0 {
|
||||
fmt.Printf("No results for %q\n", query)
|
||||
return
|
||||
}
|
||||
|
||||
for i, r := range results {
|
||||
fmt.Printf("%d. %s — %s\n", i+1, r.Repo, r.Path)
|
||||
if r.Snippet != "" {
|
||||
fmt.Printf(" %s\n", r.Snippet)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
|
||||
// indexRepo clones a single repo and extracts indexable documents.
|
||||
func indexRepo(gc *gitea.Client, repo gitea.Repo, tmpBase string) ([]meili.Document, error) {
|
||||
cloneURL := gc.AuthenticatedCloneURL(repo)
|
||||
repoDir := filepath.Join(tmpBase, strings.ReplaceAll(repo.FullName, "/", "_"))
|
||||
|
||||
cmd := exec.Command("git", "clone", "--depth", "1", "--single-branch",
|
||||
"--branch", repo.DefaultBranch, cloneURL, repoDir)
|
||||
cmd.Stdout = io.Discard
|
||||
cmd.Stderr = io.Discard
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
return nil, fmt.Errorf("cloning %s: %w", repo.FullName, err)
|
||||
}
|
||||
|
||||
var docs []meili.Document
|
||||
now := time.Now().Unix()
|
||||
|
||||
err := filepath.Walk(repoDir, func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return nil // skip errors
|
||||
}
|
||||
|
||||
// Skip directories
|
||||
if info.IsDir() {
|
||||
if skipDirs[info.Name()] {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Skip by extension
|
||||
ext := strings.ToLower(filepath.Ext(info.Name()))
|
||||
if skipExtensions[ext] {
|
||||
return nil
|
||||
}
|
||||
// Check compound extensions like .min.js
|
||||
base := strings.ToLower(info.Name())
|
||||
if strings.HasSuffix(base, ".min.js") || strings.HasSuffix(base, ".min.css") {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Skip large files
|
||||
if info.Size() > maxFileSize {
|
||||
return nil
|
||||
}
|
||||
// Skip empty files
|
||||
if info.Size() == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Read file
|
||||
content, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil // skip unreadable
|
||||
}
|
||||
|
||||
// Skip binary files (check for null bytes in first 512 bytes)
|
||||
checkLen := 512
|
||||
if len(content) < checkLen {
|
||||
checkLen = len(content)
|
||||
}
|
||||
for _, b := range content[:checkLen] {
|
||||
if b == 0 {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// Skip non-UTF8
|
||||
if !utf8.Valid(content) {
|
||||
return nil
|
||||
}
|
||||
|
||||
relPath, _ := filepath.Rel(repoDir, path)
|
||||
if ext != "" {
|
||||
ext = ext[1:] // strip leading dot
|
||||
}
|
||||
|
||||
docs = append(docs, meili.Document{
|
||||
ID: meili.DocumentID(repo.FullName, repo.DefaultBranch, relPath),
|
||||
Repo: repo.FullName,
|
||||
Branch: repo.DefaultBranch,
|
||||
Path: relPath,
|
||||
Filename: info.Name(),
|
||||
Extension: ext,
|
||||
Content: string(content),
|
||||
Language: langFromExt(ext),
|
||||
UpdatedAt: now,
|
||||
})
|
||||
|
||||
return nil
|
||||
})
|
||||
|
||||
// Clean up clone
|
||||
os.RemoveAll(repoDir)
|
||||
|
||||
return docs, err
|
||||
}
|
||||
|
||||
// validateSignature reports whether signature equals the lower-case hex
// encoding of the HMAC-SHA256 of body keyed with secret. The comparison is
// done with hmac.Equal and is case-sensitive, so an upper-case hex signature
// does not match.
func validateSignature(body []byte, signature, secret string) bool {
	h := hmac.New(sha256.New, []byte(secret))
	h.Write(body)
	digest := h.Sum(nil)

	want := make([]byte, hex.EncodedLen(len(digest)))
	hex.Encode(want, digest)

	return hmac.Equal(want, []byte(signature))
}
|
||||
|
||||
// languageByExt maps a file extension (no leading dot, matched exactly) to the
// language label stored with an indexed document.
var languageByExt = map[string]string{
	"go":   "go",
	"py":   "python",
	"js":   "javascript",
	"jsx":  "javascript",
	"ts":   "typescript",
	"tsx":  "typescript",
	"sh":   "shell",
	"bash": "shell",
	"yaml": "yaml",
	"yml":  "yaml",
	"json": "json",
	"md":   "markdown",
	"html": "html",
	"htm":  "html",
	"css":  "css",
	"sql":  "sql",
	"rs":   "rust",
	"rb":   "ruby",
	"conf": "config",
	"cfg":  "config",
	"ini":  "config",
	"toml": "config",
}

// langFromExt returns the language label for ext, an extension without its
// leading dot. Unknown extensions, including the empty string, yield "".
func langFromExt(ext string) string {
	// A missing key yields the zero value "", which is the "unknown" result.
	return languageByExt[ext]
}
|
||||
|
||||
// envOr returns the value of the environment variable key, or fallback when
// the variable is unset or set to the empty string.
func envOr(key, fallback string) string {
	value := os.Getenv(key)
	if value == "" {
		return fallback
	}
	return value
}
|
||||
Reference in New Issue
Block a user