Files
gitea-search/cmd/indexer/main.go
Raymond Scott Pert 74b894fea0 Add cmd/ entrypoints, fix auth, deploy to K8s
- cmd/indexer/main.go: CLI with full/repo/webhook/search subcommands
  Clones repos via Gitea API, walks files, indexes to MeiliSearch.
  Webhook HTTP server on :8080 for real-time push reindexing.
- cmd/mcp-server/main.go: MCP stdio server wiring meili + mcp packages
- internal/gitea/client.go: Use Authorization header instead of ?token=
  query param (required by current Gitea API)
- k8s/indexer-cronjob.yaml: Remove embedded secret (foot-gun),
  pin image to v1.0.1, add imagePullPolicy: IfNotPresent
- .gitignore: Anchor binary patterns to root so cmd/ dirs aren't ignored

Deployed: 1,003 documents from 39 repos indexed in 83s.
Global Gitea webhook configured for real-time reindexing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 04:55:05 +00:00

467 lines
11 KiB
Go

package main
import (
"crypto/hmac"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"unicode/utf8"
"gitea.rspworks.tech/rpert/gitea-search/internal/gitea"
"gitea.rspworks.tech/rpert/gitea-search/internal/meili"
)
// version is the indexer's version string. It is not referenced by the code
// visible in this file.
const version = "1.0.0"
// skipDirs is the set of directory names the repository walk refuses to
// descend into. indexRepo looks up each directory's base name here, so an
// entry matches at any depth of the tree.
var skipDirs = map[string]bool{
	// Version-control metadata.
	".git": true,

	// Third-party dependency trees.
	"node_modules": true,
	"vendor":       true,

	// Python caches and environments.
	"__pycache__": true,
	".tox":        true,
	".venv":       true,

	// Build output.
	"build": true,
	"dist":  true,
}
// skipExtensions is the set of file extensions (lower-case, with the leading
// dot) that are never indexed. indexRepo looks up the lower-cased result of
// filepath.Ext here.
//
// The compound entries ".min.js" and ".min.css" cannot match that lookup,
// because filepath.Ext only returns the final suffix; indexRepo filters those
// names with a separate suffix check.
var skipExtensions = map[string]bool{
	// Images.
	".png":  true,
	".jpg":  true,
	".jpeg": true,
	".gif":  true,
	".svg":  true,
	".ico":  true,
	".webp": true,
	".bmp":  true,
	".tiff": true,

	// Archives and compressed files.
	".zip": true,
	".tar": true,
	".gz":  true,
	".bz2": true,
	".xz":  true,
	".rar": true,
	".7z":  true,

	// Fonts.
	".woff":  true,
	".woff2": true,
	".ttf":   true,
	".eot":   true,

	// Executables and shared libraries.
	".exe":   true,
	".dll":   true,
	".so":    true,
	".dylib": true,

	// Office-style documents.
	".pdf":  true,
	".doc":  true,
	".docx": true,

	// Audio and video.
	".mp3":  true,
	".mp4":  true,
	".wav":  true,
	".flac": true,

	// Lock files.
	".lock": true,

	// Minified assets (see the note above about compound extensions).
	".min.js":  true,
	".min.css": true,
}
// maxFileSize is the largest file size, in bytes, that indexRepo will index
// (50 KiB); larger files are skipped.
const maxFileSize = 50 << 10
// main dispatches on the first command-line argument to one of the indexer
// subcommands. With no subcommand it prints usage to stderr and exits with
// status 1; an unknown subcommand or a missing required argument is fatal.
func main() {
	if len(os.Args) < 2 {
		const usage = "Usage: indexer <command> [args]\n" +
			"Commands:\n" +
			" full Full re-index of all repos\n" +
			" repo <owner/name> Re-index a single repo\n" +
			" webhook Start webhook HTTP server\n" +
			" search <query> CLI search for testing\n"
		fmt.Fprint(os.Stderr, usage)
		os.Exit(1)
	}

	command, rest := os.Args[1], os.Args[2:]
	switch command {
	case "full":
		cmdFull()
	case "webhook":
		cmdWebhook()
	case "repo":
		if len(rest) == 0 {
			log.Fatal("Usage: indexer repo <owner/name>")
		}
		cmdRepo(rest[0])
	case "search":
		if len(rest) == 0 {
			log.Fatal("Usage: indexer search <query> [--repo=X] [--type=X] [--limit=N]")
		}
		cmdSearch(rest)
	default:
		log.Fatalf("Unknown command: %s", command)
	}
}
// newClients builds the Gitea and MeiliSearch clients from the environment.
//
// Environment variables read:
//   - GITEA_URL: Gitea base URL (default "https://gitea.rspworks.tech")
//   - GITEA_TOKEN: required; the process exits if it is empty
//   - MEILI_URL: MeiliSearch URL (default "http://localhost:7700")
//   - MEILI_KEY: passed through as-is, may be empty
//   - INDEX_NAME: index name (default "gitea-code")
//
// A missing token or a MeiliSearch client error terminates the process.
func newClients() (*gitea.Client, *meili.Client) {
	token := os.Getenv("GITEA_TOKEN")
	if token == "" {
		log.Fatal("GITEA_TOKEN is required")
	}

	giteaClient := gitea.NewClient(envOr("GITEA_URL", "https://gitea.rspworks.tech"), token)

	searchClient, err := meili.NewClient(
		envOr("MEILI_URL", "http://localhost:7700"),
		os.Getenv("MEILI_KEY"),
		envOr("INDEX_NAME", "gitea-code"),
	)
	if err != nil {
		log.Fatalf("connecting to MeiliSearch: %v", err)
	}
	return giteaClient, searchClient
}
// cmdFull rebuilds the whole index: it lists every repo, clears the index,
// then clones and indexes the repos one at a time inside a shared temporary
// directory. Failing to list repos, clear the index or create the temp
// directory is fatal; a failure on an individual repo is logged and the run
// continues with the next one.
func cmdFull() {
	giteaClient, searchClient := newClients()

	log.Println("Fetching repo list from Gitea...")
	allRepos, err := giteaClient.ListAllRepos()
	if err != nil {
		log.Fatalf("listing repos: %v", err)
	}
	log.Printf("Found %d repos", len(allRepos))

	// A full run starts from an empty index.
	log.Println("Clearing existing index...")
	if err = searchClient.DeleteAll(); err != nil {
		log.Fatalf("clearing index: %v", err)
	}

	workDir, err := os.MkdirTemp("", "gitea-indexer-*")
	if err != nil {
		log.Fatalf("creating temp dir: %v", err)
	}
	defer os.RemoveAll(workDir)

	indexed := 0
	for _, r := range allRepos {
		documents, indexErr := indexRepo(giteaClient, r, workDir)
		switch {
		case indexErr != nil:
			log.Printf("ERROR indexing %s: %v", r.FullName, indexErr)
		case len(documents) == 0:
			log.Printf(" %s: no indexable files", r.FullName)
		default:
			if pushErr := searchClient.IndexDocuments(documents); pushErr != nil {
				log.Printf("ERROR pushing %s to MeiliSearch: %v", r.FullName, pushErr)
				continue
			}
			indexed += len(documents)
			log.Printf(" %s: indexed %d files", r.FullName, len(documents))
		}
	}

	// The repo count is the number of repos listed, including any that failed.
	log.Printf("Done. Total: %d documents from %d repos", indexed, len(allRepos))
}
// cmdRepo re-indexes a single repo identified by its "owner/name" full name.
//
// The repo is cloned and scanned first. Only after that succeeds are its
// existing documents deleted and the fresh set pushed, so a failed clone no
// longer wipes the repo from the index. Any error terminates the process with
// a non-zero status, but only after the temporary directory has been removed.
func cmdRepo(fullName string) {
	gc, mc := newClients()

	// The work runs in a closure that returns an error rather than calling
	// log.Fatalf in place: log.Fatalf exits through os.Exit, which skips
	// deferred calls and would leave the temp directory (and the clone inside
	// it) on disk.
	reindex := func() error {
		repo, err := gc.GetRepo(fullName)
		if err != nil {
			return fmt.Errorf("fetching repo %s: %w", fullName, err)
		}

		tmpBase, err := os.MkdirTemp("", "gitea-indexer-*")
		if err != nil {
			return fmt.Errorf("creating temp dir: %w", err)
		}
		defer os.RemoveAll(tmpBase)

		docs, err := indexRepo(gc, *repo, tmpBase)
		if err != nil {
			return fmt.Errorf("indexing %s: %w", fullName, err)
		}

		// Drop the stale documents only now that a replacement set exists.
		log.Printf("Deleting existing documents for %s...", fullName)
		if err := mc.DeleteByRepo(fullName); err != nil {
			return fmt.Errorf("deleting documents: %w", err)
		}

		if len(docs) == 0 {
			log.Printf("%s: no indexable files", fullName)
			return nil
		}
		if err := mc.IndexDocuments(docs); err != nil {
			return fmt.Errorf("pushing to MeiliSearch: %w", err)
		}
		log.Printf("Indexed %d files from %s", len(docs), fullName)
		return nil
	}

	if err := reindex(); err != nil {
		log.Fatal(err)
	}
}
// cmdWebhook starts an HTTP server for Gitea push webhooks.
func cmdWebhook() {
gc, mc := newClients()
webhookSecret := os.Getenv("WEBHOOK_SECRET")
mux := http.NewServeMux()
mux.HandleFunc("/webhook", func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
return
}
body, err := io.ReadAll(r.Body)
if err != nil {
http.Error(w, "error reading body", http.StatusBadRequest)
return
}
// Validate HMAC signature if secret is configured
if webhookSecret != "" {
sig := r.Header.Get("X-Gitea-Signature")
if !validateSignature(body, sig, webhookSecret) {
log.Printf("Invalid webhook signature")
http.Error(w, "invalid signature", http.StatusUnauthorized)
return
}
}
var payload struct {
Repository struct {
FullName string `json:"full_name"`
} `json:"repository"`
}
if err := json.Unmarshal(body, &payload); err != nil {
log.Printf("Error parsing webhook payload: %v", err)
http.Error(w, "invalid payload", http.StatusBadRequest)
return
}
repoName := payload.Repository.FullName
if repoName == "" {
http.Error(w, "missing repository name", http.StatusBadRequest)
return
}
log.Printf("Webhook: re-indexing %s", repoName)
w.WriteHeader(http.StatusAccepted)
fmt.Fprintf(w, "accepted: %s\n", repoName)
// Re-index in background
go func() {
repo, err := gc.GetRepo(repoName)
if err != nil {
log.Printf("ERROR fetching %s: %v", repoName, err)
return
}
if err := mc.DeleteByRepo(repoName); err != nil {
log.Printf("ERROR deleting docs for %s: %v", repoName, err)
return
}
tmpBase, err := os.MkdirTemp("", "gitea-indexer-*")
if err != nil {
log.Printf("ERROR creating temp dir: %v", err)
return
}
defer os.RemoveAll(tmpBase)
docs, err := indexRepo(gc, *repo, tmpBase)
if err != nil {
log.Printf("ERROR indexing %s: %v", repoName, err)
return
}
if err := mc.IndexDocuments(docs); err != nil {
log.Printf("ERROR pushing %s: %v", repoName, err)
return
}
log.Printf("Webhook: re-indexed %s (%d files)", repoName, len(docs))
}()
})
mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
fmt.Fprint(w, "ok")
})
addr := envOr("LISTEN_ADDR", ":8080")
log.Printf("Webhook server listening on %s", addr)
if err := http.ListenAndServe(addr, mux); err != nil {
log.Fatalf("server error: %v", err)
}
}
// cmdSearch runs a search from the command line, for testing.
//
// args[0] is the query. The remaining arguments may be:
//   - --repo=X: restrict results to one repo
//   - --type=X: restrict results to one file type
//   - --limit=N: maximum number of results (positive integer, default 10)
//
// A missing query or an invalid --limit is fatal. Unrecognized arguments are
// reported and ignored. Results are printed to stdout as a numbered list of
// "repo — path" lines, each followed by its snippet when there is one.
func cmdSearch(args []string) {
	if len(args) == 0 {
		log.Fatal("Usage: indexer search <query> [--repo=X] [--type=X] [--limit=N]")
	}

	query := args[0]
	var repo, filetype string
	var limit int64 = 10
	for _, arg := range args[1:] {
		switch {
		case strings.HasPrefix(arg, "--repo="):
			repo = strings.TrimPrefix(arg, "--repo=")
		case strings.HasPrefix(arg, "--type="):
			filetype = strings.TrimPrefix(arg, "--type=")
		case strings.HasPrefix(arg, "--limit="):
			raw := strings.TrimPrefix(arg, "--limit=")
			// A malformed limit used to be silently dropped; report it instead.
			if _, err := fmt.Sscanf(raw, "%d", &limit); err != nil || limit < 1 {
				log.Fatalf("invalid --limit value %q: must be a positive integer", raw)
			}
		default:
			log.Printf("ignoring unrecognized argument %q", arg)
		}
	}

	// Clients are built only after the arguments are validated, so usage
	// errors surface before any configuration or connection error.
	_, mc := newClients()

	results, err := mc.Search(query, repo, filetype, limit)
	if err != nil {
		log.Fatalf("search error: %v", err)
	}
	if len(results) == 0 {
		fmt.Printf("No results for %q\n", query)
		return
	}
	for i, r := range results {
		fmt.Printf("%d. %s — %s\n", i+1, r.Repo, r.Path)
		if r.Snippet != "" {
			fmt.Printf(" %s\n", r.Snippet)
		}
		fmt.Println()
	}
}
// indexRepo shallow-clones the default branch of repo into a directory under
// tmpBase and returns one document per indexable file.
//
// A file is indexed only if all of the following hold:
//   - it is a regular file (symlinks and other special files are skipped)
//   - it is not inside a directory listed in skipDirs
//   - its extension is not in skipExtensions and it is not a .min.js/.min.css
//   - its size is between 1 byte and maxFileSize
//   - its first 512 bytes contain no NUL byte and the whole content is valid UTF-8
//
// Files that cannot be stat'ed or read are skipped silently. The clone
// directory is removed before returning, on success and on failure alike.
// An error is returned when the clone fails or the walk itself reports one.
func indexRepo(gc *gitea.Client, repo gitea.Repo, tmpBase string) ([]meili.Document, error) {
	// NOTE(review): going by its name, AuthenticatedCloneURL presumably embeds
	// credentials in the URL, which would expose them on the git command
	// line — confirm, and consider a credential helper instead.
	cloneURL := gc.AuthenticatedCloneURL(repo)
	repoDir := filepath.Join(tmpBase, strings.ReplaceAll(repo.FullName, "/", "_"))

	// Clean up on every exit path, including a failed or partial clone.
	defer os.RemoveAll(repoDir)

	// "--" ends option parsing so the URL can never be read as a git option.
	cmd := exec.Command("git", "clone", "--depth", "1", "--single-branch",
		"--branch", repo.DefaultBranch, "--", cloneURL, repoDir)
	cmd.Stdout = io.Discard
	cmd.Stderr = io.Discard
	if err := cmd.Run(); err != nil {
		return nil, fmt.Errorf("cloning %s: %w", repo.FullName, err)
	}

	var docs []meili.Document
	now := time.Now().Unix() // one timestamp shared by every document of this run

	err := filepath.Walk(repoDir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return nil // best effort: skip entries that cannot be stat'ed
		}

		if info.IsDir() {
			if skipDirs[info.Name()] {
				return filepath.SkipDir
			}
			return nil
		}

		// Walk does not follow symlinks, but os.ReadFile would. Without this
		// check a repo could contain a link to any file on the host and have
		// its target indexed; the size limit below would also be applied to
		// the link rather than the target.
		if !info.Mode().IsRegular() {
			return nil
		}

		// Skip by extension, including compound ones such as .min.js that
		// filepath.Ext cannot see.
		name := strings.ToLower(info.Name())
		ext := strings.ToLower(filepath.Ext(info.Name()))
		if skipExtensions[ext] || strings.HasSuffix(name, ".min.js") || strings.HasSuffix(name, ".min.css") {
			return nil
		}

		// Skip empty and oversized files.
		if size := info.Size(); size == 0 || size > maxFileSize {
			return nil
		}

		content, err := os.ReadFile(path)
		if err != nil {
			return nil // skip unreadable
		}

		// Treat a NUL byte within the first 512 bytes as a sign of binary data.
		for _, b := range content[:min(len(content), 512)] {
			if b == 0 {
				return nil
			}
		}
		if !utf8.Valid(content) {
			return nil
		}

		relPath, err := filepath.Rel(repoDir, path)
		if err != nil {
			return nil // cannot derive a stable document path; skip
		}
		// Slash-separated paths keep document IDs identical across platforms.
		relPath = filepath.ToSlash(relPath)

		ext = strings.TrimPrefix(ext, ".")

		docs = append(docs, meili.Document{
			ID:        meili.DocumentID(repo.FullName, repo.DefaultBranch, relPath),
			Repo:      repo.FullName,
			Branch:    repo.DefaultBranch,
			Path:      relPath,
			Filename:  info.Name(),
			Extension: ext,
			Content:   string(content),
			Language:  langFromExt(ext),
			UpdatedAt: now,
		})
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("walking %s: %w", repo.FullName, err)
	}
	return docs, nil
}
// validateSignature reports whether signature is the lower-case hex encoding
// of the HMAC-SHA256 of body keyed with secret. The comparison is done with
// hmac.Equal, so it does not leak timing information about the contents.
func validateSignature(body []byte, signature, secret string) bool {
	h := hmac.New(sha256.New, []byte(secret))
	h.Write(body)
	digest := h.Sum(nil)

	want := make([]byte, hex.EncodedLen(len(digest)))
	hex.Encode(want, digest)

	return hmac.Equal(want, []byte(signature))
}
// extLanguages maps a file extension (no leading dot, matched exactly) to the
// language label stored on an indexed document.
var extLanguages = map[string]string{
	"go":   "go",
	"py":   "python",
	"js":   "javascript",
	"jsx":  "javascript",
	"ts":   "typescript",
	"tsx":  "typescript",
	"sh":   "shell",
	"bash": "shell",
	"yaml": "yaml",
	"yml":  "yaml",
	"json": "json",
	"md":   "markdown",
	"html": "html",
	"htm":  "html",
	"css":  "css",
	"sql":  "sql",
	"rs":   "rust",
	"rb":   "ruby",
	"conf": "config",
	"cfg":  "config",
	"ini":  "config",
	"toml": "config",
}

// langFromExt returns the language label for ext, which is given without its
// leading dot. Unknown extensions, and the empty string, yield "".
func langFromExt(ext string) string {
	return extLanguages[ext]
}
// envOr returns the value of the environment variable key, or fallback when
// the variable is unset or set to the empty string.
func envOr(key, fallback string) string {
	value := os.Getenv(key)
	if value == "" {
		return fallback
	}
	return value
}