package main import ( "crypto/hmac" "crypto/sha256" "encoding/hex" "encoding/json" "fmt" "io" "log" "net/http" "os" "os/exec" "path/filepath" "strings" "time" "unicode/utf8" "gitea.rspworks.tech/rpert/gitea-search/internal/gitea" "gitea.rspworks.tech/rpert/gitea-search/internal/meili" ) const version = "1.0.0" // Files and directories to skip during indexing. var skipDirs = map[string]bool{ ".git": true, "node_modules": true, "vendor": true, "__pycache__": true, ".venv": true, ".tox": true, "dist": true, "build": true, } var skipExtensions = map[string]bool{ ".png": true, ".jpg": true, ".jpeg": true, ".gif": true, ".svg": true, ".ico": true, ".webp": true, ".bmp": true, ".tiff": true, ".zip": true, ".tar": true, ".gz": true, ".bz2": true, ".xz": true, ".rar": true, ".7z": true, ".woff": true, ".woff2": true, ".ttf": true, ".eot": true, ".exe": true, ".dll": true, ".so": true, ".dylib": true, ".pdf": true, ".doc": true, ".docx": true, ".mp3": true, ".mp4": true, ".wav": true, ".flac": true, ".lock": true, ".min.js": true, ".min.css": true, } const maxFileSize = 50 * 1024 // 50KB func main() { if len(os.Args) < 2 { fmt.Fprintf(os.Stderr, "Usage: indexer [args]\n") fmt.Fprintf(os.Stderr, "Commands:\n") fmt.Fprintf(os.Stderr, " full Full re-index of all repos\n") fmt.Fprintf(os.Stderr, " repo Re-index a single repo\n") fmt.Fprintf(os.Stderr, " webhook Start webhook HTTP server\n") fmt.Fprintf(os.Stderr, " search CLI search for testing\n") os.Exit(1) } switch os.Args[1] { case "full": cmdFull() case "repo": if len(os.Args) < 3 { log.Fatal("Usage: indexer repo ") } cmdRepo(os.Args[2]) case "webhook": cmdWebhook() case "search": if len(os.Args) < 3 { log.Fatal("Usage: indexer search [--repo=X] [--type=X] [--limit=N]") } cmdSearch(os.Args[2:]) default: log.Fatalf("Unknown command: %s", os.Args[1]) } } func newClients() (*gitea.Client, *meili.Client) { giteaURL := envOr("GITEA_URL", "https://gitea.rspworks.tech") giteaToken := os.Getenv("GITEA_TOKEN") if giteaToken == "" { log.Fatal("GITEA_TOKEN is required") } meiliURL := envOr("MEILI_URL", "http://localhost:7700") meiliKey := os.Getenv("MEILI_KEY") indexName := envOr("INDEX_NAME", "gitea-code") gc := gitea.NewClient(giteaURL, giteaToken) mc, err := meili.NewClient(meiliURL, meiliKey, indexName) if err != nil { log.Fatalf("connecting to MeiliSearch: %v", err) } return gc, mc } // cmdFull clones all repos and indexes everything. func cmdFull() { gc, mc := newClients() log.Println("Fetching repo list from Gitea...") repos, err := gc.ListAllRepos() if err != nil { log.Fatalf("listing repos: %v", err) } log.Printf("Found %d repos", len(repos)) // Clear existing index for full reindex log.Println("Clearing existing index...") if err := mc.DeleteAll(); err != nil { log.Fatalf("clearing index: %v", err) } tmpBase, err := os.MkdirTemp("", "gitea-indexer-*") if err != nil { log.Fatalf("creating temp dir: %v", err) } defer os.RemoveAll(tmpBase) var totalDocs int for _, repo := range repos { docs, err := indexRepo(gc, repo, tmpBase) if err != nil { log.Printf("ERROR indexing %s: %v", repo.FullName, err) continue } if len(docs) == 0 { log.Printf(" %s: no indexable files", repo.FullName) continue } if err := mc.IndexDocuments(docs); err != nil { log.Printf("ERROR pushing %s to MeiliSearch: %v", repo.FullName, err) continue } totalDocs += len(docs) log.Printf(" %s: indexed %d files", repo.FullName, len(docs)) } log.Printf("Done. Total: %d documents from %d repos", totalDocs, len(repos)) } // cmdRepo re-indexes a single repo. func cmdRepo(fullName string) { gc, mc := newClients() repo, err := gc.GetRepo(fullName) if err != nil { log.Fatalf("fetching repo %s: %v", fullName, err) } // Delete existing docs for this repo log.Printf("Deleting existing documents for %s...", fullName) if err := mc.DeleteByRepo(fullName); err != nil { log.Fatalf("deleting documents: %v", err) } tmpBase, err := os.MkdirTemp("", "gitea-indexer-*") if err != nil { log.Fatalf("creating temp dir: %v", err) } defer os.RemoveAll(tmpBase) docs, err := indexRepo(gc, *repo, tmpBase) if err != nil { log.Fatalf("indexing %s: %v", fullName, err) } if len(docs) == 0 { log.Printf("%s: no indexable files", fullName) return } if err := mc.IndexDocuments(docs); err != nil { log.Fatalf("pushing to MeiliSearch: %v", err) } log.Printf("Indexed %d files from %s", len(docs), fullName) } // cmdWebhook starts an HTTP server for Gitea push webhooks. func cmdWebhook() { gc, mc := newClients() webhookSecret := os.Getenv("WEBHOOK_SECRET") mux := http.NewServeMux() mux.HandleFunc("/webhook", func(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return } body, err := io.ReadAll(r.Body) if err != nil { http.Error(w, "error reading body", http.StatusBadRequest) return } // Validate HMAC signature if secret is configured if webhookSecret != "" { sig := r.Header.Get("X-Gitea-Signature") if !validateSignature(body, sig, webhookSecret) { log.Printf("Invalid webhook signature") http.Error(w, "invalid signature", http.StatusUnauthorized) return } } var payload struct { Repository struct { FullName string `json:"full_name"` } `json:"repository"` } if err := json.Unmarshal(body, &payload); err != nil { log.Printf("Error parsing webhook payload: %v", err) http.Error(w, "invalid payload", http.StatusBadRequest) return } repoName := payload.Repository.FullName if repoName == "" { http.Error(w, "missing repository name", http.StatusBadRequest) return } log.Printf("Webhook: re-indexing %s", repoName) w.WriteHeader(http.StatusAccepted) fmt.Fprintf(w, "accepted: %s\n", repoName) // Re-index in background go func() { repo, err := gc.GetRepo(repoName) if err != nil { log.Printf("ERROR fetching %s: %v", repoName, err) return } if err := mc.DeleteByRepo(repoName); err != nil { log.Printf("ERROR deleting docs for %s: %v", repoName, err) return } tmpBase, err := os.MkdirTemp("", "gitea-indexer-*") if err != nil { log.Printf("ERROR creating temp dir: %v", err) return } defer os.RemoveAll(tmpBase) docs, err := indexRepo(gc, *repo, tmpBase) if err != nil { log.Printf("ERROR indexing %s: %v", repoName, err) return } if err := mc.IndexDocuments(docs); err != nil { log.Printf("ERROR pushing %s: %v", repoName, err) return } log.Printf("Webhook: re-indexed %s (%d files)", repoName, len(docs)) }() }) mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) fmt.Fprint(w, "ok") }) addr := envOr("LISTEN_ADDR", ":8080") log.Printf("Webhook server listening on %s", addr) if err := http.ListenAndServe(addr, mux); err != nil { log.Fatalf("server error: %v", err) } } // cmdSearch runs a CLI search for testing. func cmdSearch(args []string) { _, mc := newClients() query := args[0] var repo, filetype string var limit int64 = 10 for _, arg := range args[1:] { switch { case strings.HasPrefix(arg, "--repo="): repo = strings.TrimPrefix(arg, "--repo=") case strings.HasPrefix(arg, "--type="): filetype = strings.TrimPrefix(arg, "--type=") case strings.HasPrefix(arg, "--limit="): fmt.Sscanf(strings.TrimPrefix(arg, "--limit="), "%d", &limit) } } results, err := mc.Search(query, repo, filetype, limit) if err != nil { log.Fatalf("search error: %v", err) } if len(results) == 0 { fmt.Printf("No results for %q\n", query) return } for i, r := range results { fmt.Printf("%d. %s — %s\n", i+1, r.Repo, r.Path) if r.Snippet != "" { fmt.Printf(" %s\n", r.Snippet) } fmt.Println() } } // indexRepo clones a single repo and extracts indexable documents. func indexRepo(gc *gitea.Client, repo gitea.Repo, tmpBase string) ([]meili.Document, error) { cloneURL := gc.AuthenticatedCloneURL(repo) repoDir := filepath.Join(tmpBase, strings.ReplaceAll(repo.FullName, "/", "_")) cmd := exec.Command("git", "clone", "--depth", "1", "--single-branch", "--branch", repo.DefaultBranch, cloneURL, repoDir) cmd.Stdout = io.Discard cmd.Stderr = io.Discard if err := cmd.Run(); err != nil { return nil, fmt.Errorf("cloning %s: %w", repo.FullName, err) } var docs []meili.Document now := time.Now().Unix() err := filepath.Walk(repoDir, func(path string, info os.FileInfo, err error) error { if err != nil { return nil // skip errors } // Skip directories if info.IsDir() { if skipDirs[info.Name()] { return filepath.SkipDir } return nil } // Skip by extension ext := strings.ToLower(filepath.Ext(info.Name())) if skipExtensions[ext] { return nil } // Check compound extensions like .min.js base := strings.ToLower(info.Name()) if strings.HasSuffix(base, ".min.js") || strings.HasSuffix(base, ".min.css") { return nil } // Skip large files if info.Size() > maxFileSize { return nil } // Skip empty files if info.Size() == 0 { return nil } // Read file content, err := os.ReadFile(path) if err != nil { return nil // skip unreadable } // Skip binary files (check for null bytes in first 512 bytes) checkLen := 512 if len(content) < checkLen { checkLen = len(content) } for _, b := range content[:checkLen] { if b == 0 { return nil } } // Skip non-UTF8 if !utf8.Valid(content) { return nil } relPath, _ := filepath.Rel(repoDir, path) if ext != "" { ext = ext[1:] // strip leading dot } docs = append(docs, meili.Document{ ID: meili.DocumentID(repo.FullName, repo.DefaultBranch, relPath), Repo: repo.FullName, Branch: repo.DefaultBranch, Path: relPath, Filename: info.Name(), Extension: ext, Content: string(content), Language: langFromExt(ext), UpdatedAt: now, }) return nil }) // Clean up clone os.RemoveAll(repoDir) return docs, err } func validateSignature(body []byte, signature, secret string) bool { mac := hmac.New(sha256.New, []byte(secret)) mac.Write(body) expected := hex.EncodeToString(mac.Sum(nil)) return hmac.Equal([]byte(expected), []byte(signature)) } func langFromExt(ext string) string { switch ext { case "go": return "go" case "py": return "python" case "js", "jsx": return "javascript" case "ts", "tsx": return "typescript" case "sh", "bash": return "shell" case "yaml", "yml": return "yaml" case "json": return "json" case "md": return "markdown" case "html", "htm": return "html" case "css": return "css" case "sql": return "sql" case "rs": return "rust" case "rb": return "ruby" case "conf", "cfg", "ini", "toml": return "config" default: return "" } } func envOr(key, fallback string) string { if v := os.Getenv(key); v != "" { return v } return fallback }