- Skip HMAC validation when X-Gitea-Signature header is absent (Gitea 1.25.5 doesn't send signatures for webhooks created via API) - Bump image tag to v1.0.2 - Gitea app.ini: added [webhook] ALLOWED_HOST_LIST for K8s internal - Per-repo webhooks created on all 39 repos Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
467 lines
11 KiB
Go
467 lines
11 KiB
Go
package main
|
|
|
|
import (
|
|
"crypto/hmac"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
"unicode/utf8"
|
|
|
|
"gitea.rspworks.tech/rpert/gitea-search/internal/gitea"
|
|
"gitea.rspworks.tech/rpert/gitea-search/internal/meili"
|
|
)
|
|
|
|
const (
	// version is the indexer's version string. Nothing in this file reads it.
	version = "1.0.0"

	// maxFileSize is the size limit, in bytes, above which indexRepo skips
	// a file (50 KiB).
	maxFileSize = 50 * 1024
)

// Files and directories to skip during indexing.
var (
	// skipDirs lists directory names that indexRepo refuses to descend into.
	skipDirs = map[string]bool{
		// Version control metadata.
		".git": true,
		// Dependency trees.
		"node_modules": true,
		"vendor":       true,
		// Python caches and environments.
		"__pycache__": true,
		".venv":       true,
		".tox":        true,
		// Build output.
		"dist":  true,
		"build": true,
	}

	// skipExtensions lists lower-case file extensions (leading dot included)
	// whose files are not indexed.
	skipExtensions = map[string]bool{
		// Images.
		".png":  true,
		".jpg":  true,
		".jpeg": true,
		".gif":  true,
		".svg":  true,
		".ico":  true,
		".webp": true,
		".bmp":  true,
		".tiff": true,
		// Archives.
		".zip": true,
		".tar": true,
		".gz":  true,
		".bz2": true,
		".xz":  true,
		".rar": true,
		".7z":  true,
		// Fonts.
		".woff":  true,
		".woff2": true,
		".ttf":   true,
		".eot":   true,
		// Executables and shared libraries.
		".exe":   true,
		".dll":   true,
		".so":    true,
		".dylib": true,
		// Office documents.
		".pdf":  true,
		".doc":  true,
		".docx": true,
		// Audio and video.
		".mp3":  true,
		".mp4":  true,
		".wav":  true,
		".flac": true,
		// Dependency lock files.
		".lock": true,
		// Minified assets. indexRepo also matches these two by filename
		// suffix, because filepath.Ext only yields the final ".js"/".css".
		".min.js":  true,
		".min.css": true,
	}
)
|
|
|
|
func main() {
|
|
if len(os.Args) < 2 {
|
|
fmt.Fprintf(os.Stderr, "Usage: indexer <command> [args]\n")
|
|
fmt.Fprintf(os.Stderr, "Commands:\n")
|
|
fmt.Fprintf(os.Stderr, " full Full re-index of all repos\n")
|
|
fmt.Fprintf(os.Stderr, " repo <owner/name> Re-index a single repo\n")
|
|
fmt.Fprintf(os.Stderr, " webhook Start webhook HTTP server\n")
|
|
fmt.Fprintf(os.Stderr, " search <query> CLI search for testing\n")
|
|
os.Exit(1)
|
|
}
|
|
|
|
switch os.Args[1] {
|
|
case "full":
|
|
cmdFull()
|
|
case "repo":
|
|
if len(os.Args) < 3 {
|
|
log.Fatal("Usage: indexer repo <owner/name>")
|
|
}
|
|
cmdRepo(os.Args[2])
|
|
case "webhook":
|
|
cmdWebhook()
|
|
case "search":
|
|
if len(os.Args) < 3 {
|
|
log.Fatal("Usage: indexer search <query> [--repo=X] [--type=X] [--limit=N]")
|
|
}
|
|
cmdSearch(os.Args[2:])
|
|
default:
|
|
log.Fatalf("Unknown command: %s", os.Args[1])
|
|
}
|
|
}
|
|
|
|
func newClients() (*gitea.Client, *meili.Client) {
|
|
giteaURL := envOr("GITEA_URL", "https://gitea.rspworks.tech")
|
|
giteaToken := os.Getenv("GITEA_TOKEN")
|
|
if giteaToken == "" {
|
|
log.Fatal("GITEA_TOKEN is required")
|
|
}
|
|
|
|
meiliURL := envOr("MEILI_URL", "http://localhost:7700")
|
|
meiliKey := os.Getenv("MEILI_KEY")
|
|
indexName := envOr("INDEX_NAME", "gitea-code")
|
|
|
|
gc := gitea.NewClient(giteaURL, giteaToken)
|
|
|
|
mc, err := meili.NewClient(meiliURL, meiliKey, indexName)
|
|
if err != nil {
|
|
log.Fatalf("connecting to MeiliSearch: %v", err)
|
|
}
|
|
|
|
return gc, mc
|
|
}
|
|
|
|
// cmdFull clones all repos and indexes everything.
|
|
func cmdFull() {
|
|
gc, mc := newClients()
|
|
|
|
log.Println("Fetching repo list from Gitea...")
|
|
repos, err := gc.ListAllRepos()
|
|
if err != nil {
|
|
log.Fatalf("listing repos: %v", err)
|
|
}
|
|
log.Printf("Found %d repos", len(repos))
|
|
|
|
// Clear existing index for full reindex
|
|
log.Println("Clearing existing index...")
|
|
if err := mc.DeleteAll(); err != nil {
|
|
log.Fatalf("clearing index: %v", err)
|
|
}
|
|
|
|
tmpBase, err := os.MkdirTemp("", "gitea-indexer-*")
|
|
if err != nil {
|
|
log.Fatalf("creating temp dir: %v", err)
|
|
}
|
|
defer os.RemoveAll(tmpBase)
|
|
|
|
var totalDocs int
|
|
for _, repo := range repos {
|
|
docs, err := indexRepo(gc, repo, tmpBase)
|
|
if err != nil {
|
|
log.Printf("ERROR indexing %s: %v", repo.FullName, err)
|
|
continue
|
|
}
|
|
if len(docs) == 0 {
|
|
log.Printf(" %s: no indexable files", repo.FullName)
|
|
continue
|
|
}
|
|
|
|
if err := mc.IndexDocuments(docs); err != nil {
|
|
log.Printf("ERROR pushing %s to MeiliSearch: %v", repo.FullName, err)
|
|
continue
|
|
}
|
|
totalDocs += len(docs)
|
|
log.Printf(" %s: indexed %d files", repo.FullName, len(docs))
|
|
}
|
|
|
|
log.Printf("Done. Total: %d documents from %d repos", totalDocs, len(repos))
|
|
}
|
|
|
|
// cmdRepo re-indexes a single repo.
|
|
func cmdRepo(fullName string) {
|
|
gc, mc := newClients()
|
|
|
|
repo, err := gc.GetRepo(fullName)
|
|
if err != nil {
|
|
log.Fatalf("fetching repo %s: %v", fullName, err)
|
|
}
|
|
|
|
// Delete existing docs for this repo
|
|
log.Printf("Deleting existing documents for %s...", fullName)
|
|
if err := mc.DeleteByRepo(fullName); err != nil {
|
|
log.Fatalf("deleting documents: %v", err)
|
|
}
|
|
|
|
tmpBase, err := os.MkdirTemp("", "gitea-indexer-*")
|
|
if err != nil {
|
|
log.Fatalf("creating temp dir: %v", err)
|
|
}
|
|
defer os.RemoveAll(tmpBase)
|
|
|
|
docs, err := indexRepo(gc, *repo, tmpBase)
|
|
if err != nil {
|
|
log.Fatalf("indexing %s: %v", fullName, err)
|
|
}
|
|
|
|
if len(docs) == 0 {
|
|
log.Printf("%s: no indexable files", fullName)
|
|
return
|
|
}
|
|
|
|
if err := mc.IndexDocuments(docs); err != nil {
|
|
log.Fatalf("pushing to MeiliSearch: %v", err)
|
|
}
|
|
log.Printf("Indexed %d files from %s", len(docs), fullName)
|
|
}
|
|
|
|
// cmdWebhook starts an HTTP server for Gitea push webhooks.
|
|
func cmdWebhook() {
|
|
gc, mc := newClients()
|
|
webhookSecret := os.Getenv("WEBHOOK_SECRET")
|
|
|
|
mux := http.NewServeMux()
|
|
|
|
mux.HandleFunc("/webhook", func(w http.ResponseWriter, r *http.Request) {
|
|
if r.Method != http.MethodPost {
|
|
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
|
return
|
|
}
|
|
|
|
body, err := io.ReadAll(r.Body)
|
|
if err != nil {
|
|
http.Error(w, "error reading body", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
// Validate HMAC signature if secret is configured AND header is present
|
|
if webhookSecret != "" {
|
|
sig := r.Header.Get("X-Gitea-Signature")
|
|
if sig != "" && !validateSignature(body, sig, webhookSecret) {
|
|
log.Printf("Invalid webhook signature")
|
|
http.Error(w, "invalid signature", http.StatusUnauthorized)
|
|
return
|
|
}
|
|
}
|
|
|
|
var payload struct {
|
|
Repository struct {
|
|
FullName string `json:"full_name"`
|
|
} `json:"repository"`
|
|
}
|
|
if err := json.Unmarshal(body, &payload); err != nil {
|
|
log.Printf("Error parsing webhook payload: %v", err)
|
|
http.Error(w, "invalid payload", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
repoName := payload.Repository.FullName
|
|
if repoName == "" {
|
|
http.Error(w, "missing repository name", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
log.Printf("Webhook: re-indexing %s", repoName)
|
|
w.WriteHeader(http.StatusAccepted)
|
|
fmt.Fprintf(w, "accepted: %s\n", repoName)
|
|
|
|
// Re-index in background
|
|
go func() {
|
|
repo, err := gc.GetRepo(repoName)
|
|
if err != nil {
|
|
log.Printf("ERROR fetching %s: %v", repoName, err)
|
|
return
|
|
}
|
|
|
|
if err := mc.DeleteByRepo(repoName); err != nil {
|
|
log.Printf("ERROR deleting docs for %s: %v", repoName, err)
|
|
return
|
|
}
|
|
|
|
tmpBase, err := os.MkdirTemp("", "gitea-indexer-*")
|
|
if err != nil {
|
|
log.Printf("ERROR creating temp dir: %v", err)
|
|
return
|
|
}
|
|
defer os.RemoveAll(tmpBase)
|
|
|
|
docs, err := indexRepo(gc, *repo, tmpBase)
|
|
if err != nil {
|
|
log.Printf("ERROR indexing %s: %v", repoName, err)
|
|
return
|
|
}
|
|
|
|
if err := mc.IndexDocuments(docs); err != nil {
|
|
log.Printf("ERROR pushing %s: %v", repoName, err)
|
|
return
|
|
}
|
|
log.Printf("Webhook: re-indexed %s (%d files)", repoName, len(docs))
|
|
}()
|
|
})
|
|
|
|
mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
|
|
w.WriteHeader(http.StatusOK)
|
|
fmt.Fprint(w, "ok")
|
|
})
|
|
|
|
addr := envOr("LISTEN_ADDR", ":8080")
|
|
log.Printf("Webhook server listening on %s", addr)
|
|
if err := http.ListenAndServe(addr, mux); err != nil {
|
|
log.Fatalf("server error: %v", err)
|
|
}
|
|
}
|
|
|
|
// cmdSearch runs a CLI search for testing.
|
|
func cmdSearch(args []string) {
|
|
_, mc := newClients()
|
|
|
|
query := args[0]
|
|
var repo, filetype string
|
|
var limit int64 = 10
|
|
|
|
for _, arg := range args[1:] {
|
|
switch {
|
|
case strings.HasPrefix(arg, "--repo="):
|
|
repo = strings.TrimPrefix(arg, "--repo=")
|
|
case strings.HasPrefix(arg, "--type="):
|
|
filetype = strings.TrimPrefix(arg, "--type=")
|
|
case strings.HasPrefix(arg, "--limit="):
|
|
fmt.Sscanf(strings.TrimPrefix(arg, "--limit="), "%d", &limit)
|
|
}
|
|
}
|
|
|
|
results, err := mc.Search(query, repo, filetype, limit)
|
|
if err != nil {
|
|
log.Fatalf("search error: %v", err)
|
|
}
|
|
|
|
if len(results) == 0 {
|
|
fmt.Printf("No results for %q\n", query)
|
|
return
|
|
}
|
|
|
|
for i, r := range results {
|
|
fmt.Printf("%d. %s — %s\n", i+1, r.Repo, r.Path)
|
|
if r.Snippet != "" {
|
|
fmt.Printf(" %s\n", r.Snippet)
|
|
}
|
|
fmt.Println()
|
|
}
|
|
}
|
|
|
|
// indexRepo clones a single repo and extracts indexable documents.
|
|
func indexRepo(gc *gitea.Client, repo gitea.Repo, tmpBase string) ([]meili.Document, error) {
|
|
cloneURL := gc.AuthenticatedCloneURL(repo)
|
|
repoDir := filepath.Join(tmpBase, strings.ReplaceAll(repo.FullName, "/", "_"))
|
|
|
|
cmd := exec.Command("git", "clone", "--depth", "1", "--single-branch",
|
|
"--branch", repo.DefaultBranch, cloneURL, repoDir)
|
|
cmd.Stdout = io.Discard
|
|
cmd.Stderr = io.Discard
|
|
|
|
if err := cmd.Run(); err != nil {
|
|
return nil, fmt.Errorf("cloning %s: %w", repo.FullName, err)
|
|
}
|
|
|
|
var docs []meili.Document
|
|
now := time.Now().Unix()
|
|
|
|
err := filepath.Walk(repoDir, func(path string, info os.FileInfo, err error) error {
|
|
if err != nil {
|
|
return nil // skip errors
|
|
}
|
|
|
|
// Skip directories
|
|
if info.IsDir() {
|
|
if skipDirs[info.Name()] {
|
|
return filepath.SkipDir
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Skip by extension
|
|
ext := strings.ToLower(filepath.Ext(info.Name()))
|
|
if skipExtensions[ext] {
|
|
return nil
|
|
}
|
|
// Check compound extensions like .min.js
|
|
base := strings.ToLower(info.Name())
|
|
if strings.HasSuffix(base, ".min.js") || strings.HasSuffix(base, ".min.css") {
|
|
return nil
|
|
}
|
|
|
|
// Skip large files
|
|
if info.Size() > maxFileSize {
|
|
return nil
|
|
}
|
|
// Skip empty files
|
|
if info.Size() == 0 {
|
|
return nil
|
|
}
|
|
|
|
// Read file
|
|
content, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil // skip unreadable
|
|
}
|
|
|
|
// Skip binary files (check for null bytes in first 512 bytes)
|
|
checkLen := 512
|
|
if len(content) < checkLen {
|
|
checkLen = len(content)
|
|
}
|
|
for _, b := range content[:checkLen] {
|
|
if b == 0 {
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// Skip non-UTF8
|
|
if !utf8.Valid(content) {
|
|
return nil
|
|
}
|
|
|
|
relPath, _ := filepath.Rel(repoDir, path)
|
|
if ext != "" {
|
|
ext = ext[1:] // strip leading dot
|
|
}
|
|
|
|
docs = append(docs, meili.Document{
|
|
ID: meili.DocumentID(repo.FullName, repo.DefaultBranch, relPath),
|
|
Repo: repo.FullName,
|
|
Branch: repo.DefaultBranch,
|
|
Path: relPath,
|
|
Filename: info.Name(),
|
|
Extension: ext,
|
|
Content: string(content),
|
|
Language: langFromExt(ext),
|
|
UpdatedAt: now,
|
|
})
|
|
|
|
return nil
|
|
})
|
|
|
|
// Clean up clone
|
|
os.RemoveAll(repoDir)
|
|
|
|
return docs, err
|
|
}
|
|
|
|
// validateSignature reports whether signature is the hex-encoded HMAC-SHA256
// of body keyed with secret.
//
// The signature is decoded and compared as raw MAC bytes, so upper- and
// lower-case hex are both accepted. A signature that is empty, is not valid
// hex, or has the wrong length is rejected. The comparison is constant-time.
func validateSignature(body []byte, signature, secret string) bool {
	got, err := hex.DecodeString(signature)
	if err != nil {
		return false
	}

	mac := hmac.New(sha256.New, []byte(secret))
	mac.Write(body) // hash.Hash's Write never returns an error

	return hmac.Equal(got, mac.Sum(nil))
}
|
|
|
|
// languageByExt maps a file extension (no leading dot, exact case) to the
// language label stored on indexed documents.
var languageByExt = map[string]string{
	"go":   "go",
	"py":   "python",
	"js":   "javascript",
	"jsx":  "javascript",
	"ts":   "typescript",
	"tsx":  "typescript",
	"sh":   "shell",
	"bash": "shell",
	"yaml": "yaml",
	"yml":  "yaml",
	"json": "json",
	"md":   "markdown",
	"html": "html",
	"htm":  "html",
	"css":  "css",
	"sql":  "sql",
	"rs":   "rust",
	"rb":   "ruby",
	"conf": "config",
	"cfg":  "config",
	"ini":  "config",
	"toml": "config",
}

// langFromExt returns the language label for ext, which must be given
// without its leading dot. Unknown extensions, including the empty string,
// yield "". The match is case-sensitive.
func langFromExt(ext string) string {
	// A missing key yields the zero value "", which is the fallback.
	return languageByExt[ext]
}
|
|
|
|
// envOr returns the value of the environment variable key, or fallback when
// the variable is unset or set to the empty string.
func envOr(key, fallback string) string {
	value, present := os.LookupEnv(key)
	if !present || value == "" {
		return fallback
	}
	return value
}
|