diff --git a/.gitignore b/.gitignore
index fafe1d4..c5b11fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
-# Binaries
-indexer
-mcp-server
+# Binaries (root level only, not cmd/ source dirs)
+/indexer
+/mcp-server
 *.exe
 
 # Go
diff --git a/cmd/indexer/main.go b/cmd/indexer/main.go
new file mode 100644
index 0000000..0efde83
--- /dev/null
+++ b/cmd/indexer/main.go
@@ -0,0 +1,466 @@
+// Command indexer clones Gitea repositories and pushes their text files into
+// a MeiliSearch index. It is driven by a sub-command given as the first
+// argument: full, repo, webhook or search.
+package main
+
+import (
+	"crypto/hmac"
+	"crypto/sha256"
+	"encoding/hex"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"time"
+	"unicode/utf8"
+
+	"gitea.rspworks.tech/rpert/gitea-search/internal/gitea"
+	"gitea.rspworks.tech/rpert/gitea-search/internal/meili"
+)
+
+// version is the release string of the indexer binary.
+const version = "1.0.0"
+
+// skipDirs names the directories that are never descended into while walking
+// a cloned repository. Matching is on the bare directory name.
+var skipDirs = map[string]bool{
+	".git":         true,
+	"node_modules": true,
+	"vendor":       true,
+	"__pycache__":  true,
+	".venv":        true,
+	".tox":         true,
+	"dist":         true,
+	"build":        true,
+}
+
+// skipExtensions lists file extensions (lower-case, leading dot included)
+// whose files are not indexed. It is looked up with the result of
+// filepath.Ext, which only ever yields the final dot-suffix; the compound
+// ".min.js" / ".min.css" entries can therefore never match through that
+// lookup and are handled by an explicit suffix check in indexRepo.
+var skipExtensions = map[string]bool{
+	".png": true, ".jpg": true, ".jpeg": true, ".gif": true, ".svg": true,
+	".ico": true, ".webp": true, ".bmp": true, ".tiff": true,
+	".zip": true, ".tar": true, ".gz": true, ".bz2": true, ".xz": true,
+	".rar": true, ".7z": true,
+	".woff": true, ".woff2": true, ".ttf": true, ".eot": true,
+	".exe": true, ".dll": true, ".so": true, ".dylib": true,
+	".pdf": true, ".doc": true, ".docx": true,
+	".mp3": true, ".mp4": true, ".wav": true, ".flac": true,
+	".lock":   true,
+	".min.js": true, ".min.css": true,
+}
+
+// maxFileSize is the largest file, in bytes, that will be indexed.
+const maxFileSize = 50 * 1024 // 50KB
+
+// main dispatches to the sub-command named by os.Args[1] and exits with
+// status 1 when no command, an unknown command, or too few arguments are
+// given.
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Fprintf(os.Stderr, "Usage: indexer [args]\n")
+		fmt.Fprintf(os.Stderr, "Commands:\n")
+		fmt.Fprintf(os.Stderr, " full Full re-index of all repos\n")
+		fmt.Fprintf(os.Stderr, " repo Re-index a single repo\n")
+		fmt.Fprintf(os.Stderr, " webhook Start webhook HTTP server\n")
+		fmt.Fprintf(os.Stderr, " search CLI search for testing\n")
+		os.Exit(1)
+	}
+
+	switch os.Args[1] {
+	case "full":
+		cmdFull()
+	case "repo":
+		if len(os.Args) < 3 {
+			log.Fatal("Usage: indexer repo ")
+		}
+		cmdRepo(os.Args[2])
+	case "webhook":
+		cmdWebhook()
+	case "search":
+		if len(os.Args) < 3 {
+			log.Fatal("Usage: indexer search [--repo=X] [--type=X] [--limit=N]")
+		}
+		cmdSearch(os.Args[2:])
+	default:
+		log.Fatalf("Unknown command: %s", os.Args[1])
+	}
+}
+
+// newClients builds the Gitea and MeiliSearch clients from the environment.
+//
+// GITEA_TOKEN is mandatory. GITEA_URL, MEILI_URL and INDEX_NAME fall back to
+// built-in defaults, and MEILI_KEY may be empty. The process exits if the
+// token is missing or the MeiliSearch client cannot be created.
+func newClients() (*gitea.Client, *meili.Client) {
+	giteaURL := envOr("GITEA_URL", "https://gitea.rspworks.tech")
+	giteaToken := os.Getenv("GITEA_TOKEN")
+	if giteaToken == "" {
+		log.Fatal("GITEA_TOKEN is required")
+	}
+
+	meiliURL := envOr("MEILI_URL", "http://localhost:7700")
+	meiliKey := os.Getenv("MEILI_KEY")
+	indexName := envOr("INDEX_NAME", "gitea-code")
+
+	gc := gitea.NewClient(giteaURL, giteaToken)
+
+	mc, err := meili.NewClient(meiliURL, meiliKey, indexName)
+	if err != nil {
+		log.Fatalf("connecting to MeiliSearch: %v", err)
+	}
+
+	return gc, mc
+}
+
+// cmdFull clones all repos and indexes everything.
+//
+// The index is cleared up front, so it is empty or partial while the run is
+// in progress. A repo that fails to clone or push is logged and skipped; the
+// run carries on with the next one.
+func cmdFull() {
+	gc, mc := newClients()
+
+	log.Println("Fetching repo list from Gitea...")
+	repos, err := gc.ListAllRepos()
+	if err != nil {
+		log.Fatalf("listing repos: %v", err)
+	}
+	log.Printf("Found %d repos", len(repos))
+
+	// Create the scratch directory before touching the index, so a failure
+	// here cannot leave the index wiped with nothing to refill it.
+	tmpBase, err := os.MkdirTemp("", "gitea-indexer-*")
+	if err != nil {
+		log.Fatalf("creating temp dir: %v", err)
+	}
+
+	// Clear existing index for full reindex
+	log.Println("Clearing existing index...")
+	if err := mc.DeleteAll(); err != nil {
+		// log.Fatalf exits without running defers, so clean up by hand.
+		os.RemoveAll(tmpBase)
+		log.Fatalf("clearing index: %v", err)
+	}
+	// No log.Fatal below this point, so the deferred cleanup always runs.
+	defer os.RemoveAll(tmpBase)
+
+	var totalDocs int
+	for _, repo := range repos {
+		docs, err := indexRepo(gc, repo, tmpBase)
+		if err != nil {
+			log.Printf("ERROR indexing %s: %v", repo.FullName, err)
+			continue
+		}
+		if len(docs) == 0 {
+			log.Printf(" %s: no indexable files", repo.FullName)
+			continue
+		}
+
+		if err := mc.IndexDocuments(docs); err != nil {
+			log.Printf("ERROR pushing %s to MeiliSearch: %v", repo.FullName, err)
+			continue
+		}
+		totalDocs += len(docs)
+		log.Printf(" %s: indexed %d files", repo.FullName, len(docs))
+	}
+
+	log.Printf("Done. Total: %d documents from %d repos", totalDocs, len(repos))
+}
+
+// cmdRepo re-indexes a single repo.
+//
+// The work happens in reindexRepo so that its deferred temp-dir cleanup has
+// already run by the time a failure is turned into log.Fatal here.
+func cmdRepo(fullName string) {
+	gc, mc := newClients()
+
+	count, err := reindexRepo(gc, mc, fullName)
+	if err != nil {
+		log.Fatal(err)
+	}
+	if count == 0 {
+		log.Printf("%s: no indexable files", fullName)
+		return
+	}
+	log.Printf("Indexed %d files from %s", count, fullName)
+}
+
+// reindexRepo replaces the indexed documents of one repo and returns how many
+// documents were pushed.
+//
+// The repo is cloned and scanned before its old documents are deleted, so a
+// failed clone leaves the existing index entries untouched. The temporary
+// clone directory is removed on every return path.
+func reindexRepo(gc *gitea.Client, mc *meili.Client, fullName string) (int, error) {
+	repo, err := gc.GetRepo(fullName)
+	if err != nil {
+		return 0, fmt.Errorf("fetching repo %s: %w", fullName, err)
+	}
+
+	tmpBase, err := os.MkdirTemp("", "gitea-indexer-*")
+	if err != nil {
+		return 0, fmt.Errorf("creating temp dir: %w", err)
+	}
+	defer os.RemoveAll(tmpBase)
+
+	docs, err := indexRepo(gc, *repo, tmpBase)
+	if err != nil {
+		return 0, fmt.Errorf("indexing %s: %w", fullName, err)
+	}
+
+	// Delete existing docs for this repo, now that the replacement set is
+	// known to be available.
+	log.Printf("Deleting existing documents for %s...", fullName)
+	if err := mc.DeleteByRepo(fullName); err != nil {
+		return 0, fmt.Errorf("deleting documents: %w", err)
+	}
+
+	if len(docs) == 0 {
+		return 0, nil
+	}
+
+	if err := mc.IndexDocuments(docs); err != nil {
+		return 0, fmt.Errorf("pushing to MeiliSearch: %w", err)
+	}
+	return len(docs), nil
+}
+
+// cmdWebhook starts an HTTP server for Gitea push webhooks.
+//
+// POST /webhook re-indexes the repository named in the payload's
+// repository.full_name field; GET /healthz answers "ok". The listen address
+// comes from LISTEN_ADDR (default ":8080"). When WEBHOOK_SECRET is set, every
+// request must carry a matching X-Gitea-Signature header; when it is empty
+// requests are accepted unauthenticated and a warning is logged at start-up.
+//
+// The handler replies 202 immediately and does the re-index in a background
+// goroutine, so indexing failures are only visible in the log.
+func cmdWebhook() {
+	gc, mc := newClients()
+	webhookSecret := os.Getenv("WEBHOOK_SECRET")
+	if webhookSecret == "" {
+		log.Print("WARNING: WEBHOOK_SECRET is not set; webhook requests are not authenticated")
+	}
+
+	// Largest request body the handler will read. Anything bigger is cut
+	// off by http.MaxBytesReader and rejected as unreadable.
+	const maxBodyBytes = 10 << 20 // 10 MiB
+
+	// reindex rebuilds the documents of one repo. The clone and scan run
+	// before the old documents are deleted, so a failed clone leaves the
+	// existing index entries in place.
+	reindex := func(repoName string) {
+		repo, err := gc.GetRepo(repoName)
+		if err != nil {
+			log.Printf("ERROR fetching %s: %v", repoName, err)
+			return
+		}
+
+		tmpBase, err := os.MkdirTemp("", "gitea-indexer-*")
+		if err != nil {
+			log.Printf("ERROR creating temp dir: %v", err)
+			return
+		}
+		defer os.RemoveAll(tmpBase)
+
+		docs, err := indexRepo(gc, *repo, tmpBase)
+		if err != nil {
+			log.Printf("ERROR indexing %s: %v", repoName, err)
+			return
+		}
+
+		if err := mc.DeleteByRepo(repoName); err != nil {
+			log.Printf("ERROR deleting docs for %s: %v", repoName, err)
+			return
+		}
+
+		// Nothing to push for a repo without indexable files; cmdRepo
+		// skips the call in the same situation.
+		if len(docs) > 0 {
+			if err := mc.IndexDocuments(docs); err != nil {
+				log.Printf("ERROR pushing %s: %v", repoName, err)
+				return
+			}
+		}
+		log.Printf("Webhook: re-indexed %s (%d files)", repoName, len(docs))
+	}
+
+	mux := http.NewServeMux()
+
+	mux.HandleFunc("/webhook", func(w http.ResponseWriter, r *http.Request) {
+		if r.Method != http.MethodPost {
+			http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+			return
+		}
+
+		// Cap the body so a client cannot make the server buffer an
+		// arbitrarily large request in memory.
+		body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maxBodyBytes))
+		if err != nil {
+			http.Error(w, "error reading body", http.StatusBadRequest)
+			return
+		}
+
+		// Validate HMAC signature if secret is configured
+		if webhookSecret != "" {
+			sig := r.Header.Get("X-Gitea-Signature")
+			if !validateSignature(body, sig, webhookSecret) {
+				log.Printf("Invalid webhook signature")
+				http.Error(w, "invalid signature", http.StatusUnauthorized)
+				return
+			}
+		}
+
+		var payload struct {
+			Repository struct {
+				FullName string `json:"full_name"`
+			} `json:"repository"`
+		}
+		if err := json.Unmarshal(body, &payload); err != nil {
+			log.Printf("Error parsing webhook payload: %v", err)
+			http.Error(w, "invalid payload", http.StatusBadRequest)
+			return
+		}
+
+		repoName := payload.Repository.FullName
+		if repoName == "" {
+			http.Error(w, "missing repository name", http.StatusBadRequest)
+			return
+		}
+
+		log.Printf("Webhook: re-indexing %s", repoName)
+		w.WriteHeader(http.StatusAccepted)
+		fmt.Fprintf(w, "accepted: %s\n", repoName)
+
+		// Re-index in background
+		go reindex(repoName)
+	})
+
+	mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		fmt.Fprint(w, "ok")
+	})
+
+	addr := envOr("LISTEN_ADDR", ":8080")
+	// An explicit http.Server is used instead of http.ListenAndServe so that
+	// slow or stalled clients cannot hold connections open indefinitely.
+	srv := &http.Server{
+		Addr:              addr,
+		Handler:           mux,
+		ReadHeaderTimeout: 10 * time.Second,
+		ReadTimeout:       30 * time.Second,
+		WriteTimeout:      30 * time.Second,
+		IdleTimeout:       2 * time.Minute,
+	}
+
+	log.Printf("Webhook server listening on %s", addr)
+	if err := srv.ListenAndServe(); err != nil {
+		log.Fatalf("server error: %v", err)
+	}
+}
+
+// cmdSearch runs a CLI search for testing.
+//
+// args[0] is the query; the remaining arguments may be --repo=X, --type=X and
+// --limit=N (default 10). An unparsable or non-positive limit is fatal, and
+// unrecognized arguments are reported and ignored.
+func cmdSearch(args []string) {
+	if len(args) == 0 {
+		log.Fatal("Usage: indexer search [--repo=X] [--type=X] [--limit=N]")
+	}
+
+	_, mc := newClients()
+
+	query := args[0]
+	var repo, filetype string
+	var limit int64 = 10
+
+	for _, arg := range args[1:] {
+		switch {
+		case strings.HasPrefix(arg, "--repo="):
+			repo = strings.TrimPrefix(arg, "--repo=")
+		case strings.HasPrefix(arg, "--type="):
+			filetype = strings.TrimPrefix(arg, "--type=")
+		case strings.HasPrefix(arg, "--limit="):
+			raw := strings.TrimPrefix(arg, "--limit=")
+			if _, err := fmt.Sscanf(raw, "%d", &limit); err != nil || limit <= 0 {
+				log.Fatalf("invalid --limit value %q: must be a positive integer", raw)
+			}
+		default:
+			log.Printf("ignoring unrecognized argument %q", arg)
+		}
+	}
+
+	results, err := mc.Search(query, repo, filetype, limit)
+	if err != nil {
+		log.Fatalf("search error: %v", err)
+	}
+
+	if len(results) == 0 {
+		fmt.Printf("No results for %q\n", query)
+		return
+	}
+
+	for i, r := range results {
+		fmt.Printf("%d. %s — %s\n", i+1, r.Repo, r.Path)
+		if r.Snippet != "" {
+			fmt.Printf(" %s\n", r.Snippet)
+		}
+		fmt.Println()
+	}
+}
+
+// indexRepo clones a single repo and extracts indexable documents.
+//
+// The default branch is shallow-cloned into a sub-directory of tmpBase, and
+// that directory is removed again before returning, whether or not the clone
+// succeeded. A file becomes a document only if it is a regular file, is not
+// under a skipDirs directory, does not have a skipped extension, is between
+// 1 byte and maxFileSize, has no NUL byte in its first 512 bytes and is
+// valid UTF-8. Unreadable entries are skipped rather than failing the repo.
+//
+// On error the returned slice is nil.
+func indexRepo(gc *gitea.Client, repo gitea.Repo, tmpBase string) ([]meili.Document, error) {
+	cloneURL := gc.AuthenticatedCloneURL(repo)
+	repoDir := filepath.Join(tmpBase, strings.ReplaceAll(repo.FullName, "/", "_"))
+	// Clean up the clone on every return path, including a failed clone that
+	// left a partial directory behind.
+	defer os.RemoveAll(repoDir)
+
+	args := []string{"clone", "--depth", "1", "--single-branch"}
+	if repo.DefaultBranch != "" {
+		// With no default branch reported, let git pick the remote HEAD
+		// instead of passing an empty --branch value.
+		args = append(args, "--branch", repo.DefaultBranch)
+	}
+	// "--" ends option parsing so neither the URL nor the directory can be
+	// read as a git option.
+	args = append(args, "--", cloneURL, repoDir)
+
+	cmd := exec.Command("git", args...)
+	// NOTE(review): git's output is discarded, presumably because the clone
+	// URL carries credentials and would end up in the log — confirm.
+	cmd.Stdout = io.Discard
+	cmd.Stderr = io.Discard
+
+	if err := cmd.Run(); err != nil {
+		return nil, fmt.Errorf("cloning %s: %w", repo.FullName, err)
+	}
+
+	var docs []meili.Document
+	now := time.Now().Unix()
+
+	err := filepath.Walk(repoDir, func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return nil // skip errors
+		}
+
+		if info.IsDir() {
+			if skipDirs[info.Name()] {
+				return filepath.SkipDir
+			}
+			return nil
+		}
+
+		if !indexableFile(info) {
+			return nil
+		}
+
+		content, err := os.ReadFile(path)
+		if err != nil {
+			return nil // skip unreadable
+		}
+		if !indexableContent(content) {
+			return nil
+		}
+
+		relPath, err := filepath.Rel(repoDir, path)
+		if err != nil {
+			return nil // cannot derive a stable path/ID for this file
+		}
+
+		// Extension without the leading dot, lower-cased.
+		ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(info.Name())), ".")
+
+		docs = append(docs, meili.Document{
+			ID:        meili.DocumentID(repo.FullName, repo.DefaultBranch, relPath),
+			Repo:      repo.FullName,
+			Branch:    repo.DefaultBranch,
+			Path:      relPath,
+			Filename:  info.Name(),
+			Extension: ext,
+			Content:   string(content),
+			Language:  langFromExt(ext),
+			UpdatedAt: now,
+		})
+
+		return nil
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	return docs, nil
+}
+
+// indexableFile reports whether a walked, non-directory entry should be read.
+//
+// filepath.Walk describes entries with Lstat, so a symlink shows up here as a
+// non-regular file with the size of the link itself. Reading it would follow
+// the link, possibly to a file outside the clone and past the size limit, so
+// everything that is not a regular file is rejected.
+func indexableFile(info os.FileInfo) bool {
+	if !info.Mode().IsRegular() {
+		return false
+	}
+
+	name := strings.ToLower(info.Name())
+	if skipExtensions[filepath.Ext(name)] {
+		return false
+	}
+	// Compound extensions are invisible to filepath.Ext, check them by suffix.
+	if strings.HasSuffix(name, ".min.js") || strings.HasSuffix(name, ".min.css") {
+		return false
+	}
+
+	size := info.Size()
+	return size > 0 && size <= maxFileSize
+}
+
+// indexableContent reports whether content looks like text: no NUL byte in
+// the first 512 bytes and valid UTF-8 throughout.
+func indexableContent(content []byte) bool {
+	head := content
+	if len(head) > 512 {
+		head = head[:512]
+	}
+	for _, b := range head {
+		if b == 0 {
+			return false
+		}
+	}
+	return utf8.Valid(content)
+}
+
+// validateSignature reports whether signature is the hex-encoded HMAC-SHA256
+// of body keyed with secret.
+//
+// The signature is decoded first and the raw MACs are compared in constant
+// time, so upper- and lower-case hex are both accepted. An empty or
+// non-hex signature is rejected.
+func validateSignature(body []byte, signature, secret string) bool {
+	got, err := hex.DecodeString(signature)
+	if err != nil {
+		return false
+	}
+
+	mac := hmac.New(sha256.New, []byte(secret))
+	mac.Write(body) // hash.Hash.Write never returns an error
+	return hmac.Equal(mac.Sum(nil), got)
+}
+
+// langFromExt maps a lower-case file extension without its leading dot to a
+// language label, or "" when the extension is not recognized.
+func langFromExt(ext string) string {
+	switch ext {
+	case "go":
+		return "go"
+	case "py":
+		return "python"
+	case "js", "jsx":
+		return "javascript"
+	case "ts", "tsx":
+		return "typescript"
+	case "sh", "bash":
+		return "shell"
+	case "yaml", "yml":
+		return "yaml"
+	case "json":
+		return "json"
+	case "md":
+		return "markdown"
+	case "html", "htm":
+		return "html"
+	case "css":
+		return "css"
+	case "sql":
+		return "sql"
+	case "rs":
+		return "rust"
+	case "rb":
+		return "ruby"
+	case "conf", "cfg", "ini", "toml":
+		return "config"
+	default:
+		return ""
+	}
+}
+
+// envOr returns the value of the environment variable key, or fallback when
+// the variable is unset or empty.
+func envOr(key, fallback string) string {
+	if v := os.Getenv(key); v != "" {
+		return v
+	}
+	return fallback
+}
diff --git a/cmd/mcp-server/main.go b/cmd/mcp-server/main.go
new file mode 100644
index 0000000..544ee7a
--- /dev/null
+++ b/cmd/mcp-server/main.go
@@ -0,0 +1,34 @@
+package main
+
+import (
+	"log"
+	"os"
+
+	"gitea.rspworks.tech/rpert/gitea-search/internal/mcp"
+	"gitea.rspworks.tech/rpert/gitea-search/internal/meili"
+)
+
+const version = "1.0.0"
+
+func main() {
+	meiliURL := envOr("MEILI_URL", "http://localhost:7700")
+	meiliKey := os.Getenv("MEILI_KEY")
+	indexName := envOr("INDEX_NAME", "gitea-code")
+
+	client, err := meili.NewClient(meiliURL, meiliKey, indexName)
+	if err != nil {
+		log.Fatalf("connecting to MeiliSearch: %v", err)
+	}
+
+	server := mcp.NewServer(client, version)
+	if err := server.Run(); err != nil {
+		log.Fatalf("server error: %v", err)
+	}
+}
+
+func envOr(key, fallback string) string {
+	if v := os.Getenv(key); v != "" {
+		return v
+	}
+	return fallback
+}
diff 
--git a/internal/gitea/client.go b/internal/gitea/client.go index 7de8edb..88d23d7 100644 --- a/internal/gitea/client.go +++ b/internal/gitea/client.go @@ -45,14 +45,15 @@ func (c *Client) ListAllRepos() ([]Repo, error) { limit := 50 for { - url := fmt.Sprintf("%s/api/v1/repos/search?page=%d&limit=%d&token=%s", - c.baseURL, page, limit, c.token) + url := fmt.Sprintf("%s/api/v1/repos/search?page=%d&limit=%d", + c.baseURL, page, limit) req, err := http.NewRequest("GET", url, nil) if err != nil { return nil, fmt.Errorf("creating request: %w", err) } req.Header.Set("Accept", "application/json") + req.Header.Set("Authorization", "token "+c.token) resp, err := c.httpClient.Do(req) if err != nil { @@ -99,13 +100,14 @@ func (c *Client) ListAllRepos() ([]Repo, error) { // GetRepo returns a single repository by owner/name. func (c *Client) GetRepo(fullName string) (*Repo, error) { - url := fmt.Sprintf("%s/api/v1/repos/%s?token=%s", c.baseURL, fullName, c.token) + url := fmt.Sprintf("%s/api/v1/repos/%s", c.baseURL, fullName) req, err := http.NewRequest("GET", url, nil) if err != nil { return nil, fmt.Errorf("creating request: %w", err) } req.Header.Set("Accept", "application/json") + req.Header.Set("Authorization", "token "+c.token) resp, err := c.httpClient.Do(req) if err != nil { diff --git a/k8s/indexer-cronjob.yaml b/k8s/indexer-cronjob.yaml index 3265497..cc1a375 100644 --- a/k8s/indexer-cronjob.yaml +++ b/k8s/indexer-cronjob.yaml @@ -1,12 +1,7 @@ -apiVersion: v1 -kind: Secret -metadata: - name: indexer-secret - namespace: gitea-search -type: Opaque -stringData: - gitea-token: "CHANGE-ME" - webhook-secret: "CHANGE-ME" +# Secret 'indexer-secret' must be created separately with real values: +# kubectl -n gitea-search create secret generic indexer-secret \ +# --from-literal=gitea-token="" \ +# --from-literal=webhook-secret="$(openssl rand -hex 32)" --- apiVersion: batch/v1 kind: CronJob @@ -27,7 +22,8 @@ spec: restartPolicy: OnFailure containers: - name: indexer - image: 
gitea.rspworks.tech/rpert/gitea-search:latest + image: gitea.rspworks.tech/rpert/gitea-search:v1.0.1 + imagePullPolicy: IfNotPresent command: ["indexer", "full"] env: - name: GITEA_URL @@ -74,7 +70,8 @@ spec: spec: containers: - name: webhook - image: gitea.rspworks.tech/rpert/gitea-search:latest + image: gitea.rspworks.tech/rpert/gitea-search:v1.0.1 + imagePullPolicy: IfNotPresent command: ["indexer", "webhook"] ports: - containerPort: 8080