From 9eafa5c06a8dd3a3360e51335c24914760f9766a Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 18 Mar 2026 22:50:21 +0100 Subject: [PATCH 1/4] fix: make exclude patterns recursive and share via single constant The indexer's exclude patterns were non-recursive (e.g. `coverage/**`), only matching at the project root. Nested occurrences in monorepo packages and worktrees passed through, polluting the index with generated artifacts and worktree copies. - Extract EXCLUDED_DIRECTORY_NAMES and EXCLUDED_GLOB_PATTERNS into src/constants/codebase-context.ts as the single source of truth - Indexer, file-watcher, and project-discovery all import from there - Add missing directories: .cache, .claude, .planning, worktrees, target, vendor, .nx, .turbo, .next, build - Add integration test reproducing the consumer audit failure case (nested coverage/, .claude/worktrees/, worktrees/, dist/) --- src/constants/codebase-context.ts | 33 ++++++++++ src/core/file-watcher.ts | 14 +--- src/core/indexer.ts | 10 +-- src/utils/project-discovery.ts | 22 ++----- tests/indexer-exclude-patterns.test.ts | 89 ++++++++++++++++++++++++++ 5 files changed, 132 insertions(+), 36 deletions(-) create mode 100644 tests/indexer-exclude-patterns.test.ts diff --git a/src/constants/codebase-context.ts b/src/constants/codebase-context.ts index 62e97b2..90f91e3 100644 --- a/src/constants/codebase-context.ts +++ b/src/constants/codebase-context.ts @@ -25,3 +25,36 @@ export const INDEXING_STATS_FILENAME = 'indexing-stats.json' as const; export const VECTOR_DB_DIRNAME = 'index' as const; export const MANIFEST_FILENAME = 'manifest.json' as const; export const RELATIONSHIPS_FILENAME = 'relationships.json' as const; + +/** + * Directories excluded from indexing, file-watching, and project discovery. + * Single source of truth — all three consumers import from here. 
+ */ +export const EXCLUDED_DIRECTORY_NAMES = [ + '.cache', + '.claude', + '.codebase-context', + '.git', + '.next', + '.nx', + '.planning', + '.turbo', + 'build', + 'coverage', + 'dist', + 'node_modules', + 'target', + 'vendor', + 'worktrees' +] as const; + +/** Glob patterns that match excluded directories at any nesting depth. */ +export const EXCLUDED_GLOB_PATTERNS: string[] = EXCLUDED_DIRECTORY_NAMES.map( + (dir) => `**/${dir}/**` +); + +/** + * Additional directories skipped only during project discovery (not generated + * code, just not useful roots to recurse into). + */ +export const DISCOVERY_ONLY_IGNORED = ['.hg', '.nuxt', '.svn', '.venv', '.yarn', 'out', 'tmp'] as const; diff --git a/src/core/file-watcher.ts b/src/core/file-watcher.ts index 58efb81..155ac6e 100644 --- a/src/core/file-watcher.ts +++ b/src/core/file-watcher.ts @@ -1,5 +1,6 @@ import chokidar from 'chokidar'; import path from 'path'; +import { EXCLUDED_GLOB_PATTERNS } from '../constants/codebase-context.js'; import { getSupportedExtensions } from '../utils/language-detection.js'; export interface FileWatcherOptions { @@ -43,18 +44,7 @@ export function startFileWatcher(opts: FileWatcherOptions): () => void { }; const watcher = chokidar.watch(rootPath, { - ignored: [ - '**/node_modules/**', - '**/.codebase-context/**', - '**/.git/**', - '**/dist/**', - '**/.nx/**', - '**/.planning/**', - '**/coverage/**', - '**/.turbo/**', - '**/.next/**', - '**/.cache/**' - ], + ignored: [...EXCLUDED_GLOB_PATTERNS], persistent: true, ignoreInitial: true, awaitWriteFinish: { stabilityThreshold: 200, pollInterval: 100 } diff --git a/src/core/indexer.ts b/src/core/indexer.ts index afef69b..d0e06d4 100644 --- a/src/core/indexer.ts +++ b/src/core/indexer.ts @@ -39,6 +39,7 @@ import { mergeSmallChunks } from '../utils/chunking.js'; import { getFileCommitDates } from '../utils/git-dates.js'; import { CODEBASE_CONTEXT_DIRNAME, + EXCLUDED_GLOB_PATTERNS, INDEX_FORMAT_VERSION, INDEXING_STATS_FILENAME, 
INDEX_META_FILENAME, @@ -274,14 +275,7 @@ export class CodebaseIndexer { '**/*.{sql,graphql,gql}', '**/*.{json,jsonc,yaml,yml,toml,xml}' ], - exclude: [ - 'node_modules/**', - 'dist/**', - 'build/**', - '.git/**', - 'coverage/**', - '.codebase-context/**' - ], + exclude: [...EXCLUDED_GLOB_PATTERNS], respectGitignore: true, parsing: { maxFileSize: 1048576, diff --git a/src/utils/project-discovery.ts b/src/utils/project-discovery.ts index 64e1156..7499dd2 100644 --- a/src/utils/project-discovery.ts +++ b/src/utils/project-discovery.ts @@ -1,6 +1,10 @@ import { promises as fs } from 'fs'; import type { Dirent } from 'fs'; import path from 'path'; +import { + EXCLUDED_DIRECTORY_NAMES, + DISCOVERY_ONLY_IGNORED +} from '../constants/codebase-context.js'; export type ProjectEvidence = | 'existing_index' @@ -20,22 +24,8 @@ export interface DiscoverProjectsOptions { const DEFAULT_MAX_DEPTH = 4; const IGNORED_DIRECTORY_NAMES = new Set([ - '.git', - '.hg', - '.svn', - '.next', - '.nuxt', - '.turbo', - '.venv', - '.yarn', - 'build', - 'coverage', - 'dist', - 'node_modules', - 'out', - 'target', - 'tmp', - 'vendor' + ...EXCLUDED_DIRECTORY_NAMES, + ...DISCOVERY_ONLY_IGNORED ]); const STRONG_DIRECTORY_MARKERS = new Set(['.codebase-context', '.git']); diff --git a/tests/indexer-exclude-patterns.test.ts b/tests/indexer-exclude-patterns.test.ts new file mode 100644 index 0000000..e83d167 --- /dev/null +++ b/tests/indexer-exclude-patterns.test.ts @@ -0,0 +1,89 @@ +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { promises as fs } from 'fs'; +import os from 'os'; +import path from 'path'; +import { CodebaseIndexer } from '../src/core/indexer.js'; +import { analyzerRegistry } from '../src/core/analyzer-registry.js'; +import { GenericAnalyzer } from '../src/analyzers/generic/index.js'; +import { + CODEBASE_CONTEXT_DIRNAME, + KEYWORD_INDEX_FILENAME +} from '../src/constants/codebase-context.js'; + +describe('Indexer exclude patterns — nested directories', () 
=> { + let tempDir: string; + + beforeEach(async () => { + analyzerRegistry.register(new GenericAnalyzer()); + tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'indexer-exclude-patterns-')); + }); + + afterEach(async () => { + await fs.rm(tempDir, { recursive: true, force: true }); + }); + + it('excludes nested coverage, worktrees, .claude, and dist directories', async () => { + // Legitimate source file + await fs.mkdir(path.join(tempDir, 'src'), { recursive: true }); + await fs.writeFile( + path.join(tempDir, 'src', 'app.ts'), + 'export function main() { return "hello"; }\n' + ); + + // Polluters — nested paths that should be excluded + const polluters = [ + ['packages', 'ui', 'coverage', 'prettify.js'], + ['.claude', 'worktrees', 'branch', 'src', 'app.ts'], + ['worktrees', 'portal30-pr', 'src', 'real.ts'], + ['apps', 'web', 'dist', 'bundle.js'] + ]; + + for (const segments of polluters) { + const dir = path.join(tempDir, ...segments.slice(0, -1)); + await fs.mkdir(dir, { recursive: true }); + await fs.writeFile( + path.join(tempDir, ...segments), + 'export const polluter = true;\n' + ); + } + + const indexer = new CodebaseIndexer({ + rootPath: tempDir, + config: { + skipEmbedding: true, + parsing: { + maxFileSize: 1048576, + chunkSize: 50, + chunkOverlap: 0, + parseTests: true, + parseNodeModules: false + } + } + }); + + await indexer.index(); + + const indexPath = path.join(tempDir, CODEBASE_CONTEXT_DIRNAME, KEYWORD_INDEX_FILENAME); + const indexRaw = JSON.parse(await fs.readFile(indexPath, 'utf-8')) as Record; + const chunks = ( + Array.isArray(indexRaw) + ? indexRaw + : Array.isArray(indexRaw?.chunks) + ? 
indexRaw.chunks + : [] + ) as Array<{ filePath: string }>; + const indexedPaths = chunks.map((chunk) => chunk.filePath); + + // The legitimate file must be indexed + expect(indexedPaths.some((p) => p.includes('src/app.ts') || p.includes('src\\app.ts'))).toBe( + true + ); + + // None of the polluter paths should appear + const polluterMarkers = ['coverage', '.claude', 'worktrees', 'dist']; + for (const marker of polluterMarkers) { + const leaked = indexedPaths.filter((p) => p.includes(marker)); + expect(leaked, `paths containing "${marker}" should not be indexed`).toEqual([]); + } + }); +}); From 31fdda5df3551b74ee273b8f88eaa75cf35883df Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 18 Mar 2026 22:50:40 +0100 Subject: [PATCH 2/4] style: format with prettier --- src/constants/codebase-context.ts | 10 +++++++++- src/utils/project-discovery.ts | 10 ++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/constants/codebase-context.ts b/src/constants/codebase-context.ts index 90f91e3..ec23cd8 100644 --- a/src/constants/codebase-context.ts +++ b/src/constants/codebase-context.ts @@ -57,4 +57,12 @@ export const EXCLUDED_GLOB_PATTERNS: string[] = EXCLUDED_DIRECTORY_NAMES.map( * Additional directories skipped only during project discovery (not generated * code, just not useful roots to recurse into). 
*/ -export const DISCOVERY_ONLY_IGNORED = ['.hg', '.nuxt', '.svn', '.venv', '.yarn', 'out', 'tmp'] as const; +export const DISCOVERY_ONLY_IGNORED = [ + '.hg', + '.nuxt', + '.svn', + '.venv', + '.yarn', + 'out', + 'tmp' +] as const; diff --git a/src/utils/project-discovery.ts b/src/utils/project-discovery.ts index 7499dd2..3d6b3b3 100644 --- a/src/utils/project-discovery.ts +++ b/src/utils/project-discovery.ts @@ -1,10 +1,7 @@ import { promises as fs } from 'fs'; import type { Dirent } from 'fs'; import path from 'path'; -import { - EXCLUDED_DIRECTORY_NAMES, - DISCOVERY_ONLY_IGNORED -} from '../constants/codebase-context.js'; +import { EXCLUDED_DIRECTORY_NAMES, DISCOVERY_ONLY_IGNORED } from '../constants/codebase-context.js'; export type ProjectEvidence = | 'existing_index' @@ -23,10 +20,7 @@ export interface DiscoverProjectsOptions { const DEFAULT_MAX_DEPTH = 4; -const IGNORED_DIRECTORY_NAMES = new Set([ - ...EXCLUDED_DIRECTORY_NAMES, - ...DISCOVERY_ONLY_IGNORED -]); +const IGNORED_DIRECTORY_NAMES = new Set([...EXCLUDED_DIRECTORY_NAMES, ...DISCOVERY_ONLY_IGNORED]); const STRONG_DIRECTORY_MARKERS = new Set(['.codebase-context', '.git']); const WORKSPACE_MARKERS = new Set(['lerna.json', 'nx.json', 'pnpm-workspace.yaml', 'turbo.json']); From cafdb9539377bc9e706e7b97839c93f3469db473 Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 18 Mar 2026 23:00:14 +0100 Subject: [PATCH 3/4] fix: address PR review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Make EXCLUDED_GLOB_PATTERNS readonly to prevent accidental mutation by consumers (spreads at call sites are now intentional, creating mutable copies for APIs that require string[]) - Throw on unrecognized index format in test instead of silently defaulting to empty array (prevents polluter assertions from passing vacuously) - Move analyzerRegistry.register into test body — only one test, no need for beforeEach ceremony --- src/constants/codebase-context.ts | 2 +- 
tests/indexer-exclude-patterns.test.ts | 36 +++++++++++++------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/constants/codebase-context.ts b/src/constants/codebase-context.ts index ec23cd8..23592ce 100644 --- a/src/constants/codebase-context.ts +++ b/src/constants/codebase-context.ts @@ -49,7 +49,7 @@ export const EXCLUDED_DIRECTORY_NAMES = [ ] as const; /** Glob patterns that match excluded directories at any nesting depth. */ -export const EXCLUDED_GLOB_PATTERNS: string[] = EXCLUDED_DIRECTORY_NAMES.map( +export const EXCLUDED_GLOB_PATTERNS: readonly string[] = EXCLUDED_DIRECTORY_NAMES.map( (dir) => `**/${dir}/**` ); diff --git a/tests/indexer-exclude-patterns.test.ts b/tests/indexer-exclude-patterns.test.ts index e83d167..a147b71 100644 --- a/tests/indexer-exclude-patterns.test.ts +++ b/tests/indexer-exclude-patterns.test.ts @@ -1,4 +1,4 @@ -import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { afterEach, describe, expect, it } from 'vitest'; import { promises as fs } from 'fs'; import os from 'os'; import path from 'path'; @@ -13,16 +13,14 @@ import { describe('Indexer exclude patterns — nested directories', () => { let tempDir: string; - beforeEach(async () => { - analyzerRegistry.register(new GenericAnalyzer()); - tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'indexer-exclude-patterns-')); - }); - afterEach(async () => { - await fs.rm(tempDir, { recursive: true, force: true }); + if (tempDir) await fs.rm(tempDir, { recursive: true, force: true }); }); it('excludes nested coverage, worktrees, .claude, and dist directories', async () => { + analyzerRegistry.register(new GenericAnalyzer()); + tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'indexer-exclude-patterns-')); + // Legitimate source file await fs.mkdir(path.join(tempDir, 'src'), { recursive: true }); await fs.writeFile( @@ -41,10 +39,7 @@ describe('Indexer exclude patterns — nested directories', () => { for (const segments of polluters) { 
const dir = path.join(tempDir, ...segments.slice(0, -1)); await fs.mkdir(dir, { recursive: true }); - await fs.writeFile( - path.join(tempDir, ...segments), - 'export const polluter = true;\n' - ); + await fs.writeFile(path.join(tempDir, ...segments), 'export const polluter = true;\n'); } const indexer = new CodebaseIndexer({ @@ -65,13 +60,18 @@ describe('Indexer exclude patterns — nested directories', () => { const indexPath = path.join(tempDir, CODEBASE_CONTEXT_DIRNAME, KEYWORD_INDEX_FILENAME); const indexRaw = JSON.parse(await fs.readFile(indexPath, 'utf-8')) as Record; - const chunks = ( - Array.isArray(indexRaw) - ? indexRaw - : Array.isArray(indexRaw?.chunks) - ? indexRaw.chunks - : [] - ) as Array<{ filePath: string }>; + + let chunks: Array<{ filePath: string }>; + if (Array.isArray(indexRaw)) { + chunks = indexRaw; + } else if (Array.isArray(indexRaw?.chunks)) { + chunks = indexRaw.chunks as Array<{ filePath: string }>; + } else { + throw new Error( + `Unexpected index format: keys=${JSON.stringify(Object.keys(indexRaw ?? {}))}` + ); + } + const indexedPaths = chunks.map((chunk) => chunk.filePath); // The legitimate file must be indexed From 83ba23e5fc107f58aceacc598c89c93bd41e01a1 Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 18 Mar 2026 23:05:29 +0100 Subject: [PATCH 4/4] fix: widen IGNORED_DIRECTORY_NAMES to Set for TS compatibility Set.has() requires the argument to match the set's type parameter. Spreading as-const tuples into a Set infers a narrow literal union, which rejects entry.name (plain string) at the call site on line 178. 
--- src/utils/project-discovery.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/utils/project-discovery.ts b/src/utils/project-discovery.ts index 3d6b3b3..0b4bc8a 100644 --- a/src/utils/project-discovery.ts +++ b/src/utils/project-discovery.ts @@ -20,7 +20,10 @@ export interface DiscoverProjectsOptions { const DEFAULT_MAX_DEPTH = 4; -const IGNORED_DIRECTORY_NAMES = new Set([...EXCLUDED_DIRECTORY_NAMES, ...DISCOVERY_ONLY_IGNORED]); +const IGNORED_DIRECTORY_NAMES: Set<string> = new Set([ + ...EXCLUDED_DIRECTORY_NAMES, + ...DISCOVERY_ONLY_IGNORED +]); const STRONG_DIRECTORY_MARKERS = new Set(['.codebase-context', '.git']); const WORKSPACE_MARKERS = new Set(['lerna.json', 'nx.json', 'pnpm-workspace.yaml', 'turbo.json']);