Centralize the excluded-directory lists.

EXCLUDED_DIRECTORY_NAMES / EXCLUDED_GLOB_PATTERNS / DISCOVERY_ONLY_IGNORED are
defined once in src/constants/codebase-context.ts and consumed by the file
watcher, the indexer's default config, and project discovery. The indexer's
default excludes change from root-anchored patterns ('dist/**') to
any-depth patterns ('**/dist/**'); a regression test covers nested
coverage/, dist/, worktrees/ and .claude/ directories.

NOTE(review): leading whitespace on hunk lines was reconstructed from a
whitespace-collapsed copy of this patch; verify against the working tree or
apply with `git apply --ignore-whitespace`.

diff --git a/src/constants/codebase-context.ts b/src/constants/codebase-context.ts
index 62e97b2..23592ce 100644
--- a/src/constants/codebase-context.ts
+++ b/src/constants/codebase-context.ts
@@ -25,3 +25,44 @@ export const INDEXING_STATS_FILENAME = 'indexing-stats.json' as const;
 export const VECTOR_DB_DIRNAME = 'index' as const;
 export const MANIFEST_FILENAME = 'manifest.json' as const;
 export const RELATIONSHIPS_FILENAME = 'relationships.json' as const;
+
+/**
+ * Directories excluded from indexing, file-watching, and project discovery.
+ * Single source of truth — all three consumers import from here.
+ */
+export const EXCLUDED_DIRECTORY_NAMES = [
+  '.cache',
+  '.claude',
+  '.codebase-context',
+  '.git',
+  '.next',
+  '.nx',
+  '.planning',
+  '.turbo',
+  'build',
+  'coverage',
+  'dist',
+  'node_modules',
+  'target',
+  'vendor',
+  'worktrees'
+] as const;
+
+/** Glob patterns that match excluded directories at any nesting depth. */
+export const EXCLUDED_GLOB_PATTERNS: readonly string[] = EXCLUDED_DIRECTORY_NAMES.map(
+  (dir) => `**/${dir}/**`
+);
+
+/**
+ * Additional directories skipped only during project discovery (not generated
+ * code, just not useful roots to recurse into).
+ */
+export const DISCOVERY_ONLY_IGNORED = [
+  '.hg',
+  '.nuxt',
+  '.svn',
+  '.venv',
+  '.yarn',
+  'out',
+  'tmp'
+] as const;
diff --git a/src/core/file-watcher.ts b/src/core/file-watcher.ts
index 58efb81..155ac6e 100644
--- a/src/core/file-watcher.ts
+++ b/src/core/file-watcher.ts
@@ -1,5 +1,6 @@
 import chokidar from 'chokidar';
 import path from 'path';
+import { EXCLUDED_GLOB_PATTERNS } from '../constants/codebase-context.js';
 import { getSupportedExtensions } from '../utils/language-detection.js';
 
 export interface FileWatcherOptions {
@@ -43,18 +44,7 @@ export function startFileWatcher(opts: FileWatcherOptions): () => void {
   };
 
   const watcher = chokidar.watch(rootPath, {
-    ignored: [
-      '**/node_modules/**',
-      '**/.codebase-context/**',
-      '**/.git/**',
-      '**/dist/**',
-      '**/.nx/**',
-      '**/.planning/**',
-      '**/coverage/**',
-      '**/.turbo/**',
-      '**/.next/**',
-      '**/.cache/**'
-    ],
+    ignored: [...EXCLUDED_GLOB_PATTERNS],
     persistent: true,
     ignoreInitial: true,
     awaitWriteFinish: { stabilityThreshold: 200, pollInterval: 100 }
diff --git a/src/core/indexer.ts b/src/core/indexer.ts
index afef69b..d0e06d4 100644
--- a/src/core/indexer.ts
+++ b/src/core/indexer.ts
@@ -39,6 +39,7 @@ import { mergeSmallChunks } from '../utils/chunking.js';
 import { getFileCommitDates } from '../utils/git-dates.js';
 import {
   CODEBASE_CONTEXT_DIRNAME,
+  EXCLUDED_GLOB_PATTERNS,
   INDEX_FORMAT_VERSION,
   INDEXING_STATS_FILENAME,
   INDEX_META_FILENAME,
@@ -274,14 +275,7 @@ export class CodebaseIndexer {
         '**/*.{sql,graphql,gql}',
         '**/*.{json,jsonc,yaml,yml,toml,xml}'
       ],
-      exclude: [
-        'node_modules/**',
-        'dist/**',
-        'build/**',
-        '.git/**',
-        'coverage/**',
-        '.codebase-context/**'
-      ],
+      exclude: [...EXCLUDED_GLOB_PATTERNS],
       respectGitignore: true,
       parsing: {
         maxFileSize: 1048576,
diff --git a/src/utils/project-discovery.ts b/src/utils/project-discovery.ts
index 64e1156..0b4bc8a 100644
--- a/src/utils/project-discovery.ts
+++ b/src/utils/project-discovery.ts
@@ -1,6 +1,7 @@
 import { promises as fs } from 'fs';
 import type { Dirent } from 'fs';
 import path from 'path';
+import { EXCLUDED_DIRECTORY_NAMES, DISCOVERY_ONLY_IGNORED } from '../constants/codebase-context.js';
 
 export type ProjectEvidence =
   | 'existing_index'
@@ -19,23 +20,9 @@ export interface DiscoverProjectsOptions {
 
 const DEFAULT_MAX_DEPTH = 4;
 
-const IGNORED_DIRECTORY_NAMES = new Set([
-  '.git',
-  '.hg',
-  '.svn',
-  '.next',
-  '.nuxt',
-  '.turbo',
-  '.venv',
-  '.yarn',
-  'build',
-  'coverage',
-  'dist',
-  'node_modules',
-  'out',
-  'target',
-  'tmp',
-  'vendor'
+const IGNORED_DIRECTORY_NAMES: Set<string> = new Set([
+  ...EXCLUDED_DIRECTORY_NAMES,
+  ...DISCOVERY_ONLY_IGNORED
 ]);
 
 const STRONG_DIRECTORY_MARKERS = new Set(['.codebase-context', '.git']);
diff --git a/tests/indexer-exclude-patterns.test.ts b/tests/indexer-exclude-patterns.test.ts
new file mode 100644
index 0000000..a147b71
--- /dev/null
+++ b/tests/indexer-exclude-patterns.test.ts
@@ -0,0 +1,89 @@
+import { afterEach, describe, expect, it } from 'vitest';
+import { promises as fs } from 'fs';
+import os from 'os';
+import path from 'path';
+import { CodebaseIndexer } from '../src/core/indexer.js';
+import { analyzerRegistry } from '../src/core/analyzer-registry.js';
+import { GenericAnalyzer } from '../src/analyzers/generic/index.js';
+import {
+  CODEBASE_CONTEXT_DIRNAME,
+  KEYWORD_INDEX_FILENAME
+} from '../src/constants/codebase-context.js';
+
+describe('Indexer exclude patterns — nested directories', () => {
+  let tempDir: string;
+
+  afterEach(async () => {
+    if (tempDir) await fs.rm(tempDir, { recursive: true, force: true });
+  });
+
+  it('excludes nested coverage, worktrees, .claude, and dist directories', async () => {
+    analyzerRegistry.register(new GenericAnalyzer());
+    tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'indexer-exclude-patterns-'));
+
+    // Legitimate source file
+    await fs.mkdir(path.join(tempDir, 'src'), { recursive: true });
+    await fs.writeFile(
+      path.join(tempDir, 'src', 'app.ts'),
+      'export function main() { return "hello"; }\n'
+    );
+
+    // Polluters — nested paths that should be excluded
+    const polluters = [
+      ['packages', 'ui', 'coverage', 'prettify.js'],
+      ['.claude', 'worktrees', 'branch', 'src', 'app.ts'],
+      ['worktrees', 'portal30-pr', 'src', 'real.ts'],
+      ['apps', 'web', 'dist', 'bundle.js']
+    ];
+
+    for (const segments of polluters) {
+      const dir = path.join(tempDir, ...segments.slice(0, -1));
+      await fs.mkdir(dir, { recursive: true });
+      await fs.writeFile(path.join(tempDir, ...segments), 'export const polluter = true;\n');
+    }
+
+    const indexer = new CodebaseIndexer({
+      rootPath: tempDir,
+      config: {
+        skipEmbedding: true,
+        parsing: {
+          maxFileSize: 1048576,
+          chunkSize: 50,
+          chunkOverlap: 0,
+          parseTests: true,
+          parseNodeModules: false
+        }
+      }
+    });
+
+    await indexer.index();
+
+    const indexPath = path.join(tempDir, CODEBASE_CONTEXT_DIRNAME, KEYWORD_INDEX_FILENAME);
+    const indexRaw = JSON.parse(await fs.readFile(indexPath, 'utf-8')) as Record<string, unknown>;
+
+    let chunks: Array<{ filePath: string }>;
+    if (Array.isArray(indexRaw)) {
+      chunks = indexRaw;
+    } else if (Array.isArray(indexRaw?.chunks)) {
+      chunks = indexRaw.chunks as Array<{ filePath: string }>;
+    } else {
+      throw new Error(
+        `Unexpected index format: keys=${JSON.stringify(Object.keys(indexRaw ?? {}))}`
+      );
+    }
+
+    const indexedPaths = chunks.map((chunk) => chunk.filePath);
+
+    // The legitimate file must be indexed
+    expect(indexedPaths.some((p) => p.includes('src/app.ts') || p.includes('src\\app.ts'))).toBe(
+      true
+    );
+
+    // No indexed path may have an excluded directory as one of its path segments
+    const polluterMarkers = ['coverage', '.claude', 'worktrees', 'dist'];
+    for (const marker of polluterMarkers) {
+      const leaked = indexedPaths.filter((p) => p.split(/[\\/]/).includes(marker));
+      expect(leaked, `paths containing "${marker}" should not be indexed`).toEqual([]);
+    }
+  });
+});