Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions src/constants/codebase-context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,44 @@ export const INDEXING_STATS_FILENAME = 'indexing-stats.json' as const;
export const VECTOR_DB_DIRNAME = 'index' as const;
export const MANIFEST_FILENAME = 'manifest.json' as const;
export const RELATIONSHIPS_FILENAME = 'relationships.json' as const;

/**
* Directories excluded from indexing, file-watching, and project discovery.
* Single source of truth — all three consumers import from here.
*/
export const EXCLUDED_DIRECTORY_NAMES = [
'.cache',
'.claude',
'.codebase-context',
'.git',
'.next',
'.nx',
'.planning',
'.turbo',
'build',

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Do not drop nested build scripts from indexing

Adding 'build' to the shared recursive exclude list means mergeConfig() now passes **/build/** to scanFiles() (src/core/indexer.ts:278,1033-1039) and startFileWatcher() ignores the same paths (src/core/file-watcher.ts:46-50). In monorepos that keep checked-in tooling under packages/*/build/ or similar, those source files will silently disappear from search and auto-refresh even though the old build/** rule only skipped the repository root’s build output.

Useful? React with 👍 / 👎.

'coverage',
'dist',
'node_modules',
'target',
'vendor',
'worktrees'
] as const;

/** Glob patterns that match excluded directories at any nesting depth. */
export const EXCLUDED_GLOB_PATTERNS: readonly string[] = EXCLUDED_DIRECTORY_NAMES.map(
(dir) => `**/${dir}/**`
);

/**
* Additional directories skipped only during project discovery (not generated
* code, just not useful roots to recurse into).
*/
export const DISCOVERY_ONLY_IGNORED = [
'.hg',
'.nuxt',
'.svn',
'.venv',
'.yarn',
'out',
'tmp'
] as const;
14 changes: 2 additions & 12 deletions src/core/file-watcher.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import chokidar from 'chokidar';
import path from 'path';
import { EXCLUDED_GLOB_PATTERNS } from '../constants/codebase-context.js';
import { getSupportedExtensions } from '../utils/language-detection.js';

export interface FileWatcherOptions {
Expand Down Expand Up @@ -43,18 +44,7 @@ export function startFileWatcher(opts: FileWatcherOptions): () => void {
};

const watcher = chokidar.watch(rootPath, {
ignored: [
'**/node_modules/**',
'**/.codebase-context/**',
'**/.git/**',
'**/dist/**',
'**/.nx/**',
'**/.planning/**',
'**/coverage/**',
'**/.turbo/**',
'**/.next/**',
'**/.cache/**'
],
ignored: [...EXCLUDED_GLOB_PATTERNS],
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Unnecessary spread of an already-mutable array

EXCLUDED_GLOB_PATTERNS is already typed as string[], so spreading it into a new array literal is redundant — it just allocates a copy.

Suggested change
ignored: [...EXCLUDED_GLOB_PATTERNS],
ignored: EXCLUDED_GLOB_PATTERNS,

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

persistent: true,
ignoreInitial: true,
awaitWriteFinish: { stabilityThreshold: 200, pollInterval: 100 }
Expand Down
10 changes: 2 additions & 8 deletions src/core/indexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import { mergeSmallChunks } from '../utils/chunking.js';
import { getFileCommitDates } from '../utils/git-dates.js';
import {
CODEBASE_CONTEXT_DIRNAME,
EXCLUDED_GLOB_PATTERNS,
INDEX_FORMAT_VERSION,
INDEXING_STATS_FILENAME,
INDEX_META_FILENAME,
Expand Down Expand Up @@ -274,14 +275,7 @@ export class CodebaseIndexer {
'**/*.{sql,graphql,gql}',
'**/*.{json,jsonc,yaml,yml,toml,xml}'
],
exclude: [
'node_modules/**',
'dist/**',
'build/**',
'.git/**',
'coverage/**',
'.codebase-context/**'
],
exclude: [...EXCLUDED_GLOB_PATTERNS],
respectGitignore: true,
parsing: {
maxFileSize: 1048576,
Expand Down
21 changes: 4 additions & 17 deletions src/utils/project-discovery.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { promises as fs } from 'fs';
import type { Dirent } from 'fs';
import path from 'path';
import { EXCLUDED_DIRECTORY_NAMES, DISCOVERY_ONLY_IGNORED } from '../constants/codebase-context.js';

export type ProjectEvidence =
| 'existing_index'
Expand All @@ -19,23 +20,9 @@ export interface DiscoverProjectsOptions {

const DEFAULT_MAX_DEPTH = 4;

const IGNORED_DIRECTORY_NAMES = new Set([
'.git',
'.hg',
'.svn',
'.next',
'.nuxt',
'.turbo',
'.venv',
'.yarn',
'build',
'coverage',
'dist',
'node_modules',
'out',
'target',
'tmp',
'vendor'
const IGNORED_DIRECTORY_NAMES: Set<string> = new Set([
...EXCLUDED_DIRECTORY_NAMES,
...DISCOVERY_ONLY_IGNORED
]);

const STRONG_DIRECTORY_MARKERS = new Set(['.codebase-context', '.git']);
Expand Down
89 changes: 89 additions & 0 deletions tests/indexer-exclude-patterns.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import { afterEach, describe, expect, it } from 'vitest';
import { promises as fs } from 'fs';
import os from 'os';
import path from 'path';
import { CodebaseIndexer } from '../src/core/indexer.js';
import { analyzerRegistry } from '../src/core/analyzer-registry.js';
import { GenericAnalyzer } from '../src/analyzers/generic/index.js';
import {
CODEBASE_CONTEXT_DIRNAME,
KEYWORD_INDEX_FILENAME
} from '../src/constants/codebase-context.js';

describe('Indexer exclude patterns — nested directories', () => {
let tempDir: string;

afterEach(async () => {
if (tempDir) await fs.rm(tempDir, { recursive: true, force: true });
});

it('excludes nested coverage, worktrees, .claude, and dist directories', async () => {
analyzerRegistry.register(new GenericAnalyzer());
tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'indexer-exclude-patterns-'));

// Legitimate source file
await fs.mkdir(path.join(tempDir, 'src'), { recursive: true });
await fs.writeFile(
path.join(tempDir, 'src', 'app.ts'),
'export function main() { return "hello"; }\n'
);

// Polluters — nested paths that should be excluded
const polluters = [
['packages', 'ui', 'coverage', 'prettify.js'],
['.claude', 'worktrees', 'branch', 'src', 'app.ts'],
['worktrees', 'portal30-pr', 'src', 'real.ts'],
['apps', 'web', 'dist', 'bundle.js']
];

for (const segments of polluters) {
const dir = path.join(tempDir, ...segments.slice(0, -1));
await fs.mkdir(dir, { recursive: true });
await fs.writeFile(path.join(tempDir, ...segments), 'export const polluter = true;\n');
}

const indexer = new CodebaseIndexer({
rootPath: tempDir,
config: {
skipEmbedding: true,
parsing: {
maxFileSize: 1048576,
chunkSize: 50,
chunkOverlap: 0,
parseTests: true,
parseNodeModules: false
}
}
});

await indexer.index();

const indexPath = path.join(tempDir, CODEBASE_CONTEXT_DIRNAME, KEYWORD_INDEX_FILENAME);
const indexRaw = JSON.parse(await fs.readFile(indexPath, 'utf-8')) as Record<string, unknown>;

let chunks: Array<{ filePath: string }>;
if (Array.isArray(indexRaw)) {
chunks = indexRaw;
} else if (Array.isArray(indexRaw?.chunks)) {
chunks = indexRaw.chunks as Array<{ filePath: string }>;
} else {
throw new Error(
`Unexpected index format: keys=${JSON.stringify(Object.keys(indexRaw ?? {}))}`
);
}

const indexedPaths = chunks.map((chunk) => chunk.filePath);

// The legitimate file must be indexed
expect(indexedPaths.some((p) => p.includes('src/app.ts') || p.includes('src\\app.ts'))).toBe(
true
);

// None of the polluter paths should appear
const polluterMarkers = ['coverage', '.claude', 'worktrees', 'dist'];
for (const marker of polluterMarkers) {
const leaked = indexedPaths.filter((p) => p.includes(marker));
expect(leaked, `paths containing "${marker}" should not be indexed`).toEqual([]);
}
});
});
Loading