diff --git a/ts/packages/agentSdk/src/helpers/actionHelpers.ts b/ts/packages/agentSdk/src/helpers/actionHelpers.ts
index efc4c964e..e2fb7cdd7 100644
--- a/ts/packages/agentSdk/src/helpers/actionHelpers.ts
+++ b/ts/packages/agentSdk/src/helpers/actionHelpers.ts
@@ -76,6 +76,19 @@ export function createActionResultFromHtmlDisplayWithScript(
     };
 }
 
+export function createActionResultFromMarkdownDisplay(
+    literalText: string,
+    entities: Entity[] = [],
+    resultEntity?: Entity,
+): ActionResultSuccess {
+    return {
+        literalText,
+        entities,
+        resultEntity,
+        displayContent: { type: "markdown", content: literalText },
+    };
+}
+
 export function createActionResultFromError(error: string): ActionResultError {
     return {
         error,
diff --git a/ts/packages/agents/spelunker/design.md b/ts/packages/agents/spelunker/design.md
index 205c2b7df..5c7665e83 100644
--- a/ts/packages/agents/spelunker/design.md
+++ b/ts/packages/agents/spelunker/design.md
@@ -16,22 +16,36 @@ Questions about the focused code base are answered roughly as follows:
 
 1. Gather all relevant source files. (E.g. `**/*.{py,ts}`)
 2. Chunkify locally (using chunker.py or typescriptChunker.ts)
-3. Send batches of chunks, in parallel, to a cheap, fast LLM
+3. Send the chunks, in parallel batches, to a cheap, fast LLM
+   with a prompt asking it to summarize each chunk.
+
+(Note that steps 1-3 need to be done only for new or changed files.)
+
+4. Send the chunks, in parallel batches, to a cheap, fast LLM
    with a prompt asking it to find chunks relevant to the user question.
-4. Sort by relevance, keep top `N`. (E.g. `N = 30`)
-5. Send the selected chunks as context to a smart model (the "oracle")
+5. Sort the selected chunks by relevance, keep the top _N_.
+   (_N_ is dynamically computed to fit in the oracle prompt size limit.)
+6. Send the top _N_ selected chunks as context to a smart model (the "oracle")
    with the request to answer the user question using those chunks as context.
-6. Construct a result from the answer and the chunks used to come up with it.
+7. Construct a result from the answer and the chunks used to come up with it
+   ("references").
 
 ## How easy is it to target other languages?
 
 - Need a chunker for each language; the rest is the same.
-- Chunking TypeScript was, realistically, a week's work.
+- Chunking TypeScript was, realistically, a week's work, so not too terrible.
+
+## Latest changes
+
+So far, the summaries are only used to update so-called "breadcrumb" blobs
+(placeholders for sub-chunks) to make the placeholder text look better
+(a comment plus the full signature, rather than just e.g. `def foo ...`).
 
 ## TO DO
 
 - Prompt engineering (borrow from John Lam?)
 - Evaluation of selection process (does the model do a good enough job?)
-- Scaling. It takes 60-80 seconds to select from ~4000 chunks.
-- Do we need a "global index" (of summaries) like John Lam's ask.py?
+- Scaling. It takes 20-50 seconds (and about $5) to select from ~4000 chunks.
+  Summarizing that many chunks takes about as long.
+- Do we need to send a "global index" (of summaries) like John Lam's ask.py?
   How to make that scale?
diff --git a/ts/packages/agents/spelunker/scaling.md b/ts/packages/agents/spelunker/scaling.md
new file mode 100644
index 000000000..37b7a3b6f
--- /dev/null
+++ b/ts/packages/agents/spelunker/scaling.md
@@ -0,0 +1,56 @@
+# Scaling ideas
+
+These are very unformed thoughts.
+
+## Local indexing with fuzzy matching
+
+Directly after chunking, add embeddings for all chunks, just based on the code alone.
+(Yes I know that's pretty lame, but it's what we can do without summarizing all chunks.)
+
+Whenever a question is asked, _first_ search the embeddings for _k_ nearest neighbors,
+where _k_ is pretty large (maybe start with 1000).
+Then pass those chunks on to the usual AI-driven selection process.
+
+Do we still need summaries if we do this? How would they be used?
+(Possibly we could generate summaries for the query context on demand.)
+
+### Implementation planning
+
+- For now, skip the summarization phase.
+- Copy vectorTable.ts from _examples/memoryProviders_ (which IMO isn't a real package).
+- Maybe remove stuff we don't need, e.g. generics over `ValueType` and the other weird thing.
+- Keep using `interface typeagent.VectorStore` and put creation in one place.
+- Add another file defining an `async` function to get an embedding (probably needs a model).
+- After we've got `allChunks` filled (with all the chunks), batch compute and insert
+  embeddings for each chunk into the vector store.
+- When prepping for a question, instead of sending all chunks off for selection,
+  get the query's embedding and request a generous k nearest neighbors, and send _those_
+  off to the selection process. Let's start with _k_=1000, and then see if halving it
+  or doubling it makes much of a difference.
+- The rest is the same.
+
+### Again, with feeling
+
+- Copy `vectorTable` from _examples/memoryProviders_, change to pass in the Database object.
+  (We could import sqlite from memory-providers, but then the embeddings are in a different database.)
+- BETTER: `import { sqlite } from "memory-providers"` and add a createStorageFromDb method.
+- EVEN BETTER: Just extract the nearest-neighbors algorithm and do the rest myself. memory-providers is obsolete anyway.
+- Create an embedding model when we initialize `QueryContext` (and put it there).
+  (Look in old spelunker for example code.)
+- Create a table named `ChunkEmbeddings (chunkId TEXT PRIMARY KEY, embedding BLOB)` when creating the db.
+- Use `generateTextEmbeddings` or `generateEmbedding` from `typeagent` to get embedding(s).
+  Those are async and not free and might fail, but generally pretty reliable.
+  (There are retry versions too if we need them.)
+- IIUC these normalize, so we can use dot product instead of cosine similarity.
+- Skip the summarizing step. (Keep the code and the Summaries table, we may need them later.)
+- Manage embeddings as chunks are removed and added. Probably have to add something
+  to remove all embeddings that reference a chunk for a given file (like we do for blobs).
+- When processing a query, before the selection step, slim down the chunks using embeddings:
+    - Get the embedding for the user query
+    - Call `nearestNeighbors` on the `VectorTable`
+    - Only read the selected chunk IDs from the Chunks table.
+
+### TODO
+
+- When there are fewer than maxConcurrency batches, create more batches and distribute the chunks evenly.
+  (I have an algorithm in mind, this can go in `makeBatches`.)
diff --git a/ts/packages/agents/spelunker/src/batching.ts b/ts/packages/agents/spelunker/src/batching.ts
new file mode 100644
index 000000000..e47feab04
--- /dev/null
+++ b/ts/packages/agents/spelunker/src/batching.ts
@@ -0,0 +1,68 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
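+//
+// Batching helpers shared by the chunk-selection and embedding passes:
+// - makeBatches() packs chunks into consecutive batches, capped both by a
+//   character budget and by a maximum number of chunks per batch.
+// - keepBestChunks() walks chunk descriptions in descending relevance order
+//   and keeps as many chunks as fit within a character budget.
+// Sizes are rough estimates computed by getChunkSize().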
+ +import { Chunk } from "./chunkSchema.js"; +import { console_log } from "./logging.js"; +import { ChunkDescription } from "./selectorSchema.js"; + +export function makeBatches( + chunks: Chunk[], + batchSize: number, // In characters + maxChunks: number, // How many chunks at most per batch +): Chunk[][] { + const batches: Chunk[][] = []; + let batch: Chunk[] = []; + let size = 0; + function flush(): void { + batches.push(batch); + console_log( + ` [Batch ${batches.length} has ${batch.length} chunks and ${size} characters]`, + ); + batch = []; + size = 0; + } + for (const chunk of chunks) { + const chunkSize = getChunkSize(chunk); + if ( + size && + (size + chunkSize > batchSize || batch.length >= maxChunks) + ) { + flush(); + } + batch.push(chunk); + size += chunkSize; + } + if (size) { + flush(); + } + return batches; +} + +export function keepBestChunks( + chunkDescs: ChunkDescription[], // Sorted by descending relevance + allChunks: Chunk[], + batchSize: number, // In characters +): Chunk[] { + const chunks: Chunk[] = []; + let size = 0; + for (const chunkDesc of chunkDescs) { + const chunk = allChunks.find((c) => c.chunkId === chunkDesc.chunkId); + if (!chunk) continue; + const chunkSize = getChunkSize(chunk); + if (size + chunkSize > batchSize && chunks.length) { + break; + } + chunks.push(chunk); + size += chunkSize; + } + return chunks; +} + +function getChunkSize(chunk: Chunk): number { + // This is all an approximation + let size = chunk.fileName.length + 50; + for (const blob of chunk.blobs) { + size += blob.lines.join("").length + 4 * blob.lines.length; + } + return size; +} diff --git a/ts/packages/agents/spelunker/src/databaseUtils.ts b/ts/packages/agents/spelunker/src/databaseUtils.ts new file mode 100644 index 000000000..6da2df4c1 --- /dev/null +++ b/ts/packages/agents/spelunker/src/databaseUtils.ts @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import * as fs from "fs"; +import * as path from "path"; +import { createRequire } from "module"; + +import Database, * as sqlite from "better-sqlite3"; + +import { SpelunkerContext } from "./spelunkerActionHandler.js"; + +import { console_log } from "./logging.js"; + +const databaseSchema = ` +CREATE TABLE IF NOT EXISTS Files ( + fileName TEXT PRIMARY KEY, + mtime FLOAT NOT NULL, + size INTEGER NOT NULL +); +CREATE TABLE IF NOT EXISTS Chunks ( + chunkId TEXT PRIMARY KEY, + treeName TEXT NOT NULL, + codeName TEXT NOT NULL, + parentId TEXT KEY REFERENCES Chunks(chunkId), -- May be null + fileName TEXT KEY REFERENCES files(fileName) NOT NULL, + lineNo INTEGER NOT NULL -- 1-based +); +CREATE TABLE IF NOT EXISTS Blobs ( + chunkId TEXT KEY REFERENCES Chunks(chunkId) NOT NULL, + start INTEGER NOT NULL, -- 0-based + lines TEXT NOT NULL, + breadcrumb TEXT -- Chunk ID or empty string or NULL +); +CREATE TABLE IF NOT EXISTS Summaries ( + chunkId TEXT PRIMARY KEY REFERENCES Chunks(chunkId), + language TEXT, -- "python", "typescript", etc. 
+ summary TEXT, + signature TEXT +); +CREATE TABLE IF NOT EXISTS ChunkEmbeddings ( + chunkId TEXT PRIMARY KEY REFERENCES Chunks(chunkId), + embedding BLOB NOT NULL +); +`; + +function getDbOptions() { + if (process?.versions?.electron !== undefined) { + return undefined; + } + const r = createRequire(import.meta.url); + const betterSqlitePath = r.resolve("better-sqlite3/package.json"); + const nativeBinding = path.join( + betterSqlitePath, + "../build/Release/better_sqlite3.n.node", + ); + return { nativeBinding }; +} + +export function createDatabase(context: SpelunkerContext): void { + if (!context.queryContext) { + throw new Error( + "context.queryContext must be set before calling createDatabase", + ); + } + const loc = context.queryContext.databaseLocation; + if (context.queryContext.database) { + console_log(`[Using database at ${loc}]`); + return; + } + if (fs.existsSync(loc)) { + console_log(`[Opening database at ${loc}]`); + } else { + console_log(`[Creating database at ${loc}]`); + } + const db = new Database(loc, getDbOptions()); + // Write-Ahead Logging, improving concurrency and performance + db.pragma("journal_mode = WAL"); + // Fix permissions to be read/write only by the owner + fs.chmodSync(context.queryContext.databaseLocation, 0o600); + // Create all the tables we'll use + db.exec(databaseSchema); + context.queryContext.database = db; +} + +export function purgeFile(db: sqlite.Database, fileName: string): void { + const prepDeleteEmbeddings = db.prepare(` + DELETE FROM ChunkEmbeddings WHERE chunkId IN ( + SELECT chunkId + FROM chunks + WHERE filename = ? + ) + `); + const prepDeleteSummaries = db.prepare(` + DELETE FROM Summaries WHERE chunkId IN ( + SELECT chunkId + FROM chunks + WHERE fileName = ? + ) + `); + const prepDeleteBlobs = db.prepare(` + DELETE FROM Blobs WHERE chunkId IN ( + SELECT chunkId + FROM chunks + WHERE filename = ? + ) + `); + const prepDeleteChunks = db.prepare( + `DELETE FROM Chunks WHERE fileName = ?`, + ); + const prepDeleteFiles = db.prepare(`DELETE FROM files WHERE fileName = ?`); + + db.exec(`BEGIN TRANSACTION`); + prepDeleteSummaries.run(fileName); + prepDeleteBlobs.run(fileName); + prepDeleteEmbeddings.run(fileName); + prepDeleteChunks.run(fileName); + prepDeleteFiles.run(fileName); + db.exec(`COMMIT`); +} diff --git a/ts/packages/agents/spelunker/src/embeddings.ts b/ts/packages/agents/spelunker/src/embeddings.ts new file mode 100644 index 000000000..c8d914d5c --- /dev/null +++ b/ts/packages/agents/spelunker/src/embeddings.ts @@ -0,0 +1,198 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
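+//
+// Embedding support for the chunk index:
+// - makeEmbeddingModel() configures the text embedding model from the environment.
+// - loadEmbeddings() batch-computes normalized embeddings for all chunks and
+//   stores them in the ChunkEmbeddings table.
+// - preSelectChunks() embeds the user query and returns the IDs of the nearest
+//   chunks by dot product (embeddings are normalized, so this matches cosine
+//   similarity).
+// Batches are concurrency-limited and retried on 429 errors via retryOn429().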
+ +import { Statement } from "better-sqlite3"; +import { Result } from "typechat"; + +import { openai, TextEmbeddingModel } from "aiclient"; +import { createLimiter } from "common-utils"; +import { createNormalized, dotProduct } from "typeagent"; +import { NormalizedEmbedding } from "typeagent"; + +import { Chunk, ChunkId } from "./chunkSchema.js"; +import { console_log } from "./logging.js"; +import { retryOn429 } from "./retryLogic.js"; +import { makeBatches } from "./batching.js"; +import { SpelunkerContext } from "./spelunkerActionHandler.js"; +import path from "path"; + +export function makeEmbeddingModel(): TextEmbeddingModel { + const apiSettings = openai.apiSettingsFromEnv(openai.ModelType.Embedding); + apiSettings.maxRetryAttempts = 0; + const endpoint = process.env.AZURE_OPENAI_ENDPOINT_EMBEDDING_3_SMALL; + if (endpoint) { + apiSettings.endpoint = endpoint; + } + const embeddingModel = openai.createEmbeddingModel(apiSettings); + console_log(`[Max embedding batch size: ${embeddingModel.maxBatchSize}]`); + return embeddingModel; +} + +export async function loadEmbeddings( + context: SpelunkerContext, + chunks: Chunk[], +): Promise { + const model = context.queryContext!.embeddingModel; + if (!model.generateEmbeddingBatch) { + console_log(`[This embedding model does not support batch operations]`); // TODO: Fix this + return; + } + + console_log(`[Step 1c: Store chunk embeddings]`); + const generateEmbeddingBatch = model.generateEmbeddingBatch; + const db = context.queryContext!.database!; + const prepInsertEmbeddings = db.prepare( + `INSERT OR REPLACE INTO ChunkEmbeddings (chunkId, embedding) VALUES (?, ?)`, + ); + const maxCharacters = 100000; // TODO: tune + const batches = makeBatches(chunks, maxCharacters, model.maxBatchSize); + // const maxConcurrency = + // parseInt(process.env.AZURE_OPENAI_MAX_CONCURRENCY ?? "5") ?? 5; + const maxConcurrency = 2; // Seems we can do no better, given the low quota. + console_log( + ` [${batches.length} batches, maxConcurrency ${maxConcurrency}]`, + ); + const limiter = createLimiter(maxConcurrency); + const promises: Promise[] = []; + for (const batch of batches) { + const p = limiter(() => + generateAndInsertEmbeddings( + generateEmbeddingBatch, + prepInsertEmbeddings, + batch, + ), + ); + promises.push(p); + } + await Promise.all(promises); +} + +async function generateAndInsertEmbeddings( + generateEmbeddingBatch: (a: string[]) => Promise>, + prepInsertEmbeddings: Statement, + batch: Chunk[], +): Promise { + const t0 = new Date().getTime(); + const stringBatch = batch.map(blobText); + const embeddings = await retryOn429(() => + generateEmbeddingBatch(stringBatch), + ); + if (embeddings) { + for (let i = 0; i < embeddings.length; i++) { + const chunk = batch[i]; + const embedding: NormalizedEmbedding = createNormalized( + embeddings[i], + ); + prepInsertEmbeddings.run(chunk.chunkId, Buffer.from(embedding)); + } + const t1 = new Date().getTime(); + const dtms = t1 - t0; + const dtStr = + dtms < 1000 ? `${dtms}ms` : `${(dtms / 1000).toFixed(3)}s`; + console_log( + ` [Generated and inserted embedding batch of ${batch.length} in ${dtStr}]`, + ); + } else { + const t1 = new Date().getTime(); + const dtms = t1 - t0; + const dtStr = + dtms < 1000 ? 
`${dtms}ms` : `${(dtms / 1000).toFixed(3)}s`; + console_log(` [Failed to generate embedding batch in ${dtStr}]`); + } +} + +function blobText(chunk: Chunk): string { + const lines: string[] = []; + for (const blob of chunk.blobs) { + lines.push(...blob.lines); + } + // Keep only alphanumerical words; everything else is removed (hoping to reduce the cost) + const fileName = shortenedFilename(chunk.fileName); + const line = lines.join("").replace(/\W+/g, " ").trim().slice(0, 20000); // Assuming average 2.5 chars per token + return `${fileName}\n${line}\n}`; +} + +function shortenedFilename(fileName: string): string { + const prefix = process.env.HOME; + if (prefix && fileName.startsWith(prefix + path.sep)) { + return "~" + fileName.slice(prefix.length); + } else { + return fileName; + } +} + +export async function preSelectChunks( + context: SpelunkerContext, + input: string, + maxChunks = 1000, +): Promise { + const ta0 = new Date().getTime(); + const db = context.queryContext!.database!; + const prepAllEmbeddings = db.prepare( + `SELECT chunkId, embedding FROM ChunkEmbeddings`, + ); + const allEmbeddingRows: { + chunkId: ChunkId; + embedding: Buffer; + }[] = prepAllEmbeddings.all() as any[]; + const ta1 = new Date().getTime(); + console_log( + ` [Read ${allEmbeddingRows.length} embeddings in ${((ta1 - ta0) / 1000).toFixed(3)} seconds]`, + ); + if (allEmbeddingRows.length <= maxChunks) { + console_log(` [Returning all ${allEmbeddingRows.length} chunk IDs]`); + return allEmbeddingRows.map((row) => row.chunkId); + } + + const tb0 = new Date().getTime(); + const queryEmbedding = await getEmbedding(context, input); + const tb1 = new Date().getTime(); + const tail = !queryEmbedding ? " (failure)" : ""; + console_log( + ` [Embedding input of ${input.length} characters took ${((tb1 - tb0) / 1000).toFixed(3)} seconds${tail}]`, + ); + if (!queryEmbedding) { + return []; + } + + const embeddings = allEmbeddingRows.map( + (row) => new Float32Array(Buffer.from(row.embedding)), + ); + const tc0 = new Date().getTime(); + const similarities: { chunkId: ChunkId; score: number }[] = []; + for (let i = 0; i < embeddings.length; i++) { + const chunkId = allEmbeddingRows[i].chunkId; + const score = dotProduct(embeddings[i], queryEmbedding); + similarities.push({ chunkId, score }); + } + similarities.sort((a, b) => b.score - a.score); + similarities.splice(maxChunks); + const chunkIds = similarities.map((s) => s.chunkId); + const tc1 = new Date().getTime(); + console_log( + ` [Found ${chunkIds.length} nearest neighbors in ${((tc1 - tc0) / 1000).toFixed(3)} seconds]`, + ); + return chunkIds; +} + +async function getEmbedding( + context: SpelunkerContext, + query: string, +): Promise { + const model = context.queryContext!.embeddingModel!; + const generateEmbeddingBatch = model.generateEmbeddingBatch; + if (!generateEmbeddingBatch) { + console_log(`[This embedding model does not support batch operations]`); // TODO: Fix this + return undefined; + } + + const rawEmbeddings: number[][] | undefined = await retryOn429(() => + generateEmbeddingBatch([query]), + ); + const rawEmbedding = rawEmbeddings?.[0]; + if (!rawEmbedding) { + console_log(`[Failed to generate embedding]`); + return undefined; + } + return rawEmbedding ? 
createNormalized(rawEmbedding) : undefined; +} diff --git a/ts/packages/agents/spelunker/src/logging.ts b/ts/packages/agents/spelunker/src/logging.ts new file mode 100644 index 000000000..5d1d80d58 --- /dev/null +++ b/ts/packages/agents/spelunker/src/logging.ts @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +let epoch: number = 0; + +export function resetEpoch(): void { + epoch = 0; +} + +export function console_log(...rest: any[]): void { + if (!epoch) { + epoch = Date.now(); + console.log(""); // Start new epoch with a blank line + } + const t = Date.now(); + console.log(((t - epoch) / 1000).toFixed(3).padStart(6), ...rest); +} diff --git a/ts/packages/agents/spelunker/src/queryContext.ts b/ts/packages/agents/spelunker/src/queryContext.ts new file mode 100644 index 000000000..686ba5502 --- /dev/null +++ b/ts/packages/agents/spelunker/src/queryContext.ts @@ -0,0 +1,110 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import * as fs from "fs"; +import * as path from "path"; + +import * as sqlite from "better-sqlite3"; + +import { createJsonTranslator, TypeChatJsonTranslator } from "typechat"; +import { createTypeScriptJsonValidator } from "typechat/ts"; + +import { ChatModel, openai, TextEmbeddingModel } from "aiclient"; +import { loadSchema } from "typeagent"; + +import { makeEmbeddingModel } from "./embeddings.js"; +import { console_log } from "./logging.js"; +import { OracleSpecs } from "./oracleSchema.js"; +import { SelectorSpecs } from "./selectorSchema.js"; +import { SummarizerSpecs } from "./summarizerSchema.js"; + +export interface QueryContext { + chatModel: ChatModel; + miniModel: ChatModel; + embeddingModel: TextEmbeddingModel; + oracle: TypeChatJsonTranslator; + chunkSelector: TypeChatJsonTranslator; + chunkSummarizer: TypeChatJsonTranslator; + databaseLocation: string; + database: sqlite.Database | undefined; +} + +function captureTokenStats(req: any, response: any): void { + const inputTokens = response.usage.prompt_tokens; + const outputTokens = response.usage.completion_tokens; + const cost = inputTokens * 0.000005 + outputTokens * 0.000015; + console_log( + ` [Tokens used: prompt=${inputTokens}, ` + + `completion=${outputTokens}, ` + + `cost=\$${cost.toFixed(2)}]`, + ); +} + +export function createQueryContext(): QueryContext { + const chatModel = openai.createChatModelDefault("spelunkerChat"); + chatModel.completionCallback = captureTokenStats; + chatModel.retryMaxAttempts = 0; + + const miniModel = openai.createChatModel( + undefined, // "GPT_4_O_MINI" is slower than default model?! + undefined, + undefined, + ["spelunkerMini"], + ); + miniModel.completionCallback = captureTokenStats; + miniModel.retryMaxAttempts = 0; + + const embeddingModel = makeEmbeddingModel(); + + const oracle = createTranslator( + chatModel, + "oracleSchema.ts", + "OracleSpecs", + ); + const chunkSelector = createTranslator( + miniModel, + "selectorSchema.ts", + "SelectorSpecs", + ); + const chunkSummarizer = createTranslator( + miniModel, + "summarizerSchema.ts", + "SummarizerSpecs", + ); + + const databaseFolder = path.join( + process.env.HOME ?? 
"", + ".typeagent", + "agents", + "spelunker", + ); + const mkdirOptions: fs.MakeDirectoryOptions = { + recursive: true, + mode: 0o700, + }; + fs.mkdirSync(databaseFolder, mkdirOptions); + + const databaseLocation = path.join(databaseFolder, "codeSearchDatabase.db"); + const database = undefined; + return { + chatModel, + miniModel, + embeddingModel, + oracle, + chunkSelector, + chunkSummarizer, + databaseLocation, + database, + }; +} + +function createTranslator( + model: ChatModel, + schemaFile: string, + typeName: string, +): TypeChatJsonTranslator { + const schema = loadSchema([schemaFile], import.meta.url); + const validator = createTypeScriptJsonValidator(schema, typeName); + const translator = createJsonTranslator(model, validator); + return translator; +} diff --git a/ts/packages/agents/spelunker/src/retryLogic.ts b/ts/packages/agents/spelunker/src/retryLogic.ts new file mode 100644 index 000000000..63ebc4bed --- /dev/null +++ b/ts/packages/agents/spelunker/src/retryLogic.ts @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { Result } from "typechat"; + +import { console_log } from "./logging.js"; + +export async function retryOn429( + translate: () => Promise>, + retries: number = 3, + defaultDelay: number = 5000, +): Promise { + let wrappedResult: Result; + do { + retries--; + wrappedResult = await translate(); + // console_log(wrappedResult); + if (!wrappedResult.success) { + if ( + retries > 0 && + wrappedResult.message.includes("fetch error: 429:") + ) { + let delay = defaultDelay; + const embeddingTime = wrappedResult.message.match( + /Try again in (\d+) seconds/, + ); + const azureTime = wrappedResult.message.match( + /after (\d+) milliseconds/, + ); + const openaiTime = wrappedResult.message.match( + /Please try again in (\d+\.\d*|\.\d+|\d+m)s./, + ); + if (embeddingTime || azureTime || openaiTime) { + if (embeddingTime) { + delay = parseInt(embeddingTime[1]) * 1000; + } else if (azureTime) { + delay = parseInt(azureTime[1]); + } else if (openaiTime) { + delay = parseFloat(openaiTime[1]); + if (!openaiTime[1].endsWith("m")) { + delay *= 1000; + } + } + } else { + console_log( + ` [Couldn't find msec in '${wrappedResult.message}'`, + ); + } + console_log(` [Retry on 429 error: sleep ${delay} ms]`); + await new Promise((resolve) => setTimeout(resolve, delay)); + continue; + } + console_log(` [Giving up: ${wrappedResult.message}]`); + return undefined; + } + } while (!wrappedResult.success); + return wrappedResult.data; +} diff --git a/ts/packages/agents/spelunker/src/searchCode.ts b/ts/packages/agents/spelunker/src/searchCode.ts index 022ba844f..bb476ffee 100644 --- a/ts/packages/agents/spelunker/src/searchCode.ts +++ b/ts/packages/agents/spelunker/src/searchCode.ts @@ -3,125 +3,36 @@ import * as fs from "fs"; import * as path from "path"; -import { createRequire } from "module"; -import Database, * as sqlite from "better-sqlite3"; +import * as sqlite from "better-sqlite3"; +import { Result, TypeChatJsonTranslator } from "typechat"; -import { createJsonTranslator, Result, TypeChatJsonTranslator } from "typechat"; -import { createTypeScriptJsonValidator } from "typechat/ts"; - -import { ChatModel, openai } from "aiclient"; import { createLimiter } from "common-utils"; - +import { ActionResult, Entity } from "@typeagent/agent-sdk"; import { - ActionResult, - ActionResultSuccess, - Entity, -} from "@typeagent/agent-sdk"; -import { createActionResultFromError } from "@typeagent/agent-sdk/helpers/action"; -import { loadSchema } 
from "typeagent"; - -import { - Blob, - Chunk, - ChunkedFile, - ChunkerErrorItem, - ChunkId, -} from "./chunkSchema.js"; -import { OracleSpecs } from "./oracleSchema.js"; + createActionResultFromMarkdownDisplay, + createActionResultFromError, +} from "@typeagent/agent-sdk/helpers/action"; + +import { keepBestChunks, makeBatches } from "./batching.js"; +import { Blob, Chunk, ChunkedFile, ChunkerErrorItem } from "./chunkSchema.js"; +import { createDatabase, purgeFile } from "./databaseUtils.js"; +import { loadEmbeddings, preSelectChunks } from "./embeddings.js"; +import { console_log, resetEpoch } from "./logging.js"; import { chunkifyPythonFiles } from "./pythonChunker.js"; +import { createQueryContext } from "./queryContext.js"; +import { retryOn429 } from "./retryLogic.js"; import { ChunkDescription, SelectorSpecs } from "./selectorSchema.js"; import { SpelunkerContext } from "./spelunkerActionHandler.js"; -import { SummarizerSpecs } from "./summarizerSchema.js"; +import { prepareChunks } from "./summarizing.js"; import { chunkifyTypeScriptFiles } from "./typescriptChunker.js"; -let epoch: number = 0; - -function console_log(...rest: any[]): void { - if (!epoch) { - epoch = Date.now(); - console.log(""); // Start new epoch with a blank line - } - const t = Date.now(); - console.log(((t - epoch) / 1000).toFixed(3).padStart(6), ...rest); -} - -export interface QueryContext { - chatModel: ChatModel; - oracle: TypeChatJsonTranslator; - miniModel: ChatModel; - chunkSelector: TypeChatJsonTranslator; - chunkSummarizer: TypeChatJsonTranslator; - databaseLocation: string; - database: sqlite.Database | undefined; -} - -function captureTokenStats(req: any, response: any): void { - console_log( - ` [Tokens used: prompt=${response.usage.prompt_tokens}, ` + - `completion=${response.usage.completion_tokens}]`, - ); -} - -function createQueryContext(): QueryContext { - const chatModel = openai.createChatModelDefault("spelunkerChat"); - chatModel.completionCallback = captureTokenStats; - - const miniModel = openai.createChatModel( - undefined, // "GPT_4_O_MINI" is slower than default model?! - undefined, - undefined, - ["spelunkerMini"], - ); - miniModel.completionCallback = captureTokenStats; - - const oracle = createTranslator( - chatModel, - "oracleSchema.ts", - "OracleSpecs", - ); - const chunkSelector = createTranslator( - miniModel, - "selectorSchema.ts", - "SelectorSpecs", - ); - const chunkSummarizer = createTranslator( - miniModel, - "summarizerSchema.ts", - "SummarizerSpecs", - ); - - const databaseFolder = path.join( - process.env.HOME ?? "/", - ".typeagent", - "agents", - "spelunker", - ); - const mkdirOptions: fs.MakeDirectoryOptions = { - recursive: true, - mode: 0o700, - }; - fs.mkdirSync(databaseFolder, mkdirOptions); - - const databaseLocation = path.join(databaseFolder, "codeSearchDatabase.db"); - const database = undefined; - return { - chatModel, - oracle, - miniModel, - chunkSelector, - chunkSummarizer, - databaseLocation, - database, - }; -} - // Answer a question; called from request and from searchCode action export async function searchCode( context: SpelunkerContext, input: string, ): Promise { - epoch = 0; // Reset logging clock + resetEpoch(); console_log(`[searchCode question='${input}']`); // 0. Check if the focus is set. @@ -131,10 +42,15 @@ export async function searchCode( // 1. Create the database, chunkify all files in the focus folders, and store the chunks. // Or use what's in the database if it looks up-to-date. 
- const db = await loadDatabaseAndChunks(context); + if (!context.queryContext) { + context.queryContext = createQueryContext(); + } + await createDatabase(context); + await loadDatabase(context); + const db = context.queryContext!.database!; // 2. Load all chunks from the database. - const allChunks = await loadAllChunksFromDatabase(db); + const allChunks = await readAllChunksFromDatabase(db); // 3. Ask a fast LLM for the most relevant chunk Ids, rank them, and keep the best ones. const chunks = await selectChunks(context, allChunks, input); @@ -168,14 +84,7 @@ export async function searchCode( ); } -async function loadDatabaseAndChunks( - context: SpelunkerContext, -): Promise { - console_log(`[Step 1: Load database]`); - return await loadDatabase(context); -} - -async function loadAllChunksFromDatabase( +async function readAllChunksFromDatabase( db: sqlite.Database, ): Promise { console_log(`[Step 2: Load chunks from database]`); @@ -282,21 +191,29 @@ export async function selectChunks( console_log( `[Step 3: Select relevant chunks from ${allChunks.length} chunks]`, ); + console_log(`[Step 3a: Pre-select with fuzzy matching]`); + const nearestChunkIds = await preSelectChunks(context, input, 500); + allChunks = allChunks.filter((c) => nearestChunkIds.includes(c.chunkId)); + console_log(` [Pre-selected ${allChunks.length} chunks]`); + + console_log(`[Step 3b: Narrow those down with LLM]`); const promises: Promise[] = []; const maxConcurrency = parseInt(process.env.AZURE_OPENAI_MAX_CONCURRENCY ?? "5") ?? 5; const limiter = createLimiter(maxConcurrency); - const batchLimit = process.env.OPENAI_API_KEY ? 100000 : 250000; // TODO: tune - const batches = makeBatches(allChunks, batchLimit); + const batchLimit = process.env.OPENAI_API_KEY ? 100000 : 100000; // TODO: tune + const batches = makeBatches(allChunks, batchLimit, 60); // TODO: tune console_log( ` [${batches.length} batches, maxConcurrency ${maxConcurrency}]`, ); - for (const batch of batches) { + for (let i = 0; i < batches.length; i++) { + const batch = batches[i]; const p = limiter(() => selectRelevantChunks( context.queryContext!.chunkSelector, batch, input, + i, ), ); promises.push(p); @@ -315,7 +232,7 @@ export async function selectChunks( allChunkDescs.sort((a, b) => b.relevance - a.relevance); // console_log(` [${allChunks.map((c) => (c.relevance)).join(", ")}]`); - const maxKeep = process.env.OPENAI_API_KEY ? 100000 : 200000; // TODO: tune + const maxKeep = process.env.OPENAI_API_KEY ? 
100000 : 100000; // TODO: tune const chunks = keepBestChunks(allChunkDescs, allChunks, maxKeep); console_log(` [Keeping ${chunks.length} chunks]`); // for (let i = 0; i < chunks.length; i++) { @@ -332,6 +249,7 @@ async function selectRelevantChunks( selector: TypeChatJsonTranslator, chunks: Chunk[], input: string, + batchIndex: number, ): Promise { // TODO: Prompt engineering const prompt = `\ @@ -347,91 +265,18 @@ async function selectRelevantChunks( ${prepareChunks(chunks)} `; // console_log(prompt); - const result = await retryTranslateOn429(() => selector.translate(prompt)); + const result = await retryOn429(() => selector.translate(prompt)); if (!result) { - console_log(` [Failed to select chunks for ${chunks.length} chunks]`); + console_log( + ` [Failed to select chunks for batch ${batchIndex + 1} with ${chunks.length} chunks]`, + ); return []; } else { return result.chunkDescs; } } -function prepareChunks(chunks: Chunk[]): string { - chunks.sort( - // Sort by file name and chunk ID (should order by line number) - (a, b) => { - let cmp = a.fileName.localeCompare(b.fileName); - if (!cmp) { - cmp = a.lineNo - b.lineNo; - } - return cmp; - }, - ); - const output: string[] = []; - function put(line: string): void { - // console_log(line.trimEnd()); - output.push(line); - } - let lastFn = ""; - let lineNo = 0; - for (const chunk of chunks) { - if (chunk.fileName !== lastFn) { - lastFn = chunk.fileName; - lineNo = 0; - put("\n"); - put(`** file=${chunk.fileName}\n`); - } - put( - `* chunkId=${chunk.chunkId} kind=${chunk.treeName} name=${chunk.codeName}\n`, - ); - for (const blob of chunk.blobs) { - lineNo = blob.start; - for (const line of blob.lines) { - lineNo += 1; - put(`${lineNo} ${line}`); - } - } - } - return output.join(""); -} - -// TODO: Make the values two elements, comment start and comment end -// (and then caller should ensure comment end doesn't occur in the comment text). -const languageCommentMap: { [key: string]: string } = { - python: "#", - typescript: "//", -}; - -// TODO: Remove export once we're using summaries again. -export function prepareSummaries(db: sqlite.Database): string { - const selectAllSummaries = db.prepare(`SELECT * FROM Summaries`); - const summaryRows: any[] = selectAllSummaries.all(); - if (summaryRows.length > 100) { - console_log(` [Over 100 summary rows, skipping summaries in prompt]`); - return ""; - } - const lines: string[] = []; - for (const summaryRow of summaryRows) { - const comment = languageCommentMap[summaryRow.language] ?? 
"#"; - lines.push(""); - lines.push(`${comment} ${summaryRow.summary}`); - lines.push(summaryRow.signature); - } - return lines.join("\n"); -} - -function createTranslator( - model: ChatModel, - schemaFile: string, - typeName: string, -): TypeChatJsonTranslator { - const schema = loadSchema([schemaFile], import.meta.url); - const validator = createTypeScriptJsonValidator(schema, typeName); - const translator = createJsonTranslator(model, validator); - return translator; -} - -interface FileMtimeSize { +export interface FileMtimeSize { file: string; mtime: number; size: number; @@ -477,46 +322,15 @@ function getAllSourceFiles(dir: string): FileMtimeSize[] { return results; } -// Should be in actionHelpers.ts -function createActionResultFromMarkdownDisplay( - literalText: string, - entities: Entity[] = [], - resultEntity?: Entity, -): ActionResultSuccess { - return { - literalText, - entities, - resultEntity, - displayContent: { type: "markdown", content: literalText }, - }; -} - -async function loadDatabase( - context: SpelunkerContext, -): Promise { +// TODO: Break into multiple functions. +// Notably the part that compares files in the database and files on disk. +async function loadDatabase(context: SpelunkerContext): Promise { + console_log(`[Step 1: Load database]`); if (!context.queryContext) { context.queryContext = createQueryContext(); } - const db = createDatabase(context); - - const prepDeleteSummaries = db.prepare(` - DELETE FROM Summaries WHERE chunkId IN ( - SELECT chunkId - FROM chunks - WHERE fileName = ? - ) - `); - const prepDeleteBlobs = db.prepare(` - DELETE FROM Blobs WHERE chunkId IN ( - SELECT chunkId - FROM chunks - WHERE filename = ? - ) - `); - const prepDeleteChunks = db.prepare( - `DELETE FROM Chunks WHERE fileName = ?`, - ); - const prepDeleteFiles = db.prepare(`DELETE FROM files WHERE fileName = ?`); + const db = context.queryContext!.database!; + const prepInsertFiles = db.prepare( `INSERT OR REPLACE INTO Files (fileName, mtime, size) VALUES (?, ?, ?)`, ); @@ -549,6 +363,7 @@ async function loadDatabase( size: fileRow.size, }); } + const filesToInsert: FileMtimeSize[] = []; for (const file of files) { const dbStat = filesInDb.get(file.file); if ( @@ -559,7 +374,7 @@ async function loadDatabase( // console_log(` [Need to update ${file} (mtime/size mismatch)]`); filesToDo.push(file.file); // TODO: Make this insert part of the transaction for this file - prepInsertFiles.run(file.file, file.mtime, file.size); + filesToInsert.push(file); filesInDb.set(file.file, { file: file.file, mtime: file.mtime, @@ -575,12 +390,7 @@ async function loadDatabase( console_log(` [Deleting ${filesToDelete.length} files from database]`); for (const file of filesToDelete) { // console_log(` [Deleting ${file} from database]`); - db.exec(`BEGIN TRANSACTION`); - prepDeleteSummaries.run(file); - prepDeleteBlobs.run(file); - prepDeleteChunks.run(file); - prepDeleteFiles.run(file); - db.exec(`COMMIT`); + purgeFile(db, file); } } @@ -588,7 +398,7 @@ async function loadDatabase( console_log( ` [No files to update out of ${files.length}, yay cache!]`, ); - return db; + return; } // 1b. Chunkify all new files (without LLM help). 
@@ -620,10 +430,16 @@ async function loadDatabase( ); const allChunks: Chunk[] = []; for (const chunkedFile of allChunkedFiles) { + purgeFile(db, chunkedFile.fileName); db.exec(`BEGIN TRANSACTION`); - prepDeleteSummaries.run(chunkedFile.fileName); - prepDeleteBlobs.run(chunkedFile.fileName); - prepDeleteChunks.run(chunkedFile.fileName); + const file = filesToInsert.find((f) => f.file === chunkedFile.fileName); + if (!file) { + console_log( + ` [*** File ${chunkedFile.fileName} is missing from filesToInsert]`, + ); + continue; + } + prepInsertFiles.run(file.file, file.mtime, file.size); for (const chunk of chunkedFile.chunks) { allChunks.push(chunk); prepInsertChunks.run( @@ -648,293 +464,14 @@ async function loadDatabase( console_log( ` [Chunked ${allChunkedFiles.length} files into ${allChunks.length} chunks]`, ); - - // 1c. Use a fast model to summarize all chunks. - if (allChunks.length) { - await summarizeChunks(context, allChunks); - } - - return db; -} - -const databaseSchema = ` -CREATE TABLE IF NOT EXISTS Files ( - fileName TEXT PRIMARY KEY, - mtime FLOAT NOT NULL, - size INTEGER NOT NULL -); -CREATE TABLE IF NOT EXISTS Chunks ( - chunkId TEXT PRIMARY KEY, - treeName TEXT NOT NULL, - codeName TEXT NOT NULL, - parentId TEXT KEY REFERENCES Chunks(chunkId), -- May be null - fileName TEXT KEY REFERENCES files(fileName) NOT NULL, - lineNo INTEGER NOT NULL -- 1-based -); -CREATE TABLE IF NOT EXISTS Blobs ( - chunkId TEXT KEY REFERENCES Chunks(chunkId) NOT NULL, - start INTEGER NOT NULL, -- 0-based - lines TEXT NOT NULL, - breadcrumb TEXT -- Chunk ID or empty string or NULL -); -CREATE TABLE IF NOT EXISTS Summaries ( - chunkId TEXT PRIMARY KEY REFERENCES Chunks(chunkId), - language TEXT, -- "python", "typescript", etc. - summary TEXT, - signature TEXT -) -`; - -function getDbOptions() { - if (process?.versions?.electron !== undefined) { - return undefined; - } - const r = createRequire(import.meta.url); - const betterSqlitePath = r.resolve("better-sqlite3/package.json"); - const nativeBinding = path.join( - betterSqlitePath, - "../build/Release/better_sqlite3.n.node", - ); - return { nativeBinding }; -} - -function createDatabase(context: SpelunkerContext): sqlite.Database { - if (!context.queryContext) { - context.queryContext = createQueryContext(); - } - const loc = context.queryContext.databaseLocation; - const db0 = context.queryContext.database; - if (db0) { - console_log(` [Using database at ${loc}]`); - return db0; - } - if (fs.existsSync(loc)) { - console_log(` [Opening database at ${loc}]`); - } else { - console_log(` [Creating database at ${loc}]`); - } - const db = new Database(loc, getDbOptions()); - // Write-Ahead Logging, improving concurrency and performance - db.pragma("journal_mode = WAL"); - // Fix permissions to be read/write only by the owner - fs.chmodSync(context.queryContext.databaseLocation, 0o600); - // Create all the tables we'll use - db.exec(databaseSchema); - context.queryContext.database = db; - return db; -} - -async function summarizeChunks( - context: SpelunkerContext, - chunks: Chunk[], -): Promise { - console_log(`[Step 1c: Summarizing ${chunks.length} chunks]`); - // NOTE: We cannot stuff the buffer, because the completion size - // is limited to 4096 tokens, and we expect a certain number of - // tokens per chunk. Experimentally, 40 chunks per job works great. - const maxConcurrency = - parseInt(process.env.AZURE_OPENAI_MAX_CONCURRENCY ?? "0") ?? 
5; - let chunksPerJob = 40; - let numJobs = Math.ceil(chunks.length / chunksPerJob); - console_log( - ` [${chunksPerJob} chunks/job, ${numJobs} jobs, maxConcurrency ${maxConcurrency}]`, - ); - const limiter = createLimiter(maxConcurrency); - const promises: Promise[] = []; - for (let i = 0; i < chunks.length; i += chunksPerJob) { - const slice = chunks.slice(i, i + chunksPerJob); - promises.push(limiter(() => summarizeChunkSlice(context, slice))); - } - await Promise.all(promises); -} - -async function summarizeChunkSlice( - context: SpelunkerContext, - chunks: Chunk[], -): Promise { - const summarizer = context.queryContext!.chunkSummarizer; - // TODO: Prompt engineering - const prompt = `\ - Please summarize each of the given chunks. - A summary should be a one-line description of the chunk. - Also include the signature of the chunk. - - Chunks: - ${prepareChunks(chunks)} - `; - // console_log(prompt); - const result = await retryTranslateOn429(() => - summarizer.translate(prompt), - ); - if (!result) { - console_log( - ` [Failed to summarize chunks for ${chunks.length} chunks]`, - ); + if (!allChunks.length) { + console_log(` [No chunks to load]`); return; } - const summarizeSpecs = result; - // console_log(` [Received ${result.summaries.length} summaries]`); - // Enter them into the database - const db = context.queryContext!.database!; - const prepInsertSummary = db.prepare( - `INSERT OR REPLACE INTO Summaries (chunkId, language, summary, signature) VALUES (?, ?, ?, ?)`, - ); - const prepGetBlobWithBreadcrumb = db.prepare( - `SELECT lines, breadcrumb FROM Blobs WHERE breadcrumb = ?`, - ); - const prepUpdateBlob = db.prepare( - "UPDATE Blobs SET lines = ? WHERE breadcrumb = ?", - ); - let errors = 0; - for (const summary of summarizeSpecs.summaries) { - // console_log(summary); - try { - prepInsertSummary.run( - summary.chunkId, - summary.language, - summary.summary, - summary.signature, - ); - } catch (error) { - console_log( - `*** Db error for insert summary ${JSON.stringify(summary)}: ${error}`, - ); - errors += 1; - } - try { - type BlobRowType = { lines: string; breadcrumb: ChunkId }; - const blobRow: BlobRowType = prepGetBlobWithBreadcrumb.get( - summary.chunkId, - ) as any; - if (blobRow) { - let blobLines: string = blobRow.lines; - // Assume it doesn't start with a blank line /(^\s*\r?\n)*/ - const indent = blobLines?.match(/^(\s*)\S/)?.[1] ?? ""; // Whitespace followed by non-whitespace - blobLines = - `${indent}${languageCommentMap[summary.language ?? 
"python"]} ${summary.summary}\n` + - `${indent}${summary.signature} ...\n`; - // console_log( - // ` [Replacing\n'''\n${blobRow.lines}'''\nwith\n'''\n${blobLines}\n''']`, - // ); - const res = prepUpdateBlob.run(blobLines, summary.chunkId); - if (res.changes !== 1) { - console_log( - ` [*** Failed to update blob lines for ${summary.chunkId}]`, - ); - } - } - } catch (error) { - console_log( - `*** Db error for update blob ${JSON.stringify(summary)}: ${error}`, - ); - errors += 1; - } - } - if (errors) console_log(` [${errors} errors]`); -} - -async function retryTranslateOn429( - translate: () => Promise>, - retries: number = 3, - defaultDelay: number = 5000, -): Promise { - let wrappedResult: Result; - do { - retries--; - wrappedResult = await translate(); - // console_log(wrappedResult); - if (!wrappedResult.success) { - if ( - retries > 0 && - wrappedResult.message.includes("fetch error: 429:") - ) { - let delay = defaultDelay; - const azureTime = wrappedResult.message.match( - /after (\d+) milliseconds/, - ); - const openaiTime = wrappedResult.message.match( - /Please try again in (\d+\.\d*|\.\d+|\d+m)s./, - ); - if (azureTime || openaiTime) { - if (azureTime) { - delay = parseInt(azureTime[1]); - } else if (openaiTime) { - delay = parseFloat(openaiTime[1]); - if (!openaiTime[1].endsWith("m")) { - delay *= 1000; - } - } - } else { - console_log( - ` [Couldn't find msec in '${wrappedResult.message}'`, - ); - } - console_log(` [Retry on 429 error: sleep ${delay} ms]`); - await new Promise((resolve) => setTimeout(resolve, delay)); - continue; - } - console_log(` [${wrappedResult.message}]`); - return undefined; - } - } while (!wrappedResult.success); - return wrappedResult.data; -} - -function keepBestChunks( - chunkDescs: ChunkDescription[], // Sorted by descending relevance - allChunks: Chunk[], - batchSize: number, // In characters -): Chunk[] { - const chunks: Chunk[] = []; - let size = 0; - for (const chunkDesc of chunkDescs) { - const chunk = allChunks.find((c) => c.chunkId === chunkDesc.chunkId); - if (!chunk) continue; - const chunkSize = getChunkSize(chunk); - if (size + chunkSize > batchSize && chunks.length) { - break; - } - chunks.push(chunk); - size += chunkSize; - } - return chunks; -} - -function makeBatches( - chunks: Chunk[], - batchSize: number, // In characters -): Chunk[][] { - const batches: Chunk[][] = []; - let batch: Chunk[] = []; - let size = 0; - function flush(): void { - batches.push(batch); - console_log( - ` [Batch ${batches.length} has ${batch.length} chunks and ${size} bytes]`, - ); - batch = []; - size = 0; - } - for (const chunk of chunks) { - const chunkSize = getChunkSize(chunk); - if (size + chunkSize > batchSize && batch.length) { - flush(); - } - batch.push(chunk); - size += chunkSize; - } - if (batch.length) { - flush(); - } - return batches; -} + // 1c. Store all chunk embeddings. + await loadEmbeddings(context, allChunks); -function getChunkSize(chunk: Chunk): number { - // This is all an approximation - let size = chunk.fileName.length + 50; - for (const blob of chunk.blobs) { - size += blob.lines.join("").length + 4 * blob.lines.length; - } - return size; + // 1d. Use a fast model to summarize all chunks. 
+ // await summarizeChunks(context, allChunks); } diff --git a/ts/packages/agents/spelunker/src/spelunkerActionHandler.ts b/ts/packages/agents/spelunker/src/spelunkerActionHandler.ts index e806f374d..46714cf2b 100644 --- a/ts/packages/agents/spelunker/src/spelunkerActionHandler.ts +++ b/ts/packages/agents/spelunker/src/spelunkerActionHandler.ts @@ -25,7 +25,8 @@ import { getCommandInterface, } from "@typeagent/agent-sdk/helpers/command"; -import { searchCode, QueryContext } from "./searchCode.js"; +import { searchCode } from "./searchCode.js"; +import { QueryContext } from "./queryContext.js"; import { SpelunkerAction } from "./spelunkerSchema.js"; class RequestCommandHandler implements CommandHandler { diff --git a/ts/packages/agents/spelunker/src/summarizing.ts b/ts/packages/agents/spelunker/src/summarizing.ts new file mode 100644 index 000000000..12238abdd --- /dev/null +++ b/ts/packages/agents/spelunker/src/summarizing.ts @@ -0,0 +1,183 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import * as sqlite from "better-sqlite3"; + +import { createLimiter } from "common-utils"; + +import { Chunk, ChunkId } from "./chunkSchema.js"; +import { console_log } from "./logging.js"; +import { retryOn429 } from "./retryLogic.js"; +import { SpelunkerContext } from "./spelunkerActionHandler.js"; + +export async function summarizeChunks( + context: SpelunkerContext, + chunks: Chunk[], +): Promise { + console_log(`[Step 1d: Summarizing ${chunks.length} chunks]`); + // NOTE: We cannot stuff the buffer, because the completion size + // is limited to 4096 tokens, and we expect a certain number of + // tokens per chunk. Experimentally, 40 chunks per job works great. + const maxConcurrency = + parseInt(process.env.AZURE_OPENAI_MAX_CONCURRENCY ?? "0") ?? 5; + let chunksPerJob = 40; + let numJobs = Math.ceil(chunks.length / chunksPerJob); + console_log( + ` [${chunksPerJob} chunks/job, ${numJobs} jobs, maxConcurrency ${maxConcurrency}]`, + ); + const limiter = createLimiter(maxConcurrency); + const promises: Promise[] = []; + for (let i = 0; i < chunks.length; i += chunksPerJob) { + const slice = chunks.slice(i, i + chunksPerJob); + promises.push(limiter(() => summarizeChunkSlice(context, slice))); + } + await Promise.all(promises); +} + +async function summarizeChunkSlice( + context: SpelunkerContext, + chunks: Chunk[], +): Promise { + const summarizer = context.queryContext!.chunkSummarizer; + // TODO: Prompt engineering + const prompt = `\ + Please summarize each of the given chunks. + A summary should be a one-line description of the chunk. + Also include the signature of the chunk. + + Chunks: + ${prepareChunks(chunks)} + `; + // console_log(prompt); + const result = await retryOn429(() => summarizer.translate(prompt)); + if (!result) { + console_log( + ` [Failed to summarize chunks for ${chunks.length} chunks]`, + ); + return; + } + + const summarizeSpecs = result; + // console_log(` [Received ${result.summaries.length} summaries]`); + // Enter them into the database + const db = context.queryContext!.database!; + const prepInsertSummary = db.prepare( + `INSERT OR REPLACE INTO Summaries (chunkId, language, summary, signature) VALUES (?, ?, ?, ?)`, + ); + const prepGetBlobWithBreadcrumb = db.prepare( + `SELECT lines, breadcrumb FROM Blobs WHERE breadcrumb = ?`, + ); + const prepUpdateBlob = db.prepare( + "UPDATE Blobs SET lines = ? 
WHERE breadcrumb = ?", + ); + let errors = 0; + for (const summary of summarizeSpecs.summaries) { + // console_log(summary); + try { + prepInsertSummary.run( + summary.chunkId, + summary.language, + summary.summary, + summary.signature, + ); + } catch (error) { + console_log( + `*** Db error for insert summary ${JSON.stringify(summary)}: ${error}`, + ); + errors += 1; + } + try { + type BlobRowType = { lines: string; breadcrumb: ChunkId }; + const blobRow: BlobRowType = prepGetBlobWithBreadcrumb.get( + summary.chunkId, + ) as any; + if (blobRow) { + let blobLines: string = blobRow.lines; + // Assume it doesn't start with a blank line /(^\s*\r?\n)*/ + const indent = blobLines?.match(/^(\s*)\S/)?.[1] ?? ""; // Whitespace followed by non-whitespace + blobLines = + `${indent}${languageCommentMap[summary.language ?? "python"]} ${summary.summary}\n` + + `${indent}${summary.signature} ...\n`; + // console_log( + // ` [Replacing\n'''\n${blobRow.lines}'''\nwith\n'''\n${blobLines}\n''']`, + // ); + const res = prepUpdateBlob.run(blobLines, summary.chunkId); + if (res.changes !== 1) { + console_log( + ` [*** Failed to update blob lines for ${summary.chunkId}]`, + ); + } + } + } catch (error) { + console_log( + `*** Db error for update blob ${JSON.stringify(summary)}: ${error}`, + ); + errors += 1; + } + } + if (errors) console_log(` [${errors} errors]`); +} + +export function prepareChunks(chunks: Chunk[]): string { + chunks.sort( + // Sort by file name and chunk ID (should order by line number) + (a, b) => { + let cmp = a.fileName.localeCompare(b.fileName); + if (!cmp) { + cmp = a.lineNo - b.lineNo; + } + return cmp; + }, + ); + const output: string[] = []; + function put(line: string): void { + // console_log(line.trimEnd()); + output.push(line); + } + let lastFn = ""; + let lineNo = 0; + for (const chunk of chunks) { + if (chunk.fileName !== lastFn) { + lastFn = chunk.fileName; + lineNo = 0; + put("\n"); + put(`** file=${chunk.fileName}\n`); + } + put( + `* chunkId=${chunk.chunkId} kind=${chunk.treeName} name=${chunk.codeName}\n`, + ); + for (const blob of chunk.blobs) { + lineNo = blob.start; + for (const line of blob.lines) { + lineNo += 1; + put(`${lineNo} ${line}`); + } + } + } + return output.join(""); +} + +// TODO: Remove export once we're using summaries again. +export function prepareSummaries(db: sqlite.Database): string { + const selectAllSummaries = db.prepare(`SELECT * FROM Summaries`); + const summaryRows: any[] = selectAllSummaries.all(); + if (summaryRows.length > 100) { + console_log(` [Over 100 summary rows, skipping summaries in prompt]`); + return ""; + } + const lines: string[] = []; + for (const summaryRow of summaryRows) { + const comment = languageCommentMap[summaryRow.language] ?? "#"; + lines.push(""); + lines.push(`${comment} ${summaryRow.summary}`); + lines.push(summaryRow.signature); + } + return lines.join("\n"); +} + +// TODO: Make the values two elements, comment start and comment end +// (and then caller should ensure comment end doesn't occur in the comment text). +const languageCommentMap: { [key: string]: string } = { + python: "#", + typescript: "//", +}; diff --git a/ts/packages/agents/spelunker/src/typescriptChunker.ts b/ts/packages/agents/spelunker/src/typescriptChunker.ts index 05b9a56fe..25f649f45 100644 --- a/ts/packages/agents/spelunker/src/typescriptChunker.ts +++ b/ts/packages/agents/spelunker/src/typescriptChunker.ts @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+import path from "path"; + import ts from "typescript"; import { tsCode } from "code-processor"; @@ -12,7 +14,7 @@ import { ChunkedFile, ChunkerErrorItem, } from "./chunkSchema.js"; -import path from "path"; +import { console_log } from "./logging.js"; let last_ts = Date.now() * 1000; export function generate_id(): ChunkId { @@ -27,10 +29,10 @@ export function generate_id(): ChunkId { export async function chunkifyTypeScriptFiles( fileNames: string[], ): Promise<(ChunkedFile | ChunkerErrorItem)[]> { - // console.log("========================================================"); + // console_log("========================================================"); const results: (ChunkedFile | ChunkerErrorItem)[] = []; for (const fileName of fileNames) { - // console.log(fileName); + // console_log(fileName); const sourceFile: ts.SourceFile = await tsCode.loadSourceFile(fileName); const baseName = path.basename(fileName); @@ -76,7 +78,7 @@ export async function chunkifyTypeScriptFiles( ts.isFunctionDeclaration(childNode) || ts.isClassDeclaration(childNode) ) { - // console.log( + // console_log( // ts.SyntaxKind[childNode.kind], // tsCode.getStatementName(childNode), // ); @@ -156,7 +158,7 @@ function spliceBlobs(parentChunk: Chunk, childChunk: Chunk): void { blobs.push({ start: startBefore, lines: linesBefore }); } const sig: string = signature(childChunk); - // console.log("signature", sig); + // console_log("signature", sig); if (sig) { blobs.push({ start: childBlob.start, @@ -209,7 +211,7 @@ function makeBlobs( startPos = lineStarts[startLoc.line + 1]; startLoc = sourceFile.getLineAndCharacterOfPosition(startPos); } - // console.log( + // console_log( // `Start and end: ${startPos}=${startLoc.line + 1}:${startLoc.character}, ` + // `${endPos}=${endLoc.line + 1}:${endLoc.character}`, // ); @@ -217,7 +219,7 @@ function makeBlobs( startPos = lineStarts[startLoc.line + 1]; startLoc = sourceFile.getLineAndCharacterOfPosition(startPos); } - // console.log( + // console_log( // `Updated start: ${startPos}=${startLoc.line + 1}:${startLoc.character}`, // ); const lines: string[] = []; @@ -229,7 +231,7 @@ function makeBlobs( while (lines && !lines[lines.length - 1].trim()) { lines.pop(); } - // console.log(lines.slice(0, 3), "...", lines.slice(-3)); + // console_log(lines.slice(0, 3), "...", lines.slice(-3)); if (!lines.length) { return []; } @@ -251,6 +253,6 @@ export class Testing { "./packages/agents/spelunker/src/pythonChunker.ts", ]; const results = await chunkifyTypeScriptFiles(fileNames); - console.log(JSON.stringify(results, null, 2)); + console_log(JSON.stringify(results, null, 2)); } }