Remove dead code from pythonChunker.ts
gvanrossum-ms committed Feb 10, 2025
1 parent 6b12c22 · commit 4618278
Showing 1 changed file with 1 addition and 105 deletions.
ts/packages/agents/spelunker/src/pythonChunker.ts
@@ -9,12 +9,7 @@ import path from "path";
 import { fileURLToPath } from "url";
 import { promisify } from "util";
 
-import {
-    ChunkId,
-    Chunk,
-    ChunkedFile,
-    ChunkerErrorItem,
-} from "./chunkSchema.js";
+import { ChunkedFile, ChunkerErrorItem } from "./chunkSchema.js";
 
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = path.dirname(__filename);
@@ -65,104 +60,5 @@ export async function chunkifyPythonFiles(
             }
         }
     }
-    if (true) return results;
-    else return splitLargeFiles(results); // Alas, this breaks some things.
 }
 
-// *** Everything below here is potentially dead code. ***
-
-// TODO: Tune these constants (count should be much larger, at least).
-const CHUNK_COUNT_LIMIT = 25; // How many chunks at most.
-const FILE_SIZE_LIMIT = 25000; // How many characters at most.
-
-function splitLargeFiles(
-    items: (ChunkedFile | ChunkerErrorItem)[],
-): (ChunkedFile | ChunkerErrorItem)[] {
-    const results: (ChunkedFile | ChunkerErrorItem)[] = [];
-    for (const item of items) {
-        if (
-            "error" in item ||
-            (item.chunks.length <= CHUNK_COUNT_LIMIT &&
-                fileSize(item) <= FILE_SIZE_LIMIT)
-        ) {
-            results.push(item);
-        } else {
-            results.push(...splitFile(item));
-        }
-    }
-    return results;
-}
-
-// This algorithm is too complex. I needed a debugger and logging to get it right.
-function splitFile(file: ChunkedFile): ChunkedFile[] {
-    const fileName = file.fileName;
-    const parentMap: Map<ChunkId, Chunk> = new Map();
-    for (const chunk of file.chunks) {
-        // Only nodes with children will be looked up in this map.
-        if (chunk.children.length) parentMap.set(chunk.chunkId, chunk);
-    }
-
-    const results: ChunkedFile[] = []; // Where output accumulates.
-    let chunks = Array.from(file.chunks); // The chunks yet to emit.
-    let minNumChunks = 1;
-
-    outer: while (true) {
-        // Keep going until we exit the inner loop.
-        let totalSize = 0; // Size in characters of chunks to be output.
-        for (let i = 0; i < chunks.length; i++) {
-            // Iterate in pre-order.
-            const currentChunk = chunks[i];
-            const size = chunkSize(currentChunk);
-            if (
-                i < minNumChunks ||
-                (i < CHUNK_COUNT_LIMIT && totalSize + size <= FILE_SIZE_LIMIT)
-            ) {
-                totalSize += size;
-                continue;
-            }
-
-            // Split the file here (current chunk goes into ancestors).
-            const rest = chunks.splice(i);
-            if (rest.shift() !== currentChunk)
-                throw Error(
-                    "Internal error: expected current chunk at head of rest",
-                );
-            results.push({ fileName, chunks });
-            const ancestors: Chunk[] = [];
-
-            let c: Chunk | undefined = currentChunk;
-            do {
-                ancestors.unshift(c);
-                c = parentMap.get(c.parentId);
-            } while (c);
-            // Note that the current chunk is the last ancestor.
-            chunks = [...ancestors, ...rest];
-            minNumChunks = ancestors.length;
-            continue outer;
-        }
-        // Append the final chunk.
-        results.push({ fileName, chunks });
-        break;
-    }
-    // console.log(
-    //     `Split ${file.fileName} (${file.chunks.length} chunks) into ${results.length} files.`,
-    // );
-    // console.log(`Sizes: ${results.map((f) => f.chunks.length).join(", ")}`);
-    return results;
-}
-
-function fileSize(file: ChunkedFile): number {
-    return file.chunks.reduce((acc, chunk) => acc + chunkSize(chunk), 0);
-}
-
-function chunkSize(chunk: Chunk): number {
-    let totalCharacters = 0;
-    for (const blob of chunk.blobs) {
-        if (!blob.breadcrumb) {
-            for (const line of blob.lines) {
-                totalCharacters += line.length;
-            }
-        }
-    }
-    return totalCharacters;
-}

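The trickiest part of the removed splitFile() (per its own comment, "too complex") is the ancestor re-emission step: when output is cut at some chunk, the next output file is seeded with that chunk's full ancestor chain so it is never orphaned from its enclosing scopes. Below is a minimal sketch of that idea, not the repository's code; the Chunk/ChunkId types are simplified stand-ins for the real definitions in chunkSchema.ts (which carry more fields, such as blobs), and the root-marker convention (an empty parentId that is absent from the map) is an assumption for the demo.

```ts
type ChunkId = string;

// Simplified stand-in for the real Chunk type in chunkSchema.ts (assumption).
interface Chunk {
    chunkId: ChunkId;
    parentId: ChunkId; // root: any id not present in the parent map
    children: ChunkId[];
}

// Build the parent lookup the way the removed splitFile() did:
// only chunks that have children can ever be somebody's parent.
function buildParentMap(chunks: Chunk[]): Map<ChunkId, Chunk> {
    const parentMap = new Map<ChunkId, Chunk>();
    for (const chunk of chunks) {
        if (chunk.children.length) parentMap.set(chunk.chunkId, chunk);
    }
    return parentMap;
}

// Walk parent links from `chunk` up to the root, returning root-first order
// (the do/while + unshift pattern from the removed code).
function ancestorChain(chunk: Chunk, parentMap: Map<ChunkId, Chunk>): Chunk[] {
    const ancestors: Chunk[] = [];
    let c: Chunk | undefined = chunk;
    do {
        ancestors.unshift(c);
        c = parentMap.get(c.parentId);
    } while (c);
    return ancestors; // root first; `chunk` itself is the last ancestor
}

// Toy pre-order chunk list: a module containing a class containing a method.
const mod: Chunk = { chunkId: "module", parentId: "", children: ["cls"] };
const cls: Chunk = { chunkId: "cls", parentId: "module", children: ["method"] };
const method: Chunk = { chunkId: "method", parentId: "cls", children: [] };

const parentMap = buildParentMap([mod, cls, method]);

// If the file is split at `method`, the next file starts with its context:
console.log(ancestorChain(method, parentMap).map((c) => c.chunkId));
// -> [ "module", "cls", "method" ]
```

This also explains minNumChunks in the removed loop: after a split, chunks becomes [...ancestors, ...rest] and minNumChunks is set to ancestors.length, so the re-emitted ancestors are always accepted into the next file even when they alone exceed the size budget. The next split point therefore always falls strictly after them, which is what guarantees the loop makes progress and terminates.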