diff --git a/ts/packages/agents/spelunker/evals/README.md b/ts/packages/agents/spelunker/evals/README.md index abfab25a..60fc90ba 100644 --- a/ts/packages/agents/spelunker/evals/README.md +++ b/ts/packages/agents/spelunker/evals/README.md @@ -12,6 +12,10 @@ as sample code. We use `evals/eval-1` as the directory to hold all eval data. (This happens to be the default built into the scripts.) +(Consider using a different directory -- we now have checked-in +data for both `eval-1` (`dispatcher`) and `eval-2` (`spelunker`), +so consider a higher number or a different prefix.) + ## 1. Copy source files to EVALDIR (`evals/eval-1`) Assume the TypeAgent root is `~/TypeAgent`. Adjust to taste. @@ -28,6 +32,10 @@ $ We delete `dist` and `node_mpdules` to save space (Spelunker ignores them). We remove `package.json` since otherwise the Repo policy test fails. +Create `evals/eval-1/source/README.md` to explain the origin of the code +(notably the git commit ID from which the code was copied, and the path +of the code relative to TypeAgent). + ## 2. Run Spelunker over the copied sources NOTE: You must exit the CLI by hitting `^C`. @@ -57,8 +65,11 @@ This leaves the data in the database `~/.typeagent/agents/spelunker/codeSearchDa ## 3. Initialize the eval database -You can do this multiple times, but once you've started scoring (#4 below), -it will erase the scores you've already entered. (TODO: preserve scores.) +You can do this multiple times, using `--overwrite`. (Without that flag, +it will create a new eval directory `eval-N`.) `--overwrite` preserves +the Questions and Scores tables, but recomputes the Hashes table, after +recopying the Files, Chunks and Blobs tables. +(It doesn't need the Embeddings and Summaries tables.) 
```shell $ python3 ./evals/src/evalsetup.py --overwrite diff --git a/ts/packages/agents/spelunker/evals/design.md b/ts/packages/agents/spelunker/evals/design.md index fb324979..b17918c0 100644 --- a/ts/packages/agents/spelunker/evals/design.md +++ b/ts/packages/agents/spelunker/evals/design.md @@ -122,7 +122,28 @@ we can move on to the next stage, running the evals: ## Automatic eval runs -TBD +An eval run needs to do the following: + +- Use the eval database as ground truth for files, chunks and blobs. +- (Not sure yet what to do if the run needs e.g. summaries.) +- For each question in the Questions table: + - Run the full chunk selection process using that question. + - This includes the part that keeps the top N selected chunks only. + - Compute the F1 score by combining the precision (p) and recall (r), + measured against the scores in the Scores table: `F1 = 2 * (p*r) / (p+r)` + - Print some JSON with the question, the F1 score, and the algorithm + (and perhaps a timestamp). + +## Tooling needed for automatic eval runs + +We need to write a new TypeScript program that reuses much of +`searchCode.ts`, setting the database to the right file +(given on the command line), +and running variants of the selection algorithm +(another command line flag). +This should be straightforward. +We may need small tweaks to the existing algorithms to make the right +APIs available. 
# Random notes diff --git a/ts/packages/agents/spelunker/evals/src/evalsetup.py b/ts/packages/agents/spelunker/evals/src/evalsetup.py index a367b367..9f1c8f65 100755 --- a/ts/packages/agents/spelunker/evals/src/evalsetup.py +++ b/ts/packages/agents/spelunker/evals/src/evalsetup.py @@ -90,7 +90,7 @@ def main(): dst_conn = sqlite3.connect(dbname) dst_cur = dst_conn.cursor() - copy_table(src_cur, dst_cur, "Files") + copy_table(src_cur, dst_cur, "Files", filename_prefix) copy_table(src_cur, dst_cur, "Chunks") copy_table(src_cur, dst_cur, "Blobs") src_conn.close() @@ -101,20 +101,28 @@ def main(): dst_conn.close() -def copy_table(src_cur, dst_cur, table_name): - # Get CREATE TABLE SQL from the source +def copy_table(src_cur, dst_cur, table_name, prefix=None): + # Get CREATE TABLE SQL from the database's schema create_sql = src_cur.execute( f"SELECT sql FROM sqlite_master WHERE type='table' AND name='{table_name}'" ).fetchone()[0] if create_sql.startswith("CREATE TABLE"): create_sql = create_sql.replace("CREATE TABLE", "CREATE TABLE IF NOT EXISTS") # print(create_sql) - print("Creating and clearing table {table_name}") + print(f"Creating and clearing table {table_name}") dst_cur.execute(create_sql) dst_cur.execute(f"DELETE FROM {table_name}") - # Copy rows + # Read rows rows = src_cur.execute(f"SELECT * FROM {table_name}").fetchall() + if prefix and table_name.lower() == "files": + # Check the filenames start with the prefix + for row in rows: + filename = row[0] + if not filename.startswith(prefix): + print(f"Aborting because {filename} does not start with {prefix}") + sys.exit(1) + # Copy rows print(f"Inserting {len(rows)} rows with {len(rows[0])} columns into {table_name}") placeholders = ",".join(["?"] * len(rows[0])) dst_cur.executemany(f"INSERT INTO {table_name} VALUES ({placeholders})", rows) @@ -132,7 +140,7 @@ def copy_table(src_cur, dst_cur, table_name): CREATE TABLE IF NOT EXISTS Scores ( questionId INTEGER REFERENCES Questions(id), chunkHash TEXT REFERENCES 
Hashes(chunkHash), - score INTEGER, -- 0 or 1, + score INTEGER, -- 0 or 1 timestamp TEXT ); """