Tweak eval setup, add to evals/design

gvanrossum-ms · Feb 24, 2025 · 2aeea37 · 2aeea37
1 parent 44c25dc
commit 2aeea37
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 9 deletions.
diff --git a/ts/packages/agents/spelunker/evals/README.md b/ts/packages/agents/spelunker/evals/README.md
@@ -12,6 +12,10 @@ as sample code.
 We use `evals/eval-1` as the directory to hold all eval data.
 (This happens to be the default built into the scripts.)
 
+(Consider using a different directory -- we now have checked-in
+data for both `eval-1` (`dispatcher`) and `eval-2` (`spelunker`),
+so consider a higher number or a different prefix.)
+
 ## 1. Copy source files to EVALDIR (`evals/eval-1`)
 
 Assume the TypeAgent root is `~/TypeAgent`. Adjust to taste.
@@ -28,6 +32,10 @@ $
 We delete `dist` and `node_modules` to save space (Spelunker ignores them).
 We remove `package.json` since otherwise the Repo policy test fails.
 
+Create `evals/eval-1/source/README.md` to explain the origin of the code
+(notably the git commit ID from which the code was copied, and the path
+of the code relative to TypeAgent).
+
 ## 2. Run Spelunker over the copied sources
 
 NOTE: You must exit the CLI by hitting `^C`.
@@ -57,8 +65,11 @@ This leaves the data in the database `~/.typeagent/agents/spelunker/codeSearchDa
 
 ## 3. Initialize the eval database
 
-You can do this multiple times, but once you've started scoring (#4 below),
-it will erase the scores you've already entered. (TODO: preserve scores.)
+You can do this multiple times, using `--overwrite`. (Without that flag,
+it will create a new eval directory `eval-N`.) `--overwrite` preserves
+the Questions and Scores tables, but recomputes the Hashes table, after
+recopying the Files, Chunks and Blobs tables.
+(It doesn't need the Embeddings and Summaries tables.)
 
 ```shell
 $ python3 ./evals/src/evalsetup.py --overwrite

diff --git a/ts/packages/agents/spelunker/evals/design.md b/ts/packages/agents/spelunker/evals/design.md
@@ -122,7 +122,28 @@ we can move on to the next stage, running the evals:
 
 ## Automatic eval runs
 
-TBD
+An eval run needs to do the following:
+
+- Use the eval database as ground truth for files, chunks and blobs.
+- (Not sure yet what to do if the run needs e.g. summaries.)
+- For each question in the Questions table:
+  - Run the full chunk selection process using that question.
+  - This includes the part that keeps the top N selected chunks only.
+  - Compute the F1 score from the precision (p) and recall (r), which are
+    based on the scores in the Scores table: `F1 = 2 * (p*r) / (p+r)`
+  - Print some JSON with the question, the F1 score, and the algorithm
+    (and perhaps a timestamp).
+
+## Tooling needed for automatic eval runs
+
+We need to write a new TypeScript program that reuses much of
+`searchCode.ts`, setting the database to the right file
+(given on the command line),
+and running variants of the selection algorithm
+(another command line flag).
+This should be straightforward.
+We may need small tweaks to the existing algorithms to make the right
+APIs available.
 
 # Random notes
 

diff --git a/ts/packages/agents/spelunker/evals/src/evalsetup.py b/ts/packages/agents/spelunker/evals/src/evalsetup.py
@@ -90,7 +90,7 @@ def main():
     dst_conn = sqlite3.connect(dbname)
     dst_cur = dst_conn.cursor()
 
-    copy_table(src_cur, dst_cur, "Files")
+    copy_table(src_cur, dst_cur, "Files", filename_prefix)
     copy_table(src_cur, dst_cur, "Chunks")
     copy_table(src_cur, dst_cur, "Blobs")
     src_conn.close()
@@ -101,20 +101,28 @@ def main():
     dst_conn.close()
 
 
-def copy_table(src_cur, dst_cur, table_name):
-    # Get CREATE TABLE SQL from the source
+def copy_table(src_cur, dst_cur, table_name, prefix=None):
+    # Get CREATE TABLE SQL from the database's schema
     create_sql = src_cur.execute(
         f"SELECT sql FROM sqlite_master WHERE type='table' AND name='{table_name}'"
     ).fetchone()[0]
     if create_sql.startswith("CREATE TABLE"):
         create_sql = create_sql.replace("CREATE TABLE", "CREATE TABLE IF NOT EXISTS")
     # print(create_sql)
-    print("Creating and clearing table {table_name}")
+    print(f"Creating and clearing table {table_name}")
     dst_cur.execute(create_sql)
     dst_cur.execute(f"DELETE FROM {table_name}")
 
-    # Copy rows
+    # Read rows
     rows = src_cur.execute(f"SELECT * FROM {table_name}").fetchall()
+    if prefix and  table_name.lower() == "files":
+        # Check the filenames start with the prefix
+        for row in rows:
+            filename = row[0]
+            if not filename.startswith(prefix):
+                print(f"Aborting because {filename} does not start with {prefix}")
+                sys.exit(1)
+    # Copy rows
     print(f"Inserting {len(rows)} rows with {len(rows[0])} columns into {table_name}")
     placeholders = ",".join(["?"] * len(rows[0]))
     dst_cur.executemany(f"INSERT INTO {table_name} VALUES ({placeholders})", rows)
@@ -132,7 +140,7 @@ def copy_table(src_cur, dst_cur, table_name):
 CREATE TABLE IF NOT EXISTS Scores (
     questionId INTEGER REFERENCES Questions(id),
     chunkHash TEXT REFERENCES Hashes(chunkHash),
-    score INTEGER,  -- 0 or 1,
+    score INTEGER,  -- 0 or 1
     timestamp TEXT
 );
 """