From 1047d8a80aa04930f5ccc9daa9090e99e518755f Mon Sep 17 00:00:00 2001 From: Umesh Madan Date: Fri, 7 Feb 2025 08:20:53 -0800 Subject: [PATCH 1/8] Bug fix --- ts/packages/knowPro/src/collections.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ts/packages/knowPro/src/collections.ts b/ts/packages/knowPro/src/collections.ts index 9befb76c..fb914a73 100644 --- a/ts/packages/knowPro/src/collections.ts +++ b/ts/packages/knowPro/src/collections.ts @@ -66,7 +66,11 @@ export class MatchAccumulator { public add(value: T, score: number, isExactMatch: boolean) { const existingMatch = this.getMatch(value); if (existingMatch) { - this.updateExisting(existingMatch, score, isExactMatch); + //this.updateExisting(existingMatch, score, isExactMatch); + if (isExactMatch) { + existingMatch.exactHitCount++; + existingMatch.score += score; + } } else { this.setMatch({ value, From e7e12a94d0738da1130d44761e931cdfcc1cfe73 Mon Sep 17 00:00:00 2001 From: Umesh Madan Date: Fri, 7 Feb 2025 09:30:42 -0800 Subject: [PATCH 2/8] Relevance --- ts/packages/knowPro/src/collections.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ts/packages/knowPro/src/collections.ts b/ts/packages/knowPro/src/collections.ts index fb914a73..fe86ed9d 100644 --- a/ts/packages/knowPro/src/collections.ts +++ b/ts/packages/knowPro/src/collections.ts @@ -66,11 +66,13 @@ export class MatchAccumulator { public add(value: T, score: number, isExactMatch: boolean) { const existingMatch = this.getMatch(value); if (existingMatch) { - //this.updateExisting(existingMatch, score, isExactMatch); + this.updateExisting(existingMatch, score, isExactMatch); + /* if (isExactMatch) { existingMatch.exactHitCount++; - existingMatch.score += score; } + existingMatch.score += score; + */ } else { this.setMatch({ value, From 62f8e6ba8bd36416e06609d271dfc86d347b7610 Mon Sep 17 00:00:00 2001 From: Umesh Madan Date: Fri, 7 Feb 2025 12:12:53 -0800 Subject: [PATCH 3/8] Related term similarity --- ts/packages/knowPro/src/collections.ts | 8 +++ ts/packages/knowPro/src/relatedTermsIndex.ts | 63 ++++++++++++++++++++ ts/packages/typeagent/src/vector/vector.ts | 8 +++ 3 files changed, 79 insertions(+) diff --git a/ts/packages/knowPro/src/collections.ts b/ts/packages/knowPro/src/collections.ts index fe86ed9d..b6558916 100644 --- a/ts/packages/knowPro/src/collections.ts +++ b/ts/packages/knowPro/src/collections.ts @@ -381,6 +381,10 @@ export class TextRangeCollection { export class TermSet { constructor(private terms: Map = new Map()) {} + public get size() { + return this.terms.size; + } + public add(term: Term) { const existingTerm = this.terms.get(term.text); if (!existingTerm) { @@ -414,6 +418,10 @@ export class TermSet { public clear(): void { this.terms.clear(); } + + public values() { + return this.terms.values(); + } } export class PropertyTermSet { diff --git a/ts/packages/knowPro/src/relatedTermsIndex.ts b/ts/packages/knowPro/src/relatedTermsIndex.ts index e3e72403..3ac23da2 100644 --- a/ts/packages/knowPro/src/relatedTermsIndex.ts +++ b/ts/packages/knowPro/src/relatedTermsIndex.ts @@ -10,6 +10,7 @@ import { generateTextEmbeddingsWithRetry, collections, dotProduct, + EmbeddedValue, } from "typeagent"; import { Term, @@ -311,3 +312,65 @@ export function createTextEmbeddingIndexSettings( retryPauseMs: 2000, }; } + +export class RelatedTermSet { + private embeddedTerms: Map>; + + constructor() { + this.embeddedTerms = new Map(); + } + + public *getTerms() { + for (const embeddedTerm of this.embeddedTerms.values()) { + yield embeddedTerm.value; + } + } + + public add(term: Term, embedding: NormalizedEmbedding) { + this.embeddedTerms.set(term.text, { value: term, embedding }); + } + + public getSimilar(term: Term, minScore?: number): Term[] { + minScore ??= 0; + const similarTerms: Term[] = []; + const testTerm = this.embeddedTerms.get(term.text); + if (testTerm === undefined) { + return similarTerms; + } + for (const embeddedTerm of this.embeddedTerms.values()) { + const similarity = dotProduct( + testTerm.embedding, + embeddedTerm.embedding, + ); + if ( + similarity >= minScore && + embeddedTerm.value.text !== testTerm.value.text + ) { + similarTerms.push(embeddedTerm.value); + } + } + return similarTerms; + } + + public removeAllSimilar(thresholdScore: number) { + const allKeys = [...this.embeddedTerms.keys()]; + for (const key of allKeys) { + const embeddedTerm = this.embeddedTerms.get(key); + if (embeddedTerm !== undefined) { + const similarTerms = this.getSimilar( + embeddedTerm.value, + thresholdScore, + ); + if (similarTerms.length > 0) { + this.removeTerms(similarTerms); + } + } + } + } + + public removeTerms(terms: Term[] | IterableIterator) { + for (const term of terms) { + this.embeddedTerms.delete(term.text); + } + } +} diff --git a/ts/packages/typeagent/src/vector/vector.ts b/ts/packages/typeagent/src/vector/vector.ts index d565bd9e..25629491 100644 --- a/ts/packages/typeagent/src/vector/vector.ts +++ b/ts/packages/typeagent/src/vector/vector.ts @@ -160,3 +160,11 @@ function divideInPlace(x: Vector, divisor: number): void { x[i] /= divisor; } } + +export function createMatrix(rowCount: number, colCount: number): number[][] { + const matrix: Array = new Array(rowCount); + for (let i = 0; i < rowCount; ++i) { + matrix[i] = new Array(colCount); + } + return matrix; +} From 38a46b2be924fe43cb4b2c2d2bca62fa4eb34d68 Mon Sep 17 00:00:00 2001 From: Umesh Madan Date: Fri, 7 Feb 2025 13:06:47 -0800 Subject: [PATCH 4/8] Refactor --- ts/packages/knowPro/src/dataFormat.ts | 10 +- ts/packages/knowPro/src/relatedTermsIndex.ts | 114 ++++++++++-------- .../typeagent/src/vector/embeddings.ts | 30 ++++- 3 files changed, 104 insertions(+), 50 deletions(-) diff --git a/ts/packages/knowPro/src/dataFormat.ts b/ts/packages/knowPro/src/dataFormat.ts index 221c86e3..fa59d88b 100644 --- a/ts/packages/knowPro/src/dataFormat.ts +++ b/ts/packages/knowPro/src/dataFormat.ts @@ -2,6 +2,7 @@ // Licensed under the MIT License. import { conversation } from "knowledge-processor"; +import { NormalizedEmbedding } from "typeagent"; // an object that can provide a KnowledgeResponse structure export interface IKnowledgeSource { @@ -151,11 +152,16 @@ export interface ITermsToRelatedTermsDataItem { } export interface ITermEmbeddingIndex { - lookupTermsFuzzy( - term: string, + lookup( + text: string, maxMatches?: number, minScore?: number, ): Promise; + lookupEmbeddings( + text: string, + maxMatches?: number, + minScore?: number, + ): Promise<[string, NormalizedEmbedding][] | undefined>; serialize(): ITextEmbeddingIndexData; deserialize(data: ITextEmbeddingIndexData): void; } diff --git a/ts/packages/knowPro/src/relatedTermsIndex.ts b/ts/packages/knowPro/src/relatedTermsIndex.ts index 3ac23da2..c0804eb8 100644 --- a/ts/packages/knowPro/src/relatedTermsIndex.ts +++ b/ts/packages/knowPro/src/relatedTermsIndex.ts @@ -9,8 +9,10 @@ import { SimilarityType, generateTextEmbeddingsWithRetry, collections, - dotProduct, EmbeddedValue, + ScoredItem, + indexesOfAllNearest, + dotProduct, } from "typeagent"; import { Term, @@ -114,7 +116,7 @@ export class TermToRelatedTermsIndex implements ITermToRelatedTermsIndex { termText: string, ): Promise { if (this.termEmbeddingsIndex) { - return await this.termEmbeddingsIndex.lookupTermsFuzzy(termText); + return await this.termEmbeddingsIndex.lookup(termText); } return undefined; } @@ -133,7 +135,7 @@ export class TermToRelatedTermsIndex implements ITermToRelatedTermsIndex { this.termAliases.deserialize(data.relatedTermsData); } if (data.textEmbeddingData) { - this.termEmbeddingsIndex = new TermEmbeddingIndex( + this.termEmbeddingsIndex = new TextEmbeddingIndex( this.settings.embeddingIndexSettings, ); this.termEmbeddingsIndex.deserialize(data.textEmbeddingData); @@ -167,7 +169,7 @@ export async function buildTermEmbeddingIndex( batch: collections.Slice, ) => boolean, ): Promise { - const termIndex = new TermEmbeddingIndex(settings); + const termIndex = new TextEmbeddingIndex(settings); for (const slice of collections.slices(terms, batchSize)) { if (progressCallback && !progressCallback(terms, slice)) { break; @@ -177,16 +179,16 @@ export async function buildTermEmbeddingIndex( return termIndex; } -export class TermEmbeddingIndex implements ITermEmbeddingIndex { - private termText: string[]; - private termEmbeddings: NormalizedEmbedding[]; +export class TextEmbeddingIndex implements ITermEmbeddingIndex { + private textList: string[]; + private textEmbeddings: NormalizedEmbedding[]; constructor( public settings: TextEmbeddingIndexSettings, data?: ITextEmbeddingIndexData, ) { - this.termText = []; - this.termEmbeddings = []; + this.textList = []; + this.textEmbeddings = []; if (data !== undefined) { this.deserialize(data); } @@ -210,38 +212,41 @@ export class TermEmbeddingIndex implements ITermEmbeddingIndex { } } - public async lookupTermsFuzzy( + public async lookup( term: string, maxMatches?: number, minScore?: number, ): Promise { - const termEmbedding = await generateEmbedding( - this.settings.embeddingModel, + let matches = await this.indexesOfNearestTerms( term, + maxMatches, + minScore, ); - maxMatches ??= this.settings.maxMatches; - minScore ??= this.settings.minScore; - if (maxMatches && maxMatches > 0) { - const matches = indexesOfNearest( - this.termEmbeddings, - termEmbedding, - maxMatches, - SimilarityType.Dot, - minScore, - ); - return matches.map((m) => { - return { text: this.termText[m.item], score: m.score }; - }); - } else { - return this.indexesOfNearestTerms(termEmbedding, minScore); - } + return matches.map((m) => { + return { text: this.textList[m.item], score: m.score }; + }); + } + + public async lookupEmbeddings( + text: string, + maxMatches?: number, + minScore?: number, + ): Promise<[string, NormalizedEmbedding][] | undefined> { + let matches = await this.indexesOfNearestTerms( + text, + maxMatches, + minScore, + ); + return matches.map((m) => { + return [this.textList[m.item], this.textEmbeddings[m.item]]; + }); } public remove(term: string): boolean { - const indexOf = this.termText.indexOf(term); + const indexOf = this.textList.indexOf(term); if (indexOf >= 0) { - this.termText.splice(indexOf, 1); - this.termEmbeddings.splice(indexOf, 1); + this.textList.splice(indexOf, 1); + this.textEmbeddings.splice(indexOf, 1); return true; } return false; @@ -260,10 +265,10 @@ export class TermEmbeddingIndex implements ITermEmbeddingIndex { public serialize(): ITextEmbeddingIndexData { const embeddingData: ITextEmbeddingDataItem[] = []; - for (let i = 0; i < this.termText.length; ++i) { + for (let i = 0; i < this.textList.length; ++i) { embeddingData.push({ - text: this.termText[i], - embedding: Array.from(this.termEmbeddings[i]), + text: this.textList[i], + embedding: Array.from(this.textEmbeddings[i]), }); } return { @@ -272,23 +277,38 @@ export class TermEmbeddingIndex implements ITermEmbeddingIndex { } private addTermEmbedding(term: string, embedding: NormalizedEmbedding) { - this.termText.push(term); - this.termEmbeddings.push(embedding); + this.textList.push(term); + this.textEmbeddings.push(embedding); } - private indexesOfNearestTerms( - other: NormalizedEmbedding, + private async indexesOfNearestTerms( + term: string, + maxMatches?: number, minScore?: number, - ): Term[] { - minScore ??= 0; - const matches: Term[] = []; - for (let i = 0; i < this.termEmbeddings.length; ++i) { - const score: number = dotProduct(this.termEmbeddings[i], other); - if (score >= minScore) { - matches.push({ text: this.termText[i], score }); - } + ): Promise { + const termEmbedding = await generateEmbedding( + this.settings.embeddingModel, + term, + ); + maxMatches ??= this.settings.maxMatches; + minScore ??= this.settings.minScore; + let matches: ScoredItem[]; + if (maxMatches && maxMatches > 0) { + matches = indexesOfNearest( + this.textEmbeddings, + termEmbedding, + maxMatches, + SimilarityType.Dot, + minScore, + ); + } else { + matches = indexesOfAllNearest( + this.textEmbeddings, + termEmbedding, + SimilarityType.Dot, + minScore, + ); } - matches.sort((x, y) => y.score! - x.score!); return matches; } } diff --git a/ts/packages/typeagent/src/vector/embeddings.ts b/ts/packages/typeagent/src/vector/embeddings.ts index 717faa74..22fcc496 100644 --- a/ts/packages/typeagent/src/vector/embeddings.ts +++ b/ts/packages/typeagent/src/vector/embeddings.ts @@ -84,7 +84,8 @@ export function indexOfNearest( } /** - * Given a list of embeddings and a test embedding, return the ordinals of the nearest items + * Given a list of embeddings and a test embedding, return at most maxMatches ordinals + * of the nearest items that meet the provided minScore threshold * @param list * @param other * @param maxMatches @@ -122,6 +123,33 @@ export function indexesOfNearest( return matches.byRank(); } +/** + * Given a list of embeddings and a test embedding, return ordinals + * of the nearest items that meet the provided minScore threshold + * @param list + * @param other + * @param similarityType + * @param minScore + * @returns + */ +export function indexesOfAllNearest( + list: Embedding[], + other: Embedding, + similarityType: SimilarityType, + minScore?: number, +): ScoredItem[] { + minScore ??= 0; + const matches: ScoredItem[] = []; + for (let i = 0; i < list.length; ++i) { + const score: number = similarity(list[i], other, similarityType); + if (score >= minScore) { + matches.push({ item: i, score }); + } + } + matches.sort((x, y) => y.score! - x.score!); + return matches; +} + export interface TopNList { push(item: T, score: number): void; byRank(): ScoredItem[]; From 4cfa63141c523b78857e51849ba3166d63d1cc19 Mon Sep 17 00:00:00 2001 From: Umesh Madan Date: Fri, 7 Feb 2025 13:34:52 -0800 Subject: [PATCH 5/8] Refactor --- ts/packages/knowPro/src/dataFormat.ts | 4 +- ts/packages/knowPro/src/relatedTermsIndex.ts | 47 ++++++++++---------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/ts/packages/knowPro/src/dataFormat.ts b/ts/packages/knowPro/src/dataFormat.ts index fa59d88b..076b6575 100644 --- a/ts/packages/knowPro/src/dataFormat.ts +++ b/ts/packages/knowPro/src/dataFormat.ts @@ -132,7 +132,7 @@ export type Term = { export interface ITermToRelatedTermsIndex { lookupTerm(termText: string): Term[] | undefined; - lookupTermFuzzy(termText: string): Promise; + get termEmbeddings(): ITermEmbeddingIndex | undefined; serialize(): ITermsToRelatedTermsIndexData; deserialize(data?: ITermsToRelatedTermsIndexData): void; } @@ -152,7 +152,7 @@ export interface ITermsToRelatedTermsDataItem { } export interface ITermEmbeddingIndex { - lookup( + lookupTerm( text: string, maxMatches?: number, minScore?: number, diff --git a/ts/packages/knowPro/src/relatedTermsIndex.ts b/ts/packages/knowPro/src/relatedTermsIndex.ts index c0804eb8..747a5928 100644 --- a/ts/packages/knowPro/src/relatedTermsIndex.ts +++ b/ts/packages/knowPro/src/relatedTermsIndex.ts @@ -90,8 +90,10 @@ export async function resolveRelatedTerms( // If no hard-coded mappings, lookup any fuzzy related terms // Future: do this in batch if (!searchTerm.relatedTerms || searchTerm.relatedTerms.length === 0) { - searchTerm.relatedTerms = - await relatedTermsIndex.lookupTermFuzzy(termText); + if (relatedTermsIndex.termEmbeddings) { + searchTerm.relatedTerms = + await relatedTermsIndex.termEmbeddings.lookupTerm(termText); + } } } } @@ -108,17 +110,12 @@ export class TermToRelatedTermsIndex implements ITermToRelatedTermsIndex { this.termAliases = new TermToRelatedTermsMap(); } - public lookupTerm(termText: string): Term[] | undefined { - return this.termAliases.lookupTerm(termText); + public get termEmbeddings() { + return this.termEmbeddingsIndex; } - public async lookupTermFuzzy( - termText: string, - ): Promise { - if (this.termEmbeddingsIndex) { - return await this.termEmbeddingsIndex.lookup(termText); - } - return undefined; + public lookupTerm(termText: string): Term[] | undefined { + return this.termAliases.lookupTerm(termText); } public serialize(): ITermsToRelatedTermsIndexData { @@ -212,13 +209,17 @@ export class TextEmbeddingIndex implements ITermEmbeddingIndex { } } - public async lookup( - term: string, + public async lookupTerm( + text: string | NormalizedEmbedding, maxMatches?: number, minScore?: number, ): Promise { - let matches = await this.indexesOfNearestTerms( - term, + const termEmbedding = await generateEmbedding( + this.settings.embeddingModel, + text, + ); + let matches = this.indexesOfNearestTerms( + termEmbedding, maxMatches, minScore, ); @@ -232,8 +233,12 @@ export class TextEmbeddingIndex implements ITermEmbeddingIndex { maxMatches?: number, minScore?: number, ): Promise<[string, NormalizedEmbedding][] | undefined> { - let matches = await this.indexesOfNearestTerms( + const termEmbedding = await generateEmbedding( + this.settings.embeddingModel, text, + ); + let matches = this.indexesOfNearestTerms( + termEmbedding, maxMatches, minScore, ); @@ -281,15 +286,11 @@ export class TextEmbeddingIndex implements ITermEmbeddingIndex { this.textEmbeddings.push(embedding); } - private async indexesOfNearestTerms( - term: string, + private indexesOfNearestTerms( + termEmbedding: NormalizedEmbedding, maxMatches?: number, minScore?: number, - ): Promise { - const termEmbedding = await generateEmbedding( - this.settings.embeddingModel, - term, - ); + ): ScoredItem[] { maxMatches ??= this.settings.maxMatches; minScore ??= this.settings.minScore; let matches: ScoredItem[]; From 2f0ec8b97de0842c2ea2b68c8e84dca0712fd431 Mon Sep 17 00:00:00 2001 From: Umesh Madan Date: Sun, 9 Feb 2025 15:19:19 -0800 Subject: [PATCH 6/8] Scoring --- ts/packages/knowPro/src/collections.ts | 118 +++++++++++++------ ts/packages/knowPro/src/dataFormat.ts | 11 +- ts/packages/knowPro/src/query.ts | 16 +-- ts/packages/knowPro/src/relatedTermsIndex.ts | 78 +++++++++++- ts/packages/knowPro/src/search.ts | 20 +++- 5 files changed, 188 insertions(+), 55 deletions(-) diff --git a/ts/packages/knowPro/src/collections.ts b/ts/packages/knowPro/src/collections.ts index b6558916..2b7b5ed1 100644 --- a/ts/packages/knowPro/src/collections.ts +++ b/ts/packages/knowPro/src/collections.ts @@ -17,7 +17,9 @@ import { isInTextRange } from "./query.js"; export interface Match { value: T; score: number; - exactHitCount: number; + hitCount: number; + relatedScore: number; + relatedHitCount: number; } /** @@ -66,19 +68,45 @@ export class MatchAccumulator { public add(value: T, score: number, isExactMatch: boolean) { const existingMatch = this.getMatch(value); if (existingMatch) { - this.updateExisting(existingMatch, score, isExactMatch); - /* + //this.updateExisting(existingMatch, score, isExactMatch); if (isExactMatch) { - existingMatch.exactHitCount++; + existingMatch.hitCount++; + existingMatch.score += score; + } else { + existingMatch.relatedHitCount++; + existingMatch.relatedScore += score; } - existingMatch.score += score; - */ } else { - this.setMatch({ - value, - exactHitCount: isExactMatch ? 1 : 0, - score, - }); + if (isExactMatch) { + this.setMatch({ + value, + hitCount: 1, + score, + relatedHitCount: 0, + relatedScore: 0, + }); + } else { + this.setMatch({ + value, + hitCount: 0, + score: 0, + relatedHitCount: 1, + relatedScore: score, + }); + } + } + } + + public scaleScores(scoreScaler?: (match: Match) => void) { + scoreScaler ??= (m) => { + if (m.relatedHitCount > 0) { + const avgScore = m.relatedScore / m.relatedHitCount; + const normalizedScore = Math.log(1 + avgScore); + m.score += normalizedScore; + } + }; + for (const match of this.getMatches()) { + scoreScaler(match); } } @@ -88,10 +116,10 @@ export class MatchAccumulator { isExactMatch: boolean, ): void { if (isExactMatch) { - existingMatch.exactHitCount++; + existingMatch.hitCount++; existingMatch.score += newScore; - } else if (existingMatch.score < newScore) { - existingMatch.score = newScore; + } else if (existingMatch.relatedScore < newScore) { + existingMatch.relatedScore = newScore; } } @@ -155,7 +183,7 @@ export class MatchAccumulator { minHitCount: number | undefined, ): IterableIterator> { return minHitCount !== undefined && minHitCount > 0 - ? this.getMatches((m) => m.exactHitCount >= minHitCount) + ? this.getMatches((m) => m.hitCount >= minHitCount) : this.matches.values(); } } @@ -174,14 +202,14 @@ export class SemanticRefAccumulator extends MatchAccumulator { | IterableIterator | undefined, isExactMatch: boolean, - scoreBoost?: number, + weight?: number, ) { if (scoredRefs) { - scoreBoost ??= searchTerm.score ?? 0; + weight ??= searchTerm.weight ?? 1; for (const scoredRef of scoredRefs) { this.add( scoredRef.semanticRefIndex, - scoredRef.score + scoreBoost, + scoredRef.score * weight, isExactMatch, ); } @@ -196,16 +224,16 @@ export class SemanticRefAccumulator extends MatchAccumulator { | IterableIterator | undefined, isExactMatch: boolean, - scoreBoost?: number, + weight?: number, ) { if (scoredRefs) { - scoreBoost ??= searchTerm.score ?? 0; + weight ??= searchTerm.weight ?? 1; for (const scoredRef of scoredRefs) { const existingMatch = this.getMatch(scoredRef.semanticRefIndex); if (existingMatch) { this.updateExisting( existingMatch, - scoredRef.score + scoreBoost, + scoredRef.score * weight, isExactMatch, ); } else { @@ -379,29 +407,46 @@ export class TextRangeCollection { } export class TermSet { - constructor(private terms: Map = new Map()) {} + private terms: Map = new Map(); + constructor(terms?: Term[]) { + if (terms) { + this.addOrUnion(terms); + } + } public get size() { return this.terms.size; } - public add(term: Term) { + public add(term: Term): boolean { const existingTerm = this.terms.get(term.text); - if (!existingTerm) { - this.terms.set(term.text, term); + if (existingTerm) { + return false; } + this.terms.set(term.text, term); + return true; } - public addOrUnion(term: Term) { - const existingTerm = this.terms.get(term.text); - if (existingTerm) { - const existingScore = existingTerm.score ?? 0; - const newScore = term.score ?? 0; - if (existingScore < newScore) { - existingTerm.score = newScore; + public addOrUnion(terms: Term | Term[] | undefined) { + if (terms === undefined) { + return; + } + if (Array.isArray(terms)) { + for (const term of terms) { + this.addOrUnion(term); } } else { - this.terms.set(term.text, term); + const term = terms; + const existingTerm = this.terms.get(term.text); + if (existingTerm) { + const existingScore = existingTerm.weight ?? 0; + const newScore = term.weight ?? 0; + if (existingScore < newScore) { + existingTerm.weight = newScore; + } + } else { + this.terms.set(term.text, term); + } } } @@ -411,10 +456,17 @@ export class TermSet { : this.terms.get(term.text); } + public getWeight(term: Term): number | undefined { + return this.terms.get(term.text)?.weight; + } + public has(term: Term): boolean { return this.terms.has(term.text); } + public remove(term: Term) { + this.terms.delete(term.text); + } public clear(): void { this.terms.clear(); } diff --git a/ts/packages/knowPro/src/dataFormat.ts b/ts/packages/knowPro/src/dataFormat.ts index 076b6575..d275b727 100644 --- a/ts/packages/knowPro/src/dataFormat.ts +++ b/ts/packages/knowPro/src/dataFormat.ts @@ -125,9 +125,9 @@ export interface IConversationData { export type Term = { text: string; /** - * Optional additional score to use when this term matches + * Optional weighting for these matches */ - score?: number | undefined; + weight?: number | undefined; }; export interface ITermToRelatedTermsIndex { @@ -156,7 +156,12 @@ export interface ITermEmbeddingIndex { text: string, maxMatches?: number, minScore?: number, - ): Promise; + ): Promise; + lookupTerms( + texts: string[], + maxMatches?: number, + minScore?: number, + ): Promise; lookupEmbeddings( text: string, maxMatches?: number, diff --git a/ts/packages/knowPro/src/query.ts b/ts/packages/knowPro/src/query.ts index 61f6a22f..bb49612b 100644 --- a/ts/packages/knowPro/src/query.ts +++ b/ts/packages/knowPro/src/query.ts @@ -305,6 +305,7 @@ export class MatchAllTermsExpr extends QueryOpExpr { for (const matchExpr of this.searchTermExpressions) { matchExpr.accumulateMatches(context, allMatches); } + allMatches.scaleScores(); return allMatches; } } @@ -382,19 +383,12 @@ export class MatchSearchTermExpr extends MatchTermExpr { } } else { const semanticRefs = this.lookupTerm(context, relatedTerm); - if (context.matchedTerms.has(relatedTerm)) { - matches.updateTermMatches( - term, - semanticRefs, - false, - relatedTerm.score, - ); - } else { + if (!context.matchedTerms.has(relatedTerm)) { matches.addTermMatches( term, semanticRefs, false, - relatedTerm.score, + relatedTerm.weight, ); context.matchedTerms.add(relatedTerm); } @@ -524,14 +518,14 @@ export class MatchPropertyTermExpr extends MatchTermExpr { propVal, semanticRefs, false, - relatedPropVal.score, + relatedPropVal.weight, ); } else { matches.addTermMatches( propVal, semanticRefs, false, - relatedPropVal.score, + relatedPropVal.weight, ); context.matchedPropertyTerms.add(propName, relatedPropVal); } diff --git a/ts/packages/knowPro/src/relatedTermsIndex.ts b/ts/packages/knowPro/src/relatedTermsIndex.ts index 747a5928..47a94195 100644 --- a/ts/packages/knowPro/src/relatedTermsIndex.ts +++ b/ts/packages/knowPro/src/relatedTermsIndex.ts @@ -13,6 +13,7 @@ import { ScoredItem, indexesOfAllNearest, dotProduct, + generateTextEmbeddings, } from "typeagent"; import { Term, @@ -27,6 +28,7 @@ import { import { createEmbeddingCache } from "knowledge-processor"; import { SearchTerm } from "./search.js"; import { isSearchTermWildcard } from "./query.js"; +import { TermSet } from "./collections.js"; export class TermToRelatedTermsMap { public map: collections.MultiMap = new collections.MultiMap(); @@ -78,10 +80,13 @@ export async function resolveRelatedTerms( relatedTermsIndex: ITermToRelatedTermsIndex, searchTerms: SearchTerm[], ): Promise { + const searchableTerms = new TermSet(); + const searchTermsNeedingRelated: SearchTerm[] = []; for (const searchTerm of searchTerms) { if (isSearchTermWildcard(searchTerm)) { continue; } + searchableTerms.addOrUnion(searchTerm.term); const termText = searchTerm.term.text; // Resolve any specific term to related term mappings if (!searchTerm.relatedTerms || searchTerm.relatedTerms.length === 0) { @@ -90,10 +95,57 @@ export async function resolveRelatedTerms( // If no hard-coded mappings, lookup any fuzzy related terms // Future: do this in batch if (!searchTerm.relatedTerms || searchTerm.relatedTerms.length === 0) { - if (relatedTermsIndex.termEmbeddings) { - searchTerm.relatedTerms = - await relatedTermsIndex.termEmbeddings.lookupTerm(termText); + searchTermsNeedingRelated.push(searchTerm); + } + } + if ( + relatedTermsIndex.termEmbeddings && + searchTermsNeedingRelated.length > 0 + ) { + const relatedTermsForSearchTerms = + await relatedTermsIndex.termEmbeddings.lookupTerms( + searchTermsNeedingRelated.map((st) => st.term.text), + ); + for (let i = 0; i < searchTermsNeedingRelated.length; ++i) { + searchTermsNeedingRelated[i].relatedTerms = + relatedTermsForSearchTerms[i]; + } + // + // We need to prevent duplicate scoring. + // - The same related term can show up for different search terms but with different weights + // - related terms may also already be present as search terms + // + dedupeRelatedTerms(searchTerms); + } +} + +function dedupeRelatedTerms(searchTerms: SearchTerm[]) { + const allSearchTerms = new TermSet(); + const relatedWithMaxWeight = new TermSet(); + searchTerms.forEach((st) => { + allSearchTerms.add(st.term); + relatedWithMaxWeight.addOrUnion(st.relatedTerms); + }); + for (const searchTerm of searchTerms) { + if (searchTerm.relatedTerms && searchTerm.relatedTerms.length > 0) { + let nonDuplicateTerms: Term[] = []; + for (const candidateRelatedTerm of searchTerm.relatedTerms) { + if (allSearchTerms.has(candidateRelatedTerm)) { + // This related term is already a search term + continue; + } + // Related term is new. Only use it if it provides max weightf + const termWithMaxWeight = + relatedWithMaxWeight.get(candidateRelatedTerm); + if ( + termWithMaxWeight !== undefined && + termWithMaxWeight.weight === candidateRelatedTerm.weight + ) { + nonDuplicateTerms.push(termWithMaxWeight); + relatedWithMaxWeight.remove(candidateRelatedTerm); + } } + searchTerm.relatedTerms = nonDuplicateTerms; } } } @@ -213,7 +265,7 @@ export class TextEmbeddingIndex implements ITermEmbeddingIndex { text: string | NormalizedEmbedding, maxMatches?: number, minScore?: number, - ): Promise { + ): Promise { const termEmbedding = await generateEmbedding( this.settings.embeddingModel, text, @@ -228,6 +280,24 @@ export class TextEmbeddingIndex implements ITermEmbeddingIndex { }); } + public async lookupTerms( + texts: string[], + maxMatches?: number, + minScore?: number, + ): Promise { + const termEmbeddings = await generateTextEmbeddings( + this.settings.embeddingModel, + texts, + ); + const results = []; + for (const embedding of termEmbeddings) { + results.push( + await this.lookupTerm(embedding, maxMatches, minScore), + ); + } + return results; + } + public async lookupEmbeddings( text: string, maxMatches?: number, diff --git a/ts/packages/knowPro/src/search.ts b/ts/packages/knowPro/src/search.ts index cbe4bd39..7036ecfd 100644 --- a/ts/packages/knowPro/src/search.ts +++ b/ts/packages/knowPro/src/search.ts @@ -28,7 +28,7 @@ export function createSearchTerm(text: string, score?: number): SearchTerm { return { term: { text, - score, + weight: score, }, }; } @@ -88,7 +88,11 @@ class SearchQueryBuilder { // We will them expand these search terms by also including related terms private allSearchTerms: SearchTerm[] = []; - constructor(public conversation: IConversation) {} + constructor( + public conversation: IConversation, + public defaultMatchWeight: number = 100, + public relatedIsExactThreshold: number = 0.95, + ) {} public async compile( terms: SearchTerm[], @@ -207,9 +211,17 @@ class SearchQueryBuilder { private prepareSearchTerms(searchTerms: SearchTerm[]): void { for (const searchTerm of searchTerms) { this.prepareTerm(searchTerm.term); - searchTerm.term.score ??= searchTerms.length * 10; + searchTerm.term.weight ??= this.defaultMatchWeight; if (searchTerm.relatedTerms) { - searchTerm.relatedTerms.forEach((st) => this.prepareTerm(st)); + searchTerm.relatedTerms.forEach((st) => { + if ( + st.weight && + st.weight >= this.relatedIsExactThreshold + ) { + st.weight = this.defaultMatchWeight; + } + this.prepareTerm(st); + }); } } } From a19765b5c5761c70e1326e1f15dea9a754892b45 Mon Sep 17 00:00:00 2001 From: Umesh Madan Date: Sun, 9 Feb 2025 15:20:28 -0800 Subject: [PATCH 7/8] Scoring --- ts/packages/knowPro/src/query.ts | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/ts/packages/knowPro/src/query.ts b/ts/packages/knowPro/src/query.ts index bb49612b..c8973b4b 100644 --- a/ts/packages/knowPro/src/query.ts +++ b/ts/packages/knowPro/src/query.ts @@ -513,14 +513,7 @@ export class MatchPropertyTermExpr extends MatchTermExpr { propName, relatedPropVal.text, ); - if (context.matchedPropertyTerms.has(propName, relatedPropVal)) { - matches.updateTermMatches( - propVal, - semanticRefs, - false, - relatedPropVal.weight, - ); - } else { + if (!context.matchedPropertyTerms.has(propName, relatedPropVal)) { matches.addTermMatches( propVal, semanticRefs, From 21a379e87814a24228316afb81b0233d986aeb36 Mon Sep 17 00:00:00 2001 From: Umesh Madan Date: Mon, 10 Feb 2025 11:26:04 -0800 Subject: [PATCH 8/8] Relative matching --- ts/packages/knowPro/src/collections.ts | 3 ++- ts/packages/knowPro/src/query.ts | 10 +++----- ts/packages/knowPro/src/relatedTermsIndex.ts | 27 ++++++++++++-------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/ts/packages/knowPro/src/collections.ts b/ts/packages/knowPro/src/collections.ts index 2b7b5ed1..c99d351e 100644 --- a/ts/packages/knowPro/src/collections.ts +++ b/ts/packages/knowPro/src/collections.ts @@ -97,12 +97,13 @@ export class MatchAccumulator { } } - public scaleScores(scoreScaler?: (match: Match) => void) { + public calculateTotalScore(scoreScaler?: (match: Match) => void) { scoreScaler ??= (m) => { if (m.relatedHitCount > 0) { const avgScore = m.relatedScore / m.relatedHitCount; const normalizedScore = Math.log(1 + avgScore); m.score += normalizedScore; + //m.score += m.relatedScore; } }; for (const match of this.getMatches()) { diff --git a/ts/packages/knowPro/src/query.ts b/ts/packages/knowPro/src/query.ts index c8973b4b..586e892d 100644 --- a/ts/packages/knowPro/src/query.ts +++ b/ts/packages/knowPro/src/query.ts @@ -305,7 +305,7 @@ export class MatchAllTermsExpr extends QueryOpExpr { for (const matchExpr of this.searchTermExpressions) { matchExpr.accumulateMatches(context, allMatches); } - allMatches.scaleScores(); + allMatches.calculateTotalScore(); return allMatches; } } @@ -375,9 +375,7 @@ export class MatchSearchTermExpr extends MatchTermExpr { ) { if (relatedTerm === undefined) { const semanticRefs = this.lookupTerm(context, term); - if (context.matchedTerms.has(term)) { - matches.updateTermMatches(term, semanticRefs, true); - } else { + if (!context.matchedTerms.has(term)) { matches.addTermMatches(term, semanticRefs, true); context.matchedTerms.add(term); } @@ -502,9 +500,7 @@ export class MatchPropertyTermExpr extends MatchTermExpr { propName, propVal.text, ); - if (context.matchedPropertyTerms.has(propName, propVal)) { - matches.updateTermMatches(propVal, semanticRefs, true); - } else { + if (!context.matchedPropertyTerms.has(propName, propVal)) { matches.addTermMatches(propVal, semanticRefs, true); context.matchedPropertyTerms.add(propName, propVal); } diff --git a/ts/packages/knowPro/src/relatedTermsIndex.ts b/ts/packages/knowPro/src/relatedTermsIndex.ts index 47a94195..b9f2b7e2 100644 --- a/ts/packages/knowPro/src/relatedTermsIndex.ts +++ b/ts/packages/knowPro/src/relatedTermsIndex.ts @@ -111,7 +111,8 @@ export async function resolveRelatedTerms( relatedTermsForSearchTerms[i]; } // - // We need to prevent duplicate scoring. + // Due to fuzzy matching, a search term may end with related terms that overlap with those of other search terms. + // This causes scoring problems... duplicate/redundant scoring that can cause items to seem more relevant than they are // - The same related term can show up for different search terms but with different weights // - related terms may also already be present as search terms // @@ -121,31 +122,37 @@ export async function resolveRelatedTerms( function dedupeRelatedTerms(searchTerms: SearchTerm[]) { const allSearchTerms = new TermSet(); - const relatedWithMaxWeight = new TermSet(); + const allRelatedTerms = new TermSet(); + // + // Collect all unique search and related terms. + // We end up with {term, maximum weight for term} pairs + // searchTerms.forEach((st) => { allSearchTerms.add(st.term); - relatedWithMaxWeight.addOrUnion(st.relatedTerms); + allRelatedTerms.addOrUnion(st.relatedTerms); }); for (const searchTerm of searchTerms) { if (searchTerm.relatedTerms && searchTerm.relatedTerms.length > 0) { - let nonDuplicateTerms: Term[] = []; + let uniqueRelatedForSearchTerm: Term[] = []; for (const candidateRelatedTerm of searchTerm.relatedTerms) { if (allSearchTerms.has(candidateRelatedTerm)) { // This related term is already a search term continue; } - // Related term is new. Only use it if it provides max weightf + // Each unique related term should be searched for + // only once, and (if there were duplicates) assigned the maximum weight assigned to that term const termWithMaxWeight = - relatedWithMaxWeight.get(candidateRelatedTerm); + allRelatedTerms.get(candidateRelatedTerm); if ( termWithMaxWeight !== undefined && termWithMaxWeight.weight === candidateRelatedTerm.weight ) { - nonDuplicateTerms.push(termWithMaxWeight); - relatedWithMaxWeight.remove(candidateRelatedTerm); + // Associate this related term with the current search term + uniqueRelatedForSearchTerm.push(termWithMaxWeight); + allRelatedTerms.remove(candidateRelatedTerm); } } - searchTerm.relatedTerms = nonDuplicateTerms; + searchTerm.relatedTerms = uniqueRelatedForSearchTerm; } } } @@ -276,7 +283,7 @@ export class TextEmbeddingIndex implements ITermEmbeddingIndex { minScore, ); return matches.map((m) => { - return { text: this.textList[m.item], score: m.score }; + return { text: this.textList[m.item], weight: m.score }; }); }