Commit

increase test coverage, isolate empty grammar

breandan committed Jan 8, 2024
1 parent 0170582 commit d689c69
Showing 7 changed files with 2,813 additions and 917 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -9,7 +9,7 @@ on:
jobs:
build:

runs-on: ubuntu-latest
runs-on: macos-latest

steps:
- uses: actions/checkout@v2
2 changes: 1 addition & 1 deletion build.gradle.kts
@@ -261,7 +261,7 @@ tasks {

withType<Test> {
minHeapSize = "1g"
maxHeapSize = "6g"
maxHeapSize = "14g"
useJUnitPlatform()
testLogging {
events = setOf(
@@ -22,49 +22,18 @@ fun CFG.barHillelRepair(prompt: List<Σᐩ>, distance: Int) =

private infix fun CFG.intersectLevFSAP(fsa: FSA): CFG {
var clock = TimeSource.Monotonic.markNow()
fun Π3A<STC>.isCompatibleWith(nts: Triple<Σᐩ, Σᐩ, Σᐩ>): Boolean {
fun Pair<Int, Int>.dominates(other: Pair<Int, Int>) =
first <= other.first && second <= other.second

fun manhattanDistance(first: Pair<Int, Int>, second: Pair<Int, Int>): Int =
second.second - first.second + second.first - first.first

// Range of the shortest path to the longest path, i.e., Manhattan distance
fun SPLP(a: STC, b: STC) =
(fsa.APSP[a.π1 to b.π1] ?: Int.MAX_VALUE)..
manhattanDistance(a.coords(), b.coords())

fun IntRange.overlaps(other: IntRange) =
(other.first in first..last) || (other.last in first..last)

fun lengthBounds(nt: Σᐩ): IntRange =
(lengthBounds[nt] ?: -1..-1)
// Okay if we overapproximate the length bounds a bit
.let { (it.first - 1)..(it.last + 1) }

// "[$p,$A,$r] -> [$p,$B,$q] [$q,$C,$r]"
fun isCompatible() =
first.coords().dominates(second.coords())
&& second.coords().dominates(third.coords())
&& lengthBounds(nts.first).overlaps(SPLP(first, third))
&& lengthBounds(nts.second).overlaps(SPLP(first, second))
&& lengthBounds(nts.third).overlaps(SPLP(second, third))

return isCompatible()
}

val nts = mutableSetOf("START")
fun Σᐩ.isSyntheticNT() =
first() == '[' && last() == ']' && count { it == ',' } == 2
first() == '[' && last() == ']' && count { it == '~' } == 2
fun List<Π2<Σᐩ, List<Σᐩ>>>.filterRHSInNTS() =
asSequence().filter { (_, rhs) -> rhs.all { !it.isSyntheticNT() || it in nts } }

val initFinal =
(fsa.init * fsa.final).map { (q, r) -> "START" to listOf("[$q,START,$r]") }
(fsa.init * fsa.final).map { (q, r) -> "START" to listOf("[$q~START~$r]") }
.filterRHSInNTS()

val transits =
fsa.Q.map { (q, a, r) -> "[$q,$a,$r]".also { nts.add(it) } to listOf(a) }
fsa.Q.map { (q, a, r) -> "[$q~$a~$r]".also { nts.add(it) } to listOf(a) }
.filterRHSInNTS()

// For every production A → σ in P, for every (p, σ, q) ∈ Q × Σ × Q
@@ -82,10 +51,10 @@ private infix fun CFG.intersectLevFSAP(fsa: FSA): CFG {
triples
// CFG ∩ FSA - in general we are not allowed to do this, but it works
// because we assume a Levenshtein FSA, which is monotone and acyclic.
.filter { it.isCompatibleWith(A to B to C) }
.filter { it.isCompatibleWith(A to B to C, this@intersectLevFSAP, fsa) }
.map { (a, b, c) ->
val (p, q, r) = a.π1 to b.π1 to c.π1
"[$p,$A,$r]".also { nts.add(it) } to listOf("[$p,$B,$q]", "[$q,$C,$r]")
"[$p~$A~$r]".also { nts.add(it) } to listOf("[$p~$B~$q]", "[$q~$C~$r]")
}.toList()
}.flatten().filterRHSInNTS()

@@ -98,7 +67,7 @@ private infix fun CFG.intersectLevFSAP(fsa: FSA): CFG {
fun CFG.unitProdRules(fsa: FSA) =
unitProductions.map { (A, rhs) ->
val relevantTransits = fsa.Q.filter { it.π2 == rhs[0] }
relevantTransits.map { (p, σ, q) -> "[$p,$A,$q]" to listOf(σ) }
relevantTransits.map { (p, σ, q) -> "[$p~$A~$q]" to listOf(σ) }
}.flatten()
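
Aside: unitProdRules above implements the rule described in the earlier comment, pairing every unit production A → σ with every FSA arc (p, σ, q) to yield [p~A~q] → σ. A minimal, self-contained Kotlin sketch of that step; the Transition and Production names below are stand-ins, not the repository's API.

data class Transition(val from: String, val label: String, val to: String)
typealias Production = Pair<String, List<String>>

// For every unit production A -> σ and every arc (p, σ, q), emit [p~A~q] -> σ.
fun liftUnitProductions(unitProds: List<Production>, arcs: List<Transition>): List<Production> =
  unitProds.flatMap { (A, rhs) ->
    arcs.filter { it.label == rhs[0] }
      .map { (p, _, q) -> "[$p~$A~$q]" to rhs }
  }

fun main() {
  val unitProds = listOf("A" to listOf("a"), "B" to listOf("b"))
  val arcs = listOf(Transition("q0", "a", "q1"), Transition("q1", "b", "q2"))
  println(liftUnitProductions(unitProds, arcs))
  // prints [([q0~A~q1], [a]), ([q1~B~q2], [b])]
}
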

fun CFG.postProcess() =
@@ -128,12 +97,13 @@ fun CFG.postProcess() =
// Disabling nonterminal stubs!
// ∩-grammar has 56 useful productions <- Why can't we just create this CFG?!
fun CFG.dropVestigialProductions(
criteria: (Σᐩ) -> Boolean = { it.first() == '[' && it.last() == ']' && it.count { it == ',' } == 2 }
criteria: (Σᐩ) -> Boolean = { it.first() == '[' && it.last() == ']' && it.count { it == '~' } == 2 }
): CFG {
val nts: Set<Σᐩ> = map { it.LHS }.toSet()
// val reachable = reachableSymbols()
val rw = toMutableSet()
.apply { removeAll { prod -> prod.RHS.any { criteria(it) && it !in nts } } }
.also { println("Removed ${size - it.size} invalid productions") }
.freeze().removeUselessSymbols()

println("Removed ${size - rw.size} vestigial productions, resulting in ${rw.size} productions.")
@@ -147,10 +117,10 @@ infix fun FSA.intersect(cfg: CFG) = cfg.freeze().intersect(this)
infix fun CFG.intersect(fsa: FSA): CFG {
val clock = TimeSource.Monotonic.markNow()
val initFinal =
(fsa.init * fsa.final).map { (q, r) -> "START" to listOf("[$q,START,$r]") }
(fsa.init * fsa.final).map { (q, r) -> "START" to listOf("[$q~START~$r]") }

val transits =
fsa.Q.map { (q, a, r) -> "[$q,$a,$r]" to listOf(a) }
fsa.Q.map { (q, a, r) -> "[$q~$a~$r]" to listOf(a) }

// For every production A → σ in P, for every (p, σ, q) ∈ Q × Σ × Q
// such that δ(p, σ) = q we have the production [p, A, q] → σ in P′.
@@ -163,7 +133,7 @@ infix fun CFG.intersect(fsa: FSA): CFG {
val triples = fsa.states * fsa.states * fsa.states
val (A, B, C) = it.π1 to it.π2[0] to it.π2[1]
triples.map { (p, q, r) ->
"[$p,$A,$r]" to listOf("[$p,$B,$q]", "[$q,$C,$r]")
"[$p~$A~$r]" to listOf("[$p~$B~$q]", "[$q~$C~$r]")
}
}.flatten()
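
Aside: the map over state triples above is the textbook binary Bar-Hillel rule: for each production A → B C and all states p, q, r, it adds [p~A~r] → [p~B~q] [q~C~r]. A toy, self-contained version, with stand-in types rather than the repository's CFG/FSA API:

fun binaryIntersection(
  binaryProds: List<Triple<String, String, String>>, // (A, B, C) encodes A -> B C
  states: List<String>
): List<Pair<String, List<String>>> =
  binaryProds.flatMap { (A, B, C) ->
    states.flatMap { p ->
      states.flatMap { q ->
        states.map { r -> "[$p~$A~$r]" to listOf("[$p~$B~$q]", "[$q~$C~$r]") }
      }
    }
  }

fun main() {
  println(binaryIntersection(listOf(Triple("S", "A", "B")), listOf("0", "1")).size) // 1 * 2^3 = 8
}

The cubic blow-up in the number of states is exactly what the Levenshtein-specific isCompatibleWith filter in intersectLevFSAP prunes.
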

@@ -187,6 +157,37 @@ val CFG.lengthBounds: Map<Σᐩ, IntRange> by cache {
}
}

println("Computed NT length bounds in ${clock.elapsedNow()}")
// println("Computed NT length bounds in ${clock.elapsedNow()}")
map
}
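
Aside: lengthBounds maps each nonterminal to the range of string lengths it can derive, which the intersection compares against Levenshtein path lengths. The minimum side of such a bound can be computed by a simple fixpoint; the sketch below is illustrative only and may differ from how the cached lengthBounds is actually derived.

fun minLengths(prods: List<Pair<String, List<String>>>, terminals: Set<String>): Map<String, Int> {
  val best = mutableMapOf<String, Int>()
  var changed = true
  while (changed) {
    changed = false
    for ((lhs, rhs) in prods) {
      // Sum the best-known lengths of the RHS symbols; skip if any is still unknown.
      val parts = rhs.map { if (it in terminals) 1 else best[it] ?: -1 }
      if (parts.any { it == -1 }) continue
      val total = parts.sum()
      if (total < (best[lhs] ?: Int.MAX_VALUE)) { best[lhs] = total; changed = true }
    }
  }
  return best
}

fun main() {
  val g = listOf("S" to listOf("A", "B"), "A" to listOf("a"), "B" to listOf("A", "A"), "B" to listOf("b"))
  println(minLengths(g, setOf("a", "b"))) // {A=1, B=1, S=2}
}
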

fun CFG.lengthBounds(nt: Σᐩ, fudge: Int = 3): IntRange =
(lengthBounds[nt] ?: -1..-1)
// Okay if we overapproximate the length bounds a bit
.let { (it.first - fudge)..(it.last + fudge) }

fun Π3A<STC>.isCompatibleWith(nts: Triple<Σᐩ, Σᐩ, Σᐩ>, cfg: CFG, fsa: FSA): Boolean {
fun Pair<Int, Int>.dominates(other: Pair<Int, Int>) =
first <= other.first && second <= other.second

fun manhattanDistance(first: Pair<Int, Int>, second: Pair<Int, Int>): Int =
second.second - first.second + second.first - first.first

// Range of the shortest path to the longest path, i.e., Manhattan distance
fun SPLP(a: STC, b: STC) =
(fsa.APSP[a.π1 to b.π1] ?: Int.MAX_VALUE)..
manhattanDistance(a.coords(), b.coords())

fun IntRange.overlaps(other: IntRange) =
(other.first in first..last) || (other.last in first..last)

// "[$p,$A,$r] -> [$p,$B,$q] [$q,$C,$r]"
fun isCompatible() =
first.coords().dominates(second.coords())
&& second.coords().dominates(third.coords())
&& cfg.lengthBounds(nts.first).overlaps(SPLP(first, third))
&& cfg.lengthBounds(nts.second).overlaps(SPLP(first, second))
&& cfg.lengthBounds(nts.third).overlaps(SPLP(second, third))

return isCompatible()
}
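
As a worked example of the three pruning predicates above, using plain (position, edits) pairs in place of the repository's STC states; the concrete numbers are made up for illustration:

fun Pair<Int, Int>.dominates(other: Pair<Int, Int>) = first <= other.first && second <= other.second
fun manhattan(a: Pair<Int, Int>, b: Pair<Int, Int>) = (b.first - a.first) + (b.second - a.second)
fun IntRange.overlaps(other: IntRange) = other.first in this || other.last in this

fun main() {
  val p = 0 to 0; val q = 2 to 1; val r = 4 to 2
  // In a monotone Levenshtein FSA, coordinates only grow along a path.
  println(p.dominates(q) && q.dominates(r)) // true
  // SPLP: assume the all-pairs shortest path from p to r is 4; the longest is the Manhattan distance 6.
  val splp = 4..manhattan(p, r)             // 4..6
  // A nonterminal deriving strings of length 5..8 can still span p..r, so the triple is kept...
  println((5..8).overlaps(splp))            // true
  // ...but one deriving only lengths 1..3 cannot, so that triple is pruned.
  println((1..3).overlaps(splp))            // false
}
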
@@ -168,6 +168,12 @@ fun CFG.removeUselessSymbols(
generating: Set<Σᐩ> = genSym(),
reachable: Set<Σᐩ> = reachSym()
): CFG =
// toMutableSet()
// .apply { removeAll { (s, _) -> s !in generating } }
// .also { println("Removed ${size - it.size} nongenerating prods") }
// .apply { removeAll { (s, _) -> s !in reachable } }
// .also { println("Removed ${size - it.size} unreachable prods") }
// .toSet()
toMutableSet().apply {
removeAll { (s, _) -> s !in generating || s !in reachable }
}
@@ -207,6 +213,8 @@ fun CFG.reachSym(from: Σᐩ = START_SYMBOL): Set<Σᐩ> {
.filter { it !in allReachable && it !in nextReachable }
} while (nextReachable.isNotEmpty())

// println("TERM: ${allReachable.any { it in terminals }} ${allReachable.size}")

return allReachable
}

@@ -226,6 +234,8 @@ fun CFG.genSym(from: Set<Σᐩ> = terminalUnitProductions.map { it.LHS }.toSet()
.filter { it !in allGenerating && it !in nextGenerating }
} while (nextGenerating.isNotEmpty())

// println("START: ${START_SYMBOL in allGenerating} ${allGenerating.size}")

return allGenerating
}
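
Aside: reachSym and genSym above are the two fixpoints behind removeUselessSymbols: a symbol is kept only if it is reachable from START and can generate a terminal string. A compact stand-alone sketch over a toy production list; types and names are stand-ins, not the repository's.

typealias P = Pair<String, List<String>>

fun generating(prods: List<P>, terminals: Set<String>): Set<String> {
  val gen = mutableSetOf<String>()
  do {
    val next = prods.filter { (lhs, rhs) -> lhs !in gen && rhs.all { it in terminals || it in gen } }
      .map { it.first }
    gen += next
  } while (next.isNotEmpty())
  return gen
}

fun reachable(prods: List<P>, start: String): Set<String> {
  val seen = mutableSetOf(start)
  do {
    val next = prods.filter { it.first in seen }.flatMap { it.second }.filter { it !in seen }
    seen += next
  } while (next.isNotEmpty())
  return seen
}

fun main() {
  val terminals = setOf("a", "b")
  val g = listOf(
    "START" to listOf("A"),
    "A" to listOf("a"),
    "B" to listOf("b"),     // generating but unreachable from START
    "C" to listOf("C", "a") // nongenerating: every derivation keeps a C
  )
  val gen = generating(g, terminals)
  val reach = reachable(g, "START")
  println(gen)   // [A, B, START]
  println(reach) // [START, A, a]
  // removeUselessSymbols keeps a production only if its LHS is generating and reachable:
  println(g.filter { it.first in gen && it.first in reach }) // [(START, [A]), (A, [a])]
}

When START itself fails one of these tests the whole grammar is deleted, which is the empty-intersection case the commit title refers to and the new diagnoseWholeGrammarDeletion test below exercises.
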

@@ -54,55 +54,25 @@ infix fun CFG.jvmIntersectLevFSA(fsa: FSA): CFG =
private infix fun CFG.intersectLevFSAP(fsa: FSA): CFG {
var clock = TimeSource.Monotonic.markNow()
val lengthBoundsCache = lengthBounds
fun Π3A<STC>.isCompatibleWith(nts: Triple<Σᐩ, Σᐩ, Σᐩ>): Boolean {
fun Pair<Int, Int>.dominates(other: Pair<Int, Int>) =
first <= other.first && second <= other.second

fun manhattanDistance(first: Pair<Int, Int>, second: Pair<Int, Int>): Int =
second.second - first.second + second.first - first.first

// Range of the shortest path to the longest path, i.e., Manhattan distance
fun SPLP(a: STC, b: STC) =
(fsa.APSP[a.π1 to b.π1] ?: Int.MAX_VALUE)..
manhattanDistance(a.coords(), b.coords())

fun IntRange.overlaps(other: IntRange) =
(other.first in first..last) || (other.last in first..last)

fun lengthBounds(nt: Σᐩ): IntRange =
(lengthBoundsCache[nt] ?: -1..-1)
// Okay if we overapproximate the length bounds a bit
.let { (it.first - 1)..(it.last + 1) }

// "[$p,$A,$r] -> [$p,$B,$q] [$q,$C,$r]"
fun isCompatible() =
first.coords().dominates(second.coords())
&& second.coords().dominates(third.coords())
&& lengthBounds(nts.first).overlaps(SPLP(first, third))
&& lengthBounds(nts.second).overlaps(SPLP(first, second))
&& lengthBounds(nts.third).overlaps(SPLP(second, third))

return isCompatible()
}

val nts = mutableSetOf("START")
fun Σᐩ.isSyntheticNT() =
first() == '[' && last() == ']' && count { it == ',' } == 2
first() == '[' && last() == ']' && count { it == '~' } == 2
fun List<Π2<Σᐩ, List<Σᐩ>>>.filterRHSInNTS() =
asSequence().filter { (_, rhs) -> rhs.all { !it.isSyntheticNT() || it in nts } }
parallelStream()
.filter { (_, rhs) -> rhs.all { !it.isSyntheticNT() || it in nts } }
.toList().toSet()

val initFinal =
(fsa.init * fsa.final).map { (q, r) -> "START" to listOf("[$q,START,$r]") }
.filterRHSInNTS()
(fsa.init * fsa.final).map { (q, r) -> "START" to listOf("[$q~START~$r]") }

val transits =
fsa.Q.map { (q, a, r) -> "[$q,$a,$r]".also { nts.add(it) } to listOf(a) }
.filterRHSInNTS()

// For every production A → σ in P, for every (p, σ, q) ∈ Q × Σ × Q
// such that δ(p, σ) = q we have the production [p, A, q] → σ in P′.
val unitProds = unitProdRules(fsa)
.onEach { (a, _) -> nts.add(a) }.filterRHSInNTS()
.onEach { (a, _) -> nts.add(a) }

// For each production A → BC in P, for every p, q, r ∈ Q,
// we have the production [p,A,r] → [p,B,q] [q,C,r] in P′.
@@ -114,15 +84,16 @@ private infix fun CFG.intersectLevFSAP(fsa: FSA): CFG {
triples
// CFG ∩ FSA - in general we are not allowed to do this, but it works
// because we assume a Levenshtein FSA, which is monotone and acyclic.
.filter { it.isCompatibleWith(A to B to C) }
.filter { it.isCompatibleWith(A to B to C, this@intersectLevFSAP, fsa) }
.map { (a, b, c) ->
val (p, q, r) = a.π1 to b.π1 to c.π1
"[$p,$A,$r]".also { nts.add(it) } to listOf("[$p,$B,$q]", "[$q,$C,$r]")
"[$p~$A~$r]".also { nts.add(it) } to listOf("[$p~$B~$q]", "[$q~$C~$r]")
}.toList()
}.toList().flatten().filterRHSInNTS()
}.toList().flatten()

println("Constructing ∩-grammar took: ${clock.elapsedNow()}")
clock = TimeSource.Monotonic.markNow()
return (initFinal + transits + binaryProds + unitProds).toSet().postProcess()
return (initFinal + transits + binaryProds + unitProds).toList()
.filterRHSInNTS().postProcess()
.also { println("Postprocessing took ${clock.elapsedNow()}") }
}
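
Aside: the JVM-only variant above now runs filterRHSInNTS through java.util.stream parallelism and applies it once over the concatenated production list rather than per group. A minimal sketch of the parallel-filter pattern; the predicate is a placeholder for the real per-production check.

import java.util.stream.Collectors

fun expensiveCheck(x: Int): Boolean {
  Thread.sleep(1) // stand-in for a costly compatibility/membership test
  return x % 3 == 0
}

fun main() {
  val items = (1..999).toList()
  // Sequential alternative: items.asSequence().filter(::expensiveCheck).toList()
  // Parallel over the common ForkJoinPool, as in the parallelStream() call above:
  val kept = items.parallelStream()
    .filter { expensiveCheck(it) }
    .collect(Collectors.toList())
    .toSet()
  println(kept.size) // 333
}
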
@@ -5,13 +5,24 @@ import ai.hypergraph.kaliningraph.parsing.*
import ai.hypergraph.kaliningraph.tokenizeByWhitespace
import ai.hypergraph.kaliningraph.visualization.*
import org.junit.jupiter.api.Test
import kotlin.random.Random
import kotlin.test.*
import kotlin.time.TimeSource

/*
./gradlew jvmTest --tests "ai.hypergraph.kaliningraph.repair.ProbabilisticLBH"
*/
class ProbabilisticLBH {
val pythonTestCases =
invalidPythonStatements.lines().zip(validPythonStatements.lines())
// This ensures the LBH grammar is nonempty, otherwise extragrammatical symbols produce an error
// .map { it.first.tokenizeByWhitespace().map { if (it in Grammars.seq2parsePythonCFG.noEpsilonOrNonterminalStubs.terminals) it else "." }.joinToString(" ") to it.second }
.filter { it.first.tokenizeByWhitespace().all { it in Grammars.seq2parsePythonCFG.terminals } }
.shuffled(Random(seed = 1)).filter { (a, b) ->
("$a NEWLINE" !in Grammars.seq2parsePythonCFG.language).also { if (!it) println("Failed invalid") }
&& ("$b NEWLINE" in Grammars.seq2parsePythonCFG.language).also { if (!it) println("Failed valid") }
&& (levenshtein(a, b).also { if (it !in 1..3) println("Failed distance: $it") } in 1..3)
}.distinct()
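
Aside: the filter above keeps only (invalid, valid) pairs whose token-level Levenshtein distance lies in 1..3. For reference, a standard dynamic-programming sketch of that distance; the repository's levenshtein may be implemented differently.

fun levDistance(a: List<String>, b: List<String>): Int {
  val dp = Array(a.size + 1) { IntArray(b.size + 1) }
  for (i in 0..a.size) dp[i][0] = i
  for (j in 0..b.size) dp[0][j] = j
  for (i in 1..a.size) for (j in 1..b.size)
    dp[i][j] = minOf(
      dp[i - 1][j] + 1,                                     // deletion
      dp[i][j - 1] + 1,                                     // insertion
      dp[i - 1][j - 1] + if (a[i - 1] == b[j - 1]) 0 else 1 // match / substitution
    )
  return dp[a.size][b.size]
}

fun main() {
  val broken = "NAME = ( NAME".split(" ")
  val fixed = "NAME = ( NAME )".split(" ")
  println(levDistance(broken, fixed)) // 1 -- within the 1..3 repair window
}
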
/*
./gradlew jvmTest --tests "ai.hypergraph.kaliningraph.repair.ProbabilisticLBH.testSubgrammarEquivalence"
*/
@@ -121,7 +132,6 @@ class ProbabilisticLBH {
assertTrue(levBall.recognizes(humanRepair),
"Human repair not recognized by LevFSA (${levenshtein(origBroke, origFixed)}): $humanRepairANSI")

println("Total transitions in FSA: ${levBall.Q.size}")
println("Prompt: $origBroke")
println("Alphabet: ${levBall.alphabet}")
try {
Expand All @@ -133,7 +143,7 @@ class ProbabilisticLBH {

assertTrue(humanRepair in s2pg.language, "Human repair not recognized by CFG: $humanRepairANSI")
// assertTrue(humanRepair in intGram.language, "Human repair not recognized by LBH: $humanRepairANSI")
if(humanRepair !in intGram.language) {
if (humanRepair !in intGram.language) {
println("Human repair not recognized by LBH: $humanRepairANSI")
return@forEach
}
@@ -158,7 +168,7 @@ class ProbabilisticLBH {
// TOTAL LBH REPAIRS (1m 56.288773333s): 9
.also { println("TOTAL LBH REPAIRS (${clock.elapsedNow()}): ${it.size}\n\n") }
} catch (exception: NoSuchElementException) {
println("Exception: $origBroke")
println("Exception: $origBroke\n")
// exception.printStackTrace()
// throw exception
}
@@ -170,23 +180,34 @@ class ProbabilisticLBH {
bimap[listOf(string.trim().drop(1).dropLast(1))]
else bimap[listOf(string.trim())])


/*
./gradlew jvmTest --tests "ai.hypergraph.kaliningraph.repair.ProbabilisticLBH.testTinyC"
*/
// @Test
@Test
fun testTinyC() {
val gram = Grammars.tinyC.noEpsilonOrNonterminalStubs
val origStr = "id = ( id id ) ; "
val toRepair = origStr.tokenizeByWhitespace()
val maxLevDist = 2
val levBall = makeLevFSA(toRepair, maxLevDist, gram.terminals)
println("Total transitions in FSA: ${levBall.Q.size}")
// throw Exception("")
// println(levBall.toDot())
// throw Exception("")
val intGram = gram.intersectLevFSA(levBall)

intGram.depGraph.show()
println(pythonTestCases.size)
}
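
Aside: makeLevFSA builds a Levenshtein automaton around the prompt; its states can be pictured as (tokens consumed, edits spent) pairs. The enumeration below is only a back-of-the-envelope view of that state grid and ignores the arc structure and the terminals/ceaDist parameters of the real constructor.

fun levStates(n: Int, k: Int): List<Pair<Int, Int>> =
  (0..n).flatMap { i -> (0..k).map { e -> i to e } }

fun main() {
  val toRepair = "id = ( id id ) ;".split(" ") // 7 tokens
  val k = 2
  val states = levStates(toRepair.size, k)
  println(states.size) // (7 + 1) * (2 + 1) = 24
  // A state (i, e) is accepting when the remaining tokens fit in the remaining edit budget:
  println(states.count { (i, e) -> toRepair.size - i <= k - e }) // 6
}
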

/*
./gradlew jvmTest --tests "ai.hypergraph.kaliningraph.repair.ProbabilisticLBH.diagnoseWholeGrammarDeletion"
*/
/** This is related to [CFG] */
@Test
fun diagnoseWholeGrammarDeletion() {
// Sometimes the whole grammar is deleted because there are no generating or reachable productions
val toRepair = "NAME . NAME ( STRING , class = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE".tokenizeByWhitespace()
val s2pg = Grammars.seq2parsePythonCFG.noEpsilonOrNonterminalStubs
val levBall = makeLevFSA(toRepair, 2, s2pg.terminals, ceaDist = contextCSV)
val intGram = s2pg.jvmIntersectLevFSA(levBall)
val template = List(toRepair.size + 2) { "_" }

val clock = TimeSource.Monotonic.markNow()

val lbhSet = intGram.parallelEnumSeqMinimalWR(template, toRepair)
.onEachIndexed { i, it ->
val alignment = levenshteinAlign(toRepair.joinToString(" "), it).paintANSIColors()
println(alignment)
}.take(100).toList()
.also { println("TOTAL LBH REPAIRS (${clock.elapsedNow()}): ${it.size}\n\n") }
}
}
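
Aside: the test above targets the failure mode named in the commit message: after intersection and useless-symbol removal, no generating or reachable production may remain, so the ∩-grammar denotes the empty language. A tiny sketch of a guard for that case; isEmptyGrammar is a hypothetical helper, not the repository's API.

fun isEmptyGrammar(prods: Set<Pair<String, List<String>>>, start: String = "START"): Boolean =
  prods.none { it.first == start }

fun main() {
  println(isEmptyGrammar(emptySet()))                            // true: nothing derivable at all
  println(isEmptyGrammar(setOf("START" to listOf("[q0~S~q2]")))) // false
}

Detecting this up front would let a caller report zero repairs instead of hitting something like the NoSuchElementException caught in the synthetic-error test above.
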