move Cached Morphism and Fn
simplify Morphism definition

Caching is gone, delegated to prover-commons, which uses Caffeine (see the sketch after this message)

adapt gitignore

add `ImpX` to the Glossary
rootGroupID now always ends with rootID

fix Maven & Sonatype publishing, proven to work for shapesafe
simplify Trace and TraceSet

Lambda renamed to Fn & delegated to prover-commons

reverting to SquashedFetchedRow

Suitex renamed to BaseSpec

SquashedFetchedRow => SquashedRow

simplify explore API

slicing of fetched in SquashedFetchedRow is now manual

interpolateAndRewriteLocally: the distinct option is gone, as it may cause DataRow to be dropped, which is too dangerous

extract no longer has the distinct option

move groupIndex outside

remove regex-based string interpolation, as it is too slow

bringing in prover-commons

EqualBy removed, now uses the prover-commons impl

prover-commons submodule is now added; start merging duplicated impls
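
On the note above that caching is delegated to prover-commons, which uses Caffeine: the sketch below only shows the general shape of Caffeine-backed memoization. It is an illustration, assuming `com.github.ben-manes.caffeine:caffeine` is on the classpath; `slowLookup` and `cachedLookup` are hypothetical names, not spookystuff or prover-commons API.

```scala
import com.github.benmanes.caffeine.cache.{Cache, Caffeine}
import java.util.function.{Function => JFunction}

object CaffeineMemoSketch {

  // the expensive function whose results we want to reuse
  def slowLookup(key: String): String = {
    Thread.sleep(50) // stands in for real work (parsing, I/O, ...)
    key.toUpperCase
  }

  // bounded, thread-safe cache; entries are evicted once the size limit is hit
  private val cache: Cache[String, String] = Caffeine
    .newBuilder()
    .maximumSize(10000L) // illustrative bound
    .build[String, String]()

  private val loader: JFunction[String, String] = new JFunction[String, String] {
    override def apply(k: String): String = slowLookup(k)
  }

  // compute once per key, then serve from the cache on every later call
  def cachedLookup(key: String): String = cache.get(key, loader)

  def main(args: Array[String]): Unit = {
    println(cachedLookup("spooky")) // computed
    println(cachedLookup("spooky")) // served from cache
  }
}
```

In this commit the equivalent wrapping is provided by prover-commons' cached Fn rather than hand-rolled code like the above.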
tribbloid committed Jan 31, 2024
1 parent b404ee7 commit 064a721
Showing 295 changed files with 4,251 additions and 4,659 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -4,3 +4,6 @@
[submodule "buildSrc"]
path = buildSrc
url = https://github.com/tribbloid/buildSrc
[submodule "prover-commons"]
path = prover-commons
url = https://github.com/tribbloid/prover-commons
8 changes: 5 additions & 3 deletions .scalafix.conf
@@ -7,19 +7,21 @@ rules = [
ProcedureSyntax
RemoveUnused
LeakingImplicitClassVal
// TODO: unrealiable due to lack of speculative modification, disabled
// ExplicitResultTypes
ExplicitResultTypes
]

RemoveUnused {
imports = true
privates = true
locals = true
locals = false
// "locals" is incompatible with `shouldNotCompile`
patternvars = true
params = true
}

ExplicitResultTypes {
rewriteStructuralTypesToNamedSubclass = false
skipSimpleDefinitions = false

onlyImplicits = true
}
20 changes: 16 additions & 4 deletions build.gradle.kts
@@ -12,17 +12,29 @@ plugins {
id("com.github.johnrengelman.shadow") version "8.1.1"
}

allprojects {
idea {

module {

excludeDirs = excludeDirs + files(
"temp",

// apache spark
"warehouse",
)
}
}
}

idea {

module {

excludeDirs = excludeDirs + files(
".gradle",

// apache spark
"warehouse",

"parent/prover-commons",
"prover-commons",
)
}
}
6 changes: 4 additions & 2 deletions dev/format-code.sh
@@ -8,7 +8,9 @@ source "${CRDIR}/.shared.sh"

cd "${FWDIR}" || exit

"${FWDIR}"/gradlew clean scalafix -Dorg.gradle.parallel=false "${BUILD_PROFILES[@]}"
"${FWDIR}"/gradlew scalafix -Dorg.gradle.parallel=false "${BUILD_PROFILES[@]}"
# consumes too much memory to run in parallel

scalafmt
scalafmt

"${FWDIR}"/gradlew scalafix -Dorg.gradle.parallel=false "${BUILD_PROFILES[@]}"
2 changes: 1 addition & 1 deletion dev/gradle-versions.sh
@@ -3,6 +3,6 @@
CRDIR="$(cd "`dirname "$0"`"; pwd)"
FWDIR="$(cd "`dirname "$0"`"/..; pwd)"

${FWDIR}/gradlew wrapper --gradle-version=8.4
${FWDIR}/gradlew wrapper --gradle-version=8.5

${FWDIR}/gradlew dependencyUpdates "$@"
2 changes: 1 addition & 1 deletion dev/update-submodules.sh
@@ -4,4 +4,4 @@
git submodule sync && \
git submodule foreach git fetch && \
git submodule foreach git reset --hard && \
git submodule update --init --recursive --force
git submodule update --init --force
4 changes: 2 additions & 2 deletions gradle.properties
@@ -11,8 +11,8 @@ noUav
sparkVersion=3.5.0

#splainVersion=1.1.0-SNAPSHOT
#splainVersion=1.0.3
#splainVersion=1.1.0-RC0

org.gradle.parallel=true
org.gradle.caching=true
#org.gradle.daemon=true
org.gradle.daemon=true
Binary file modified gradle/wrapper/gradle-wrapper.jar
Binary file not shown.
2 changes: 1 addition & 1 deletion gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.4-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.5-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
@@ -1,7 +1,7 @@
package com.tribbloids.spookystuff.parsing

import com.tribbloids.spookystuff.testutils.FunSpecx
import com.tribbloids.spookystuff.utils.{InterleavedIterator, Interpolation}
import com.tribbloids.spookystuff.testutils.BaseSpec
import com.tribbloids.spookystuff.utils.InterleavedIterator
import fastparse.internal.Logger
import org.apache.spark.benchmark.BenchmarkHelper
import org.scalatest.Ignore
@@ -11,7 +11,7 @@ import scala.collection.mutable.ArrayBuffer
import scala.util.Random

@Ignore //TODO: enable!
class ParsersBenchmark extends FunSpecx {
class ParsersBenchmark extends BaseSpec {

import com.tribbloids.spookystuff.parsing.ParsersBenchmark._

@@ -30,7 +30,6 @@ class ParsersBenchmark extends FunSpecx {

val epochs: List[Epoch] = List(
Epoch(getRandomStrs, "speed reference", skipResultCheck = true)(_.speedRef()),
Epoch(getRandomStrs, "regex")(_.useRegex()),
Epoch(getRandomStrs, "fastParse")(_.useFastParse()),
Epoch(getRandomStrs, "FSM")(_.useFSM())
// Epoch(stream, "do nothing", skipResultCheck = true)(_.doNothing())
@@ -177,8 +176,6 @@ object ParsersBenchmark {
}
}

val interpolation: Interpolation = Interpolation("$")

case class RandomStrGen(
seed: Long,
override val size: Int = numVPerEpoch,
@@ -265,11 +262,6 @@ object ParsersBenchmark {
interpolated.mkString("")
}

def useRegex(): String = {

interpolation(str)(replace)
}

// measuring speed only, result is jibberish
def speedRef(): String = {
str.map(identity)
@@ -1,8 +1,8 @@
package com.tribbloids.spookystuff.utils

import com.tribbloids.spookystuff.testutils.FunSpecx
import com.tribbloids.spookystuff.testutils.BaseSpec

class RangeHashBenchmark extends FunSpecx {
class RangeHashBenchmark extends BaseSpec {

it("RangeArg hash should be fast") { // TODO: move to unit test

1 change: 0 additions & 1 deletion parent/build.gradle.kts
@@ -43,7 +43,6 @@ subprojects {

testImplementation("com.vladsch.flexmark:flexmark:0.64.8")


constraints {

// TODO: some of the following may no longer be necessary
2 changes: 1 addition & 1 deletion parent/core/build.gradle.kts
@@ -21,6 +21,6 @@ dependencies {
api("org.apache.tika:tika-core:${tikaV}")
testImplementation( "org.apache.tika:tika-parsers-standard-package:${tikaV}")
api("com.googlecode.juniversalchardet:juniversalchardet:1.0.3")
api("org.jsoup:jsoup:1.16.2")
api("org.jsoup:jsoup:1.17.2")
api("com.syncthemall:boilerpipe:1.2.2")
}
@@ -14,7 +14,7 @@ object Const extends CommonConst {

val groupIndexExtractor: String = "G"

val defaultJoinField: Field = Field("A", isWeak = true)
val defaultForkField: Field = Field("A", isTransient = true)

val tikaDetector: DefaultDetector = new DefaultDetector()

@@ -1,17 +1,18 @@
package com.tribbloids.spookystuff

import ai.acyclic.prover.commons.function.PreDef
import com.tribbloids.spookystuff.conf._
import com.tribbloids.spookystuff.metrics.SpookyMetrics
import com.tribbloids.spookystuff.rdd.FetchedDataset
import com.tribbloids.spookystuff.relay.io.Encoder
import com.tribbloids.spookystuff.row._
import com.tribbloids.spookystuff.session.Session
import com.tribbloids.spookystuff.utils.io.HDFSResolver
import com.tribbloids.spookystuff.utils.serialization.SerDeOverride
import com.tribbloids.spookystuff.utils.{ShippingMarks, TreeThrowable}
import com.tribbloids.spookystuff.utils.serialization.{NOTSerializable, SerializerOverride}
import com.tribbloids.spookystuff.utils.{ShippingMarks, SparkContextView, TreeThrowable}
import org.apache.hadoop.conf.Configuration
import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import com.tribbloids.spookystuff.relay.io.Encoder
import org.apache.spark.ml.dsl.utils.refl.{ToCatalyst, TypeMagnet}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
@@ -32,7 +33,19 @@ object SpookyContext {
result
}

implicit def toFetchedDS(spooky: SpookyContext): FetchedDataset = spooky.createBlank
implicit def asBlankFetchedDS(spooky: SpookyContext): FetchedDataset = spooky.createBlank

implicit def asSparkContextView(spooky: SpookyContext): SparkContextView = SparkContextView(spooky.sparkContext)

trait CanRunWith {

type _WithCtx <: NOTSerializable // TODO: with AnyVal
def _WithCtx: SpookyContext => _WithCtx

// cached results will be dropped for being NOTSerializable
@transient final lazy val withCtx: PreDef.Fn.Cached[SpookyContext, _WithCtx] = PreDef.Fn(_WithCtx).cachedBy()
}

}

case class SpookyContext(
@@ -44,40 +57,53 @@ case class SpookyContext(
// right before the shipping (implemented as serialisation hook),
// all enabled features that are not configured will be initialised with default value

object Plugins extends PluginRegistry.Factory {
object Plugins extends PluginRegistry.Factory[PluginSystem] {

type Out[T <: PluginSystem] = T#Plugin

type UB = PluginSystem
implicit override lazy val ubEv: ClassTag[UB] = ClassTag(classOf[UB])
override def init: Dependent = new Dependent {

override type Out[T <: PluginSystem] = T#Plugin
override def compute[T <: PluginSystem](v: T): v.Plugin = {
requireNotShipped()
val result = v.default(SpookyContext.this)
result
def apply[T <: PluginSystem](arg: T): arg.Plugin = {
requireNotShipped()
val result = arg.default(SpookyContext.this)
result
}
}

def registered: List[PluginSystem#PluginLike] = this.lookup.values.toList.collect {
case plugin: PluginSystem#PluginLike =>
plugin
}

def deployAll(): Unit = {
createEnabled()
val trials = cache.values.toList.map { plugin =>
Try(plugin.deploy())

Try {
registerEnabled()
val trials = registered.map { v =>
v.tryDeploy()
}

TreeThrowable.&&&(trials)
}
TreeThrowable.&&&(trials)
}

def resetAll(): Unit = {
Plugins.cache.values.foreach { ff =>
registered.foreach { ff =>
ff.reset()
}
}

}

def getPlugin[T <: PluginSystem](v: T): v.Plugin = Plugins.apply(v)
def getPlugin[T <: PluginSystem](v: T): v.Plugin = Plugins.apply(v: v.type)
def setPlugin(vs: PluginSystem#Plugin*): this.type = {
// no deployement
requireNotShipped()

vs.foreach { plugin =>
Plugins.update(plugin.pluginSystem, plugin)
Plugins.lookup.updateOverride(plugin.pluginSystem, plugin)
}

this
}

@@ -112,9 +138,9 @@ case class SpookyContext(
setConf(v)
}

val hadoopConfBroadcast: Broadcast[SerDeOverride[Configuration]] = {
val hadoopConfBroadcast: Broadcast[SerializerOverride[Configuration]] = {
sqlContext.sparkContext.broadcast(
SerDeOverride(this.sqlContext.sparkContext.hadoopConfiguration)
SerializerOverride(this.sqlContext.sparkContext.hadoopConfiguration)
)
}
def hadoopConf: Configuration = hadoopConfBroadcast.value.value
@@ -125,7 +151,7 @@

final override def clone: SpookyContext = {
val result = SpookyContext(sqlContext)
val plugins = Plugins.cache.values.toList.map(plugin => plugin.clone)
val plugins = Plugins.registered.map(plugin => plugin.clone)
result.setPlugin(plugins: _*)

result
@@ -170,9 +196,11 @@ case class SpookyContext(
}
}

lazy val _blankRowRDD: RDD[SquashedFetchedRow] = sparkContext.parallelize(Seq(SquashedFetchedRow.blank))
def createBlank: FetchedDataset = {

def createBlank: FetchedDataset = this.create(_blankRowRDD)
lazy val _rdd: RDD[SquashedRow] = sparkContext.parallelize(Seq(FetchedRow.blank.squash))
this.create(_rdd)
}

object dsl extends Serializable {

@@ -182,12 +210,16 @@ case class SpookyContext(
val mapRDD = new DataFrameView(df)
.toMapRDD()

val self: SquashedFetchedRDD = mapRDD
val self: SquashedRDD = mapRDD
.map { map =>
SquashedFetchedRow(
Option(ListMap(map.toSeq: _*))
.getOrElse(ListMap())
.map(tuple => (Field(tuple._1), tuple._2))
val listMap: ListMap[Field, Any] = Option(ListMap(map.toSeq: _*))
.getOrElse(ListMap())
.map { tuple =>
(Field(tuple._1), tuple._2)
}

SquashedRow.ofData(
DataRow(listMap).withEmptyScope
)
}
val fields = df.schema.fields.map { sf =>
@@ -215,11 +247,11 @@ case class SpookyContext(
val dataFrame = sqlContext.read.json(jsonDS)
dfToFetchedDS(dataFrame)

// RDD[SquashedFetchedRow] => ..
// RDD[SquashedRow] => ..
// discard schema
case _ if ttg.tpe <:< typeOf[SquashedFetchedRow] =>
// case _ if classOf[SquashedFetchedRow] == classTag[T].runtimeClass =>
val self = rdd.asInstanceOf[SquashedFetchedRDD]
case _ if ttg.tpe <:< typeOf[SquashedRow] =>
// case _ if classOf[SquashedRow] == classTag[T].runtimeClass =>
val self = rdd.asInstanceOf[SquashedRDD]
new FetchedDataset(
self,
fieldMap = ListMap(),
@@ -232,7 +264,7 @@ case class SpookyContext(
var cells = ListMap[Field, Any]()
if (str != null) cells = cells + (Field("_") -> str)

SquashedFetchedRow(cells)
FetchedRow(DataRow(cells)).squash
}
new FetchedDataset(
self,
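
Aside on the implicit conversions shown in the SpookyContext diff (`toFetchedDS` renamed to `asBlankFetchedDS`, plus the new `asSparkContextView`): the pattern lets a SpookyContext be used directly where a FetchedDataset or SparkContext view is expected. Below is a minimal sketch of that pattern with simplified stand-in types; `Ctx` and `Dataset` are hypothetical, not the real spookystuff classes.

```scala
import scala.language.implicitConversions

object ImplicitContextViewSketch {

  // simplified stand-in for SpookyContext
  final case class Ctx(name: String) {
    def createBlank: Dataset = Dataset(Seq.empty, this)
  }

  // simplified stand-in for FetchedDataset
  final case class Dataset(rows: Seq[String], ctx: Ctx) {
    def fetch(url: String): Dataset = copy(rows = rows :+ s"fetched: $url")
  }

  // analogous to `asBlankFetchedDS`: a context can be used wherever a dataset is expected
  implicit def asBlankDataset(ctx: Ctx): Dataset = ctx.createBlank

  def main(args: Array[String]): Unit = {
    val ctx = Ctx("demo")
    // the conversion creates a blank dataset, so dataset methods
    // can be called on the context directly
    println(ctx.fetch("https://example.com").rows)
  }
}
```

Keeping the conversion cheap and unambiguous (one blank dataset per context) is what makes exposing it implicitly reasonable.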
