diff --git a/src/main/kotlin/org/jetbrains/bio/genome/Genome.kt b/src/main/kotlin/org/jetbrains/bio/genome/Genome.kt index c4610fd..f727277 100644 --- a/src/main/kotlin/org/jetbrains/bio/genome/Genome.kt +++ b/src/main/kotlin/org/jetbrains/bio/genome/Genome.kt @@ -243,6 +243,7 @@ class Genome private constructor( genesGTFPath: Path? = null, genesDescriptionsPath: Path? = null ) = getOrAdd(build, true) { + val chromSizesDir = chromSizesPath.parent Genome( build, annotationsConfig = annotationsConfig, @@ -253,7 +254,7 @@ class Genome private constructor( cytobandsPath = cytobandsPath, repeatsPath = repeatsPath, gapsPath = gapsPath, - twoBitPath = twoBitPath, + twoBitPath = twoBitPath ?: (chromSizesDir / "$build.2bit").let { if (it.exists) it else null }, genesGTFPath = genesGTFPath, genesDescriptionsPath = genesDescriptionsPath diff --git a/src/main/kotlin/org/jetbrains/bio/genome/PeaksInfo.kt b/src/main/kotlin/org/jetbrains/bio/genome/PeaksInfo.kt index 139f2d6..a7a837c 100644 --- a/src/main/kotlin/org/jetbrains/bio/genome/PeaksInfo.kt +++ b/src/main/kotlin/org/jetbrains/bio/genome/PeaksInfo.kt @@ -26,11 +26,13 @@ object PeaksInfo { private fun Long.formatLongNumber() = String.format("%,d", this).replace(',', ' ') - fun compute(genomeQuery: GenomeQuery, - peaksStream: Stream, - src: URI?, - paths: List, - fragment: Fragment = AutoFragment): Map { + fun compute( + genomeQuery: GenomeQuery, + peaksStream: Stream, + src: URI?, + paths: List, + fragment: Fragment = AutoFragment + ): Map { val peaks = peaksStream.collect(Collectors.toList()) val peaksLengths = peaks.map { it.length().toDouble() }.toDoubleArray() val peaksCount = peaksLengths.count() diff --git a/src/main/kotlin/org/jetbrains/bio/genome/TestOrganismDataGenerator.kt b/src/main/kotlin/org/jetbrains/bio/genome/TestOrganismDataGenerator.kt index 56dede5..68f258f 100644 --- a/src/main/kotlin/org/jetbrains/bio/genome/TestOrganismDataGenerator.kt +++ b/src/main/kotlin/org/jetbrains/bio/genome/TestOrganismDataGenerator.kt @@ -1,9 +1,12 @@ package org.jetbrains.bio.genome import com.google.common.math.IntMath +import gnu.trove.list.array.TFloatArrayList import org.apache.commons.csv.CSVFormat import org.apache.log4j.Logger import org.jetbrains.bio.Configuration +import org.jetbrains.bio.big.BigWigFile +import org.jetbrains.bio.big.FixedStepSection import org.jetbrains.bio.genome.sequence.Nucleotide import org.jetbrains.bio.genome.sequence.TwoBitWriter import org.jetbrains.bio.io.FastaRecord @@ -26,7 +29,8 @@ object TestOrganismDataGenerator { "chr2" to IntMath.pow(10, 6), "chr3" to IntMath.pow(10, 6), "chrX" to IntMath.pow(10, 6), - "chrM" to IntMath.pow(10, 6)) + "chrM" to IntMath.pow(10, 6) + ) @JvmStatic fun main(args: Array) { @@ -58,6 +62,7 @@ object TestOrganismDataGenerator { generateCytobands(genome) generateRepeats(genome) generateCGI(genome) + generateMapability(genome) LOG.info("Done") } @@ -278,4 +283,28 @@ object TestOrganismDataGenerator { """.trimMargin().trim()) } } + + /** + * Mapability is a wiggle track with 0 for non-mapable nucleotides and 1 for mapable ones. + * + * It's generated in the same folder as "chrom.sizes" with a filename "mapability.bigWig". + * + * We only generate mapability for chrX. + * This is done to test that the genome mean substitution for no-data chromosome works correctly. + */ + private fun generateMapability(genome: Genome) { + LOG.info("Generating mapability bigWig") + val path = genome.chromSizesPath.parent / "mapability.bigWig" + val gq = genome.toQuery() + val chrX = gq["chrX"]!! + val random = ThreadLocalRandom.current() + val section = FixedStepSection( + chrX.name, + start = 0, + values = TFloatArrayList( + (0 until chrX.length).map { if (random.nextInt(5) == 4) 0.0f else 1.0f }.toFloatArray() + ) + ) + BigWigFile.write(listOf(section), gq.get().map { it.name to it.length }, path) + } } \ No newline at end of file diff --git a/src/main/kotlin/org/jetbrains/bio/genome/sequence/CpGContent.kt b/src/main/kotlin/org/jetbrains/bio/genome/sequence/CpGContent.kt index 6f479d8..08391d2 100644 --- a/src/main/kotlin/org/jetbrains/bio/genome/sequence/CpGContent.kt +++ b/src/main/kotlin/org/jetbrains/bio/genome/sequence/CpGContent.kt @@ -1,6 +1,7 @@ package org.jetbrains.bio.genome.sequence import com.google.common.annotations.VisibleForTesting +import org.jetbrains.bio.genome.Chromosome import org.jetbrains.bio.genome.Location /** @@ -130,6 +131,18 @@ enum class CpGContent { } return cg } + + /** + * Slice the given chromosome into bins and return an array containing the mean GC content for each bin. + */ + fun binnedMeanCG(chromosome: Chromosome, binSize: Int): DoubleArray { + val sequence = chromosome.sequence + return chromosome.range.slice(binSize).mapToDouble { bin -> + (bin.startOffset until bin.endOffset).count { pos -> + sequence.charAt(pos).let { it == 'c' || it == 'g' } + }.toDouble() / bin.length() + }.toArray() + } } } diff --git a/src/main/kotlin/org/jetbrains/bio/statistics/ClassificationModel.kt b/src/main/kotlin/org/jetbrains/bio/statistics/ClassificationModel.kt index 244e54c..0ad9ecd 100644 --- a/src/main/kotlin/org/jetbrains/bio/statistics/ClassificationModel.kt +++ b/src/main/kotlin/org/jetbrains/bio/statistics/ClassificationModel.kt @@ -162,22 +162,29 @@ interface Fitter { * @param maxIter an upper bound on fitting iterations (if applicable). * @return guessed classification model. */ - fun guess(preprocessed: Preprocessed, - title: String, threshold: Double, maxIter: Int, attempt: Int): Model - - fun guess(preprocessed: List>, - title: String, threshold: Double, maxIter: Int, attempt: Int): Model = - guess(preprocessed.first(), title, threshold, maxIter, attempt) - - fun fit(preprocessed: Preprocessed, + fun guess( + preprocessed: Preprocessed, + title: String, threshold: Double, maxIter: Int, attempt: Int + ): Model + + fun guess( + preprocessed: List>, + title: String, threshold: Double, maxIter: Int, attempt: Int + ): Model = guess(preprocessed.first(), title, threshold, maxIter, attempt) + + fun fit( + preprocessed: Preprocessed, title: String = TITLE, threshold: Double = THRESHOLD, maxIter: Int = MAX_ITERATIONS, - attempt: Int = 0): Model = fit(listOf(preprocessed), title, threshold, maxIter, attempt) + attempt: Int = 0 + ): Model = fit(listOf(preprocessed), title, threshold, maxIter, attempt) - fun fit(preprocessed: List>, + fun fit( + preprocessed: List>, title: String = TITLE, threshold: Double = THRESHOLD, maxIter: Int = MAX_ITERATIONS, - attempt: Int = 0): Model { + attempt: Int = 0 + ): Model { require(threshold > 0) { "threshold $threshold must be >0" } require(maxIter > 0) { "maximum number of iterations $maxIter must be >0" } @@ -199,8 +206,10 @@ interface Fitter { require(multiStarts > 1) { "number of starts $multiStarts must be >1" } } - override fun fit(preprocessed: Preprocessed, title: String, - threshold: Double, maxIter: Int, attempt: Int): Model { + override fun fit( + preprocessed: Preprocessed, title: String, + threshold: Double, maxIter: Int, attempt: Int + ): Model { require(attempt == 0) { "cyclic multistart is not allowed" } @@ -218,8 +227,10 @@ interface Fitter { return msModel } - override fun fit(preprocessed: List>, title: String, - threshold: Double, maxIter: Int, attempt: Int): Model { + override fun fit( + preprocessed: List>, title: String, + threshold: Double, maxIter: Int, attempt: Int + ): Model { require(attempt == 0) { "cyclic multistart is not allowed" } diff --git a/src/main/kotlin/org/jetbrains/bio/statistics/emission/IntegerRegressionEmissionScheme.kt b/src/main/kotlin/org/jetbrains/bio/statistics/emission/IntegerRegressionEmissionScheme.kt new file mode 100644 index 0000000..71432c5 --- /dev/null +++ b/src/main/kotlin/org/jetbrains/bio/statistics/emission/IntegerRegressionEmissionScheme.kt @@ -0,0 +1,243 @@ +package org.jetbrains.bio.statistics.emission +import org.apache.commons.math3.distribution.FDistribution +import org.apache.commons.math3.linear.* +import org.jetbrains.bio.dataframe.DataFrame +import org.jetbrains.bio.viktor.F64Array +import org.jetbrains.bio.viktor.asF64Array +import java.util.function.IntPredicate +import kotlin.math.abs + +/** + * + * Regression with integer support. + * Examples: Poisson, binomial. + * + * @param covariateLabels the labels that will be used to procure covariates from the supplied [DataFrame] + * @param regressionCoefficients the regression coefficients. Note that + * regressionCoefficients.size == covariateLabels.size + 1 + * because the 0th coefficient corresponds to the intercept, the 1st coefficient corresponds to 0th label etc. + * + * @author Elena Kartysheva + * @date 5/25/19 + */ +abstract class IntegerRegressionEmissionScheme( + covariateLabels: List, regressionCoefficients: DoubleArray +) : EmissionScheme { + val covariateLabels = covariateLabels.map { it.intern() } + var regressionCoefficients: DoubleArray = regressionCoefficients + protected set + @Transient var W = DoubleArray(0) + + /** + * Provides the mean function (also known as the inverted link function), h = g^{-1}. + * + * The generalized linear model can be defined through the link function. + * + * η = g(μ), + * where μ is the distribution mean, η = Xβ is the linear predictor, and g is the link function. + * + * Conversely, + * μ = h(η), + * where h = g^{-1} is called the inverse link function, or sometimes the mean function. + * + * @param eta η = Xβ, the linear predictor + * @return μ = h(η), the distribution mean + * + * Note that it's often more efficient to provide an implementation for the vector-based [meanInPlace]. + * It's also possible to override [zW] and skip other link-related implementations altogether. + */ + abstract fun mean(eta: Double): Double + + /** + * The mean derivative function, h'(η). + * + * η = Xβ is the linear predictor. See [mean] for more information. + * + * @param eta η = Xβ, the linear predictor + * @return h'(η) + * + * Note that it's often more efficient to provide an implementation for the vector-based [meanDerivativeInPlace]. + * It's also possible to override [zW] and skip other link-related implementations altogether. + */ + abstract fun meanDerivative(eta: Double): Double + + /** + * The mean variance function, var h(η) + * + * h(η) is the mean function, η = Xβ is the linear predictor. See [mean] for more information. + * + * @param mean μ = h(η) + * @return var(h(η)) + * + * Note that it's often more efficient to provide an implementation for the vector-based [meanVarianceInPlace]. + * It's also possible to override [zW] and skip other mean-related implementations altogether. + */ + abstract fun meanVariance(mean: Double): Double + + /** + * Samples from the regression given the distribution mean. + * + * @param mean the distribution mean μ = h(η) + */ + abstract fun sampler(mean: Double): Int + + override val degreesOfFreedom: Int = regressionCoefficients.size + + /** + * Apply [mean] function h(η) in place. + * + * Can be overridden to use more efficient vector operations. + * It's also possible to override [zW] and skip other mean-related implementations altogether. + * + * @param eta η = Xβ, the linear predictor vector. Will be modified to contain μ = h(η). + * @return [eta] for easier call chaining. + */ + open fun meanInPlace(eta: F64Array): F64Array { + for(i in 0 until eta.size) { + eta[i] = mean(eta[i]) + } + return eta + } + + /** + * Apply [meanDerivative] function h'(η) in place. + * + * Can be overridden to use more efficient vector operations. + * It's also possible to override [zW] and skip other mean-related implementations altogether. + * + * @param eta η = Xβ, the linear predictor vector. Will be modified to contain h'(η). + * @return [eta] for easier call chaining. + */ + open fun meanDerivativeInPlace(eta: F64Array): F64Array { + for(i in 0 until eta.size) { + eta[i] = meanDerivative(eta[i]) + } + return eta + } + + /** + * Apply [meanVariance] function var(h(η)) in place. + * + * Can be overridden to use more efficient vector operations. + * It's also possible to override [zW] and skip other mean-related implementations altogether. + * + * @param mean μ = h(η), the distribution mean vector. Will be modified to contain var(h(η)). + * @return [mean] for easier call chaining. + */ + open fun meanVarianceInPlace(mean: F64Array): F64Array { + for(i in 0 until mean.size) { + mean[i] = meanVariance(mean[i]) + } + return mean + } + + /** + * Calculates z and W for IRLS as defined in https://bwlewis.github.io/GLM/ . + * + * Note that the document linked above uses definitions and denotations that differ from + * the widely accepted ones. + * In particular, "GLMs, abridged" uses the term "link function" and the symbol "g" to describe + * what is normally called the mean function (or the inverse link function) and denoted by "h". + * It also uses "b" instead of "y" for the response vector and "A" instead of "X" for the model matrix. + * + * z = η + (y - h(η)) / h'(η) + * W = diag(h'(η)^2 / var(h(η))) + * + * This is the only function necessary for IRLS and [update] to function. You can override it directly + * instead of providing implementations for [mean] and friends if there's a more efficient way to compute + * z and W. For example, with a logarithmic link, h = h' = var h, which allows to simplify W. + * + * @param y the response vector + * @param eta η = Xβ, the linear predictor. WARNING: [eta] can be modified to reduce copying! + * @return z and W as a [Pair] (the diagonal matrix W is stored as a vector) + */ + open fun zW(y: F64Array, eta: F64Array): Pair { + val countedLink = meanInPlace(eta.copy()) + val countedLinkDerivative = meanDerivativeInPlace(eta.copy()) + + eta += (y - countedLink).apply { divAssign(countedLinkDerivative)} + + meanVarianceInPlace(countedLink) + + countedLinkDerivative *= countedLinkDerivative + countedLinkDerivative /= countedLink + return eta to countedLinkDerivative + } + + override fun update(df: DataFrame, d: Int, weights: F64Array) { + val X = generateDesignMatrix(df) + val yInt = df.sliceAsInt(df.labels[d]) + val y = DoubleArray (yInt.size) {yInt[it].toDouble()}.asF64Array() + val iterMax = 5 + val tol = 1e-8 + var beta0 = regressionCoefficients + var beta1 = regressionCoefficients + for (i in 0 until iterMax) { + val eta = WLSRegression.calculateEta(X, beta0) + val (z, W) = zW(y, eta) + W *= weights + beta1 = WLSRegression.calculateBeta(X, z, W) + if ((beta1.zip(beta0) { a, b -> abs(a - b) }).sum() < tol) { + break + } + beta0 = beta1 + } + regressionCoefficients = beta1 + } + + /** + * Generates the model matrix from the dataframe. + * + * Extracts the labelled covariates from the supplied dataframe and prepends the intercept column. + */ + private fun generateDesignMatrix(df: DataFrame) = + WLSRegression.designMatrix(Array(covariateLabels.size) { df.sliceAsDouble(covariateLabels[it]) }) + + /** + * Calculates η = Xβ, the linear predictor. + * + * Uses a row from the provided dataframe to get the covariate values. + * + * @param df dataframe + * @param t row number + */ + fun getPredictor(df: DataFrame, t: Int): Double { + var res = regressionCoefficients[0] + covariateLabels.forEachIndexed { index, label -> + res += df.getAsDouble(t, label) * regressionCoefficients[index + 1] + } + return res + } + + override fun sample(df: DataFrame, d: Int, fill: IntPredicate) { + val observations = df.sliceAsInt(df.labels[d]) + (0 until df.rowsNumber).forEach { row -> + if (fill.test(row)) { + observations[row] = sampler(mean(getPredictor(df, row))) + } + } + } + + /** + * Calculates the f-test p-value. + * + * Performs the f-test for the null hypothesis that Rβ = r and returns the p-value. + */ + fun Ftest(df: DataFrame, d: Int, R: RealMatrix, r: RealVector): Double { + val x = Array(covariateLabels.size) {df.sliceAsDouble(covariateLabels[it])} + val yInt = df.sliceAsInt(df.labels[d]) + val y = DoubleArray (yInt.size) {yInt[it].toDouble()} + val X = WLSRegression.designMatrix(x) + val residuals = WLSRegression.calculateEta(X, regressionCoefficients).apply { minusAssign(y.asF64Array()) } + val sigma2 = residuals.dot(W.asF64Array() * residuals) / (X[0].size - X.size) + val XTWXI = WLSRegression.calculateBetaVariance(X, W.asF64Array()) + val RBeta = R.transpose().operate(ArrayRealVector(regressionCoefficients)) + val RBetaMinusr = RBeta.subtract(r) + val inverse = LUDecomposition(R.transpose().multiply(Array2DRowRealMatrix(XTWXI)).multiply(R)).solver.inverse + val Fstat = inverse.operate(RBetaMinusr).dotProduct(RBetaMinusr) / (r.dimension * sigma2) + val cumulativeProb = FDistribution( + r.dimension.toDouble(), (X[0].size - X.size).toDouble() + ).cumulativeProbability(Fstat) + return 1 - cumulativeProb + } +} diff --git a/src/main/kotlin/org/jetbrains/bio/statistics/emission/PoissonRegressionEmissionScheme.kt b/src/main/kotlin/org/jetbrains/bio/statistics/emission/PoissonRegressionEmissionScheme.kt new file mode 100644 index 0000000..bf8b16f --- /dev/null +++ b/src/main/kotlin/org/jetbrains/bio/statistics/emission/PoissonRegressionEmissionScheme.kt @@ -0,0 +1,48 @@ +package org.jetbrains.bio.statistics.emission + +import org.apache.commons.math3.util.FastMath +import org.jetbrains.bio.dataframe.DataFrame +import org.jetbrains.bio.statistics.MoreMath +import org.jetbrains.bio.statistics.distribution.Sampling +import org.jetbrains.bio.viktor.F64Array +import kotlin.math.exp + +/** + * + * Poisson regression. + * + * @author Elena Kartysheva + * @date 5/25/19 + */ +class PoissonRegressionEmissionScheme( + covariateLabels: List, + regressionCoefficients: DoubleArray +) : IntegerRegressionEmissionScheme(covariateLabels, regressionCoefficients) { + + override fun mean(eta: Double) = exp(eta) + override fun meanDerivative(eta: Double) = exp(eta) + override fun meanVariance(mean: Double) = mean + + override fun sampler(mean: Double) = Sampling.samplePoisson(mean) + + override fun meanInPlace(eta: F64Array) = eta.apply { expInPlace() } + override fun meanDerivativeInPlace(eta: F64Array) = eta.apply { expInPlace() } + override fun meanVarianceInPlace(mean: F64Array) = mean + + override fun zW(y: F64Array, eta: F64Array): Pair { + // Since h(η) = h'(η) = var(h(η)), we can skip h'(η) and var(h(η)) calculations and simplify W: + // W = diag(h'(η)^2 / var(h(η))) = h(η) + val countedLink = meanInPlace(eta.copy()) + eta += (y - countedLink).apply { divAssign(countedLink) } + return eta to countedLink + } + + override fun logProbability(df: DataFrame, t: Int, d: Int): Double { + // We don't use the existing Poisson log probability because that saves us one logarithm. + // We would have to provide lambda = exp(logLambda), and the Poisson implementation would then have to + // calculate log(lambda) again. + val logLambda = getPredictor(df, t) + val y = df.getAsInt(t, df.labels[d]) + return y * logLambda - MoreMath.factorialLog(y) - FastMath.exp(logLambda) + } +} \ No newline at end of file diff --git a/src/main/kotlin/org/jetbrains/bio/statistics/emission/WLSRegression.kt b/src/main/kotlin/org/jetbrains/bio/statistics/emission/WLSRegression.kt new file mode 100644 index 0000000..783e05f --- /dev/null +++ b/src/main/kotlin/org/jetbrains/bio/statistics/emission/WLSRegression.kt @@ -0,0 +1,77 @@ +package org.jetbrains.bio.statistics.emission + +import org.apache.commons.math3.linear.Array2DRowRealMatrix +import org.apache.commons.math3.linear.LUDecomposition +import org.jetbrains.bio.viktor.F64Array +import org.jetbrains.bio.viktor.asF64Array + +/** + * A utility object with various methods useful for WLS and IRLS. + */ +object WLSRegression { + + /** + * Converts x to a design matrix. No copying of x is performed. + * + * Prepends an appropriate-sized array filled with 1.0. + */ + fun designMatrix(x: Array): Array { + check(x.isNotEmpty()) { "empty matrix" } + check(x.drop(1).all { it.size == x[0].size }) { "different column sizes" } + return Array(x.size + 1) { if (it == 0) DoubleArray(x[0].size) { 1.0 } else x[it - 1] } + } + + /** + * Calculates η = Xβ. + * + * @param x a matrix that is stored column-first + * @param beta a vector of appropriate size + */ + fun calculateEta(x: Array, beta: DoubleArray): F64Array { + check(x.isNotEmpty()) { "empty matrix" } + check(x.size == beta.size) { "X and beta have different size" } + check(x.drop(1).all { it.size == x[0].size }) { "different column sizes" } + val xColumnsTimesBetaElements = Array(x.size) { i -> x[i].asF64Array() * beta[i] } + xColumnsTimesBetaElements.drop(1).forEach { xColumnsTimesBetaElements[0].plusAssign(it) } + return xColumnsTimesBetaElements[0] + } + + /** + * Calculates WLS estimator β. + * + * β = (X^T W X)^{-1} X^T W y, where X is the design matrix, y is the response vector, + * and W = diag(w), where w is the weight vector. + * + * @param x the design matrix X that is stored column-first + * @param y the response vector y + * @param weights the weight vector w + */ + fun calculateBeta(x: Array, y: F64Array, weights: F64Array): DoubleArray { + check(weights.size == y.size) { "weights and y have different size" } + check(x.all { it.size == weights.size }) { "X columns and weights vector have different size" } + + val inverse = calculateBetaVariance(x, weights) + val Wy = weights * y + val XTWy = DoubleArray(x.size) { i -> x[i].asF64Array().dot(Wy) } + return DoubleArray(x.size) { inverse[it].asF64Array().dot(XTWy) } + } + + /** + * Calculates (X^T W X)^{-1}, the variance multiplier of Var beta. + * + * Var beta = sigma^2 (X^T W X)^{-1} + * + * @return (X^T W X)^{-1}. Since the matrix is symmetric, the storage orientation is not important. + */ + fun calculateBetaVariance(x: Array, weights: F64Array): Array { + // the resulting matrix is symmetric by construction, let's not waste time computing the upper half + val XTWX = Array(x.size) { i -> + DoubleArray(x.size) { j -> + if (i < j) 0.0 else x[i].asF64Array().times(weights).dot(x[j].asF64Array()) + } + } + // fill the upper half + XTWX.indices.forEach { i -> XTWX.indices.forEach { j -> if (i < j) XTWX[i][j] = XTWX[j][i] }} + return (LUDecomposition(Array2DRowRealMatrix(XTWX)).solver.inverse as Array2DRowRealMatrix).dataRef + } +} \ No newline at end of file diff --git a/src/main/kotlin/org/jetbrains/bio/statistics/gson/GSONUtil.kt b/src/main/kotlin/org/jetbrains/bio/statistics/gson/GSONUtil.kt index 2427709..d6c7206 100644 --- a/src/main/kotlin/org/jetbrains/bio/statistics/gson/GSONUtil.kt +++ b/src/main/kotlin/org/jetbrains/bio/statistics/gson/GSONUtil.kt @@ -75,7 +75,7 @@ object GSONUtil { val fqnElement = element.asJsonObject.remove(fqnField) if (fqnElement == null) { deserializationError("Class name (%s) field is missing." + - " Please, recalculate the model.", fqnField) + " Please, recalculate the model.", fqnField) } val fqn = fqnElement!!.asString @@ -99,17 +99,21 @@ object GSONUtil { } /** - * Same as [classAwareAdapter], but also saves the contents of a static field named "VERSION" and later + * Same as [classAwareAdapter], but also saves the contents of a static int field named "VERSION" and later * checks it against the actual contents of this field when loading. * Helps to work around the changes in the class field layout; * a version change is more self-explanatory than a random field reading failure. */ - @JvmStatic fun classAndVersionAdapter(gson: Gson, factory: TypeAdapterFactory, - fqnField: String, versionField: String): TypeAdapter { + @JvmStatic fun classAndVersionAdapter( + gson: Gson, + factory: TypeAdapterFactory, + fqnField: String, + versionField: String + ): TypeAdapter { return object : TypeAdapter() { override fun write(out: JsonWriter, value: T) { val modelFQN = value.javaClass.name - val modelVersion = getSerializationFormatVersion(value.javaClass) + val modelVersion = getSerializationFormatVersion(value.javaClass).toInt() val delegate = gson.getDelegateAdapter(factory, TypeToken.get(value.javaClass)) @@ -127,9 +131,11 @@ object GSONUtil { val element = Streams.parse(`in`) val modelFQNElement = element.asJsonObject.remove(fqnField) val modelFormatVersElement = element.asJsonObject.remove(versionField) - if (modelFQNElement == null || modelFormatVersElement == null) { - deserializationError("Model class name (%s) or version field (%s) is missing." - + " Please, recalculate the model.", fqnField, versionField) + if (modelFQNElement == null) { + deserializationError("Class name ($fqnField) is missing.") + } + if (modelFormatVersElement == null) { + deserializationError("Version field ($versionField) is missing.") } val fqn = modelFQNElement!!.asString val vers = modelFormatVersElement!!.asString @@ -138,15 +144,15 @@ object GSONUtil { try { aClass = Class.forName(fqn) as Class } catch (e: ClassNotFoundException) { - deserializationError("Cannot load model class %s", fqn, cause = e) + deserializationError("Cannot load class %s", fqn, cause = e) return null } val recentVersion = getSerializationFormatVersion(aClass) if (recentVersion != vers) { - deserializationError("Format has changed, '%s' expects '%s' version, but was '%s'", - fqn, recentVersion, vers) + deserializationError("Format has changed, '%s' expects '%s' version, but got '%s'", + fqn, recentVersion, vers) } val adapter = gson.getDelegateAdapter(factory, TypeToken.get(aClass)) @@ -161,8 +167,8 @@ object GSONUtil { return (aClass.getDeclaredField("VERSION").get(null)).toString() } catch (e: Exception) { deserializationError("Cannot get serialization format version." + - " Probably VERSION field is missing in %s. Exception message: %s", - aClass.name, e.message, cause = e) + " Probably VERSION field is missing in %s. Exception message: %s", + aClass.name, e.message, cause = e) return "" // this statement can never be reached } } diff --git a/src/main/kotlin/org/jetbrains/bio/statistics/mixture/Internals.kt b/src/main/kotlin/org/jetbrains/bio/statistics/mixture/Internals.kt index 876e746..b31b7b0 100644 --- a/src/main/kotlin/org/jetbrains/bio/statistics/mixture/Internals.kt +++ b/src/main/kotlin/org/jetbrains/bio/statistics/mixture/Internals.kt @@ -14,7 +14,7 @@ import org.jetbrains.bio.viktor._I object MixtureInternals { @JvmStatic fun predict(logGammas: F64Array): IntArray { - return IntArray(logGammas.shape[0]) { logGammas.V[_I, it].argMax() } + return IntArray(logGammas.shape[1]) { logGammas.V[_I, it].argMax() } } @JvmStatic diff --git a/src/main/kotlin/org/jetbrains/bio/statistics/mixture/MLAbstractMixture.kt b/src/main/kotlin/org/jetbrains/bio/statistics/mixture/MLAbstractMixture.kt index 56b9d2d..e4acf36 100644 --- a/src/main/kotlin/org/jetbrains/bio/statistics/mixture/MLAbstractMixture.kt +++ b/src/main/kotlin/org/jetbrains/bio/statistics/mixture/MLAbstractMixture.kt @@ -14,7 +14,7 @@ import org.jetbrains.bio.viktor.F64Array * @since 06/09/13 */ abstract class MLAbstractMixture(protected val numComponents: Int, weights: F64Array) : ClassificationModel { - protected val logWeights: F64Array = weights.log() + val logWeights: F64Array = weights.log() val weights: F64Array get() = logWeights.exp() diff --git a/src/main/kotlin/org/jetbrains/bio/statistics/mixture/MLFreeMixture.kt b/src/main/kotlin/org/jetbrains/bio/statistics/mixture/MLFreeMixture.kt index cdcebc5..c3b4180 100644 --- a/src/main/kotlin/org/jetbrains/bio/statistics/mixture/MLFreeMixture.kt +++ b/src/main/kotlin/org/jetbrains/bio/statistics/mixture/MLFreeMixture.kt @@ -47,6 +47,16 @@ abstract class MLFreeMixture(numComponents: Int, return res.with("state", states) } + open fun sample(df:DataFrame, d: IntArray) { + val states = sampleStates(df.rowsNumber) + for (t in 0 until numDimensions) { + for (i in 0 until numComponents) { + getEmissionScheme(i, t).sample(df, d[t], IntPredicate { states[it] == i }) + } + + } + } + override fun degreesOfFreedom(): Int { var res = super.degreesOfFreedom() for (i in 0 until numComponents) { diff --git a/src/test/kotlin/org/jetbrains/bio/Tests.kt b/src/test/kotlin/org/jetbrains/bio/Tests.kt index 568c02f..297e0c7 100644 --- a/src/test/kotlin/org/jetbrains/bio/Tests.kt +++ b/src/test/kotlin/org/jetbrains/bio/Tests.kt @@ -1,5 +1,7 @@ package org.jetbrains.bio +import kotlin.math.abs +import kotlin.test.assertEquals import kotlin.test.assertFalse import kotlin.test.assertTrue @@ -49,4 +51,25 @@ object Tests { "Regex ${regex.pattern} doesn't match content:\n<$output>" ) + fun assertDeepEquals(expected: Array<*>, actual: Array<*>) { + assertTrue(expected.contentDeepEquals(actual), "Array contents differ") + } + + fun assertEquals(expected: Double, actual: Double, precision: Double, message: String?) { + assertTrue( + abs(expected - actual) < precision, + message ?: "Expected $expected and actual $actual values differ by more than $precision." + ) + } + + fun assertEquals(expected: DoubleArray, actual: DoubleArray, precision: Double) { + assertEquals(expected.size, actual.size, "Array sizes differ") + expected.indices.forEach { + assertEquals( + expected[it], actual[it], precision, + "Arrays differ at position $it: expected ${expected[it]}, actual ${actual[it]}." + ) + } + } + } \ No newline at end of file diff --git a/src/test/kotlin/org/jetbrains/bio/genome/sequence/CPgContentTest.kt b/src/test/kotlin/org/jetbrains/bio/genome/sequence/CPgContentTest.kt index e2ad05f..5a9e534 100644 --- a/src/test/kotlin/org/jetbrains/bio/genome/sequence/CPgContentTest.kt +++ b/src/test/kotlin/org/jetbrains/bio/genome/sequence/CPgContentTest.kt @@ -1,7 +1,11 @@ package org.jetbrains.bio.genome.sequence +import org.jetbrains.bio.Tests +import org.jetbrains.bio.genome.Genome +import org.jetbrains.bio.genome.toQuery import org.junit.Test import kotlin.test.assertEquals +import kotlin.test.assertTrue class CpGContentTest { @Test fun testComputeCpG() { @@ -22,4 +26,25 @@ class CpGContentTest { assertEquals(CpGContent.HCP, CpGContent.classify("CCCGCG".asNucleotideSequence(), 5)) assertEquals(CpGContent.HCP, CpGContent.classify("CGCGCG".asNucleotideSequence(), 5)) } + + @Test fun testBinnedMeanCG() { + val chr = Genome["to1"].toQuery()["chr1"]!! + val binned100 = CpGContent.binnedMeanCG(chr, 100) + val binned200 = CpGContent.binnedMeanCG(chr, 200) + assertEquals((chr.length - 1) / 100 + 1, binned100.size) + assertEquals((chr.length - 1) / 200 + 1, binned200.size) + binned200.forEachIndexed { index, value -> + assertTrue(value >= 0, "Mean CG content was negative ($value at bin $index)") + assertTrue(value <= 1, "Mean CG content was greater than 1 ($value at bin $index)") + if (index < binned200.size -1) { + Tests.assertEquals( + (binned100[2 * index] + binned100[2 * index + 1]) / 2, + value, + 1E-10, // we want to ignore double arithmetic precision issues + "Expected mean CG $value at 200 bps bin $index to be equal to mean of 100bps bins " + + "${2 * index} (${binned100[2 * index]}) and " + + "${2 * index + 1} (${binned100[2 * index + 1]}).") + } + } + } } diff --git a/src/test/kotlin/org/jetbrains/bio/statistics/ClassificationModelTest.kt b/src/test/kotlin/org/jetbrains/bio/statistics/ClassificationModelTest.kt index 5a738b5..2b8a863 100644 --- a/src/test/kotlin/org/jetbrains/bio/statistics/ClassificationModelTest.kt +++ b/src/test/kotlin/org/jetbrains/bio/statistics/ClassificationModelTest.kt @@ -27,7 +27,7 @@ class ClassificationModelTest { " \"value\": 1,\n" + " \"name\": \"bboo\",\n" + " \"model.class.fqn\": \"org.jetbrains.bio.statistics.Boo\",\n" + - " \"model.class.format\": \"222\"\n}", + " \"model.class.format\": 222\n}", path.read()) } @@ -48,8 +48,7 @@ class ClassificationModelTest { path.bufferedWriter().use { gson.toJson(obj, it) } thrown.expect(JsonParseException::class.java) - thrown.expectMessage("Deserialization error: Model class name (model.class.fqn) or version" + - " field (model.class.format) is missing. Please, recalculate the model.") + thrown.expectMessage("Deserialization error: Class name (model.class.fqn) is missing.") ClassificationModel.load(path) } } @@ -60,7 +59,7 @@ class ClassificationModelTest { assertEquals("{\n \"value\": NaN,\n " + "\"name\": \"NaN-boo\",\n " + "\"model.class.fqn\": \"org.jetbrains.bio.statistics.NanBoo\",\n " + - "\"model.class.format\": \"0\"\n}", + "\"model.class.format\": 0\n}", path.read()) assertEquals(nanBoo, ClassificationModel.load(path)) @@ -76,7 +75,7 @@ class ClassificationModelTest { thrown.expect(JsonParseException::class.java) thrown.expectMessage("Deserialization error: Format has changed, " + - "'org.jetbrains.bio.statistics.Boo' expects '123' version, but was '222'") + "'org.jetbrains.bio.statistics.Boo' expects '123' version, but got '222'") ClassificationModel.load(path) } } @@ -92,7 +91,7 @@ class ClassificationModelTest { thrown.expect(JsonParseException::class.java) thrown.expectMessage("Deserialization error: Format has changed, " + - "'org.jetbrains.bio.statistics.Boo' expects '666' version, but was '222'") + "'org.jetbrains.bio.statistics.Boo' expects '666' version, but got '222'") ClassificationModel.load(path) } diff --git a/src/test/kotlin/org/jetbrains/bio/statistics/emission/WLSRegressionTest.kt b/src/test/kotlin/org/jetbrains/bio/statistics/emission/WLSRegressionTest.kt new file mode 100644 index 0000000..2e969cb --- /dev/null +++ b/src/test/kotlin/org/jetbrains/bio/statistics/emission/WLSRegressionTest.kt @@ -0,0 +1,86 @@ +package org.jetbrains.bio.statistics.emission + +import org.jetbrains.bio.Tests +import org.jetbrains.bio.viktor.asF64Array +import org.junit.Assert.assertEquals +import org.junit.Test +import kotlin.test.assertFails + +class WLSRegressionTest { + + private val matrix123456 = arrayOf( + doubleArrayOf(1.0, 2.0, 3.0), + doubleArrayOf(4.0, 5.0, 6.0) + ) + + @Test fun testDesignMatrix() { + // column size mismatch + assertFails { + WLSRegression.designMatrix(arrayOf(doubleArrayOf(1.0), doubleArrayOf(2.0, 3.0))) + } + // empty matrix + assertFails { + WLSRegression.designMatrix(arrayOf()) + } + Tests.assertDeepEquals( + arrayOf( + doubleArrayOf(1.0, 1.0, 1.0), + doubleArrayOf(1.0, 2.0, 3.0), + doubleArrayOf(4.0, 5.0, 6.0) + ), + WLSRegression.designMatrix(matrix123456) + ) + } + + @Test fun testCalculateEta() { + assertFails { + WLSRegression.calculateEta(emptyArray(), doubleArrayOf(1.0)) + } + assertFails { + WLSRegression.calculateEta(matrix123456, doubleArrayOf()) + } + assertFails { + WLSRegression.calculateEta(matrix123456, doubleArrayOf(1.0, 2.0, 3.0)) + } + assertEquals( + doubleArrayOf(39.0, 54.0, 69.0).asF64Array(), + WLSRegression.calculateEta(matrix123456, doubleArrayOf(7.0, 8.0)) + ) + } + + @Test fun testCalculateBeta() { + assertFails { + WLSRegression.calculateBeta(emptyArray(), doubleArrayOf(1.0).asF64Array(), doubleArrayOf(2.0).asF64Array()) + } + assertFails { + WLSRegression.calculateBeta(matrix123456, doubleArrayOf().asF64Array(), doubleArrayOf().asF64Array()) + } + assertFails { + WLSRegression.calculateBeta( + matrix123456, + doubleArrayOf(1.0, 2.0).asF64Array(), + doubleArrayOf(3.0, 4.0, 5.0).asF64Array() + ) + } + assertFails { + WLSRegression.calculateBeta( + matrix123456, + doubleArrayOf(1.0, 2.0, 3.0).asF64Array(), + doubleArrayOf(4.0, 5.0).asF64Array() + ) + } + // data and reference values generated by R: + // > X = data.frame(x1=runif(4), x2=runif(4), y=runif(4)) + // > w = runif(4) + // > lm(y ~ x1 + x2, data=X, weights=w) + val x = arrayOf( + doubleArrayOf(0.0130203, 0.5831732, 0.1911626, 0.7604341), + doubleArrayOf(0.76642767, 0.03048108, 0.52236819, 0.65585551) + ) + val y = doubleArrayOf(0.5986931, 0.8171509, 0.5662360, 0.4927709).asF64Array() + val weights = doubleArrayOf(0.180464329, 0.825891734, 0.008628437, 0.696381745).asF64Array() + val expectedBeta = doubleArrayOf(0.95045, -0.20516, -0.45993) + val actualBeta = WLSRegression.calculateBeta(WLSRegression.designMatrix(x), y, weights) + Tests.assertEquals(expectedBeta, actualBeta, 1E-5) + } +}