JetBrains-Research · olegs · Sep 27, 2019 · May 4, 2019 · May 10, 2019 · May 13, 2019
diff --git a/src/main/kotlin/org/jetbrains/bio/genome/Genome.kt b/src/main/kotlin/org/jetbrains/bio/genome/Genome.kt
@@ -243,6 +243,7 @@ class Genome private constructor(
                 genesGTFPath: Path? = null,
                 genesDescriptionsPath: Path? = null
         ) = getOrAdd(build, true) {
+            val chromSizesDir = chromSizesPath.parent
             Genome(
                 build,
                 annotationsConfig = annotationsConfig,
@@ -253,7 +254,7 @@ class Genome private constructor(
                 cytobandsPath = cytobandsPath,
                 repeatsPath = repeatsPath,
                 gapsPath = gapsPath,
-                twoBitPath = twoBitPath,
+                twoBitPath = twoBitPath ?: (chromSizesDir / "$build.2bit").let { if (it.exists) it else null },
                 genesGTFPath = genesGTFPath,
                 genesDescriptionsPath = genesDescriptionsPath
 

diff --git a/src/main/kotlin/org/jetbrains/bio/genome/PeaksInfo.kt b/src/main/kotlin/org/jetbrains/bio/genome/PeaksInfo.kt
@@ -26,11 +26,13 @@ object PeaksInfo {
 
     private fun Long.formatLongNumber() = String.format("%,d", this).replace(',', ' ')
 
-    fun compute(genomeQuery: GenomeQuery,
-                peaksStream: Stream<Location>,
-                src: URI?,
-                paths: List<Path>,
-                fragment: Fragment = AutoFragment): Map<String, String> {
+    fun compute(
+            genomeQuery: GenomeQuery,
+            peaksStream: Stream<Location>,
+            src: URI?,
+            paths: List<Path>,
+            fragment: Fragment = AutoFragment
+    ): Map<String, String> {
         val peaks = peaksStream.collect(Collectors.toList())
         val peaksLengths = peaks.map { it.length().toDouble() }.toDoubleArray()
         val peaksCount = peaksLengths.count()

diff --git a/src/main/kotlin/org/jetbrains/bio/genome/TestOrganismDataGenerator.kt b/src/main/kotlin/org/jetbrains/bio/genome/TestOrganismDataGenerator.kt
@@ -1,9 +1,12 @@
 package org.jetbrains.bio.genome
 
 import com.google.common.math.IntMath
+import gnu.trove.list.array.TFloatArrayList
 import org.apache.commons.csv.CSVFormat
 import org.apache.log4j.Logger
 import org.jetbrains.bio.Configuration
+import org.jetbrains.bio.big.BigWigFile
+import org.jetbrains.bio.big.FixedStepSection
 import org.jetbrains.bio.genome.sequence.Nucleotide
 import org.jetbrains.bio.genome.sequence.TwoBitWriter
 import org.jetbrains.bio.io.FastaRecord
@@ -26,7 +29,8 @@ object TestOrganismDataGenerator {
             "chr2" to IntMath.pow(10, 6),
             "chr3" to IntMath.pow(10, 6),
             "chrX" to IntMath.pow(10, 6),
-            "chrM" to IntMath.pow(10, 6))
+            "chrM" to IntMath.pow(10, 6)
+    )
 
     @JvmStatic
     fun main(args: Array<String>) {
@@ -58,6 +62,7 @@ object TestOrganismDataGenerator {
         generateCytobands(genome)
         generateRepeats(genome)
         generateCGI(genome)
+        generateMapability(genome)
         LOG.info("Done")
     }
 
@@ -278,4 +283,28 @@ object TestOrganismDataGenerator {
             """.trimMargin().trim())
         }
     }
+
+    /**
+     * Generates a mapability track: a wiggle track holding 0 for non-mappable nucleotides
+     * and 1 for mappable ones (here roughly one position in five is randomly marked 0).
+     *
+     * The track is written next to "chrom.sizes" as "mapability.bigWig". Only chrX gets data,
+     * so that the genome mean substitution for a no-data chromosome can be tested.
+     * The values are filled into a primitive [FloatArray] to avoid boxing one Float per nucleotide.
+     */
+    private fun generateMapability(genome: Genome) {
+        LOG.info("Generating mapability bigWig")
+        val path = genome.chromSizesPath.parent / "mapability.bigWig"
+        val gq = genome.toQuery()
+        val chrX = checkNotNull(gq["chrX"]) { "chrX is missing from the test genome query" }
+        val random = ThreadLocalRandom.current()
+        val section = FixedStepSection(
+            chrX.name,
+            start = 0,
+            values = TFloatArrayList(
+                FloatArray(chrX.length) { if (random.nextInt(5) == 4) 0.0f else 1.0f }
+            )
+        )
+        BigWigFile.write(listOf(section), gq.get().map { it.name to it.length }, path)
+    }
 }
diff --git a/src/main/kotlin/org/jetbrains/bio/genome/sequence/CpGContent.kt b/src/main/kotlin/org/jetbrains/bio/genome/sequence/CpGContent.kt
@@ -1,6 +1,7 @@
 package org.jetbrains.bio.genome.sequence
 
 import com.google.common.annotations.VisibleForTesting
+import org.jetbrains.bio.genome.Chromosome
 import org.jetbrains.bio.genome.Location
 
 /**
@@ -130,6 +131,18 @@ enum class CpGContent {
             }
             return cg
         }
+
+        /**
+         * Slices [chromosome] via `range.slice(binSize)`; returns the fraction of 'c'/'g' positions per bin.
+         */
+        fun binnedMeanCG(chromosome: Chromosome, binSize: Int): DoubleArray {
+            val nucleotides = chromosome.sequence
+            return chromosome.range.slice(binSize).mapToDouble { bin ->
+                val offsets = bin.startOffset until bin.endOffset
+                val cgCount = offsets.count { nucleotides.charAt(it) in "cg" }
+                cgCount.toDouble() / bin.length()
+            }.toArray()
+        }
     }
 }
 
diff --git a/src/main/kotlin/org/jetbrains/bio/statistics/ClassificationModel.kt b/src/main/kotlin/org/jetbrains/bio/statistics/ClassificationModel.kt
@@ -162,22 +162,29 @@ interface Fitter<out Model : ClassificationModel> {
      * @param maxIter an upper bound on fitting iterations (if applicable).
      * @return guessed classification model.
      */
-    fun guess(preprocessed: Preprocessed<DataFrame>,
-              title: String, threshold: Double, maxIter: Int, attempt: Int): Model
-
-    fun guess(preprocessed: List<Preprocessed<DataFrame>>,
-              title: String, threshold: Double, maxIter: Int, attempt: Int): Model =
-            guess(preprocessed.first(), title, threshold, maxIter, attempt)
-
-    fun fit(preprocessed: Preprocessed<DataFrame>,
+    fun guess(
+            preprocessed: Preprocessed<DataFrame>,
+            title: String, threshold: Double, maxIter: Int, attempt: Int
+    ): Model
+
+    fun guess( // default for several samples: guess from the first one only
+            preprocessed: List<Preprocessed<DataFrame>>,
+            title: String, threshold: Double, maxIter: Int, attempt: Int
+    ): Model = guess(preprocessed.first(), title, threshold, maxIter, attempt)
+
+    fun fit(
+            preprocessed: Preprocessed<DataFrame>,
             title: String = TITLE, threshold: Double = THRESHOLD,
             maxIter: Int = MAX_ITERATIONS,
-            attempt: Int = 0): Model = fit(listOf(preprocessed), title, threshold, maxIter, attempt)
+            attempt: Int = 0
+    ): Model = fit(listOf(preprocessed), title, threshold, maxIter, attempt)
 
-    fun fit(preprocessed: List<Preprocessed<DataFrame>>,
+    fun fit(
+            preprocessed: List<Preprocessed<DataFrame>>,
             title: String = TITLE, threshold: Double = THRESHOLD,
             maxIter: Int = MAX_ITERATIONS,
-            attempt: Int = 0): Model {
+            attempt: Int = 0
+    ): Model {
         require(threshold > 0) { "threshold $threshold must be >0" }
         require(maxIter > 0) { "maximum number of iterations $maxIter must be >0" }
 
@@ -199,8 +206,10 @@ interface Fitter<out Model : ClassificationModel> {
             require(multiStarts > 1) { "number of starts $multiStarts must be >1" }
         }
 
-        override fun fit(preprocessed: Preprocessed<DataFrame>, title: String,
-                         threshold: Double, maxIter: Int, attempt: Int): Model {
+        override fun fit(
+                preprocessed: Preprocessed<DataFrame>, title: String,
+                         threshold: Double, maxIter: Int, attempt: Int
+        ): Model {
             require(attempt == 0) {
                 "cyclic multistart is not allowed"
             }
@@ -218,8 +227,10 @@ interface Fitter<out Model : ClassificationModel> {
             return msModel
         }
 
-        override fun fit(preprocessed: List<Preprocessed<DataFrame>>, title: String,
-                         threshold: Double, maxIter: Int, attempt: Int): Model {
+        override fun fit(
+                preprocessed: List<Preprocessed<DataFrame>>, title: String,
+                         threshold: Double, maxIter: Int, attempt: Int
+        ): Model {
             require(attempt == 0) {
                 "cyclic multistart is not allowed"
             }