From 909031f1e47bc8ab5f70e481d44148c6541eb381 Mon Sep 17 00:00:00 2001 From: Mikolaj Adam Komar Date: Thu, 15 Feb 2024 14:42:33 +0100 Subject: [PATCH 1/2] Add custom gtf files handling to GenesTrackView --- .../org/jetbrains/bio/genome/Transcripts.kt | 161 +++++++++--------- 1 file changed, 85 insertions(+), 76 deletions(-) diff --git a/src/main/kotlin/org/jetbrains/bio/genome/Transcripts.kt b/src/main/kotlin/org/jetbrains/bio/genome/Transcripts.kt index 238b787..9166c2d 100644 --- a/src/main/kotlin/org/jetbrains/bio/genome/Transcripts.kt +++ b/src/main/kotlin/org/jetbrains/bio/genome/Transcripts.kt @@ -22,8 +22,8 @@ import kotlin.math.min * @author Sergei Lebedev */ enum class GeneClass( - val id: String, val description: String, - private val predicate: (Transcript) -> Boolean + val id: String, val description: String, + private val predicate: (Transcript) -> Boolean ) { CODING("coding", "coding genes", { it.isCoding }), NON_CODING("non-coding", "non-coding genes", { !it.isCoding }), @@ -92,19 +92,19 @@ class GeneAliasMap internal constructor(private val transcript: Transcript) : Ma /** This is actually a transcript, not a gene. */ class Transcript( - /** Ensembl transcript ID. */ - val ensemblId: String, - /** Ensembl gene ID. */ - val ensemblGeneId: String, - /** Gene symbol associated with this transcript. */ - val geneSymbol: String, - /** Transcript location. */ - override val location: Location, - /** Coding sequence range or `null` for non-coding transcripts. */ - val cdsRange: Range?, - private val utr3End5: Int, // Could be out of transcript bound if not defined - /** A list of sorted exon ranges. Empty for non-coding transcripts. */ - private val exonRanges: List + /** Ensembl transcript ID. */ + val ensemblId: String, + /** Ensembl gene ID. */ + val ensemblGeneId: String, + /** Gene symbol associated with this transcript. */ + val geneSymbol: String, + /** Transcript location. */ + override val location: Location, + /** Coding sequence range or `null` for non-coding transcripts. */ + val cdsRange: Range?, + private val utr3End5: Int, // Could be out of transcript bound if not defined + /** A list of sorted exon ranges. Empty for non-coding transcripts. */ + private val exonRanges: List ) : LocationAware { init { @@ -195,9 +195,9 @@ class Transcript( * Simplified notion of gene as a collection of [Transcript] */ data class Gene( - val ensemblGeneId: String, - val geneSymbol: String, - val transcripts: Collection + val ensemblGeneId: String, + val geneSymbol: String, + val transcripts: Collection ) { // Gene location can be considered as longest isoform (according to the talk at ECCB-2018) or union of isoforms. val location: Location @@ -208,7 +208,7 @@ data class Gene( val ranges = locations.map { it.toRange() } location = ranges.fold(ranges.first(), Range::union) - .on(locations.first().chromosome, locations.first().strand) + .on(locations.first().chromosome, locations.first().strand) } } @@ -231,19 +231,19 @@ object Transcripts { private val TRANSCRIPTS_CACHE = cache() private val BOUND5_INDEX_CACHE = CacheBuilder.newBuilder() - .softValues() - .initialCapacity(1) - .build, Map, IntArray, BinaryLut>>>() + .softValues() + .initialCapacity(1) + .build, Map, IntArray, BinaryLut>>>() /** Visible only for [TestOrganismDataGenerator]. */ val GSON = GsonBuilder() - .registerTypeAdapter(Range::class.java, Range.ADAPTER) - .registerTypeAdapter(Location::class.java, Location.ADAPTER) - .create() + .registerTypeAdapter(Range::class.java, Range.ADAPTER) + .registerTypeAdapter(Location::class.java, Location.ADAPTER) + .create() fun bound5Index( - genome: Genome, - onlyCodingGenes: Boolean + genome: Genome, + onlyCodingGenes: Boolean ) = BOUND5_INDEX_CACHE.get(genome to onlyCodingGenes) { val allTranscripts = all(genome) @@ -258,14 +258,23 @@ object Transcripts { fun all(genome: Genome): ListMultimap { return TRANSCRIPTS_CACHE.get(genome) { try { - loadTranscripts(genome, false) + loadTranscripts(genome, null, false) } catch (e: Exception) { LOG.warn("Failed to load transcripts for ${genome.build}. Trying to clear caches. Error:", e) - loadTranscripts(genome, true) + loadTranscripts(genome, null, true) } } } + fun allFromFile(gtfFile: Path, genome: Genome): ListMultimap { + return try { + loadTranscripts(genome, gtfFile, false) + } catch (e: Exception) { + LOG.warn("Failed to load transcripts for ${genome.build}. Trying to clear caches. Error:", e) + loadTranscripts(genome, gtfFile, true) + } + } + /** * Just container for 2 vars. By some reason Pair is being deserialized into LinkedTreeMap instead of Pair. */ @@ -276,8 +285,8 @@ object Transcripts { return genesGtfPath.withExtension("json.gz") } - private fun loadTranscripts(genome: Genome, cleanCache: Boolean): ListMultimap { - val gtfFile = genome.genesGtfPath(false) + private fun loadTranscripts(genome: Genome, inputGtfFile: Path?, cleanCache: Boolean): ListMultimap { + val gtfFile = inputGtfFile?: genome.genesGtfPath(false) val cachedTranscripts = cachedTranscriptsJsonPath(gtfFile) if (cleanCache) { @@ -285,24 +294,24 @@ object Transcripts { } cachedTranscripts.checkOrRecalculate("Processing genes") { (path) -> - // ensure gtf file exists - genome.genesGtfPath(true) + // ensure gtf file exists if default one is used + if (inputGtfFile == null) genome.genesGtfPath(true) val transcripts = LOG.time(level = Level.INFO, message = "Parsing genes $gtfFile") { gtfFile.bufferedReader().use { reader -> GtfReader(reader, genome).readTranscripts() - // sort by 5' offset then by ensemblId (to be deterministic), - // it is required for bound5Index() correct work, - // please don't change it to sort by start offset - .sortedWith(compareBy({ it.location.get5Bound() }, { it.ensemblId })) + // sort by 5' offset then by ensemblId (to be deterministic), + // it is required for bound5Index() correct work, + // please don't change it to sort by start offset + .sortedWith(compareBy({ it.location.get5Bound() }, { it.ensemblId })) } } path.bufferedWriter().use { GSON.toJson(JsonTranscriptome(FORMAT_VERSION, transcripts), it) } } val transcripts = LOG.time( - level = Level.INFO, - message = "Loading genes $gtfFile" + level = Level.INFO, + message = "Loading genes $gtfFile" ) { loadFromJson(cachedTranscripts) } @@ -327,32 +336,32 @@ object Transcripts { * See corresponding internal methods for more information. */ fun associatedTranscripts( - location: Location, - strategy: AssociationStrategy = AssociationStrategy.SINGLE, - limit: Int = 1000000, - codingOnly: Boolean = false + location: Location, + strategy: AssociationStrategy = AssociationStrategy.SINGLE, + limit: Int = 1000000, + codingOnly: Boolean = false ): List = when (strategy) { AssociationStrategy.SINGLE -> associatedTranscriptsSingle( - location, limit, codingOnly + location, limit, codingOnly ) AssociationStrategy.TWO -> associatedTranscriptsTwo( - location, limit, codingOnly + location, limit, codingOnly ) AssociationStrategy.BASAL_PLUS_EXT -> associatedTranscriptsPlus( - location, codingOnly = codingOnly, distal = limit + location, codingOnly = codingOnly, distal = limit ) AssociationStrategy.MULTIPLE -> associatedTranscriptsMultiple( - location, codingOnly = codingOnly, limit = limit + location, codingOnly = codingOnly, limit = limit ) } fun associatedTranscriptsSingle( - location: Location, - limit: Int = 1000000, - codingOnly: Boolean = false + location: Location, + limit: Int = 1000000, + codingOnly: Boolean = false ): List { // XXX GREAT: "single nearest gene" strategy: we use it because it simple, feel free to change it to // XXX "basal plus ext" or "two nearest genes" @@ -372,7 +381,7 @@ object Transcripts { val chr = location.chromosome val (transcripts, bounds5, bound5Lut) = bound5Index( - chr.genome, onlyCodingGenes = codingOnly + chr.genome, onlyCodingGenes = codingOnly ).getValue(chr) val midpoint = (location.startOffset + location.endOffset) / 2 @@ -395,9 +404,9 @@ object Transcripts { } fun associatedTranscriptsTwo( - location: Location, - limit: Int = 1000000, - codingOnly: Boolean = false + location: Location, + limit: Int = 1000000, + codingOnly: Boolean = false ): List { // XXX GREAT: "two nearest genes" strategy (http://great.stanford.edu/) @@ -416,7 +425,7 @@ object Transcripts { val chr = location.chromosome val (transcripts, bounds5, bound5Lut) = bound5Index( - chr.genome, onlyCodingGenes = codingOnly + chr.genome, onlyCodingGenes = codingOnly ).getValue(chr) val midpoint = (location.startOffset + location.endOffset) / 2 @@ -435,8 +444,8 @@ object Transcripts { // TODO: seems is incorrect behaviour, // instead of two nearest transcripts of same gene it should be two different nearest genes val candidates = (low..high) - .filter { abs(bounds5[it] - midpoint) <= limit } - .map { transcripts[it] } + .filter { abs(bounds5[it] - midpoint) <= limit } + .map { transcripts[it] } if (candidates.size <= 2) { return candidates @@ -447,14 +456,14 @@ object Transcripts { } fun associatedTranscriptsMultiple( - location: Location, - limit: Int = 1000000, - closeGenesAreaThreshold: Int = 50000, - codingOnly: Boolean = false + location: Location, + limit: Int = 1000000, + closeGenesAreaThreshold: Int = 50000, + codingOnly: Boolean = false ): List { val chr = location.chromosome val (transcripts, bounds5, bound5Lut) = bound5Index( - chr.genome, onlyCodingGenes = codingOnly + chr.genome, onlyCodingGenes = codingOnly ).getValue(chr) val midpoint = (location.startOffset + location.endOffset) / 2 @@ -477,11 +486,11 @@ object Transcripts { } fun associatedTranscriptsPlus( - location: Location, - upstream: Int = 5000, - downstream: Int = 1000, - distal: Int = 1000000, - codingOnly: Boolean = false + location: Location, + upstream: Int = 5000, + downstream: Int = 1000, + distal: Int = 1000000, + codingOnly: Boolean = false ): List { // XXX GREAT: "basal plus extension" strategy @@ -503,7 +512,7 @@ object Transcripts { val chr = location.chromosome val (transcripts, bounds5, bound5Lut) = bound5Index( - chr.genome, onlyCodingGenes = codingOnly + chr.genome, onlyCodingGenes = codingOnly ).getValue(chr) val midpoint = (location.startOffset + location.endOffset) / 2 @@ -513,34 +522,34 @@ object Transcripts { if (lowBas != -1) { val candidates = (lowBas..highBas).filter { bounds5[it] - midpoint in transcripts[it].strand.choose( - -downstream..upstream, -upstream..downstream + -downstream..upstream, -upstream..downstream ) } - .map { transcripts[it] } + .map { transcripts[it] } if (candidates.isNotEmpty()) return candidates } // check extended regulatory domain -- this is very tricky; we do it mostly separately for the two strands val plus = bound5Lut.framingElem(bounds5, midpoint) { transcripts[it].strand == Strand.PLUS } val minus = bound5Lut.framingElem(bounds5, midpoint) { transcripts[it].strand == Strand.MINUS } val leftBasalBoundaryPlus = plus.map { bounds5[it] + downstream } - .filter { it < midpoint }.maxOrNull() + .filter { it < midpoint }.maxOrNull() val rightBasalBoundaryPlus = plus.map { bounds5[it] - upstream } - .filter { it > midpoint }.minOrNull() + .filter { it > midpoint }.minOrNull() val leftBasalBoundaryMinus = minus.map { bounds5[it] + upstream } - .filter { it < midpoint }.maxOrNull() + .filter { it < midpoint }.maxOrNull() val rightBasalBoundaryMinus = minus.map { bounds5[it] - downstream } - .filter { it > midpoint }.minOrNull() + .filter { it > midpoint }.minOrNull() val leftBasalBoundary = when { leftBasalBoundaryPlus == null -> leftBasalBoundaryMinus leftBasalBoundaryMinus == null -> leftBasalBoundaryPlus else -> max(leftBasalBoundaryPlus, leftBasalBoundaryMinus) - .let { if (midpoint - it <= distal) it else null } + .let { if (midpoint - it <= distal) it else null } } val rightBasalBoundary = when { rightBasalBoundaryPlus == null -> rightBasalBoundaryMinus rightBasalBoundaryMinus == null -> rightBasalBoundaryPlus else -> min(rightBasalBoundaryPlus, rightBasalBoundaryMinus) - .let { if (it - midpoint <= distal) it else null } + .let { if (it - midpoint <= distal) it else null } } val resultsPlus = plus.filter { bounds5[it] + downstream == leftBasalBoundary || bounds5[it] - upstream == rightBasalBoundary From bc4f59c5f530da1aa1ff5dfcc2811ba8372de88f Mon Sep 17 00:00:00 2001 From: Mikolaj Adam Komar Date: Thu, 15 Feb 2024 16:57:22 +0100 Subject: [PATCH 2/2] Update test data and don't allow cashing of gtf files --- src/main/kotlin/org/jetbrains/bio/genome/Transcripts.kt | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/main/kotlin/org/jetbrains/bio/genome/Transcripts.kt b/src/main/kotlin/org/jetbrains/bio/genome/Transcripts.kt index 9166c2d..0707768 100644 --- a/src/main/kotlin/org/jetbrains/bio/genome/Transcripts.kt +++ b/src/main/kotlin/org/jetbrains/bio/genome/Transcripts.kt @@ -267,12 +267,7 @@ object Transcripts { } fun allFromFile(gtfFile: Path, genome: Genome): ListMultimap { - return try { - loadTranscripts(genome, gtfFile, false) - } catch (e: Exception) { - LOG.warn("Failed to load transcripts for ${genome.build}. Trying to clear caches. Error:", e) - loadTranscripts(genome, gtfFile, true) - } + return loadTranscripts(genome, gtfFile, true) } /** @@ -286,7 +281,7 @@ object Transcripts { } private fun loadTranscripts(genome: Genome, inputGtfFile: Path?, cleanCache: Boolean): ListMultimap { - val gtfFile = inputGtfFile?: genome.genesGtfPath(false) + val gtfFile = inputGtfFile ?: genome.genesGtfPath(false) val cachedTranscripts = cachedTranscriptsJsonPath(gtfFile) if (cleanCache) {