Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support RepeatMasker output format for CHM13 (hs1) #15

Merged
merged 2 commits into from
Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/main/kotlin/org/jetbrains/bio/genome/Annotations.kt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import com.google.common.collect.ImmutableListMultimap
import com.google.common.collect.LinkedListMultimap
import com.google.common.collect.ListMultimap
import org.apache.commons.csv.CSVFormat
import org.jetbrains.bio.genome.RepeatsHs1.readHs1
import org.jetbrains.bio.util.*
import java.nio.file.Path
import java.nio.file.StandardOpenOption
Expand Down Expand Up @@ -83,6 +84,10 @@ object Repeats {


private fun read(genome: Genome, repeatsPath: Path): ListMultimap<Chromosome, Repeat> {
if (genome.annotationsConfig?.repeatsUrl?.contains("hs1") == true) {
return readHs1(genome, repeatsPath)
}

val builder = ImmutableListMultimap.builder<Chromosome, Repeat>()
val chromosomes = genome.chromosomeNamesMap
FORMAT.parse(repeatsPath.bufferedReader()).use { csvParser ->
Expand Down
86 changes: 86 additions & 0 deletions src/main/kotlin/org/jetbrains/bio/genome/AnnotationsHs1.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
@file:Suppress("unused")

package org.jetbrains.bio.genome

import com.google.common.collect.ImmutableListMultimap
import com.google.common.collect.ListMultimap
import java.io.BufferedReader
import java.io.FileInputStream
import java.io.InputStreamReader
import java.nio.file.Path
import java.util.zip.GZIPInputStream


object RepeatsHs1 {

internal fun readHs1(genome: Genome, repeatsPath: Path): ListMultimap<Chromosome, Repeat> {
val chromosomes = genome.chromosomeNamesMap

val reader = InputStreamReader(GZIPInputStream(FileInputStream(repeatsPath.toFile())))

val builder = ImmutableListMultimap.builder<Chromosome, Repeat>()
BufferedReader(reader).lines()
.forEach {
val parsingResult = parseHs1RepeatsLine(it, chromosomes)
if (parsingResult != null) builder.put(parsingResult.first, parsingResult.second)
}

return builder.build()
}

private fun isValidHs1RepeatsLine(line: String): Boolean {
val substringPresentOnlyInHeader = " repeat "
val chrMarker = "chr"

return line.isNotBlank() && line.contains(chrMarker) && !line.contains(substringPresentOnlyInHeader)
}

internal fun parseHs1RepeatsLine(line: String, chromosomes: Map<String, Chromosome>): Pair<Chromosome, Repeat>? {
if (!isValidHs1RepeatsLine(line)) {
return null
}

try {
val list = line.trim().split("\\s+".toRegex())
if (list.size != 15) {
return null
}

val chromosome = chromosomes[list[4]] ?: return null
val strand = list[8].toStrand()
val startOffset = list[5].toInt()
val endOffset = list[6].toInt()
val location = Location(startOffset, endOffset, chromosome, strand)

val classFamily = list[10]
val repeatClass: String
val repeatFamily: String
if (classFamily.contains("/")) {
val substrings = classFamily.split("/")
repeatClass = substrings[0].lowercase()
repeatFamily = substrings[1].lowercase()
} else {
repeatClass = classFamily
repeatFamily = classFamily
}

val repeat = Repeat(
list[9], location,
repeatClass,
repeatFamily
)

return Pair(chromosome, repeat)
} catch (e: Exception) {
return null
}
}

fun String.toStrand() = single().toStrand()

fun Char.toStrand() = when (this) {
'+' -> Strand.PLUS
'C' -> Strand.MINUS
else -> throw IllegalStateException("Unknown strand: $this")
}
}
1 change: 1 addition & 0 deletions src/main/resources/annotations.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ genomes:
gtf: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/genes/catLiftOffGenesV1.gtf.gz
chr_alt_name_to_canonical:
- MT: chrM
repeats: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.repeatMasker.out.gz
cytobands: http://t2t.gi.ucsc.edu/chm13/hub/t2t-chm13-v2.0/download/chm13v2.0_cytobands_allchrs.bed.gz
sequence: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.2bit
chromsizes: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.chrom.sizes.txt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ class AnnotationsConfigLoaderTest {
assertNull(
mapping["hs1"]!!.gapsUrl
)
assertEquals(
"http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.repeatMasker.out.gz",
mapping["hs1"]!!.repeatsUrl
)
assertNull(mapping["hg19"]!!.centromeresUrl)
assertEquals(
"http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cpgIslandExt.txt.gz",
Expand Down
56 changes: 56 additions & 0 deletions src/test/kotlin/org/jetbrains/bio/genome/RepeatsHs1ParsingTest.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package org.jetbrains.bio.genome

import org.junit.Test
import kotlin.test.assertEquals
import kotlin.test.assertNotNull
import kotlin.test.assertNull

class RepeatsHs1ParsingTest {

private val chromosomes = mapOf("chr1" to Chromosome(Genome["to1"], "chr1"))

@Test
fun shouldIgnoreFirstLineOfHeader() {
val header1 = "SW perc perc perc query position in query matching repeat position in repeat"
assertNull(RepeatsHs1.parseHs1RepeatsLine(header1, chromosomes))
}

@Test
fun shouldIgnoreSecondLineOfHeader() {
val header2 = "score div. del. ins. sequence begin end (left) repeat class/family begin end (left) ID"
assertNull(RepeatsHs1.parseHs1RepeatsLine(header2, chromosomes))
}

@Test
fun shouldIgnoreEmptyStrings() {
assertNull(RepeatsHs1.parseHs1RepeatsLine("", chromosomes))
}

@Test
fun shouldParseSimpleRepeat() {
val simpleRepeat = " 2590 2.9 0.6 1.0 chr1 2 2705 (248384623) + (ACCCTA)n Simple_repeat 1 2693 (0) 4166944"
val result = RepeatsHs1.parseHs1RepeatsLine(simpleRepeat, chromosomes)
val expectedRepeat = Repeat(name = "(ACCCTA)n", location = Location(2, 2705, chromosomes["chr1"]!!, Strand.PLUS), repeatClass = "Simple_repeat", family = "Simple_repeat")
assertNotNull(result)
assertEquals(expectedRepeat, result.second)
}

@Test
fun shouldParseRepeatOnMinusStrand() {
val minus = " 1134 25.5 9.2 7.6 chr1 4083 4660 (248382668) C LTR60B LTR/ERV1 (0) 765 178 4166946"
val result = RepeatsHs1.parseHs1RepeatsLine(minus, chromosomes)
val expectedRepeat = Repeat(name = "LTR60B", location = Location(4083, 4660, chromosomes["chr1"]!!, Strand.MINUS), repeatClass = "ltr", family = "erv1")
assertNotNull(result)
assertEquals(expectedRepeat, result.second)
}

@Test
fun shouldParseRepeatOnPlusStrand() {
val plus = " 1293 21.3 7.2 1.9 chr1 4664 5263 (248382065) + L1MC3 LINE/L1 4913 5546 (2239) 4166947"
val result = RepeatsHs1.parseHs1RepeatsLine(plus, chromosomes)
val expectedRepeat = Repeat(name = "L1MC3", location = Location(4664, 5263, chromosomes["chr1"]!!, Strand.PLUS), repeatClass = "line", family = "l1")
assertNotNull(result)
assertEquals(expectedRepeat, result.second)
}

}
1 change: 1 addition & 0 deletions src/test/resources/test_annotations.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ genomes:
gtf: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/genes/catLiftOffGenesV1.gtf.gz
chr_alt_name_to_canonical:
- MT: chrM
repeats: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.repeatMasker.out.gz
cytobands: http://t2t.gi.ucsc.edu/chm13/hub/t2t-chm13-v2.0/download/chm13v2.0_cytobands_allchrs.bed.gz
sequence: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.2bit
chromsizes: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.chrom.sizes.txt
Expand Down