Skip to content

Commit

Permalink
Add CHM13 genome (#14)
Browse files Browse the repository at this point in the history
* Add T2T CHM13v2.0 (hs1) to annotations. Make the gaps URL field optional, because the new version of the human genome doesn't have gaps.
* Fill the gaps path only if the gaps URL is present
* Make sure that the CHM13 genome with absent gaps can be successfully loaded from the YAML config

---------

Co-authored-by: Sergey Pestrikov <[email protected]>
  • Loading branch information
serge-p7v and Sergey Pestrikov authored May 3, 2023
1 parent 31b2867 commit 25ab66a
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 14 deletions.
13 changes: 10 additions & 3 deletions src/main/kotlin/org/jetbrains/bio/genome/Annotations.kt
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ object Gaps {
internal fun all(genome: Genome): ListMultimap<Chromosome, Gap> {
return CACHE.get(genome) {
val gapsPath = genome.gapsPath
gapsPath.checkOrRecalculate("Gaps") { output ->
gapsPath?.checkOrRecalculate("Gaps") { output ->
val config = genome.annotationsConfig
requireNotNull(config) {
"Cannot save Gaps info to $gapsPath. Annotations information isn't available for ${genome.build}."
Expand All @@ -217,8 +217,12 @@ object Gaps {
}
}

private fun read(genome: Genome, gapsPath: Path): ListMultimap<Chromosome, Gap> {
private fun read(genome: Genome, gapsPath: Path?): ListMultimap<Chromosome, Gap> {
val builder = ImmutableListMultimap.builder<Chromosome, Gap>()
if (gapsPath == null) {
return builder.build()
}

val chromosomes = genome.chromosomeNamesMap
FORMAT.parse(gapsPath.bufferedReader()).use { csvParser ->
for (row in csvParser) {
Expand All @@ -239,12 +243,15 @@ object Gaps {
val ucscAnnLegacyFormat = config.ucscAnnLegacyFormat

if (ucscAnnLegacyFormat) {
requireNotNull(gapsUrl) {
"Gaps URL is required for UCSC legacy format"
}
// Builds with per-chromosome gap annotations.
val prefix = gapsUrl.substringBeforeLast("/")
val template = "%s_${gapsUrl.substringAfterLast("/")}"
UCSC.downloadBatchTo(gapsPath, genome, "$prefix/", template)
} else {
gapsUrl.downloadTo(gapsPath)
gapsUrl?.downloadTo(gapsPath)

// Builds with separate centromere annotations:
if (config.centromeresUrl != null) {
Expand Down
10 changes: 6 additions & 4 deletions src/main/kotlin/org/jetbrains/bio/genome/AnnotationsConfig.kt
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ data class GenomeAnnotationsConfig(
val chromsizesUrl: String,
val repeatsUrl: String?,
val cytobandsUrl: String?,
val gapsUrl: String,
val gapsUrl: String?,
val centromeresUrl: String?,
val cpgIslandsUrl: String?,
val mart: Biomart?
Expand Down Expand Up @@ -268,7 +268,7 @@ object AnnotationsConfigLoader {
deserializedMap[CHROMSIZES_FIELD] as String,
deserializedMap[REPEATS_FIELD] as String?,
deserializedMap[CYTOBANDS_FIELD] as String?,
deserializedMap[GAPS_FIELD] as String,
deserializedMap[GAPS_FIELD] as String?,
deserializedMap[CENTROMERES_FIELD] as String?,
deserializedMap[CGIS_FIELD] as String?,
mart
Expand All @@ -289,9 +289,11 @@ object AnnotationsConfigLoader {
ALIASES_FIELD to if (aliases.size == 1) aliases.first() else aliases,
GTF_FIELD to genomeAnnotationsConfig.gtfUrl,
SEQUENCE_FIELD to genomeAnnotationsConfig.sequenceUrl,
CHROMSIZES_FIELD to genomeAnnotationsConfig.chromsizesUrl,
GAPS_FIELD to genomeAnnotationsConfig.gapsUrl
CHROMSIZES_FIELD to genomeAnnotationsConfig.chromsizesUrl
)
if (genomeAnnotationsConfig.gapsUrl != null) {
result[GAPS_FIELD] = genomeAnnotationsConfig.gapsUrl
}
if (genomeAnnotationsConfig.ucscAlias != null) {
result[UCSC_ALIAS_FIELD] = genomeAnnotationsConfig.ucscAlias
}
Expand Down
5 changes: 2 additions & 3 deletions src/main/kotlin/org/jetbrains/bio/genome/Genome.kt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class Genome private constructor(
val cpgIslandsPath: Path?,
val cytobandsPath: Path?,
val repeatsPath: Path?,
gapsPath: Path?,
val gapsPath: Path?,
private val twoBitPath: Path?,
private val genesGTFPath: Path?,
genesDescriptionsPath: Path?,
Expand All @@ -62,7 +62,6 @@ class Genome private constructor(
val chrAltName2CanonicalMapping: Map<String, String>
) {
val chromSizesPath by lazy { ensureNotNull(chromSizesPath, "Chromosomes Sizes") }
val gapsPath by lazy { ensureNotNull(gapsPath, "Gaps") }
fun twoBitPath(downloadIfMissing: Boolean = true) =
ensureNotNull(twoBitPath, "Genome *.2bit Sequence").also { twoBitPath ->
if (downloadIfMissing) {
Expand Down Expand Up @@ -314,7 +313,7 @@ class Genome private constructor(
cpgIslandsPath = annCfgUpdated.cpgIslandsUrl?.let { dataPath / CpGIslands.ISLANDS_FILE_NAME },
cytobandsPath = annCfgUpdated.cytobandsUrl?.let { dataPath / CytoBands.FILE_NAME },
repeatsPath = dataPath / Repeats.FILE_NAME,
gapsPath = dataPath / Gaps.FILE_NAME,
gapsPath = annCfgUpdated.gapsUrl?.let { dataPath / Gaps.FILE_NAME },
twoBitPath = dataPath / "$parentBuild.2bit",
genesGTFPath = genesGTFPath,
genesDescriptionsPath = genesDescriptionsPath,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ object TestOrganismDataGenerator {
val genome = Genome[Genome.TEST_ORGANISM_BUILD]
val twoBitPath = genome.twoBitPath(downloadIfMissing = false)
val genesGtfPath = genome.genesGtfPath(downloadIfMissing = false)
val gapsPath = genome.gapsPath
val gapsPath = genome.gapsPath!!
val cytobandsPath = genome.cytobandsPath!!
val repeatsPath = genome.repeatsPath!!
val cpgIslandsPath = genome.cpgIslandsPath!!
Expand Down
12 changes: 12 additions & 0 deletions src/main/resources/annotations.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,18 @@ genomes:
sequence: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit
chromsizes: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes

hs1:
species: Homo sapiens
ucsc_alias: hs1
aliases: T2T CHM13v2.0
description: 'T2T CHM13v2.0 Telomere-to-Telomere assembly of the CHM13 cell line, with chrY from NA24385'
gtf: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/genes/catLiftOffGenesV1.gtf.gz
chr_alt_name_to_canonical:
- MT: chrM
cytobands: http://t2t.gi.ucsc.edu/chm13/hub/t2t-chm13-v2.0/download/chm13v2.0_cytobands_allchrs.bed.gz
sequence: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.2bit
chromsizes: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.chrom.sizes.txt

# http://www.ensembl.org/Drosophila_melanogaster/
dm3:
species: Drosophila melanogaster
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class AnnotationsConfigLoaderTest {
val (version, mapping, yaml) = AnnotationsConfigLoader.parseYaml(path, 0)
assertEquals(1, version)
assertNull(mapping)
assertEquals(5, yaml.genomes.size)
assertEquals(6, yaml.genomes.size)
}
}

Expand All @@ -37,8 +37,8 @@ class AnnotationsConfigLoaderTest {
withResource(AnnotationsConfigLoader::class.java, "test_annotations.yaml") { path ->
val (version, mapping, _) = AnnotationsConfigLoader.parseYaml(path, 1)
assertEquals(1, version)
assertEquals(5, mapping!!.entries.size)
assertEquals(listOf("ce6", "dm3", "hg19", "hg38", "mm9"), mapping.keys.sorted())
assertEquals(6, mapping!!.entries.size)
assertEquals(listOf("ce6", "dm3", "hg19", "hg38", "hs1", "mm9"), mapping.keys.sorted())
assertEquals("Drosophila melanogaster", mapping["dm3"]!!.species)
assertEquals(listOf("mm9", "NCBIM37"), mapping["mm9"]!!.names)
assertEquals("mm9", mapping["mm9"]!!.ucscAlias)
Expand Down Expand Up @@ -82,6 +82,13 @@ class AnnotationsConfigLoaderTest {
"http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/centromeres.txt.gz",
mapping["hg38"]!!.centromeresUrl
)
assertEquals(
"http://t2t.gi.ucsc.edu/chm13/hub/t2t-chm13-v2.0/download/chm13v2.0_cytobands_allchrs.bed.gz",
mapping["hs1"]!!.cytobandsUrl
)
assertNull(
mapping["hs1"]!!.gapsUrl
)
assertNull(mapping["hg19"]!!.centromeresUrl)
assertEquals(
"http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cpgIslandExt.txt.gz",
Expand Down
12 changes: 12 additions & 0 deletions src/test/resources/test_annotations.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,18 @@ genomes:
sequence: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit
chromsizes: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes

hs1:
species: Homo sapiens
ucsc_alias: hs1
aliases: T2T CHM13v2.0
description: 'T2T CHM13v2.0 Telomere-to-Telomere assembly of the CHM13 cell line, with chrY from NA24385'
gtf: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/genes/catLiftOffGenesV1.gtf.gz
chr_alt_name_to_canonical:
- MT: chrM
cytobands: http://t2t.gi.ucsc.edu/chm13/hub/t2t-chm13-v2.0/download/chm13v2.0_cytobands_allchrs.bed.gz
sequence: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.2bit
chromsizes: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.chrom.sizes.txt

# D. melanogaster
# http://www.ensembl.org/Drosophila_melanogaster/
dm3:
Expand Down

0 comments on commit 25ab66a

Please sign in to comment.