Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CHM13 genome #14

Merged
Merged 3 commits on May 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions src/main/kotlin/org/jetbrains/bio/genome/Annotations.kt
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ object Gaps {
internal fun all(genome: Genome): ListMultimap<Chromosome, Gap> {
return CACHE.get(genome) {
val gapsPath = genome.gapsPath
gapsPath.checkOrRecalculate("Gaps") { output ->
gapsPath?.checkOrRecalculate("Gaps") { output ->
val config = genome.annotationsConfig
requireNotNull(config) {
"Cannot save Gaps info to $gapsPath. Annotations information isn't available for ${genome.build}."
Expand All @@ -217,8 +217,12 @@ object Gaps {
}
}

private fun read(genome: Genome, gapsPath: Path): ListMultimap<Chromosome, Gap> {
private fun read(genome: Genome, gapsPath: Path?): ListMultimap<Chromosome, Gap> {
val builder = ImmutableListMultimap.builder<Chromosome, Gap>()
if (gapsPath == null) {
return builder.build()
}

val chromosomes = genome.chromosomeNamesMap
FORMAT.parse(gapsPath.bufferedReader()).use { csvParser ->
for (row in csvParser) {
Expand All @@ -239,12 +243,15 @@ object Gaps {
val ucscAnnLegacyFormat = config.ucscAnnLegacyFormat

if (ucscAnnLegacyFormat) {
requireNotNull(gapsUrl) {
"Gaps URL is required for UCSC legacy format"
}
// Builds with per-chromosome gap annotations.
val prefix = gapsUrl.substringBeforeLast("/")
val template = "%s_${gapsUrl.substringAfterLast("/")}"
UCSC.downloadBatchTo(gapsPath, genome, "$prefix/", template)
} else {
gapsUrl.downloadTo(gapsPath)
gapsUrl?.downloadTo(gapsPath)

// Builds with separate centromere annotations:
if (config.centromeresUrl != null) {
Expand Down
10 changes: 6 additions & 4 deletions src/main/kotlin/org/jetbrains/bio/genome/AnnotationsConfig.kt
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ data class GenomeAnnotationsConfig(
val chromsizesUrl: String,
val repeatsUrl: String?,
val cytobandsUrl: String?,
val gapsUrl: String,
val gapsUrl: String?,
val centromeresUrl: String?,
val cpgIslandsUrl: String?,
val mart: Biomart?
Expand Down Expand Up @@ -268,7 +268,7 @@ object AnnotationsConfigLoader {
deserializedMap[CHROMSIZES_FIELD] as String,
deserializedMap[REPEATS_FIELD] as String?,
deserializedMap[CYTOBANDS_FIELD] as String?,
deserializedMap[GAPS_FIELD] as String,
deserializedMap[GAPS_FIELD] as String?,
deserializedMap[CENTROMERES_FIELD] as String?,
deserializedMap[CGIS_FIELD] as String?,
mart
Expand All @@ -289,9 +289,11 @@ object AnnotationsConfigLoader {
ALIASES_FIELD to if (aliases.size == 1) aliases.first() else aliases,
GTF_FIELD to genomeAnnotationsConfig.gtfUrl,
SEQUENCE_FIELD to genomeAnnotationsConfig.sequenceUrl,
CHROMSIZES_FIELD to genomeAnnotationsConfig.chromsizesUrl,
GAPS_FIELD to genomeAnnotationsConfig.gapsUrl
CHROMSIZES_FIELD to genomeAnnotationsConfig.chromsizesUrl
)
if (genomeAnnotationsConfig.gapsUrl != null) {
result[GAPS_FIELD] = genomeAnnotationsConfig.gapsUrl
}
if (genomeAnnotationsConfig.ucscAlias != null) {
result[UCSC_ALIAS_FIELD] = genomeAnnotationsConfig.ucscAlias
}
Expand Down
5 changes: 2 additions & 3 deletions src/main/kotlin/org/jetbrains/bio/genome/Genome.kt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class Genome private constructor(
val cpgIslandsPath: Path?,
val cytobandsPath: Path?,
val repeatsPath: Path?,
gapsPath: Path?,
val gapsPath: Path?,
private val twoBitPath: Path?,
private val genesGTFPath: Path?,
genesDescriptionsPath: Path?,
Expand All @@ -62,7 +62,6 @@ class Genome private constructor(
val chrAltName2CanonicalMapping: Map<String, String>
) {
val chromSizesPath by lazy { ensureNotNull(chromSizesPath, "Chromosomes Sizes") }
val gapsPath by lazy { ensureNotNull(gapsPath, "Gaps") }
fun twoBitPath(downloadIfMissing: Boolean = true) =
ensureNotNull(twoBitPath, "Genome *.2bit Sequence").also { twoBitPath ->
if (downloadIfMissing) {
Expand Down Expand Up @@ -314,7 +313,7 @@ class Genome private constructor(
cpgIslandsPath = annCfgUpdated.cpgIslandsUrl?.let { dataPath / CpGIslands.ISLANDS_FILE_NAME },
cytobandsPath = annCfgUpdated.cytobandsUrl?.let { dataPath / CytoBands.FILE_NAME },
repeatsPath = dataPath / Repeats.FILE_NAME,
gapsPath = dataPath / Gaps.FILE_NAME,
gapsPath = annCfgUpdated.gapsUrl?.let { dataPath / Gaps.FILE_NAME },
twoBitPath = dataPath / "$parentBuild.2bit",
genesGTFPath = genesGTFPath,
genesDescriptionsPath = genesDescriptionsPath,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ object TestOrganismDataGenerator {
val genome = Genome[Genome.TEST_ORGANISM_BUILD]
val twoBitPath = genome.twoBitPath(downloadIfMissing = false)
val genesGtfPath = genome.genesGtfPath(downloadIfMissing = false)
val gapsPath = genome.gapsPath
val gapsPath = genome.gapsPath!!
val cytobandsPath = genome.cytobandsPath!!
val repeatsPath = genome.repeatsPath!!
val cpgIslandsPath = genome.cpgIslandsPath!!
Expand Down
12 changes: 12 additions & 0 deletions src/main/resources/annotations.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,18 @@ genomes:
sequence: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit
chromsizes: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes

# Human T2T CHM13v2.0 assembly (UCSC alias: hs1).
# Only gtf, cytobands, sequence and chromsizes URLs are listed here; no gaps URL
# is given for this build, so the loader must accept a missing gaps entry.
hs1:
species: Homo sapiens
ucsc_alias: hs1
aliases: T2T CHM13v2.0
description: 'T2T CHM13v2.0 Telomere-to-Telomere assembly of the CHM13 cell line, with chrY from NA24385'
gtf: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/genes/catLiftOffGenesV1.gtf.gz
chr_alt_name_to_canonical:
- MT: chrM
cytobands: http://t2t.gi.ucsc.edu/chm13/hub/t2t-chm13-v2.0/download/chm13v2.0_cytobands_allchrs.bed.gz
sequence: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.2bit
chromsizes: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.chrom.sizes.txt

# http://www.ensembl.org/Drosophila_melanogaster/
dm3:
species: Drosophila melanogaster
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class AnnotationsConfigLoaderTest {
val (version, mapping, yaml) = AnnotationsConfigLoader.parseYaml(path, 0)
assertEquals(1, version)
assertNull(mapping)
assertEquals(5, yaml.genomes.size)
assertEquals(6, yaml.genomes.size)
}
}

Expand All @@ -37,8 +37,8 @@ class AnnotationsConfigLoaderTest {
withResource(AnnotationsConfigLoader::class.java, "test_annotations.yaml") { path ->
val (version, mapping, _) = AnnotationsConfigLoader.parseYaml(path, 1)
assertEquals(1, version)
assertEquals(5, mapping!!.entries.size)
assertEquals(listOf("ce6", "dm3", "hg19", "hg38", "mm9"), mapping.keys.sorted())
assertEquals(6, mapping!!.entries.size)
assertEquals(listOf("ce6", "dm3", "hg19", "hg38", "hs1", "mm9"), mapping.keys.sorted())
assertEquals("Drosophila melanogaster", mapping["dm3"]!!.species)
assertEquals(listOf("mm9", "NCBIM37"), mapping["mm9"]!!.names)
assertEquals("mm9", mapping["mm9"]!!.ucscAlias)
Expand Down Expand Up @@ -82,6 +82,13 @@ class AnnotationsConfigLoaderTest {
"http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/centromeres.txt.gz",
mapping["hg38"]!!.centromeresUrl
)
assertEquals(
"http://t2t.gi.ucsc.edu/chm13/hub/t2t-chm13-v2.0/download/chm13v2.0_cytobands_allchrs.bed.gz",
mapping["hs1"]!!.cytobandsUrl
)
assertNull(
mapping["hs1"]!!.gapsUrl
)
assertNull(mapping["hg19"]!!.centromeresUrl)
assertEquals(
"http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cpgIslandExt.txt.gz",
Expand Down
12 changes: 12 additions & 0 deletions src/test/resources/test_annotations.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,18 @@ genomes:
sequence: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit
chromsizes: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes

# Test fixture entry for the human T2T CHM13v2.0 assembly (UCSC alias: hs1).
# No gaps URL is listed for this build; AnnotationsConfigLoaderTest asserts that
# its gapsUrl is null and checks the cytobands URL below.
hs1:
species: Homo sapiens
ucsc_alias: hs1
aliases: T2T CHM13v2.0
description: 'T2T CHM13v2.0 Telomere-to-Telomere assembly of the CHM13 cell line, with chrY from NA24385'
gtf: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/genes/catLiftOffGenesV1.gtf.gz
chr_alt_name_to_canonical:
- MT: chrM
cytobands: http://t2t.gi.ucsc.edu/chm13/hub/t2t-chm13-v2.0/download/chm13v2.0_cytobands_allchrs.bed.gz
sequence: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.2bit
chromsizes: http://hgdownload.soe.ucsc.edu/goldenPath/hs1/bigZips/hs1.chrom.sizes.txt

# D. melanogaster
# http://www.ensembl.org/Drosophila_melanogaster/
dm3:
Expand Down