Skip to content

Commit

Permalink
fix: Initial impl (proof of concept) for GFF3 annotations support, Je…
Browse files Browse the repository at this point in the history
  • Loading branch information
iromeo committed Jul 8, 2021
1 parent 7e72a01 commit 2e411c0
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 23 deletions.
86 changes: 67 additions & 19 deletions src/main/kotlin/org/jetbrains/bio/genome/Ensembl.kt
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ import org.slf4j.LoggerFactory
import java.io.BufferedReader
import java.io.BufferedWriter
import java.nio.file.Path
import java.util.*
import kotlin.collections.ArrayList
import kotlin.math.abs
import kotlin.math.max
import kotlin.math.min
Expand Down Expand Up @@ -39,7 +37,7 @@ object Ensembl {
}

typealias Attributes = Array<String?>
class GtfReader(val reader: BufferedReader, val genome: Genome) {
class GtfReader(val reader: BufferedReader, val genome: Genome, val gff3: Boolean=false) {

fun readTranscripts(): List<Transcript> {
var hasDetailedUTRInfo: Boolean? = null
Expand All @@ -52,7 +50,7 @@ class GtfReader(val reader: BufferedReader, val genome: Genome) {
if (hasDetailedUTRInfo == null) {
if (featureType == "five_prime_utr" || featureType == "three_prime_utr") {
hasDetailedUTRInfo = true
} else if (featureType == "UTR") {
} else if (featureType == "utr") {
hasDetailedUTRInfo = false
}
}
Expand Down Expand Up @@ -143,17 +141,20 @@ class GtfReader(val reader: BufferedReader, val genome: Genome) {
* 1. old: exon, CDS, start_codon, stop_codon
* 2. 75: exon, CDS, start_codon, stop_codon, transcript, UTR
* 3. 87: exon, CDS, stop_codon, gene, transcript, three_prime_utr, five_prime_utr
* 4. gff3 [chm13t2t-v1.1.gene_annotation.v4.gff3.gz]:
* exon, CDS, stop_codon, gene, transcript, start_codon, stop_codon, three_prime_UTR, five_prime_UTR,
*/
val chrStr = parts[0]
val type = parts[2]
val type = parts[2].toLowerCase()

val chr = chrsNamesMapping[chrStr]?.let { chr ->
if (genomeQuery.accepts(chr)) chr else null
} ?: return type

when (type) {
when (type.toLowerCase()) {
// just not to write long if condition:
"transcript", "exon", "CDS", "start_codon", "three_prime_utr" -> { /* noop */ }
"transcript", "exon", "cds", "start_codon", "three_prime_utr" -> { /* noop */ }
// e.g. five_prime_utr, stop_codon : transcript is already configured by 3' and start_codon
else -> return type
}

Expand All @@ -167,20 +168,35 @@ class GtfReader(val reader: BufferedReader, val genome: Genome) {
// transcript_source "havana"
// Predicted genes:
// transcript_source "ensembl"
// Or
// transcript_source "CAT"

val transcriptIdKey: AttrTypes = when {
gff3 -> Gff3AttrTypes.TRANSCRIPT_ID
else -> GtfAttrTypes.TRANSCRIPT_ID
}

val transcriptId = attributes[GtfAttrTypes.TRANSCRIPT_ID.ordinal]!!
val geneNameKey: AttrTypes = when {
gff3 -> Gff3AttrTypes.GENE_NAME
else -> GtfAttrTypes.GENE_NAME
}
val geneIdKey: AttrTypes = when {
gff3 -> Gff3AttrTypes.GENE_ID
else -> GtfAttrTypes.GENE_ID
}

val transcriptId = attributes[transcriptIdKey.ordinal]!!
val transcriptInfo = transcriptsMap.getOrPut(transcriptId) {
TranscriptInfo(transcriptId,
attributes[GtfAttrTypes.GENE_NAME.ordinal]!!,
attributes[GtfAttrTypes.GENE_ID.ordinal]!!)
attributes[geneNameKey.ordinal]!!,
attributes[geneIdKey.ordinal]!!)
}

val location = Location(start - 1, end, chr, strand)

when (type) {
"exon" -> { transcriptInfo.exons.add(location) }
"CDS" -> { transcriptInfo.cds.add(location) }
"cds" -> { transcriptInfo.cds.add(location) }
"transcript" -> { transcriptInfo.transcript = location }
"three_prime_utr" -> { transcriptInfo.utr3.add(location) }
"start_codon" -> {
Expand Down Expand Up @@ -210,7 +226,10 @@ class GtfReader(val reader: BufferedReader, val genome: Genome) {
}

private fun parseAttributes(rest: String): Attributes {
val attrTypes = GtfAttrTypes.values()
val attrTypes: Array<out AttrTypes> = when {
gff3 -> Gff3AttrTypes.values()
else -> GtfAttrTypes.values()
}
val attributes = Array<String?>(attrTypes.size) { null }

for (chunk in rest.split(";")) {
Expand All @@ -221,17 +240,36 @@ class GtfReader(val reader: BufferedReader, val genome: Genome) {

// if attr not set & is our key
if (trimmed.startsWith(key)) {
val value = trimmed.substring(key.length).trimStart()
check(value[0] == '"' && value.last() == '"') { "Cannot parse: $key value, attrs list = $rest" }
attributes[i] = value.substring(1, value.length - 1)
break

if (gff3) {
val chunks = trimmed.split('=', limit = 2)
check(chunks.size == 2) { "Cannot parse: $key value from <$trimmed>, attrs list = $rest" }
if (chunks[0] == key) {
attributes[i] = chunks[1]
}
} else {
val value = trimmed.substring(key.length).trimStart()
check(value[0] == '"' && value.last() == '"') { "Cannot parse: $key value from <$trimmed>, attrs list = $rest" }
attributes[i] = value.substring(1, value.length - 1)
break
}
}
}
}

if (attributes[GtfAttrTypes.GENE_NAME.ordinal] == null) {

val geneNameKey: AttrTypes = when {
gff3 -> Gff3AttrTypes.GENE_NAME
else -> GtfAttrTypes.GENE_NAME
}
val geneIdKey: AttrTypes = when {
gff3 -> Gff3AttrTypes.GENE_ID
else -> GtfAttrTypes.GENE_ID
}

if (attributes[geneNameKey.ordinal] == null) {
// not all gtf genes has 'gene_name' attr, i.e. defines gene symbol, e.g not info in ce11, rn5
attributes[GtfAttrTypes.GENE_NAME.ordinal] = attributes[GtfAttrTypes.GENE_ID .ordinal]
attributes[geneNameKey.ordinal] = attributes[geneIdKey.ordinal]
}

check(attributes.all { it != null }) {
Expand All @@ -241,12 +279,22 @@ class GtfReader(val reader: BufferedReader, val genome: Genome) {
return attributes
}

enum class GtfAttrTypes(val key: String) {
interface AttrTypes {
val key: String
val ordinal: Int
}
enum class GtfAttrTypes(override val key: String) : AttrTypes {
TRANSCRIPT_ID("transcript_id"),
GENE_ID("gene_id"),
GENE_NAME("gene_name");
}

enum class Gff3AttrTypes(override val key: String) : AttrTypes {
TRANSCRIPT_ID("source_transcript"),
GENE_ID("source_gene"),
GENE_NAME("source_gene_common_name");
}

class TranscriptInfo(val transcriptId: String,
val geneName: String,
val geneId: String) {
Expand Down
2 changes: 1 addition & 1 deletion src/main/kotlin/org/jetbrains/bio/genome/Genome.kt
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ class Genome private constructor(
val genesDescriptionsPath: Path by lazy { ensureNotNull(genesDescriptionsPath, "Gene Description") }

/**
* Ensure *.gtf file exists and download it if necessary
* Ensure *.gtf or *.gff file exists and download it if necessary
*/
fun genesGtfPath(downloadIfMissed: Boolean = true) =
ensureNotNull(genesGTFPath, "Genes GTF Annotations").also { genesGTFPath ->
Expand Down
4 changes: 3 additions & 1 deletion src/main/kotlin/org/jetbrains/bio/genome/Transcripts.kt
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,10 @@ object Transcripts {
genome.genesGtfPath(true)

val transcripts = LOG.time(level = Level.INFO, message = "Parsing genes $gtfFile") {
val fileNameLower = gtfFile.name.toLowerCase()
val gff3 = fileNameLower.endsWith(".gff3") || fileNameLower.endsWith(".gff3.gz")
gtfFile.bufferedReader().use { reader ->
GtfReader(reader, genome).readTranscripts()
GtfReader(reader, genome, gff3=gff3).readTranscripts()
// sort by 5' offset then by ensemblId (to be deterministic),
// it is required for bound5Index() correct work,
// please don't change it to sort by start offset
Expand Down
Loading

0 comments on commit 2e411c0

Please sign in to comment.