Skip to content

Commit

Permalink
updated ROC calls to new hotROC calls
Browse files Browse the repository at this point in the history
  • Loading branch information
helixscript committed Aug 1, 2016
1 parent f42f33a commit fec0a46
Show file tree
Hide file tree
Showing 3 changed files with 291 additions and 20 deletions.
7 changes: 3 additions & 4 deletions epi_heatmap_from_db.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@ library('yaml')
if (!file.exists(args$c)) stop("the configuration file can not be found.")
config <<- yaml.load_file(args$c)

### source("epigeneticFeatures.R")
### source("epigeneticHeatMapMaker.R")

source(file.path(codeDir, "epigeneticFeatures.R"))
source(file.path(codeDir, "epigeneticHeatMapMaker.R"))

Expand All @@ -51,7 +48,9 @@ library(dplyr)
library(BSgenome)
library(intSiteRetriever)
library(devtools)
source_url('https://raw.githubusercontent.com/BushmanLab/genomicHeatmapMaker/master/utils.R')

#source_url('https://raw.githubusercontent.com/BushmanLab/genomicHeatmapMaker/master/utils.R')
source(file.path(codeDir, "utils.R"))

available_epigenetic_ref_genomes <- c ("hg18", "mm8")

Expand Down
25 changes: 9 additions & 16 deletions epigeneticHeatMapMaker.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

libs <- c("survival", "pipeUtils", "colorspace", "hiAnnotator",
libs <- c("survival", "hotROCs", "colorspace", "hiAnnotator",
"plyr", "reshape2")

loaded <- sapply(libs, library, character.only=TRUE, quietly=TRUE)
Expand Down Expand Up @@ -63,19 +63,12 @@ sites_to_epigenetic_heatmap <- function(sites, referenceGenome, output_dir, anno
}
}

sites <- as.data.frame(sites)

message("Make Heatmap")
annotation_columns <- get_annotation_columns(sites)

rset <- with(sites, ROC.setup(
rep(TRUE, nrow(sites)), type, siteID, sampleName))
roc.res <- ROC.strata(annotation_columns, rset, add.var=TRUE, sites)

dcol <- c(rep(rgb(185,185,0,max=185), 70),
rgb(185:0,185:0,0,max=185),
rgb(0,0,0:220,max=220),
rep(rgb(0,0,220,max=220), 35))
# ROCSVG(roc.res, output_dir, colScale=dcol, overlayCols="white")
ROCSVG(roc.res, output_dir)
if (config$rocControls == 'unmatched')
{
sites_to_ROC_ordinary(sites, output_dir)
} else if (config$rocControls == 'matched') {
sites_to_ROC_matched(sites, output_dir)
} else {
stop('Error, rocControls is not properly defined in the configuration file.')
}
}
279 changes: 279 additions & 0 deletions utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
# This source code file is a component of the larger INSPIIRED genomic analysis software package.
# Copyright (C) 2016 Frederic Bushman
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

library(hotROCs)

getRefSeq_genes <- function(reference_genome) {
refSeq <- makeGRanges(
getUCSCtable("refGene", "RefSeq Genes", freeze=reference_genome),
freeze=reference_genome
)
}

getCpG_islands <- function(reference_genome) {
cpg <- getUCSCtable("cpgIslandExt", "CpG Islands", freeze=reference_genome)
cpg$strand <- "*" # either strand
makeGRanges(cpg, freeze=reference_genome, chromCol='chrom')
}

getDNaseI <- function(reference_genome) {
DNaseI <- getUCSCtable("wgEncodeRegDnaseClustered",
"DNase Clusters", freeze=reference_genome)
DNaseI$strand <- "*" # either strand
makeGRanges(DNaseI, freeze=reference_genome, chromCol='chrom')
}

#' for given samples pull sites from database and construct MRCs
get_sites_controls_from_db <- function(sampleName_GTSP, referenceGenome, connection) {
if ( ! "label" %in% colnames(sampleName_GTSP)) {
sampleName_GTSP$label <- sampleName_GTSP$GTSP
}
sampleName_GTSP <- select(sampleName_GTSP, sampleName, GTSP, label)

# should have at least two samples
stopifnot(length(unique(sampleName_GTSP$GTSP)) != 1)

sampleName_GTSP$refGenome <- rep(referenceGenome, nrow(sampleName_GTSP))

# samples should have sites
stopifnot(nrow(getUniqueSiteCounts(sampleName_GTSP, connection)) > 1)
# also we need at least several sites per sample/replicate
stopifnot(is_enough_sites(sampleName_GTSP, connection))

# check that all samples processed with the same reference genome
is_in_db <- setNameExists(sampleName_GTSP, connection)
if ( ! all(is_in_db)) {
print("The following samples are NOT in the database")
print(sampleName_GTSP[ ! is_in_db, ])
stop()
}
#stopifnot(all(setNameExists(sampleName_GTSP, connection)))

reference_genome_sequence <- get_reference_genome(referenceGenome)
get_integration_sites_with_mrcs(sampleName_GTSP, reference_genome_sequence, connection)
}

add_label <- function(sites, sampleName_GTSP) {
sites_GTSP <- merge(sites, sampleName_GTSP)
sites_GTSP$sampleName <- sites_GTSP$label
sites_GTSP$refGenome <- NULL # not needed downstream
sites_GTSP$GTSP <- NULL # not needed downstream
sites_GTSP$label <- NULL
sites_GTSP
}

get_integration_sites_with_mrcs <- function(
sampleName_GTSP, refGenomeSeq, connection
) {
sites <- getUniqueSites(sampleName_GTSP, connection)
sites$type <- "insertion"
sites <- add_label(sites, sampleName_GTSP)

mrcs <- getMRCs(sampleName_GTSP, connection)
mrcs$type <- "match"
mrcs <- add_label(mrcs, sampleName_GTSP)

sites_mrcs <- rbind(sites, mrcs)

sites_mrcs <- makeGRanges(sites_mrcs, soloStart=TRUE,
chromCol='chr', strandCol='strand', startCol='position')

#seqinfo needs to be exact here or trimming will be wrong
newSeqInfo <- seqinfo(refGenomeSeq)
seqInfo.new2old <- match(seqnames(newSeqInfo),
seqnames(seqinfo(sites_mrcs)))
seqinfo(sites_mrcs, new2old=seqInfo.new2old) <- newSeqInfo

sites_mrcs
}

get_annotation_columns <- function(sites) {
granges_column_names <- c("seqnames", "start", "end", "width", "strand")
int_site_column_names <- c("siteID", "sampleName", "chr", "strand", "position")
required_columns <- unique(c(
granges_column_names, int_site_column_names, "type"))
stopifnot(all(required_columns %in% names(sites)))
setdiff(names(sites), required_columns)
}

from_counts_to_density <- function(sites, column_prefix, window_size) {
metadata <- mcols(sites)
sapply(seq(window_size), function(i) {
val <- window_size[i]
name <- names(window_size)[i]
column_name <- paste0(column_prefix, ".", name)
metadata[[column_name]] <<- metadata[[column_name]]/val
})
mcols(sites) <- metadata
sites
}

getPositionalValuesOfFeature <- function(sites, genomicData) {
#### Boundary Distances #### Nirav Malani code TODO: refactor into several functions
## (refSeq boundary.dist), Start (refSeq start.dist), non-width (), General (general.width)
## when inGene is FALSE then set following: ref.left.pos, ref.right.pos, ref.left.strand, ref.right.strand
## when inGene is TRUE then set following: ref.start.pos, ref.end.pos, ref.gene.strand

## prepare the new columns ##
colnam <- paste("ref", c("left.pos", "right.pos", "left.strand", "right.strand",
"start.pos", "end.pos", "gene.strand"), sep=".")
mcols(sites)[colnam] <- NA

## add the respective columns as needed ##
## beware: precede returns range which is following the query and
## follow returns the range which is preceding the query!
## so do a switcheroo in terms of extracting the start & stop ##
left <- follow(sites, genomicData, ignore.strand=TRUE)
left[is.na(left) | sites$within_refSeq_gene] <- NA
rows <- na.omit(left)
sites$ref.left.pos[!is.na(left)] <- end(genomicData[rows])
sites$ref.left.strand[!is.na(left)] <- as.character(strand(genomicData[rows]))

right <- precede(sites, genomicData, ignore.strand=TRUE)
right[is.na(right) | sites$within_refSeq_gene] <- NA
rows <- na.omit(right)
sites$ref.right.pos[!is.na(right)] <- start(genomicData[rows])
sites$ref.right.strand[!is.na(right)] <- as.character(strand(genomicData[rows]))

inIt <- findOverlaps(sites, genomicData, ignore.strand=TRUE, select="arbitrary")
inIt[is.na(inIt) | !sites$within_refSeq_gene] <- NA
rows <- na.omit(inIt)
sites$ref.start.pos[!is.na(inIt)] <- start(genomicData[rows])
sites$ref.end.pos[!is.na(inIt)] <- end(genomicData[rows])
sites$ref.gene.strand[!is.na(inIt)] <- as.character(strand(genomicData[rows]))

sites$boundary.dist <-
eval(expression(pmin((ref.end.pos-position)/(ref.end.pos-ref.start.pos),
(position-ref.start.pos)/(ref.end.pos-ref.start.pos),
(ref.right.pos-position)/(ref.right.pos-ref.left.pos),
(position-ref.left.pos)/(ref.right.pos-ref.left.pos),
na.rm=T)), mcols(sites))

sites$start.dist <-
eval(expression(pmin(ifelse(ref.gene.strand=="-",
(ref.end.pos-position)/(ref.end.pos-ref.start.pos),
(position-ref.start.pos)/(ref.end.pos-ref.start.pos)),
ifelse(ref.right.strand=="-",
(ref.right.pos-position)/(ref.right.pos-ref.left.pos),
NA),
ifelse(ref.left.strand=="+",
(position-ref.left.pos)/(ref.right.pos-ref.left.pos),
NA),na.rm=T)), mcols(sites))

sites$general.width <- eval(expression(pmin(ref.end.pos-ref.start.pos,
ref.right.pos-ref.left.pos,na.rm=T)),
mcols(sites))
sites$gene.width <- eval(expression(ref.end.pos-ref.start.pos ), mcols(sites))

meta <- mcols(sites)
meta <- meta[ , ! (names(meta) %in% colnam)]
mcols(sites) <- meta

sites
}

#' ROC.stata does not work with too few sites
is_enough_sites <- function(sampleName_GTSP, connection) {
MIN_NUMBER_OF_SITES <- 3
n_sites <- getUniqueSiteCounts(sampleName_GTSP, connection)
n_sites$enough_sites <- n_sites$uniqueSites >= MIN_NUMBER_OF_SITES
if (all(n_sites$enough_sites)) {
return(TRUE)
}
message("****************************************")
message("The following samples have too few sites to generate heatmap:")
print(filter(n_sites, enough_sites == FALSE))
message("****************************************")
FALSE
}

#' create a folder containing the SVG outputs and p-value calculations for the (clasically matched) random controls used in ROC calculations
#'
#' @param sites_mrcs Granges with sites, controls and features
sites_to_ROC_old <- function(sites_mrcs, output_dir) {
sites_mrcs <- as.data.frame(sites_mrcs)
annotation_columns <- get_annotation_columns(sites_mrcs)
rset <- with(sites_mrcs, ROC.setup(
rep(TRUE, nrow(sites_mrcs)), type, siteID, sampleName))
roc.res <- ROC.strata(annotation_columns, rset, add.var=TRUE, sites_mrcs)
ROCSVG(roc.res, output_dir)
}

##' Substitute Median for \code{NA}
##'
##' When there is little mising data, a rough-and-ready fill-in method
##' may be preferred to computationally intensive method of handling
##' missingness. For non-parametric ROC curves based on ranks of the
##' data, using the median (of the non-missing data) as the fill-in is
##' a fairly innocuous choice. If there is much missing data, this
##' method is not advised as it tends to bias the ROC curve area
##' towards 0.5.
##' @title na.median
##' @param x \code{matrix} or \code{data.frame} possibly containing
##' \code{NA} values.
##' @return An object like \code{x}, but with the medians of the
##' columns used in place of the \code{NA} values in the
##' corresponding columns.
##' @author Charles Berry
na.median <-
function(x)
{
if (!is.matrix(x)) x <- as.matrix(x)
na.count <- colSums(is.na(x))
if (any(na.count != 0 ))
{
for (i in 1:ncol(x))
if (na.count[i]>0){
med <- median(x[,i],na.rm=TRUE)
x[is.na(x[,i]),i] <- med
}
}
x
}

# ROC calculation for Matched Random Controls
# This is similar to the way we did this classically for restriction sites.
sites_to_ROC_matched <- function(sites_mrcs, output_dir) {
sites_mrcs <- as.data.frame(sites_mrcs)
annotation_columns <- get_annotation_columns(sites_mrcs)
roc.res <- ROC.MRC(
sites_mrcs[,"type"],
sites_mrcs[,"siteID"],
na.median(sites_mrcs[,annotation_columns]),
sites_mrcs[,"sampleName"])
ROCSVG(roc.res, output_dir)
}


# ROC calculation for Ordinary (unmatched) Random Controls
# Provides the corrected version for unmatched controls.
# This should roughly match. p-values will differ but should not be
# qualitatively different.

sites_to_ROC_ordinary <- function(sites_mrcs, output_dir) {
sites_mrcs <- as.data.frame(sites_mrcs)

write.table(sites_mrcs, file='sites_mrcs.epi')

annotation_columns <- get_annotation_columns(sites_mrcs)
roc.res <- ROC.ORC(
sites_mrcs[,"type"],
# sites_mrcs[,annotation_columns],
na.median(sites_mrcs[,annotation_columns]),
sites_mrcs[,"sampleName"])
ROCSVG(roc.res, output_dir)
}

0 comments on commit fec0a46

Please sign in to comment.