updated ROC calls to new hotROC calls

BushmanLab · Aug 1, 2016 · fec0a46 · fec0a46
1 parent f42f33a
commit fec0a46
Show file tree

Hide file tree

Showing 3 changed files with 291 additions and 20 deletions.
diff --git a/epi_heatmap_from_db.R b/epi_heatmap_from_db.R
@@ -38,9 +38,6 @@ library('yaml')
 if (!file.exists(args$c)) stop("the configuration file can not be found.")
 config <<- yaml.load_file(args$c)
 
-### source("epigeneticFeatures.R")
-### source("epigeneticHeatMapMaker.R")
-
 source(file.path(codeDir, "epigeneticFeatures.R"))
 source(file.path(codeDir, "epigeneticHeatMapMaker.R"))
 
@@ -51,7 +48,9 @@ library(dplyr)
 library(BSgenome)
 library(intSiteRetriever)
 library(devtools)
-source_url('https://raw.githubusercontent.com/BushmanLab/genomicHeatmapMaker/master/utils.R')
+
+#source_url('https://raw.githubusercontent.com/BushmanLab/genomicHeatmapMaker/master/utils.R')
+source(file.path(codeDir, "utils.R"))
 
 available_epigenetic_ref_genomes <- c ("hg18", "mm8")
 

diff --git a/epigeneticHeatMapMaker.R b/epigeneticHeatMapMaker.R
@@ -14,7 +14,7 @@
 #    You should have received a copy of the GNU General Public License
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-libs <- c("survival", "pipeUtils", "colorspace", "hiAnnotator", 
+libs <- c("survival", "hotROCs", "colorspace", "hiAnnotator", 
           "plyr", "reshape2")
 
 loaded <- sapply(libs, library, character.only=TRUE, quietly=TRUE)
@@ -63,19 +63,12 @@ sites_to_epigenetic_heatmap <- function(sites, referenceGenome, output_dir, anno
         }
     }
 
-    sites <- as.data.frame(sites)
-
-    message("Make Heatmap")
-    annotation_columns <- get_annotation_columns(sites)
-
-    rset <- with(sites, ROC.setup(
-      rep(TRUE, nrow(sites)), type, siteID, sampleName))
-    roc.res <- ROC.strata(annotation_columns, rset, add.var=TRUE, sites)
-
-    dcol <- c(rep(rgb(185,185,0,max=185), 70), 
-              rgb(185:0,185:0,0,max=185),
-              rgb(0,0,0:220,max=220), 
-              rep(rgb(0,0,220,max=220), 35))
-    # ROCSVG(roc.res, output_dir, colScale=dcol, overlayCols="white")
-    ROCSVG(roc.res, output_dir)
+    if (config$rocControls == 'unmatched')
+    {
+      sites_to_ROC_ordinary(sites, output_dir)
+    } else if (config$rocControls == 'matched') {
+      sites_to_ROC_matched(sites, output_dir)
+    } else {
+      stop('Error, rocControls is not properly defined in the configuration file.')
+    }
 }
diff --git a/utils.R b/utils.R
@@ -0,0 +1,279 @@
+#    This source code file is a component of the larger INSPIIRED genomic analysis software package.
+#    Copyright (C) 2016 Frederic Bushman
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+library(hotROCs)
+
+getRefSeq_genes <- function(reference_genome) {
+  refSeq <- makeGRanges(
+    getUCSCtable("refGene", "RefSeq Genes", freeze=reference_genome),
+    freeze=reference_genome
+  )
+}
+
+getCpG_islands <- function(reference_genome) {
+  cpg <- getUCSCtable("cpgIslandExt", "CpG Islands", freeze=reference_genome)
+  cpg$strand <- "*" # either strand
+  makeGRanges(cpg, freeze=reference_genome, chromCol='chrom')
+}
+
+getDNaseI <- function(reference_genome) {
+  DNaseI <- getUCSCtable("wgEncodeRegDnaseClustered", 
+                         "DNase Clusters", freeze=reference_genome)
+  DNaseI$strand <- "*" # either strand
+  makeGRanges(DNaseI, freeze=reference_genome, chromCol='chrom')
+}
+
+#' for given samples pull sites from database and construct MRCs
+get_sites_controls_from_db <- function(sampleName_GTSP, referenceGenome, connection) {
+    if ( ! "label" %in% colnames(sampleName_GTSP)) {
+        sampleName_GTSP$label <- sampleName_GTSP$GTSP
+    }
+    sampleName_GTSP <- select(sampleName_GTSP, sampleName, GTSP, label)
+
+    # should have at least two samples
+    stopifnot(length(unique(sampleName_GTSP$GTSP)) != 1)
+
+    sampleName_GTSP$refGenome <- rep(referenceGenome, nrow(sampleName_GTSP))
+
+    # samples should have sites
+    stopifnot(nrow(getUniqueSiteCounts(sampleName_GTSP, connection)) > 1)
+    # also we need at least several sites per sample/replicate
+    stopifnot(is_enough_sites(sampleName_GTSP, connection))
+
+    # check that all samples processed with the same reference genome
+    is_in_db <- setNameExists(sampleName_GTSP, connection)
+    if ( ! all(is_in_db)) {
+        print("The following samples are NOT in the database")
+        print(sampleName_GTSP[ ! is_in_db, ])
+        stop()
+    }
+    #stopifnot(all(setNameExists(sampleName_GTSP, connection)))
+
+    reference_genome_sequence <- get_reference_genome(referenceGenome)
+    get_integration_sites_with_mrcs(sampleName_GTSP, reference_genome_sequence, connection)
+}
+
+add_label <- function(sites, sampleName_GTSP) {
+    sites_GTSP <- merge(sites, sampleName_GTSP)
+    sites_GTSP$sampleName <- sites_GTSP$label
+    sites_GTSP$refGenome <- NULL # not needed downstream
+    sites_GTSP$GTSP <- NULL # not needed downstream
+    sites_GTSP$label <- NULL
+    sites_GTSP
+}
+
+get_integration_sites_with_mrcs <- function(
+    sampleName_GTSP, refGenomeSeq, connection
+) {
+    sites <- getUniqueSites(sampleName_GTSP, connection)
+    sites$type <- "insertion"
+    sites <- add_label(sites, sampleName_GTSP)
+
+    mrcs <- getMRCs(sampleName_GTSP, connection)
+    mrcs$type <- "match"
+    mrcs <- add_label(mrcs, sampleName_GTSP)
+
+    sites_mrcs <- rbind(sites, mrcs)
+
+    sites_mrcs <- makeGRanges(sites_mrcs, soloStart=TRUE,
+        chromCol='chr', strandCol='strand', startCol='position')
+
+    #seqinfo needs to be exact here or trimming will be wrong
+    newSeqInfo <- seqinfo(refGenomeSeq)
+    seqInfo.new2old <- match(seqnames(newSeqInfo),
+        seqnames(seqinfo(sites_mrcs)))
+    seqinfo(sites_mrcs, new2old=seqInfo.new2old) <- newSeqInfo
+
+    sites_mrcs
+}
+
+get_annotation_columns <- function(sites) {
+  granges_column_names <- c("seqnames", "start", "end", "width", "strand")
+  int_site_column_names <- c("siteID", "sampleName", "chr", "strand", "position")
+  required_columns <- unique(c(
+    granges_column_names, int_site_column_names, "type"))
+  stopifnot(all(required_columns %in% names(sites)))
+  setdiff(names(sites), required_columns)
+}
+
+from_counts_to_density <- function(sites, column_prefix, window_size) {
+  metadata <- mcols(sites)
+  sapply(seq(window_size), function(i) {
+    val <- window_size[i]
+    name <- names(window_size)[i]
+    column_name <- paste0(column_prefix, ".", name)
+    metadata[[column_name]] <<- metadata[[column_name]]/val
+  })
+  mcols(sites) <- metadata
+  sites
+}
+
+getPositionalValuesOfFeature <- function(sites, genomicData) {
+  #### Boundary Distances #### Nirav Malani code TODO: refactor into several functions
+  ## (refSeq boundary.dist), Start (refSeq start.dist), non-width (), General (general.width)
+  ## when inGene is FALSE then set following: ref.left.pos, ref.right.pos, ref.left.strand, ref.right.strand
+  ## when inGene is TRUE then set following: ref.start.pos, ref.end.pos, ref.gene.strand
+
+  ## prepare the new columns ##
+  colnam <- paste("ref", c("left.pos", "right.pos", "left.strand", "right.strand", 
+                           "start.pos", "end.pos", "gene.strand"), sep=".") 
+  mcols(sites)[colnam] <- NA
+
+  ## add the respective columns as needed ##
+  ## beware: precede returns range which is following the query and
+  ## follow returns the range which is preceding the query!
+  ## so do a switcheroo in terms of extracting the start & stop ##
+  left <- follow(sites, genomicData, ignore.strand=TRUE)
+  left[is.na(left) | sites$within_refSeq_gene] <- NA
+  rows <- na.omit(left)
+  sites$ref.left.pos[!is.na(left)] <- end(genomicData[rows])
+  sites$ref.left.strand[!is.na(left)] <- as.character(strand(genomicData[rows]))
+
+  right <- precede(sites, genomicData, ignore.strand=TRUE)
+  right[is.na(right) | sites$within_refSeq_gene] <- NA
+  rows <- na.omit(right)
+  sites$ref.right.pos[!is.na(right)] <- start(genomicData[rows])
+  sites$ref.right.strand[!is.na(right)] <- as.character(strand(genomicData[rows]))
+
+  inIt <- findOverlaps(sites, genomicData, ignore.strand=TRUE, select="arbitrary")
+  inIt[is.na(inIt) | !sites$within_refSeq_gene] <- NA
+  rows <- na.omit(inIt)
+  sites$ref.start.pos[!is.na(inIt)] <- start(genomicData[rows])
+  sites$ref.end.pos[!is.na(inIt)] <- end(genomicData[rows])
+  sites$ref.gene.strand[!is.na(inIt)] <- as.character(strand(genomicData[rows]))
+
+  sites$boundary.dist <-
+    eval(expression(pmin((ref.end.pos-position)/(ref.end.pos-ref.start.pos),
+                         (position-ref.start.pos)/(ref.end.pos-ref.start.pos),
+                         (ref.right.pos-position)/(ref.right.pos-ref.left.pos),
+                         (position-ref.left.pos)/(ref.right.pos-ref.left.pos),
+                         na.rm=T)), mcols(sites))
+
+  sites$start.dist <-
+    eval(expression(pmin(ifelse(ref.gene.strand=="-",
+                                (ref.end.pos-position)/(ref.end.pos-ref.start.pos),
+                                (position-ref.start.pos)/(ref.end.pos-ref.start.pos)),
+                         ifelse(ref.right.strand=="-",
+                                (ref.right.pos-position)/(ref.right.pos-ref.left.pos),
+                                NA),
+                         ifelse(ref.left.strand=="+",
+                                (position-ref.left.pos)/(ref.right.pos-ref.left.pos),
+                                NA),na.rm=T)), mcols(sites))
+
+  sites$general.width <- eval(expression(pmin(ref.end.pos-ref.start.pos, 
+                                              ref.right.pos-ref.left.pos,na.rm=T)),
+                              mcols(sites))
+  sites$gene.width <- eval(expression(ref.end.pos-ref.start.pos ), mcols(sites))
+
+  meta <- mcols(sites)
+  meta <- meta[ , ! (names(meta) %in% colnam)]
+  mcols(sites) <- meta
+
+  sites 
+}
+
+#' ROC.stata does not work with too few sites
+is_enough_sites <- function(sampleName_GTSP, connection) {
+     MIN_NUMBER_OF_SITES <- 3
+     n_sites <- getUniqueSiteCounts(sampleName_GTSP, connection) 
+     n_sites$enough_sites <- n_sites$uniqueSites >= MIN_NUMBER_OF_SITES
+     if (all(n_sites$enough_sites)) {
+        return(TRUE) 
+     }
+     message("****************************************")
+     message("The following samples have too few sites to generate heatmap:")
+     print(filter(n_sites, enough_sites == FALSE)) 
+     message("****************************************")
+     FALSE 
+}
+
+#' create a folder containing the SVG outputs and p-value calculations for the (clasically matched) random controls used in ROC calculations
+#'
+#' @param sites_mrcs Granges with sites, controls and features
+sites_to_ROC_old <- function(sites_mrcs, output_dir) {
+    sites_mrcs <- as.data.frame(sites_mrcs)
+    annotation_columns <- get_annotation_columns(sites_mrcs)
+    rset <- with(sites_mrcs, ROC.setup(
+      rep(TRUE, nrow(sites_mrcs)), type, siteID, sampleName))
+    roc.res <- ROC.strata(annotation_columns, rset, add.var=TRUE, sites_mrcs)
+    ROCSVG(roc.res, output_dir)
+}
+
+##' Substitute Median for \code{NA}
+##'
+##' When there is little mising data, a rough-and-ready fill-in method
+##' may be preferred to computationally intensive method of handling
+##' missingness.  For non-parametric ROC curves based on ranks of the
+##' data, using the median (of the non-missing data) as the fill-in is
+##' a fairly innocuous choice.  If there is much missing data, this
+##' method is not advised as it tends to bias the ROC curve area
+##' towards 0.5.
+##' @title na.median
+##' @param x \code{matrix} or \code{data.frame} possibly containing
+##'     \code{NA} values.
+##' @return An object like \code{x}, but with the medians of the
+##'     columns used in place of the \code{NA} values in the
+##'     corresponding columns.
+##' @author Charles Berry
+na.median <-
+  function(x)
+  {
+    if (!is.matrix(x)) x <- as.matrix(x)
+    na.count <- colSums(is.na(x))
+    if (any(na.count != 0 ))
+    {
+      for (i in 1:ncol(x))
+        if (na.count[i]>0){
+          med <- median(x[,i],na.rm=TRUE)
+          x[is.na(x[,i]),i] <- med
+        }
+    }
+    x
+  }
+
+# ROC calculation for Matched Random Controls 
+# This is similar to the way we did this classically for restriction sites.
+sites_to_ROC_matched <- function(sites_mrcs, output_dir) {
+  sites_mrcs <- as.data.frame(sites_mrcs)
+  annotation_columns <- get_annotation_columns(sites_mrcs)
+    roc.res <- ROC.MRC(
+    sites_mrcs[,"type"],
+    sites_mrcs[,"siteID"],
+    na.median(sites_mrcs[,annotation_columns]),
+    sites_mrcs[,"sampleName"])
+  ROCSVG(roc.res, output_dir)
+}
+
+
+# ROC calculation for Ordinary (unmatched) Random Controls
+# Provides the corrected version for unmatched controls.
+# This should roughly match. p-values will differ but should not be
+# qualitatively different.
+
+sites_to_ROC_ordinary <- function(sites_mrcs, output_dir) {
+  sites_mrcs <- as.data.frame(sites_mrcs)
+
+  write.table(sites_mrcs, file='sites_mrcs.epi')
+
+  annotation_columns <- get_annotation_columns(sites_mrcs)
+  roc.res <- ROC.ORC(
+    sites_mrcs[,"type"],
+#   sites_mrcs[,annotation_columns],
+    na.median(sites_mrcs[,annotation_columns]),
+    sites_mrcs[,"sampleName"])
+  ROCSVG(roc.res, output_dir)
+}