-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
updated ROC calls to new hotROC calls
- Loading branch information
1 parent
f42f33a
commit fec0a46
Showing
3 changed files
with
291 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,279 @@ | ||
# This source code file is a component of the larger INSPIIRED genomic analysis software package. | ||
# Copyright (C) 2016 Frederic Bushman | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
# | ||
|
||
library(hotROCs) | ||
|
||
getRefSeq_genes <- function(reference_genome) { | ||
refSeq <- makeGRanges( | ||
getUCSCtable("refGene", "RefSeq Genes", freeze=reference_genome), | ||
freeze=reference_genome | ||
) | ||
} | ||
|
||
getCpG_islands <- function(reference_genome) { | ||
cpg <- getUCSCtable("cpgIslandExt", "CpG Islands", freeze=reference_genome) | ||
cpg$strand <- "*" # either strand | ||
makeGRanges(cpg, freeze=reference_genome, chromCol='chrom') | ||
} | ||
|
||
getDNaseI <- function(reference_genome) { | ||
DNaseI <- getUCSCtable("wgEncodeRegDnaseClustered", | ||
"DNase Clusters", freeze=reference_genome) | ||
DNaseI$strand <- "*" # either strand | ||
makeGRanges(DNaseI, freeze=reference_genome, chromCol='chrom') | ||
} | ||
|
||
#' for given samples pull sites from database and construct MRCs | ||
get_sites_controls_from_db <- function(sampleName_GTSP, referenceGenome, connection) { | ||
if ( ! "label" %in% colnames(sampleName_GTSP)) { | ||
sampleName_GTSP$label <- sampleName_GTSP$GTSP | ||
} | ||
sampleName_GTSP <- select(sampleName_GTSP, sampleName, GTSP, label) | ||
|
||
# should have at least two samples | ||
stopifnot(length(unique(sampleName_GTSP$GTSP)) != 1) | ||
|
||
sampleName_GTSP$refGenome <- rep(referenceGenome, nrow(sampleName_GTSP)) | ||
|
||
# samples should have sites | ||
stopifnot(nrow(getUniqueSiteCounts(sampleName_GTSP, connection)) > 1) | ||
# also we need at least several sites per sample/replicate | ||
stopifnot(is_enough_sites(sampleName_GTSP, connection)) | ||
|
||
# check that all samples processed with the same reference genome | ||
is_in_db <- setNameExists(sampleName_GTSP, connection) | ||
if ( ! all(is_in_db)) { | ||
print("The following samples are NOT in the database") | ||
print(sampleName_GTSP[ ! is_in_db, ]) | ||
stop() | ||
} | ||
#stopifnot(all(setNameExists(sampleName_GTSP, connection))) | ||
|
||
reference_genome_sequence <- get_reference_genome(referenceGenome) | ||
get_integration_sites_with_mrcs(sampleName_GTSP, reference_genome_sequence, connection) | ||
} | ||
|
||
add_label <- function(sites, sampleName_GTSP) { | ||
sites_GTSP <- merge(sites, sampleName_GTSP) | ||
sites_GTSP$sampleName <- sites_GTSP$label | ||
sites_GTSP$refGenome <- NULL # not needed downstream | ||
sites_GTSP$GTSP <- NULL # not needed downstream | ||
sites_GTSP$label <- NULL | ||
sites_GTSP | ||
} | ||
|
||
get_integration_sites_with_mrcs <- function( | ||
sampleName_GTSP, refGenomeSeq, connection | ||
) { | ||
sites <- getUniqueSites(sampleName_GTSP, connection) | ||
sites$type <- "insertion" | ||
sites <- add_label(sites, sampleName_GTSP) | ||
|
||
mrcs <- getMRCs(sampleName_GTSP, connection) | ||
mrcs$type <- "match" | ||
mrcs <- add_label(mrcs, sampleName_GTSP) | ||
|
||
sites_mrcs <- rbind(sites, mrcs) | ||
|
||
sites_mrcs <- makeGRanges(sites_mrcs, soloStart=TRUE, | ||
chromCol='chr', strandCol='strand', startCol='position') | ||
|
||
#seqinfo needs to be exact here or trimming will be wrong | ||
newSeqInfo <- seqinfo(refGenomeSeq) | ||
seqInfo.new2old <- match(seqnames(newSeqInfo), | ||
seqnames(seqinfo(sites_mrcs))) | ||
seqinfo(sites_mrcs, new2old=seqInfo.new2old) <- newSeqInfo | ||
|
||
sites_mrcs | ||
} | ||
|
||
get_annotation_columns <- function(sites) { | ||
granges_column_names <- c("seqnames", "start", "end", "width", "strand") | ||
int_site_column_names <- c("siteID", "sampleName", "chr", "strand", "position") | ||
required_columns <- unique(c( | ||
granges_column_names, int_site_column_names, "type")) | ||
stopifnot(all(required_columns %in% names(sites))) | ||
setdiff(names(sites), required_columns) | ||
} | ||
|
||
from_counts_to_density <- function(sites, column_prefix, window_size) { | ||
metadata <- mcols(sites) | ||
sapply(seq(window_size), function(i) { | ||
val <- window_size[i] | ||
name <- names(window_size)[i] | ||
column_name <- paste0(column_prefix, ".", name) | ||
metadata[[column_name]] <<- metadata[[column_name]]/val | ||
}) | ||
mcols(sites) <- metadata | ||
sites | ||
} | ||
|
||
getPositionalValuesOfFeature <- function(sites, genomicData) { | ||
#### Boundary Distances #### Nirav Malani code TODO: refactor into several functions | ||
## (refSeq boundary.dist), Start (refSeq start.dist), non-width (), General (general.width) | ||
## when inGene is FALSE then set following: ref.left.pos, ref.right.pos, ref.left.strand, ref.right.strand | ||
## when inGene is TRUE then set following: ref.start.pos, ref.end.pos, ref.gene.strand | ||
|
||
## prepare the new columns ## | ||
colnam <- paste("ref", c("left.pos", "right.pos", "left.strand", "right.strand", | ||
"start.pos", "end.pos", "gene.strand"), sep=".") | ||
mcols(sites)[colnam] <- NA | ||
|
||
## add the respective columns as needed ## | ||
## beware: precede returns range which is following the query and | ||
## follow returns the range which is preceding the query! | ||
## so do a switcheroo in terms of extracting the start & stop ## | ||
left <- follow(sites, genomicData, ignore.strand=TRUE) | ||
left[is.na(left) | sites$within_refSeq_gene] <- NA | ||
rows <- na.omit(left) | ||
sites$ref.left.pos[!is.na(left)] <- end(genomicData[rows]) | ||
sites$ref.left.strand[!is.na(left)] <- as.character(strand(genomicData[rows])) | ||
|
||
right <- precede(sites, genomicData, ignore.strand=TRUE) | ||
right[is.na(right) | sites$within_refSeq_gene] <- NA | ||
rows <- na.omit(right) | ||
sites$ref.right.pos[!is.na(right)] <- start(genomicData[rows]) | ||
sites$ref.right.strand[!is.na(right)] <- as.character(strand(genomicData[rows])) | ||
|
||
inIt <- findOverlaps(sites, genomicData, ignore.strand=TRUE, select="arbitrary") | ||
inIt[is.na(inIt) | !sites$within_refSeq_gene] <- NA | ||
rows <- na.omit(inIt) | ||
sites$ref.start.pos[!is.na(inIt)] <- start(genomicData[rows]) | ||
sites$ref.end.pos[!is.na(inIt)] <- end(genomicData[rows]) | ||
sites$ref.gene.strand[!is.na(inIt)] <- as.character(strand(genomicData[rows])) | ||
|
||
sites$boundary.dist <- | ||
eval(expression(pmin((ref.end.pos-position)/(ref.end.pos-ref.start.pos), | ||
(position-ref.start.pos)/(ref.end.pos-ref.start.pos), | ||
(ref.right.pos-position)/(ref.right.pos-ref.left.pos), | ||
(position-ref.left.pos)/(ref.right.pos-ref.left.pos), | ||
na.rm=T)), mcols(sites)) | ||
|
||
sites$start.dist <- | ||
eval(expression(pmin(ifelse(ref.gene.strand=="-", | ||
(ref.end.pos-position)/(ref.end.pos-ref.start.pos), | ||
(position-ref.start.pos)/(ref.end.pos-ref.start.pos)), | ||
ifelse(ref.right.strand=="-", | ||
(ref.right.pos-position)/(ref.right.pos-ref.left.pos), | ||
NA), | ||
ifelse(ref.left.strand=="+", | ||
(position-ref.left.pos)/(ref.right.pos-ref.left.pos), | ||
NA),na.rm=T)), mcols(sites)) | ||
|
||
sites$general.width <- eval(expression(pmin(ref.end.pos-ref.start.pos, | ||
ref.right.pos-ref.left.pos,na.rm=T)), | ||
mcols(sites)) | ||
sites$gene.width <- eval(expression(ref.end.pos-ref.start.pos ), mcols(sites)) | ||
|
||
meta <- mcols(sites) | ||
meta <- meta[ , ! (names(meta) %in% colnam)] | ||
mcols(sites) <- meta | ||
|
||
sites | ||
} | ||
|
||
#' ROC.stata does not work with too few sites | ||
is_enough_sites <- function(sampleName_GTSP, connection) { | ||
MIN_NUMBER_OF_SITES <- 3 | ||
n_sites <- getUniqueSiteCounts(sampleName_GTSP, connection) | ||
n_sites$enough_sites <- n_sites$uniqueSites >= MIN_NUMBER_OF_SITES | ||
if (all(n_sites$enough_sites)) { | ||
return(TRUE) | ||
} | ||
message("****************************************") | ||
message("The following samples have too few sites to generate heatmap:") | ||
print(filter(n_sites, enough_sites == FALSE)) | ||
message("****************************************") | ||
FALSE | ||
} | ||
|
||
#' create a folder containing the SVG outputs and p-value calculations for the (clasically matched) random controls used in ROC calculations | ||
#' | ||
#' @param sites_mrcs Granges with sites, controls and features | ||
sites_to_ROC_old <- function(sites_mrcs, output_dir) { | ||
sites_mrcs <- as.data.frame(sites_mrcs) | ||
annotation_columns <- get_annotation_columns(sites_mrcs) | ||
rset <- with(sites_mrcs, ROC.setup( | ||
rep(TRUE, nrow(sites_mrcs)), type, siteID, sampleName)) | ||
roc.res <- ROC.strata(annotation_columns, rset, add.var=TRUE, sites_mrcs) | ||
ROCSVG(roc.res, output_dir) | ||
} | ||
|
||
##' Substitute Median for \code{NA} | ||
##' | ||
##' When there is little mising data, a rough-and-ready fill-in method | ||
##' may be preferred to computationally intensive method of handling | ||
##' missingness. For non-parametric ROC curves based on ranks of the | ||
##' data, using the median (of the non-missing data) as the fill-in is | ||
##' a fairly innocuous choice. If there is much missing data, this | ||
##' method is not advised as it tends to bias the ROC curve area | ||
##' towards 0.5. | ||
##' @title na.median | ||
##' @param x \code{matrix} or \code{data.frame} possibly containing | ||
##' \code{NA} values. | ||
##' @return An object like \code{x}, but with the medians of the | ||
##' columns used in place of the \code{NA} values in the | ||
##' corresponding columns. | ||
##' @author Charles Berry | ||
na.median <- | ||
function(x) | ||
{ | ||
if (!is.matrix(x)) x <- as.matrix(x) | ||
na.count <- colSums(is.na(x)) | ||
if (any(na.count != 0 )) | ||
{ | ||
for (i in 1:ncol(x)) | ||
if (na.count[i]>0){ | ||
med <- median(x[,i],na.rm=TRUE) | ||
x[is.na(x[,i]),i] <- med | ||
} | ||
} | ||
x | ||
} | ||
|
||
# ROC calculation for Matched Random Controls | ||
# This is similar to the way we did this classically for restriction sites. | ||
sites_to_ROC_matched <- function(sites_mrcs, output_dir) { | ||
sites_mrcs <- as.data.frame(sites_mrcs) | ||
annotation_columns <- get_annotation_columns(sites_mrcs) | ||
roc.res <- ROC.MRC( | ||
sites_mrcs[,"type"], | ||
sites_mrcs[,"siteID"], | ||
na.median(sites_mrcs[,annotation_columns]), | ||
sites_mrcs[,"sampleName"]) | ||
ROCSVG(roc.res, output_dir) | ||
} | ||
|
||
|
||
# ROC calculation for Ordinary (unmatched) Random Controls | ||
# Provides the corrected version for unmatched controls. | ||
# This should roughly match. p-values will differ but should not be | ||
# qualitatively different. | ||
|
||
sites_to_ROC_ordinary <- function(sites_mrcs, output_dir) { | ||
sites_mrcs <- as.data.frame(sites_mrcs) | ||
|
||
write.table(sites_mrcs, file='sites_mrcs.epi') | ||
|
||
annotation_columns <- get_annotation_columns(sites_mrcs) | ||
roc.res <- ROC.ORC( | ||
sites_mrcs[,"type"], | ||
# sites_mrcs[,annotation_columns], | ||
na.median(sites_mrcs[,annotation_columns]), | ||
sites_mrcs[,"sampleName"]) | ||
ROCSVG(roc.res, output_dir) | ||
} |