Merge pull request #53 from tlnagy/tn/add-multibin-support

[WIP] Add multibin support
tlnagy · Feb 12, 2018 · f3e3235 · f3e3235
2 parents 8d8757d + f1d0b07
commit f3e3235
Show file tree

Hide file tree

Showing 19 changed files with 183 additions and 107 deletions.
diff --git a/REQUIRE b/REQUIRE
@@ -10,3 +10,5 @@ ArgParse
 Compat v0.17.0
 YAML
 DocStringExtensions
+Combinatorics
+DataStructures
diff --git a/docs/make.jl b/docs/make.jl
@@ -11,7 +11,9 @@ module Simulation
                 :DataFrames,
                 :HypothesisTests,
                 :IterTools,
-                :DocStringExtensions]
+                :DocStringExtensions,
+                :Combinatorics,
+                :DataStructures]
 
     for package in packages
         eval(:(using $package))

diff --git a/docs/src/custom.md b/docs/src/custom.md
@@ -197,15 +197,15 @@ Guide.xlabel("log counts bin1"), Guide.ylabel("log counts bin2"))
 And a volcano plot:
 
 ```@example 1
-plot(gene_data, x=:mean, y=:pvalue, color=:class, Theme(highlight_width=0pt),
+plot(gene_data, x=:mean_bin2_div_bin1, y=:pvalue_bin2_div_bin1, color=:class, Theme(highlight_width=0pt),
 Guide.xlabel("mean log2 fold change"), Guide.ylabel("-log10 pvalue"))
 ```
 
 And finally we can see how well we can differentiate between the different
 classes using Area Under the Precision-Recall Curve ([`Simulation.auprc`](@ref))
 
 ```@example 1
-auprc(gene_data[:pvalmeanprod], gene_data[:class], Set([:increasing]))[1]
+auprc(gene_data[:pvalmeanprod_bin2_div_bin1], gene_data[:class], Set([:increasing]))[1]
 ```
 
 [`Simulation.auroc`](@ref) and [`Simulation.venn`](@ref) are also good summary

diff --git a/src/exps/common.jl b/src/exps/common.jl
@@ -87,11 +87,11 @@ end
         end
         local result = 0.0
         if measure == :incdec
-            result = method(abs(subgene[:pvalmeanprod]), subgene[:class], Set([:increasing, :decreasing]))
+            result = method(abs(subgene[:pvalmeanprod_bin2_div_bin1]), subgene[:class], Set([:increasing, :decreasing]))
         elseif measure == :dec
-            result = method(subgene[:pvalmeanprod], subgene[:class], Set([:decreasing]), rev=false)
+            result = method(subgene[:pvalmeanprod_bin2_div_bin1], subgene[:class], Set([:decreasing]), rev=false)
         else
-            result = method(subgene[:pvalmeanprod], subgene[:class], Set([:increasing]))
+            result = method(subgene[:pvalmeanprod_bin2_div_bin1], subgene[:class], Set([:increasing]))
         end
         (typeof(result) <: Tuple) && (result = result[1])
         push!(results, result)

diff --git a/src/exps/compare_methods.jl b/src/exps/compare_methods.jl
@@ -24,7 +24,7 @@ function compare_methods(filepath; debug=false, quiet=false)
         screen.bottleneck_representation = representation
         screen.seq_depth = representation
         if screentype[1] == FacsScreen
-            screen.bin_info = Dict{Symbol, Tuple{Float64, Float64}}(:bin1 => (0.0, 0.25), :bin2 => (0.75, 1.0))
+            screen.bin_info = OrderedDict{Symbol, Tuple{Float64, Float64}}(:bin1 => (0.0, 0.25), :bin2 => (0.75, 1.0))
             max_phenotype_dists = Dict{Symbol, Tuple{Float64, Sampleable}}(
                 :inactive => (0.75, Delta(0.0)),
                 :negcontrol => (0.05, Delta(0.0)),
@@ -79,9 +79,9 @@ function compare_methods(filepath; debug=false, quiet=false)
     compare_reduction_methods = (bc_counts, genes) -> begin
         sig, noi = signal(bc_counts), noise(bc_counts)
 
-        pvalmeanprod = auprc(abs(genes[:pvalmeanprod]), genes[:class], Set([:increasing, :decreasing]))[1]
-        pvalue = auprc(genes[:pvalue], genes[:class], Set([:increasing, :decreasing]))[1]
-        effectsize = auprc(genes[:absmean], genes[:class], Set([:increasing, :decreasing]))[1]
+        pvalmeanprod = auprc(abs(genes[:pvalmeanprod_bin2_div_bin1]), genes[:class], Set([:increasing, :decreasing]))[1]
+        pvalue = auprc(genes[:pvalue_bin2_div_bin1], genes[:class], Set([:increasing, :decreasing]))[1]
+        effectsize = auprc(genes[:absmean_bin2_div_bin1], genes[:class], Set([:increasing, :decreasing]))[1]
 
         (sig, noi, pvalmeanprod, pvalue, effectsize)
     end

diff --git a/src/exps/facs_binning.jl b/src/exps/facs_binning.jl
@@ -5,7 +5,7 @@ function facs_binning(filepath; debug=false, quiet=false)
             :bottleneck_representation => [10, 100, 1000],
             :seq_depth => [10, 100, 1000],
             :σ => [1.0, 1.0, 0.5],
-            :bin_info => [Dict(:bin1 => (0.0, p), :bin2 => (1.0-p, 1.0)) for p in linspace(0.5, 0.025, 30)]
+            :bin_info => [OrderedDict(:bin1 => (0.0, p), :bin2 => (1.0-p, 1.0)) for p in linspace(0.5, 0.025, 30)]
         )
         num_runs = 100
     else
@@ -14,7 +14,7 @@ function facs_binning(filepath; debug=false, quiet=false)
             :bottleneck_representation => [100],
             :seq_depth => [100],
             :σ => [1.0],
-            :bin_info => [Dict(:bin1 => (0.0, 0.25), :bin2 => (0.75, 1.0))]
+            :bin_info => [OrderedDict(:bin1 => (0.0, 0.25), :bin2 => (0.75, 1.0))]
         )
         num_runs = 1
     end

diff --git a/src/exps/facs_binning_snr.jl b/src/exps/facs_binning_snr.jl
@@ -5,7 +5,7 @@ function facs_binning_snr(filepath; debug=false, quiet=false)
             :bottleneck_representation => [10, 100, 1000],
             :seq_depth => [10, 100, 1000],
             :σ => [1.0, 1.0, 0.5],
-            :bin_info => [Dict(:bin1 => (0.0, p), :bin2 => (1.0-p, 1.0)) for p in linspace(0.5, 0.025, 30)]
+            :bin_info => [OrderedDict(:bin1 => (0.0, p), :bin2 => (1.0-p, 1.0)) for p in linspace(0.5, 0.025, 30)]
         )
         num_runs = 100
     else
@@ -14,7 +14,7 @@ function facs_binning_snr(filepath; debug=false, quiet=false)
             :bottleneck_representation => [100],
             :seq_depth => [100],
             :σ => [1.0],
-            :bin_info => [Dict(:bin1 => (0.0, 0.25), :bin2 => (0.75, 1.0))]
+            :bin_info => [OrderedDict(:bin1 => (0.0, 0.25), :bin2 => (0.75, 1.0))]
         )
         num_runs = 1
     end

diff --git a/src/exps/gen_plots.jl b/src/exps/gen_plots.jl
@@ -37,7 +37,7 @@ function gen_plots(filepath; debug=false, quiet=false)
                 Guide.xlabel("log counts t0"), Guide.ylabel("log counts t1")))
                 new_filename = joinpath(curr_dir, "$(basename(filepath))_$(typeof(crisprtype))_$(typeof(screentype))_volcano.svg")
                 draw(SVG(new_filename, 10cm, 10cm), plot(genes,
-                x=:mean, y=:pvalue, color=:class,
+                x=:mean_bin2_div_bin1, y=:pvalue_bin2_div_bin1, color=:class,
                 Scale.color_discrete_manual(colors..., levels=sort(unique(genes[:class]))),
                 Theme(highlight_width=0pt),
                 Guide.title("$(typeof(crisprtype)) $(typeof(screentype))"),

diff --git a/src/exps/growth_sensitivity_library.jl b/src/exps/growth_sensitivity_library.jl
@@ -43,8 +43,8 @@ function growth_sensitivity_library(filepath; debug=false, quiet=false)
     runs = grouped_param_space(GrowthScreen(), parameters, libs, [:num_bottlenecks], num_runs);
 
     function compute_auprc(barcodes, genes)
-        i = auprc(genes[:pvalmeanprod], genes[:class], Set([:increasing]))[1]
-        d = auprc(genes[:pvalmeanprod], genes[:class], Set([:decreasing]), rev=false)[1]
+        i = auprc(genes[:pvalmeanprod_bin2_div_bin1], genes[:class], Set([:increasing]))[1]
+        d = auprc(genes[:pvalmeanprod_bin2_div_bin1], genes[:class], Set([:decreasing]), rev=false)[1]
         (i, d)
     end
 

diff --git a/src/simulation/common.jl b/src/simulation/common.jl
@@ -102,7 +102,7 @@ type FacsScreen <: ScreenSetup
     The 5th percentile of cells sorted according to their phenotype (fluorescence,
     size, etc) will be compared to the 95th percentile.
     """
-    bin_info::Dict{Symbol, Tuple{Float64, Float64}}
+    bin_info::Associative{Symbol, Tuple{Float64, Float64}}
 
     "Number of cells sorted expressed as an integer multiple of the number of guides"
     bottleneck_representation::Int
@@ -113,7 +113,7 @@ type FacsScreen <: ScreenSetup
     seq_depth::Int
 
     function FacsScreen()
-        new(500, 5, 100, 0.25, 1.0, Dict(:bin1 => (0.0, 1/3), :bin2 => (2/3, 1.0)), 1000, 1000)
+        new(500, 5, 100, 0.25, 1.0, OrderedDict(:bin1 => (0.0, 1/3), :bin2 => (2/3, 1.0)), 1000, 1000)
     end
 end
 

diff --git a/src/simulation/library.jl b/src/simulation/library.jl
@@ -98,7 +98,33 @@ type Library
 
     """
     Maximum phenotype categories mapped to their probability of being
-    selected and the distribution to draw from if they are selected
+    selected and the distribution to draw from if they are selected.
+
+    ## Example
+
+    The basic layout is a `Dict` mapping a class name to a tuple of the
+    probability of selecting this class and then the
+    [`Distributions.Sampleable`](https://juliastats.github.io/Distributions.jl/latest/types.html#Sampleable-1)
+    from which to draw a random phenotype from this class. The probabilities
+    across all the classes should add up to 1.
+
+    ```julia
+    max_phenotype_dists = Dict{Symbol, Tuple{Float64, Sampleable}}(
+        :inactive => (0.60, Delta(0.0)),
+        :negcontrol => (0.1, Delta(0.0)),
+        :increasing => (0.3, TruncatedNormal(0.1, 0.1, 0.025, 1)),
+    );
+    Library(max_phenotype_dists, CRISPRi());
+    ```
+
+    For example, here we are making three different classes of "genes": the
+    first group are :inactive, i.e. they have no phenotype, so we'll set their
+    phenotypes to 0.0 using a [`Simulation.Delta`](@ref). We'll also make them
+    60% of all the genes. The second group are the negative controls :negcontrol
+    (the only required group) which make up 10% of the population of genes and
+    also have no effect. The final group is :increasing which makes up 30% of
+    all genes and which are represented by a Normal(μ=0.1, σ=0.1) distribution
+    clamped between 0.025 and 1.
     """
     max_phenotype_dists::Dict{Int, Tuple{Symbol, Sampleable}}
     phenotype_probs::Categorical

diff --git a/src/simulation/load.jl b/src/simulation/load.jl
@@ -5,7 +5,9 @@ packages = [:StatsBase,
             :DataFrames,
             :HypothesisTests,
             :IterTools,
-            :DocStringExtensions]
+            :DocStringExtensions,
+            :Combinatorics,
+            :DataStructures]
 
 for package in packages
     eval(:(using $package))

diff --git a/src/simulation/processing.jl b/src/simulation/processing.jl
@@ -4,8 +4,9 @@ $(SIGNATURES)
 Given the raw data from [`Simulation.sequencing`](@ref) returns two DataFrames
 
 1. `guide_data`: This DataFrame contains the per-guide level data including the
-    log2 fold change in the normalized frequencies of each guide between the two
-    bins.
+    log2 fold change in the normalized frequencies of each guide between each
+    pairwise combination of bins. Thus, if there are `n` bins, then it computes
+    the log2 fold changes for the ``\\frac{n!}{2(n-2)!}`` combinations
 
 2. `gene_data`: This DataFrame contains the same information but grouped by
     gene. The log2 fold change data from the first DataFrame is used to calculate
@@ -14,54 +15,36 @@ Given the raw data from [`Simulation.sequencing`](@ref) returns two DataFrames
     measure of how consistently shifted the guides are of this gene versus the
     population of negative control guides. (see below for more info)
 
-A typical `guide_data` DataFrame contains the following columns:
-
-- `gene`: the gene ID of that this guide targets
-
-- `knockdown`: activity of the guide on 0 to 1 scale, where 1 is complete knockout
-
-- `barcodeid`: the ID of this specific guide
-
-- `theo_phenotype`: expected phenotype of this guide, generally a -1 to 1 scale
-
-- `behavior`: whether the target gene displays a linear or sigmoidal response to knockdown
-
-- `class`: whether the target gene has a positive, negative, or no phenotype during screening
-
-- `initial_freq`: frequency of guide post-transfection (see [`Simulation.transfect`](@ref))
-
-- `counts_bin1`: the raw number of reads for each guide in the first bin
-
-- `freqs_bin1`: the number of reads for each guide divided by the total number of reads in this bin
-
-- `rel_freqs_bin1`: the frequency of each guide divided by the median frequency of negative control guides
-
-- `counts_bin2`: the raw number of reads for each guide in the second bin
-
-- `freqs_bin2`: the number of reads for each guide divided by the total number of reads in this bin
-
-- `rel_freqs_bin2`: the frequency of each guide divided by the median frequency of negative control guides for this bin
-
-- `log2fc_bin2`: the log2 fold change in relative guide frequencies between the two bins
+A typical 2 bin `guide_data` DataFrame contains the following columns:
+
+| Column Name  | Meaning  |
+|:--|:--|
+| `gene` | the gene ID of that this guide targets|
+| `knockdown` | activity of the guide on 0 to 1 scale, where 1 is complete knockout|
+| `barcodeid` | the ID of this specific guide|
+| `theo_phenotype` | expected phenotype of this guide, generally a -1 to 1 scale|
+| `behavior` | whether the target gene displays a linear or sigmoidal response to incomplete knockdown (see [`Simulation.Library`](@ref) for more details)|
+| `class` | which phenotype distribution the target gene was drawn from (see [`Simulation.Library`](@ref) for more details). Serves as the "ground truth" label against which screen performance is evaluated, e.g. with [`Simulation.auprc`](@ref) |
+| `initial_freq` | frequency of guide post-transfection (see [`Simulation.transfect`](@ref))|
+| `counts_bin1` | the raw number of reads for each guide in the first bin|
+| `freqs_bin1` | the number of reads for each guide divided by the total number of reads in this bin|
+| `rel_freqs_bin1` | the frequency of each guide divided by the median frequency of negative control guides|
+| `counts_bin2` | the raw number of reads for each guide in the second bin|
+| `freqs_bin2` | the number of reads for each guide divided by the total number of reads in this bin|
+| `rel_freqs_bin2` | the frequency of each guide divided by the median frequency of negative control guides for this bin|
+| `log2fc_bin2_div_bin1` | the log2 fold change in relative guide frequencies between `bin2` and `bin1`, equivalent to `log2(rel_freqs_bin2/rel_freqs_bin1)` |
 
 A typical `gene_data` DataFrame contains the following data:
 
-- `gene`: this gene's ID
-
-- `behavior`: whether this gene displays a linear or sigmoidal response to knockdown
-
-- `class`: whether this gene has a positive, negative, or no phenotype during screening
-
-- `mean`: the mean log 2 fold change in relative frequencies between the two bins
-    for all the guides targeting this gene.
-
-- `pvalue`: the -log10 pvalue of the log2 fold changes of all guides targeting
-    this gene as computed by the non-parametric Mann-Whitney U-test. A measure
-    of the consistency of the log 2 fold changes[^1]
-
-- `absmean`: absolute value of `mean` per-gene
-
-- `pvalmeanprod`: `mean` multiplied with the `pvalue` per-gene
+| Column Name  | Meaning  |
+|:--|:--|
+| `gene` | this gene's ID|
+| `class` | see above |
+| `behavior` | see above |
+| `mean_bin2_div_bin1` | the mean log 2 fold change in relative frequencies between from `bin1` to `bin2` for all the guides targeting this gene. Calculated as ``\\frac{1}{k}\\sum_k \\text{log2fc_bin2_div_bin1}_k`` for the ``k`` guides targeting each gene |
+| `pvalue_bin2_div_bin1` | the -log10 pvalue of the log2 fold changes of all guides targeting this gene as computed by the non-parametric Mann-Whitney U-test. A measure of the consistency of the log 2 fold changes[^1]|
+| `absmean_bin2_div_bin1` | absolute value of `mean_bin2_div_bin1` per-gene |
+| `pvalmeanprod_bin2_div_bin1` | `mean_bin2_div_bin1` multiplied with the `pvalue_bin2_div_bin1` per-gene|
 
 # Further reading
 
@@ -70,10 +53,7 @@ A typical `gene_data` DataFrame contains the following data:
     *Proc Natl Acad Sci U S A*. 2013;110:E2317–26.
 
 """
-function differences_between_bins(raw_data::Associative{Symbol, DataFrame};
-                                  first_bin=:bin1,
-                                  last_bin=maximum(keys(raw_data)))
-
+function differences_between_bins(raw_data::Associative{Symbol, DataFrame})
     for (bin, seq_data) in raw_data
         sort!(seq_data, cols=[:barcodeid])
         # add a pseudocount of 0.5 to every value to prevent -Inf's when
@@ -89,31 +69,40 @@ function differences_between_bins(raw_data::Associative{Symbol, DataFrame};
         seq_data[:rel_freqs] = seq_data[:freqs] ./ med
     end
 
-    combined = copy(raw_data[first_bin])
-    rename!(combined, Dict(:freqs => Symbol("freqs_", first_bin),
-                           :counts => Symbol("counts_", first_bin),
-                           :rel_freqs => Symbol("rel_freqs_", first_bin)))
-
-    for (bin, seq_data) in raw_data
-        (bin == first_bin) && continue
-
-        combined[Symbol("freqs_", bin)] = seq_data[:freqs]
-        combined[Symbol("counts_", bin)] = seq_data[:counts]
-        combined[Symbol("rel_freqs_", bin)] = seq_data[:rel_freqs]
-        combined[Symbol("log2fc_", bin)] =
-        log2(combined[Symbol("rel_freqs_", bin)]./combined[Symbol("rel_freqs_", first_bin)])
-    end
-
-    nonnegs = combined[combined[:class] .!= :negcontrol, :]
-    negcontrols = combined[combined[:class] .== :negcontrol, Symbol("log2fc_", last_bin)]
+    bin1 = first(raw_data)[2]
+    cols_to_copy = [col for col in names(bin1) if !(col in (:counts, :freqs, :rel_freqs))]
+    guide_data = bin1[cols_to_copy]
+    gene_data = DataFrame()
+
+    # processed bins; so we don't re-add bins when doing all combos
+    proc_bins = Set{Symbol}()
+
+    # compute pairwise log 2 fold changes between every combination of bins
+    for bins in combinations(collect(keys(raw_data)), 2)
+        for bin in bins
+            (bin in proc_bins) && continue
+            push!(proc_bins, bin)
+            guide_data[[Symbol("counts_", bin), Symbol("freqs_", bin), Symbol("rel_freqs_", bin)]] =
+                raw_data[bin][[:counts, :freqs, :rel_freqs]]
+        end
+        suffix = "_$(bins[2])_div_$(bins[1])"
+        log2fc_col = Symbol(:log2fc, suffix)
+        guide_data[log2fc_col] = log2(guide_data[Symbol(:rel_freqs_, bins[2])]
+                                        ./guide_data[Symbol(:rel_freqs_, bins[1])])
+
+        nonnegs = guide_data[guide_data[:class] .!= :negcontrol, :]
+        negcontrols = guide_data[guide_data[:class] .== :negcontrol, log2fc_col]
+
+        tmp = by(nonnegs, [:gene, :behavior, :class]) do barcodes
+            log2fcs = barcodes[log2fc_col]
+            result = MannWhitneyUTest(log2fcs, negcontrols)
+            DataFrame(pvalue = -log10(pvalue(result)), mean= mean(log2fcs))
+        end
+        gene_data[[:gene, :behavior, :class, Symbol(:pvalue, suffix), Symbol(:mean, suffix)]] = tmp[[:gene, :behavior, :class, :pvalue, :mean]]
+        gene_data[Symbol(:absmean, suffix)] = abs(gene_data[Symbol(:mean, suffix)])
+        gene_data[Symbol(:pvalmeanprod, suffix)] = gene_data[Symbol(:mean, suffix)] .* gene_data[Symbol(:pvalue, suffix)]
 
-    genes = by(nonnegs, [:gene, :behavior, :class]) do barcodes
-        log2fcs = barcodes[Symbol("log2fc_", last_bin)]
-        result = MannWhitneyUTest(log2fcs, negcontrols)
-        DataFrame(pvalue = -log10(pvalue(result)), mean= mean(log2fcs))
     end
-    genes[:absmean] = abs(genes[:mean])
-    genes[:pvalmeanprod] = genes[:mean] .* genes[:pvalue]
 
-    combined, genes
+    guide_data, gene_data
 end
diff --git a/src/simulation/selection.jl b/src/simulation/selection.jl
@@ -24,7 +24,7 @@ function select(setup::FacsScreen,
     end
     indices = sortperm(observed)
     cells = cells[indices]
-    results = Dict{Symbol, Vector{Int}}()
+    results = OrderedDict{Symbol, Vector{Int}}()
 
     for (binname, cutoffs) in bins
         left = clamp(round(Int, cutoffs[1]*n_cells), 1, n_cells)
@@ -82,5 +82,5 @@ function select(setup::GrowthScreen,
         copy!(view(cellmat, :), view(output_c, picked))
         copy!(view(cpmat, :), view(output_p, picked))
     end
-    return Dict(:bin1 => initial_cells, :bin2 => cellmat)
+    return OrderedDict(:bin1 => initial_cells, :bin2 => cellmat)
 end
diff --git a/src/simulation/sequencing.jl b/src/simulation/sequencing.jl
@@ -4,7 +4,7 @@ $(SIGNATURES)
 Converts raw counts to frequencies by dividing by the total number of reads
 for each sample.
 """
-function counts_to_freqs(bin_cells::Dict{Symbol, Vector{Int}}, guide_count::Int)
+function counts_to_freqs(bin_cells::Associative{Symbol, Vector{Int}}, guide_count::Int)
     results = Dict{Symbol, Vector{Float64}}()
     for (bin, cells) in bin_cells
         counts = StatsBase.counts(cells, 1:guide_count)
@@ -23,7 +23,7 @@ dict of DataFrames with the bin names as the keys. This object can then be passe
 through [`Simulation.counts_to_freqs`](@ref) followed by
 [`Simulation.differences_between_bins`](@ref).
 """
-function sequencing(depths::Dict{Symbol, Int}, guides::Vector{Barcode}, samples::Dict{Symbol, Vector{Float64}})
+function sequencing(depths::Associative{Symbol, Int}, guides::Vector{Barcode}, samples::Associative{Symbol, Vector{Float64}})
     @assert all(Bool[haskey(depths, key) for key in keys(samples)]) "Supply exactly one sequencing depth per sample"
     sequencing_results = Dict{Symbol, DataFrame}()
     colnames = [fld for fld in fieldnames(Barcode) if fld != :obs_phenotype]