Skip to content

Commit

Permalink
Merge pull request #53 from tlnagy/tn/add-multibin-support
Browse files Browse the repository at this point in the history
[WIP] Add multibin support
  • Loading branch information
tlnagy authored Feb 12, 2018
2 parents 8d8757d + f1d0b07 commit f3e3235
Show file tree
Hide file tree
Showing 19 changed files with 183 additions and 107 deletions.
2 changes: 2 additions & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ ArgParse
Compat v0.17.0
YAML
DocStringExtensions
Combinatorics
DataStructures
4 changes: 3 additions & 1 deletion docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ module Simulation
:DataFrames,
:HypothesisTests,
:IterTools,
:DocStringExtensions]
:DocStringExtensions,
:Combinatorics,
:DataStructures]

for package in packages
eval(:(using $package))
Expand Down
4 changes: 2 additions & 2 deletions docs/src/custom.md
Original file line number Diff line number Diff line change
Expand Up @@ -197,15 +197,15 @@ Guide.xlabel("log counts bin1"), Guide.ylabel("log counts bin2"))
And a volcano plot:

```@example 1
plot(gene_data, x=:mean, y=:pvalue, color=:class, Theme(highlight_width=0pt),
plot(gene_data, x=:mean_bin2_div_bin1, y=:pvalue_bin2_div_bin1, color=:class, Theme(highlight_width=0pt),
Guide.xlabel("mean log2 fold change"), Guide.ylabel("-log10 pvalue"))
```

And finally we can see how well we can differentiate between the different
classes using Area Under the Precision-Recall Curve ([`Simulation.auprc`](@ref))

```@example 1
auprc(gene_data[:pvalmeanprod], gene_data[:class], Set([:increasing]))[1]
auprc(gene_data[:pvalmeanprod_bin2_div_bin1], gene_data[:class], Set([:increasing]))[1]
```

[`Simulation.auroc`](@ref) and [`Simulation.venn`](@ref) are also good summary
Expand Down
6 changes: 3 additions & 3 deletions src/exps/common.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,11 @@ end
end
local result = 0.0
if measure == :incdec
result = method(abs(subgene[:pvalmeanprod]), subgene[:class], Set([:increasing, :decreasing]))
result = method(abs(subgene[:pvalmeanprod_bin2_div_bin1]), subgene[:class], Set([:increasing, :decreasing]))
elseif measure == :dec
result = method(subgene[:pvalmeanprod], subgene[:class], Set([:decreasing]), rev=false)
result = method(subgene[:pvalmeanprod_bin2_div_bin1], subgene[:class], Set([:decreasing]), rev=false)
else
result = method(subgene[:pvalmeanprod], subgene[:class], Set([:increasing]))
result = method(subgene[:pvalmeanprod_bin2_div_bin1], subgene[:class], Set([:increasing]))
end
(typeof(result) <: Tuple) && (result = result[1])
push!(results, result)
Expand Down
8 changes: 4 additions & 4 deletions src/exps/compare_methods.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ function compare_methods(filepath; debug=false, quiet=false)
screen.bottleneck_representation = representation
screen.seq_depth = representation
if screentype[1] == FacsScreen
screen.bin_info = Dict{Symbol, Tuple{Float64, Float64}}(:bin1 => (0.0, 0.25), :bin2 => (0.75, 1.0))
screen.bin_info = OrderedDict{Symbol, Tuple{Float64, Float64}}(:bin1 => (0.0, 0.25), :bin2 => (0.75, 1.0))
max_phenotype_dists = Dict{Symbol, Tuple{Float64, Sampleable}}(
:inactive => (0.75, Delta(0.0)),
:negcontrol => (0.05, Delta(0.0)),
Expand Down Expand Up @@ -79,9 +79,9 @@ function compare_methods(filepath; debug=false, quiet=false)
compare_reduction_methods = (bc_counts, genes) -> begin
sig, noi = signal(bc_counts), noise(bc_counts)

pvalmeanprod = auprc(abs(genes[:pvalmeanprod]), genes[:class], Set([:increasing, :decreasing]))[1]
pvalue = auprc(genes[:pvalue], genes[:class], Set([:increasing, :decreasing]))[1]
effectsize = auprc(genes[:absmean], genes[:class], Set([:increasing, :decreasing]))[1]
pvalmeanprod = auprc(abs(genes[:pvalmeanprod_bin2_div_bin1]), genes[:class], Set([:increasing, :decreasing]))[1]
pvalue = auprc(genes[:pvalue_bin2_div_bin1], genes[:class], Set([:increasing, :decreasing]))[1]
effectsize = auprc(genes[:absmean_bin2_div_bin1], genes[:class], Set([:increasing, :decreasing]))[1]

(sig, noi, pvalmeanprod, pvalue, effectsize)
end
Expand Down
4 changes: 2 additions & 2 deletions src/exps/facs_binning.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ function facs_binning(filepath; debug=false, quiet=false)
:bottleneck_representation => [10, 100, 1000],
:seq_depth => [10, 100, 1000],
=> [1.0, 1.0, 0.5],
:bin_info => [Dict(:bin1 => (0.0, p), :bin2 => (1.0-p, 1.0)) for p in linspace(0.5, 0.025, 30)]
:bin_info => [OrderedDict(:bin1 => (0.0, p), :bin2 => (1.0-p, 1.0)) for p in linspace(0.5, 0.025, 30)]
)
num_runs = 100
else
Expand All @@ -14,7 +14,7 @@ function facs_binning(filepath; debug=false, quiet=false)
:bottleneck_representation => [100],
:seq_depth => [100],
=> [1.0],
:bin_info => [Dict(:bin1 => (0.0, 0.25), :bin2 => (0.75, 1.0))]
:bin_info => [OrderedDict(:bin1 => (0.0, 0.25), :bin2 => (0.75, 1.0))]
)
num_runs = 1
end
Expand Down
4 changes: 2 additions & 2 deletions src/exps/facs_binning_snr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ function facs_binning_snr(filepath; debug=false, quiet=false)
:bottleneck_representation => [10, 100, 1000],
:seq_depth => [10, 100, 1000],
=> [1.0, 1.0, 0.5],
:bin_info => [Dict(:bin1 => (0.0, p), :bin2 => (1.0-p, 1.0)) for p in linspace(0.5, 0.025, 30)]
:bin_info => [OrderedDict(:bin1 => (0.0, p), :bin2 => (1.0-p, 1.0)) for p in linspace(0.5, 0.025, 30)]
)
num_runs = 100
else
Expand All @@ -14,7 +14,7 @@ function facs_binning_snr(filepath; debug=false, quiet=false)
:bottleneck_representation => [100],
:seq_depth => [100],
=> [1.0],
:bin_info => [Dict(:bin1 => (0.0, 0.25), :bin2 => (0.75, 1.0))]
:bin_info => [OrderedDict(:bin1 => (0.0, 0.25), :bin2 => (0.75, 1.0))]
)
num_runs = 1
end
Expand Down
2 changes: 1 addition & 1 deletion src/exps/gen_plots.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ function gen_plots(filepath; debug=false, quiet=false)
Guide.xlabel("log counts t0"), Guide.ylabel("log counts t1")))
new_filename = joinpath(curr_dir, "$(basename(filepath))_$(typeof(crisprtype))_$(typeof(screentype))_volcano.svg")
draw(SVG(new_filename, 10cm, 10cm), plot(genes,
x=:mean, y=:pvalue, color=:class,
x=:mean_bin2_div_bin1, y=:pvalue_bin2_div_bin1, color=:class,
Scale.color_discrete_manual(colors..., levels=sort(unique(genes[:class]))),
Theme(highlight_width=0pt),
Guide.title("$(typeof(crisprtype)) $(typeof(screentype))"),
Expand Down
4 changes: 2 additions & 2 deletions src/exps/growth_sensitivity_library.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ function growth_sensitivity_library(filepath; debug=false, quiet=false)
runs = grouped_param_space(GrowthScreen(), parameters, libs, [:num_bottlenecks], num_runs);

function compute_auprc(barcodes, genes)
i = auprc(genes[:pvalmeanprod], genes[:class], Set([:increasing]))[1]
d = auprc(genes[:pvalmeanprod], genes[:class], Set([:decreasing]), rev=false)[1]
i = auprc(genes[:pvalmeanprod_bin2_div_bin1], genes[:class], Set([:increasing]))[1]
d = auprc(genes[:pvalmeanprod_bin2_div_bin1], genes[:class], Set([:decreasing]), rev=false)[1]
(i, d)
end

Expand Down
4 changes: 2 additions & 2 deletions src/simulation/common.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ type FacsScreen <: ScreenSetup
The 5th percentile of cells sorted according to their phenotype (fluorescence,
size, etc) will be compared to the 95th percentile.
"""
bin_info::Dict{Symbol, Tuple{Float64, Float64}}
bin_info::Associative{Symbol, Tuple{Float64, Float64}}

"Number of cells sorted expressed as an integer multiple of the number of guides"
bottleneck_representation::Int
Expand All @@ -113,7 +113,7 @@ type FacsScreen <: ScreenSetup
seq_depth::Int

function FacsScreen()
new(500, 5, 100, 0.25, 1.0, Dict(:bin1 => (0.0, 1/3), :bin2 => (2/3, 1.0)), 1000, 1000)
new(500, 5, 100, 0.25, 1.0, OrderedDict(:bin1 => (0.0, 1/3), :bin2 => (2/3, 1.0)), 1000, 1000)
end
end

Expand Down
28 changes: 27 additions & 1 deletion src/simulation/library.jl
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,33 @@ type Library

"""
Maximum phenotype categories mapped to their probability of being
selected and the distribution to draw from if they are selected
selected and the distribution to draw from if they are selected.
## Example
The basic layout is a `Dict` mapping a class name to a tuple of the
probability of selecting this class and then the
[`Distributions.Sampleable`](https://juliastats.github.io/Distributions.jl/latest/types.html#Sampleable-1)
from which to draw a random phenotype from this class. The probabilities
across all the classes should add up to 1.
```julia
max_phenotype_dists = Dict{Symbol, Tuple{Float64, Sampleable}}(
:inactive => (0.60, Delta(0.0)),
:negcontrol => (0.1, Delta(0.0)),
:increasing => (0.3, TruncatedNormal(0.1, 0.1, 0.025, 1)),
);
Library(max_phenotype_dists, CRISPRi());
```
For example, here we are making three different classes of "genes": the
first group are :inactive, i.e. they have no phenotype, so we'll set their
phenotypes to 0.0 using a [`Simulation.Delta`](@ref). We'll also make them
60% of all the genes. The second group are the negative controls :negcontrol
(the only required group) which make up 10% of the population of genes and
also have no effect. The final group is :increasing which makes up 30% of
all genes and which are represented by a Normal(μ=0.1, σ=0.1) distribution
clamped between 0.025 and 1.
"""
max_phenotype_dists::Dict{Int, Tuple{Symbol, Sampleable}}
phenotype_probs::Categorical
Expand Down
4 changes: 3 additions & 1 deletion src/simulation/load.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ packages = [:StatsBase,
:DataFrames,
:HypothesisTests,
:IterTools,
:DocStringExtensions]
:DocStringExtensions,
:Combinatorics,
:DataStructures]

for package in packages
eval(:(using $package))
Expand Down
139 changes: 64 additions & 75 deletions src/simulation/processing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ $(SIGNATURES)
Given the raw data from [`Simulation.sequencing`](@ref) returns two DataFrames
1. `guide_data`: This DataFrame contains the per-guide level data including the
log2 fold change in the normalized frequencies of each guide between the two
bins.
log2 fold change in the normalized frequencies of each guide between each
pairwise combination of bins. Thus, if there are `n` bins, then it computes
the log2 fold changes for the ``\\frac{n!}{2(n-2)!}`` combinations
2. `gene_data`: This DataFrame contains the same information but grouped by
gene. The log2 fold change data from the first DataFrame is used to calculate
Expand All @@ -14,54 +15,36 @@ Given the raw data from [`Simulation.sequencing`](@ref) returns two DataFrames
measure of how consistently shifted the guides are of this gene versus the
population of negative control guides. (see below for more info)
A typical `guide_data` DataFrame contains the following columns:
- `gene`: the gene ID of that this guide targets
- `knockdown`: activity of the guide on 0 to 1 scale, where 1 is complete knockout
- `barcodeid`: the ID of this specific guide
- `theo_phenotype`: expected phenotype of this guide, generally a -1 to 1 scale
- `behavior`: whether the target gene displays a linear or sigmoidal response to knockdown
- `class`: whether the target gene has a positive, negative, or no phenotype during screening
- `initial_freq`: frequency of guide post-transfection (see [`Simulation.transfect`](@ref))
- `counts_bin1`: the raw number of reads for each guide in the first bin
- `freqs_bin1`: the number of reads for each guide divided by the total number of reads in this bin
- `rel_freqs_bin1`: the frequency of each guide divided by the median frequency of negative control guides
- `counts_bin2`: the raw number of reads for each guide in the second bin
- `freqs_bin2`: the number of reads for each guide divided by the total number of reads in this bin
- `rel_freqs_bin2`: the frequency of each guide divided by the median frequency of negative control guides for this bin
- `log2fc_bin2`: the log2 fold change in relative guide frequencies between the two bins
A typical 2 bin `guide_data` DataFrame contains the following columns:
| Column Name | Meaning |
|:--|:--|
| `gene` | the gene ID of that this guide targets|
| `knockdown` | activity of the guide on 0 to 1 scale, where 1 is complete knockout|
| `barcodeid` | the ID of this specific guide|
| `theo_phenotype` | expected phenotype of this guide, generally a -1 to 1 scale|
| `behavior` | whether the target gene displays a linear or sigmoidal response to incomplete knockdown (see [`Simulation.Library`](@ref) for more details)|
| `class` | which phenotype distribution the target gene was drawn from (see [`Simulation.Library`](@ref) for more details). Serves as the "ground truth" label against which screen performance is evaluated, e.g. with [`Simulation.auprc`](@ref) |
| `initial_freq` | frequency of guide post-transfection (see [`Simulation.transfect`](@ref))|
| `counts_bin1` | the raw number of reads for each guide in the first bin|
| `freqs_bin1` | the number of reads for each guide divided by the total number of reads in this bin|
| `rel_freqs_bin1` | the frequency of each guide divided by the median frequency of negative control guides|
| `counts_bin2` | the raw number of reads for each guide in the second bin|
| `freqs_bin2` | the number of reads for each guide divided by the total number of reads in this bin|
| `rel_freqs_bin2` | the frequency of each guide divided by the median frequency of negative control guides for this bin|
| `log2fc_bin2_div_bin1` | the log2 fold change in relative guide frequencies between `bin2` and `bin1`, equivalent to `log2(rel_freqs_bin2/rel_freqs_bin1)` |
A typical `gene_data` DataFrame contains the following data:
- `gene`: this gene's ID
- `behavior`: whether this gene displays a linear or sigmoidal response to knockdown
- `class`: whether this gene has a positive, negative, or no phenotype during screening
- `mean`: the mean log 2 fold change in relative frequencies between the two bins
for all the guides targeting this gene.
- `pvalue`: the -log10 pvalue of the log2 fold changes of all guides targeting
this gene as computed by the non-parametric Mann-Whitney U-test. A measure
of the consistency of the log 2 fold changes[^1]
- `absmean`: absolute value of `mean` per-gene
- `pvalmeanprod`: `mean` multiplied with the `pvalue` per-gene
| Column Name | Meaning |
|:--|:--|
| `gene` | this gene's ID|
| `class` | see above |
| `behavior` | see above |
| `mean_bin2_div_bin1` | the mean log 2 fold change in relative frequencies between from `bin1` to `bin2` for all the guides targeting this gene. Calculated as ``\\frac{1}{k}\\sum_k \\text{log2fc_bin2_div_bin1}_k`` for the ``k`` guides targeting each gene |
| `pvalue_bin2_div_bin1` | the -log10 pvalue of the log2 fold changes of all guides targeting this gene as computed by the non-parametric Mann-Whitney U-test. A measure of the consistency of the log 2 fold changes[^1]|
| `absmean_bin2_div_bin1` | absolute value of `mean_bin2_div_bin1` per-gene |
| `pvalmeanprod_bin2_div_bin1` | `mean_bin2_div_bin1` multiplied with the `pvalue_bin2_div_bin1` per-gene|
# Further reading
Expand All @@ -70,10 +53,7 @@ A typical `gene_data` DataFrame contains the following data:
*Proc Natl Acad Sci U S A*. 2013;110:E2317–26.
"""
function differences_between_bins(raw_data::Associative{Symbol, DataFrame};
first_bin=:bin1,
last_bin=maximum(keys(raw_data)))

function differences_between_bins(raw_data::Associative{Symbol, DataFrame})
for (bin, seq_data) in raw_data
sort!(seq_data, cols=[:barcodeid])
# add a pseudocount of 0.5 to every value to prevent -Inf's when
Expand All @@ -89,31 +69,40 @@ function differences_between_bins(raw_data::Associative{Symbol, DataFrame};
seq_data[:rel_freqs] = seq_data[:freqs] ./ med
end

combined = copy(raw_data[first_bin])
rename!(combined, Dict(:freqs => Symbol("freqs_", first_bin),
:counts => Symbol("counts_", first_bin),
:rel_freqs => Symbol("rel_freqs_", first_bin)))

for (bin, seq_data) in raw_data
(bin == first_bin) && continue

combined[Symbol("freqs_", bin)] = seq_data[:freqs]
combined[Symbol("counts_", bin)] = seq_data[:counts]
combined[Symbol("rel_freqs_", bin)] = seq_data[:rel_freqs]
combined[Symbol("log2fc_", bin)] =
log2(combined[Symbol("rel_freqs_", bin)]./combined[Symbol("rel_freqs_", first_bin)])
end

nonnegs = combined[combined[:class] .!= :negcontrol, :]
negcontrols = combined[combined[:class] .== :negcontrol, Symbol("log2fc_", last_bin)]
bin1 = first(raw_data)[2]
cols_to_copy = [col for col in names(bin1) if !(col in (:counts, :freqs, :rel_freqs))]
guide_data = bin1[cols_to_copy]
gene_data = DataFrame()

# processed bins; so we don't re-add bins when doing all combos
proc_bins = Set{Symbol}()

# compute pairwise log 2 fold changes between every combination of bins
for bins in combinations(collect(keys(raw_data)), 2)
for bin in bins
(bin in proc_bins) && continue
push!(proc_bins, bin)
guide_data[[Symbol("counts_", bin), Symbol("freqs_", bin), Symbol("rel_freqs_", bin)]] =
raw_data[bin][[:counts, :freqs, :rel_freqs]]
end
suffix = "_$(bins[2])_div_$(bins[1])"
log2fc_col = Symbol(:log2fc, suffix)
guide_data[log2fc_col] = log2(guide_data[Symbol(:rel_freqs_, bins[2])]
./guide_data[Symbol(:rel_freqs_, bins[1])])

nonnegs = guide_data[guide_data[:class] .!= :negcontrol, :]
negcontrols = guide_data[guide_data[:class] .== :negcontrol, log2fc_col]

tmp = by(nonnegs, [:gene, :behavior, :class]) do barcodes
log2fcs = barcodes[log2fc_col]
result = MannWhitneyUTest(log2fcs, negcontrols)
DataFrame(pvalue = -log10(pvalue(result)), mean= mean(log2fcs))
end
gene_data[[:gene, :behavior, :class, Symbol(:pvalue, suffix), Symbol(:mean, suffix)]] = tmp[[:gene, :behavior, :class, :pvalue, :mean]]
gene_data[Symbol(:absmean, suffix)] = abs(gene_data[Symbol(:mean, suffix)])
gene_data[Symbol(:pvalmeanprod, suffix)] = gene_data[Symbol(:mean, suffix)] .* gene_data[Symbol(:pvalue, suffix)]

genes = by(nonnegs, [:gene, :behavior, :class]) do barcodes
log2fcs = barcodes[Symbol("log2fc_", last_bin)]
result = MannWhitneyUTest(log2fcs, negcontrols)
DataFrame(pvalue = -log10(pvalue(result)), mean= mean(log2fcs))
end
genes[:absmean] = abs(genes[:mean])
genes[:pvalmeanprod] = genes[:mean] .* genes[:pvalue]

combined, genes
guide_data, gene_data
end
4 changes: 2 additions & 2 deletions src/simulation/selection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ function select(setup::FacsScreen,
end
indices = sortperm(observed)
cells = cells[indices]
results = Dict{Symbol, Vector{Int}}()
results = OrderedDict{Symbol, Vector{Int}}()

for (binname, cutoffs) in bins
left = clamp(round(Int, cutoffs[1]*n_cells), 1, n_cells)
Expand Down Expand Up @@ -82,5 +82,5 @@ function select(setup::GrowthScreen,
copy!(view(cellmat, :), view(output_c, picked))
copy!(view(cpmat, :), view(output_p, picked))
end
return Dict(:bin1 => initial_cells, :bin2 => cellmat)
return OrderedDict(:bin1 => initial_cells, :bin2 => cellmat)
end
4 changes: 2 additions & 2 deletions src/simulation/sequencing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ $(SIGNATURES)
Converts raw counts to frequencies by dividing by the total number of reads
for each sample.
"""
function counts_to_freqs(bin_cells::Dict{Symbol, Vector{Int}}, guide_count::Int)
function counts_to_freqs(bin_cells::Associative{Symbol, Vector{Int}}, guide_count::Int)
results = Dict{Symbol, Vector{Float64}}()
for (bin, cells) in bin_cells
counts = StatsBase.counts(cells, 1:guide_count)
Expand All @@ -23,7 +23,7 @@ dict of DataFrames with the bin names as the keys. This object can then be passe
through [`Simulation.counts_to_freqs`](@ref) followed by
[`Simulation.differences_between_bins`](@ref).
"""
function sequencing(depths::Dict{Symbol, Int}, guides::Vector{Barcode}, samples::Dict{Symbol, Vector{Float64}})
function sequencing(depths::Associative{Symbol, Int}, guides::Vector{Barcode}, samples::Associative{Symbol, Vector{Float64}})
@assert all(Bool[haskey(depths, key) for key in keys(samples)]) "Supply exactly one sequencing depth per sample"
sequencing_results = Dict{Symbol, DataFrame}()
colnames = [fld for fld in fieldnames(Barcode) if fld != :obs_phenotype]
Expand Down
Loading

0 comments on commit f3e3235

Please sign in to comment.