From 286e981bda8b3b64b0428908ceb6b2a6cb541ca5 Mon Sep 17 00:00:00 2001 From: duncan ralph Date: Sun, 25 Aug 2024 15:08:01 -0700 Subject: [PATCH] update for ashni 15 day mice (+ misc) --- bin/parse-output.py | 2 +- bin/run-paired-loci.sh | 4 ++-- projects/cf-gcdyn.py | 9 ++++++--- projects/replay-plot.py | 8 ++++---- python/datautils.py | 2 +- python/plotting.py | 2 +- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/bin/parse-output.py b/bin/parse-output.py index 3b0efdaae..5506ed2e3 100755 --- a/bin/parse-output.py +++ b/bin/parse-output.py @@ -189,7 +189,7 @@ class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.Argumen print(' no cluster path in input file, so just using all %d sequences (in %d clusters) in annotations' % (sum(len(c) for c in clusters_to_use), len(clusters_to_use))) else: ipartition = cpath.i_best if args.partition_index is None else args.partition_index - print(' found %d clusters in %s' % (len(cpath.partitions[ipartition]), 'best partition' if args.partition_index is None else 'partition at index %d (of %d)' % (ipartition, len(cpath.partitions)))) + print(' found %d clusters with %d seqs in %s' % (len(cpath.partitions[ipartition]), sum(len(c) for c in cpath.partitions[ipartition]), 'best partition' if args.partition_index is None else 'partition at index %d (of %d)' % (ipartition, len(cpath.partitions)))) modified = False if args.cluster_index is None: clusters_to_use = cpath.partitions[ipartition] diff --git a/bin/run-paired-loci.sh b/bin/run-paired-loci.sh index b3588733e..309da514f 100755 --- a/bin/run-paired-loci.sh +++ b/bin/run-paired-loci.sh @@ -24,8 +24,8 @@ common="--n-sub-procs 15 --n-max-procs 5 --single-light-locus igk --base-outdir # echo $bin --label pairfix --version v1 --n-replicates 3 --n-leaves-list hist --n-sim-events-list 3000 --scratch-mute-freq-list 0.07 --bulk-data-fraction-list 0:0.5:0.8:0.9:0.95 --simu-extra-args=\"--flat-mute-freq --same-mute-freq-for-all-seqs\" --inference-extra-args=\"--pair-unpaired-seqs-with-paired-family\" --final-plot-xvar bulk-data-fraction --perf-metrics all-pcfrac:f1:precision:sensitivity --make-hist-plots --use-val-cfgs --empty-bin-range 0:200 $common # echo $bin --label test-antn --version imbal-v3 --n-replicates 2 --tree-imbalance-list None:0.04:0.07 --scratch-mute-freq-list 0.15 --n-leaves-list 50 --simu-extra-args=\"--flat-mute-freq --same-mute-freq-for-all-seqs\" --n-sim-events-list 50 --antn-perf --perf-metrics naive-hdist $common # NOTE also made :0.13:0.14:0.16 # echo $bin --label bcr-phylo-antn --version v0 --n-replicates 2 --obs-times-list 50:150:300 --n-sim-seqs-per-generation-list 15:45 --context-depend-list 1 --simu-type bcr-phylo --dont-observe-common-ancestors --antn-perf --perf-metrics naive-hdist $common -simu_extra="--simu-extra-args=\"--target-distance 10 --context-depend 1 --tdist-weights random-uniform --min-target-distance 2 --n-sim-seqs-per-generation 89 --parameter-variances n-sim-seqs-per-generation,23 --aa-paratope-positions N=60 --aa-struct-positions N=100 --leaf-sampling-scheme high-affinity\"" -echo $bin --label gct-valid --version v5 --n-replicates 3 --obs-times-list 15:20:30:40:50 --n-sim-events-list 70 --carry-cap-list 1000 --simu-type bcr-phylo --perf-metrics coar:rf:mrca --calc-antns --inference-extra-args=\"--no-indels --simultaneous-true-clonal-seqs\" --plot-metrics tree-perf --final-plot-xvar obs-times --final-plot-xvar obs-times $simu_extra $common # NOTE also have sampling times 10, 100, 150 for most methods +simu_extra="--simu-extra-args=\"--target-distance 10 --context-depend 1 --tdist-weights random-uniform --min-target-distance 2 --n-sim-seqs-per-generation 89 --parameter-variances n-sim-seqs-per-generation,23 --aa-paratope-positions N=60 --aa-struct-positions N=100 --leaf-sampling-scheme high-affinity --n-naive-seq-copies 100\"" +echo $bin --label gct-valid --version v6 --n-replicates 3 --obs-times-list 15:20:30:40:50 --n-sim-events-list 70 --carry-cap-list 1000 --simu-type bcr-phylo --perf-metrics coar:rf:mrca --calc-antns --inference-extra-args=\"--no-indels --simultaneous-true-clonal-seqs\" --plot-metrics tree-perf --final-plot-xvar obs-times --final-plot-xvar obs-times $simu_extra $common # NOTE also have sampling times 10, 100, 150 for most methods # echo $bin --label gct-valid --version gcdyn-v1 --n-replicates 2 --simu-type gcdyn --n-sim-events-list 70 --obs-times-list 15:30 --perf-metrics coar:rf:mrca --calc-antns --inference-extra-args=\"--no-indels --simultaneous-true-clonal-seqs\" --plot-metrics tree-perf $common # NOTE have to set --n-sub-procs to 1 for partition step, and re-set --n-sim-events-list for each --n-leaves value (500 leaves: 10 events, 100:50, 50:100): diff --git a/projects/cf-gcdyn.py b/projects/cf-gcdyn.py index 8fb15d999..535aebcdd 100755 --- a/projects/cf-gcdyn.py +++ b/projects/cf-gcdyn.py @@ -50,6 +50,7 @@ parser.add_argument('--n-trials-list') parser.add_argument('--dl-bundle-size-list', help='size of bundles during dl inference (must be equal to or less than simulation bundle size)') parser.add_argument('--epochs-list') +parser.add_argument('--batch-size-list') parser.add_argument('--dropout-rate-list') parser.add_argument('--learning-rate-list') parser.add_argument('--ema-momentum-list') @@ -66,11 +67,11 @@ # parser.add_argument('--gcreplay-data-dir', default='/fh/fast/matsen_e/%s/gcdyn/gcreplay-observed'%os.getenv('USER')) parser.add_argument('--gcreplay-germline-dir', default='datascripts/meta/taraki-gctree-2021-10/germlines') parser.add_argument('--dl-model-dir') -parser.add_argument('--data-dir', default='/fh/fast/matsen_e/data/taraki-gctree-2021-10/beast-processed-data/v0') +parser.add_argument('--data-dir', default='/fh/fast/matsen_e/data/taraki-gctree-2021-10/beast-processed-data/v4') args = parser.parse_args() args.scan_vars = { 'simu' : ['seed', 'birth-response', 'xscale-values', 'xshift-values', 'xscale-range', 'xshift-range', 'yscale-range', 'initial-birth-rate-range', 'carry-cap-range', 'init-population', 'time-to-sampling-range', 'n-seqs-range', 'n-trials', 'simu-bundle-size'], - 'dl-infer' : ['dl-bundle-size', 'epochs', 'dropout-rate', 'learning-rate', 'ema-momentum', 'prebundle-layer-cfg', 'dont-scale-params', 'params-to-predict'], + 'dl-infer' : ['dl-bundle-size', 'epochs', 'batch-size', 'dropout-rate', 'learning-rate', 'ema-momentum', 'prebundle-layer-cfg', 'dont-scale-params', 'params-to-predict'], 'data' : ['data-samples'], } args.scan_vars['group-expts'] = copy.deepcopy(args.scan_vars['dl-infer']) @@ -161,7 +162,9 @@ def add_scan_args(cmd, skip_fcn=None): # using nargs='+' syntax for these rathe if action in ['simu', 'check-dl', 'merge-simu']: cmd = 'gcd-simulate' if action in ['simu', 'check-dl'] else 'python %s/scripts/%s.py' % (args.gcddir, 'combine-simu-files.py') if action in ['simu', 'check-dl']: - cmd += ' --outdir %s --tree-inference-method iqtree --debug 1' % odr # --debug 1 + cmd += ' --outdir %s --debug 1' % odr # --debug 1 + # --make-plots + # --tree-inference-method iqtree if args.test: cmd += ' --test' cmd = add_scan_args(cmd, skip_fcn=lambda v: v not in args.scan_vars[action] or action=='check-dl' and v not in check_dl_args) diff --git a/projects/replay-plot.py b/projects/replay-plot.py index 66e8000b9..83069e567 100755 --- a/projects/replay-plot.py +++ b/projects/replay-plot.py @@ -29,8 +29,8 @@ colors = { 'gct-data' : '#cc0000', - 'gct-data-d15' : '#006600', - 'gct-data-d20' : '#cc0000', + 'gct-data-d15' : '#ea7979', + 'gct-data-d20' : '#cc0000', 'gct-data-w10' : '#2b65ec', 'bst-data-d20' : '#006600', 'iqt-data' : '#a821c7', @@ -441,7 +441,7 @@ def compare_plots(htype, plotdir, hists, labels, hname, diff_vals): """ parser = argparse.ArgumentParser(usage=ustr) parser.add_argument('--gcreplay-dir', default='/fh/fast/matsen_e/data/taraki-gctree-2021-10/gcreplay', help='dir with gctree results on gcreplay data from which we read seqs, affinity, mutation info, and trees)') -parser.add_argument('--beast-dir', default='/fh/fast/matsen_e/data/taraki-gctree-2021-10/beast-processed-data/v3', help='dir with beast results on gcreplay data (same format as simulation)') +parser.add_argument('--beast-dir', default='/fh/fast/matsen_e/data/taraki-gctree-2021-10/beast-processed-data/v4', help='dir with beast results on gcreplay data (same format as simulation)') parser.add_argument('--iqtree-data-dir', default='/fh/fast/matsen_e/data/taraki-gctree-2021-10/iqtree-processed-data/v1', help='dir with iqtree results on gcreplay data (from datascripts/taraki-gctree-2021-10/iqtree-run.py then projects/gcdyn/scripts/data-parse.py') parser.add_argument('--simu-like-dir', help='Dir from which to read simulation results, either from gcdyn or bcr-phylo (if the latter, set --bcr-phylo)') parser.add_argument('--outdir') @@ -457,7 +457,7 @@ def compare_plots(htype, plotdir, hists, labels, hname, diff_vals): parser.add_argument("--random-seed", type=int, default=1) parser.add_argument("--default-naive-affinity", type=float, default=1./100, help="this is the default for bcr-phylo, so maybe be correct if we don\'t have an unmutated sequence") args = parser.parse_args() -args.plot_labels = utils.get_arg_list(args.plot_labels, choices=['gct-data', 'gct-data-d15', 'gct-data-d20', 'gct-data-w10', 'bst-data-d20', 'iqt-data', 'iqt-data-d20', 'simu', 'simu-iqtree']) +args.plot_labels = utils.get_arg_list(args.plot_labels, choices=['gct-data', 'gct-data-d15', 'gct-data-d20', 'gct-data-w10', 'bst-data-d15', 'bst-data-d20', 'iqt-data', 'iqt-data-d20', 'simu', 'simu-iqtree']) if len(args.plot_labels) > 3 and not args.write_legend_only_plots: print(' note; setting --write-legend-only-plots since --plot-labels is longer than 3') args.write_legend_only_plots = True diff --git a/python/datautils.py b/python/datautils.py index 8ecb82533..e85642fe5 100644 --- a/python/datautils.py +++ b/python/datautils.py @@ -29,7 +29,7 @@ def reverse_gcid(gcid): # ---------------------------------------------------------------------------------------- def fix_btt_id(gcid): - mstr = utils.get_single_entry(re.findall('btt-PR-.-.', gcid)) + mstr = utils.get_single_entry(re.findall('btt-PR-.-[0-9][0-9]*', gcid)) btstr, prstr, prn1, prn2 = mstr.split('-') assert btstr == 'btt' and prstr == 'PR' return gcid.replace(mstr, 'PR%d.%02d' % (int(prn1), int(prn2))) diff --git a/python/plotting.py b/python/plotting.py index 4f20ee8c0..c0349bb29 100644 --- a/python/plotting.py +++ b/python/plotting.py @@ -27,7 +27,7 @@ from . import hutils from .clusterpath import ClusterPath -# green dark red light blue light red sky blue pink/purple grey +# green dark red light blue light red sky blue pink/purple grey default_colors = ['#006600', '#990012', '#2b65ec', '#cc0000', '#3399ff', '#a821c7', '#808080'] default_linewidths = ['5', '3', '2', '2', '2'] default_markersizes = ['20', '15', '8', '5', '5', '5']