From 286e981bda8b3b64b0428908ceb6b2a6cb541ca5 Mon Sep 17 00:00:00 2001
From: duncan ralph <dkralph@gmail.com>
Date: Sun, 25 Aug 2024 15:08:01 -0700
Subject: [PATCH] update for ashni 15 day mice (+ misc)

---
 bin/parse-output.py     | 2 +-
 bin/run-paired-loci.sh  | 4 ++--
 projects/cf-gcdyn.py    | 9 ++++++---
 projects/replay-plot.py | 8 ++++----
 python/datautils.py     | 2 +-
 python/plotting.py      | 2 +-
 6 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/bin/parse-output.py b/bin/parse-output.py
index 3b0efdaae..5506ed2e3 100755
--- a/bin/parse-output.py
+++ b/bin/parse-output.py
@@ -189,7 +189,7 @@ class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.Argumen
     print('  no cluster path in input file, so just using all %d sequences (in %d clusters) in annotations' % (sum(len(c) for c in clusters_to_use), len(clusters_to_use)))
 else:
     ipartition = cpath.i_best if args.partition_index is None else args.partition_index
-    print('  found %d clusters in %s' % (len(cpath.partitions[ipartition]), 'best partition' if args.partition_index is None else 'partition at index %d (of %d)' % (ipartition, len(cpath.partitions))))
+    print('  found %d clusters with %d seqs in %s' % (len(cpath.partitions[ipartition]), sum(len(c) for c in cpath.partitions[ipartition]), 'best partition' if args.partition_index is None else 'partition at index %d (of %d)' % (ipartition, len(cpath.partitions))))
     modified = False
     if args.cluster_index is None:
         clusters_to_use = cpath.partitions[ipartition]
diff --git a/bin/run-paired-loci.sh b/bin/run-paired-loci.sh
index b3588733e..309da514f 100755
--- a/bin/run-paired-loci.sh
+++ b/bin/run-paired-loci.sh
@@ -24,8 +24,8 @@ common="--n-sub-procs 15 --n-max-procs 5 --single-light-locus igk --base-outdir
 # echo $bin --label pairfix --version v1 --n-replicates 3 --n-leaves-list hist --n-sim-events-list 3000 --scratch-mute-freq-list 0.07 --bulk-data-fraction-list 0:0.5:0.8:0.9:0.95 --simu-extra-args=\"--flat-mute-freq --same-mute-freq-for-all-seqs\" --inference-extra-args=\"--pair-unpaired-seqs-with-paired-family\" --final-plot-xvar bulk-data-fraction --perf-metrics all-pcfrac:f1:precision:sensitivity --make-hist-plots --use-val-cfgs --empty-bin-range 0:200 $common
 # echo $bin --label test-antn --version imbal-v3   --n-replicates 2 --tree-imbalance-list None:0.04:0.07 --scratch-mute-freq-list 0.15 --n-leaves-list 50 --simu-extra-args=\"--flat-mute-freq --same-mute-freq-for-all-seqs\" --n-sim-events-list 50 --antn-perf --perf-metrics naive-hdist $common  # NOTE also made :0.13:0.14:0.16
 # echo $bin --label bcr-phylo-antn --version v0   --n-replicates 2 --obs-times-list 50:150:300 --n-sim-seqs-per-generation-list 15:45 --context-depend-list 1 --simu-type bcr-phylo --dont-observe-common-ancestors --antn-perf --perf-metrics naive-hdist $common
-simu_extra="--simu-extra-args=\"--target-distance 10 --context-depend 1 --tdist-weights random-uniform --min-target-distance 2 --n-sim-seqs-per-generation 89 --parameter-variances n-sim-seqs-per-generation,23 --aa-paratope-positions N=60 --aa-struct-positions N=100 --leaf-sampling-scheme high-affinity\""
-echo $bin --label gct-valid --version v5 --n-replicates 3 --obs-times-list 15:20:30:40:50 --n-sim-events-list 70 --carry-cap-list 1000 --simu-type bcr-phylo --perf-metrics coar:rf:mrca --calc-antns --inference-extra-args=\"--no-indels --simultaneous-true-clonal-seqs\" --plot-metrics tree-perf --final-plot-xvar obs-times --final-plot-xvar obs-times $simu_extra $common  # NOTE also have sampling times 10, 100, 150 for most methods
+simu_extra="--simu-extra-args=\"--target-distance 10 --context-depend 1 --tdist-weights random-uniform --min-target-distance 2 --n-sim-seqs-per-generation 89 --parameter-variances n-sim-seqs-per-generation,23 --aa-paratope-positions N=60 --aa-struct-positions N=100 --leaf-sampling-scheme high-affinity --n-naive-seq-copies 100\""
+echo $bin --label gct-valid --version v6 --n-replicates 3 --obs-times-list 15:20:30:40:50 --n-sim-events-list 70 --carry-cap-list 1000 --simu-type bcr-phylo --perf-metrics coar:rf:mrca --calc-antns --inference-extra-args=\"--no-indels --simultaneous-true-clonal-seqs\" --plot-metrics tree-perf --final-plot-xvar obs-times $simu_extra $common  # NOTE also have sampling times 10, 100, 150 for most methods
 # echo $bin --label gct-valid --version gcdyn-v1 --n-replicates 2 --simu-type gcdyn --n-sim-events-list 70 --obs-times-list 15:30 --perf-metrics coar:rf:mrca --calc-antns --inference-extra-args=\"--no-indels --simultaneous-true-clonal-seqs\" --plot-metrics tree-perf $common
 
 # NOTE have to set --n-sub-procs to 1 for partition step, and re-set --n-sim-events-list for each --n-leaves value (500 leaves: 10 events, 100:50, 50:100):
diff --git a/projects/cf-gcdyn.py b/projects/cf-gcdyn.py
index 8fb15d999..535aebcdd 100755
--- a/projects/cf-gcdyn.py
+++ b/projects/cf-gcdyn.py
@@ -50,6 +50,7 @@
 parser.add_argument('--n-trials-list')
 parser.add_argument('--dl-bundle-size-list', help='size of bundles during dl inference (must be equal to or less than simulation bundle size)')
 parser.add_argument('--epochs-list')
+parser.add_argument('--batch-size-list')
 parser.add_argument('--dropout-rate-list')
 parser.add_argument('--learning-rate-list')
 parser.add_argument('--ema-momentum-list')
@@ -66,11 +67,11 @@
 # parser.add_argument('--gcreplay-data-dir', default='/fh/fast/matsen_e/%s/gcdyn/gcreplay-observed'%os.getenv('USER'))
 parser.add_argument('--gcreplay-germline-dir', default='datascripts/meta/taraki-gctree-2021-10/germlines')
 parser.add_argument('--dl-model-dir')
-parser.add_argument('--data-dir', default='/fh/fast/matsen_e/data/taraki-gctree-2021-10/beast-processed-data/v0')
+parser.add_argument('--data-dir', default='/fh/fast/matsen_e/data/taraki-gctree-2021-10/beast-processed-data/v4')
 args = parser.parse_args()
 args.scan_vars = {
     'simu' : ['seed', 'birth-response', 'xscale-values', 'xshift-values', 'xscale-range', 'xshift-range', 'yscale-range', 'initial-birth-rate-range', 'carry-cap-range', 'init-population', 'time-to-sampling-range', 'n-seqs-range', 'n-trials', 'simu-bundle-size'],
-    'dl-infer' : ['dl-bundle-size', 'epochs', 'dropout-rate', 'learning-rate', 'ema-momentum', 'prebundle-layer-cfg', 'dont-scale-params', 'params-to-predict'],
+    'dl-infer' : ['dl-bundle-size', 'epochs', 'batch-size', 'dropout-rate', 'learning-rate', 'ema-momentum', 'prebundle-layer-cfg', 'dont-scale-params', 'params-to-predict'],
     'data' : ['data-samples'],
 }
 args.scan_vars['group-expts'] = copy.deepcopy(args.scan_vars['dl-infer'])
@@ -161,7 +162,9 @@ def add_scan_args(cmd, skip_fcn=None):  # using nargs='+' syntax for these rathe
     if action in ['simu', 'check-dl', 'merge-simu']:
         cmd = 'gcd-simulate' if action in ['simu', 'check-dl'] else 'python %s/scripts/%s.py' % (args.gcddir, 'combine-simu-files.py')
         if action in ['simu', 'check-dl']:
-            cmd += ' --outdir %s --tree-inference-method iqtree --debug 1' % odr  #  --debug 1
+            cmd += ' --outdir %s --debug 1' % odr  #  --debug 1
+            # --make-plots  (not passed at the moment; presumably a gcd-simulate option, add it to the string above to use it)
+            # --tree-inference-method iqtree  (no longer passed by default; add it back to the string above to use it)
             if args.test:
                 cmd += ' --test'
             cmd = add_scan_args(cmd, skip_fcn=lambda v: v not in args.scan_vars[action] or action=='check-dl' and v not in check_dl_args)
diff --git a/projects/replay-plot.py b/projects/replay-plot.py
index 66e8000b9..83069e567 100755
--- a/projects/replay-plot.py
+++ b/projects/replay-plot.py
@@ -29,8 +29,8 @@
 
 colors = {
     'gct-data' : '#cc0000',
-    'gct-data-d15' : '#006600',
-    'gct-data-d20' :  '#cc0000',
+    'gct-data-d15' : '#ea7979',
+    'gct-data-d20' : '#cc0000',
     'gct-data-w10' : '#2b65ec',
     'bst-data-d20' : '#006600',
     'iqt-data' : '#a821c7',
@@ -441,7 +441,7 @@ def compare_plots(htype, plotdir, hists, labels, hname, diff_vals):
 """
 parser = argparse.ArgumentParser(usage=ustr)
 parser.add_argument('--gcreplay-dir', default='/fh/fast/matsen_e/data/taraki-gctree-2021-10/gcreplay', help='dir with gctree results on gcreplay data from which we read seqs, affinity, mutation info, and trees)')
-parser.add_argument('--beast-dir', default='/fh/fast/matsen_e/data/taraki-gctree-2021-10/beast-processed-data/v3', help='dir with beast results on gcreplay data (same format as simulation)')
+parser.add_argument('--beast-dir', default='/fh/fast/matsen_e/data/taraki-gctree-2021-10/beast-processed-data/v4', help='dir with beast results on gcreplay data (same format as simulation)')
 parser.add_argument('--iqtree-data-dir', default='/fh/fast/matsen_e/data/taraki-gctree-2021-10/iqtree-processed-data/v1', help='dir with iqtree results on gcreplay data (from datascripts/taraki-gctree-2021-10/iqtree-run.py then projects/gcdyn/scripts/data-parse.py')
 parser.add_argument('--simu-like-dir', help='Dir from which to read simulation results, either from gcdyn or bcr-phylo (if the latter, set --bcr-phylo)')
 parser.add_argument('--outdir')
@@ -457,7 +457,7 @@ def compare_plots(htype, plotdir, hists, labels, hname, diff_vals):
 parser.add_argument("--random-seed", type=int, default=1)
 parser.add_argument("--default-naive-affinity", type=float, default=1./100, help="this is the default for bcr-phylo, so maybe be correct if we don\'t have an unmutated sequence")
 args = parser.parse_args()
-args.plot_labels = utils.get_arg_list(args.plot_labels, choices=['gct-data', 'gct-data-d15', 'gct-data-d20', 'gct-data-w10', 'bst-data-d20', 'iqt-data', 'iqt-data-d20', 'simu', 'simu-iqtree'])
+args.plot_labels = utils.get_arg_list(args.plot_labels, choices=['gct-data', 'gct-data-d15', 'gct-data-d20', 'gct-data-w10', 'bst-data-d15', 'bst-data-d20', 'iqt-data', 'iqt-data-d20', 'simu', 'simu-iqtree'])
 if len(args.plot_labels) > 3 and not args.write_legend_only_plots:
     print('  note; setting --write-legend-only-plots since --plot-labels is longer than 3')
     args.write_legend_only_plots = True
diff --git a/python/datautils.py b/python/datautils.py
index 8ecb82533..e85642fe5 100644
--- a/python/datautils.py
+++ b/python/datautils.py
@@ -29,7 +29,7 @@ def reverse_gcid(gcid):
 
 # ----------------------------------------------------------------------------------------
 def fix_btt_id(gcid):
-    mstr = utils.get_single_entry(re.findall('btt-PR-.-.', gcid))
+    mstr = utils.get_single_entry(re.findall(r'btt-PR-.-[0-9]+', gcid))  # second PR number may have more than one digit
     btstr, prstr, prn1, prn2  = mstr.split('-')
     assert btstr == 'btt' and prstr == 'PR'
     return gcid.replace(mstr, 'PR%d.%02d' % (int(prn1), int(prn2)))
diff --git a/python/plotting.py b/python/plotting.py
index 4f20ee8c0..c0349bb29 100644
--- a/python/plotting.py
+++ b/python/plotting.py
@@ -27,7 +27,7 @@
 from . import hutils
 from .clusterpath import ClusterPath
 
-#                 green    dark red  light blue  light red  sky blue  pink/purple   grey
+#                   green    dark red  light blue  light red  sky blue  pink/purple   grey
 default_colors = ['#006600', '#990012', '#2b65ec', '#cc0000', '#3399ff', '#a821c7', '#808080']
 default_linewidths = ['5', '3', '2', '2', '2']
 default_markersizes = ['20', '15', '8', '5', '5', '5']