Skip to content

Commit

Permalink
Merge branch '2to3' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
psathyrella committed Jan 30, 2024
2 parents ef0e797 + 2f3f1c8 commit 049240d
Show file tree
Hide file tree
Showing 3,923 changed files with 816,147 additions and 802,624 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,4 @@ packages/RPANDA/lib/
/packages/soNNia/
/projects/gcdyn/
/projects/gcreplay/
/.coverage
29 changes: 16 additions & 13 deletions bin/add-chimeras.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
#!/usr/bin/env python
#!/usr/bin/env python3
from __future__ import absolute_import, division, unicode_literals
from __future__ import print_function
import argparse
import collections
import numpy
import random
import sys
import os
import csv
from io import open

partis_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '')
if not os.path.exists(partis_dir):
print 'WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir
sys.path.insert(1, partis_dir + '/python')
import utils
import seqfileopener
print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir)
sys.path.insert(1, partis_dir) # + '/python')
import python.utils as utils
import python.seqfileopener as seqfileopener

parser = argparse.ArgumentParser()
parser.add_argument('infile')
Expand All @@ -24,32 +27,32 @@

input_info, _, _ = seqfileopener.read_sequence_file(args.infile, is_data=False)
if len(input_info) < 50:
print '%s making chimeras with only %d sequences, and since we choose from among the existing sequence for templates this won\'t be very effective' % (utils.color('yellow', 'warning'), len(input_info))
print('%s making chimeras with only %d sequences, and since we choose from among the existing sequence for templates this won\'t be very effective' % (utils.color('yellow', 'warning'), len(input_info)))

n_chimeric = 0
outfo = collections.OrderedDict()
for uid, seqfo in input_info.items():
if args.debug:
print uid
print(uid)

if numpy.random.uniform(0, 1) > args.chimera_freq: # no chimeras for this sequence
if args.debug:
print ' non-chimeric'
print(' non-chimeric')
continue

break_point = random.randint(args.min_chunk_len, len(seqfo['seqs'][0]) - args.min_chunk_len)
switch_uid = numpy.random.choice(input_info.keys())
# numpy.random.choice() needs a 1-D array-like (or an int): passing the mapping itself (or a py3 dict view) gets converted to a 0-d object array and raises ValueError, so materialize the uids as a list first
switch_uid = numpy.random.choice(list(input_info.keys()))
switch_seq = input_info[switch_uid]['seqs'][0][ : break_point]

if args.debug:
print ' switching to %s at %d:' % (switch_uid, break_point)
print ' %s' % switch_seq
print ' %s%s' % (' ' * len(switch_seq), seqfo['seqs'][0][break_point : ])
print(' switching to %s at %d:' % (switch_uid, break_point))
print(' %s' % switch_seq)
print(' %s%s' % (' ' * len(switch_seq), seqfo['seqs'][0][break_point : ]))

outfo[uid] = switch_seq + seqfo['seqs'][0][break_point : ]
n_chimeric += 1

print 'writing %d / %d chimeric sequences to %s' % (n_chimeric, len(input_info), args.outfile)
print('writing %d / %d chimeric sequences to %s' % (n_chimeric, len(input_info), args.outfile))
with open(args.outfile, 'w') as outfile:
for uid, seq in outfo.items():
outfile.write('>%s\n%s\n' % (uid, seq))
26 changes: 14 additions & 12 deletions bin/add-seqs-to-outputs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/usr/bin/env python
#!/usr/bin/env python3
from __future__ import absolute_import, division, unicode_literals
from __future__ import print_function
import csv
import os
import sys
Expand All @@ -10,11 +12,11 @@

# if you move this script, you'll need to change this method of getting the imports
partis_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '')
sys.path.insert(1, partis_dir + '/python')
sys.path.insert(1, partis_dir) # + '/python')

import utils
import glutils
from clusterpath import ClusterPath
import python.utils as utils
import python.glutils as glutils
from python.clusterpath import ClusterPath

dstr = """
Add seqs from the fasta file --new-seq-file to an annotation from --partis-output-file.
Expand All @@ -34,18 +36,18 @@
args = parser.parse_args()

new_seqfos = utils.read_fastx(args.new_seq_file, sanitize_seqs=True)
print ' read %d seqs from %s' % (len(new_seqfos), args.new_seq_file)
print(' read %d seqs from %s' % (len(new_seqfos), args.new_seq_file))

glfo = None
if utils.getsuffix(args.partis_output_file) == '.csv':
print ' reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir
print(' reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir)
glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)

glfo, annotation_list, cpath = utils.read_output(args.partis_output_file, glfo=glfo, locus=args.locus)
if args.partition_index is not None:
print ' using non-best partition index %d (best is %d)' % (args.partition_index, cpath.i_best)
print(' using non-best partition index %d (best is %d)' % (args.partition_index, cpath.i_best))
partition = cpath.partitions[cpath.i_best if args.partition_index is None else args.partition_index]
print ' read partition with %d clusters from %s' % (len(partition), args.partis_output_file)
print(' read partition with %d clusters from %s' % (len(partition), args.partis_output_file))

new_uids = set(sfo['name'] for sfo in new_seqfos)
clusters_with_overlap = []
Expand All @@ -60,16 +62,16 @@
# raise Exception('too many clusters %d in the partition overlaps with sequences from the fasta file' % len(clusters_with_overlap))
clusters_with_overlap = sorted(clusters_with_overlap, key=lambda p: len(p[1]), reverse=True)
ostrs = ['%d %d'%(len(c), len(o)) for c, o in clusters_with_overlap]
print ' %s more than one cluster overlaps with sequences from fasta file, just taking first one (size overlap): %s, %s' % (utils.color('yellow', 'warning'), utils.color('red', ostrs[0]), ', '.join(ostrs[1:]))
print(' %s more than one cluster overlaps with sequences from fasta file, just taking first one (size overlap): %s, %s' % (utils.color('yellow', 'warning'), utils.color('red', ostrs[0]), ', '.join(ostrs[1:])))
old_cluster = clusters_with_overlap[0][0]

print ' adding %d fasta sequences to cluster of size %d (%d fasta sequences were already in cluster)' % (len(new_uids - set(old_cluster)), len(old_cluster), len(new_uids & set(old_cluster)))
print(' adding %d fasta sequences to cluster of size %d (%d fasta sequences were already in cluster)' % (len(new_uids - set(old_cluster)), len(old_cluster), len(new_uids & set(old_cluster))))
sfos_to_add = [sfo for sfo in new_seqfos if sfo['name'] not in old_cluster]
annotation_dict = utils.get_annotation_dict(annotation_list)
annotation = annotation_dict[':'.join(old_cluster)]

if args.n_test_subset_seqs is not None:
print ' taking only first %d seqs from fasta and annotation' % args.n_test_subset_seqs
print(' taking only first %d seqs from fasta and annotation' % args.n_test_subset_seqs)
utils.restrict_to_iseqs(annotation, list(range(args.n_test_subset_seqs)), glfo)
sfos_to_add = sfos_to_add[:args.n_test_subset_seqs]
utils.add_seqs_to_line(annotation, sfos_to_add, glfo, debug=args.debug)
Expand Down
Loading

0 comments on commit 049240d

Please sign in to comment.