From 5b270fc18f0fe7d031e38df1d948667f4c697991 Mon Sep 17 00:00:00 2001 From: Mike Sconzo Date: Sat, 30 Aug 2014 22:56:38 -0500 Subject: [PATCH] cleaned up some output --- data_hacking/min_hash/min_hash.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/data_hacking/min_hash/min_hash.py b/data_hacking/min_hash/min_hash.py index e52d187..fd6f608 100644 --- a/data_hacking/min_hash/min_hash.py +++ b/data_hacking/min_hash/min_hash.py @@ -22,7 +22,7 @@ class MinHash(): returned by getCandidatePairs(). ''' - def __init__(self, num_hashes=40, lsh_bands=10, lsh_rows=4, load_models=None, drop_duplicates=False, verbose=False): + def __init__(self, num_hashes=40, lsh_bands=10, lsh_rows=4, bin_limit=1000, load_models=None, drop_duplicates=False, verbose=False): ''' Init for MinHash ''' # Minhash signatures, hashing and banding parameters @@ -31,6 +31,7 @@ def __init__(self, num_hashes=40, lsh_bands=10, lsh_rows=4, load_models=None, dr self._lsh_bands = lsh_bands self._lsh_rows = lsh_rows self._hash_salt = [] + self._bin_limit = bin_limit for i in xrange(num_hashes): self._hash_salt.append(str(int(random.random()*100))) @@ -203,10 +204,9 @@ def _all_to_all_matches(self): for __key, candidate_list in subdict.iteritems(): # Sanity check - if (len(candidate_list) > 1000): - print 'Hashing function issue, key: (%s,%s) has %d items in it' % (_key, __key, len(candidate_list)) - print 'LIMITED IT to 1000' - candidate_list = candidate_list[:1000] + if (len(candidate_list) > self._bin_limit): + print 'Hashing function issue, key: (%s,%s) has %d items in it out of %s slots' % (_key, __key, len(candidate_list), self._bin_limit) + candidate_list = candidate_list[:self._bin_limit] for source in candidate_list: for target in candidate_list: @@ -270,4 +270,4 @@ def _test(): if __name__ == '__main__': - _test() \ No newline at end of file + _test()