Skip to content

Commit

Permalink
cleaned up some output
Browse files (browse the repository at this point in the history)
  • Loading branch information
sooshie committed Aug 31, 2014
1 parent 2d22f4a commit 5b270fc
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions data_hacking/min_hash/min_hash.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -22,7 +22,7 @@ class MinHash():
returned by getCandidatePairs().
'''

def __init__(self, num_hashes=40, lsh_bands=10, lsh_rows=4, load_models=None, drop_duplicates=False, verbose=False):
def __init__(self, num_hashes=40, lsh_bands=10, lsh_rows=4, bin_limit=1000, load_models=None, drop_duplicates=False, verbose=False):
''' Init for MinHash '''

# Minhash signatures, hashing and banding parameters
Expand All @@ -31,6 +31,7 @@ def __init__(self, num_hashes=40, lsh_bands=10, lsh_rows=4, load_models=None, dr
self._lsh_bands = lsh_bands
self._lsh_rows = lsh_rows
self._hash_salt = []
self._bin_limit = bin_limit
for i in xrange(num_hashes):
self._hash_salt.append(str(int(random.random()*100)))

Expand Down Expand Up @@ -203,10 +204,9 @@ def _all_to_all_matches(self):
for __key, candidate_list in subdict.iteritems():

# Sanity check
if (len(candidate_list) > 1000):
print 'Hashing function issue, key: (%s,%s) has %d items in it' % (_key, __key, len(candidate_list))
print 'LIMITED IT to 1000'
candidate_list = candidate_list[:1000]
if (len(candidate_list) > self._bin_limit):
print 'Hashing function issue, key: (%s,%s) has %d items in it out of %s slots' % (_key, __key, len(candidate_list), self._bin_limit)
candidate_list = candidate_list[:self._bin_limit]

for source in candidate_list:
for target in candidate_list:
Expand Down Expand Up @@ -270,4 +270,4 @@ def _test():


if __name__ == '__main__':
_test()
_test()

0 comments on commit 5b270fc

Please sign in to comment.