/ dev / bloomfiltertest.py
bloomfiltertest.py
 1  """
 2  dev/bloomfiltertest.py
 3  ======================
 4  
 5  """
 6  
 7  import sqlite3
 8  from os import getenv, path
 9  from time import time
10  
11  from pybloom import BloomFilter as BloomFilter1  # pylint: disable=import-error
12  from pybloomfilter import BloomFilter as BloomFilter2  # pylint: disable=import-error
13  
14  # Ubuntu: apt-get install python-pybloomfiltermmap
15  
# Open the local PyBitmessage message store.  getenv("HOME") returns None
# when HOME is not in the environment, which would make path.join() raise
# TypeError; fall back to the platform's home-directory lookup in that case.
conn = sqlite3.connect(
    path.join(getenv("HOME", path.expanduser("~")),
              '.config/PyBitmessage/messages.dat'))

conn.text_factory = str
cur = conn.cursor()
rawlen = 0  # running total of len(hash) over every row fed to the filters

# COUNT() always produces exactly one row, so read it directly instead of
# looping over fetchall().
cur.execute('''SELECT COUNT(hash) FROM inventory''')
itemcount = cur.fetchone()[0]

# Capacity is the next multiple of 1000 strictly above the item count, so
# both filters have room for every inventory entry.
filtersize = 1000 * (itemcount // 1000 + 1)
errorrate = 1.0 / 1000.0

# Same capacity and target error rate for both implementations so that the
# size and timing figures are comparable.
bf1 = BloomFilter1(capacity=filtersize, error_rate=errorrate)
bf2 = BloomFilter2(capacity=filtersize, error_rate=errorrate)
32  
# Feed every inventory hash to both filters, timing each implementation's
# add() separately and accumulating the raw size of the hashes.
item = '''SELECT hash FROM inventory'''
cur.execute(item)  # the query has no placeholders, so no parameters are passed
bf1time = 0
bf2time = 0
# Iterate the cursor directly so rows are streamed instead of first being
# copied into one large list by fetchall().
for row in cur:
    rawlen += len(row[0])
    try:
        started = time()
        bf1.add(row[0])
        between = time()
        bf2.add(row[0])
        finished = time()
    except IndexError:
        # Best effort: an item that either filter rejects with IndexError is
        # skipped and contributes to neither timing total.
        # NOTE(review): presumably raised when a filter is at capacity --
        # confirm against the filter libraries.
        continue
    bf1time += between - started
    bf2time += finished - between

# All rows have been read; release the database handle.
conn.close()
49  
50  # f = open("/home/shurdeek/tmp/bloom.dat", "wb")
51  # sb1.tofile(f)
52  # f.close()
53  
54  
# Print the benchmark summary: item count, raw hash size, and for each
# filter its size, its size relative to the raw data, its configuration and
# the accumulated add() time.

# Size figure reported by the first filter's backing bitarray
# (NOTE(review): presumably bytes -- confirm against the bitarray docs).
bf1_bytes = bf1.bitarray.buffer_info()[1]
bf2_bytes = bf2.num_bits / 8

# An empty inventory leaves rawlen at 0; report the ratios as NaN rather
# than aborting the whole report with ZeroDivisionError.
if rawlen:
    bf1_ratio = 100.0 * bf1_bytes / rawlen
    bf2_ratio = 100.0 * bf2_bytes / rawlen
else:
    bf1_ratio = bf2_ratio = float('nan')

print("Item count: %i" % itemcount)
print("Raw length: %i" % rawlen)
print("Bloom filter 1 length: %i, reduction to: %.2f%%" %
      (bf1_bytes, bf1_ratio))
print("Bloom filter 1 capacity: %i and error rate: %.3f%%" %
      (bf1.capacity, 100.0 * bf1.error_rate))
print("Bloom filter 1 took %.2fs" % bf1time)
print("Bloom filter 2 length: %i, reduction to: %.3f%%" %
      (bf2_bytes, bf2_ratio))
print("Bloom filter 2 capacity: %i and error rate: %.3f%%" %
      (bf2.capacity, 100.0 * bf2.error_rate))
print("Bloom filter 2 took %.2fs" % bf2time)