"""Block-structured, file-backed range index and its unit tests.

Reconstructed from a patch that added ``int2intmaplike.py`` (the
``Int2IntMapLike`` class) and ``testint2intmaplike.py`` (the
``Int2IntMapLikeTest`` test case).  The two sections below correspond to those
two files.
"""

import math
import os
import tempfile
import unittest
from logging import exception  # unused here; kept from the original patch
from os.path import join
from typing import Optional


# --------------------------------------------------------------------------
# int2intmaplike.py
# --------------------------------------------------------------------------

class Int2IntMapLike:
    """
    A file used to map byte numbers of the filter view to byte numbers in the
    original file.

    Each line holds three integers separated by commas: ``start,length,val``.
    A record answers lookups for every key with
    ``start <= key < start + length``.
    The first column must be added in ascending order (and ranges must not
    overlap); this is what allows ``find`` to do a binary search.
    The file uses fixed-size blocks (``blocksize`` bytes, 4 KiB by default).
    Fill bytes (newlines) are inserted whenever a record would otherwise cross
    a block boundary, so every block can be parsed on its own.

    The backing file is opened in binary mode because all offsets are byte
    offsets: text mode may translate ``"\\n"`` on write and its ``tell()``
    values are opaque, either of which would break the block arithmetic.

    Instances can be used as context managers; the file is closed on exit.
    """
    blocksize = 4096

    def __init__(self, file):
        """Create (or truncate) the backing file at path *file*."""
        self._file = file
        self._handle = open(file, "w+b")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the backing file. Calling it more than once is harmless."""
        if not self._handle.closed:
            self._handle.close()

    def reset(self):
        """Discard all records so the map is empty again."""
        # Rewind before truncating: truncate() alone leaves the file position
        # at the old end, and the next write would then land there and leave a
        # run of NUL bytes in front of the first record.
        self._handle.seek(0)
        self._handle.truncate()

    def add(self, start: int, length: int, val: int):
        """Append the record ``start,length,val``.

        Records must be added with ascending, non-overlapping ranges.

        Raises:
            ValueError: if the encoded record is longer than ``blocksize`` and
                therefore cannot be stored without straddling a block boundary.
        """
        record = b"%d,%d,%d\n" % (start, length, val)
        blocksize = self.blocksize
        if len(record) > blocksize:
            raise ValueError(
                "record of %d bytes does not fit into a block of %d bytes"
                % (len(record), blocksize)
            )

        # Always append at the real end of the file: a preceding find() leaves
        # the file position somewhere in the middle.
        end = self._handle.seek(0, os.SEEK_END)
        used = end % blocksize
        if used + len(record) > blocksize:
            # Pad with newlines up to the next block boundary; empty lines are
            # skipped when the block is parsed.
            self._handle.write(b"\n" * (blocksize - used))
        self._handle.write(record)

    def find(self, key: int) -> Optional[int]:
        """Return the value of the record whose range contains *key*.

        Performs a binary search over the blocks of the file and a linear scan
        inside the selected block.

        Returns:
            The ``val`` of the matching record, or ``None`` if no record
            covers *key* (including when the map is empty).
        """
        # Seeking to the end yields the logical size including data that is
        # still sitting in the write buffer (os.stat would miss it).
        size = self._handle.seek(0, os.SEEK_END)
        if size == 0:
            return None

        blocksize = self.blocksize
        low = 0
        high = math.ceil(size / blocksize) - 1
        while low <= high:
            mid = (low + high) // 2
            self._handle.seek(mid * blocksize)
            block = self._handle.read(blocksize)

            key_is_before_block = False
            passed_a_record = False
            for raw in block.split(b"\n"):
                if not raw:
                    continue  # fill bytes / trailing newline
                fields = raw.split(b",")
                start = int(fields[0])
                length = int(fields[1])
                val = int(fields[2])

                if key < start:
                    if passed_a_record:
                        # key lies in the gap between two records of this
                        # block, so no record can contain it.
                        return None
                    key_is_before_block = True
                    break
                if key - start < length:
                    return val
                passed_a_record = True

            if key_is_before_block:
                high = mid - 1
            else:
                low = mid + 1
        return None


# --------------------------------------------------------------------------
# testint2intmaplike.py
#
# In the original patch this section lived in its own file and started with
# ``from int2intmaplike import Int2IntMapLike``; here the class is defined
# above, so no import is needed.
# --------------------------------------------------------------------------

class Int2IntMapLikeTest(unittest.TestCase):

    def setUp(self):
        self.test_dir = tempfile.TemporaryDirectory()
        self.tmpfile = join(self.test_dir.name, "my.log")
        self.map = Int2IntMapLike(self.tmpfile)

    def tearDown(self):
        self.map.close()
        self.test_dir.cleanup()

    def _fill(self, first, last):
        """Add the records ``i*10,5,i`` for ``first <= i <= last``."""
        for i in range(first, last + 1):
            self.map.add(i * 10, 5, i)

    def test_fill_map(self):
        index = self.map
        index.blocksize = 64

        # fill map with
        # 10,5,1
        # 20,5,2
        # 30,5,3
        # ...
        self._fill(1, 19)

        self.assertEqual(2, index.find(20))
        self.assertEqual(7, index.find(71))
        self.assertEqual(13, index.find(134))
        self.assertEqual(19, index.find(194))

        # values that are not in the map
        self.assertEqual(None, index.find(0))
        self.assertEqual(None, index.find(9))
        self.assertEqual(None, index.find(15))
        self.assertEqual(None, index.find(16))
        self.assertEqual(None, index.find(107))  # a value in the second block
        self.assertEqual(None, index.find(188))  # a value in the third block

    def test_empty_map(self):
        self.assertIsNone(self.map.find(0))

    def test_every_block_is_reachable(self):
        index = self.map
        index.blocksize = 64
        # With 64-byte blocks, 12 records occupy two blocks and 26 records
        # occupy four; 99 records span many more.
        for count in (12, 26, 99):
            with self.subTest(count=count):
                index.reset()
                self._fill(1, count)
                for i in range(1, count + 1):
                    self.assertEqual(i, index.find(i * 10))
                    self.assertEqual(i, index.find(i * 10 + 4))
                    self.assertIsNone(index.find(i * 10 + 5))
                self.assertIsNone(index.find(0))
                self.assertIsNone(index.find((count + 1) * 10))

    def test_add_after_find_appends(self):
        index = self.map
        index.blocksize = 64
        self._fill(1, 12)
        self.assertEqual(3, index.find(30))  # moves the file position
        self._fill(13, 20)
        for i in range(1, 21):
            self.assertEqual(i, index.find(i * 10 + 1))

    def test_reset_starts_from_scratch(self):
        index = self.map
        index.blocksize = 64
        self._fill(1, 19)
        index.reset()
        self.assertIsNone(index.find(20))
        index.add(5, 3, 42)
        self.assertEqual(42, index.find(7))
        self.assertIsNone(index.find(8))

    def test_record_larger_than_block_is_rejected(self):
        self.map.blocksize = 4
        with self.assertRaises(ValueError):
            self.map.add(10, 5, 1)


if __name__ == '__main__':
    unittest.main()