decoder.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. """
  2. maxminddb.decoder
  3. ~~~~~~~~~~~~~~~~~
  4. This package contains code for decoding the MaxMind DB data section.
  5. """
  6. from __future__ import unicode_literals
  7. import struct
  8. from maxminddb.compat import byte_from_int, int_from_bytes
  9. from maxminddb.errors import InvalidDatabaseError
  10. class Decoder(object): # pylint: disable=too-few-public-methods
  11. """Decoder for the data section of the MaxMind DB"""
  12. def __init__(self, database_buffer, pointer_base=0, pointer_test=False):
  13. """Created a Decoder for a MaxMind DB
  14. Arguments:
  15. database_buffer -- an mmap'd MaxMind DB file.
  16. pointer_base -- the base number to use when decoding a pointer
  17. pointer_test -- used for internal unit testing of pointer code
  18. """
  19. self._pointer_test = pointer_test
  20. self._buffer = database_buffer
  21. self._pointer_base = pointer_base
  22. def _decode_array(self, size, offset):
  23. array = []
  24. for _ in range(size):
  25. (value, offset) = self.decode(offset)
  26. array.append(value)
  27. return array, offset
  28. def _decode_boolean(self, size, offset):
  29. return size != 0, offset
  30. def _decode_bytes(self, size, offset):
  31. new_offset = offset + size
  32. return self._buffer[offset:new_offset], new_offset
  33. # pylint: disable=no-self-argument
  34. # |-> I am open to better ways of doing this as long as it doesn't involve
  35. # lots of code duplication.
  36. def _decode_packed_type(type_code, type_size, pad=False):
  37. # pylint: disable=protected-access, missing-docstring
  38. def unpack_type(self, size, offset):
  39. if not pad:
  40. self._verify_size(size, type_size)
  41. new_offset = offset + type_size
  42. packed_bytes = self._buffer[offset:new_offset]
  43. if pad:
  44. packed_bytes = packed_bytes.rjust(type_size, b'\x00')
  45. (value,) = struct.unpack(type_code, packed_bytes)
  46. return value, new_offset
  47. return unpack_type
  48. def _decode_map(self, size, offset):
  49. container = {}
  50. for _ in range(size):
  51. (key, offset) = self.decode(offset)
  52. (value, offset) = self.decode(offset)
  53. container[key] = value
  54. return container, offset
  55. _pointer_value_offset = {
  56. 1: 0,
  57. 2: 2048,
  58. 3: 526336,
  59. 4: 0,
  60. }
  61. def _decode_pointer(self, size, offset):
  62. pointer_size = ((size >> 3) & 0x3) + 1
  63. new_offset = offset + pointer_size
  64. pointer_bytes = self._buffer[offset:new_offset]
  65. packed = pointer_bytes if pointer_size == 4 else struct.pack(
  66. b'!c', byte_from_int(size & 0x7)) + pointer_bytes
  67. unpacked = int_from_bytes(packed)
  68. pointer = unpacked + self._pointer_base + \
  69. self._pointer_value_offset[pointer_size]
  70. if self._pointer_test:
  71. return pointer, new_offset
  72. (value, _) = self.decode(pointer)
  73. return value, new_offset
  74. def _decode_uint(self, size, offset):
  75. new_offset = offset + size
  76. uint_bytes = self._buffer[offset:new_offset]
  77. return int_from_bytes(uint_bytes), new_offset
  78. def _decode_utf8_string(self, size, offset):
  79. new_offset = offset + size
  80. return self._buffer[offset:new_offset].decode('utf-8'), new_offset
  81. _type_decoder = {
  82. 1: _decode_pointer,
  83. 2: _decode_utf8_string,
  84. 3: _decode_packed_type(b'!d', 8), # double,
  85. 4: _decode_bytes,
  86. 5: _decode_uint, # uint16
  87. 6: _decode_uint, # uint32
  88. 7: _decode_map,
  89. 8: _decode_packed_type(b'!i', 4, pad=True), # int32
  90. 9: _decode_uint, # uint64
  91. 10: _decode_uint, # uint128
  92. 11: _decode_array,
  93. 14: _decode_boolean,
  94. 15: _decode_packed_type(b'!f', 4), # float,
  95. }
  96. def decode(self, offset):
  97. """Decode a section of the data section starting at offset
  98. Arguments:
  99. offset -- the location of the data structure to decode
  100. """
  101. new_offset = offset + 1
  102. (ctrl_byte,) = struct.unpack(b'!B', self._buffer[offset:new_offset])
  103. type_num = ctrl_byte >> 5
  104. # Extended type
  105. if not type_num:
  106. (type_num, new_offset) = self._read_extended(new_offset)
  107. if not type_num in self._type_decoder:
  108. raise InvalidDatabaseError('Unexpected type number ({type}) '
  109. 'encountered'.format(type=type_num))
  110. (size, new_offset) = self._size_from_ctrl_byte(
  111. ctrl_byte, new_offset, type_num)
  112. return self._type_decoder[type_num](self, size, new_offset)
  113. def _read_extended(self, offset):
  114. (next_byte,) = struct.unpack(b'!B', self._buffer[offset:offset + 1])
  115. type_num = next_byte + 7
  116. if type_num < 7:
  117. raise InvalidDatabaseError(
  118. 'Something went horribly wrong in the decoder. An '
  119. 'extended type resolved to a type number < 8 '
  120. '({type})'.format(type=type_num))
  121. return type_num, offset + 1
  122. def _verify_size(self, expected, actual):
  123. if expected != actual:
  124. raise InvalidDatabaseError(
  125. 'The MaxMind DB file\'s data section contains bad data '
  126. '(unknown data type or corrupt data)'
  127. )
  128. def _size_from_ctrl_byte(self, ctrl_byte, offset, type_num):
  129. size = ctrl_byte & 0x1f
  130. if type_num == 1:
  131. return size, offset
  132. bytes_to_read = 0 if size < 29 else size - 28
  133. new_offset = offset + bytes_to_read
  134. size_bytes = self._buffer[offset:new_offset]
  135. # Using unpack rather than int_from_bytes as it is about 200 lookups
  136. # per second faster here.
  137. if size == 29:
  138. size = 29 + struct.unpack(b'!B', size_bytes)[0]
  139. elif size == 30:
  140. size = 285 + struct.unpack(b'!H', size_bytes)[0]
  141. elif size > 30:
  142. size = struct.unpack(
  143. b'!I', size_bytes.rjust(4, b'\x00'))[0] + 65821
  144. return size, new_offset