/ src / common / dwarf / bytereader.h
bytereader.h
  1  // -*- mode: C++ -*-
  2  
  3  // Copyright 2010 Google LLC
  4  //
  5  // Redistribution and use in source and binary forms, with or without
  6  // modification, are permitted provided that the following conditions are
  7  // met:
  8  //
  9  //     * Redistributions of source code must retain the above copyright
 10  // notice, this list of conditions and the following disclaimer.
 11  //     * Redistributions in binary form must reproduce the above
 12  // copyright notice, this list of conditions and the following disclaimer
 13  // in the documentation and/or other materials provided with the
 14  // distribution.
 15  //     * Neither the name of Google LLC nor the names of its
 16  // contributors may be used to endorse or promote products derived from
 17  // this software without specific prior written permission.
 18  //
 19  // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20  // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21  // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 22  // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 23  // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 24  // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 25  // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 26  // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 27  // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 28  // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 29  // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 30  
 31  #ifndef COMMON_DWARF_BYTEREADER_H__
 32  #define COMMON_DWARF_BYTEREADER_H__
 33  
 34  #include <stdint.h>
 35  
 36  #include <string>
 37  
 38  #include "common/dwarf/types.h"
 39  #include "common/dwarf/dwarf2enums.h"
 40  
 41  namespace google_breakpad {
 42  
 43  // We can't use the obvious name of LITTLE_ENDIAN and BIG_ENDIAN
 44  // because it conflicts with a macro
 45  enum Endianness {
 46    ENDIANNESS_BIG,
 47    ENDIANNESS_LITTLE
 48  };
 49  
 50  // A ByteReader knows how to read single- and multi-byte values of
 51  // various endiannesses, sizes, and encodings, as used in DWARF
 52  // debugging information and Linux C++ exception handling data.
 53  class ByteReader {
 54   public:
 55    // Construct a ByteReader capable of reading one-, two-, four-, and
 56    // eight-byte values according to ENDIANNESS, absolute machine-sized
 57    // addresses, DWARF-style "initial length" values, signed and
 58    // unsigned LEB128 numbers, and Linux C++ exception handling data's
 59    // encoded pointers.
 60    explicit ByteReader(enum Endianness endianness);
 61    virtual ~ByteReader();
 62  
 63    // Read a single byte from BUFFER and return it as an unsigned 8 bit
 64    // number.
 65    uint8_t ReadOneByte(const uint8_t* buffer) const;
 66  
 67    // Read two bytes from BUFFER and return them as an unsigned 16 bit
 68    // number, using this ByteReader's endianness.
 69    uint16_t ReadTwoBytes(const uint8_t* buffer) const;
 70  
 71    // Read three bytes from BUFFER and return them as an unsigned 64 bit
 72    // number, using this ByteReader's endianness. DWARF 5 uses this encoding
 73    // for various index-related DW_FORMs.
 74    uint64_t ReadThreeBytes(const uint8_t* buffer) const;
 75  
 76    // Read four bytes from BUFFER and return them as an unsigned 32 bit
 77    // number, using this ByteReader's endianness. This function returns
 78    // a uint64_t so that it is compatible with ReadAddress and
 79    // ReadOffset. The number it returns will never be outside the range
 80    // of an unsigned 32 bit integer.
 81    uint64_t ReadFourBytes(const uint8_t* buffer) const;
 82  
 83    // Read eight bytes from BUFFER and return them as an unsigned 64
 84    // bit number, using this ByteReader's endianness.
 85    uint64_t ReadEightBytes(const uint8_t* buffer) const;
 86  
 87    // Read an unsigned LEB128 (Little Endian Base 128) number from
 88    // BUFFER and return it as an unsigned 64 bit integer. Set LEN to
 89    // the number of bytes read.
 90    //
 91    // The unsigned LEB128 representation of an integer N is a variable
 92    // number of bytes:
 93    //
 94    // - If N is between 0 and 0x7f, then its unsigned LEB128
 95    //   representation is a single byte whose value is N.
 96    //
 97    // - Otherwise, its unsigned LEB128 representation is (N & 0x7f) |
 98    //   0x80, followed by the unsigned LEB128 representation of N /
 99    //   128, rounded towards negative infinity.
100    //
101    // In other words, we break VALUE into groups of seven bits, put
102    // them in little-endian order, and then write them as eight-bit
103    // bytes with the high bit on all but the last.
104    uint64_t ReadUnsignedLEB128(const uint8_t* buffer, size_t* len) const;
105  
106    // Read a signed LEB128 number from BUFFER and return it as an
107    // signed 64 bit integer. Set LEN to the number of bytes read.
108    //
109    // The signed LEB128 representation of an integer N is a variable
110    // number of bytes:
111    //
112    // - If N is between -0x40 and 0x3f, then its signed LEB128
113    //   representation is a single byte whose value is N in two's
114    //   complement.
115    //
116    // - Otherwise, its signed LEB128 representation is (N & 0x7f) |
117    //   0x80, followed by the signed LEB128 representation of N / 128,
118    //   rounded towards negative infinity.
119    //
120    // In other words, we break VALUE into groups of seven bits, put
121    // them in little-endian order, and then write them as eight-bit
122    // bytes with the high bit on all but the last.
123    int64_t ReadSignedLEB128(const uint8_t* buffer, size_t* len) const;
124  
125    // Indicate that addresses on this architecture are SIZE bytes long. SIZE
126    // must be either 4 or 8. (DWARF allows addresses to be any number of
127    // bytes in length from 1 to 255, but we only support 32- and 64-bit
128    // addresses at the moment.) You must call this before using the
129    // ReadAddress member function.
130    //
131    // For data in a .debug_info section, or something that .debug_info
132    // refers to like line number or macro data, the compilation unit
133    // header's address_size field indicates the address size to use. Call
134    // frame information doesn't indicate its address size (a shortcoming of
135    // the spec); you must supply the appropriate size based on the
136    // architecture of the target machine.
137    void SetAddressSize(uint8_t size);
138  
139    // Return the current address size, in bytes. This is either 4,
140    // indicating 32-bit addresses, or 8, indicating 64-bit addresses.
141    uint8_t AddressSize() const { return address_size_; }
142  
143    // Read an address from BUFFER and return it as an unsigned 64 bit
144    // integer, respecting this ByteReader's endianness and address size. You
145    // must call SetAddressSize before calling this function.
146    uint64_t ReadAddress(const uint8_t* buffer) const;
147  
148    // DWARF actually defines two slightly different formats: 32-bit DWARF
149    // and 64-bit DWARF. This is *not* related to the size of registers or
150    // addresses on the target machine; it refers only to the size of section
151    // offsets and data lengths appearing in the DWARF data. One only needs
152    // 64-bit DWARF when the debugging data itself is larger than 4GiB.
153    // 32-bit DWARF can handle x86_64 or PPC64 code just fine, unless the
154    // debugging data itself is very large.
155    //
156    // DWARF information identifies itself as 32-bit or 64-bit DWARF: each
157    // compilation unit and call frame information entry begins with an
158    // "initial length" field, which, in addition to giving the length of the
159    // data, also indicates the size of section offsets and lengths appearing
160    // in that data. The ReadInitialLength member function, below, reads an
161    // initial length and sets the ByteReader's offset size as a side effect.
162    // Thus, in the normal process of reading DWARF data, the appropriate
163    // offset size is set automatically. So, you should only need to call
164    // SetOffsetSize if you are using the same ByteReader to jump from the
165    // midst of one block of DWARF data into another.
166  
167    // Read a DWARF "initial length" field from START, and return it as
168    // an unsigned 64 bit integer, respecting this ByteReader's
169    // endianness. Set *LEN to the length of the initial length in
170    // bytes, either four or twelve. As a side effect, set this
171    // ByteReader's offset size to either 4 (if we see a 32-bit DWARF
172    // initial length) or 8 (if we see a 64-bit DWARF initial length).
173    //
174    // A DWARF initial length is either:
175    //
176    // - a byte count stored as an unsigned 32-bit value less than
177    //   0xffffff00, indicating that the data whose length is being
178    //   measured uses the 32-bit DWARF format, or
179    //
180    // - The 32-bit value 0xffffffff, followed by a 64-bit byte count,
181    //   indicating that the data whose length is being measured uses
182    //   the 64-bit DWARF format.
183    uint64_t ReadInitialLength(const uint8_t* start, size_t* len);
184  
185    // Read an offset from BUFFER and return it as an unsigned 64 bit
186    // integer, respecting the ByteReader's endianness. In 32-bit DWARF, the
187    // offset is 4 bytes long; in 64-bit DWARF, the offset is eight bytes
188    // long. You must call ReadInitialLength or SetOffsetSize before calling
189    // this function; see the comments above for details.
190    uint64_t ReadOffset(const uint8_t* buffer) const;
191  
192    // Return the current offset size, in bytes.
193    // A return value of 4 indicates that we are reading 32-bit DWARF.
194    // A return value of 8 indicates that we are reading 64-bit DWARF.
195    uint8_t OffsetSize() const { return offset_size_; }
196  
197    // Indicate that section offsets and lengths are SIZE bytes long. SIZE
198    // must be either 4 (meaning 32-bit DWARF) or 8 (meaning 64-bit DWARF).
199    // Usually, you should not call this function yourself; instead, let a
200    // call to ReadInitialLength establish the data's offset size
201    // automatically.
202    void SetOffsetSize(uint8_t size);
203  
204    // The Linux C++ ABI uses a variant of DWARF call frame information
205    // for exception handling. This data is included in the program's
206    // address space as the ".eh_frame" section, and intepreted at
207    // runtime to walk the stack, find exception handlers, and run
208    // cleanup code. The format is mostly the same as DWARF CFI, with
209    // some adjustments made to provide the additional
210    // exception-handling data, and to make the data easier to work with
211    // in memory --- for example, to allow it to be placed in read-only
212    // memory even when describing position-independent code.
213    //
214    // In particular, exception handling data can select a number of
215    // different encodings for pointers that appear in the data, as
216    // described by the DwarfPointerEncoding enum. There are actually
217    // four axes(!) to the encoding:
218    //
219    // - The pointer size: pointers can be 2, 4, or 8 bytes long, or use
220    //   the DWARF LEB128 encoding.
221    //
222    // - The pointer's signedness: pointers can be signed or unsigned.
223    //
224    // - The pointer's base address: the data stored in the exception
225    //   handling data can be the actual address (that is, an absolute
226    //   pointer), or relative to one of a number of different base
227    //   addreses --- including that of the encoded pointer itself, for
228    //   a form of "pc-relative" addressing.
229    //
230    // - The pointer may be indirect: it may be the address where the
231    //   true pointer is stored. (This is used to refer to things via
232    //   global offset table entries, program linkage table entries, or
233    //   other tricks used in position-independent code.)
234    //
235    // There are also two options that fall outside that matrix
236    // altogether: the pointer may be omitted, or it may have padding to
237    // align it on an appropriate address boundary. (That last option
238    // may seem like it should be just another axis, but it is not.)
239  
240    // Indicate that the exception handling data is loaded starting at
241    // SECTION_BASE, and that the start of its buffer in our own memory
242    // is BUFFER_BASE. This allows us to find the address that a given
243    // byte in our buffer would have when loaded into the program the
244    // data describes. We need this to resolve DW_EH_PE_pcrel pointers.
245    void SetCFIDataBase(uint64_t section_base, const uint8_t* buffer_base);
246  
247    // Indicate that the base address of the program's ".text" section
248    // is TEXT_BASE. We need this to resolve DW_EH_PE_textrel pointers.
249    void SetTextBase(uint64_t text_base);
250  
251    // Indicate that the base address for DW_EH_PE_datarel pointers is
252    // DATA_BASE. The proper value depends on the ABI; it is usually the
253    // address of the global offset table, held in a designated register in
254    // position-independent code. You will need to look at the startup code
255    // for the target system to be sure. I tried; my eyes bled.
256    void SetDataBase(uint64_t data_base);
257  
258    // Indicate that the base address for the FDE we are processing is
259    // FUNCTION_BASE. This is the start address of DW_EH_PE_funcrel
260    // pointers. (This encoding does not seem to be used by the GNU
261    // toolchain.)
262    void SetFunctionBase(uint64_t function_base);
263  
264    // Indicate that we are no longer processing any FDE, so any use of
265    // a DW_EH_PE_funcrel encoding is an error.
266    void ClearFunctionBase();
267  
268    // Return true if ENCODING is a valid pointer encoding.
269    bool ValidEncoding(DwarfPointerEncoding encoding) const;
270  
271    // Return true if we have all the information we need to read a
272    // pointer that uses ENCODING. This checks that the appropriate
273    // SetFooBase function for ENCODING has been called.
274    bool UsableEncoding(DwarfPointerEncoding encoding) const;
275  
276    // Read an encoded pointer from BUFFER using ENCODING; return the
277    // absolute address it represents, and set *LEN to the pointer's
278    // length in bytes, including any padding for aligned pointers.
279    //
280    // This function calls 'abort' if ENCODING is invalid or refers to a
281    // base address this reader hasn't been given, so you should check
282    // with ValidEncoding and UsableEncoding first if you would rather
283    // die in a more helpful way.
284    uint64_t ReadEncodedPointer(const uint8_t* buffer,
285                              DwarfPointerEncoding encoding,
286                              size_t* len) const;
287  
288    Endianness GetEndianness() const;
289   private:
290  
291    // Function pointer type for our address and offset readers.
292    typedef uint64_t (ByteReader::*AddressReader)(const uint8_t*) const;
293  
294    // Read an offset from BUFFER and return it as an unsigned 64 bit
295    // integer.  DWARF2/3 define offsets as either 4 or 8 bytes,
296    // generally depending on the amount of DWARF2/3 info present.
297    // This function pointer gets set by SetOffsetSize.
298    AddressReader offset_reader_;
299  
300    // Read an address from BUFFER and return it as an unsigned 64 bit
301    // integer.  DWARF2/3 allow addresses to be any size from 0-255
302    // bytes currently.  Internally we support 4 and 8 byte addresses,
303    // and will CHECK on anything else.
304    // This function pointer gets set by SetAddressSize.
305    AddressReader address_reader_;
306  
307    Endianness endian_;
308    uint8_t address_size_;
309    uint8_t offset_size_;
310  
311    // Base addresses for Linux C++ exception handling data's encoded pointers.
312    bool have_section_base_, have_text_base_, have_data_base_;
313    bool have_function_base_;
314    uint64_t section_base_, text_base_, data_base_, function_base_;
315    const uint8_t* buffer_base_;
316  };
317  
318  }  // namespace google_breakpad
319  
320  #endif  // COMMON_DWARF_BYTEREADER_H__