/ src / test / fuzz / FuzzedDataProvider.h
FuzzedDataProvider.h
  1  //===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===//
  2  //
  3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4  // See https://llvm.org/LICENSE.txt for license information.
  5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6  //
  7  //===----------------------------------------------------------------------===//
  8  // A single header library providing an utility class to break up an array of
  9  // bytes. Whenever run on the same input, provides the same output, as long as
 10  // its methods are called in the same order, with the same arguments.
 11  //===----------------------------------------------------------------------===//
 12  
 13  #ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
 14  #define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
 15  
 16  #include <algorithm>
 17  #include <array>
 18  #include <climits>
 19  #include <cstddef>
 20  #include <cstdint>
 21  #include <cstdlib>
 22  #include <cstring>
 23  #include <initializer_list>
 24  #include <limits>
 25  #include <string>
 26  #include <type_traits>
 27  #include <utility>
 28  #include <vector>
 29  
 30  // In addition to the comments below, the API is also briefly documented at
 31  // https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider
 32  class FuzzedDataProvider {
 33   public:
 34    // |data| is an array of length |size| that the FuzzedDataProvider wraps to
 35    // provide more granular access. |data| must outlive the FuzzedDataProvider.
 36    FuzzedDataProvider(const uint8_t *data, size_t size)
 37        : data_ptr_(data), remaining_bytes_(size) {}
 38    ~FuzzedDataProvider() = default;
 39  
 40    // See the implementation below (after the class definition) for more verbose
 41    // comments for each of the methods.
 42  
 43    // Methods returning std::vector of bytes. These are the most popular choice
 44    // when splitting fuzzing input into pieces, as every piece is put into a
 45    // separate buffer (i.e. ASan would catch any under-/overflow) and the memory
 46    // will be released automatically.
 47    template <typename T> std::vector<T> ConsumeBytes(size_t num_bytes);
 48    template <typename T>
 49    std::vector<T> ConsumeBytesWithTerminator(size_t num_bytes, T terminator = 0);
 50    template <typename T> std::vector<T> ConsumeRemainingBytes();
 51  
 52    // Methods returning strings. Use only when you need a std::string or a null
 53    // terminated C-string. Otherwise, prefer the methods returning std::vector.
 54    std::string ConsumeBytesAsString(size_t num_bytes);
 55    std::string ConsumeRandomLengthString(size_t max_length);
 56    std::string ConsumeRandomLengthString();
 57    std::string ConsumeRemainingBytesAsString();
 58  
 59    // Methods returning integer values.
 60    template <typename T> T ConsumeIntegral();
 61    template <typename T> T ConsumeIntegralInRange(T min, T max);
 62  
 63    // Methods returning floating point values.
 64    template <typename T> T ConsumeFloatingPoint();
 65    template <typename T> T ConsumeFloatingPointInRange(T min, T max);
 66  
 67    // 0 <= return value <= 1.
 68    template <typename T> T ConsumeProbability();
 69  
 70    bool ConsumeBool();
 71  
 72    // Returns a value chosen from the given enum.
 73    template <typename T> T ConsumeEnum();
 74  
 75    // Returns a value from the given array.
 76    template <typename T, size_t size> T PickValueInArray(const T (&array)[size]);
 77    template <typename T, size_t size>
 78    T PickValueInArray(const std::array<T, size> &array);
 79    template <typename T> T PickValueInArray(std::initializer_list<const T> list);
 80  
 81    // Writes data to the given destination and returns number of bytes written.
 82    size_t ConsumeData(void *destination, size_t num_bytes);
 83  
 84    // Reports the remaining bytes available for fuzzed input.
 85    size_t remaining_bytes() { return remaining_bytes_; }
 86  
 87   private:
 88    FuzzedDataProvider(const FuzzedDataProvider &) = delete;
 89    FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete;
 90  
 91    void CopyAndAdvance(void *destination, size_t num_bytes);
 92  
 93    void Advance(size_t num_bytes);
 94  
 95    template <typename T>
 96    std::vector<T> ConsumeBytes(size_t size, size_t num_bytes);
 97  
 98    template <typename TS, typename TU> TS ConvertUnsignedToSigned(TU value);
 99  
100    const uint8_t *data_ptr_;
101    size_t remaining_bytes_;
102  };
103  
104  // Returns a std::vector containing |num_bytes| of input data. If fewer than
105  // |num_bytes| of data remain, returns a shorter std::vector containing all
106  // of the data that's left. Can be used with any byte sized type, such as
107  // char, unsigned char, uint8_t, etc.
108  template <typename T>
109  std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t num_bytes) {
110    num_bytes = std::min(num_bytes, remaining_bytes_);
111    return ConsumeBytes<T>(num_bytes, num_bytes);
112  }
113  
114  // Similar to |ConsumeBytes|, but also appends the terminator value at the end
115  // of the resulting vector. Useful, when a mutable null-terminated C-string is
116  // needed, for example. But that is a rare case. Better avoid it, if possible,
117  // and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods.
118  template <typename T>
119  std::vector<T> FuzzedDataProvider::ConsumeBytesWithTerminator(size_t num_bytes,
120                                                                T terminator) {
121    num_bytes = std::min(num_bytes, remaining_bytes_);
122    std::vector<T> result = ConsumeBytes<T>(num_bytes + 1, num_bytes);
123    result.back() = terminator;
124    return result;
125  }
126  
127  // Returns a std::vector containing all remaining bytes of the input data.
128  template <typename T>
129  std::vector<T> FuzzedDataProvider::ConsumeRemainingBytes() {
130    return ConsumeBytes<T>(remaining_bytes_);
131  }
132  
133  // Returns a std::string containing |num_bytes| of input data. Using this and
134  // |.c_str()| on the resulting string is the best way to get an immutable
135  // null-terminated C string. If fewer than |num_bytes| of data remain, returns
136  // a shorter std::string containing all of the data that's left.
137  inline std::string FuzzedDataProvider::ConsumeBytesAsString(size_t num_bytes) {
138    static_assert(sizeof(std::string::value_type) == sizeof(uint8_t),
139                  "ConsumeBytesAsString cannot convert the data to a string.");
140  
141    num_bytes = std::min(num_bytes, remaining_bytes_);
142    std::string result(
143        reinterpret_cast<const std::string::value_type *>(data_ptr_), num_bytes);
144    Advance(num_bytes);
145    return result;
146  }
147  
148  // Returns a std::string of length from 0 to |max_length|. When it runs out of
149  // input data, returns what remains of the input. Designed to be more stable
150  // with respect to a fuzzer inserting characters than just picking a random
151  // length and then consuming that many bytes with |ConsumeBytes|.
152  inline std::string
153  FuzzedDataProvider::ConsumeRandomLengthString(size_t max_length) {
154    // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\"
155    // followed by anything else to the end of the string. As a result of this
156    // logic, a fuzzer can insert characters into the string, and the string
157    // will be lengthened to include those new characters, resulting in a more
158    // stable fuzzer than picking the length of a string independently from
159    // picking its contents.
160    std::string result;
161  
162    // Reserve the anticipated capacity to prevent several reallocations.
163    result.reserve(std::min(max_length, remaining_bytes_));
164    for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) {
165      char next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
166      Advance(1);
167      if (next == '\\' && remaining_bytes_ != 0) {
168        next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
169        Advance(1);
170        if (next != '\\')
171          break;
172      }
173      result += next;
174    }
175  
176    result.shrink_to_fit();
177    return result;
178  }
179  
180  // Returns a std::string of length from 0 to |remaining_bytes_|.
181  inline std::string FuzzedDataProvider::ConsumeRandomLengthString() {
182    return ConsumeRandomLengthString(remaining_bytes_);
183  }
184  
185  // Returns a std::string containing all remaining bytes of the input data.
186  // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string
187  // object.
188  inline std::string FuzzedDataProvider::ConsumeRemainingBytesAsString() {
189    return ConsumeBytesAsString(remaining_bytes_);
190  }
191  
192  // Returns a number in the range [Type's min, Type's max]. The value might
193  // not be uniformly distributed in the given range. If there's no input data
194  // left, always returns |min|.
195  template <typename T> T FuzzedDataProvider::ConsumeIntegral() {
196    return ConsumeIntegralInRange(std::numeric_limits<T>::min(),
197                                  std::numeric_limits<T>::max());
198  }
199  
200  // Returns a number in the range [min, max] by consuming bytes from the
201  // input data. The value might not be uniformly distributed in the given
202  // range. If there's no input data left, always returns |min|. |min| must
203  // be less than or equal to |max|.
204  template <typename T>
205  T FuzzedDataProvider::ConsumeIntegralInRange(T min, T max) {
206    static_assert(std::is_integral_v<T>, "An integral type is required.");
207    static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type.");
208  
209    if (min > max)
210      abort();
211  
212    // Use the biggest type possible to hold the range and the result.
213    uint64_t range = static_cast<uint64_t>(max) - static_cast<uint64_t>(min);
214    uint64_t result = 0;
215    size_t offset = 0;
216  
217    while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 &&
218           remaining_bytes_ != 0) {
219      // Pull bytes off the end of the seed data. Experimentally, this seems to
220      // allow the fuzzer to more easily explore the input space. This makes
221      // sense, since it works by modifying inputs that caused new code to run,
222      // and this data is often used to encode length of data read by
223      // |ConsumeBytes|. Separating out read lengths makes it easier modify the
224      // contents of the data that is actually read.
225      --remaining_bytes_;
226      result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_];
227      offset += CHAR_BIT;
228    }
229  
230    // Avoid division by 0, in case |range + 1| results in overflow.
231    if (range != std::numeric_limits<decltype(range)>::max())
232      result = result % (range + 1);
233  
234    return static_cast<T>(static_cast<uint64_t>(min) + result);
235  }
236  
237  // Returns a floating point value in the range [Type's lowest, Type's max] by
238  // consuming bytes from the input data. If there's no input data left, always
239  // returns approximately 0.
240  template <typename T> T FuzzedDataProvider::ConsumeFloatingPoint() {
241    return ConsumeFloatingPointInRange<T>(std::numeric_limits<T>::lowest(),
242                                          std::numeric_limits<T>::max());
243  }
244  
245  // Returns a floating point value in the given range by consuming bytes from
246  // the input data. If there's no input data left, returns |min|. Note that
247  // |min| must be less than or equal to |max|.
248  template <typename T>
249  T FuzzedDataProvider::ConsumeFloatingPointInRange(T min, T max) {
250    if (min > max)
251      abort();
252  
253    T range = .0;
254    T result = min;
255    constexpr T zero(.0);
256    if (max > zero && min < zero && max > min + std::numeric_limits<T>::max()) {
257      // The diff |max - min| would overflow the given floating point type. Use
258      // the half of the diff as the range and consume a bool to decide whether
259      // the result is in the first of the second part of the diff.
260      range = (max / 2.0) - (min / 2.0);
261      if (ConsumeBool()) {
262        result += range;
263      }
264    } else {
265      range = max - min;
266    }
267  
268    return result + range * ConsumeProbability<T>();
269  }
270  
271  // Returns a floating point number in the range [0.0, 1.0]. If there's no
272  // input data left, always returns 0.
273  template <typename T> T FuzzedDataProvider::ConsumeProbability() {
274    static_assert(std::is_floating_point_v<T>,
275                  "A floating point type is required.");
276  
277    // Use different integral types for different floating point types in order
278    // to provide better density of the resulting values.
279    using IntegralType =
280        typename std::conditional_t<(sizeof(T) <= sizeof(uint32_t)), uint32_t,
281                                    uint64_t>;
282  
283    T result = static_cast<T>(ConsumeIntegral<IntegralType>());
284    result /= static_cast<T>(std::numeric_limits<IntegralType>::max());
285    return result;
286  }
287  
288  // Reads one byte and returns a bool, or false when no data remains.
289  inline bool FuzzedDataProvider::ConsumeBool() {
290    return 1 & ConsumeIntegral<uint8_t>();
291  }
292  
293  // Returns an enum value. The enum must start at 0 and be contiguous. It must
294  // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as:
295  // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue };
296  template <typename T> T FuzzedDataProvider::ConsumeEnum() {
297    static_assert(std::is_enum_v<T>, "|T| must be an enum type.");
298    return static_cast<T>(
299        ConsumeIntegralInRange<uint32_t>(0, static_cast<uint32_t>(T::kMaxValue)));
300  }
301  
302  // Returns a copy of the value selected from the given fixed-size |array|.
303  template <typename T, size_t size>
304  T FuzzedDataProvider::PickValueInArray(const T (&array)[size]) {
305    static_assert(size > 0, "The array must be non empty.");
306    return array[ConsumeIntegralInRange<size_t>(0, size - 1)];
307  }
308  
309  template <typename T, size_t size>
310  T FuzzedDataProvider::PickValueInArray(const std::array<T, size> &array) {
311    static_assert(size > 0, "The array must be non empty.");
312    return array[ConsumeIntegralInRange<size_t>(0, size - 1)];
313  }
314  
315  template <typename T>
316  T FuzzedDataProvider::PickValueInArray(std::initializer_list<const T> list) {
317    if (!list.size())
318      abort();
319  
320    return *(list.begin() + ConsumeIntegralInRange<size_t>(0, list.size() - 1));
321  }
322  
323  // Writes |num_bytes| of input data to the given destination pointer. If there
324  // is not enough data left, writes all remaining bytes. Return value is the
325  // number of bytes written.
326  // In general, it's better to avoid using this function, but it may be useful
327  // in cases when it's necessary to fill a certain buffer or object with
328  // fuzzing data.
329  inline size_t FuzzedDataProvider::ConsumeData(void *destination,
330                                                size_t num_bytes) {
331    num_bytes = std::min(num_bytes, remaining_bytes_);
332    CopyAndAdvance(destination, num_bytes);
333    return num_bytes;
334  }
335  
336  // Private methods.
337  inline void FuzzedDataProvider::CopyAndAdvance(void *destination,
338                                                 size_t num_bytes) {
339    std::memcpy(destination, data_ptr_, num_bytes);
340    Advance(num_bytes);
341  }
342  
343  inline void FuzzedDataProvider::Advance(size_t num_bytes) {
344    if (num_bytes > remaining_bytes_)
345      abort();
346  
347    data_ptr_ += num_bytes;
348    remaining_bytes_ -= num_bytes;
349  }
350  
351  template <typename T>
352  std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t size, size_t num_bytes) {
353    static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type.");
354  
355    // The point of using the size-based constructor below is to increase the
356    // odds of having a vector object with capacity being equal to the length.
357    // That part is always implementation specific, but at least both libc++ and
358    // libstdc++ allocate the requested number of bytes in that constructor,
359    // which seems to be a natural choice for other implementations as well.
360    // To increase the odds even more, we also call |shrink_to_fit| below.
361    std::vector<T> result(size);
362    if (size == 0) {
363      if (num_bytes != 0)
364        abort();
365      return result;
366    }
367  
368    CopyAndAdvance(result.data(), num_bytes);
369  
370    // Even though |shrink_to_fit| is also implementation specific, we expect it
371    // to provide an additional assurance in case vector's constructor allocated
372    // a buffer which is larger than the actual amount of data we put inside it.
373    result.shrink_to_fit();
374    return result;
375  }
376  
377  template <typename TS, typename TU>
378  TS FuzzedDataProvider::ConvertUnsignedToSigned(TU value) {
379    static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types.");
380    static_assert(!std::numeric_limits<TU>::is_signed,
381                  "Source type must be unsigned.");
382  
383    if constexpr (std::numeric_limits<TS>::is_modulo)
384      return static_cast<TS>(value);
385  
386    // Avoid using implementation-defined unsigned to signed conversions.
387    // To learn more, see https://stackoverflow.com/questions/13150449.
388    constexpr auto TS_max = static_cast<TU>(std::numeric_limits<TS>::max());
389    if (value <= TS_max) {
390      return static_cast<TS>(value);
391    } else {
392      constexpr auto TS_min = std::numeric_limits<TS>::min();
393      return TS_min + static_cast<TS>(value - TS_min);
394    }
395  }
396  
397  #endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_