Mirror of https://github.com/ossrs/srs.git, synced 2025-03-09 15:49:59 +00:00
Upgrade gperftools to 2.9 for GCP/GMC/GMP/GMD. (#2247)
parent 63da0dca92
commit 44e9dc83e9
346 changed files with 169666 additions and 78 deletions
trunk/3rdparty/gperftools-2-fit/benchmark/malloc_bench.cc (vendored, normal file, 293 lines added)
@@ -0,0 +1,293 @@
// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>

#include <algorithm>

#include "run_benchmark.h"

static void bench_fastpath_throughput(long iterations,
                                      uintptr_t param)
{
  size_t sz = 32;
  for (; iterations>0; iterations--) {
    void *p = malloc(sz);
    if (!p) {
      abort();
    }
    free(p);
    // this makes the next iteration use a different free list, so
    // subsequent iterations may actually overlap in time.
    sz = ((sz * 8191) & 511) + 16;
  }
}
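A note on the recurrence sz = ((sz * 8191) & 511) + 16 above: because 8191 is congruent to -1 modulo 512, starting from sz = 32 the sequence simply alternates between 32 and 496, so consecutive iterations do land on different free lists, as the comment says. A minimal standalone check of this, separate from malloc_bench.cc and for illustration only:

#include <stdio.h>
#include <stddef.h>

int main(void) {
  size_t sz = 32;
  for (int i = 0; i < 8; i++) {
    printf("%zu ", sz);
    sz = ((sz * 8191) & 511) + 16;  // same recurrence as bench_fastpath_throughput
  }
  printf("\n");  // prints: 32 496 32 496 32 496 32 496
  return 0;
}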

static void bench_fastpath_dependent(long iterations,
                                     uintptr_t param)
{
  size_t sz = 32;
  for (; iterations>0; iterations--) {
    void *p = malloc(sz);
    if (!p) {
      abort();
    }
    free(p);
    // this makes the next iteration depend on the current one, but this
    // iteration's free may still overlap with the next iteration's malloc
    sz = ((sz | reinterpret_cast<size_t>(p)) & 511) + 16;
  }
}

static void bench_fastpath_simple(long iterations,
                                  uintptr_t param)
{
  size_t sz = static_cast<size_t>(param);
  for (; iterations>0; iterations--) {
    void *p = malloc(sz);
    if (!p) {
      abort();
    }
    free(p);
    // the next iteration will use the same free list as this one, so it
    // should prevent the next iteration's malloc from running too far
    // ahead of this free. But using the same size makes the free "too
    // fast" since we'll hit the size class cache.
  }
}

#ifdef __GNUC__
#define HAVE_SIZED_FREE_OPTION

extern "C" void tc_free_sized(void *ptr, size_t size) __attribute__((weak));
extern "C" void *tc_memalign(size_t align, size_t size) __attribute__((weak));

static bool is_sized_free_available(void)
{
  return tc_free_sized != NULL;
}

static bool is_memalign_available(void)
{
  return tc_memalign != NULL;
}

static void bench_fastpath_simple_sized(long iterations,
                                        uintptr_t param)
{
  size_t sz = static_cast<size_t>(param);
  for (; iterations>0; iterations--) {
    void *p = malloc(sz);
    if (!p) {
      abort();
    }
    tc_free_sized(p, sz);
    // the next iteration will use the same free list as this one, so it
    // should prevent the next iteration's malloc from running too far
    // ahead of this free. But using the same size makes the free "too
    // fast" since we'll hit the size class cache.
  }
}

static void bench_fastpath_memalign(long iterations,
                                    uintptr_t param)
{
  size_t sz = static_cast<size_t>(param);
  for (; iterations>0; iterations--) {
    void *p = tc_memalign(32, sz);
    if (!p) {
      abort();
    }
    free(p);
    // the next iteration will use the same free list as this one, so it
    // should prevent the next iteration's malloc from running too far
    // ahead of this free. But using the same size makes the free "too
    // fast" since we'll hit the size class cache.
  }
}

#endif // __GNUC__

#define STACKSZ (1 << 16)

static void bench_fastpath_stack(long iterations,
                                 uintptr_t _param)
{
  void *stack[STACKSZ];
  size_t sz = 64;
  long param = static_cast<long>(_param);
  param &= STACKSZ - 1;
  param = param ? param : 1;
  for (; iterations>0; iterations -= param) {
    for (long k = param-1; k >= 0; k--) {
      void *p = malloc(sz);
      if (!p) {
        abort();
      }
      stack[k] = p;
      // this makes the next iteration depend on the result of this one
      sz = ((sz | reinterpret_cast<size_t>(p)) & 511) + 16;
    }
    for (long k = 0; k < param; k++) {
      free(stack[k]);
    }
  }
}

static void bench_fastpath_stack_simple(long iterations,
                                        uintptr_t _param)
{
  void *stack[STACKSZ];
  size_t sz = 128;
  long param = static_cast<long>(_param);
  param &= STACKSZ - 1;
  param = param ? param : 1;
  for (; iterations>0; iterations -= param) {
    for (long k = param-1; k >= 0; k--) {
      void *p = malloc(sz);
      if (!p) {
        abort();
      }
      stack[k] = p;
    }
    for (long k = 0; k < param; k++) {
      free(stack[k]);
    }
  }
}

static void bench_fastpath_rnd_dependent(long iterations,
                                         uintptr_t _param)
{
  static const uintptr_t rnd_c = 1013904223;
  static const uintptr_t rnd_a = 1664525;

  void *ptrs[STACKSZ];
  size_t sz = 128;
  if ((_param & (_param - 1))) {
    abort();
  }
  if (_param > STACKSZ) {
    abort();
  }
  int param = static_cast<int>(_param);

  for (; iterations>0; iterations -= param) {
    for (int k = param-1; k >= 0; k--) {
      void *p = malloc(sz);
      if (!p) {
        abort();
      }
      ptrs[k] = p;
      sz = ((sz | reinterpret_cast<size_t>(p)) & 511) + 16;
    }

    // this will iterate through all objects in an order that is
    // unpredictable to the processor's prefetchers
    uint32_t rnd = 0;
    uint32_t free_idx = 0;
    do {
      free(ptrs[free_idx]);
      rnd = rnd * rnd_a + rnd_c;
      free_idx = rnd & (param - 1);
    } while (free_idx != 0);
  }
}
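The free loop above relies on a property of the LCG constants: with rnd_c odd and rnd_a % 4 == 1, the recurrence rnd = rnd * rnd_a + rnd_c has full period modulo any power of two, so masking with (param - 1) makes free_idx visit every index in [0, param) exactly once before it returns to 0, and each allocated pointer is freed exactly once per pass. A small standalone check of that property, separate from malloc_bench.cc and for illustration only:

#include <stdio.h>
#include <stdint.h>
#include <vector>

int main(void) {
  const uint32_t rnd_a = 1664525, rnd_c = 1013904223;  // same constants as above
  const uint32_t param = 32;                           // any power of two <= (1 << 16)
  std::vector<int> seen(param, 0);
  uint32_t rnd = 0, free_idx = 0, steps = 0;
  do {
    seen[free_idx]++;                // stands in for free(ptrs[free_idx])
    rnd = rnd * rnd_a + rnd_c;
    free_idx = rnd & (param - 1);
    steps++;
  } while (free_idx != 0);
  bool ok = (steps == param);
  for (uint32_t i = 0; i < param; i++) {
    ok = ok && (seen[i] == 1);
  }
  printf("visited %u slots in %u steps, each exactly once: %s\n",
         param, steps, ok ? "yes" : "no");
  return 0;
}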

static void *randomize_buffer[13<<20];


void randomize_one_size_class(size_t size) {
  int count = (100<<20) / size;
  if (count * sizeof(randomize_buffer[0]) > sizeof(randomize_buffer)) {
    abort();
  }
  for (int i = 0; i < count; i++) {
    randomize_buffer[i] = malloc(size);
  }
  std::random_shuffle(randomize_buffer, randomize_buffer + count);
  for (int i = 0; i < count; i++) {
    free(randomize_buffer[i]);
  }
}

void randomize_size_classes() {
  randomize_one_size_class(8);
  int i;
  for (i = 16; i < 256; i += 16) {
    randomize_one_size_class(i);
  }
  for (; i < 512; i += 32) {
    randomize_one_size_class(i);
  }
  for (; i < 1024; i += 64) {
    randomize_one_size_class(i);
  }
  for (; i < (4 << 10); i += 128) {
    randomize_one_size_class(i);
  }
  for (; i < (32 << 10); i += 1024) {
    randomize_one_size_class(i);
  }
}

int main(void)
{
  randomize_size_classes();

  report_benchmark("bench_fastpath_throughput", bench_fastpath_throughput, 0);
  report_benchmark("bench_fastpath_dependent", bench_fastpath_dependent, 0);
  report_benchmark("bench_fastpath_simple", bench_fastpath_simple, 64);
  report_benchmark("bench_fastpath_simple", bench_fastpath_simple, 2048);
  report_benchmark("bench_fastpath_simple", bench_fastpath_simple, 16384);

#ifdef HAVE_SIZED_FREE_OPTION
  if (is_sized_free_available()) {
    report_benchmark("bench_fastpath_simple_sized", bench_fastpath_simple_sized, 64);
    report_benchmark("bench_fastpath_simple_sized", bench_fastpath_simple_sized, 2048);
  }

  if (is_memalign_available()) {
    report_benchmark("bench_fastpath_memalign", bench_fastpath_memalign, 64);
    report_benchmark("bench_fastpath_memalign", bench_fastpath_memalign, 2048);
  }

#endif

  for (int i = 8; i <= 512; i <<= 1) {
    report_benchmark("bench_fastpath_stack", bench_fastpath_stack, i);
  }
  report_benchmark("bench_fastpath_stack_simple", bench_fastpath_stack_simple, 32);
  report_benchmark("bench_fastpath_stack_simple", bench_fastpath_stack_simple, 8192);
  report_benchmark("bench_fastpath_rnd_dependent", bench_fastpath_rnd_dependent, 32);
  report_benchmark("bench_fastpath_rnd_dependent", bench_fastpath_rnd_dependent, 8192);
  return 0;
}
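The harness behind report_benchmark() lives in run_benchmark.h / run_benchmark.cc in the same vendored benchmark directory and is not shown in this hunk. From the call sites above, its signature is evidently along the lines of void report_benchmark(const char *name, void (*body)(long iterations, uintptr_t param), uintptr_t param). For readers who want to try malloc_bench.cc on its own outside the gperftools tree, a rough stand-in could look like the sketch below; the fixed iteration count, the timing method, and the output format here are assumptions, not the behavior of the vendored run_benchmark.cc.

// run_benchmark_stub.h: a simplified, assumed stand-in for run_benchmark.h,
// only so malloc_bench.cc can be compiled and run on its own. Not the
// vendored gperftools harness.
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <time.h>

typedef void (*bench_body)(long iterations, uintptr_t param);

static inline void report_benchmark(const char *name, bench_body body,
                                    uintptr_t param) {
  const long iterations = 10 * 1000 * 1000;  // fixed count; the real harness may differ
  struct timespec t0, t1;
  clock_gettime(CLOCK_MONOTONIC, &t0);
  body(iterations, param);
  clock_gettime(CLOCK_MONOTONIC, &t1);
  double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (double)(t1.tv_nsec - t0.tv_nsec);
  printf("%-36s param=%-6lu %.2f ns/op\n",
         name, (unsigned long)param, ns / (double)iterations);
}

Whichever harness is used, the tc_free_sized and tc_memalign weak symbols only resolve when the binary is linked against tcmalloc (e.g. -ltcmalloc_minimal); with another malloc they stay NULL and the guarded benchmarks are skipped, as the is_sized_free_available() and is_memalign_available() checks in main() show.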