blob: 330d71008932706412683a47898d5f66cf991433 [file] [log] [blame]
/*
* Copyright 2015 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/ThreadCachedInt.h>
#include <folly/Hash.h>
#include <atomic>
#include <thread>
#include <gtest/gtest.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <folly/Benchmark.h>
using namespace folly;
TEST(ThreadCachedInt, SingleThreadedNotCached) {
ThreadCachedInt<int64_t> val(0, 0);
EXPECT_EQ(0, val.readFast());
++val;
EXPECT_EQ(1, val.readFast());
for (int i = 0; i < 41; ++i) {
val.increment(1);
}
EXPECT_EQ(42, val.readFast());
--val;
EXPECT_EQ(41, val.readFast());
}
// Note: This is somewhat fragile to the implementation. If this causes
// problems, feel free to remove it.
TEST(ThreadCachedInt, SingleThreadedCached) {
ThreadCachedInt<int64_t> val(0, 10);
EXPECT_EQ(0, val.readFast());
++val;
EXPECT_EQ(0, val.readFast());
for (int i = 0; i < 7; ++i) {
val.increment(1);
}
EXPECT_EQ(0, val.readFast());
EXPECT_EQ(0, val.readFastAndReset());
EXPECT_EQ(8, val.readFull());
EXPECT_EQ(8, val.readFullAndReset());
EXPECT_EQ(0, val.readFull());
EXPECT_EQ(0, val.readFast());
}
ThreadCachedInt<int32_t> globalInt32(0, 11);
ThreadCachedInt<int64_t> globalInt64(0, 11);
int kNumInserts = 100000;
DEFINE_int32(numThreads, 8, "Number simultaneous threads for benchmarks.");
#define CREATE_INC_FUNC(size) \
void incFunc ## size () { \
const int num = kNumInserts / FLAGS_numThreads; \
for (int i = 0; i < num; ++i) { \
++globalInt ## size ; \
} \
}
CREATE_INC_FUNC(64);
CREATE_INC_FUNC(32);
// Confirms counts are accurate with competing threads
TEST(ThreadCachedInt, MultiThreadedCached) {
kNumInserts = 100000;
CHECK_EQ(0, kNumInserts % FLAGS_numThreads) <<
"FLAGS_numThreads must evenly divide kNumInserts (" << kNumInserts << ").";
const int numPerThread = kNumInserts / FLAGS_numThreads;
ThreadCachedInt<int64_t> TCInt64(0, numPerThread - 2);
{
std::atomic<bool> run(true);
std::atomic<int> threadsDone(0);
std::vector<std::thread> threads;
for (int i = 0; i < FLAGS_numThreads; ++i) {
threads.push_back(std::thread([&] {
FOR_EACH_RANGE(k, 0, numPerThread) {
++TCInt64;
}
std::atomic_fetch_add(&threadsDone, 1);
while (run.load()) { usleep(100); }
}));
}
// We create and increment another ThreadCachedInt here to make sure it
// doesn't interact with the other instances
ThreadCachedInt<int64_t> otherTCInt64(0, 10);
otherTCInt64.set(33);
++otherTCInt64;
while (threadsDone.load() < FLAGS_numThreads) { usleep(100); }
++otherTCInt64;
// Threads are done incrementing, but caches have not been flushed yet, so
// we have to readFull.
EXPECT_NE(kNumInserts, TCInt64.readFast());
EXPECT_EQ(kNumInserts, TCInt64.readFull());
run.store(false);
for (auto& t : threads) {
t.join();
}
} // Caches are flushed when threads finish
EXPECT_EQ(kNumInserts, TCInt64.readFast());
}
#define MAKE_MT_CACHE_SIZE_BM(size) \
void BM_mt_cache_size ## size (int iters, int cacheSize) { \
kNumInserts = iters; \
globalInt ## size.set(0); \
globalInt ## size.setCacheSize(cacheSize); \
std::vector<std::thread> threads; \
for (int i = 0; i < FLAGS_numThreads; ++i) { \
threads.push_back(std::thread(incFunc ## size)); \
} \
for (auto& t : threads) { \
t.join(); \
} \
}
MAKE_MT_CACHE_SIZE_BM(64);
MAKE_MT_CACHE_SIZE_BM(32);
#define REG_BASELINE(name, inc_stmt) \
BENCHMARK(FB_CONCATENATE(BM_mt_baseline_, name), iters) { \
const int iterPerThread = iters / FLAGS_numThreads; \
std::vector<std::thread> threads; \
for (int i = 0; i < FLAGS_numThreads; ++i) { \
threads.push_back(std::thread([&]() { \
for (int i = 0; i < iterPerThread; ++i) { \
inc_stmt; \
} \
})); \
} \
for (auto& t : threads) { \
t.join(); \
} \
}
ThreadLocal<int64_t> globalTL64Baseline;
ThreadLocal<int32_t> globalTL32Baseline;
std::atomic<int64_t> globalInt64Baseline(0);
std::atomic<int32_t> globalInt32Baseline(0);
FOLLY_TLS int64_t global__thread64;
FOLLY_TLS int32_t global__thread32;
// Alternate lock-free implementation. Achieves about the same performance,
// but uses about 20x more memory than ThreadCachedInt with 24 threads.
struct ShardedAtomicInt {
static const int64_t kBuckets_ = 2048;
std::atomic<int64_t> ints_[kBuckets_];
inline void inc(int64_t val = 1) {
int bucket = hash::twang_mix64(
uint64_t(pthread_self())) & (kBuckets_ - 1);
std::atomic_fetch_add(&ints_[bucket], val);
}
// read the first few and extrapolate
int64_t readFast() {
int64_t ret = 0;
static const int numToRead = 8;
FOR_EACH_RANGE(i, 0, numToRead) {
ret += ints_[i].load(std::memory_order_relaxed);
}
return ret * (kBuckets_ / numToRead);
}
// readFull is lock-free, but has to do thousands of loads...
int64_t readFull() {
int64_t ret = 0;
for (auto& i : ints_) {
// Fun fact - using memory_order_consume below reduces perf 30-40% in high
// contention benchmarks.
ret += i.load(std::memory_order_relaxed);
}
return ret;
}
};
ShardedAtomicInt shd_int64;
REG_BASELINE(_thread64, global__thread64 += 1);
REG_BASELINE(_thread32, global__thread32 += 1);
REG_BASELINE(ThreadLocal64, *globalTL64Baseline += 1);
REG_BASELINE(ThreadLocal32, *globalTL32Baseline += 1);
REG_BASELINE(atomic_inc64,
std::atomic_fetch_add(&globalInt64Baseline, int64_t(1)));
REG_BASELINE(atomic_inc32,
std::atomic_fetch_add(&globalInt32Baseline, int32_t(1)));
REG_BASELINE(ShardedAtm64, shd_int64.inc());
BENCHMARK_PARAM(BM_mt_cache_size64, 0);
BENCHMARK_PARAM(BM_mt_cache_size64, 10);
BENCHMARK_PARAM(BM_mt_cache_size64, 100);
BENCHMARK_PARAM(BM_mt_cache_size64, 1000);
BENCHMARK_PARAM(BM_mt_cache_size32, 0);
BENCHMARK_PARAM(BM_mt_cache_size32, 10);
BENCHMARK_PARAM(BM_mt_cache_size32, 100);
BENCHMARK_PARAM(BM_mt_cache_size32, 1000);
BENCHMARK_DRAW_LINE();
// single threaded
BENCHMARK(Atomic_readFull) {
doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed));
}
BENCHMARK(ThrCache_readFull) {
doNotOptimizeAway(globalInt64.readFull());
}
BENCHMARK(Sharded_readFull) {
doNotOptimizeAway(shd_int64.readFull());
}
BENCHMARK(ThrCache_readFast) {
doNotOptimizeAway(globalInt64.readFast());
}
BENCHMARK(Sharded_readFast) {
doNotOptimizeAway(shd_int64.readFast());
}
BENCHMARK_DRAW_LINE();
// multi threaded
REG_BASELINE(Atomic_readFull,
doNotOptimizeAway(globalInt64Baseline.load(std::memory_order_relaxed)));
REG_BASELINE(ThrCache_readFull, doNotOptimizeAway(globalInt64.readFull()));
REG_BASELINE(Sharded_readFull, doNotOptimizeAway(shd_int64.readFull()));
REG_BASELINE(ThrCache_readFast, doNotOptimizeAway(globalInt64.readFast()));
REG_BASELINE(Sharded_readFast, doNotOptimizeAway(shd_int64.readFast()));
BENCHMARK_DRAW_LINE();
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
gflags::ParseCommandLineFlags(&argc, &argv, true);
gflags::SetCommandLineOptionWithMode(
"bm_min_usec", "10000", gflags::SET_FLAG_IF_DEFAULT
);
if (FLAGS_benchmark) {
folly::runBenchmarks();
}
return RUN_ALL_TESTS();
}
/*
Ran with 20 threads on dual 12-core Xeon(R) X5650 @ 2.67GHz with 12-MB caches
Benchmark Iters Total t t/iter iter/sec
------------------------------------------------------------------------------
+ 103% BM_mt_baseline__thread64 10000000 13.54 ms 1.354 ns 704.4 M
* BM_mt_baseline__thread32 10000000 6.651 ms 665.1 ps 1.4 G
+50.3% BM_mt_baseline_ThreadLocal64 10000000 9.994 ms 999.4 ps 954.2 M
+49.9% BM_mt_baseline_ThreadLocal32 10000000 9.972 ms 997.2 ps 956.4 M
+2650% BM_mt_baseline_atomic_inc64 10000000 182.9 ms 18.29 ns 52.13 M
+2665% BM_mt_baseline_atomic_inc32 10000000 183.9 ms 18.39 ns 51.85 M
+75.3% BM_mt_baseline_ShardedAtm64 10000000 11.66 ms 1.166 ns 817.8 M
+6670% BM_mt_cache_size64/0 10000000 450.3 ms 45.03 ns 21.18 M
+1644% BM_mt_cache_size64/10 10000000 116 ms 11.6 ns 82.2 M
+ 381% BM_mt_cache_size64/100 10000000 32.04 ms 3.204 ns 297.7 M
+ 129% BM_mt_cache_size64/1000 10000000 15.24 ms 1.524 ns 625.8 M
+6052% BM_mt_cache_size32/0 10000000 409.2 ms 40.92 ns 23.31 M
+1304% BM_mt_cache_size32/10 10000000 93.39 ms 9.339 ns 102.1 M
+ 298% BM_mt_cache_size32/100 10000000 26.52 ms 2.651 ns 359.7 M
+68.1% BM_mt_cache_size32/1000 10000000 11.18 ms 1.118 ns 852.9 M
------------------------------------------------------------------------------
+10.4% Atomic_readFull 10000000 36.05 ms 3.605 ns 264.5 M
+ 619% ThrCache_readFull 10000000 235.1 ms 23.51 ns 40.57 M
SLOW Sharded_readFull 1981093 2 s 1.01 us 967.3 k
* ThrCache_readFast 10000000 32.65 ms 3.265 ns 292.1 M
+10.0% Sharded_readFast 10000000 35.92 ms 3.592 ns 265.5 M
------------------------------------------------------------------------------
+4.54% BM_mt_baseline_Atomic_readFull 10000000 8.672 ms 867.2 ps 1.074 G
SLOW BM_mt_baseline_ThrCache_readFull 10000000 996.9 ms 99.69 ns 9.567 M
SLOW BM_mt_baseline_Sharded_readFull 10000000 891.5 ms 89.15 ns 10.7 M
* BM_mt_baseline_ThrCache_readFast 10000000 8.295 ms 829.5 ps 1.123 G
+12.7% BM_mt_baseline_Sharded_readFast 10000000 9.348 ms 934.8 ps 1020 M
------------------------------------------------------------------------------
*/