blob: fb400849fc5349ca785cd3f5a862c11f496e65b6 [file] [log] [blame]
/*
* Copyright 2015 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/detail/CacheLocality.h>
#include <sched.h>
#include <memory>
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <folly/Benchmark.h>
using namespace folly::detail;
/// Fake sysfs contents used by the FakeSysfs test: maps a sysfs file path
/// to the text that file would contain. These are the relevant nodes from
/// a production box's sysfs tree. If you think this map is ugly you should
/// see the version of this test that used a real directory tree. To reduce
/// the chance of testing error the common path prefix has deliberately not
/// been factored out.
///
/// The tree describes cpu0 through cpu31. For every cpu, cache index0
/// (Data), index1 (Instruction) and index2 (Unified) are shared by a small
/// group of cpus, and index3 (Unified) is shared by one of two large groups,
/// "0-8,17-23" or "9-16,24-31". Note that the sharing is not uniform: cpu5
/// and cpu6 share with each other ("5-6") rather than with a cpu numbered
/// 17 or higher.
static std::unordered_map<std::string,std::string> fakeSysfsTree = {
{ "/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list", "0,17" },
{ "/sys/devices/system/cpu/cpu0/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list", "0,17" },
{ "/sys/devices/system/cpu/cpu0/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list", "0,17" },
{ "/sys/devices/system/cpu/cpu0/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu0/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu1/cache/index0/shared_cpu_list", "1,18" },
{ "/sys/devices/system/cpu/cpu1/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu1/cache/index1/shared_cpu_list", "1,18" },
{ "/sys/devices/system/cpu/cpu1/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list", "1,18" },
{ "/sys/devices/system/cpu/cpu1/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu1/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu1/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu2/cache/index0/shared_cpu_list", "2,19" },
{ "/sys/devices/system/cpu/cpu2/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu2/cache/index1/shared_cpu_list", "2,19" },
{ "/sys/devices/system/cpu/cpu2/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu2/cache/index2/shared_cpu_list", "2,19" },
{ "/sys/devices/system/cpu/cpu2/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu2/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu2/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu3/cache/index0/shared_cpu_list", "3,20" },
{ "/sys/devices/system/cpu/cpu3/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu3/cache/index1/shared_cpu_list", "3,20" },
{ "/sys/devices/system/cpu/cpu3/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_list", "3,20" },
{ "/sys/devices/system/cpu/cpu3/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu3/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu3/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu4/cache/index0/shared_cpu_list", "4,21" },
{ "/sys/devices/system/cpu/cpu4/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu4/cache/index1/shared_cpu_list", "4,21" },
{ "/sys/devices/system/cpu/cpu4/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu4/cache/index2/shared_cpu_list", "4,21" },
{ "/sys/devices/system/cpu/cpu4/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu4/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu4/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu5/cache/index0/shared_cpu_list", "5-6" },
{ "/sys/devices/system/cpu/cpu5/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu5/cache/index1/shared_cpu_list", "5-6" },
{ "/sys/devices/system/cpu/cpu5/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu5/cache/index2/shared_cpu_list", "5-6" },
{ "/sys/devices/system/cpu/cpu5/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu5/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu5/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu6/cache/index0/shared_cpu_list", "5-6" },
{ "/sys/devices/system/cpu/cpu6/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu6/cache/index1/shared_cpu_list", "5-6" },
{ "/sys/devices/system/cpu/cpu6/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu6/cache/index2/shared_cpu_list", "5-6" },
{ "/sys/devices/system/cpu/cpu6/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu6/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu6/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu7/cache/index0/shared_cpu_list", "7,22" },
{ "/sys/devices/system/cpu/cpu7/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu7/cache/index1/shared_cpu_list", "7,22" },
{ "/sys/devices/system/cpu/cpu7/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu7/cache/index2/shared_cpu_list", "7,22" },
{ "/sys/devices/system/cpu/cpu7/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu7/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu7/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list", "8,23" },
{ "/sys/devices/system/cpu/cpu8/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list", "8,23" },
{ "/sys/devices/system/cpu/cpu8/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list", "8,23" },
{ "/sys/devices/system/cpu/cpu8/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu8/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list", "9,24" },
{ "/sys/devices/system/cpu/cpu9/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list", "9,24" },
{ "/sys/devices/system/cpu/cpu9/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list", "9,24" },
{ "/sys/devices/system/cpu/cpu9/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list", "9-16,24-31" },
{ "/sys/devices/system/cpu/cpu9/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu10/cache/index0/shared_cpu_list", "10,25" },
{ "/sys/devices/system/cpu/cpu10/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu10/cache/index1/shared_cpu_list", "10,25" },
{ "/sys/devices/system/cpu/cpu10/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu10/cache/index2/shared_cpu_list", "10,25" },
{ "/sys/devices/system/cpu/cpu10/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu10/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu10/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu11/cache/index0/shared_cpu_list", "11,26" },
{ "/sys/devices/system/cpu/cpu11/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu11/cache/index1/shared_cpu_list", "11,26" },
{ "/sys/devices/system/cpu/cpu11/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu11/cache/index2/shared_cpu_list", "11,26" },
{ "/sys/devices/system/cpu/cpu11/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu11/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu11/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu12/cache/index0/shared_cpu_list", "12,27" },
{ "/sys/devices/system/cpu/cpu12/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu12/cache/index1/shared_cpu_list", "12,27" },
{ "/sys/devices/system/cpu/cpu12/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu12/cache/index2/shared_cpu_list", "12,27" },
{ "/sys/devices/system/cpu/cpu12/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu12/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu12/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu13/cache/index0/shared_cpu_list", "13,28" },
{ "/sys/devices/system/cpu/cpu13/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu13/cache/index1/shared_cpu_list", "13,28" },
{ "/sys/devices/system/cpu/cpu13/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu13/cache/index2/shared_cpu_list", "13,28" },
{ "/sys/devices/system/cpu/cpu13/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu13/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu13/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu14/cache/index0/shared_cpu_list", "14,29" },
{ "/sys/devices/system/cpu/cpu14/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu14/cache/index1/shared_cpu_list", "14,29" },
{ "/sys/devices/system/cpu/cpu14/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu14/cache/index2/shared_cpu_list", "14,29" },
{ "/sys/devices/system/cpu/cpu14/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu14/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu14/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu15/cache/index0/shared_cpu_list", "15,30" },
{ "/sys/devices/system/cpu/cpu15/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_list", "15,30" },
{ "/sys/devices/system/cpu/cpu15/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu15/cache/index2/shared_cpu_list", "15,30" },
{ "/sys/devices/system/cpu/cpu15/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu15/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu15/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu16/cache/index0/shared_cpu_list", "16,31" },
{ "/sys/devices/system/cpu/cpu16/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu16/cache/index1/shared_cpu_list", "16,31" },
{ "/sys/devices/system/cpu/cpu16/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu16/cache/index2/shared_cpu_list", "16,31" },
{ "/sys/devices/system/cpu/cpu16/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu16/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu16/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu17/cache/index0/shared_cpu_list", "0,17" },
{ "/sys/devices/system/cpu/cpu17/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu17/cache/index1/shared_cpu_list", "0,17" },
{ "/sys/devices/system/cpu/cpu17/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu17/cache/index2/shared_cpu_list", "0,17" },
{ "/sys/devices/system/cpu/cpu17/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu17/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu17/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu18/cache/index0/shared_cpu_list", "1,18" },
{ "/sys/devices/system/cpu/cpu18/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu18/cache/index1/shared_cpu_list", "1,18" },
{ "/sys/devices/system/cpu/cpu18/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu18/cache/index2/shared_cpu_list", "1,18" },
{ "/sys/devices/system/cpu/cpu18/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu18/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu18/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu19/cache/index0/shared_cpu_list", "2,19" },
{ "/sys/devices/system/cpu/cpu19/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu19/cache/index1/shared_cpu_list", "2,19" },
{ "/sys/devices/system/cpu/cpu19/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu19/cache/index2/shared_cpu_list", "2,19" },
{ "/sys/devices/system/cpu/cpu19/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu19/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu19/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu20/cache/index0/shared_cpu_list", "3,20" },
{ "/sys/devices/system/cpu/cpu20/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu20/cache/index1/shared_cpu_list", "3,20" },
{ "/sys/devices/system/cpu/cpu20/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu20/cache/index2/shared_cpu_list", "3,20" },
{ "/sys/devices/system/cpu/cpu20/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu20/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu20/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu21/cache/index0/shared_cpu_list", "4,21" },
{ "/sys/devices/system/cpu/cpu21/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu21/cache/index1/shared_cpu_list", "4,21" },
{ "/sys/devices/system/cpu/cpu21/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu21/cache/index2/shared_cpu_list", "4,21" },
{ "/sys/devices/system/cpu/cpu21/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu21/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu21/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu22/cache/index0/shared_cpu_list", "7,22" },
{ "/sys/devices/system/cpu/cpu22/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu22/cache/index1/shared_cpu_list", "7,22" },
{ "/sys/devices/system/cpu/cpu22/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu22/cache/index2/shared_cpu_list", "7,22" },
{ "/sys/devices/system/cpu/cpu22/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu22/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu22/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu23/cache/index0/shared_cpu_list", "8,23" },
{ "/sys/devices/system/cpu/cpu23/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu23/cache/index1/shared_cpu_list", "8,23" },
{ "/sys/devices/system/cpu/cpu23/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_list", "8,23" },
{ "/sys/devices/system/cpu/cpu23/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu23/cache/index3/shared_cpu_list", "0-8,17-23" },
{ "/sys/devices/system/cpu/cpu23/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu24/cache/index0/shared_cpu_list", "9,24" },
{ "/sys/devices/system/cpu/cpu24/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu24/cache/index1/shared_cpu_list", "9,24" },
{ "/sys/devices/system/cpu/cpu24/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu24/cache/index2/shared_cpu_list", "9,24" },
{ "/sys/devices/system/cpu/cpu24/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu24/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu24/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu25/cache/index0/shared_cpu_list", "10,25" },
{ "/sys/devices/system/cpu/cpu25/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu25/cache/index1/shared_cpu_list", "10,25" },
{ "/sys/devices/system/cpu/cpu25/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu25/cache/index2/shared_cpu_list", "10,25" },
{ "/sys/devices/system/cpu/cpu25/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu25/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu25/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu26/cache/index0/shared_cpu_list", "11,26" },
{ "/sys/devices/system/cpu/cpu26/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu26/cache/index1/shared_cpu_list", "11,26" },
{ "/sys/devices/system/cpu/cpu26/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu26/cache/index2/shared_cpu_list", "11,26" },
{ "/sys/devices/system/cpu/cpu26/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu26/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu26/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu27/cache/index0/shared_cpu_list", "12,27" },
{ "/sys/devices/system/cpu/cpu27/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu27/cache/index1/shared_cpu_list", "12,27" },
{ "/sys/devices/system/cpu/cpu27/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu27/cache/index2/shared_cpu_list", "12,27" },
{ "/sys/devices/system/cpu/cpu27/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu27/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu27/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu28/cache/index0/shared_cpu_list", "13,28" },
{ "/sys/devices/system/cpu/cpu28/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu28/cache/index1/shared_cpu_list", "13,28" },
{ "/sys/devices/system/cpu/cpu28/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu28/cache/index2/shared_cpu_list", "13,28" },
{ "/sys/devices/system/cpu/cpu28/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu28/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu28/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu29/cache/index0/shared_cpu_list", "14,29" },
{ "/sys/devices/system/cpu/cpu29/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu29/cache/index1/shared_cpu_list", "14,29" },
{ "/sys/devices/system/cpu/cpu29/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu29/cache/index2/shared_cpu_list", "14,29" },
{ "/sys/devices/system/cpu/cpu29/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu29/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu29/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu30/cache/index0/shared_cpu_list", "15,30" },
{ "/sys/devices/system/cpu/cpu30/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu30/cache/index1/shared_cpu_list", "15,30" },
{ "/sys/devices/system/cpu/cpu30/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu30/cache/index2/shared_cpu_list", "15,30" },
{ "/sys/devices/system/cpu/cpu30/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu30/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu30/cache/index3/type", "Unified" },
{ "/sys/devices/system/cpu/cpu31/cache/index0/shared_cpu_list", "16,31" },
{ "/sys/devices/system/cpu/cpu31/cache/index0/type", "Data" },
{ "/sys/devices/system/cpu/cpu31/cache/index1/shared_cpu_list", "16,31" },
{ "/sys/devices/system/cpu/cpu31/cache/index1/type", "Instruction" },
{ "/sys/devices/system/cpu/cpu31/cache/index2/shared_cpu_list", "16,31" },
{ "/sys/devices/system/cpu/cpu31/cache/index2/type", "Unified" },
{ "/sys/devices/system/cpu/cpu31/cache/index3/shared_cpu_list", "9-16,24-31"},
{ "/sys/devices/system/cpu/cpu31/cache/index3/type", "Unified" }
};
/// This is the expected CacheLocality structure for fakeSysfsTree. It is
/// compared field by field against the parsed result in the FakeSysfs test
/// and also drives the AccessSpreader Stubbed test.
/// NOTE(review): the initializers below are assumed to follow the member
/// order numCpus, numCachesByLevel, localityIndexByCpu -- confirm against
/// the CacheLocality declaration.
static const CacheLocality nonUniformExampleLocality = {
// 32 cpus (cpu0..cpu31 in fakeSysfsTree)
32,
// presumably the number of distinct caches at each level: 16 small sharing
// groups at the first two counted levels and 2 large index3 groups
{ 16, 16, 2 },
// one entry per cpu number; cpus that share caches in fakeSysfsTree get
// adjacent values (e.g. cpu0 -> 0 and cpu17 -> 1, cpu5 -> 10 and cpu6 -> 11)
{ 0, 2, 4, 6, 8, 10, 11, 12, 14, 16, 18, 20, 22, 24, 26, 28,
30, 1, 3, 5, 7, 9, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 }
};
/// Feeds fakeSysfsTree to CacheLocality::readFromSysfsTree and checks every
/// field of the result against nonUniformExampleLocality.
TEST(CacheLocality, FakeSysfs) {
  // Stand-in for reading a sysfs file: a path that is not in the fake tree
  // yields an empty string.
  auto readFakeFile = [](std::string name) -> std::string {
    auto entry = fakeSysfsTree.find(name);
    if (entry == fakeSysfsTree.end()) {
      return std::string();
    }
    return entry->second;
  };

  auto parsed = CacheLocality::readFromSysfsTree(readFakeFile);

  auto& expected = nonUniformExampleLocality;
  EXPECT_EQ(expected.numCpus, parsed.numCpus);
  EXPECT_EQ(expected.numCachesByLevel, parsed.numCachesByLevel);
  EXPECT_EQ(expected.localityIndexByCpu, parsed.localityIndexByCpu);
}
/// Checks that the vdso-backed getcpu reports a cpu number below
/// CPU_SETSIZE.
TEST(Getcpu, VdsoGetcpu) {
  auto vdsoGetcpu = Getcpu::vdsoFunc();
  if (vdsoGetcpu == nullptr) {
    // NOTE(review): vdsoFunc() is assumed to yield null when no vdso getcpu
    // can be resolved (this file treats SequentialThreadId as the fallback
    // for that case) -- confirm against CacheLocality.h. Calling through a
    // null function pointer would crash instead of reporting anything.
    LOG(INFO) << "vdso getcpu is not available, nothing to check";
    return;
  }

  // Seed with an out-of-range value so the check below fails
  // deterministically, rather than reading an indeterminate value, if the
  // call never writes its result.
  unsigned cpu = CPU_SETSIZE;
  vdsoGetcpu(&cpu, nullptr, nullptr);
  EXPECT_TRUE(cpu < CPU_SETSIZE);
}
/// SequentialThreadId::getcpu must succeed, report a non-zero id, and report
/// the same id when the same thread asks again.
TEST(SequentialThreadId, Simple) {
  using ThreadId = SequentialThreadId<std::atomic>;

  unsigned first = 0;
  auto status = ThreadId::getcpu(&first, nullptr, nullptr);
  EXPECT_EQ(status, 0);
  EXPECT_TRUE(first > 0);

  // second lookup from the same thread
  unsigned second;
  ThreadId::getcpu(&second, nullptr, nullptr);
  EXPECT_EQ(first, second);
}
/// The cpu number that testingGetcpu reports. Tests assign to it directly.
/// Declared with FOLLY_TLS (presumably thread-local storage -- confirm).
static FOLLY_TLS unsigned testingCpu = 0;

/// Getcpu-style stub for injecting a chosen cpu number into AccessSpreader.
/// Writes testingCpu to *cpu and to *node (each only when non-null), ignores
/// `unused`, and always returns 0.
static int testingGetcpu(unsigned* cpu, unsigned* node, void* unused) {
  if (cpu) {
    *cpu = testingCpu;
  }
  if (node) {
    *node = testingCpu;
  }
  return 0;
}
/// Drives AccessSpreader with the stubbed testingGetcpu and the non-uniform
/// example locality, and checks the stripe chosen for every cpu at every
/// width from 1 to 99.
TEST(AccessSpreader, Stubbed) {
  // Index == stripe count; slot 0 is never populated or used.
  std::vector<std::unique_ptr<AccessSpreader<>>> spreaders(100);
  for (size_t width = 1; width < spreaders.size(); ++width) {
    spreaders[width] = std::unique_ptr<AccessSpreader<>>(
        new AccessSpreader<>(width, nonUniformExampleLocality, &testingGetcpu));
  }

  // cpu numbers listed so that position i holds the cpu whose entry in
  // nonUniformExampleLocality's per-cpu table is i
  std::vector<size_t> cpusInLocalityOrder = {
      0,  17, 1,  18, 2,  19, 3,  20, 4,  21, 5,  6,  7,  22, 8,  23,
      9,  24, 10, 25, 11, 26, 12, 27, 13, 28, 14, 29, 15, 30, 16, 31 };

  for (size_t i = 0; i < 32; ++i) {
    // the extra i * 64 (a multiple of the 32 cpus) is to check the wrapping
    // behavior of the impl for cpu numbers beyond numCpus
    testingCpu = cpusInLocalityOrder[i] + i * 64;
    for (size_t s = 1; s < spreaders.size(); ++s) {
      const auto& spreader = *spreaders[s];
      EXPECT_EQ((i * s) / 32, spreader.current())
          << "i=" << i << ", cpu=" << testingCpu << ", s=" << s;
    }
  }
}
/// A spreader built with only a stripe count must report a stripe below
/// that count.
TEST(AccessSpreader, Default) {
  AccessSpreader<> sixteenWay(16);
  auto stripe = sixteenWay.current();
  EXPECT_LT(stripe, 16);
}
/// For every width from 1 to 199 the shared spreader must report a stripe
/// below that width.
TEST(AccessSpreader, Shared) {
  size_t width = 1;
  while (width < 200) {
    EXPECT_LT(AccessSpreader<>::shared(width).current(), width);
    ++width;
  }
}
/// Logs the stripe counts of the two static spreaders, then checks that the
/// static current(width) stays below width for every width from 1 to 199.
TEST(AccessSpreader, Statics) {
  using Spreader = AccessSpreader<>;

  LOG(INFO) << "stripeByCore.numStripes() = "
            << Spreader::stripeByCore.numStripes();
  LOG(INFO) << "stripeByChip.numStripes() = "
            << Spreader::stripeByChip.numStripes();

  size_t width = 1;
  while (width < 200) {
    EXPECT_LT(Spreader::current(width), width);
    ++width;
  }
}
/// A cpu number c must select the same stripe as c % numCpus, for every
/// width from 1 to 199 and every reported cpu number from 0 to 399.
TEST(AccessSpreader, Wrapping) {
  // this test won't pass unless locality.numCpus divides kMaxCpus
  auto numCpus = 16;
  auto locality = CacheLocality::uniform(numCpus);

  for (size_t s = 1; s < 200; ++s) {
    AccessSpreader<> spreader(s, locality, &testingGetcpu);
    size_t c = 0;
    while (c < 400) {
      // stripe for the raw (possibly out-of-range) cpu number ...
      testingCpu = c;
      auto observed = spreader.current();
      // ... versus the stripe for the cpu number it should wrap to
      testingCpu = c % numCpus;
      auto expected = spreader.current();
      EXPECT_EQ(expected, observed)
          << "numCpus=" << numCpus << ", s=" << s << ", c=" << c;
      ++c;
    }
  }
}
// Benchmarked at ~21 nanos on fbk35 (2.6) and fbk18 (3.2) kernels with
// a 2.2Ghz Xeon
// ============================================================================
// folly/test/CacheLocalityTest.cpp relative time/iter iters/s
// ============================================================================
// LocalAccessSpreaderUse 20.77ns 48.16M
// SharedAccessSpreaderUse 21.95ns 45.55M
// AccessSpreaderConstruction 466.56ns 2.14M
// ============================================================================
BENCHMARK(LocalAccessSpreaderUse, iters) {
  // Times current() on a locally constructed 16-stripe spreader. The
  // construction happens while the suspender is active, before dismiss().
  folly::BenchmarkSuspender setup;
  AccessSpreader<> spreader(16);
  setup.dismiss();

  unsigned long done = 0;
  while (done < iters) {
    auto stripe = spreader.current();
    folly::doNotOptimizeAway(stripe);
    ++done;
  }
}
BENCHMARK(SharedAccessSpreaderUse, iters) {
  // Times the static AccessSpreader<>::current(16) entry point.
  unsigned long done = 0;
  while (done < iters) {
    auto stripe = AccessSpreader<>::current(16);
    folly::doNotOptimizeAway(stripe);
    ++done;
  }
}
BENCHMARK(AccessSpreaderConstruction, iters) {
  // Times constructing and destroying a 16-stripe spreader. The object is
  // placement-constructed into the same stack buffer every iteration.
  using Spreader = AccessSpreader<>;
  using Storage = std::aligned_storage<
      sizeof(Spreader), std::alignment_of<Spreader>::value>::type;

  Storage raw;
  unsigned long done = 0;
  while (done < iters) {
    auto spreader = new (&raw) Spreader(16);
    folly::doNotOptimizeAway(spreader);
    spreader->~AccessSpreader();
    ++done;
  }
}
/// Selects how contentionAtWidth picks a stripe for each increment.
enum class SpreaderType {
  /// local AccessSpreader constructed with a null getcpu function
  GETCPU,
  /// AccessSpreader<>::current(stripes) called inside the hot loop
  SHARED,
  /// local AccessSpreader constructed with SequentialThreadId's getcpu
  TLS_RR
};
// Benchmark scores here reflect the time for 32 threads to perform an
// atomic increment on a dual-socket E5-2660 @ 2.2Ghz. Surprisingly,
// if we don't separate the counters onto unique 128 byte stripes the
// 1_stripe and 2_stripe results are identical, even though the L3 is
// claimed to have 64 byte cache lines.
//
// _stub means there was no call to getcpu or the tls round-robin
// implementation, because for a single stripe the cpu doesn't matter.
// _getcpu refers to the vdso getcpu implementation with a locally
// constructed AccessSpreader. _tls_rr refers to execution using
// SequentialThreadId, the fallback if the vdso getcpu isn't available.
// _shared refers to calling AccessSpreader<>::current(numStripes)
// inside the hot loop.
//
// At 16_stripe_0_work and 32_stripe_0_work there is only L1 traffic,
// so since the stripe selection is 21 nanos the atomic increments in
// the L1 is ~15 nanos. At width 8_stripe_0_work the line is expected
// to ping-pong almost every operation, since the loops have the same
// duration. Widths 4 and 2 have the same behavior, but each tour of the
// cache line is 4 and 8 cores long, respectively. These all suggest a
// lower bound of 60 nanos for intra-chip handoff and increment between
// the L1s.
//
// With 455 nanos (1K cycles) of busywork per contended increment, the
// system can hide all of the latency of a tour of length 4, but not
// quite one of length 8. I was a bit surprised at how much worse the
// non-striped version got. It seems that the inter-chip traffic also
// interferes with the L1-only localWork.load(). When the local work is
// doubled to about 1 microsecond we see that the inter-chip contention
// is still very important, but subdivisions on the same chip don't matter.
//
// sudo nice -n -20
// _bin/folly/test/cache_locality_test --benchmark --bm_min_iters=1000000
// ============================================================================
// folly/test/CacheLocalityTest.cpp relative time/iter iters/s
// ============================================================================
// contentionAtWidth(1_stripe_0_work_stub) 1.14us 873.64K
// contentionAtWidth(2_stripe_0_work_getcpu) 495.58ns 2.02M
// contentionAtWidth(4_stripe_0_work_getcpu) 232.99ns 4.29M
// contentionAtWidth(8_stripe_0_work_getcpu) 101.16ns 9.88M
// contentionAtWidth(16_stripe_0_work_getcpu) 41.93ns 23.85M
// contentionAtWidth(32_stripe_0_work_getcpu) 42.04ns 23.79M
// contentionAtWidth(64_stripe_0_work_getcpu) 41.94ns 23.84M
// contentionAtWidth(2_stripe_0_work_tls_rr) 1.00us 997.41K
// contentionAtWidth(4_stripe_0_work_tls_rr) 694.41ns 1.44M
// contentionAtWidth(8_stripe_0_work_tls_rr) 590.27ns 1.69M
// contentionAtWidth(16_stripe_0_work_tls_rr) 222.13ns 4.50M
// contentionAtWidth(32_stripe_0_work_tls_rr) 169.49ns 5.90M
// contentionAtWidth(64_stripe_0_work_tls_rr) 162.20ns 6.17M
// contentionAtWidth(2_stripe_0_work_shared) 495.54ns 2.02M
// contentionAtWidth(4_stripe_0_work_shared) 236.27ns 4.23M
// contentionAtWidth(8_stripe_0_work_shared) 114.81ns 8.71M
// contentionAtWidth(16_stripe_0_work_shared) 44.65ns 22.40M
// contentionAtWidth(32_stripe_0_work_shared) 41.76ns 23.94M
// contentionAtWidth(64_stripe_0_work_shared) 43.47ns 23.00M
// atomicIncrBaseline(local_incr_0_work) 20.39ns 49.06M
// ----------------------------------------------------------------------------
// contentionAtWidth(1_stripe_500_work_stub) 2.04us 491.13K
// contentionAtWidth(2_stripe_500_work_getcpu) 610.98ns 1.64M
// contentionAtWidth(4_stripe_500_work_getcpu) 507.72ns 1.97M
// contentionAtWidth(8_stripe_500_work_getcpu) 542.53ns 1.84M
// contentionAtWidth(16_stripe_500_work_getcpu) 496.55ns 2.01M
// contentionAtWidth(32_stripe_500_work_getcpu) 500.67ns 2.00M
// atomicIncrBaseline(local_incr_500_work) 484.69ns 2.06M
// ----------------------------------------------------------------------------
// contentionAtWidth(1_stripe_1000_work_stub) 2.11us 473.78K
// contentionAtWidth(2_stripe_1000_work_getcpu) 970.64ns 1.03M
// contentionAtWidth(4_stripe_1000_work_getcpu) 987.31ns 1.01M
// contentionAtWidth(8_stripe_1000_work_getcpu) 1.01us 985.52K
// contentionAtWidth(16_stripe_1000_work_getcpu) 986.09ns 1.01M
// contentionAtWidth(32_stripe_1000_work_getcpu) 960.23ns 1.04M
// atomicIncrBaseline(local_incr_1000_work) 950.63ns 1.05M
// ============================================================================
/// Benchmark body: numThreads threads each perform `iters` atomic
/// increments on one of `stripes` counters, doing `work` loads of a
/// thread-local atomic after every increment.
///
/// @param iters            increments performed by every thread
/// @param stripes          number of counters the increments are spread over
/// @param work             busywork loads performed after each increment
/// @param spreaderType     how the stripe is chosen (see SpreaderType)
/// @param counterAlignment distance in bytes between consecutive counters;
///                         must be at least sizeof(std::atomic<size_t>) and
///                         a multiple of its alignment
/// @param numThreads       number of contending threads
///
/// Everything up to the point where the threads are released runs while
/// the BenchmarkSuspender is active.
static void contentionAtWidth(size_t iters, size_t stripes, size_t work,
                              SpreaderType spreaderType,
                              size_t counterAlignment = 128,
                              size_t numThreads = 32) {
  folly::BenchmarkSuspender braces;

  AccessSpreader<> spreader(
      stripes,
      CacheLocality::system<std::atomic>(),
      spreaderType == SpreaderType::TLS_RR
          ? SequentialThreadId<std::atomic>::getcpu : nullptr);

  std::atomic<size_t> ready(0);
  std::atomic<bool> go(false);

  // while in theory the cache line size is 64 bytes, experiments show
  // that we get contention on 128 byte boundaries for Ivy Bridge. The
  // extra indirection adds 1 or 2 nanos
  assert(counterAlignment >= sizeof(std::atomic<size_t>));
  assert(counterAlignment % alignof(std::atomic<size_t>) == 0);
  std::vector<char> raw(counterAlignment * stripes);

  // Construct every counter exactly once, before any worker exists.
  // Letting each worker placement-new the same storage would have several
  // threads writing the same objects concurrently, which is a data race.
  std::vector<std::atomic<size_t>*> counters(stripes);
  for (size_t i = 0; i < stripes; ++i) {
    counters[i]
        = new (raw.data() + counterAlignment * i) std::atomic<size_t>();
  }

  // if we happen to be using the tlsRoundRobin, then sequentially
  // assigning the thread identifiers is the unlikely best-case scenario.
  // We don't want to unfairly benefit or penalize. Computing the exact
  // maximum likelihood of the probability distributions is annoying, so
  // I approximate as 2/5 of the ids that have no threads, 2/5 that have
  // 1, 2/15 that have 2, and 1/15 that have 3. We accomplish this by
  // wrapping back to slot 0 when we hit 1/15 and 1/5.

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread([&, iters, stripes, work]() {
      // Private copy of the pointer table (a std::vector instead of the
      // non-standard variable-length array), so the hot loop only reads
      // memory owned by this thread before reaching the counters.
      const std::vector<std::atomic<size_t>*> myCounters(counters);

      // touch the spreader once before announcing readiness
      spreader.current();
      ready++;
      while (!go.load()) {
        sched_yield();
      }
      std::atomic<int> localWork(0);
      if (spreaderType == SpreaderType::SHARED) {
        for (size_t i = iters; i > 0; --i) {
          ++*(myCounters[AccessSpreader<>::current(stripes)]);
          for (size_t j = work; j > 0; --j) {
            localWork.load();
          }
        }
      } else {
        for (size_t i = iters; i > 0; --i) {
          ++*(myCounters[spreader.current()]);
          for (size_t j = work; j > 0; --j) {
            localWork.load();
          }
        }
      }
    }));

    if (threads.size() == numThreads / 15 ||
        threads.size() == numThreads / 5) {
      // create a few dummy threads to wrap back around to 0 mod numCpus
      for (size_t i = threads.size(); i != numThreads; ++i) {
        std::thread([&]() {
          spreader.current();
        }).join();
      }
    }
  }

  // wait until every worker is parked on `go`, then start the clock
  while (ready < numThreads) {
    sched_yield();
  }
  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}
/// Baseline for contentionAtWidth: numThreads threads each perform `iters`
/// increments of their own thread-local atomic counter, doing `work` loads
/// of a second thread-local atomic after every increment. Thread creation
/// runs while the BenchmarkSuspender is active.
static void atomicIncrBaseline(size_t iters, size_t work,
                               size_t numThreads = 32) {
  folly::BenchmarkSuspender braces;

  std::atomic<bool> go(false);

  // Worker body: park until released, then run the increment/busywork loop.
  auto worker = [&]() {
    while (!go.load()) {
      sched_yield();
    }
    std::atomic<size_t> localCounter(0);
    std::atomic<int> localWork(0);
    for (size_t remaining = iters; remaining > 0; --remaining) {
      localCounter++;
      for (size_t spins = work; spins > 0; --spins) {
        localWork.load();
      }
    }
  };

  std::vector<std::thread> threads;
  while (threads.size() < numThreads) {
    threads.push_back(std::thread(worker));
  }

  braces.dismiss();
  go = true;

  for (auto& thr : threads) {
    thr.join();
  }
}
// Benchmark registrations. The arguments after each benchmark name are
// forwarded to the named function after its leading iteration count: for
// contentionAtWidth they are (stripes, work, spreaderType), for
// atomicIncrBaseline just (work).
BENCHMARK_DRAW_LINE()
// --- no busywork (work = 0) ---
// single stripe: the cpu doesn't matter, so this is the "_stub" case
BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_0_work_stub,
1, 0, SpreaderType::GETCPU)
// locally constructed spreader, 2 to 64 stripes
BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_getcpu,
2, 0, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_getcpu,
4, 0, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_getcpu,
8, 0, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_getcpu,
16, 0, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_getcpu,
32, 0, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_getcpu,
64, 0, SpreaderType::GETCPU)
// SequentialThreadId-based spreader, 2 to 64 stripes
BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_tls_rr,
2, 0, SpreaderType::TLS_RR)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_tls_rr,
4, 0, SpreaderType::TLS_RR)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_tls_rr,
8, 0, SpreaderType::TLS_RR)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_tls_rr,
16, 0, SpreaderType::TLS_RR)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_tls_rr,
32, 0, SpreaderType::TLS_RR)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_tls_rr,
64, 0, SpreaderType::TLS_RR)
// AccessSpreader<>::current(stripes) in the hot loop, 2 to 64 stripes
BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_0_work_shared,
2, 0, SpreaderType::SHARED)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_0_work_shared,
4, 0, SpreaderType::SHARED)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_0_work_shared,
8, 0, SpreaderType::SHARED)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_0_work_shared,
16, 0, SpreaderType::SHARED)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_0_work_shared,
32, 0, SpreaderType::SHARED)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 64_stripe_0_work_shared,
64, 0, SpreaderType::SHARED)
// uncontended thread-local increment, for comparison
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_0_work, 0)
BENCHMARK_DRAW_LINE()
// --- 500 busywork loads per increment ---
BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_500_work_stub,
1, 500, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_500_work_getcpu,
2, 500, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_500_work_getcpu,
4, 500, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_500_work_getcpu,
8, 500, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_500_work_getcpu,
16, 500, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_500_work_getcpu,
32, 500, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_500_work, 500)
BENCHMARK_DRAW_LINE()
// --- 1000 busywork loads per increment ---
BENCHMARK_NAMED_PARAM(contentionAtWidth, 1_stripe_1000_work_stub,
1, 1000, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 2_stripe_1000_work_getcpu,
2, 1000, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 4_stripe_1000_work_getcpu,
4, 1000, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 8_stripe_1000_work_getcpu,
8, 1000, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 16_stripe_1000_work_getcpu,
16, 1000, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(contentionAtWidth, 32_stripe_1000_work_getcpu,
32, 1000, SpreaderType::GETCPU)
BENCHMARK_NAMED_PARAM(atomicIncrBaseline, local_incr_1000_work, 1000)
/// Runs the unit tests, then the benchmarks, but only when every test
/// passed and the benchmark flag is set. Returns the gtest result.
int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  auto testStatus = RUN_ALL_TESTS();
  if (testStatus) {
    // at least one test failed: skip the benchmarks
    return testStatus;
  }
  if (FLAGS_benchmark) {
    folly::runBenchmarks();
  }
  return testStatus;
}