| /* |
| * Copyright 2015 Facebook, Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // @author Mark Rabkin (mrabkin@fb.com) |
| // @author Andrei Alexandrescu (andrei.alexandrescu@fb.com) |
| |
| #include <folly/Range.h> |
| |
| #if FOLLY_HAVE_EMMINTRIN_H |
| #include <emmintrin.h> // __v16qi |
| #endif |
| #include <iostream> |
| |
| namespace folly { |
| |
| /** |
| * Predicates that can be used with qfind and startsWith |
| */ |
| const AsciiCaseSensitive asciiCaseSensitive = AsciiCaseSensitive(); |
| const AsciiCaseInsensitive asciiCaseInsensitive = AsciiCaseInsensitive(); |
| |
| namespace { |
| |
| // It's okay if pages are bigger than this (as powers of two), but they should |
| // not be smaller. |
| constexpr size_t kMinPageSize = 4096; |
| static_assert(kMinPageSize >= 16, |
| "kMinPageSize must be at least SSE register size"); |
| #define PAGE_FOR(addr) \ |
| (reinterpret_cast<uintptr_t>(addr) / kMinPageSize) |
| |
| |
| // Earlier versions of GCC (for example, Clang on Mac OS X, which is based on |
| // GCC 4.2) do not have a full compliment of SSE builtins. |
| #if FOLLY_HAVE_EMMINTRIN_H && __GNUC_PREREQ(4, 6) |
| inline size_t nextAlignedIndex(const char* arr) { |
| auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1; |
| return 1 + // add 1 because the index starts at 'arr' |
| ((firstPossible + 15) & ~0xF) // round up to next multiple of 16 |
| - firstPossible; |
| } |
| |
| // build sse4.2-optimized version even if -msse4.2 is not passed to GCC |
| size_t qfind_first_byte_of_needles16(const StringPiece haystack, |
| const StringPiece needles) |
| __attribute__ ((__target__("sse4.2"), noinline)) |
| FOLLY_DISABLE_ADDRESS_SANITIZER; |
| |
| // helper method for case where needles.size() <= 16 |
| size_t qfind_first_byte_of_needles16(const StringPiece haystack, |
| const StringPiece needles) { |
| DCHECK(!haystack.empty()); |
| DCHECK(!needles.empty()); |
| DCHECK_LE(needles.size(), 16); |
| if ((needles.size() <= 2 && haystack.size() >= 256) || |
| // must bail if we can't even SSE-load a single segment of haystack |
| (haystack.size() < 16 && |
| PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 15)) || |
| // can't load needles into SSE register if it could cross page boundary |
| PAGE_FOR(needles.end() - 1) != PAGE_FOR(needles.data() + 15)) { |
| return detail::qfind_first_byte_of_nosse(haystack, needles); |
| } |
| |
| auto arr2 = __builtin_ia32_loaddqu(needles.data()); |
| // do an unaligned load for first block of haystack |
| auto arr1 = __builtin_ia32_loaddqu(haystack.data()); |
| auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(), |
| arr1, haystack.size(), 0); |
| if (index < 16) { |
| return index; |
| } |
| |
| // Now, we can do aligned loads hereafter... |
| size_t i = nextAlignedIndex(haystack.data()); |
| for (; i < haystack.size(); i+= 16) { |
| void* ptr1 = __builtin_assume_aligned(haystack.data() + i, 16); |
| auto arr1 = *reinterpret_cast<const __v16qi*>(ptr1); |
| auto index = __builtin_ia32_pcmpestri128(arr2, needles.size(), |
| arr1, haystack.size() - i, 0); |
| if (index < 16) { |
| return i + index; |
| } |
| } |
| return StringPiece::npos; |
| } |
| #endif // FOLLY_HAVE_EMMINTRIN_H && GCC 4.6+ |
| |
| // Aho, Hopcroft, and Ullman refer to this trick in "The Design and Analysis |
| // of Computer Algorithms" (1974), but the best description is here: |
| // http://research.swtch.com/sparse |
| class FastByteSet { |
| public: |
| FastByteSet() : size_(0) { } // no init of arrays required! |
| |
| inline void add(uint8_t i) { |
| if (!contains(i)) { |
| dense_[size_] = i; |
| sparse_[i] = size_; |
| size_++; |
| } |
| } |
| inline bool contains(uint8_t i) const { |
| DCHECK_LE(size_, 256); |
| return sparse_[i] < size_ && dense_[sparse_[i]] == i; |
| } |
| |
| private: |
| uint16_t size_; // can't use uint8_t because it would overflow if all |
| // possible values were inserted. |
| uint8_t sparse_[256]; |
| uint8_t dense_[256]; |
| }; |
| |
| } // namespace |
| |
| namespace detail { |
| |
| size_t qfind_first_byte_of_byteset(const StringPiece haystack, |
| const StringPiece needles) { |
| FastByteSet s; |
| for (auto needle: needles) { |
| s.add(needle); |
| } |
| for (size_t index = 0; index < haystack.size(); ++index) { |
| if (s.contains(haystack[index])) { |
| return index; |
| } |
| } |
| return StringPiece::npos; |
| } |
| |
| #if FOLLY_HAVE_EMMINTRIN_H && __GNUC_PREREQ(4, 6) |
| |
| template <bool HAYSTACK_ALIGNED> |
| size_t scanHaystackBlock(const StringPiece haystack, |
| const StringPiece needles, |
| uint64_t idx) |
| // inline is okay because it's only called from other sse4.2 functions |
| __attribute__ ((__target__("sse4.2"))) |
| // Turn off ASAN because the "arr2 = ..." assignment in the loop below reads |
| // up to 15 bytes beyond end of the buffer in #needles#. That is ok because |
| // ptr2 is always 16-byte aligned, so the read can never span a page boundary. |
| // Also, the extra data that may be read is never actually used. |
| FOLLY_DISABLE_ADDRESS_SANITIZER; |
| |
| // Scans a 16-byte block of haystack (starting at blockStartIdx) to find first |
| // needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned. |
| // If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the |
| // block. |
| template <bool HAYSTACK_ALIGNED> |
| size_t scanHaystackBlock(const StringPiece haystack, |
| const StringPiece needles, |
| uint64_t blockStartIdx) { |
| DCHECK_GT(needles.size(), 16); // should handled by *needles16() method |
| DCHECK(blockStartIdx + 16 <= haystack.size() || |
| (PAGE_FOR(haystack.data() + blockStartIdx) == |
| PAGE_FOR(haystack.data() + blockStartIdx + 15))); |
| |
| __v16qi arr1; |
| if (HAYSTACK_ALIGNED) { |
| void* ptr1 = __builtin_assume_aligned(haystack.data() + blockStartIdx, 16); |
| arr1 = *reinterpret_cast<const __v16qi*>(ptr1); |
| } else { |
| arr1 = __builtin_ia32_loaddqu(haystack.data() + blockStartIdx); |
| } |
| |
| // This load is safe because needles.size() >= 16 |
| auto arr2 = __builtin_ia32_loaddqu(needles.data()); |
| size_t b = __builtin_ia32_pcmpestri128( |
| arr2, 16, arr1, haystack.size() - blockStartIdx, 0); |
| |
| size_t j = nextAlignedIndex(needles.data()); |
| for (; j < needles.size(); j += 16) { |
| void* ptr2 = __builtin_assume_aligned(needles.data() + j, 16); |
| arr2 = *reinterpret_cast<const __v16qi*>(ptr2); |
| |
| auto index = __builtin_ia32_pcmpestri128( |
| arr2, needles.size() - j, arr1, haystack.size() - blockStartIdx, 0); |
| b = std::min<size_t>(index, b); |
| } |
| |
| if (b < 16) { |
| return blockStartIdx + b; |
| } |
| return StringPiece::npos; |
| } |
| |
| size_t qfind_first_byte_of_sse42(const StringPiece haystack, |
| const StringPiece needles) |
| __attribute__ ((__target__("sse4.2"), noinline)); |
| |
| size_t qfind_first_byte_of_sse42(const StringPiece haystack, |
| const StringPiece needles) { |
| if (UNLIKELY(needles.empty() || haystack.empty())) { |
| return StringPiece::npos; |
| } else if (needles.size() <= 16) { |
| // we can save some unnecessary load instructions by optimizing for |
| // the common case of needles.size() <= 16 |
| return qfind_first_byte_of_needles16(haystack, needles); |
| } |
| |
| if (haystack.size() < 16 && |
| PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 16)) { |
| // We can't safely SSE-load haystack. Use a different approach. |
| if (haystack.size() <= 2) { |
| return qfind_first_of(haystack, needles, asciiCaseSensitive); |
| } |
| return qfind_first_byte_of_byteset(haystack, needles); |
| } |
| |
| auto ret = scanHaystackBlock<false>(haystack, needles, 0); |
| if (ret != StringPiece::npos) { |
| return ret; |
| } |
| |
| size_t i = nextAlignedIndex(haystack.data()); |
| for (; i < haystack.size(); i += 16) { |
| auto ret = scanHaystackBlock<true>(haystack, needles, i); |
| if (ret != StringPiece::npos) { |
| return ret; |
| } |
| } |
| |
| return StringPiece::npos; |
| } |
| #endif // FOLLY_HAVE_EMMINTRIN_H && GCC 4.6+ |
| |
| size_t qfind_first_byte_of_nosse(const StringPiece haystack, |
| const StringPiece needles) { |
| if (UNLIKELY(needles.empty() || haystack.empty())) { |
| return StringPiece::npos; |
| } |
| // The thresholds below were empirically determined by benchmarking. |
| // This is not an exact science since it depends on the CPU, the size of |
| // needles, and the size of haystack. |
| if ((needles.size() >= 4 && haystack.size() <= 10) || |
| (needles.size() >= 16 && haystack.size() <= 64) || |
| needles.size() >= 32) { |
| return qfind_first_byte_of_byteset(haystack, needles); |
| } |
| return qfind_first_of(haystack, needles, asciiCaseSensitive); |
| } |
| |
| } // namespace detail |
| } // namespace folly |