| /* |
| * Copyright 2015 Facebook, Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef FOLLY_STRING_INL_H_ |
| #define FOLLY_STRING_INL_H_ |
| |
| #include <stdexcept> |
| #include <iterator> |
| |
| #ifndef FOLLY_BASE_STRING_H_ |
| #error This file may only be included from String.h |
| #endif |
| |
| namespace folly { |
| |
| namespace detail { |
| // Map from character code to value of one-character escape sequence |
| // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as |
| // an octal escape sequence, or 'P' if the character is printable and |
| // should be printed as is. |
| extern const char cEscapeTable[]; |
| } // namespace detail |
| |
| template <class String> |
| void cEscape(StringPiece str, String& out) { |
| char esc[4]; |
| esc[0] = '\\'; |
| out.reserve(out.size() + str.size()); |
| auto p = str.begin(); |
| auto last = p; // last regular character |
| // We advance over runs of regular characters (printable, not double-quote or |
| // backslash) and copy them in one go; this is faster than calling push_back |
| // repeatedly. |
| while (p != str.end()) { |
| char c = *p; |
| unsigned char v = static_cast<unsigned char>(c); |
| char e = detail::cEscapeTable[v]; |
| if (e == 'P') { // printable |
| ++p; |
| } else if (e == 'O') { // octal |
| out.append(&*last, p - last); |
| esc[1] = '0' + ((v >> 6) & 7); |
| esc[2] = '0' + ((v >> 3) & 7); |
| esc[3] = '0' + (v & 7); |
| out.append(esc, 4); |
| ++p; |
| last = p; |
| } else { // special 1-character escape |
| out.append(&*last, p - last); |
| esc[1] = e; |
| out.append(esc, 2); |
| ++p; |
| last = p; |
| } |
| } |
| out.append(&*last, p - last); |
| } |
| |
| namespace detail { |
| // Map from the character code of the character following a backslash to |
| // the unescaped character if a valid one-character escape sequence |
| // ('n' maps to 10 = '\n'), 'O' if this is the first character of an |
| // octal escape sequence, 'X' if this is the first character of a |
| // hexadecimal escape sequence, or 'I' if this escape sequence is invalid. |
| extern const char cUnescapeTable[]; |
| |
| // Map from the character code to the hex value, or 16 if invalid hex char. |
| extern const unsigned char hexTable[]; |
| } // namespace detail |
| |
| template <class String> |
| void cUnescape(StringPiece str, String& out, bool strict) { |
| out.reserve(out.size() + str.size()); |
| auto p = str.begin(); |
| auto last = p; // last regular character (not part of an escape sequence) |
| // We advance over runs of regular characters (not backslash) and copy them |
| // in one go; this is faster than calling push_back repeatedly. |
| while (p != str.end()) { |
| char c = *p; |
| if (c != '\\') { // normal case |
| ++p; |
| continue; |
| } |
| out.append(&*last, p - last); |
| if (p == str.end()) { // backslash at end of string |
| if (strict) { |
| throw std::invalid_argument("incomplete escape sequence"); |
| } |
| out.push_back('\\'); |
| last = p; |
| continue; |
| } |
| ++p; |
| char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)]; |
| if (e == 'O') { // octal |
| unsigned char val = 0; |
| for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7'; |
| ++i, ++p) { |
| val = (val << 3) | (*p - '0'); |
| } |
| out.push_back(val); |
| last = p; |
| } else if (e == 'X') { // hex |
| ++p; |
| if (p == str.end()) { // \x at end of string |
| if (strict) { |
| throw std::invalid_argument("incomplete hex escape sequence"); |
| } |
| out.append("\\x"); |
| last = p; |
| continue; |
| } |
| unsigned char val = 0; |
| unsigned char h; |
| for (; (p != str.end() && |
| (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16); |
| ++p) { |
| val = (val << 4) | h; |
| } |
| out.push_back(val); |
| last = p; |
| } else if (e == 'I') { // invalid |
| if (strict) { |
| throw std::invalid_argument("invalid escape sequence"); |
| } |
| out.push_back('\\'); |
| out.push_back(*p); |
| ++p; |
| last = p; |
| } else { // standard escape sequence, \' etc |
| out.push_back(e); |
| ++p; |
| last = p; |
| } |
| } |
| out.append(&*last, p - last); |
| } |
| |
| namespace detail { |
| // Map from character code to escape mode: |
| // 0 = pass through |
| // 1 = unused |
| // 2 = pass through in PATH mode |
| // 3 = space, replace with '+' in QUERY mode |
| // 4 = percent-encode |
| extern const unsigned char uriEscapeTable[]; |
| } // namespace detail |
| |
| template <class String> |
| void uriEscape(StringPiece str, String& out, UriEscapeMode mode) { |
| static const char hexValues[] = "0123456789abcdef"; |
| char esc[3]; |
| esc[0] = '%'; |
| // Preallocate assuming that 25% of the input string will be escaped |
| out.reserve(out.size() + str.size() + 3 * (str.size() / 4)); |
| auto p = str.begin(); |
| auto last = p; // last regular character |
| // We advance over runs of passthrough characters and copy them in one go; |
| // this is faster than calling push_back repeatedly. |
| unsigned char minEncode = static_cast<unsigned char>(mode); |
| while (p != str.end()) { |
| char c = *p; |
| unsigned char v = static_cast<unsigned char>(c); |
| unsigned char discriminator = detail::uriEscapeTable[v]; |
| if (LIKELY(discriminator <= minEncode)) { |
| ++p; |
| } else if (mode == UriEscapeMode::QUERY && discriminator == 3) { |
| out.append(&*last, p - last); |
| out.push_back('+'); |
| ++p; |
| last = p; |
| } else { |
| out.append(&*last, p - last); |
| esc[1] = hexValues[v >> 4]; |
| esc[2] = hexValues[v & 0x0f]; |
| out.append(esc, 3); |
| ++p; |
| last = p; |
| } |
| } |
| out.append(&*last, p - last); |
| } |
| |
| template <class String> |
| void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) { |
| out.reserve(out.size() + str.size()); |
| auto p = str.begin(); |
| auto last = p; |
| // We advance over runs of passthrough characters and copy them in one go; |
| // this is faster than calling push_back repeatedly. |
| while (p != str.end()) { |
| char c = *p; |
| switch (c) { |
| case '%': |
| { |
| if (UNLIKELY(std::distance(p, str.end()) < 3)) { |
| throw std::invalid_argument("incomplete percent encode sequence"); |
| } |
| auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])]; |
| auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])]; |
| if (UNLIKELY(h1 == 16 || h2 == 16)) { |
| throw std::invalid_argument("invalid percent encode sequence"); |
| } |
| out.append(&*last, p - last); |
| out.push_back((h1 << 4) | h2); |
| p += 3; |
| last = p; |
| break; |
| } |
| case '+': |
| if (mode == UriEscapeMode::QUERY) { |
| out.append(&*last, p - last); |
| out.push_back(' '); |
| ++p; |
| last = p; |
| break; |
| } |
| // else fallthrough |
| default: |
| ++p; |
| break; |
| } |
| } |
| out.append(&*last, p - last); |
| } |
| |
| namespace detail { |
| |
| /* |
| * The following functions are type-overloaded helpers for |
| * internalSplit(). |
| */ |
| inline size_t delimSize(char) { return 1; } |
| inline size_t delimSize(StringPiece s) { return s.size(); } |
| inline bool atDelim(const char* s, char c) { |
| return *s == c; |
| } |
| inline bool atDelim(const char* s, StringPiece sp) { |
| return !std::memcmp(s, sp.start(), sp.size()); |
| } |
| |
| // These are used to short-circuit internalSplit() in the case of |
| // 1-character strings. |
| inline char delimFront(char c) { |
| // This one exists only for compile-time; it should never be called. |
| std::abort(); |
| return c; |
| } |
| inline char delimFront(StringPiece s) { |
| assert(!s.empty() && s.start() != nullptr); |
| return *s.start(); |
| } |
| |
| /* |
| * These output conversion templates allow us to support multiple |
| * output string types, even when we are using an arbitrary |
| * OutputIterator. |
| */ |
| template<class OutStringT> struct OutputConverter {}; |
| |
| template<> struct OutputConverter<std::string> { |
| std::string operator()(StringPiece sp) const { |
| return sp.toString(); |
| } |
| }; |
| |
| template<> struct OutputConverter<fbstring> { |
| fbstring operator()(StringPiece sp) const { |
| return sp.toFbstring(); |
| } |
| }; |
| |
| template<> struct OutputConverter<StringPiece> { |
| StringPiece operator()(StringPiece sp) const { return sp; } |
| }; |
| |
| /* |
| * Shared implementation for all the split() overloads. |
| * |
| * This uses some external helpers that are overloaded to let this |
| * algorithm be more performant if the deliminator is a single |
| * character instead of a whole string. |
| * |
| * @param ignoreEmpty iff true, don't copy empty segments to output |
| */ |
| template<class OutStringT, class DelimT, class OutputIterator> |
| void internalSplit(DelimT delim, StringPiece sp, OutputIterator out, |
| bool ignoreEmpty) { |
| assert(sp.empty() || sp.start() != nullptr); |
| |
| const char* s = sp.start(); |
| const size_t strSize = sp.size(); |
| const size_t dSize = delimSize(delim); |
| |
| OutputConverter<OutStringT> conv; |
| |
| if (dSize > strSize || dSize == 0) { |
| if (!ignoreEmpty || strSize > 0) { |
| *out++ = conv(sp); |
| } |
| return; |
| } |
| if (boost::is_same<DelimT,StringPiece>::value && dSize == 1) { |
| // Call the char version because it is significantly faster. |
| return internalSplit<OutStringT>(delimFront(delim), sp, out, |
| ignoreEmpty); |
| } |
| |
| size_t tokenStartPos = 0; |
| size_t tokenSize = 0; |
| for (size_t i = 0; i <= strSize - dSize; ++i) { |
| if (atDelim(&s[i], delim)) { |
| if (!ignoreEmpty || tokenSize > 0) { |
| *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize)); |
| } |
| |
| tokenStartPos = i + dSize; |
| tokenSize = 0; |
| i += dSize - 1; |
| } else { |
| ++tokenSize; |
| } |
| } |
| tokenSize = strSize - tokenStartPos; |
| if (!ignoreEmpty || tokenSize > 0) { |
| *out++ = conv(StringPiece(&s[tokenStartPos], tokenSize)); |
| } |
| } |
| |
| template<class String> StringPiece prepareDelim(const String& s) { |
| return StringPiece(s); |
| } |
| inline char prepareDelim(char c) { return c; } |
| |
| template <class Dst> |
| struct convertTo { |
| template <class Src> |
| static Dst from(const Src& src) { return folly::to<Dst>(src); } |
| static Dst from(const Dst& src) { return src; } |
| }; |
| |
| template<bool exact, |
| class Delim, |
| class OutputType> |
| typename std::enable_if<IsSplitTargetType<OutputType>::value, bool>::type |
| splitFixed(const Delim& delimiter, |
| StringPiece input, |
| OutputType& out) { |
| if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) { |
| return false; |
| } |
| out = convertTo<OutputType>::from(input); |
| return true; |
| } |
| |
| template<bool exact, |
| class Delim, |
| class OutputType, |
| class... OutputTypes> |
| typename std::enable_if<IsSplitTargetType<OutputType>::value, bool>::type |
| splitFixed(const Delim& delimiter, |
| StringPiece input, |
| OutputType& outHead, |
| OutputTypes&... outTail) { |
| size_t cut = input.find(delimiter); |
| if (UNLIKELY(cut == std::string::npos)) { |
| return false; |
| } |
| StringPiece head(input.begin(), input.begin() + cut); |
| StringPiece tail(input.begin() + cut + detail::delimSize(delimiter), |
| input.end()); |
| if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) { |
| outHead = convertTo<OutputType>::from(head); |
| return true; |
| } |
| return false; |
| } |
| |
| } |
| |
| ////////////////////////////////////////////////////////////////////// |
| |
| template<class Delim, class String, class OutputType> |
| void split(const Delim& delimiter, |
| const String& input, |
| std::vector<OutputType>& out, |
| bool ignoreEmpty) { |
| detail::internalSplit<OutputType>( |
| detail::prepareDelim(delimiter), |
| StringPiece(input), |
| std::back_inserter(out), |
| ignoreEmpty); |
| } |
| |
| template<class Delim, class String, class OutputType> |
| void split(const Delim& delimiter, |
| const String& input, |
| fbvector<OutputType>& out, |
| bool ignoreEmpty) { |
| detail::internalSplit<OutputType>( |
| detail::prepareDelim(delimiter), |
| StringPiece(input), |
| std::back_inserter(out), |
| ignoreEmpty); |
| } |
| |
| template<class OutputValueType, class Delim, class String, |
| class OutputIterator> |
| void splitTo(const Delim& delimiter, |
| const String& input, |
| OutputIterator out, |
| bool ignoreEmpty) { |
| detail::internalSplit<OutputValueType>( |
| detail::prepareDelim(delimiter), |
| StringPiece(input), |
| out, |
| ignoreEmpty); |
| } |
| |
| template<bool exact, |
| class Delim, |
| class OutputType, |
| class... OutputTypes> |
| typename std::enable_if<IsSplitTargetType<OutputType>::value, bool>::type |
| split(const Delim& delimiter, |
| StringPiece input, |
| OutputType& outHead, |
| OutputTypes&... outTail) { |
| return detail::splitFixed<exact>( |
| detail::prepareDelim(delimiter), |
| input, |
| outHead, |
| outTail...); |
| } |
| |
| namespace detail { |
| |
| /* |
| * If a type can have its string size determined cheaply, we can more |
| * efficiently append it in a loop (see internalJoinAppend). Note that the |
| * struct need not conform to the std::string api completely (ex. does not need |
| * to implement append()). |
| */ |
| template <class T> struct IsSizableString { |
| enum { value = IsSomeString<T>::value |
| || std::is_same<T, StringPiece>::value }; |
| }; |
| |
| template <class Iterator> |
| struct IsSizableStringContainerIterator : |
| IsSizableString<typename std::iterator_traits<Iterator>::value_type> { |
| }; |
| |
| template <class Delim, class Iterator, class String> |
| void internalJoinAppend(Delim delimiter, |
| Iterator begin, |
| Iterator end, |
| String& output) { |
| assert(begin != end); |
| if (std::is_same<Delim, StringPiece>::value && |
| delimSize(delimiter) == 1) { |
| internalJoinAppend(delimFront(delimiter), begin, end, output); |
| return; |
| } |
| toAppend(*begin, &output); |
| while (++begin != end) { |
| toAppend(delimiter, *begin, &output); |
| } |
| } |
| |
| template <class Delim, class Iterator, class String> |
| typename std::enable_if<IsSizableStringContainerIterator<Iterator>::value>::type |
| internalJoin(Delim delimiter, |
| Iterator begin, |
| Iterator end, |
| String& output) { |
| output.clear(); |
| if (begin == end) { |
| return; |
| } |
| const size_t dsize = delimSize(delimiter); |
| Iterator it = begin; |
| size_t size = it->size(); |
| while (++it != end) { |
| size += dsize + it->size(); |
| } |
| output.reserve(size); |
| internalJoinAppend(delimiter, begin, end, output); |
| } |
| |
| template <class Delim, class Iterator, class String> |
| typename |
| std::enable_if<!IsSizableStringContainerIterator<Iterator>::value>::type |
| internalJoin(Delim delimiter, |
| Iterator begin, |
| Iterator end, |
| String& output) { |
| output.clear(); |
| if (begin == end) { |
| return; |
| } |
| internalJoinAppend(delimiter, begin, end, output); |
| } |
| |
| } // namespace detail |
| |
| template <class Delim, class Iterator, class String> |
| void join(const Delim& delimiter, |
| Iterator begin, |
| Iterator end, |
| String& output) { |
| detail::internalJoin( |
| detail::prepareDelim(delimiter), |
| begin, |
| end, |
| output); |
| } |
| |
| template <class String1, class String2> |
| void backslashify(const String1& input, String2& output, bool hex_style) { |
| static const char hexValues[] = "0123456789abcdef"; |
| output.clear(); |
| output.reserve(3 * input.size()); |
| for (unsigned char c : input) { |
| // less than space or greater than '~' are considered unprintable |
| if (c < 0x20 || c > 0x7e || c == '\\') { |
| bool hex_append = false; |
| output.push_back('\\'); |
| if (hex_style) { |
| hex_append = true; |
| } else { |
| if (c == '\r') output += 'r'; |
| else if (c == '\n') output += 'n'; |
| else if (c == '\t') output += 't'; |
| else if (c == '\a') output += 'a'; |
| else if (c == '\b') output += 'b'; |
| else if (c == '\0') output += '0'; |
| else if (c == '\\') output += '\\'; |
| else { |
| hex_append = true; |
| } |
| } |
| if (hex_append) { |
| output.push_back('x'); |
| output.push_back(hexValues[(c >> 4) & 0xf]); |
| output.push_back(hexValues[c & 0xf]); |
| } |
| } else { |
| output += c; |
| } |
| } |
| } |
| |
| template <class String1, class String2> |
| void humanify(const String1& input, String2& output) { |
| size_t numUnprintable = 0; |
| size_t numPrintablePrefix = 0; |
| for (unsigned char c : input) { |
| if (c < 0x20 || c > 0x7e || c == '\\') { |
| ++numUnprintable; |
| } |
| if (numUnprintable == 0) { |
| ++numPrintablePrefix; |
| } |
| } |
| |
| // hexlify doubles a string's size; backslashify can potentially |
| // explode it by 4x. Now, the printable range of the ascii |
| // "spectrum" is around 95 out of 256 values, so a "random" binary |
| // string should be around 60% unprintable. We use a 50% hueristic |
| // here, so if a string is 60% unprintable, then we just use hex |
| // output. Otherwise we backslash. |
| // |
| // UTF8 is completely ignored; as a result, utf8 characters will |
| // likely be \x escaped (since most common glyphs fit in two bytes). |
| // This is a tradeoff of complexity/speed instead of a convenience |
| // that likely would rarely matter. Moreover, this function is more |
| // about displaying underlying bytes, not about displaying glyphs |
| // from languages. |
| if (numUnprintable == 0) { |
| output = input; |
| } else if (5 * numUnprintable >= 3 * input.size()) { |
| // However! If we have a "meaningful" prefix of printable |
| // characters, say 20% of the string, we backslashify under the |
| // assumption viewing the prefix as ascii is worth blowing the |
| // output size up a bit. |
| if (5 * numPrintablePrefix >= input.size()) { |
| backslashify(input, output); |
| } else { |
| output = "0x"; |
| hexlify(input, output, true /* append output */); |
| } |
| } else { |
| backslashify(input, output); |
| } |
| } |
| |
| template<class InputString, class OutputString> |
| bool hexlify(const InputString& input, OutputString& output, |
| bool append_output) { |
| if (!append_output) output.clear(); |
| |
| static char hexValues[] = "0123456789abcdef"; |
| auto j = output.size(); |
| output.resize(2 * input.size() + output.size()); |
| for (size_t i = 0; i < input.size(); ++i) { |
| int ch = input[i]; |
| output[j++] = hexValues[(ch >> 4) & 0xf]; |
| output[j++] = hexValues[ch & 0xf]; |
| } |
| return true; |
| } |
| |
| template<class InputString, class OutputString> |
| bool unhexlify(const InputString& input, OutputString& output) { |
| if (input.size() % 2 != 0) { |
| return false; |
| } |
| output.resize(input.size() / 2); |
| int j = 0; |
| auto unhex = [](char c) -> int { |
| return c >= '0' && c <= '9' ? c - '0' : |
| c >= 'A' && c <= 'F' ? c - 'A' + 10 : |
| c >= 'a' && c <= 'f' ? c - 'a' + 10 : |
| -1; |
| }; |
| |
| for (size_t i = 0; i < input.size(); i += 2) { |
| int highBits = unhex(input[i]); |
| int lowBits = unhex(input[i + 1]); |
| if (highBits < 0 || lowBits < 0) { |
| return false; |
| } |
| output[j++] = (highBits << 4) + lowBits; |
| } |
| return true; |
| } |
| |
| namespace detail { |
| /** |
| * Hex-dump at most 16 bytes starting at offset from a memory area of size |
| * bytes. Return the number of bytes actually dumped. |
| */ |
| size_t hexDumpLine(const void* ptr, size_t offset, size_t size, |
| std::string& line); |
| } // namespace detail |
| |
| template <class OutIt> |
| void hexDump(const void* ptr, size_t size, OutIt out) { |
| size_t offset = 0; |
| std::string line; |
| while (offset < size) { |
| offset += detail::hexDumpLine(ptr, offset, size, line); |
| *out++ = line; |
| } |
| } |
| |
| } // namespace folly |
| |
| #endif /* FOLLY_STRING_INL_H_ */ |