blob: ea93c3237e5c0902da35dac1aea4d2dec8459801 [file] [log] [blame]
/*
* Copyright (C) 2008, 2009 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "third_party/blink/renderer/platform/text/text_encoding_detector.h"
#include "build/build_config.h"
#include "third_party/blink/renderer/platform/weborigin/kurl.h"
#include "third_party/blink/renderer/platform/wtf/text/text_encoding.h"
#include "third_party/ced/src/compact_enc_det/compact_enc_det.h"
// third_party/ced/src/util/encodings/encodings.h, which is pulled in by the
// compact_enc_det.h include above, undefs UNICODE because that name is used
// internally in ced. If anything later in the same translation unit touches
// Windows or Windows headers, those would then pick the ASCII versions of the
// APIs, which we do not want. This matters for jumbo builds, so UNICODE is
// defined again here, on Windows only.
#if defined(OS_WIN)
#define UNICODE 1
#endif // OS_WIN
namespace blink {
bool DetectTextEncoding(const char* data,
uint32_t length,
const char* hint_encoding_name,
const KURL& hint_url,
const char* hint_user_language,
WTF::TextEncoding* detected_encoding) {
*detected_encoding = WTF::TextEncoding();
// In general, do not use language hint. This helps get more
// deterministic encoding detection results across devices. Note that local
// file resources can still benefit from the hint.
Language language = UNKNOWN_LANGUAGE;
if (hint_url.Protocol() == "file")
LanguageFromCode(hint_user_language, &language);
int consumed_bytes;
bool is_reliable;
Encoding encoding = CompactEncDet::DetectEncoding(
data, length, hint_url.GetString().Ascii().c_str(), nullptr, nullptr,
EncodingNameAliasToEncoding(hint_encoding_name), language,
CompactEncDet::WEB_CORPUS,
false, // Include 7-bit encodings to detect ISO-2022-JP
&consumed_bytes, &is_reliable);
if (encoding == UNKNOWN_ENCODING)
*detected_encoding = WTF::UnknownEncoding();
else
*detected_encoding = WTF::TextEncoding(MimeEncodingName(encoding));
// Should return false if the detected encoding is UTF8. This helps prevent
// modern web sites from neglecting proper encoding labelling and simply
// relying on browser-side encoding detection. Encoding detection is supposed
// to work for web sites with legacy encoding only (so this doesn't have to
// be applied to local file resources).
// Detection failure leads |TextResourceDecoder| to use its default encoding
// determined from system locale or TLD.
return !(encoding == UNKNOWN_ENCODING ||
(hint_url.Protocol() != "file" && encoding == UTF8));
}
} // namespace blink