blob: 231cd4c52dc6f799307e52f62c35131ccb4e88d9 [file] [log] [blame]
/*
* Copyright (C) 2008 Apple Inc. All Rights Reserved.
* Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
* Copyright (C) 2010 Google, Inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "third_party/blink/renderer/core/html/parser/html_entity_parser.h"
#include "base/notreached.h"
#include "third_party/blink/renderer/core/html/parser/html_entity_search.h"
#include "third_party/blink/renderer/core/html/parser/html_entity_table.h"
#include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
namespace blink {
// Replacement code units for numeric character references whose value lies in
// the range 0x80-0x9F. AdjustEntity() indexes this table with (value - 0x80),
// so entry N is the substitute for code point 0x80 + N. Some entries (0x0081,
// 0x008D, 0x008F, 0x0090, 0x009D) map to themselves.
// NOTE(review): judging by the name this mirrors the Windows Latin-1
// (windows-1252) extension block -- confirm against the HTML specification.
static const UChar kWindowsLatin1ExtensionArray[32] = {
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
};
// Returns true when |cc| is an ASCII digit or an ASCII letter of either case.
static bool IsAlphaNumeric(UChar cc) {
  if (cc >= '0' && cc <= '9')
    return true;
  if (cc >= 'a' && cc <= 'z')
    return true;
  return cc >= 'A' && cc <= 'Z';
}
// Maps a code point in the range 0x80-0x9F to its entry in
// kWindowsLatin1ExtensionArray; every other value is returned as-is
// (converted to UChar).
static UChar AdjustEntity(UChar32 value) {
  const bool needs_remap = value >= 0x0080 && value <= 0x009F;
  if (needs_remap)
    return kWindowsLatin1ExtensionArray[value - 0x80];
  return value;
}
// Appends the code point |c| to |decoded_entity| after sanitising it:
//  - non-positive values, values above 0x10FFFF and values in the surrogate
//    range 0xD800-0xDFFF are replaced by U+FFFD;
//  - BMP values are passed through AdjustEntity() first;
//  - everything else is appended unchanged.
static void AppendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decoded_entity) {
  // FIXME: A number of specific entity values generate parse errors.
  const bool outside_unicode = c <= 0 || c > 0x10FFFF;
  const bool is_surrogate = c >= 0xD800 && c <= 0xDFFF;
  if (outside_unicode || is_surrogate) {
    decoded_entity.Append(0xFFFD);
  } else if (U_IS_BMP(c)) {
    decoded_entity.Append(AdjustEntity(c));
  } else {
    decoded_entity.Append(c);
  }
}
// Sentinel stored in the numeric accumulator of ConsumeHTMLEntity() once the
// parsed value exceeds UCHAR_MAX_VALUE; further digits are then ignored.
// Because it is negative, AppendLegalEntityFor() turns it into U+FFFD.
static const UChar32 kInvalidUnicode = -1;
// Returns true when |cc| is one of 0-9, a-f or A-F.
static bool IsHexDigit(UChar cc) {
  if (cc >= '0' && cc <= '9')
    return true;
  if (cc >= 'a' && cc <= 'f')
    return true;
  return cc >= 'A' && cc <= 'F';
}
// Converts the ASCII hexadecimal digit |cc| to its numeric value (0-15).
//
// The caller must pass a character for which IsHexDigit() is true. The letter
// ranges are limited to a-f / A-F so that they agree with IsHexDigit(); any
// other input is a programming error and reaches NOTREACHED() instead of
// silently yielding a value of 16 or more.
static UChar AsHexDigit(UChar cc) {
  if (cc >= '0' && cc <= '9')
    return cc - '0';
  if (cc >= 'a' && cc <= 'f')
    return 10 + cc - 'a';
  if (cc >= 'A' && cc <= 'F')
    return 10 + cc - 'A';
  NOTREACHED();
  return 0;
}
// Buffer of the characters taken from the source while trying to recognise an
// entity, kept so that UnconsumeCharacters() can hand them back.
using ConsumedCharacterBuffer = Vector<UChar, 64>;
// Gives the characters in |consumed_characters| back to |source|.
// One or two characters are returned with individual Push() calls; any other
// count (including zero) is returned by prepending the whole buffer as a
// string with PrependType::kUnconsume.
static void UnconsumeCharacters(SegmentedString& source,
                                ConsumedCharacterBuffer& consumed_characters) {
  switch (consumed_characters.size()) {
    case 1:
      source.Push(consumed_characters[0]);
      return;
    case 2:
      // Last character first. NOTE(review): presumably Push() inserts at the
      // front of |source|, so this restores the original order -- confirm.
      source.Push(consumed_characters[1]);
      source.Push(consumed_characters[0]);
      return;
    default:
      source.Prepend(SegmentedString(String(consumed_characters)),
                     SegmentedString::PrependType::kUnconsume);
      return;
  }
}
// Tries to consume a named character reference from the front of |source|.
//
// Characters are fed one at a time to an HTMLEntitySearch for as long as the
// text seen so far is still the prefix of some entity. On success the entity's
// first_value (and second_value, when non-zero) is appended to
// |decoded_entity| and true is returned. On failure every consumed character
// is handed back to |source| and false is returned.
//
// |not_enough_characters| is set to true when |source| ran out before the
// search stopped on its own, and to false otherwise.
// |additional_allowed_character|, when non-zero, enables the extra rejection
// rule for entities that do not end in ';' (see the final check below).
// |cc| is an in/out scratch character: it is overwritten with the character
// most recently examined in |source|.
static bool ConsumeNamedEntity(SegmentedString& source,
DecodedHTMLEntity& decoded_entity,
bool& not_enough_characters,
UChar additional_allowed_character,
UChar& cc) {
ConsumedCharacterBuffer consumed_characters;
HTMLEntitySearch entity_search;
// Advance the search with each character. The character that makes the text
// stop being an entity prefix is not recorded and is left in |source|.
while (!source.IsEmpty()) {
cc = source.CurrentChar();
entity_search.Advance(cc);
if (!entity_search.IsEntityPrefix())
break;
consumed_characters.push_back(cc);
source.AdvanceAndASSERT(cc);
}
not_enough_characters = source.IsEmpty();
if (not_enough_characters) {
// We can't decide on an entity because there might be a longer entity
// that we could match if we had more data.
UnconsumeCharacters(source, consumed_characters);
return false;
}
// No entity was matched at any point: give everything back.
if (!entity_search.MostRecentMatch()) {
UnconsumeCharacters(source, consumed_characters);
return false;
}
if (entity_search.MostRecentMatch()->length !=
entity_search.CurrentLength()) {
// We've consumed too many characters. We need to walk the
// source back to the point at which we had consumed an
// actual entity.
UnconsumeCharacters(source, consumed_characters);
consumed_characters.clear();
const HTMLEntityTableEntry* most_recent = entity_search.MostRecentMatch();
const uint16_t length = most_recent->length;
const LChar* reference = HTMLEntityTable::EntityString(*most_recent);
// Re-consume exactly |length| characters, checking (in DCHECK builds) that
// they spell out the matched entity.
for (uint16_t i = 0; i < length; ++i) {
cc = source.CurrentChar();
DCHECK_EQ(cc, static_cast<UChar>(*reference++));
consumed_characters.push_back(cc);
source.AdvanceAndASSERT(cc);
// |source| was non-empty before the walk-back, so something must remain
// after re-consuming the entity.
DCHECK(!source.IsEmpty());
}
// |cc| now holds the first character after the matched entity.
cc = source.CurrentChar();
}
// Accept the match when the entity ends in ';', or when no additional allowed
// character was supplied, or when the character following the entity is
// neither alphanumeric nor '='.
if (entity_search.MostRecentMatch()->LastCharacter() == ';' ||
!additional_allowed_character || !(IsAlphaNumeric(cc) || cc == '=')) {
decoded_entity.Append(entity_search.MostRecentMatch()->first_value);
if (UChar32 second = entity_search.MostRecentMatch()->second_value)
decoded_entity.Append(second);
return true;
}
// Rejected by the rule above: hand the entity's characters back.
UnconsumeCharacters(source, consumed_characters);
return false;
}
// Tries to consume a character reference from the front of |source| (the
// source is expected to be positioned on the character that follows '&' --
// NOTE(review): the '&' itself is never examined here; confirm with callers).
//
// Handles decimal ("#123"), hexadecimal ("#x1F" / "#X1F") and named
// references. Returns true and fills |decoded_entity| when a reference was
// decoded; returns false otherwise.
//
// |not_enough_characters| must be false on entry. It is set to true when
// |source| was exhausted before a decision could be made; in that case the
// consumed characters are handed back to |source|.
// |additional_allowed_character| must be 0, '"', '\'' or '>'. When non-zero,
// a reference starting with that character is rejected immediately, and it
// also affects named-entity acceptance in ConsumeNamedEntity().
bool ConsumeHTMLEntity(SegmentedString& source,
DecodedHTMLEntity& decoded_entity,
bool& not_enough_characters,
UChar additional_allowed_character) {
DCHECK(!additional_allowed_character || additional_allowed_character == '"' ||
additional_allowed_character == '\'' ||
additional_allowed_character == '>');
DCHECK(!not_enough_characters);
DCHECK(decoded_entity.IsEmpty());
enum EntityState {
kInitial,
kNumber,
kMaybeHexLowerCaseX,
kMaybeHexUpperCaseX,
kHex,
kDecimal,
kNamed
};
EntityState entity_state = kInitial;
// Accumulated value of a numeric reference, or kInvalidUnicode after overflow.
UChar32 result = 0;
ConsumedCharacterBuffer consumed_characters;
// Loop protocol: a state that `break`s out of the switch falls through to the
// loop tail, which records |cc| and advances the source past it. A state that
// uses `continue` switches state and re-examines the same |cc| without
// consuming it.
while (!source.IsEmpty()) {
UChar cc = source.CurrentChar();
switch (entity_state) {
case kInitial: {
// Tab, LF, FF, space, '<' and '&' cannot start a reference.
if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' ||
cc == '<' || cc == '&')
return false;
if (additional_allowed_character && cc == additional_allowed_character)
return false;
if (cc == '#') {
entity_state = kNumber;
break;
}
if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
entity_state = kNamed;
continue;
}
return false;
}
case kNumber: {
if (cc == 'x') {
entity_state = kMaybeHexLowerCaseX;
break;
}
if (cc == 'X') {
entity_state = kMaybeHexUpperCaseX;
break;
}
if (cc >= '0' && cc <= '9') {
entity_state = kDecimal;
continue;
}
// Not a numeric reference: give back the '#' that was already consumed.
source.Push('#');
return false;
}
case kMaybeHexLowerCaseX: {
if (IsHexDigit(cc)) {
entity_state = kHex;
continue;
}
// "#x" was not followed by a hex digit: give both characters back.
source.Push('x');
source.Push('#');
return false;
}
case kMaybeHexUpperCaseX: {
if (IsHexDigit(cc)) {
entity_state = kHex;
continue;
}
// "#X" was not followed by a hex digit: give both characters back.
source.Push('X');
source.Push('#');
return false;
}
case kHex: {
if (IsHexDigit(cc)) {
// Once the value has overflowed, further digits are consumed but ignored.
if (result != kInvalidUnicode)
result = result * 16 + AsHexDigit(cc);
} else if (cc == ';') {
// The terminating ';' is consumed as part of the reference.
source.AdvanceAndASSERT(cc);
AppendLegalEntityFor(result, decoded_entity);
return true;
} else {
// Missing ';': still decode, leaving |cc| in the source.
AppendLegalEntityFor(result, decoded_entity);
return true;
}
break;
}
case kDecimal: {
if (cc >= '0' && cc <= '9') {
// Once the value has overflowed, further digits are consumed but ignored.
if (result != kInvalidUnicode)
result = result * 10 + cc - '0';
} else if (cc == ';') {
// The terminating ';' is consumed as part of the reference.
source.AdvanceAndASSERT(cc);
AppendLegalEntityFor(result, decoded_entity);
return true;
} else {
// Missing ';': still decode, leaving |cc| in the source.
AppendLegalEntityFor(result, decoded_entity);
return true;
}
break;
}
case kNamed: {
// kNamed is only entered from kInitial via `continue`, so nothing has been
// consumed yet; the named-entity helper does its own bookkeeping.
return ConsumeNamedEntity(source, decoded_entity, not_enough_characters,
additional_allowed_character, cc);
}
}
// Loop tail: clamp an out-of-range accumulator to the sentinel, then record
// and consume |cc|.
if (result > UCHAR_MAX_VALUE)
result = kInvalidUnicode;
consumed_characters.push_back(cc);
source.AdvanceAndASSERT(cc);
}
// The source ran dry in the middle of a potential reference: report that more
// input is needed and restore what was consumed.
DCHECK(source.IsEmpty());
not_enough_characters = true;
UnconsumeCharacters(source, consumed_characters);
return false;
}
// Writes the UTF-16 encoding of |value| to |result| and returns the number of
// code units written: 2 (lead + trail surrogate) for a non-BMP code point,
// 1 otherwise. |result| must have room for that many elements.
static size_t AppendUChar32ToUCharArray(UChar32 value, UChar* result) {
  if (!U_IS_BMP(value)) {
    result[0] = U16_LEAD(value);
    result[1] = U16_TRAIL(value);
    return 2;
  }
  UChar character = static_cast<UChar>(value);
  DCHECK_EQ(character, value);
  result[0] = character;
  return 1;
}
// Looks up the named entity |name| (given without the trailing ';', which is
// supplied here) and writes its UTF-16 expansion to |result|.
// Returns the number of UChars written, or 0 when |name| followed by ';' is
// not recognised by HTMLEntitySearch.
size_t DecodeNamedEntityToUCharArray(const char* name, UChar result[4]) {
  HTMLEntitySearch search;
  for (const char* cursor = name; *cursor; ++cursor) {
    search.Advance(*cursor);
    if (!search.IsEntityPrefix())
      return 0;
  }
  search.Advance(';');
  if (!search.IsEntityPrefix())
    return 0;

  size_t written =
      AppendUChar32ToUCharArray(search.MostRecentMatch()->first_value, result);
  // A non-zero second_value means the entity expands to two code points.
  if (search.MostRecentMatch()->second_value) {
    written += AppendUChar32ToUCharArray(
        search.MostRecentMatch()->second_value, result + written);
  }
  return written;
}
} // namespace blink