| /* |
| * Copyright (C) 2008 Apple Inc. All Rights Reserved. |
| * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ |
| * Copyright (C) 2010 Google, Inc. All Rights Reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "third_party/blink/renderer/core/html/parser/html_entity_parser.h" |
| |
| #include "base/notreached.h" |
| #include "third_party/blink/renderer/core/html/parser/html_entity_search.h" |
| #include "third_party/blink/renderer/core/html/parser/html_entity_table.h" |
| #include "third_party/blink/renderer/platform/wtf/text/string_builder.h" |
| |
| namespace blink { |
| |
| static const UChar kWindowsLatin1ExtensionArray[32] = { |
| 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 |
| 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F |
| 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 |
| 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F |
| }; |
| |
| static bool IsAlphaNumeric(UChar cc) { |
| return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || |
| (cc >= 'A' && cc <= 'Z'); |
| } |
| |
| static UChar AdjustEntity(UChar32 value) { |
| if ((value & ~0x1F) != 0x0080) |
| return value; |
| return kWindowsLatin1ExtensionArray[value - 0x80]; |
| } |
| |
| static void AppendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decoded_entity) { |
| // FIXME: A number of specific entity values generate parse errors. |
| if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) { |
| decoded_entity.Append(0xFFFD); |
| return; |
| } |
| if (U_IS_BMP(c)) { |
| decoded_entity.Append(AdjustEntity(c)); |
| return; |
| } |
| decoded_entity.Append(c); |
| } |
| |
| static const UChar32 kInvalidUnicode = -1; |
| |
| static bool IsHexDigit(UChar cc) { |
| return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || |
| (cc >= 'A' && cc <= 'F'); |
| } |
| |
| static UChar AsHexDigit(UChar cc) { |
| if (cc >= '0' && cc <= '9') |
| return cc - '0'; |
| if (cc >= 'a' && cc <= 'z') |
| return 10 + cc - 'a'; |
| if (cc >= 'A' && cc <= 'Z') |
| return 10 + cc - 'A'; |
| NOTREACHED(); |
| return 0; |
| } |
| |
| typedef Vector<UChar, 64> ConsumedCharacterBuffer; |
| |
| static void UnconsumeCharacters(SegmentedString& source, |
| ConsumedCharacterBuffer& consumed_characters) { |
| if (consumed_characters.size() == 1) |
| source.Push(consumed_characters[0]); |
| else if (consumed_characters.size() == 2) { |
| source.Push(consumed_characters[1]); |
| source.Push(consumed_characters[0]); |
| } else |
| source.Prepend(SegmentedString(String(consumed_characters)), |
| SegmentedString::PrependType::kUnconsume); |
| } |
| |
| static bool ConsumeNamedEntity(SegmentedString& source, |
| DecodedHTMLEntity& decoded_entity, |
| bool& not_enough_characters, |
| UChar additional_allowed_character, |
| UChar& cc) { |
| ConsumedCharacterBuffer consumed_characters; |
| HTMLEntitySearch entity_search; |
| while (!source.IsEmpty()) { |
| cc = source.CurrentChar(); |
| entity_search.Advance(cc); |
| if (!entity_search.IsEntityPrefix()) |
| break; |
| consumed_characters.push_back(cc); |
| source.AdvanceAndASSERT(cc); |
| } |
| not_enough_characters = source.IsEmpty(); |
| if (not_enough_characters) { |
| // We can't decide on an entity because there might be a longer entity |
| // that we could match if we had more data. |
| UnconsumeCharacters(source, consumed_characters); |
| return false; |
| } |
| if (!entity_search.MostRecentMatch()) { |
| UnconsumeCharacters(source, consumed_characters); |
| return false; |
| } |
| if (entity_search.MostRecentMatch()->length != |
| entity_search.CurrentLength()) { |
| // We've consumed too many characters. We need to walk the |
| // source back to the point at which we had consumed an |
| // actual entity. |
| UnconsumeCharacters(source, consumed_characters); |
| consumed_characters.clear(); |
| const HTMLEntityTableEntry* most_recent = entity_search.MostRecentMatch(); |
| const uint16_t length = most_recent->length; |
| const LChar* reference = HTMLEntityTable::EntityString(*most_recent); |
| for (uint16_t i = 0; i < length; ++i) { |
| cc = source.CurrentChar(); |
| DCHECK_EQ(cc, static_cast<UChar>(*reference++)); |
| consumed_characters.push_back(cc); |
| source.AdvanceAndASSERT(cc); |
| DCHECK(!source.IsEmpty()); |
| } |
| cc = source.CurrentChar(); |
| } |
| if (entity_search.MostRecentMatch()->LastCharacter() == ';' || |
| !additional_allowed_character || !(IsAlphaNumeric(cc) || cc == '=')) { |
| decoded_entity.Append(entity_search.MostRecentMatch()->first_value); |
| if (UChar32 second = entity_search.MostRecentMatch()->second_value) |
| decoded_entity.Append(second); |
| return true; |
| } |
| UnconsumeCharacters(source, consumed_characters); |
| return false; |
| } |
| |
| bool ConsumeHTMLEntity(SegmentedString& source, |
| DecodedHTMLEntity& decoded_entity, |
| bool& not_enough_characters, |
| UChar additional_allowed_character) { |
| DCHECK(!additional_allowed_character || additional_allowed_character == '"' || |
| additional_allowed_character == '\'' || |
| additional_allowed_character == '>'); |
| DCHECK(!not_enough_characters); |
| DCHECK(decoded_entity.IsEmpty()); |
| |
| enum EntityState { |
| kInitial, |
| kNumber, |
| kMaybeHexLowerCaseX, |
| kMaybeHexUpperCaseX, |
| kHex, |
| kDecimal, |
| kNamed |
| }; |
| EntityState entity_state = kInitial; |
| UChar32 result = 0; |
| ConsumedCharacterBuffer consumed_characters; |
| |
| while (!source.IsEmpty()) { |
| UChar cc = source.CurrentChar(); |
| switch (entity_state) { |
| case kInitial: { |
| if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || |
| cc == '<' || cc == '&') |
| return false; |
| if (additional_allowed_character && cc == additional_allowed_character) |
| return false; |
| if (cc == '#') { |
| entity_state = kNumber; |
| break; |
| } |
| if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { |
| entity_state = kNamed; |
| continue; |
| } |
| return false; |
| } |
| case kNumber: { |
| if (cc == 'x') { |
| entity_state = kMaybeHexLowerCaseX; |
| break; |
| } |
| if (cc == 'X') { |
| entity_state = kMaybeHexUpperCaseX; |
| break; |
| } |
| if (cc >= '0' && cc <= '9') { |
| entity_state = kDecimal; |
| continue; |
| } |
| source.Push('#'); |
| return false; |
| } |
| case kMaybeHexLowerCaseX: { |
| if (IsHexDigit(cc)) { |
| entity_state = kHex; |
| continue; |
| } |
| source.Push('x'); |
| source.Push('#'); |
| return false; |
| } |
| case kMaybeHexUpperCaseX: { |
| if (IsHexDigit(cc)) { |
| entity_state = kHex; |
| continue; |
| } |
| source.Push('X'); |
| source.Push('#'); |
| return false; |
| } |
| case kHex: { |
| if (IsHexDigit(cc)) { |
| if (result != kInvalidUnicode) |
| result = result * 16 + AsHexDigit(cc); |
| } else if (cc == ';') { |
| source.AdvanceAndASSERT(cc); |
| AppendLegalEntityFor(result, decoded_entity); |
| return true; |
| } else { |
| AppendLegalEntityFor(result, decoded_entity); |
| return true; |
| } |
| break; |
| } |
| case kDecimal: { |
| if (cc >= '0' && cc <= '9') { |
| if (result != kInvalidUnicode) |
| result = result * 10 + cc - '0'; |
| } else if (cc == ';') { |
| source.AdvanceAndASSERT(cc); |
| AppendLegalEntityFor(result, decoded_entity); |
| return true; |
| } else { |
| AppendLegalEntityFor(result, decoded_entity); |
| return true; |
| } |
| break; |
| } |
| case kNamed: { |
| return ConsumeNamedEntity(source, decoded_entity, not_enough_characters, |
| additional_allowed_character, cc); |
| } |
| } |
| |
| if (result > UCHAR_MAX_VALUE) |
| result = kInvalidUnicode; |
| |
| consumed_characters.push_back(cc); |
| source.AdvanceAndASSERT(cc); |
| } |
| DCHECK(source.IsEmpty()); |
| not_enough_characters = true; |
| UnconsumeCharacters(source, consumed_characters); |
| return false; |
| } |
| |
| static size_t AppendUChar32ToUCharArray(UChar32 value, UChar* result) { |
| if (U_IS_BMP(value)) { |
| UChar character = static_cast<UChar>(value); |
| DCHECK_EQ(character, value); |
| result[0] = character; |
| return 1; |
| } |
| |
| result[0] = U16_LEAD(value); |
| result[1] = U16_TRAIL(value); |
| return 2; |
| } |
| |
| size_t DecodeNamedEntityToUCharArray(const char* name, UChar result[4]) { |
| HTMLEntitySearch search; |
| while (*name) { |
| search.Advance(*name++); |
| if (!search.IsEntityPrefix()) |
| return 0; |
| } |
| search.Advance(';'); |
| if (!search.IsEntityPrefix()) |
| return 0; |
| |
| size_t number_of_code_points = |
| AppendUChar32ToUCharArray(search.MostRecentMatch()->first_value, result); |
| if (!search.MostRecentMatch()->second_value) |
| return number_of_code_points; |
| return number_of_code_points + |
| AppendUChar32ToUCharArray(search.MostRecentMatch()->second_value, |
| result + number_of_code_points); |
| } |
| |
| } // namespace blink |