// blob: 0fb8655ec0ad1454f333280355105414cb07084f [file] [log] [blame]
/*
* Copyright (C) 2008 Apple Inc. All Rights Reserved.
* Copyright (C) 2010 Google, Inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef THIRD_PARTY_BLINK_RENDERER_CORE_HTML_PARSER_HTML_TOKENIZER_H_
#define THIRD_PARTY_BLINK_RENDERER_CORE_HTML_PARSER_HTML_TOKENIZER_H_
#include <memory>
#include "base/macros.h"
#include "base/memory/ptr_util.h"
#include "third_party/blink/renderer/core/core_export.h"
#include "third_party/blink/renderer/core/html/parser/html_parser_options.h"
#include "third_party/blink/renderer/core/html/parser/html_token.h"
#include "third_party/blink/renderer/core/html/parser/input_stream_preprocessor.h"
#include "third_party/blink/renderer/platform/text/segmented_string.h"
namespace blink {
// State machine that turns the characters of a SegmentedString into
// HTMLTokens. The current position in the machine is one of the |State|
// values below; NextToken() drives it.
class CORE_EXPORT HTMLTokenizer {
  USING_FAST_MALLOC(HTMLTokenizer);

 public:
  explicit HTMLTokenizer(const HTMLParserOptions&);
  ~HTMLTokenizer();

  void Reset();

  // Tokenizer states. The enumerator order is part of the interface; do not
  // reorder.
  enum State {
    kDataState,
    kCharacterReferenceInDataState,
    kRCDATAState,
    kCharacterReferenceInRCDATAState,
    kRAWTEXTState,
    kScriptDataState,
    kPLAINTEXTState,
    kTagOpenState,
    kEndTagOpenState,
    kTagNameState,
    kRCDATALessThanSignState,
    kRCDATAEndTagOpenState,
    kRCDATAEndTagNameState,
    kRAWTEXTLessThanSignState,
    kRAWTEXTEndTagOpenState,
    kRAWTEXTEndTagNameState,
    kScriptDataLessThanSignState,
    kScriptDataEndTagOpenState,
    kScriptDataEndTagNameState,
    kScriptDataEscapeStartState,
    kScriptDataEscapeStartDashState,
    kScriptDataEscapedState,
    kScriptDataEscapedDashState,
    kScriptDataEscapedDashDashState,
    kScriptDataEscapedLessThanSignState,
    kScriptDataEscapedEndTagOpenState,
    kScriptDataEscapedEndTagNameState,
    kScriptDataDoubleEscapeStartState,
    kScriptDataDoubleEscapedState,
    kScriptDataDoubleEscapedDashState,
    kScriptDataDoubleEscapedDashDashState,
    kScriptDataDoubleEscapedLessThanSignState,
    kScriptDataDoubleEscapeEndState,
    kBeforeAttributeNameState,
    kAttributeNameState,
    kAfterAttributeNameState,
    kBeforeAttributeValueState,
    kAttributeValueDoubleQuotedState,
    kAttributeValueSingleQuotedState,
    kAttributeValueUnquotedState,
    kCharacterReferenceInAttributeValueState,
    kAfterAttributeValueQuotedState,
    kSelfClosingStartTagState,
    kBogusCommentState,
    // Not part of the HTML5 spec: an internal state used to remember
    // whether the bogus comment token has already been started.
    kContinueBogusCommentState,
    kMarkupDeclarationOpenState,
    kCommentStartState,
    kCommentStartDashState,
    kCommentState,
    kCommentEndDashState,
    kCommentEndState,
    kCommentEndBangState,
    kDOCTYPEState,
    kBeforeDOCTYPENameState,
    kDOCTYPENameState,
    kAfterDOCTYPENameState,
    kAfterDOCTYPEPublicKeywordState,
    kBeforeDOCTYPEPublicIdentifierState,
    kDOCTYPEPublicIdentifierDoubleQuotedState,
    kDOCTYPEPublicIdentifierSingleQuotedState,
    kAfterDOCTYPEPublicIdentifierState,
    kBetweenDOCTYPEPublicAndSystemIdentifiersState,
    kAfterDOCTYPESystemKeywordState,
    kBeforeDOCTYPESystemIdentifierState,
    kDOCTYPESystemIdentifierDoubleQuotedState,
    kDOCTYPESystemIdentifierSingleQuotedState,
    kAfterDOCTYPESystemIdentifierState,
    kBogusDOCTYPEState,
    kCDATASectionState,
    kCDATASectionBracketState,
    kCDATASectionEndState,
  };

  // Returns true when a token has been emitted. When it returns false, the
  // caller has to hand the same (in-progress) token to the next call, unless
  // Reset() is called in between.
  bool NextToken(SegmentedString&, HTMLToken&);

  // Returns a copy of the characters the tokenizer is holding internally.
  // Characters are buffered while the tokenizer looks for the </script>
  // token that ends a script element.
  String BufferedCharacters() const;

  // Number of characters currently buffered, or 0 when the temporary buffer
  // is empty.
  wtf_size_t NumberOfBufferedCharacters() const {
    const auto buffered = temporary_buffer_.size();
    if (!buffered)
      return 0;
    // The "</" characters are not stored in temporary_buffer_; they are
    // effectively buffered in the state machine itself, hence the extra 2.
    return buffered + 2;
  }

  // Moves the tokenizer into the state implied by |tag_name|. This only
  // approximates what the tree builder would do, which is good enough for
  // approximate HTML tokenization; exact tokenization requires the real
  // tree builder.
  //
  // Known inaccuracies of the approximation:
  //
  //  * The first run of character tokens emitted for a <pre> element might
  //    carry an extra leading newline.
  //  * Replacing U+0000 with U+FFFD does not take the tree builder's
  //    insertion mode into account.
  //  * CDATA sections in foreign content are tokenized as bogus comments
  //    rather than as character tokens.
  //
  void UpdateStateFor(const String& tag_name);

  bool ForceNullCharacterReplacement() const {
    return force_null_character_replacement_;
  }
  void SetForceNullCharacterReplacement(bool value) {
    force_null_character_replacement_ = value;
  }

  bool ShouldAllowCDATA() const { return should_allow_cdata_; }
  void SetShouldAllowCDATA(bool value) { should_allow_cdata_ = value; }

  State GetState() const { return state_; }
  void SetState(State state) { state_ = state; }

  // True only when null-character replacement is not forced and the
  // tokenizer is in the data, RCDATA or RAWTEXT state.
  bool ShouldSkipNullCharacters() const {
    if (force_null_character_replacement_)
      return false;
    switch (state_) {
      case kDataState:
      case kRCDATAState:
      case kRAWTEXTState:
        return true;
      default:
        return false;
    }
  }

  // True for the end-tag-open / end-tag-name states of RCDATA, RAWTEXT,
  // script data and escaped script data.
  static bool IsEndTagBufferingState(HTMLTokenizer::State state) {
    return state == kRCDATAEndTagOpenState ||
           state == kRCDATAEndTagNameState ||
           state == kRAWTEXTEndTagOpenState ||
           state == kRAWTEXTEndTagNameState ||
           state == kScriptDataEndTagOpenState ||
           state == kScriptDataEndTagNameState ||
           state == kScriptDataEscapedEndTagOpenState ||
           state == kScriptDataEscapedEndTagNameState;
  }

 private:
  inline bool ProcessEntity(SegmentedString&);
  inline void ParseError();

  // Appends |character| to the current token, turning the token into a
  // character token first if needed. Must not be the end-of-file marker.
  void BufferCharacter(UChar character) {
    DCHECK_NE(character, kEndOfFileMarker);
    token_->EnsureIsCharacterToken();
    token_->AppendToCharacter(character);
  }

  // Same as EmitAndReconsumeIn(), and additionally advances |source| past
  // the current character. Always returns true.
  bool EmitAndResumeIn(SegmentedString& source, State state) {
    EmitAndReconsumeIn(source, state);
    source.AdvanceAndUpdateLineNumber();
    return true;
  }

  // Records the end tag name if the current token is a start tag, then
  // switches to |state| without touching the input. Always returns true.
  bool EmitAndReconsumeIn(SegmentedString&, State state) {
    SaveEndTagNameIfNeeded();
    state_ = state;
    return true;
  }

  // If a character token is pending, returns without changing anything so
  // that token is emitted as is. Otherwise goes back to the data state,
  // advances |source| and turns the token into an end-of-file token. Always
  // returns true.
  bool EmitEndOfFile(SegmentedString& source) {
    if (!HaveBufferedCharacterToken()) {
      state_ = kDataState;
      source.AdvanceAndUpdateLineNumber();
      token_->Clear();
      token_->MakeEndOfFile();
    }
    return true;
  }

  inline bool FlushEmitAndResumeIn(SegmentedString&, State);

  // Returns whether a character token has to be emitted before the buffered
  // end tag can be dealt with.
  inline bool FlushBufferedEndTag(SegmentedString&);
  inline bool TemporaryBufferIs(const String&);

  // Input is sometimes consumed speculatively, before it is known whether
  // the characters form an end tag or belong to RCDATA, etc. The functions
  // below help manage that state.
  inline void AddToPossibleEndTag(LChar cc);

  // Remembers the name of the current token when it is a start tag. The
  // token must already be initialized.
  void SaveEndTagNameIfNeeded() {
    DCHECK_NE(token_->GetType(), HTMLToken::kUninitialized);
    if (token_->GetType() != HTMLToken::kStartTag)
      return;
    appropriate_end_tag_name_ = token_->GetName();
  }
  inline bool IsAppropriateEndTag();

  // True when the current token is a character token.
  bool HaveBufferedCharacterToken() {
    const auto token_type = token_->GetType();
    return token_type == HTMLToken::kCharacter;
  }

  State state_;
  bool force_null_character_replacement_;
  bool should_allow_cdata_;

  // The caller owns the token. Whenever NextToken is not on the stack this
  // pointer might refer to unallocated memory.
  HTMLToken* token_;

  // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
  UChar additional_allowed_character_;

  // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
  InputStreamPreprocessor<HTMLTokenizer> input_stream_preprocessor_;

  Vector<UChar, 32> appropriate_end_tag_name_;

  // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
  Vector<LChar, 32> temporary_buffer_;

  // Occasionally both a character token and an end tag token have to be
  // emitted (e.g., when lexing script). The end tag's name is kept here so
  // it is still known the next time the tokenizer is re-entered.
  Vector<LChar, 32> buffered_end_tag_name_;

  HTMLParserOptions options_;

  DISALLOW_COPY_AND_ASSIGN(HTMLTokenizer);
};
} // namespace blink
#endif