blob: 7ceacdcc3c58492602aad71c439b147aa87e42dc [file] [log] [blame]
/*
* Copyright (C) 2011 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "third_party/blink/renderer/platform/mhtml/mhtml_parser.h"
#include <stddef.h>
#include <utility>
#include "third_party/blink/renderer/platform/heap/heap.h"
#include "third_party/blink/renderer/platform/mhtml/archive_resource.h"
#include "third_party/blink/renderer/platform/network/http_parsers.h"
#include "third_party/blink/renderer/platform/network/parsed_content_type.h"
#include "third_party/blink/renderer/platform/wtf/hash_map.h"
#include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
#include "third_party/blink/renderer/platform/wtf/text/base64.h"
#include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
#include "third_party/blink/renderer/platform/wtf/text/string_concatenate.h"
#include "third_party/blink/renderer/platform/wtf/text/string_hash.h"
#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
#include "third_party/blink/renderer/platform/wtf/vector.h"
namespace blink {
namespace {
void QuotedPrintableDecode(const char* data,
size_t data_length,
Vector<char>& out) {
out.clear();
if (!data_length)
return;
for (size_t i = 0; i < data_length; ++i) {
char current_character = data[i];
if (current_character != '=') {
out.push_back(current_character);
continue;
}
// We are dealing with a '=xx' sequence.
if (data_length - i < 3) {
// Unfinished = sequence, append as is.
out.push_back(current_character);
continue;
}
char upper_character = data[++i];
char lower_character = data[++i];
if (upper_character == '\r' && lower_character == '\n')
continue;
if (!IsASCIIHexDigit(upper_character) ||
!IsASCIIHexDigit(lower_character)) {
// Invalid sequence, = followed by non hex digits, just insert the
// characters as is.
out.push_back('=');
out.push_back(upper_character);
out.push_back(lower_character);
continue;
}
out.push_back(
static_cast<char>(ToASCIIHexValue(upper_character, lower_character)));
}
}
} // namespace
// A limited MIME header parser for MHTML files. An instance holds the header
// fields that ParseHeader() extracts from one header section: content type,
// charset, transfer encoding, location, content id, date and, for multipart
// headers, the multipart type and boundary strings.
class MIMEHeader final : public GarbageCollected<MIMEHeader> {
public:
MIMEHeader();
// Content-Transfer-Encoding values recognized by
// ParseContentTransferEncoding(). kUnknown is the value a freshly constructed
// header holds and the value returned for unrecognized encoding names.
enum class Encoding {
kQuotedPrintable,
kBase64,
kEightBit,
kSevenBit,
kBinary,
kUnknown
};
// Reads one header section from |cr_lf_line_reader| and returns the parsed
// header. Returns nullptr when the content type is multipart but carries no
// "boundary" parameter.
static MIMEHeader* ParseHeader(SharedBufferChunkReader* cr_lf_line_reader);
// True when the content type starts with "multipart/" (ASCII
// case-insensitive).
bool IsMultipart() const {
return content_type_.StartsWithIgnoringASCIICase("multipart/");
}
String ContentType() const { return content_type_; }
// Only populated by ParseHeader() for non-multipart headers.
String Charset() const { return charset_; }
Encoding ContentTransferEncoding() const {
return content_transfer_encoding_;
}
String ContentLocation() const { return content_location_; }
String ContentID() const { return content_id_; }
// Parsed value of the "date" header; stays default-constructed when the
// header is missing or cannot be parsed.
base::Time Date() const { return date_; }
// Multi-part type and boundaries are only valid for multipart MIME headers.
String MultiPartType() const { return multipart_type_; }
// "--" followed by the boundary parameter.
String EndOfPartBoundary() const { return end_of_part_boundary_; }
// EndOfPartBoundary() followed by "--".
String EndOfDocumentBoundary() const { return end_of_document_boundary_; }
// Nothing is traced here: the members below are Strings, an enum and a
// base::Time.
void Trace(Visitor* visitor) const {}
private:
// Maps an encoding name (whitespace-stripped, ASCII-lowercased) to an
// Encoding value.
static Encoding ParseContentTransferEncoding(const String&);
String content_type_;
String charset_;
Encoding content_transfer_encoding_;
String content_location_;
String content_id_;
base::Time date_;
String multipart_type_;
String end_of_part_boundary_;
String end_of_document_boundary_;
};
// Maps a lower-cased, whitespace-stripped header name to its
// whitespace-stripped value.
using KeyValueMap = HashMap<String, String>;

// Reads "Key: value" header lines from |buffer| until an empty line or the
// end of the data, and returns them as a map.
//
// - A line starting with a space or tab continues the value of the preceding
//   header (RFC822 continuation); its first character is dropped.
// - A line without ':' is ignored.
// - When a key appears more than once, the last value wins.
// - Text that has no header to belong to (a continuation line with no pending
//   key, or a line whose key is empty) is discarded rather than being glued
//   onto the value of the next header.
static KeyValueMap RetrieveKeyValuePairs(SharedBufferChunkReader* buffer) {
  KeyValueMap key_value_pairs;
  String line;
  String key;
  StringBuilder value;
  while (!(line = buffer->NextChunkAsUTF8StringWithLatin1Fallback()).IsNull()) {
    if (line.IsEmpty())
      break;  // Empty line means end of key/value section.

    // RFC822 continuation: A line that starts with LWSP is a continuation of
    // the prior line.
    if ((line[0] == '\t') || (line[0] == ' ')) {
      // Without a pending key there is nothing to continue; appending anyway
      // would leak this text into the value of the next header.
      if (!key.IsEmpty())
        value.Append(line.Substring(1));
      continue;
    }

    // New key/value, store the previous one if any.
    if (!key.IsEmpty()) {
      if (key_value_pairs.find(key) != key_value_pairs.end()) {
        DVLOG(1) << "Key duplicate found in MIME header. Key is '" << key
                 << "', previous value replaced.";
      }
      // Set() overwrites an existing entry, which is what the log message
      // above promises and what the final store below does as well.
      key_value_pairs.Set(key, value.ToString().StripWhiteSpace());
      key = String();
      value.Clear();
    }

    wtf_size_t colon_index = line.find(':');
    if (colon_index == kNotFound) {
      // This is not a key value pair, ignore.
      continue;
    }
    key = line.Substring(0, colon_index).DeprecatedLower().StripWhiteSpace();
    if (key.IsEmpty()) {
      // A value with an empty name can never be stored; do not start
      // accumulating it.
      continue;
    }
    value.Append(line.Substring(colon_index + 1));
  }

  // Store the last property if there is one.
  if (!key.IsEmpty())
    key_value_pairs.Set(key, value.ToString().StripWhiteSpace());
  return key_value_pairs;
}
// Parses the header section at the current position of |buffer| into a new
// MIMEHeader. Recognized fields: content-type, content-transfer-encoding,
// content-location, content-id and date. Returns nullptr when the content
// type is multipart but has no "boundary" parameter.
MIMEHeader* MIMEHeader::ParseHeader(SharedBufferChunkReader* buffer) {
  auto* mime_header = MakeGarbageCollected<MIMEHeader>();
  KeyValueMap headers = RetrieveKeyValuePairs(buffer);

  const auto content_type_it = headers.find("content-type");
  if (content_type_it != headers.end()) {
    ParsedContentType parsed_content_type(content_type_it->value,
                                          ParsedContentType::Mode::kRelaxed);
    mime_header->content_type_ = parsed_content_type.MimeType();
    if (mime_header->IsMultipart()) {
      mime_header->multipart_type_ =
          parsed_content_type.ParameterValueForName("type");
      String boundary = parsed_content_type.ParameterValueForName("boundary");
      if (boundary.IsNull()) {
        DVLOG(1) << "No boundary found in multipart MIME header.";
        return nullptr;
      }
      // Parts are delimited by "--<boundary>"; the whole document is closed
      // by "--<boundary>--".
      mime_header->end_of_part_boundary_ = "--" + boundary;
      mime_header->end_of_document_boundary_ =
          mime_header->end_of_part_boundary_ + "--";
    } else {
      mime_header->charset_ = parsed_content_type.Charset().StripWhiteSpace();
    }
  }

  const auto encoding_it = headers.find("content-transfer-encoding");
  if (encoding_it != headers.end()) {
    mime_header->content_transfer_encoding_ =
        ParseContentTransferEncoding(encoding_it->value);
  }

  const auto location_it = headers.find("content-location");
  if (location_it != headers.end())
    mime_header->content_location_ = location_it->value;

  // See rfc2557 - section 8.3 - Use of the Content-ID header and CID URLs.
  const auto content_id_it = headers.find("content-id");
  if (content_id_it != headers.end())
    mime_header->content_id_ = content_id_it->value;

  const auto date_it = headers.find("date");
  if (date_it != headers.end()) {
    // Behave like //net and parse time-valued headers with a default time
    // zone of UTC. The date is left untouched when parsing fails.
    base::Time parsed_time;
    if (base::Time::FromUTCString(date_it->value.Utf8().c_str(),
                                  &parsed_time)) {
      mime_header->date_ = parsed_time;
    }
  }

  return mime_header;
}
// Translates the value of a Content-Transfer-Encoding header into an Encoding.
// Surrounding whitespace and ASCII case are ignored. Names that are not
// recognized are logged and reported as Encoding::kUnknown.
MIMEHeader::Encoding MIMEHeader::ParseContentTransferEncoding(
    const String& text) {
  struct NamedEncoding {
    const char* name;
    Encoding encoding;
  };
  static constexpr NamedEncoding kKnownEncodings[] = {
      {"base64", Encoding::kBase64},
      {"quoted-printable", Encoding::kQuotedPrintable},
      {"8bit", Encoding::kEightBit},
      {"7bit", Encoding::kSevenBit},
      {"binary", Encoding::kBinary},
  };

  const String normalized = text.StripWhiteSpace().LowerASCII();
  for (const NamedEncoding& candidate : kKnownEncodings) {
    if (normalized == candidate.name)
      return candidate.encoding;
  }

  DVLOG(1) << "Unknown encoding '" << text << "' found in MIME header.";
  return Encoding::kUnknown;
}
// A new header starts with an unknown transfer encoding; ParseHeader()
// overwrites it when a content-transfer-encoding field is present.
MIMEHeader::MIMEHeader() : content_transfer_encoding_(Encoding::kUnknown) {}
// Consumes lines from |line_reader| up to and including the first line that is
// exactly equal to |boundary|. Returns true when such a line was found, false
// when the reader ran out of data first.
static bool SkipLinesUntilBoundaryFound(SharedBufferChunkReader& line_reader,
                                        const String& boundary) {
  for (;;) {
    const String current =
        line_reader.NextChunkAsUTF8StringWithLatin1Fallback();
    if (current.IsNull())
      return false;  // No more lines.
    if (current == boundary)
      return true;
  }
}
// Hands |data| to the line reader, which starts out splitting on CRLF.
MHTMLParser::MHTMLParser(scoped_refptr<const SharedBuffer> data)
: line_reader_(std::move(data), "\r\n") {}
// Parses the whole archive and returns its resources. On any parse failure
// the returned vector is empty and the creation date is left unchanged; on
// success the creation date is taken from the top-level header.
HeapVector<Member<ArchiveResource>> MHTMLParser::ParseArchive() {
  MIMEHeader* top_level_header = MIMEHeader::ParseHeader(&line_reader_);
  HeapVector<Member<ArchiveResource>> parsed_resources;

  if (!ParseArchiveWithHeader(top_level_header, parsed_resources)) {
    // Do not expose a partially parsed archive.
    parsed_resources.clear();
    return parsed_resources;
  }

  creation_date_ = top_level_header->Date();
  return parsed_resources;
}
// Returns the date recorded by ParseArchive() from the top-level header after
// a successful parse.
base::Time MHTMLParser::CreationDate() const {
return creation_date_;
}
// Parses the body described by |header| and appends every resource found to
// |resources|. A non-multipart header yields exactly one resource. A
// multipart header yields one resource per part, recursing into nested
// "multipart/alternative" parts. Returns false (leaving whatever was already
// appended in |resources|) when |header| is null or any part fails to parse.
bool MHTMLParser::ParseArchiveWithHeader(
    MIMEHeader* header,
    HeapVector<Member<ArchiveResource>>& resources) {
  if (!header) {
    DVLOG(1) << "Failed to parse MHTML part: no header.";
    return false;
  }

  if (!header->IsMultipart()) {
    // With IE a page with no resource is not multi-part.
    bool ignored_end_of_archive = false;
    ArchiveResource* single_resource =
        ParseNextPart(*header, String(), String(), ignored_end_of_archive);
    if (single_resource)
      resources.push_back(single_resource);
    return single_resource != nullptr;
  }

  const String part_boundary = header->EndOfPartBoundary();
  const String document_boundary = header->EndOfDocumentBoundary();

  // Skip the message content (it's a generic browser specific message).
  SkipLinesUntilBoundaryFound(line_reader_, part_boundary);

  for (bool end_of_archive = false; !end_of_archive;) {
    MIMEHeader* part_header = MIMEHeader::ParseHeader(&line_reader_);
    if (!part_header) {
      DVLOG(1) << "Failed to parse MHTML, invalid MIME header.";
      return false;
    }

    if (part_header->ContentType() == "multipart/alternative") {
      // Ignore IE nesting which makes little sense (IE seems to nest only some
      // of the frames).
      if (!ParseArchiveWithHeader(part_header, resources)) {
        DVLOG(1) << "Failed to parse MHTML subframe.";
        return false;
      }
      // Resume at the next part of the enclosing multipart.
      SkipLinesUntilBoundaryFound(line_reader_, part_boundary);
      continue;
    }

    ArchiveResource* part = ParseNextPart(*part_header, part_boundary,
                                          document_boundary, end_of_archive);
    if (!part) {
      DVLOG(1) << "Failed to parse MHTML part.";
      return false;
    }
    resources.push_back(part);
  }
  return true;
}
// Reads the body of one part from the line reader, decodes it according to the
// transfer encoding in |mime_header| and wraps it in an ArchiveResource.
//
// |end_of_part_boundary| and |end_of_document_boundary| are either both empty
// (the part runs to the end of the data; not allowed for binary content) or
// both set, in which case the part ends at the first boundary.
// |end_of_archive_reached| is set to true when the part was terminated by the
// end-of-document boundary.
//
// Returns nullptr when the expected boundary is missing, the data following a
// binary part is malformed, or base64 decoding fails.
ArchiveResource* MHTMLParser::ParseNextPart(
    const MIMEHeader& mime_header,
    const String& end_of_part_boundary,
    const String& end_of_document_boundary,
    bool& end_of_archive_reached) {
  DCHECK_EQ(end_of_part_boundary.IsEmpty(), end_of_document_boundary.IsEmpty());

  // Per the spec, the boundary to separate parts should start with CRLF.
  // |end_of_part_boundary| passed here does not contain CRLF at the beginning.
  // The parsing logic below takes care of CRLF handling.

  // If no content transfer encoding is specified, default to binary encoding.
  MIMEHeader::Encoding content_transfer_encoding =
      mime_header.ContentTransferEncoding();
  if (content_transfer_encoding == MIMEHeader::Encoding::kUnknown)
    content_transfer_encoding = MIMEHeader::Encoding::kBinary;

  Vector<char> content;
  const bool check_boundary = !end_of_part_boundary.IsEmpty();
  bool end_of_part_reached = false;
  if (content_transfer_encoding == MIMEHeader::Encoding::kBinary) {
    if (!check_boundary) {
      DVLOG(1) << "Binary contents requires end of part";
      return nullptr;
    }
    // Due to a bug in MHTMLArchive, CRLF was not added to the beginning of the
    // boundary that is placed after the part encoded as binary. To handle both
    // cases that CRLF may or may not be at the beginning of the boundary, we
    // read the part content till reaching the boundary without CRLF. So the
    // part content may contain CRLF at the end, which will be stripped off
    // later.
    line_reader_.SetSeparator(end_of_part_boundary.Utf8().c_str());
    if (!line_reader_.NextChunk(content)) {
      DVLOG(1) << "Binary contents requires end of part";
      return nullptr;
    }
    line_reader_.SetSeparator("\r\n");

    // Strip the CRLF from the end of the content if present.
    // Note: it may be the case that CRLF stripped off is really part of the
    // content, instead of part of the boundary.
    // 1) If the content denotes text or html data, stripping off CRLF will
    //    normally bring no harm.
    // 2) Otherwise, the content denotes image or other type of binary data.
    //    Usually it doesn't have CRLF at the end.
    // In order to support parsing the MHTML archive file produced before the
    // MHTMLArchive bug was fixed, we need to take a risk of stripping off the
    // CRLF that indeed belongs to the content.
    if (content.size() >= 2 && content[content.size() - 2] == '\r' &&
        content[content.size() - 1] == '\n') {
      content.resize(content.size() - 2);
    }

    // What follows the boundary tells the two boundary kinds apart: "--" marks
    // the end of the document, anything else must be the rest of an
    // end-of-part boundary line, i.e. an immediate CRLF.
    Vector<char> next_chars;
    if (line_reader_.Peek(next_chars, 2) != 2) {
      DVLOG(1) << "Invalid seperator.";
      return nullptr;
    }
    end_of_part_reached = true;
    DCHECK(next_chars.size() == 2);
    end_of_archive_reached = (next_chars[0] == '-' && next_chars[1] == '-');
    if (!end_of_archive_reached) {
      String line = line_reader_.NextChunkAsUTF8StringWithLatin1Fallback();
      if (!line.IsEmpty()) {
        DVLOG(1) << "No CRLF at end of binary section.";
        return nullptr;
      }
    }
  } else {
    String line;
    while (!(line = line_reader_.NextChunkAsUTF8StringWithLatin1Fallback())
                .IsNull()) {
      end_of_archive_reached = (line == end_of_document_boundary);
      if (check_boundary &&
          (line == end_of_part_boundary || end_of_archive_reached)) {
        end_of_part_reached = true;
        break;
      }
      // Note that we use line.Utf8() and not an ASCII conversion, as the
      // latter turns special characters (such as tab, line-feed...) into '?'.
      //
      // The number of bytes to copy must come from the UTF-8 buffer itself:
      // line.length() counts the String's code units, which is smaller than
      // the UTF-8 byte count as soon as the line contains a non-ASCII
      // character, and would truncate the line.
      const auto utf8_line = line.Utf8();
      content.Append(utf8_line.c_str(),
                     static_cast<wtf_size_t>(utf8_line.length()));
      if (content_transfer_encoding == MIMEHeader::Encoding::kQuotedPrintable) {
        // The line reader removes the \r\n, but we need them for the content in
        // this case as the QuotedPrintable decoder expects CR-LF terminated
        // lines.
        content.Append("\r\n", 2u);
      }
    }
  }
  if (!end_of_part_reached && check_boundary) {
    DVLOG(1) << "No boundary found for MHTML part.";
    return nullptr;
  }

  Vector<char> data;
  switch (content_transfer_encoding) {
    case MIMEHeader::Encoding::kBase64:
      if (!Base64Decode(content.data(), content.size(), data)) {
        DVLOG(1) << "Invalid base64 content for MHTML part.";
        return nullptr;
      }
      break;
    case MIMEHeader::Encoding::kQuotedPrintable:
      QuotedPrintableDecode(content.data(), content.size(), data);
      break;
    case MIMEHeader::Encoding::kEightBit:
    case MIMEHeader::Encoding::kSevenBit:
    case MIMEHeader::Encoding::kBinary:
      data.Append(content.data(), content.size());
      break;
    default:
      DVLOG(1) << "Invalid encoding for MHTML part.";
      return nullptr;
  }
  scoped_refptr<SharedBuffer> content_buffer = SharedBuffer::AdoptVector(data);
  // FIXME: the URL in the MIME header could be relative, we should resolve it
  // if it is. The specs mentions 5 ways to resolve a URL:
  // http://tools.ietf.org/html/rfc2557#section-5
  // IE and Firefox (UNMht) seem to generate only absolute URLs.
  KURL location = KURL(NullURL(), mime_header.ContentLocation());
  return MakeGarbageCollected<ArchiveResource>(
      content_buffer, location, mime_header.ContentID(),
      AtomicString(mime_header.ContentType()),
      AtomicString(mime_header.Charset()));
}
// static
// Turns a Content-ID of the form "<id>" into the URL "cid:id". Returns an
// empty KURL when |content_id| is not wrapped in angle brackets or has nothing
// between them.
KURL MHTMLParser::ConvertContentIDToURI(const String& content_id) {
  // This function is based primarily on an example from rfc2557 in section
  // 9.5, but also based on more normative parts of specs like:
  // - rfc2557 - MHTML - section 8.3 - "Use of the Content-ID header and CID
  //                                    URLs"
  // - rfc1738 - URL - section 4 (reserved scheme names;  includes "cid")
  // - rfc2387 - multipart/related - section 3.4 - "Syntax" (cid := msg-id)
  // - rfc0822 - msg-id = "<" addr-spec ">"; addr-spec = local-part "@" domain
  const wtf_size_t id_length = content_id.length();
  const bool is_bracketed_id = id_length > 2 && content_id.StartsWith('<') &&
                               content_id.EndsWith('>');
  if (!is_bracketed_id)
    return KURL();

  // Drop the surrounding '<' and '>' and prefix the remainder with the scheme.
  StringBuilder cid_uri;
  cid_uri.Append("cid:");
  cid_uri.Append(content_id.Substring(1, id_length - 2));
  return KURL(NullURL(), cid_uri.ToString());
}
} // namespace blink