| /* |
| * Copyright (C) 2011 Google Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following disclaimer |
| * in the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Google Inc. nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "third_party/blink/renderer/platform/mhtml/mhtml_parser.h" |
| |
| #include <stddef.h> |
| #include <utility> |
| |
| #include "third_party/blink/renderer/platform/heap/heap.h" |
| #include "third_party/blink/renderer/platform/mhtml/archive_resource.h" |
| #include "third_party/blink/renderer/platform/network/http_parsers.h" |
| #include "third_party/blink/renderer/platform/network/parsed_content_type.h" |
| #include "third_party/blink/renderer/platform/wtf/hash_map.h" |
| #include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h" |
| #include "third_party/blink/renderer/platform/wtf/text/base64.h" |
| #include "third_party/blink/renderer/platform/wtf/text/string_builder.h" |
| #include "third_party/blink/renderer/platform/wtf/text/string_concatenate.h" |
| #include "third_party/blink/renderer/platform/wtf/text/string_hash.h" |
| #include "third_party/blink/renderer/platform/wtf/text/wtf_string.h" |
| #include "third_party/blink/renderer/platform/wtf/vector.h" |
| |
| namespace blink { |
| |
| namespace { |
| |
| void QuotedPrintableDecode(const char* data, |
| size_t data_length, |
| Vector<char>& out) { |
| out.clear(); |
| if (!data_length) |
| return; |
| |
| for (size_t i = 0; i < data_length; ++i) { |
| char current_character = data[i]; |
| if (current_character != '=') { |
| out.push_back(current_character); |
| continue; |
| } |
| // We are dealing with a '=xx' sequence. |
| if (data_length - i < 3) { |
| // Unfinished = sequence, append as is. |
| out.push_back(current_character); |
| continue; |
| } |
| char upper_character = data[++i]; |
| char lower_character = data[++i]; |
| if (upper_character == '\r' && lower_character == '\n') |
| continue; |
| |
| if (!IsASCIIHexDigit(upper_character) || |
| !IsASCIIHexDigit(lower_character)) { |
| // Invalid sequence, = followed by non hex digits, just insert the |
| // characters as is. |
| out.push_back('='); |
| out.push_back(upper_character); |
| out.push_back(lower_character); |
| continue; |
| } |
| out.push_back( |
| static_cast<char>(ToASCIIHexValue(upper_character, lower_character))); |
| } |
| } |
| |
| } // namespace |
| |
| // This class is a limited MIME parser used to parse the MIME headers of MHTML |
| // files. |
| class MIMEHeader final : public GarbageCollected<MIMEHeader> { |
| public: |
| MIMEHeader(); |
| |
| enum class Encoding { |
| kQuotedPrintable, |
| kBase64, |
| kEightBit, |
| kSevenBit, |
| kBinary, |
| kUnknown |
| }; |
| |
| static MIMEHeader* ParseHeader(SharedBufferChunkReader* cr_lf_line_reader); |
| |
| bool IsMultipart() const { |
| return content_type_.StartsWithIgnoringASCIICase("multipart/"); |
| } |
| |
| String ContentType() const { return content_type_; } |
| String Charset() const { return charset_; } |
| Encoding ContentTransferEncoding() const { |
| return content_transfer_encoding_; |
| } |
| String ContentLocation() const { return content_location_; } |
| String ContentID() const { return content_id_; } |
| base::Time Date() const { return date_; } |
| |
| // Multi-part type and boundaries are only valid for multipart MIME headers. |
| String MultiPartType() const { return multipart_type_; } |
| String EndOfPartBoundary() const { return end_of_part_boundary_; } |
| String EndOfDocumentBoundary() const { return end_of_document_boundary_; } |
| |
| void Trace(Visitor* visitor) const {} |
| |
| private: |
| static Encoding ParseContentTransferEncoding(const String&); |
| |
| String content_type_; |
| String charset_; |
| Encoding content_transfer_encoding_; |
| String content_location_; |
| String content_id_; |
| base::Time date_; |
| String multipart_type_; |
| String end_of_part_boundary_; |
| String end_of_document_boundary_; |
| }; |
| |
| typedef HashMap<String, String> KeyValueMap; |
| |
| static KeyValueMap RetrieveKeyValuePairs(SharedBufferChunkReader* buffer) { |
| KeyValueMap key_value_pairs; |
| String line; |
| String key; |
| StringBuilder value; |
| while (!(line = buffer->NextChunkAsUTF8StringWithLatin1Fallback()).IsNull()) { |
| if (line.IsEmpty()) |
| break; // Empty line means end of key/value section. |
| // RFC822 continuation: A line that starts with LWSP is a continuation of |
| // the prior line. |
| if ((line[0] == '\t') || (line[0] == ' ')) { |
| value.Append(line.Substring(1)); |
| continue; |
| } |
| // New key/value, store the previous one if any. |
| if (!key.IsEmpty()) { |
| if (key_value_pairs.find(key) != key_value_pairs.end()) |
| DVLOG(1) << "Key duplicate found in MIME header. Key is '" << key |
| << "', previous value replaced."; |
| key_value_pairs.insert(key, value.ToString().StripWhiteSpace()); |
| key = String(); |
| value.Clear(); |
| } |
| wtf_size_t semi_colon_index = line.find(':'); |
| if (semi_colon_index == kNotFound) { |
| // This is not a key value pair, ignore. |
| continue; |
| } |
| key = |
| line.Substring(0, semi_colon_index).DeprecatedLower().StripWhiteSpace(); |
| value.Append(line.Substring(semi_colon_index + 1)); |
| } |
| // Store the last property if there is one. |
| if (!key.IsEmpty()) |
| key_value_pairs.Set(key, value.ToString().StripWhiteSpace()); |
| return key_value_pairs; |
| } |
| |
| MIMEHeader* MIMEHeader::ParseHeader(SharedBufferChunkReader* buffer) { |
| auto* mime_header = MakeGarbageCollected<MIMEHeader>(); |
| KeyValueMap key_value_pairs = RetrieveKeyValuePairs(buffer); |
| KeyValueMap::iterator mime_parameters_iterator = |
| key_value_pairs.find("content-type"); |
| if (mime_parameters_iterator != key_value_pairs.end()) { |
| ParsedContentType parsed_content_type(mime_parameters_iterator->value, |
| ParsedContentType::Mode::kRelaxed); |
| mime_header->content_type_ = parsed_content_type.MimeType(); |
| if (!mime_header->IsMultipart()) { |
| mime_header->charset_ = parsed_content_type.Charset().StripWhiteSpace(); |
| } else { |
| mime_header->multipart_type_ = |
| parsed_content_type.ParameterValueForName("type"); |
| String boundary = parsed_content_type.ParameterValueForName("boundary"); |
| if (boundary.IsNull()) { |
| DVLOG(1) << "No boundary found in multipart MIME header."; |
| return nullptr; |
| } |
| mime_header->end_of_part_boundary_ = "--" + boundary; |
| mime_header->end_of_document_boundary_ = |
| mime_header->end_of_part_boundary_; |
| mime_header->end_of_document_boundary_ = |
| mime_header->end_of_document_boundary_ + "--"; |
| } |
| } |
| |
| mime_parameters_iterator = key_value_pairs.find("content-transfer-encoding"); |
| if (mime_parameters_iterator != key_value_pairs.end()) |
| mime_header->content_transfer_encoding_ = |
| ParseContentTransferEncoding(mime_parameters_iterator->value); |
| |
| mime_parameters_iterator = key_value_pairs.find("content-location"); |
| if (mime_parameters_iterator != key_value_pairs.end()) |
| mime_header->content_location_ = mime_parameters_iterator->value; |
| |
| // See rfc2557 - section 8.3 - Use of the Content-ID header and CID URLs. |
| mime_parameters_iterator = key_value_pairs.find("content-id"); |
| if (mime_parameters_iterator != key_value_pairs.end()) |
| mime_header->content_id_ = mime_parameters_iterator->value; |
| |
| mime_parameters_iterator = key_value_pairs.find("date"); |
| if (mime_parameters_iterator != key_value_pairs.end()) { |
| base::Time parsed_time; |
| // Behave like //net and parse time-valued headers with a default time zone |
| // of UTC. |
| if (base::Time::FromUTCString( |
| mime_parameters_iterator->value.Utf8().c_str(), &parsed_time)) |
| mime_header->date_ = parsed_time; |
| } |
| |
| return mime_header; |
| } |
| |
| MIMEHeader::Encoding MIMEHeader::ParseContentTransferEncoding( |
| const String& text) { |
| String encoding = text.StripWhiteSpace().LowerASCII(); |
| if (encoding == "base64") |
| return Encoding::kBase64; |
| if (encoding == "quoted-printable") |
| return Encoding::kQuotedPrintable; |
| if (encoding == "8bit") |
| return Encoding::kEightBit; |
| if (encoding == "7bit") |
| return Encoding::kSevenBit; |
| if (encoding == "binary") |
| return Encoding::kBinary; |
| DVLOG(1) << "Unknown encoding '" << text << "' found in MIME header."; |
| return Encoding::kUnknown; |
| } |
| |
| MIMEHeader::MIMEHeader() : content_transfer_encoding_(Encoding::kUnknown) {} |
| |
| static bool SkipLinesUntilBoundaryFound(SharedBufferChunkReader& line_reader, |
| const String& boundary) { |
| String line; |
| while (!(line = line_reader.NextChunkAsUTF8StringWithLatin1Fallback()) |
| .IsNull()) { |
| if (line == boundary) |
| return true; |
| } |
| return false; |
| } |
| |
| MHTMLParser::MHTMLParser(scoped_refptr<const SharedBuffer> data) |
| : line_reader_(std::move(data), "\r\n") {} |
| |
| HeapVector<Member<ArchiveResource>> MHTMLParser::ParseArchive() { |
| MIMEHeader* header = MIMEHeader::ParseHeader(&line_reader_); |
| HeapVector<Member<ArchiveResource>> resources; |
| if (ParseArchiveWithHeader(header, resources)) { |
| creation_date_ = header->Date(); |
| } else { |
| resources.clear(); |
| } |
| return resources; |
| } |
| |
| base::Time MHTMLParser::CreationDate() const { |
| return creation_date_; |
| } |
| |
| bool MHTMLParser::ParseArchiveWithHeader( |
| MIMEHeader* header, |
| HeapVector<Member<ArchiveResource>>& resources) { |
| if (!header) { |
| DVLOG(1) << "Failed to parse MHTML part: no header."; |
| return false; |
| } |
| |
| if (!header->IsMultipart()) { |
| // With IE a page with no resource is not multi-part. |
| bool end_of_archive_reached = false; |
| ArchiveResource* resource = |
| ParseNextPart(*header, String(), String(), end_of_archive_reached); |
| if (!resource) |
| return false; |
| resources.push_back(resource); |
| return true; |
| } |
| |
| // Skip the message content (it's a generic browser specific message). |
| SkipLinesUntilBoundaryFound(line_reader_, header->EndOfPartBoundary()); |
| |
| bool end_of_archive = false; |
| while (!end_of_archive) { |
| MIMEHeader* resource_header = MIMEHeader::ParseHeader(&line_reader_); |
| if (!resource_header) { |
| DVLOG(1) << "Failed to parse MHTML, invalid MIME header."; |
| return false; |
| } |
| if (resource_header->ContentType() == "multipart/alternative") { |
| // Ignore IE nesting which makes little sense (IE seems to nest only some |
| // of the frames). |
| if (!ParseArchiveWithHeader(resource_header, resources)) { |
| DVLOG(1) << "Failed to parse MHTML subframe."; |
| return false; |
| } |
| SkipLinesUntilBoundaryFound(line_reader_, header->EndOfPartBoundary()); |
| continue; |
| } |
| |
| ArchiveResource* resource = |
| ParseNextPart(*resource_header, header->EndOfPartBoundary(), |
| header->EndOfDocumentBoundary(), end_of_archive); |
| if (!resource) { |
| DVLOG(1) << "Failed to parse MHTML part."; |
| return false; |
| } |
| resources.push_back(resource); |
| } |
| return true; |
| } |
| |
| ArchiveResource* MHTMLParser::ParseNextPart( |
| const MIMEHeader& mime_header, |
| const String& end_of_part_boundary, |
| const String& end_of_document_boundary, |
| bool& end_of_archive_reached) { |
| DCHECK_EQ(end_of_part_boundary.IsEmpty(), end_of_document_boundary.IsEmpty()); |
| |
| // Per the spec, the bondary to separate parts should start with CRLF. |
| // |end_of_part_boundary| passed here does not contain CRLF at the beginning. |
| // The parsing logic below takes care of CRLF handling. |
| |
| // If no content transfer encoding is specified, default to binary encoding. |
| MIMEHeader::Encoding content_transfer_encoding = |
| mime_header.ContentTransferEncoding(); |
| if (content_transfer_encoding == MIMEHeader::Encoding::kUnknown) |
| content_transfer_encoding = MIMEHeader::Encoding::kBinary; |
| |
| Vector<char> content; |
| const bool check_boundary = !end_of_part_boundary.IsEmpty(); |
| bool end_of_part_reached = false; |
| if (content_transfer_encoding == MIMEHeader::Encoding::kBinary) { |
| if (!check_boundary) { |
| DVLOG(1) << "Binary contents requires end of part"; |
| return nullptr; |
| } |
| // Due to a bug in MHTMLArchive, CRLF was not added to the beginning of the |
| // boundary that is placed after the part encoded as binary. To handle both |
| // cases that CRLF may or may not be at the beginning of the boundary, we |
| // read the part content till reaching the boundary without CRLF. So the |
| // part content may contain CRLF at the end, which will be stripped off |
| // later. |
| line_reader_.SetSeparator(end_of_part_boundary.Utf8().c_str()); |
| if (!line_reader_.NextChunk(content)) { |
| DVLOG(1) << "Binary contents requires end of part"; |
| return nullptr; |
| } |
| line_reader_.SetSeparator("\r\n"); |
| |
| // Strip the CRLF from the end of the content if present. |
| // Note: it may be the case that CRLF stripped off is really part of the |
| // content, instead of part of the boundary. |
| // 1) If the content denotes text or html data, stripping off CRLF will |
| // normally bring no harm. |
| // 2) Otherwise, the content denotes image or other type of binary data. |
| // Usually it doesn't have CRLF at the end. |
| // In order to support parsing the MHTML archive file produced before the |
| // MHTMLArchive bug was fixed, we need to take a risk of stripping off the |
| // CRLF that indeed belongs to the content. |
| if (content.size() >= 2 && content[content.size() - 2] == '\r' && |
| content[content.size() - 1] == '\n') { |
| content.resize(content.size() - 2); |
| } |
| |
| Vector<char> next_chars; |
| if (line_reader_.Peek(next_chars, 2) != 2) { |
| DVLOG(1) << "Invalid seperator."; |
| return nullptr; |
| } |
| end_of_part_reached = true; |
| DCHECK(next_chars.size() == 2); |
| end_of_archive_reached = (next_chars[0] == '-' && next_chars[1] == '-'); |
| if (!end_of_archive_reached) { |
| String line = line_reader_.NextChunkAsUTF8StringWithLatin1Fallback(); |
| if (!line.IsEmpty()) { |
| DVLOG(1) << "No CRLF at end of binary section."; |
| return nullptr; |
| } |
| } |
| } else { |
| String line; |
| while (!(line = line_reader_.NextChunkAsUTF8StringWithLatin1Fallback()) |
| .IsNull()) { |
| end_of_archive_reached = (line == end_of_document_boundary); |
| if (check_boundary && |
| (line == end_of_part_boundary || end_of_archive_reached)) { |
| end_of_part_reached = true; |
| break; |
| } |
| // Note that we use line.utf8() and not line.ascii() as ascii turns |
| // special characters (such as tab, line-feed...) into '?'. |
| content.Append(line.Utf8().c_str(), line.length()); |
| if (content_transfer_encoding == MIMEHeader::Encoding::kQuotedPrintable) { |
| // The line reader removes the \r\n, but we need them for the content in |
| // this case as the QuotedPrintable decoder expects CR-LF terminated |
| // lines. |
| content.Append("\r\n", 2u); |
| } |
| } |
| } |
| if (!end_of_part_reached && check_boundary) { |
| DVLOG(1) << "No boundary found for MHTML part."; |
| return nullptr; |
| } |
| |
| Vector<char> data; |
| switch (content_transfer_encoding) { |
| case MIMEHeader::Encoding::kBase64: |
| if (!Base64Decode(content.data(), content.size(), data)) { |
| DVLOG(1) << "Invalid base64 content for MHTML part."; |
| return nullptr; |
| } |
| break; |
| case MIMEHeader::Encoding::kQuotedPrintable: |
| QuotedPrintableDecode(content.data(), content.size(), data); |
| break; |
| case MIMEHeader::Encoding::kEightBit: |
| case MIMEHeader::Encoding::kSevenBit: |
| case MIMEHeader::Encoding::kBinary: |
| data.Append(content.data(), content.size()); |
| break; |
| default: |
| DVLOG(1) << "Invalid encoding for MHTML part."; |
| return nullptr; |
| } |
| scoped_refptr<SharedBuffer> content_buffer = SharedBuffer::AdoptVector(data); |
| // FIXME: the URL in the MIME header could be relative, we should resolve it |
| // if it is. The specs mentions 5 ways to resolve a URL: |
| // http://tools.ietf.org/html/rfc2557#section-5 |
| // IE and Firefox (UNMht) seem to generate only absolute URLs. |
| KURL location = KURL(NullURL(), mime_header.ContentLocation()); |
| return MakeGarbageCollected<ArchiveResource>( |
| content_buffer, location, mime_header.ContentID(), |
| AtomicString(mime_header.ContentType()), |
| AtomicString(mime_header.Charset())); |
| } |
| |
| // static |
| KURL MHTMLParser::ConvertContentIDToURI(const String& content_id) { |
| // This function is based primarily on an example from rfc2557 in section |
| // 9.5, but also based on more normative parts of specs like: |
| // - rfc2557 - MHTML - section 8.3 - "Use of the Content-ID header and CID |
| // URLs" |
| // - rfc1738 - URL - section 4 (reserved scheme names; includes "cid") |
| // - rfc2387 - multipart/related - section 3.4 - "Syntax" (cid := msg-id) |
| // - rfc0822 - msg-id = "<" addr-spec ">"; addr-spec = local-part "@" domain |
| |
| if (content_id.length() <= 2) |
| return KURL(); |
| |
| if (!content_id.StartsWith('<') || !content_id.EndsWith('>')) |
| return KURL(); |
| |
| StringBuilder uri_builder; |
| uri_builder.Append("cid:"); |
| uri_builder.Append(content_id, 1, content_id.length() - 2); |
| return KURL(NullURL(), uri_builder.ToString()); |
| } |
| |
| } // namespace blink |