| // link_check implementation -----------------------------------------------// |
| |
| // Copyright Beman Dawes 2002. |
| // |
| // Distributed under the Boost Software License, Version 1.0. |
| // (See accompanying file LICENSE_1_0.txt or copy at |
| // http://www.boost.org/LICENSE_1_0.txt) |
| |
| #include "link_check.hpp" |
| #include "boost/regex.hpp" |
| #include "boost/filesystem/operations.hpp" |
| #include <boost/algorithm/string/case_conv.hpp> |
| #include <cstdlib> |
| #include <set> |
| |
| // #include <iostream> |
| |
| namespace fs = boost::filesystem; |
| |
| namespace |
| { |
| boost::regex html_bookmark_regex( |
| "<([^\\s<>]*)\\s*[^<>]*\\s+(NAME|ID)\\s*=\\s*(['\"])(.*?)\\3" |
| "|<!--.*?-->", |
| boost::regbase::normal | boost::regbase::icase); |
| boost::regex html_url_regex( |
| "<([^\\s<>]*)\\s*[^<>]*\\s+(?:HREF|SRC)" // HREF or SRC |
| "\\s*=\\s*(['\"])\\s*(.*?)\\s*\\2" |
| "|<!--.*?-->", |
| boost::regbase::normal | boost::regbase::icase); |
| boost::regex css_url_regex( |
| "(\\@import\\s*[\"']|url\\s*\\(\\s*[\"']?)([^\"')]*)" |
| "|/\\*.*?\\*/", |
| boost::regbase::normal | boost::regbase::icase); |
| |
| // Regular expression for parsing URLS from: |
| // http://tools.ietf.org/html/rfc3986#appendix-B |
| boost::regex url_decompose_regex( |
| "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?$", |
| boost::regbase::normal); |
| |
| typedef std::set<std::string> bookmark_set; |
| bookmark_set bookmarks; |
| bookmark_set bookmarks_lowercase; // duplicate check needs case insensitive |
| |
// Decode HTML-escaped ampersands: every "&amp;" entity in url_path becomes a
// single '&'.  An ampersand that does not begin "&amp;" is copied through
// unchanged, and no other character entity is recognised.  This function has
// no error case; the result is empty only when url_path itself is empty.
std::string decode_ampersands(std::string const& url_path) {
  static const char entity[] = "&amp;";
  const std::string::size_type entity_length = sizeof(entity) - 1;

  std::string result;
  result.reserve(url_path.length());

  std::string::size_type pos = 0, next;
  while((next = url_path.find('&', pos)) != std::string::npos) {
    // Copy the text in front of the ampersand, then the ampersand itself.
    result.append(url_path, pos, next - pos);
    result += '&';

    // Step over the whole entity when it is present, otherwise over the
    // lone '&'.  compare() clamps at the end of the string, so an '&' near
    // the end can never move pos beyond length().
    pos = next
      + (url_path.compare(next, entity_length, entity) == 0 ? entity_length : 1);
  }

  // Copy whatever follows the last ampersand.
  result.append(url_path, pos, std::string::npos);

  return result;
}
| |
// Numeric value of one hexadecimal digit, or -1 when c is not a hex digit.
int hex_digit_value(char c) {
  if(c >= '0' && c <= '9') return c - '0';
  if(c >= 'a' && c <= 'f') return c - 'a' + 10;
  if(c >= 'A' && c <= 'F') return c - 'A' + 10;
  return -1;
}

// Decode percent encoded characters: each "%XX" becomes the single character
// whose value is the hexadecimal number XX.
//
// Returns an empty string if there's an error, i.e. a '%' that is not
// followed by exactly two hexadecimal digits.  Signs and whitespace after the
// '%' are rejected.  An empty url_path also yields an empty string, so
// callers cannot tell that case apart from an error.
std::string decode_percents(std::string const& url_path) {
  std::string::size_type pos = 0, next;
  std::string result;
  result.reserve(url_path.length());

  while((next = url_path.find('%', pos)) != std::string::npos) {
    // Copy the literal text that precedes the escape.
    result.append(url_path, pos, next - pos);

    // The escape needs the '%' plus two more characters.
    if(url_path.length() - next < 3) return "";

    const int high = hex_digit_value(url_path[next + 1]);
    const int low = hex_digit_value(url_path[next + 2]);
    if(high < 0 || low < 0) return "";

    result += static_cast<char>(high * 16 + low);
    pos = next + 3;
  }

  // Copy whatever follows the last escape.
  result.append(url_path, pos, std::string::npos);

  return result;
}
| |
| bool is_css(const path & p) { |
| return p.extension() == ".css"; |
| } |
| |
| } // unnamed namespace |
| |
| namespace boost |
| { |
| namespace inspect |
| { |
| |
| // link_check constructor --------------------------------------------------// |
| |
| link_check::link_check() |
| : m_broken_errors(0), m_unlinked_errors(0), m_invalid_errors(0), |
| m_bookmark_errors(0), m_duplicate_bookmark_errors(0) |
| { |
| // HTML signatures are already registered by the base class, |
| // 'hypertext_inspector' |
| register_signature(".css"); |
| } |
| |
| // inspect (all) -----------------------------------------------------------// |
| |
| void link_check::inspect( |
| const string & /*library_name*/, |
| const path & full_path ) |
| { |
| // keep track of paths already encountered to reduce disk activity |
| if ( !fs::is_directory( full_path ) ) |
| m_paths[ relative_to( full_path, fs::initial_path() ) ] |= m_present; |
| } |
| |
| // inspect ( .htm, .html, .shtml, .css ) -----------------------------------// |
| |
| void link_check::inspect( |
| const string & library_name, |
| const path & full_path, // example: c:/foo/boost/filesystem/path.hpp |
| const string & contents ) // contents of file to be inspected |
| { |
| if (contents.find( "boostinspect:" "nounlinked" ) != string::npos) |
| m_paths[ relative_to( full_path, fs::initial_path() ) ] |= m_nounlinked_errors; |
| |
| bool no_link_errors = |
| (contents.find( "boostinspect:" "nolink" ) != string::npos); |
| |
| // build bookmarks databases |
| bookmarks.clear(); |
| bookmarks_lowercase.clear(); |
| string::const_iterator a_start( contents.begin() ); |
| string::const_iterator a_end( contents.end() ); |
| boost::match_results< string::const_iterator > a_what; |
| boost::match_flag_type a_flags = boost::match_default; |
| |
| if(!is_css(full_path)) |
| { |
| string previous_id; |
| |
| while( boost::regex_search( a_start, a_end, a_what, html_bookmark_regex, a_flags) ) |
| { |
| // a_what[0] contains the whole string iterators. |
| // a_what[1] contains the tag iterators. |
| // a_what[2] contains the attribute name. |
| // a_what[4] contains the bookmark iterators. |
| |
| if (a_what[4].matched) |
| { |
| string tag( a_what[1].first, a_what[1].second ); |
| boost::algorithm::to_lower(tag); |
| string attribute( a_what[2].first, a_what[2].second ); |
| boost::algorithm::to_lower(attribute); |
| string bookmark( a_what[4].first, a_what[4].second ); |
| |
| bool name_following_id = ( attribute == "name" && previous_id == bookmark ); |
| if ( tag != "meta" && attribute == "id" ) previous_id = bookmark; |
| else previous_id.clear(); |
| |
| if ( tag != "meta" && !name_following_id ) |
| { |
| bookmarks.insert( bookmark ); |
| // std::cout << "******************* " << bookmark << '\n'; |
| |
| // w3.org recommends case-insensitive checking for duplicate bookmarks |
| // since some browsers do a case-insensitive match. |
| string bookmark_lowercase( bookmark ); |
| boost::algorithm::to_lower(bookmark_lowercase); |
| |
| std::pair<bookmark_set::iterator, bool> result |
| = bookmarks_lowercase.insert( bookmark_lowercase ); |
| if (!result.second) |
| { |
| ++m_duplicate_bookmark_errors; |
| int ln = std::count( contents.begin(), a_what[3].first, '\n' ) + 1; |
| error( library_name, full_path, "Duplicate bookmark: " + bookmark, ln ); |
| } |
| } |
| } |
| |
| a_start = a_what[0].second; // update search position |
| a_flags |= boost::match_prev_avail; // update flags |
| a_flags |= boost::match_not_bob; |
| } |
| } |
| |
| // process urls |
| string::const_iterator start( contents.begin() ); |
| string::const_iterator end( contents.end() ); |
| boost::match_results< string::const_iterator > what; |
| boost::match_flag_type flags = boost::match_default; |
| |
| if(!is_css(full_path)) |
| { |
| while( boost::regex_search( start, end, what, html_url_regex, flags) ) |
| { |
| // what[0] contains the whole string iterators. |
| // what[1] contains the element type iterators. |
| // what[3] contains the URL iterators. |
| |
| if(what[3].matched) |
| { |
| string type( what[1].first, what[1].second ); |
| boost::algorithm::to_lower(type); |
| |
| // TODO: Complain if 'link' tags use external stylesheets. |
| do_url( string( what[3].first, what[3].second ), |
| library_name, full_path, no_link_errors, |
| type == "a" || type == "link", contents.begin(), what[3].first ); |
| } |
| |
| start = what[0].second; // update search position |
| flags |= boost::match_prev_avail; // update flags |
| flags |= boost::match_not_bob; |
| } |
| } |
| |
| while( boost::regex_search( start, end, what, css_url_regex, flags) ) |
| { |
| // what[0] contains the whole string iterators. |
| // what[2] contains the URL iterators. |
| |
| if(what[2].matched) |
| { |
| do_url( string( what[2].first, what[2].second ), |
| library_name, full_path, no_link_errors, false, |
| contents.begin(), what[3].first ); |
| } |
| |
| start = what[0].second; // update search position |
| flags |= boost::match_prev_avail; // update flags |
| flags |= boost::match_not_bob; |
| } |
| } |
| |
| // do_url ------------------------------------------------------------------// |
| |
| void link_check::do_url( const string & url, const string & library_name, |
| const path & source_path, bool no_link_errors, bool allow_external_content, |
| std::string::const_iterator contents_begin, std::string::const_iterator url_start ) |
| // precondition: source_path.is_complete() |
| { |
| if(!no_link_errors && url.empty()) { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, "Empty URL.", ln ); |
| return; |
| } |
| |
| // Decode ampersand encoded characters. |
| string decoded_url = is_css(source_path) ? url : decode_ampersands(url); |
| if(decoded_url.empty()) { |
| if(!no_link_errors) { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, |
| "Invalid URL (invalid ampersand encodings): " + url, ln ); |
| } |
| return; |
| } |
| |
| boost::smatch m; |
| if(!boost::regex_match(decoded_url, m, url_decompose_regex)) { |
| if(!no_link_errors) { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, "Invalid URL: " + decoded_url, ln ); |
| } |
| return; |
| } |
| |
| bool scheme_matched = m[2].matched, |
| authority_matched = m[4].matched, |
| //query_matched = m[7].matched, |
| fragment_matched = m[9].matched; |
| |
| std::string scheme(m[2]), |
| authority(m[4]), |
| url_path(m[5]), |
| //query(m[7]), |
| fragment(m[9]); |
| |
| // Check for external content |
| if(!allow_external_content && (authority_matched || scheme_matched)) { |
| if(!no_link_errors) { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, "External content: " + decoded_url, ln ); |
| } |
| } |
| |
| // Protocol checks |
| if(scheme_matched) { |
| if(scheme == "http" || scheme == "https") { |
| // All http links should have a hostname. Generally if they don't |
| // it's by mistake. If they shouldn't, then a protocol isn't |
| // required. |
| if(!authority_matched) { |
| if(!no_link_errors) { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, "No hostname: " + decoded_url, ln ); |
| } |
| } |
| |
| return; |
| } |
| else if(scheme == "file") { |
| if(!no_link_errors) { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, |
| "Invalid URL (hardwired file): " + decoded_url, ln ); |
| } |
| } |
| else if(scheme == "mailto" || scheme == "ftp" || scheme == "news" || scheme == "javascript") { |
| if ( !no_link_errors && is_css(source_path) ) { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, |
| "Invalid protocol for css: " + decoded_url, ln ); |
| } |
| } |
| else { |
| if(!no_link_errors) { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, "Unknown protocol: '" + scheme + "' in url: " + decoded_url, ln ); |
| } |
| } |
| |
| return; |
| } |
| |
| // Hostname without protocol. |
| if(authority_matched) { |
| if(!no_link_errors) { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, |
| "Invalid URL (hostname without protocol): " + decoded_url, ln ); |
| } |
| } |
| |
| // Check the fragment identifier |
| if ( fragment_matched ) { |
| if ( is_css(source_path) ) { |
| if ( !no_link_errors ) { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, |
| "Fragment link in CSS: " + decoded_url, ln ); |
| } |
| } |
| else { |
| if ( !no_link_errors && fragment.find( '#' ) != string::npos ) |
| { |
| ++m_bookmark_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, "Invalid bookmark: " + decoded_url, ln ); |
| } |
| else if ( !no_link_errors && url_path.empty() && !fragment.empty() |
| // w3.org recommends case-sensitive broken bookmark checking |
| // since some browsers do a case-sensitive match. |
| && bookmarks.find(decode_percents(fragment)) == bookmarks.end() ) |
| { |
| ++m_broken_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, "Unknown bookmark: " + decoded_url, ln ); |
| } |
| } |
| |
| // No more to do if it's just a fragment identifier |
| if(url_path.empty()) return; |
| } |
| |
| // Detect characters banned by RFC2396: |
| if ( !no_link_errors && decoded_url.find_first_of( " <>\"{}|\\^[]'" ) != string::npos ) |
| { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, |
| "Invalid character in URL: " + decoded_url, ln ); |
| } |
| |
| // Check that we actually have a path. |
| if(url_path.empty()) { |
| if(!no_link_errors) { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, |
| "Invalid URL (empty path in relative url): " + decoded_url, ln ); |
| } |
| } |
| |
| // Decode percent encoded characters. |
| string decoded_path = decode_percents(url_path); |
| if(decoded_path.empty()) { |
| if(!no_link_errors) { |
| ++m_invalid_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, |
| "Invalid URL (invalid character encodings): " + decoded_url, ln ); |
| } |
| return; |
| } |
| |
| // strip url of references to current dir |
| if ( decoded_path[0]=='.' && decoded_path[1]=='/' ) decoded_path.erase( 0, 2 ); |
| |
| // url is relative source_path.branch() |
| // convert to target_path, which is_complete() |
| path target_path; |
| try { target_path = source_path.branch_path() /= path( decoded_path, fs::no_check ); } |
| catch ( const fs::filesystem_error & ) |
| { |
| if(!no_link_errors) { |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| ++m_invalid_errors; |
| error( library_name, source_path, |
| "Invalid URL (error resolving path): " + decoded_url, ln ); |
| } |
| return; |
| } |
| |
| // create a m_paths entry if necessary |
| std::pair< const string, int > entry( |
| relative_to( target_path, fs::initial_path() ), 0 ); |
| m_path_map::iterator itr( m_paths.find( entry.first ) ); |
| if ( itr == m_paths.end() ) |
| { |
| if ( fs::exists( target_path ) ) entry.second = m_present; |
| itr = m_paths.insert( entry ).first; |
| } |
| |
| // itr now points to the m_paths entry |
| itr->second |= m_linked_to; |
| |
| // if target isn't present, the link is broken |
| if ( !no_link_errors && (itr->second & m_present) == 0 ) |
| { |
| ++m_broken_errors; |
| int ln = std::count( contents_begin, url_start, '\n' ) + 1; |
| error( library_name, source_path, "Broken link: " + decoded_url, ln ); |
| } |
| } |
| |
| // close -------------------------------------------------------------------// |
| |
| void link_check::close() |
| { |
| for ( m_path_map::const_iterator itr = m_paths.begin(); |
| itr != m_paths.end(); ++itr ) |
| { |
| // std::clog << itr->first << " " << itr->second << "\n"; |
| if ( (itr->second & m_linked_to) != m_linked_to |
| && (itr->second & m_nounlinked_errors) != m_nounlinked_errors |
| && (itr->first.rfind( ".html" ) == itr->first.size()-5 |
| || itr->first.rfind( ".htm" ) == itr->first.size()-4 |
| || itr->first.rfind( ".css" ) == itr->first.size()-4) |
| // because they may be redirectors, it is OK if these are unlinked: |
| && itr->first.rfind( "index.html" ) == string::npos |
| && itr->first.rfind( "index.htm" ) == string::npos ) |
| { |
| ++m_unlinked_errors; |
| path full_path( fs::initial_path() / path(itr->first, fs::no_check) ); |
| error( impute_library( full_path ), full_path, "Unlinked file" ); |
| } |
| } |
| } |
| |
| } // namespace inspect |
| } // namespace boost |
| |