token_bucket_list = new blcTokenBucketList( $conf->get('http_throttle_rate', 3), $conf->get('http_throttle_period', 15), $conf->get('http_throttle_min_interval', 2) ); if ( function_exists('curl_init') || is_callable('curl_init') ) { $this->implementation = new blcCurlHttp( $this->module_id, $this->cached_header, $this->plugin_conf, $this->module_manager ); } else { //Try to load Snoopy. if ( !class_exists('Snoopy') ){ $snoopy_file = ABSPATH. WPINC . '/class-snoopy.php'; if (file_exists($snoopy_file) ){ include $snoopy_file; } } //If Snoopy is available, it will be used in place of CURL. if ( class_exists('Snoopy') ){ $this->implementation = new blcSnoopyHttp( $this->module_id, $this->cached_header, $this->plugin_conf, $this->module_manager ); } } } function can_check($url, $parsed){ if ( isset($this->implementation) ){ return $this->implementation->can_check($url, $parsed); } else { return false; } } function check($url, $use_get = false){ global $blclog; //Throttle requests based on the domain name. $domain = @parse_url($url, PHP_URL_HOST); if ( $domain ) { $this->token_bucket_list->takeToken($domain); } $blclog->debug('HTTP module checking "' . $url . '"'); return $this->implementation->check($url, $use_get); } } /** * Base class for checkers that deal with HTTP(S) URLs. * * @package Broken Link Checker * @access public */ class blcHttpCheckerBase extends blcChecker { function clean_url($url){ $url = html_entity_decode($url); $ltrm = preg_quote(json_decode('"\u200E"'), '/'); $url = preg_replace( array( '/([\?&]PHPSESSID=\w+)$/i', //remove session ID '/(#[^\/]*)$/', //and anchors/fragments '/&/', //convert improper HTML entities '/([\?&]sid=\w+)$/i', //remove another flavour of session ID '/' . $ltrm . '/', //remove Left-to-Right marks that can show up when copying from Word. ), array('', '', '&', '', ''), $url ); $url = trim($url); return $url; } function is_error_code($http_code){ /*"Good" response codes are anything in the 2XX range (e.g "200 OK") and redirects - the 3XX range. HTTP 401 Unauthorized is a special case that is considered OK as well. Other errors - the 4XX range - are treated as such. */ $good_code = ( ($http_code >= 200) && ($http_code < 400) ) || ( $http_code == 401 ); return !$good_code; } /** * This checker only accepts HTTP(s) links. * * @param string $url * @param array|bool $parsed * @return bool */ function can_check($url, $parsed){ if ( !isset($parsed['scheme']) ) return false; return in_array( strtolower($parsed['scheme']), array('http', 'https') ); } /** * Takes an URL and replaces spaces and some other non-alphanumeric characters with their urlencoded equivalents. * * @param string $url * @return string */ function urlencodefix($url){ //TODO: Remove/fix this. Probably not a good idea to "fix" invalid URLs like that. return preg_replace_callback( '|[^a-z0-9\+\-\/\\#:.,;=?!&%@()$\|*~_]|i', create_function('$str','return rawurlencode($str[0]);'), $url ); } } class blcCurlHttp extends blcHttpCheckerBase { var $last_headers = ''; function check($url, $use_get = false){ global $blclog; $blclog->info(__CLASS__ . ' Checking link', $url); $this->last_headers = ''; $url = $this->clean_url($url); $blclog->debug(__CLASS__ . ' Clean URL:', $url); $result = array( 'broken' => false, 'timeout' => false, 'warning' => false, ); $log = ''; //Get the BLC configuration. It's used below to set the right timeout values and such. $conf = blc_get_configuration(); //Init curl. $ch = curl_init(); $request_headers = array(); curl_setopt($ch, CURLOPT_URL, $this->urlencodefix($url)); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); //Masquerade as Internet Explorer $ua = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'; //$ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'; curl_setopt($ch, CURLOPT_USERAGENT, $ua); //Close the connection after the request (disables keep-alive). The plugin rate-limits requests, //so it's likely we'd overrun the keep-alive timeout anyway. curl_setopt($ch, CURLOPT_FORBID_REUSE, true); $request_headers[] = 'Connection: close'; //Add a semi-plausible referer header to avoid tripping up some bot traps curl_setopt($ch, CURLOPT_REFERER, home_url()); //Redirects don't work when safe mode or open_basedir is enabled. if ( !blcUtility::is_safe_mode() && !blcUtility::is_open_basedir() ) { curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); } //Set maximum redirects curl_setopt($ch, CURLOPT_MAXREDIRS, 10); //Set the timeout curl_setopt($ch, CURLOPT_TIMEOUT, $conf->options['timeout']); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $conf->options['timeout']); //Set the proxy configuration. The user can provide this in wp-config.php if (defined('WP_PROXY_HOST')) { curl_setopt($ch, CURLOPT_PROXY, WP_PROXY_HOST); } if (defined('WP_PROXY_PORT')) { curl_setopt($ch, CURLOPT_PROXYPORT, WP_PROXY_PORT); } if (defined('WP_PROXY_USERNAME')){ $auth = WP_PROXY_USERNAME; if (defined('WP_PROXY_PASSWORD')){ $auth .= ':' . WP_PROXY_PASSWORD; } curl_setopt($ch, CURLOPT_PROXYUSERPWD, $auth); } //Make CURL return a valid result even if it gets a 404 or other error. curl_setopt($ch, CURLOPT_FAILONERROR, false); $nobody = !$use_get; //Whether to send a HEAD request (the default) or a GET request $parts = @parse_url($url); if( $parts['scheme'] == 'https' ){ curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); //Required to make HTTPS URLs work. curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); //$nobody = false; //Can't use HEAD with HTTPS. } if ( $nobody ){ //If possible, use HEAD requests for speed. curl_setopt($ch, CURLOPT_NOBODY, true); } else { //If we must use GET at least limit the amount of downloaded data. $request_headers[] = 'Range: bytes=0-2048'; //2 KB } //Set request headers. if ( !empty($request_headers) ) { curl_setopt($ch, CURLOPT_HTTPHEADER, $request_headers); } //Register a callback function which will process the HTTP header(s). //It can be called multiple times if the remote server performs a redirect. curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this,'read_header')); //Record request headers. if ( defined('CURLINFO_HEADER_OUT') ) { curl_setopt($ch, CURLINFO_HEADER_OUT, true); } //Execute the request $start_time = microtime_float(); $content = curl_exec($ch); $measured_request_duration = microtime_float() - $start_time; $blclog->debug(sprintf('HTTP request took %.3f seconds', $measured_request_duration)); $info = curl_getinfo($ch); //Store the results $result['http_code'] = intval( $info['http_code'] ); $result['final_url'] = $info['url']; $result['request_duration'] = $info['total_time']; $result['redirect_count'] = $info['redirect_count']; //CURL doesn't return a request duration when a timeout happens, so we measure it ourselves. //It is useful to see how long the plugin waited for the server to respond before assuming it timed out. if( empty($result['request_duration']) ){ $result['request_duration'] = $measured_request_duration; } //Determine if the link counts as "broken" if ( $result['http_code'] == 0 ){ $result['broken'] = true; $error_code = curl_errno($ch); $log .= sprintf( "%s [Error #%d]\n", curl_error($ch), $error_code ); //We only handle a couple of CURL error codes; most are highly esoteric. //libcurl "CURLE_" constants can't be used here because some of them have //different names or values in PHP. switch( $error_code ) { case 6: //CURLE_COULDNT_RESOLVE_HOST $result['status_code'] = BLC_LINK_STATUS_WARNING; $result['status_text'] = __('Server Not Found', 'broken-link-checker'); $result['error_code'] = 'couldnt_resolve_host'; break; case 28: //CURLE_OPERATION_TIMEDOUT $result['timeout'] = true; break; case 7: //CURLE_COULDNT_CONNECT //More often than not, this error code indicates that the connection attempt //timed out. This heuristic tries to distinguish between connections that fail //due to timeouts and those that fail due to other causes. if ( $result['request_duration'] >= 0.9*$conf->options['timeout'] ){ $result['timeout'] = true; } else { $result['status_code'] = BLC_LINK_STATUS_WARNING; $result['status_text'] = __('Connection Failed', 'broken-link-checker'); $result['error_code'] = 'connection_failed'; } break; default: $result['status_code'] = BLC_LINK_STATUS_WARNING; $result['status_text'] = __('Unknown Error', 'broken-link-checker'); } } else { $result['broken'] = $this->is_error_code($result['http_code']); } curl_close($ch); $blclog->info(sprintf( 'HTTP response: %d, duration: %.2f seconds, status text: "%s"', $result['http_code'], $result['request_duration'], isset($result['status_text']) ? $result['status_text'] : 'N/A' )); if ( $nobody && $result['broken'] ){ //The site in question might be expecting GET instead of HEAD, so lets retry the request //using the GET verb. return $this->check($url, true); //Note : normally a server that doesn't allow HEAD requests on a specific resource *should* //return "405 Method Not Allowed". Unfortunately, there are sites that return 404 or //another, even more general, error code instead. So just checking for 405 wouldn't be enough. } //When safe_mode or open_basedir is enabled CURL will be forbidden from following redirects, //so redirect_count will be 0 for all URLs. As a workaround, set it to 1 when the HTTP //response codes indicates a redirect but redirect_count is zero. //Note to self : Extracting the Location header might also be helpful. if ( ($result['redirect_count'] == 0) && ( in_array( $result['http_code'], array(301, 302, 303, 307) ) ) ){ $result['redirect_count'] = 1; } //Build the log from HTTP code and headers. $log .= '=== '; if ( $result['http_code'] ){ $log .= sprintf( __('HTTP code : %d', 'broken-link-checker'), $result['http_code']); } else { $log .= __('(No response)', 'broken-link-checker'); } $log .= " ===\n\n"; $log .= "Response headers\n" . str_repeat('=', 16) . "\n"; $log .= htmlentities($this->last_headers); if ( isset($info['request_header']) ) { $log .= "Request headers\n" . str_repeat('=', 16) . "\n"; $log .= htmlentities($info['request_header']); } if ( !$nobody && ($content !== false) && $result['broken'] ) { $log .= "Response HTML\n" . str_repeat('=', 16) . "\n"; $log .= htmlentities(substr($content, 0, 2048)); } if ( !empty($result['broken']) && !empty($result['timeout']) ) { $log .= "\n(" . __("Most likely the connection timed out or the domain doesn't exist.", 'broken-link-checker') . ')'; } $result['log'] = $log; //The hash should contain info about all pieces of data that pertain to determining if the //link is working. $result['result_hash'] = implode('|', array( $result['http_code'], !empty($result['broken'])?'broken':'0', !empty($result['timeout'])?'timeout':'0', blcLink::remove_query_string($result['final_url']), )); return $result; } function read_header(/** @noinspection PhpUnusedParameterInspection */ $ch, $header){ $this->last_headers .= $header; return strlen($header); } } class blcSnoopyHttp extends blcHttpCheckerBase { function check($url){ $url = $this->clean_url($url); //Note : Snoopy doesn't work too well with HTTPS URLs. $result = array( 'broken' => false, 'timeout' => false, ); $log = ''; //Get the timeout setting from the BLC configuration. $conf = blc_get_configuration(); $timeout = $conf->options['timeout']; $start_time = microtime_float(); //Fetch the URL with Snoopy $snoopy = new Snoopy; $snoopy->read_timeout = $timeout; //read timeout in seconds $snoopy->agent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"; //masquerade as IE 7 $snoopy->referer = home_url(); //valid referer helps circumvent some hotlink protection schemes $snoopy->maxlength = 1024*5; //load up to 5 kilobytes $snoopy->fetch( $this->urlencodefix($url) ); $result['request_duration'] = microtime_float() - $start_time; $result['http_code'] = $snoopy->status; //HTTP status code //Snoopy returns -100 on timeout if ( $result['http_code'] == -100 ){ $result['http_code'] = 0; $result['timeout'] = true; } //Build the log $log .= '=== '; if ( $result['http_code'] ){ $log .= sprintf( __('HTTP code : %d', 'broken-link-checker'), $result['http_code']); } else { $log .= __('(No response)', 'broken-link-checker'); } $log .= " ===\n\n"; if ($snoopy->error) $log .= $snoopy->error."\n"; if ($snoopy->timed_out) { $log .= __("Request timed out.", 'broken-link-checker') . "\n"; $result['timeout'] = true; } if ( is_array($snoopy->headers) ) $log .= implode("", $snoopy->headers)."\n"; //those headers already contain newlines //Redirected? if ( $snoopy->lastredirectaddr ) { $result['final_url'] = $snoopy->lastredirectaddr; $result['redirect_count'] = $snoopy->_redirectdepth; } else { $result['final_url'] = $url; } //Determine if the link counts as "broken" $result['broken'] = $this->is_error_code($result['http_code']) || $result['timeout']; $log .= "(" . __('Using Snoopy', 'broken-link-checker') . ")"; $result['log'] = $log; //The hash should contain info about all pieces of data that pertain to determining if the //link is working. $result['result_hash'] = implode('|', array( $result['http_code'], $result['broken']?'broken':'0', $result['timeout']?'timeout':'0', blcLink::remove_query_string($result['final_url']), )); return $result; } }