File checkbot-1.80-webserver.patch of Package perl-checkbot (project openSUSE:Leap:42.1:Update)
diff -ruN checkbot-1.80-orig/checkbot checkbot-1.80/checkbot --- checkbot-1.80-orig/checkbot 2008-10-15 14:55:01.000000000 +0200 +++ checkbot-1.80/checkbot 2010-10-28 18:15:44.000000000 +0200 @@ -33,7 +33,7 @@ =head1 NAME -Checkbot - WWW Link Verifier +Checkbot - WWW Link Verifier. Modified for the Novell Webconsole =head1 SYNOPSIS @@ -51,6 +51,8 @@ [B<--enable-virtual>] [B<--language> language code] [B<--suppress> suppression file] + [B<--username> Username for Novell Webconsole] + [B<--password> Password for Novell Webconsole] [start URLs] =head1 DESCRIPTION @@ -307,6 +309,28 @@ 403 /http:\/\/wikipedia.org\/.*/ +=item --username <username> + +Username for the Novell Webconsole. + +Before starting the link check the must exist already an account +to the Webconsole in order getting all available links. +So first of all there will be generated a login cookie by calling +the <hostname>/accounts/login with the given username and password. + +Do NOT forget activating cookies by the --cookies option. + +=item --password <password> + +Password for the Novell Webconsole. + +Before starting the link check the must exist already an account +to the Webconsole in order getting all available links. +So first of all there will be generated a login cookie by calling +the <hostname>/accounts/login with the given username and password. + +Do NOT forget activating cookies by the --cookies option. + =back Deprecated options which will disappear in a future release: @@ -482,7 +506,7 @@ # Get command-line arguments use Getopt::Long; - my $result = GetOptions(qw(cookies debug help noproxy=s verbose url=s match=s exclude|x=s file=s filter=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=f timeout=i interval=i dontwarn=s enable-virtual language=s allow-simple-hosts suppress=s)); + my $result = GetOptions(qw(cookies debug help noproxy=s verbose url=s match=s exclude|x=s file=s filter=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=f timeout=i interval=i dontwarn=s enable-virtual language=s allow-simple-hosts suppress=s password=s username=s)); # Handle arguments, some are mandatory, some have defaults &print_help if (($main::opt_help && $main::opt_help) @@ -661,9 +685,42 @@ open(CURRENT, $main::cur_queue) || die "$0: Unable to open CURRENT $main::cur_queue for reading: $!\n"; + ## initialize to webconsole specific + if (defined($main::opt_url)) + { + my $urlstring = $main::opt_url . 
"/accounts/login"; + + my %argument = (); + if (defined($main::opt_username)) + { + $argument{"user_login"} = $main::opt_username; + } + if (defined($main::opt_password)) + { + $argument{"user_password"} = $main::opt_password; + } + + # receiving session cookie + handle_url($main::opt_url); + + # login + my $root_url = URI->new($urlstring); + + my $response = $main::ua->post($root_url, \%argument); + if ($response->is_success || $response->is_redirect) { + print "Login succeeded\n"; + print $response->content; + } + else { + print "Login NOT succeeded\n"; + print STDERR $response->status_line, "\n"; + } + } + do { # Read a line from the queue, and process it while (defined ($line = <CURRENT>) ) { + print "checking $line"; chomp($line); &handle_url($line); &check_point(); @@ -727,6 +784,8 @@ my $response; my $type; +# print "Checking URL: $urlstr \n"; + $stats{'todo'}--; # Add this URL to the ones we've seen already, return if it is a @@ -800,16 +859,16 @@ if ($response->is_redirect and is_internal($url->as_string)) { if ($response->code == 300) { # multiple choices, but no redirection available - output 'Multiple choices', 2; + print 'Multiple choices\n'; } else { my $baseURI = URI->new($url); if (defined $response->header('Location')) { my $redir_url = URI->new_abs($response->header('Location'), $baseURI); - output "Redirected to $redir_url", 2; + print "Redirected to $redir_url\n"; add_to_queue($redir_url, $urlparent); $stats{'todo'}++; } else { - output 'Location header missing from redirect response', 2; + print 'Location header missing from redirect response\n'; } } } @@ -984,6 +1043,8 @@ print OUT "<tr><th align=\"left\">--match</th><td class='text'>Match regular expression</td><td class='value' id='match'>$main::opt_match</td></tr>\n"; print OUT "<tr><th align=\"left\">--exclude</th><td class='text'>Exclude regular expression</td><td class='value' id='exclude'>$main::opt_exclude</td></tr>\n" if defined $main::opt_exclude; print OUT "<tr><th align=\"left\">--filter</th><td class='text'>Filter regular expression</td><td class='value' id='filter'>$main::opt_filter</td></tr>\n" if defined $main::opt_filter; + print OUT "<tr><th align=\"left\">--username</th><td class='text'>Username</td><td class='value' id='username'>$main::opt_username</td></tr>\n" if defined $main::opt_username; + print OUT "<tr><th align=\"left\">--password</th><td class='text'>Password</td><td class='value' id='password'>set</td></tr>\n" if defined $main::opt_password; print OUT "<tr><th align=\"left\">--noproxy</th><td class='text'>No Proxy for the following domains</td><td class='value' id='noproxy'>$main::opt_noproxy</td></tr>\n" if defined $main::opt_noproxy; print OUT "<tr><th align=\"left\">--ignore</th><td class='text'>Ignore regular expression</td><td class='value' id='ignore'>$main::opt_ignore</td></tr>\n" if defined $main::opt_ignore; print OUT "<tr><th align=\"left\">--suppress</th><td class='text'>Suppress error code and URL specified by</td><td class='value' id='suppress'>$main::opt_suppress</td></tr>\n" if defined $main::opt_suppress; @@ -1183,12 +1244,14 @@ my $content = $response->decoded_content || $response->content; $p->parse($content); $p->eof; - + my $string = $response->base->as_string; + print ( "URL $string \n"); # Deal with the links we found in this document my @links = $p->links(); foreach (@links) { my ($tag, %l) = @{$_}; foreach (keys %l) { + print (" $l{$_} $_ $tag\n"); # Get the canonical URL, so we don't need to worry about base, case, etc. 
my $url = $l{$_}->canonical; @@ -1473,6 +1536,12 @@ --dontwarn codes Do not write warnings for these HTTP response codes --enable-virtual Use only virtual names, not IP numbers for servers --language Specify 2-letter language code for language negotiation + --username Username for Novell Webconsole + --password Password for Novell Webconsole + +Example for checking a Novell Webconsole: + +checkbot --cookies --url=http://webconsole.suse.de --username=schubi --password=system --ignore=http://webconsole.suse.de/accounts/logout Options --match, --exclude, and --ignore can take a perl regular expression as their argument\n diff -ruN checkbot-1.80-orig/checkbot.orig checkbot-1.80/checkbot.orig --- checkbot-1.80-orig/checkbot.orig 1970-01-01 01:00:00.000000000 +0100 +++ checkbot-1.80/checkbot.orig 2008-10-15 14:55:01.000000000 +0200 @@ -0,0 +1,1516 @@ +#!/usr/bin/perl -w +# +# checkbot - A perl5 script to check validity of links in www document trees +# +# Hans de Graaff <hans@degraaff.org>, 1994-2005. +# Based on Dimitri Tischenko, Delft University of Technology, 1994 +# Based on the testlinks script by Roy Fielding +# With contributions from Bruce Speyer <bruce.speyer@elecomm.com> +# +# This application is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +# +# Info-URL: http://degraaff.org/checkbot/ +# +# $Id: checkbot 238 2008-10-15 12:55:00Z graaff $ +# (Log information can be found at the end of the script) + +require 5.004; +use strict; + +require LWP; +use File::Basename; + +BEGIN { + eval "use Time::Duration qw(duration)"; + $main::useduration = ($@ ? 0 : 1); +} + +# Version information +my +$VERSION = '1.80'; + + +=head1 NAME + +Checkbot - WWW Link Verifier + +=head1 SYNOPSIS + +checkbot [B<--cookies>] [B<--debug>] [B<--file> file name] [B<--help>] + [B<--mailto> email addresses] [B<--noproxy> list of domains] + [B<--verbose>] + [B<--url> start URL] + [B<--match> match string] [B<--exclude> exclude string] + [B<--proxy> proxy URL] [B<--internal-only>] + [B<--ignore> ignore string] + [B<--filter> substitution regular expression] + [B<--style> style file URL] + [B<--note> note] [B<--sleep> seconds] [B<--timeout> timeout] + [B<--interval> seconds] [B<--dontwarn> HTTP responde codes] + [B<--enable-virtual>] + [B<--language> language code] + [B<--suppress> suppression file] + [start URLs] + +=head1 DESCRIPTION + +Checkbot verifies the links in a specific portion of the World Wide +Web. It creates HTML pages with diagnostics. + +Checkbot uses LWP to find URLs on pages and to check them. It supports +the same schemes as LWP does, and finds the same links that +HTML::LinkExtor will find. + +Checkbot considers links to be either 'internal' or +'external'. Internal links are links within the web space that needs +to be checked. If an internal link points to a web document this +document is retrieved, and its links are extracted and +processed. External links are only checked to be working. Checkbot +checks links as it finds them, so internal and external links are +checked at the same time, even though they are treated differently. + +Options for Checkbot are: + +=over 4 + +=item --cookies + +Accept cookies from the server and offer them again at later +requests. This may be useful for servers that use cookies to handle +sessions. By default Checkbot does not accept any cookies. + +=item --debug + +Enable debugging mode. Not really supported anymore, but it will keep +some files around that otherwise would be deleted. 
+ +=item --file <file name> + +Use the file I<file name> as the basis for the summary file names. The +summary page will get the I<file name> given, and the server pages are +based on the I<file name> without the .html extension. For example, +setting this option to C<index.html> will create a summary page called +index.html and server pages called index-server1.html and +index-server2.html. + +The default value for this option is C<checkbot.html>. + +=item --help + +Shows brief help message on the standard output. + +=item --mailto <email address>[,<email address>] + +Send mail to the I<email address> when Checkbot is done checking. You +can give more than one address separated by commas. The notification +email includes a small summary of the results. As of Checkbot 1.76 +email is only sent if problems have been found during the Checkbot +run. + +=item --noproxy <list of domains> + +Do not proxy requests to the given domains. The list of domains must +be a comma-separated list. For example, so avoid using the proxy for +the localhost and someserver.xyz, you can use C<--noproxy +localhost,someserver.xyz>. + +=item --verbose + +Show verbose output while running. Includes all links checked, results +from the checks, etc. + + + + + +=item --url <start URL> + +Set the start URL. Checkbot starts checking at this URL, and then +recursively checks all links found on this page. The start URL takes +precedence over additional URLs specified on the command line. + +If no scheme is specified for the URL, the file protocol is assumed. + +=item --match <match string> + +This option selects which pages Checkbot considers local. If the +I<match string> is contained within the URL, then Checkbot considers +the page local, retrieves it, and will check all the links contained +on it. Otherwise the page is considered external and it is only +checked with a HEAD request. + +If no explicit I<match string> is given, the start URLs (See option +C<--url>) will be used as a match string instead. In this case the +last page name, if any, will be trimmed. For example, a start URL like +C<http://some.site/index.html> will result in a default I<match +string> of C<http://some.site/>. + +The I<match string> can be a perl regular expression. For example, to +check the main server page and all HTML pages directly underneath it, +but not the HTML pages in the subdirectories of the server, the +I<match string> would be C<www.someserver.xyz/($|[^/]+.html)>. + +=item --exclude <exclude string> + +URLs matching the I<exclude string> are considered to be external, +even if they happen to match the I<match string> (See option +C<--match>). URLs matching the --exclude string are still being +checked and will be reported if problems are found, but they will not +be checked for further links into the site. + +The I<exclude string> can be a perl regular expression. For example, +to consider all URLs with a query string external, use C<[=\?]>. This +can be useful when a URL with a query string unlocks the path to a +huge database which will be checked. + +=item --filter <filter string> + +This option defines a I<filter string>, which is a perl regular +expression. This filter is run on each URL found, thus rewriting the +URL before it enters the queue to be checked. It can be used to remove +elements from a URL. This option can be useful when symbolic links +point to the same directory, or when a content management system adds +session IDs to URLs. 
+ +For example C</old/new/> would replace occurrences of 'old' with 'new' +in each URL. + +=item --ignore <ignore string> + +URLs matching the I<ignore string> are not checked at all, they are +completely ignored by Checkbot. This can be useful to ignore known +problem links, or to ignore links leading into databases. The I<ignore +string> is matched after the I<filter string> has been applied. + +The I<ignore string> can be a perl regular expression. + +For example C<www.server.com\/(one|two)> would match all URLs starting +with either www.server.com/one or www.server.com/two. + + +=item --proxy <proxy URL> + +This attribute specifies the URL of a proxy server. Only the HTTP and +FTP requests will be sent to that proxy server. + +=item --internal-only + +Skip the checking of external links at the end of the Checkbot +run. Only matching links are checked. Note that some redirections may +still cause external links to be checked. + +=item --note <note> + +The I<note> is included verbatim in the mail message (See option +C<--mailto>). This can be useful to include the URL of the summary HTML page +for easy reference, for instance. + +Only meaningful in combination with the C<--mailto> option. + +=item --sleep <seconds> + +Number of I<seconds> to sleep in between requests. Default is 0 +seconds, i.e. do not sleep at all between requests. Setting this +option can be useful to keep the load on the web server down while +running Checkbot. This option can also be set to a fractional number, +i.e. a value of 0.1 will sleep one tenth of a second between requests. + +=item --timeout <timeout> + +Default timeout for the requests, specified in seconds. The default is +2 minutes. + +=item --interval <seconds> + +The maximum interval between updates of the results web pages in +seconds. Default is 3 hours (10800 seconds). Checkbot will start the +interval at one minute, and gradually extend it towards the maximum +interval. + +=item --style <URL of style file> + +When this option is used, Checkbot embeds this URL as a link to a +style file on each page it writes. This makes it easy to customize the +layout of pages generated by Checkbot. + +=item --dontwarn <HTTP response codes regular expression> + +Do not include warnings on the result pages for those HTTP response +codes which match the regular expression. For instance, --dontwarn +"(301|404)" would not include 301 and 404 response codes. + +Checkbot uses the response codes generated by the server, even if this +response code is not defined in RFC 2616 (HTTP/1.1). In addition to +the normal HTTP response code, Checkbot defines a few response codes +for situations which are not technically a problem, but which causes +problems in many cases anyway. These codes are: + + 901 Host name expected but not found + In this case the URL supports a host name, but non was found + in the URL. This usually indicates a mistake in the URL. An + exception is that this check is not applied to news: URLs. + + 902 Unqualified host name found + In this case the host name does not contain the domain part. + This usually means that the pages work fine when viewed within + the original domain, but not when viewed from outside it. + + 903 Double slash in URL path + The URL has a double slash in it. This is legal, but some web + servers cannot handle it very well and may cause Checkbot to + run away. See also the comments below. + + 904 Unknown scheme in URL + The URL starts with a scheme that Checkbot does not know + about. 
This is often caused by mistyping the scheme of the URL, + but the scheme can also be a legal one. In that case please let + me know so that it can be added to Checkbot. + +=item --enable-virtual + +This option enables dealing with virtual servers. Checkbot then +assumes that all hostnames for internal servers are unique, even +though their IP addresses may be the same. Normally Checkbot uses the +IP address to distinguish servers. This has the advantage that if a +server has two names (e.g. www and bamboozle) its pages only get +checked once. When you want to check multiple virtual servers this +causes problems, which this feature works around by using the hostname +to distinguish the server. + +=item --language + +The argument for this option is a two-letter language code. Checkbot +will use language negotiation to request files in that language. The +default is to request English language (language code 'en'). + +=item --suppress + +The argument for this option is a file which contains combinations of +error codes and URLs for which to suppress warnings. This can be used +to avoid reporting of known and unfixable URL errors or warnings. + +The format of the suppression file is a simple whitespace delimited +format, first listing the error code followed by the URL. Each error +code and URL combination is listed on a new line. Comments can be +added to the file by starting the line with a C<#> character. + + # 301 Moved Permanently + 301 http://www.w3.org/P3P + + # 403 Forbidden + 403 http://www.herring.com/ + +For further flexibility a regular expression can be used instead of a +normal URL. The regular expression must be enclosed with forward +slashes. For example, to suppress all 403 errors on wikipedia: + + 403 /http:\/\/wikipedia.org\/.*/ + +=back + +Deprecated options which will disappear in a future release: + +=over + +=item --allow-simple-hosts (deprecated) + +This option turns off warnings about URLs which contain unqualified +host names. This is useful for intranet sites which often use just a +simple host name or even C<localhost> in their links. + +Use of this option is deprecated. Please use the --dontwarn mechanism +for error 902 instead. + +=back + + +=head1 HINTS AND TIPS + +=over + +=item Problems with checking FTP links + +Some users may experience consistent problems with checking FTP +links. In these cases it may be useful to instruct Net::FTP to use +passive FTP mode to check files. This can be done by setting the +environment variable FTP_PASSIVE to 1. For example, using the bash +shell: C<FTP_PASSIVE=1 checkbot ...>. See the Net::FTP documentation +for more details. + +=item Run-away Checkbot + +In some cases Checkbot literally takes forever to finish. There are two +common causes for this problem. + +First, there might be a database application as part of the web site +which generates a new page based on links on another page. Since +Checkbot tries to travel through all links this will create an +infinite number of pages. This kind of run-away effect is usually predictable. It can be avoided by using the --exclude option. + +Second, a server configuration problem can cause a loop in generating +URLs for pages that really do not exist. This will result in URLs of +the form http://some.server/images/images/images/logo.png, with ever +more 'images' included. Checkbot cannot check for this because the +server should have indicated that the requested pages do not +exist. There is no easy way to solve this other than fixing the +offending web server or the broken links. 
+ +=item Problems with https:// links + +The error message + + Can't locate object method "new" via package "LWP::Protocol::https::Socket" + +usually means that the current installation of LWP does not support +checking of SSL links (i.e. links starting with https://). This +problem can be solved by installing the Crypt::SSLeay module. + +=back + +=head1 EXAMPLES + +The most simple use of Checkbot is to check a set of pages on a +server. To check my checkbot pages I would use: + + checkbot http://degraaff.org/checkbot/ + +Checkbot runs can take some time so Checkbot can send a notification +mail when the run is done: + + checkbot --mailto hans@degraaff.org http://degraaff.org/checkbot/ + +It is possible to check a set of local file without using a web +server. This only works for static files but may be useful in some +cases. + + checkbot file:///var/www/documents/ + +=head1 PREREQUISITES + +This script uses the C<LWP> modules. + +=head1 COREQUISITES + +This script can send mail when C<Mail::Send> is present. + +=head1 AUTHOR + +Hans de Graaff <hans@degraaff.org> + +=pod OSNAMES + +any + +=cut + +# Declare some global variables, avoids ugly use of main:: all around +my %checkbot_errors = ('901' => 'Host name expected but not found', + '902' => 'Unqualified host name in URL', + '903' => 'URL contains double slash in URL', + '904' => 'Unknown scheme in URL', + ); + +my @starturls = (); + +# Two hashes to store the response to a URL, and all the parents of the URL +my %url_error = (); +my %url_parent = (); + +# Hash for storing the title of a URL for use in reports. TODO: remove +# this and store title as part of queue. +my %url_title = (); + +# Hash for suppressions, which are defined as a combination of code and URL +my %suppression = (); + +# Hash to store statistics on link checking +my %stats = ('todo' => 0, + 'link' => 0, + 'problem' => 0 ); + +# Options hash (to be filled by GetOptions) +my %options = (); + +# Keep track of start time so that we can use it in reports +my $start_time = time(); + +# If on a Mac we should ask for the arguments through some MacPerl stuff +if ($^O eq 'MacOS') { + $main::mac_answer = eval "MacPerl::Ask('Enter Command-Line Options')"; + push(@ARGV, split(' ', $main::mac_answer)); +} + +# Prepare +check_options(); +init_modules(); +init_globals(); +init_suppression(); + +# Start actual application +check_links(); + +# Finish up +create_page(1); +send_mail() if defined $main::opt_mailto and $stats{problem} > 0; + +exit 0; + +# output prints stuff on stderr if --verbose, and takes care of proper +# indentation +sub output { + my ($line, $level) = @_; + + return unless $main::opt_verbose; + + chomp $line; + + my $indent = ''; + + if (defined $level) { + while ($level-- > 0) { + $indent .= ' '; + } + } + + print STDERR $indent, $line, "\n"; +} + +### Initialization and setup routines + +sub check_options { + + # Get command-line arguments + use Getopt::Long; + my $result = GetOptions(qw(cookies debug help noproxy=s verbose url=s match=s exclude|x=s file=s filter=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=f timeout=i interval=i dontwarn=s enable-virtual language=s allow-simple-hosts suppress=s)); + + # Handle arguments, some are mandatory, some have defaults + &print_help if (($main::opt_help && $main::opt_help) + || (!$main::opt_url && $#ARGV == -1)); + $main::opt_timeout = 120 unless defined($main::opt_timeout) && length($main::opt_timeout); + $main::opt_verbose = 0 unless $main::opt_verbose; + $main::opt_sleep = 0 unless 
defined($main::opt_sleep) && length($main::opt_sleep); + $main::opt_interval = 10800 unless defined $main::opt_interval and length $main::opt_interval; + $main::opt_dontwarn = "xxx" unless defined $main::opt_dontwarn and length $main::opt_dontwarn; + $main::opt_enable_virtual = 0 unless defined $main::opt_enable_virtual; + # Set the default language and make sure it is a two letter, lowercase code + $main::opt_language = 'en' unless defined $main::opt_language; + $main::opt_language = lc(substr($main::opt_language, 0, 2)); + $main::opt_language =~ tr/a-z//cd; + if ($main::opt_language !~ /[a-z][a-z]/) { + warn "Argument --language $main::opt_language is not a valid language code\nUsing English as a default.\n"; + $main::opt_language = 'en'; + } + $main::opt_allow_simple_hosts = 0 + unless $main::opt_allow_simple_hosts; + output "--allow-simple-hosts is deprecated, please use the --dontwarn mechanism", 0 if $main::opt_allow_simple_hosts; + + # The default for opt_match will be set later, because we might want + # to muck with opt_url first. + + # Display messages about the options + output "*** Starting Checkbot $VERSION in verbose mode"; + output 'Will skip checking of external links', 1 + if $main::opt_internal_only; + output "Allowing unqualified host names", 1 + if $main::opt_allow_simple_hosts; + output "Not using optional Time::Duration module: not found", 1 + unless $main::useduration; +} + +sub init_modules { + + use URI; + # Prepare the user agent to be used: + use LWP::UserAgent; + use LWP::MediaTypes; + #use LWP::Debug qw(- +debug); + use HTML::LinkExtor; + $main::ua = new LWP::UserAgent; + $main::ua->agent("Checkbot/$VERSION LWP/" . LWP::Version); + $main::ua->timeout($main::opt_timeout); + # Add a proxy to the user agent, if defined + $main::ua->proxy(['http', 'ftp'], $main::opt_proxy) + if defined($main::opt_proxy); + $main::ua->no_proxy(split(',', $main::opt_noproxy)) + if defined $main::opt_noproxy; + # Add a cookie jar to the UA if requested by the user + $main::ua->cookie_jar( {} ) + if defined $main::opt_cookies or $main::opt_cookies; + + require Mail::Send if defined $main::opt_mailto; + + use HTTP::Status; +} + +sub init_globals { + my $url; + + # Directory and files for output + if ($main::opt_file) { + $main::file = $main::opt_file; + $main::file =~ /(.*)\./; + $main::server_prefix = $1; + } else { + $main::file = "checkbot.html"; + $main::server_prefix = "checkbot"; + } + $main::tmpdir = ($ENV{'TMPDIR'} or $ENV{'TMP'} or $ENV{'TEMP'} or "/tmp") . "/Checkbot.$$"; + + $main::cur_queue = $main::tmpdir . "/queue"; + $main::new_queue = $main::tmpdir . "/queue-new"; + + # Make sure we catch signals so that we can clean up temporary files + $SIG{'INT'} = $SIG{'TERM'} = $SIG{'HUP'} = $SIG{'QUIT'} = \&got_signal; + + # Set up hashes to be used + %main::checked = (); + %main::servers = (); + %main::servers_get_only = (); + + # Initialize the start URLs. --url takes precedence. Otherwise + # just process URLs in order as they appear on the command line. + unshift(@ARGV, $main::opt_url) if $main::opt_url; + foreach (@ARGV) { + $url = URI->new($_); + # If no scheme is defined we will assume file is used, so that + # it becomes easy to check a single file. 
+ $url->scheme('file') unless defined $url->scheme; + $url->host('localhost') if $url->scheme eq 'file'; + if (!defined $url->host) { + warn "No host specified in URL $url, ignoring it.\n"; + next; + } + push(@starturls, $url); + } + die "There are no valid starting URLs to begin checking with!\n" + if scalar(@starturls) == -1; + + # Set the automatic matching expression to a concatenation of the starturls + if (!defined $main::opt_match) { + my @matchurls; + foreach my $url (@starturls) { + # Remove trailing files from the match, e.g. remove index.html + # stuff so that we match on the host and/or directory instead, + # but only if there is a path component in the first place. + my $matchurl = $url->as_string; + $matchurl =~ s!/[^/]+$!/! unless $url->path eq ''; + push(@matchurls, quotemeta $matchurl); + } + $main::opt_match = '^(' . join('|', @matchurls) . ')'; + output "--match defaults to $main::opt_match"; + } + + # Initialize statistics hash with number of start URLs + $stats{'todo'} = scalar(@starturls); + + # We write out our status every now and then. + $main::cp_int = 1; + $main::cp_last = 0; +} + +sub init_suppression { + return if not defined $main::opt_suppress; + + die "Suppression file \"$main::opt_suppress\" is in fact a directory" + if -d $main::opt_suppress; + + open(SUPPRESSIONS, $main::opt_suppress) + or die "Unable to open $main::opt_suppress for reading: $!\n"; + while (my $line = <SUPPRESSIONS>) { + chomp $line; + next if $line =~ /^#/ or $line =~ /^\s*$/; + + if ($line !~ /^\s*(\d+)\s+(\S+)/) { + output "WARNING: Unable to parse line in suppression file $main::opt_suppress:\n $line\n"; + } else { + output "Suppressed: $1 $2\n" if $main::opt_verbose; + $suppression{$1}{$2} = $2; + } + } + close SUPPRESSIONS; +} + + + + +### Main application code + +sub check_links { + my $line; + + mkdir $main::tmpdir, 0755 + || die "$0: unable to create directory $main::tmpdir: $!\n"; + + # Explicitly set the record separator. I had the problem that this + # was not defined under my perl 5.00502. This should fix that, and + # not cause problems for older versions of perl. + $/ = "\n"; + + open(CURRENT, ">$main::cur_queue") + || die "$0: Unable to open CURRENT $main::cur_queue for writing: $!\n"; + open(QUEUE, ">$main::new_queue") + || die "$0: Unable to open QUEUE $main::new_queue for writing: $!\n"; + + # Prepare CURRENT queue with starting URLs + foreach (@starturls) { + print CURRENT $_->as_string . "|\n"; + } + close CURRENT; + + open(CURRENT, $main::cur_queue) + || die "$0: Unable to open CURRENT $main::cur_queue for reading: $!\n"; + + do { + # Read a line from the queue, and process it + while (defined ($line = <CURRENT>) ) { + chomp($line); + &handle_url($line); + &check_point(); + } + + # Move queues around, and try again, but only if there are still + # things to do + output "*** Moving queues around, " . $stats{'todo'} . 
" links to do."; + close CURRENT + or warn "Error while closing CURRENT filehandle: $!\n"; + close QUEUE; + + # TODO: should check whether these succeed + unlink($main::cur_queue); + rename($main::new_queue, $main::cur_queue); + + open(CURRENT, "$main::cur_queue") + || die "$0: Unable to open $main::cur_queue for reading: $!\n"; + open(QUEUE, ">$main::new_queue") + || die "$0: Unable to open $main::new_queue for writing: $!\n"; + + } while (not -z $main::cur_queue); + + close CURRENT; + close QUEUE; + + unless (defined($main::opt_debug)) { + clean_up(); + } +} + +sub clean_up { + unlink $main::cur_queue, $main::new_queue; + rmdir $main::tmpdir; + output "Removed temporary directory $main::tmpdir and its contents.\n", 1; +} + +sub got_signal { + my ($signalname) = @_; + + clean_up() unless defined $main::opt_debug; + + print STDERR "Caught SIG$signalname.\n"; + exit 1; +} + +# Whether URL is 'internal' or 'external' +sub is_internal ($) { + my ($url) = @_; + + return ( $url =~ /$main::opt_match/o + and not (defined $main::opt_exclude and $url =~ /$main::opt_exclude/o)); +} + + +sub handle_url { + my ($line) = @_; + my ($urlstr, $urlparent) = split(/\|/, $line); + + my $reqtype; + my $response; + my $type; + + $stats{'todo'}--; + + # Add this URL to the ones we've seen already, return if it is a + # duplicate. + return if add_checked($urlstr); + + $stats{'link'}++; + + # Is this an external URL and we only check internal stuff? + return if defined $main::opt_internal_only + and not is_internal($urlstr); + + my $url = URI->new($urlstr); + + # Perhaps this is a URL we are not interested in checking... + if (not defined($url->scheme) + or $url->scheme !~ /^(https?|file|ftp|gopher|nntp)$/o ) { + # Ignore URLs which we know we can ignore, create error for others + if ($url->scheme =~ /^(news|mailto|javascript|mms)$/o) { + output "Ignore $url", 1; + } else { + add_error($urlstr, $urlparent, 904, "Unknown scheme in URL: " + . $url->scheme); + } + return; + } + + # Guess/determine the type of document we might retrieve from this + # URL. We do this because we only want to use a full GET for HTML + # document. No need to retrieve images, etc. + if ($url->path =~ /\/$/o || $url->path eq "") { + $type = 'text/html'; + } else { + $type = guess_media_type($url->path); + } + # application/octet-stream is the fallback of LWP's guess stuff, so + # if we get this then we ask the server what we got just to be sure. 
+ if ($type eq 'application/octet-stream') { + $response = performRequest('HEAD', $url, $urlparent, $type, $main::opt_language); + $type = $response->content_type; + } + + # Determine if this is a URL we should GET fully or partially (using HEAD) + if ($type =~ /html/o + && $url->scheme =~ /^(https?|file|ftp|gopher)$/o + and is_internal($url->as_string) + && (!defined $main::opt_exclude || $url !~ /$main::opt_exclude/o)) { + $reqtype = 'GET'; + } else { + $reqtype = 'HEAD'; + } + + # Get the document, unless we already did while determining the type + $response = performRequest($reqtype, $url, $urlparent, $type, $main::opt_language) + unless defined($response) and $reqtype eq 'HEAD'; + + # Ok, we got something back from checking, let's see what it is + if ($response->is_success) { + select(undef, undef, undef, $main::opt_sleep) + unless $main::opt_debug || $url->scheme eq 'file'; + + # Internal HTML documents need to be given to handle_doc for processing + if ($reqtype eq 'GET' and is_internal($url->as_string)) { + handle_doc($response, $urlstr); + } + } else { + + # Right, so it wasn't the smashing succes we hoped for, so bring + # the bad news and store the pertinent information for later + add_error($url, $urlparent, $response->code, $response->message); + + if ($response->is_redirect and is_internal($url->as_string)) { + if ($response->code == 300) { # multiple choices, but no redirection available + output 'Multiple choices', 2; + } else { + my $baseURI = URI->new($url); + if (defined $response->header('Location')) { + my $redir_url = URI->new_abs($response->header('Location'), $baseURI); + output "Redirected to $redir_url", 2; + add_to_queue($redir_url, $urlparent); + $stats{'todo'}++; + } else { + output 'Location header missing from redirect response', 2; + } + } + } + } + # Done with this URL +} + +sub performRequest { + my ($reqtype, $url, $urlparent, $type, $language) = @_; + + my ($response); + + # A better solution here would be to use GET exclusively. Here is how + # to do that. We would have to set this max_size thing in + # check_external, I guess... + # Set $ua->max_size(1) and then try a normal GET request. However, + # that doesn't always work as evidenced by an FTP server that just + # hangs in this case... Needs more testing to see if the timeout + # catches this. + + # Normally, we would only need to do a HEAD, but given the way LWP + # handles gopher requests, we need to do a GET on those to get at + # least a 500 and 501 error. We would need to parse the document + # returned by LWP to find out if we had problems finding the + # file. -- Patch by Bruce Speyer <bspeyer@texas-one.org> + + # We also need to do GET instead of HEAD if we know the remote + # server won't accept it. The standard way for an HTTP server to + # indicate this is by returning a 405 ("Method Not Allowed") or 501 + # ("Not Implemented"). Other circumstances may also require sending + # GETs instead of HEADs to a server. Details are documented below. + # -- Larry Gilbert <larry@n2h2.com> + + # Normally we try a HEAD request first, then a GET request if + # needed. There may be circumstances in which we skip doing a HEAD + # (e.g. when we should be getting the whole document). + foreach my $try ('HEAD', 'GET') { + + # Skip trying HEAD when we know we need to do a GET or when we + # know only a GET will work anyway. 
+ next if $try eq 'HEAD' and + ($reqtype eq 'GET' + or $url->scheme eq 'gopher' + or (defined $url->authority and $main::servers_get_only{$url->authority})); + + # Output what we are going to do with this link + output(sprintf("%4s %s (%s)\n", $try, $url, $type), 1); + + # Create the request with all appropriate headers + my %header_hash = ( 'Referer' => $urlparent ); + if (defined($language) && ($language ne '')) { + $header_hash{'Accept-Language'} = $language; + } + my $ref_header = new HTTP::Headers(%header_hash); + my $request = new HTTP::Request($try, $url, $ref_header); + $response = $main::ua->simple_request($request); + + # If we are doing a HEAD request we need to make sure nothing + # fishy happened. we use some heuristics to see if we are ok, or + # if we should try again with a GET request. + if ($try eq 'HEAD') { + + # 400, 405, 406 and 501 are standard indications that HEAD + # shouldn't be used + # We used to check for 403 here also, but according to the HTTP spec + # a 403 indicates that the server understood us fine but really does + # not want us to see the page, so we SHOULD NOT retry. + if ($response->code =~ /^(400|405|406|501)$/o) { + output "Server does not seem to like HEAD requests; retrying", 2; + $main::servers_get_only{$url->authority}++; + next; + }; + + # There are many servers out there that have real trouble with + # HEAD, so if we get a 500 Internal Server error just retry with + # a GET request to get an authoritive answer. We used to do this + # only for special cases, but the list got big and some + # combinations (e.g. Zope server behind Apache proxy) can't + # easily be detected from the headers. + if ($response->code =~ /^500$/o) { + output "Internal server error on HEAD request; retrying with GET", 2; + $main::servers_get_only{$url->authority}++ if defined $url->authority; + next; + } + + # If we know the server we can try some specific heuristics + if (defined $response->server) { + + # Netscape Enterprise has been seen returning 500 and even 404 + # (yes, 404!!) in response to HEAD requests + if ($response->server =~ /^Netscape-Enterprise/o + and $response->code =~ /^404$/o) { + output "Unreliable Netscape-Enterprise response to HEAD request; retrying", 2; + $main::servers_get_only{$url->authority}++; + next; + }; + } + + # If a HEAD request resulted in nothing noteworthy, no need for + # any further attempts using GET, we are done. 
+ last; + } + } + + return $response; +} + + +# This routine creates a (temporary) WWW page based on the current +# findings This allows somebody to monitor the process, but is also +# convenient when this program crashes or waits because of diskspace +# or memory problems + +sub create_page { + my($final_page) = @_; + + my $path = ""; + my $prevpath = ""; + my $prevcode = 0; + my $prevmessage = ""; + + output "*** Start writing results page"; + + open(OUT, ">$main::file.new") + || die "$0: Unable to open $main::file.new for writing:\n"; + print OUT "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n"; + print OUT "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n"; + print OUT "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n"; + print OUT "<head>\n"; + if (!$final_page) { + printf OUT "<meta http-equiv=\"Refresh\" content=\"%d\" />\n", + int($main::cp_int * 60 / 2 - 5); + } + + print OUT "<title>Checkbot report</title>\n"; + print OUT "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style; + print OUT "</head>\n"; + print OUT "<body>\n"; + print OUT "<h1><em>Checkbot</em>: main report</h1>\n"; + + # Show the status of this checkbot session + print OUT "<table summary=\"Status of this Checkbot session\" class='status'><tr><th>Status:</th><td>"; + if ($final_page) { + print OUT "Done.<br />\n"; + print OUT 'Run started on ' . localtime($start_time) . ".<br />\n"; + print OUT 'Run duration ', duration(time() - $start_time), ".\n" + if $main::useduration; + } else { + print OUT "Running since " . localtime($start_time) . ".<br />\n"; + print OUT "Last update at ". localtime() . ".<br />\n"; + print OUT "Next update in <strong>", int($main::cp_int), "</strong> minutes.\n"; + } + print OUT "</td></tr></table>\n\n"; + + # Summary (very brief overview of key statistics) + print OUT "<hr /><h2 class='summary'>Report summary</h2>\n"; + + print OUT "<table summary=\"Report summary\" class='summary'>\n"; + print OUT "<tr id='checked'><th>Links checked</th><td class='value'>", $stats{'link'}, "</td></tr>\n"; + print OUT "<tr id='problems'><th>Problems so far</th><td class='value'>", $stats{'problem'}, "</td></tr>\n"; + print OUT "<tr id='todo'><th>Links to do</th><td class='value'>", $stats{'todo'}, "</td></tr>\n"; + print OUT "</table>\n"; + + # Server information + printAllServers($final_page); + + # Checkbot session parameters + print OUT "<hr /><h2 class='params'>Checkbot session parameters</h2>\n"; + print OUT "<table summary=\"Checkbot session parameters\" class='params'>\n"; + print OUT "<tr><th align=\"left\">--url &<br/> <command line urls></th><td class='text'>Start URL(s)</td><td class='value' id='url'>", + join(',', @starturls), "</td></tr>\n"; + print OUT "<tr><th align=\"left\">--match</th><td class='text'>Match regular expression</td><td class='value' id='match'>$main::opt_match</td></tr>\n"; + print OUT "<tr><th align=\"left\">--exclude</th><td class='text'>Exclude regular expression</td><td class='value' id='exclude'>$main::opt_exclude</td></tr>\n" if defined $main::opt_exclude; + print OUT "<tr><th align=\"left\">--filter</th><td class='text'>Filter regular expression</td><td class='value' id='filter'>$main::opt_filter</td></tr>\n" if defined $main::opt_filter; + print OUT "<tr><th align=\"left\">--noproxy</th><td class='text'>No Proxy for the following domains</td><td class='value' id='noproxy'>$main::opt_noproxy</td></tr>\n" if defined $main::opt_noproxy; + print 
OUT "<tr><th align=\"left\">--ignore</th><td class='text'>Ignore regular expression</td><td class='value' id='ignore'>$main::opt_ignore</td></tr>\n" if defined $main::opt_ignore; + print OUT "<tr><th align=\"left\">--suppress</th><td class='text'>Suppress error code and URL specified by</td><td class='value' id='suppress'>$main::opt_suppress</td></tr>\n" if defined $main::opt_suppress; + print OUT "<tr><th align=\"left\">--dontwarn</th><td class='text'>Don't warn for these codes</td><td class='value' id='dontwarn'>$main::opt_dontwarn</td></tr>\n" if $main::opt_dontwarn ne 'xxx'; + print OUT "<tr><th align=\"left\">--enable-virtual</th><td class='text'>Use virtual names only</td><td class='value' id='enable_virtual'>yes</td></tr>\n" if $main::opt_enable_virtual; + print OUT "<tr><th align=\"left\">--internal-only</th><td class='text'>Check only internal links</td><td class='value' id='internal_only'>yes</td></tr>\n" if defined $main::opt_internal_only; + print OUT "<tr><th align=\"left\">--cookies</th><td class='text'>Accept cookies</td><td class='value' id='cookies'>yes</td></tr>\n" if defined $main::opt_cookies; + print OUT "<tr><th align=\"left\">--sleep</th><td class='text'>Sleep seconds between requests</td><td class='value' id='sleep'>$main::opt_sleep</td></tr>\n" if ($main::opt_sleep != 0); + print OUT "<tr><th align=\"left\">--timeout</th><td class='text'>Request timeout seconds</td><td class='value' id='timeout'>$main::opt_timeout</td></tr>\n"; + print OUT "</table>\n"; + + # Statistics for types of links + + print OUT signature(); + + close(OUT); + + rename($main::file, $main::file . ".bak"); + rename($main::file . ".new", $main::file); + + unlink $main::file . ".bak" unless $main::opt_debug; + + output "*** Done writing result page"; +} + +# Create a list of all the servers, and create the corresponding table +# and subpages. We use the servers overview for this. This can result +# in strange effects when the same server (e.g. IP address) has +# several names, because several entries will appear. However, when +# using the IP address there are also a number of tricky situations, +# e.g. with virtual hosting. Given that likely the servers have +# different names for a reasons, I think it is better to have +# duplicate entries in some cases, instead of working off of the IP +# addresses. + +sub printAllServers { + my ($finalPage) = @_; + + my $server; + print OUT "<hr /><h2 class='overview'>Overview per server</h2>\n"; + print OUT "<table summary=\"Overview per server\" class='overview'><tr><th>Server</th><th>Server<br />Type</th><th>Documents<br />scanned</th><th>Problem<br />links</th><th>Ratio</th></tr>\n"; + + foreach $server (sort keys %main::servers) { + print_server($server, $finalPage); + } + print OUT "</table>\n\n"; +} + +sub get_server_type { + my($server) = @_; + + my $result; + + if ( ! defined($main::server_type{$server})) { + if ($server eq 'localhost') { + $result = 'Direct access through filesystem'; + } else { + my $request = new HTTP::Request('HEAD', "http://$server/"); + my $response = $main::ua->simple_request($request); + $result = $response->header('Server'); + } + $result = "Unknown server type" if ! defined $result or $result eq ""; + output "=== Server $server is a $result"; + $main::server_type{$server} = $result; + } + $main::server_type{$server}; +} + +sub add_checked { + my($urlstr) = @_; + my $item; + my $result = 0; + + if (is_internal($urlstr) and not $main::opt_enable_virtual) { + # Substitute hostname with IP-address. 
This keeps us from checking + # the same pages for each name of the server, wasting time & resources. + # Only do this if we are not dealing with virtual servers. Also, we + # only do this for internal servers, because it makes no sense for + # external links. + my $url = URI->new($urlstr); + $url->host(ip_address($url->host)) if $url->can('host'); + $urlstr = $url->as_string; + } + + if (defined $main::checked{$urlstr}) { + $result = 1; + $main::checked{$urlstr}++; + } else { + $main::checked{$urlstr} = 1; + } + + return $result; +} + +# Has this URL already been checked? +sub is_checked { + my ($urlstr) = @_; + + if (is_internal($urlstr) and not $main::opt_enable_virtual) { + # Substitute hostname with IP-address. This keeps us from checking + # the same pages for each name of the server, wasting time & resources. + # Only do this if we are not dealing with virtual servers. Also, we + # only do this for internal servers, because it makes no sense for + # external links. + my $url = URI->new($urlstr); + $url->host(ip_address($url->host)) if $url->can('host'); + $urlstr = $url->as_string; + } + + return defined $main::checked{$urlstr}; +} + +sub add_error ($$$$) { + my ($url, $urlparent, $code, $status) = @_; + + # Check for the quick eliminations first + return if $code =~ /$main::opt_dontwarn/o + or defined $suppression{$code}{$url}; + + # Check for matches on the regular expressions in the supression file + if (defined $suppression{$code}) { + foreach my $item ( %{$suppression{$code}} ) { + if ($item =~ /^\/(.*)\/$/) { + my $regexp = $1; + if ($url =~ $regexp) { + output "Supressing error $code for $url due to regular expression match on $regexp", 2; + return; + } + } + } + } + + $status = checkbot_status_message($code) if not defined $status; + + output "$code $status", 2; + + $url_error{$url}{'code'} = $code; + $url_error{$url}{'status'} = $status; + push @{$url_parent{$url}}, $urlparent; + $stats{'problem'}++; +} + +# Parse document, and get the links +sub handle_doc { + my ($response, $urlstr) = @_; + + my $num_links = 0; + my $new_links = 0; + + # TODO: we are making an assumption here that the $reponse->base is + # valid, which might not always be true! This needs to be fixed, but + # first let's try to find out why this stuff is sometimes not + # valid... Aha. a simple <base href="news:"> will do the trick. It is + # not clear what the right fix for this is. + + # We use the URL we used to retrieve this document as the URL to + # attach the problem reports to, even though this may not be the + # proper base url. + my $baseurl = URI->new($urlstr); + + # When we received the document we can add a notch to its server + $main::servers{$baseurl->authority}++; + + # Retrieve useful information from this document. + # TODO: using a regexp is NOT how this should be done, but it is + # easy. The right way would be to write a HTML::Parser or to use + # XPath on the document DOM provided that the document is easily + # parsed as XML. Either method is a lot of overhead. + if ($response->content =~ /title\>(.*?)\<\/title/si) { + + # TODO: using a general hash that stores titles for all pages may + # consume too much memory. It would be better to only store the + # titles for requests that had problems. That requires passing them + # down to the queue. Take the easy way out for now. + $url_title{$baseurl} = $1; + } + + # Check if this document has a Robots META tag. If so, check if + # Checkbot is allowed to FOLLOW the links on this page. 
Note that we + # ignore the INDEX directive because Checkbot is not an indexing + # robot. See http://www.robotstxt.org/wc/meta-user.html + # TODO: one more reason (see title) to properly parse this document... + if ($response->content =~ /\<meta[^\>]*?robots[^\>]*?nofollow[^\>]*?\>/si) { + output "Obeying robots meta tag $&, skipping document", 2; + return; + } + + + # Parse the document just downloaded, using the base url as defined + # in the response, otherwise we won't get the same behavior as + # browsers and miss things like a BASE url in pages. + my $p = HTML::LinkExtor->new(undef, $response->base); + + # If charset information is missing then decoded_content doesn't + # work. Fall back to content in this case, even though that may lead + # to charset warnings. See bug 1665075 for reference. + my $content = $response->decoded_content || $response->content; + $p->parse($content); + $p->eof; + + # Deal with the links we found in this document + my @links = $p->links(); + foreach (@links) { + my ($tag, %l) = @{$_}; + foreach (keys %l) { + # Get the canonical URL, so we don't need to worry about base, case, etc. + my $url = $l{$_}->canonical; + + # Remove fragments, if any + $url->fragment(undef); + + # Determine in which tag this URL was found + # Ignore <base> tags because they need not point to a valid URL + # in order to work (e.g. when directory indexing is turned off). + next if $tag eq 'base'; + + # Skip some 'links' that are not required to link to an actual + # live link but which LinkExtor returns as links anyway. + next if $tag eq 'applet' and $_ eq 'code'; + next if $tag eq 'object' and $_ eq 'classid'; + + # Run filter on the URL if defined + if (defined $main::opt_filter) { + die "Filter supplied with --filter option contains errors!\n$@\n" + unless defined eval '$url =~ s' . $main::opt_filter + } + + # Should we ignore this URL? + if (defined $main::opt_ignore and $url =~ /$main::opt_ignore/o) { + output "--ignore: $url", 1; + next; + } + + # Check whether URL has fully-qualified hostname + if ($url->can('host') and $url->scheme ne 'news') { + if (! defined $url->host) { + add_error($url, $baseurl->as_string, '901', + $checkbot_errors{'901'}); + } elsif (!$main::opt_allow_simple_hosts && $url->host !~ /\./) { + add_error($url, $baseurl->as_string, '902', + $checkbot_errors{'902'}); + } + } + + # Some servers do not process // correctly in requests for relative + # URLs. We should flag them here. Note that // in a URL path is + # actually valid per RFC 2396, and that they should not be removed + # when processing relative URLs as per RFC 1808. See + # e.g. <http://deesse.univ-lemans.fr:8003/Connected/RFC/1808/18.html>. + # Thanks to Randal Schwartz and Reinier Post for their explanations. + if ($url =~ /^http:\/\/.*\/\//) { + add_error($url, $baseurl->as_string, '903', + $checkbot_errors{'903'}); + } + + # We add all URLs found to the queue, unless we already checked + # it earlier + if (is_checked($url)) { + + # If an error has already been logged for this URL we add the + # current parent to the list of parents on which this URL + # appears. + if (defined $url_error{$url}) { + push @{$url_parent{$url}}, $baseurl->as_string; + $stats{'problem'}++; + } + + $stats{'link'}++; + } else { + add_to_queue($url, $baseurl); + $stats{'todo'}++; + $new_links++; + } + $num_links++; + } + } + output "Got $num_links links ($new_links new) from document", 2; +} + + +sub add_to_queue { + my ($url, $parent) = @_; + + print QUEUE $url . '|' . $parent . 
"\n"; +} + +sub checkbot_status_message ($) { + my ($code) = @_; + + my $result = status_message($code) || $checkbot_errors{$code} + || '(Undefined status)'; +} + +sub print_server ($$) { + my($server, $final_page) = @_; + + my $host = $server; + $host =~ s/(.*):\d+/$1/; + + output "Writing server $server (really " . ip_address($host) . ")", 1; + + my $server_problem = count_problems($server); + my $filename = "$main::server_prefix-$server.html"; + $filename =~ s/:/-/o; + + print OUT "<tr><td class='server'>"; + print OUT "<a href=\"@{[ (fileparse($filename))[0] ]}\">" if $server_problem > 0; + print OUT "$server"; + print OUT "</a>" if $server_problem > 0; + print OUT "</td>"; + print OUT "<td class='servertype'>" . get_server_type($server) . "</td>"; + printf OUT "<td class='unique' align=\"right\">%d</td>", + $main::servers{$server} + $server_problem; + if ($server_problem) { + printf OUT "<td class='problems' id='oops' align=\"right\">%d</td>", + $server_problem; + } else { + printf OUT "<td class='problems' id='zero_defects' align=\"right\">%d</td>", + $server_problem; + } + + my $ratio = $server_problem / ($main::servers{$server} + $server_problem) * 100; + print OUT "<td class='ratio' align=\"right\">"; + print OUT "<strong>" unless $ratio < 0.5; + printf OUT "%4d%%", $ratio; + print OUT "</strong>" unless $ratio < 0.5; + print OUT "</td>"; + print OUT "</tr>\n"; + + # Create this server file + open(SERVER, ">$filename") + || die "Unable to open server file $filename for writing: $!"; + print SERVER "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n"; + print SERVER "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n"; + print SERVER "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n"; + print SERVER "<head>\n"; + if (!$final_page) { + printf SERVER "<meta http-equiv=\"Refresh\" content=\"%d\" />\n", + int($main::cp_int * 60 / 2 - 5); + } + print SERVER "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style; + print SERVER "<title>Checkbot: output for server $server</title></head>\n"; + print SERVER "<body><h2><em>Checkbot</em>: report for server <tt>$server</tt></h2>\n"; + print SERVER "<p>Go To: <a href=\"@{[ (fileparse($main::file))[0] ]}\">Main report page</a>"; + + printServerProblems($server, $final_page); + + print SERVER "\n"; + print SERVER signature(); + + close SERVER; +} + +# Return a string containing Checkbot's signature for HTML pages +sub signature { + return "<hr />\n<p class='signature'>Page created by <a href=\"http://degraaff.org/checkbot/\">Checkbot $VERSION</a> on <em>" . localtime() . "</em>.</p>\n". + "<p><a href=\"http://validator.w3.org/check/?uri=referer\"><img src=\"http://www.w3.org/Icons/valid-xhtml11\" alt=\"Valid XHTML 1.1\" height=\"31\" width=\"88\" /></a></p>". + "</body></html>"; +} + +# Loop through all possible problems, select relevant ones for this server +# and display them in a meaningful way. 
+sub printServerProblems ($$) { + my ($server, $final_page) = @_; + $server = quotemeta $server; + + my $separator = "<hr />\n"; + + my %thisServerList = (); + + # First we find all the problems for this particular server + foreach my $url (keys %url_parent) { + foreach my $parent (@{$url_parent{$url}}) { + next if $parent !~ $server; + chomp $parent; + $thisServerList{$url_error{$url}{'code'}}{$parent}{$url} + = $url_error{$url}{'status'}; + } + } + + # Do a run to find all error codes on this page, and include a table + # of contents to the actual report + foreach my $code (sort keys %thisServerList) { + print SERVER ", <a href=\"#rc$code\">$code "; + print SERVER checkbot_status_message($code); + print SERVER "</a>"; + } + print SERVER ".</p>\n"; + + + # Now run through this list and print the errors + foreach my $code (sort keys %thisServerList) { + my $codeOut = ''; + + foreach my $parent (sort keys %{ $thisServerList{$code} }) { + my $urlOut = ''; + foreach my $url (sort keys %{ $thisServerList{$code}{$parent} }) { + my $status = $thisServerList{$code}{$parent}{$url}; + $urlOut .= "<li><a href=\"$url\">$url</a><br/>\n"; + $urlOut .= "$status" + if defined $status and $status ne checkbot_status_message($code); + $urlOut .= "</li>\n"; + } + if ($urlOut ne '') { + $codeOut .= "<dt><a href=\"$parent\">$parent</a>"; + $codeOut .= "<br />$url_title{$parent}\n" if defined $url_title{$parent}; + $codeOut .= "<dd><ul>\n$urlOut\n</ul>\n\n"; + } + } + + if ($codeOut ne '') { + print SERVER $separator if $separator; + $separator = ''; + print SERVER "<h4 id=\"rc$code\">$code "; + print SERVER checkbot_status_message($code); + print SERVER "</h4>\n<dl>\n$codeOut\n</dl>\n"; + } + } +} + +sub check_point { + if ( ($main::cp_last + 60 * $main::cp_int < time()) + || ($main::opt_debug && $main::opt_verbose)) { + &create_page(0); + $main::cp_last = time(); + # Increase the intervall from one snapshot to the next by 25% + # until we have reached the maximum. + $main::cp_int *= 1.25 unless $main::opt_debug; + $main::cp_int = $main::opt_interval if $main::cp_int > $main::opt_interval; + } +} + +sub send_mail { + my $msg = new Mail::Send; + my $sub = 'Checkbot results for '; + $sub .= join(', ', @starturls); + $sub .= ': ' . $stats{'problem'} . ' errors'; + + $msg->to($main::opt_mailto); + $msg->subject($sub); + + my $fh = $msg->open; + + print $fh "Checkbot results for:\n " . join("\n ", @starturls) . "\n\n"; + print $fh "User-supplied note: $main::opt_note\n\n" + if defined $main::opt_note; + + print $fh $stats{'link'}, " links were checked, and "; + print $fh $stats{'problem'}, " problems were detected.\n"; + + print $fh 'Run started on ' . localtime($start_time) . "\n"; + print $fh 'Run duration ', duration(time() - $start_time), "\n" + if $main::useduration; + + + print $fh "\n-- \nCheckbot $VERSION\n"; + print $fh "<URL:http://degraaff.org/checkbot/>\n"; + + $fh->close; +} + +sub print_help { + print <<"__EOT__"; +Checkbot $VERSION command line options: + + --cookies Accept cookies from the server + --debug Debugging mode: No pauses, stop after 25 links. + --file file Use file as basis for output file names. + --help Provide this message. + --mailto address Mail brief synopsis to address when done. + --noproxy domains Do not proxy requests to given domains. + --verbose Verbose mode: display many messages about progress. 
+ --url url Start URL + --match match Check pages only if URL matches `match' + If no match is given, the start URL is used as a match + --exclude exclude Exclude pages if the URL matches 'exclude' + --filter regexp Run regexp on each URL found + --ignore ignore Ignore URLs matching 'ignore' + --suppress file Use contents of 'file' to suppress errors in output + --note note Include Note (e.g. URL to report) along with Mail message. + --proxy URL URL of proxy server for HTTP and FTP requests. + --internal-only Only check internal links, skip checking external links. + --sleep seconds Sleep this many seconds between requests (default 0) + --style url Reference the style sheet at this URL. + --timeout seconds Timeout for http requests in seconds (default 120) + --interval seconds Maximum time interval between updates (default 10800) + --dontwarn codes Do not write warnings for these HTTP response codes + --enable-virtual Use only virtual names, not IP numbers for servers + --language Specify 2-letter language code for language negotiation + +Options --match, --exclude, and --ignore can take a perl regular expression +as their argument\n +Use 'perldoc checkbot' for more verbose documentation. +Checkbot WWW page : http://degraaff.org/checkbot/ +Mail bugs and problems: checkbot\@degraaff.org +__EOT__ + + exit 0; +} + +sub ip_address { + my($host) = @_; + + return $main::ip_cache{$host} if defined $main::ip_cache{$host}; + + my($name,$aliases,$adrtype,$length,@addrs) = gethostbyname($host); + if (defined $addrs[0]) { + my($n1,$n2,$n3,$n4) = unpack ('C4',$addrs[0]); + $main::ip_cache{$host} = "$n1.$n2.$n3.$n4"; + } else { + # Whee! No IP-address found for this host. Just keep whatever we + # got for the host. If this really is some kind of error it will + # be found later on. + $main::ip_cache{$host} = $host; + } +} + +sub count_problems { + my ($server) = @_; + $server = quotemeta $server; + my $count = 0; + + foreach my $url (sort keys %url_parent) { + foreach my $parent (@{ $url_parent{$url} }) { + $count++ if $parent =~ m/$server/; + } + } + return $count; +} +
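The login handshake this patch adds to check_links() can be reproduced on its own with plain LWP. The sketch below is a minimal illustration and is not part of the patch: the host name and credentials are placeholders, while the /accounts/login endpoint and the user_login/user_password form fields are the ones the patched code posts. As with running checkbot with --cookies, a cookie jar must be enabled so the session cookie from the first request is kept.

    #!/usr/bin/perl
    # Minimal sketch of the Webconsole login flow added by this patch.
    # Host and credentials are placeholders, not values from the patch.
    use strict;
    use warnings;
    use LWP::UserAgent;

    my $base     = 'http://webconsole.example.com';  # stands in for --url
    my $username = 'someuser';                       # stands in for --username
    my $password = 'secret';                         # stands in for --password

    # Equivalent of checkbot's --cookies option: keep the session cookie.
    my $ua = LWP::UserAgent->new;
    $ua->cookie_jar({});

    # First request: the Webconsole sets its session cookie here
    # (the patch does this by calling handle_url on the start URL).
    $ua->get($base);

    # Log in with the same form fields the patch sends.
    my $login = $ua->post("$base/accounts/login",
                          { user_login    => $username,
                            user_password => $password });

    if ($login->is_success || $login->is_redirect) {
        print "Login succeeded\n";
    } else {
        print STDERR "Login failed: ", $login->status_line, "\n";
    }

With the patched checkbot itself the same flow is driven entirely from the command line, as in the example the patch adds to print_help: checkbot --cookies --url=http://webconsole.suse.de --username=schubi --password=system --ignore=http://webconsole.suse.de/accounts/logout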