1 |
#!perl |
2 |
use strict; |
3 |
use warnings; |
4 |
use autodie; |
5 |
use feature qw(say); |
6 |
require File::Find::Rule; |
7 |
require File::Slurp; |
8 |
require File::Spec; |
9 |
require IO::Socket::SSL; |
10 |
use List::Util qw(sum); |
11 |
require LWP::UserAgent; |
12 |
require Net::FTP; |
13 |
require Parallel::Fork::BossWorkerAsync; |
14 |
require Term::ProgressBar::Simple; |
15 |
require URI::Find::Simple; |
16 |
$| = 1; |
17 |
|
18 |
my %ignore; |
19 |
while ( my $line = <main::DATA> ) { |
20 |
chomp $line; |
21 |
next if $line =~ /^#/; |
22 |
next unless $line; |
23 |
$ignore{$line} = 1; |
24 |
} |
25 |
|
26 |
my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 }); |
27 |
$ua->timeout(58); |
28 |
$ua->env_proxy; |
29 |
|
30 |
my @filenames = @ARGV; |
31 |
@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.') |
32 |
unless @filenames; |
33 |
|
34 |
my $total_bytes = sum map {-s} @filenames; |
35 |
|
36 |
my $extract_progress = Term::ProgressBar::Simple->new( |
37 |
{ count => $total_bytes, |
38 |
name => 'Extracting URIs', |
39 |
} |
40 |
); |
41 |
|
42 |
my %uris; |
43 |
foreach my $filename (@filenames) { |
44 |
next if $filename =~ /uris\.txt/; |
45 |
next if $filename =~ /check_uris/; |
46 |
next if $filename =~ /\.patch$/; |
47 |
next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod'; |
48 |
next if $filename =~ /checkURL\.pl$/; |
49 |
my $contents = File::Slurp::read_file($filename); |
50 |
my @uris = URI::Find::Simple::list_uris($contents); |
51 |
foreach my $uri (@uris) { |
52 |
next unless $uri =~ /^(http|ftp)/; |
53 |
next if $ignore{$uri}; |
54 |
|
55 |
# no need to hit rt.perl.org |
56 |
next |
57 |
if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$}; |
58 |
|
59 |
# no need to hit rt.cpan.org |
60 |
next |
61 |
if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$}; |
62 |
|
63 |
# no need to hit google groups (weird redirect LWP does not like) |
64 |
next |
65 |
if $uri =~ m{^http://groups\.google\.com/}; |
66 |
|
67 |
push @{ $uris{$uri} }, $filename; |
68 |
} |
69 |
$extract_progress += -s $filename; |
70 |
} |
71 |
|
72 |
my $bw = Parallel::Fork::BossWorkerAsync->new( |
73 |
work_handler => \&work_alarmed, |
74 |
global_timeout => 120, |
75 |
worker_count => 20, |
76 |
); |
77 |
|
78 |
foreach my $uri ( keys %uris ) { |
79 |
my @filenames = @{ $uris{$uri} }; |
80 |
$bw->add_work( { uri => $uri, filenames => \@filenames } ); |
81 |
} |
82 |
|
83 |
undef $extract_progress; |
84 |
|
85 |
my $fetch_progress = Term::ProgressBar::Simple->new( |
86 |
{ count => scalar( keys %uris ), |
87 |
name => 'Fetching URIs', |
88 |
} |
89 |
); |
90 |
|
91 |
my %filenames; |
92 |
while ( $bw->pending() ) { |
93 |
my $response = $bw->get_result(); |
94 |
my $uri = $response->{uri}; |
95 |
my @filenames = @{ $response->{filenames} }; |
96 |
my $is_success = $response->{is_success}; |
97 |
my $message = $response->{message}; |
98 |
|
99 |
unless ($is_success) { |
100 |
foreach my $filename (@filenames) { |
101 |
push @{ $filenames{$filename} }, |
102 |
{ uri => $uri, message => $message }; |
103 |
} |
104 |
} |
105 |
$fetch_progress++; |
106 |
} |
107 |
$bw->shut_down(); |
108 |
|
109 |
my $fh = IO::File->new('> uris.txt'); |
110 |
foreach my $filename ( sort keys %filenames ) { |
111 |
$fh->say("* $filename"); |
112 |
my @bits = @{ $filenames{$filename} }; |
113 |
foreach my $bit (@bits) { |
114 |
my $uri = $bit->{uri}; |
115 |
my $message = $bit->{message}; |
116 |
$fh->say(" $uri"); |
117 |
$fh->say(" $message"); |
118 |
} |
119 |
} |
120 |
$fh->close; |
121 |
|
122 |
say 'Finished, see uris.txt'; |
123 |
|
124 |
sub work_alarmed { |
125 |
my $conf = shift; |
126 |
eval { |
127 |
local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required |
128 |
alarm 60; |
129 |
$conf = work($conf); |
130 |
alarm 0; |
131 |
}; |
132 |
if ($@) { |
133 |
$conf->{is_success} = 0; |
134 |
$conf->{message} = 'Timed out'; |
135 |
|
136 |
} |
137 |
return $conf; |
138 |
} |
139 |
|
140 |
sub work { |
141 |
my $conf = shift; |
142 |
my $uri = $conf->{uri}; |
143 |
my @filenames = @{ $conf->{filenames} }; |
144 |
|
145 |
if ( $uri =~ /^http/ ) { |
146 |
my $uri_without_fragment = URI->new($uri); |
147 |
my $fragment = $uri_without_fragment->fragment(undef); |
148 |
my $response = $ua->head($uri_without_fragment); |
149 |
|
150 |
$conf->{is_success} = $response->is_success; |
151 |
$conf->{message} = $response->status_line; |
152 |
return $conf; |
153 |
} else { |
154 |
|
155 |
my $uri_object = URI->new($uri); |
156 |
my $host = $uri_object->host; |
157 |
my $path = $uri_object->path; |
158 |
my ( $volume, $directories, $filename ) |
159 |
= File::Spec->splitpath($path); |
160 |
|
161 |
my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 ); |
162 |
unless ($ftp) { |
163 |
$conf->{is_succcess} = 0; |
164 |
$conf->{message} = "Can not connect to $host: $@"; |
165 |
return $conf; |
166 |
} |
167 |
|
168 |
my $can_login = $ftp->login( "anonymous", '-anonymous@' ); |
169 |
unless ($can_login) { |
170 |
$conf->{is_success} = 0; |
171 |
$conf->{message} = "Can not login ", $ftp->message; |
172 |
return $conf; |
173 |
} |
174 |
|
175 |
my $can_binary = $ftp->binary(); |
176 |
unless ($can_binary) { |
177 |
$conf->{is_success} = 0; |
178 |
$conf->{message} = "Can not binary ", $ftp->message; |
179 |
return $conf; |
180 |
} |
181 |
|
182 |
my $can_cwd = $ftp->cwd($directories); |
183 |
unless ($can_cwd) { |
184 |
$conf->{is_success} = 0; |
185 |
$conf->{message} = "Can not cwd to $directories ", $ftp->message; |
186 |
return $conf; |
187 |
} |
188 |
|
189 |
if ($filename) { |
190 |
my $can_size = $ftp->size($filename); |
191 |
unless ($can_size) { |
192 |
$conf->{is_success} = 0; |
193 |
$conf->{message} |
194 |
= "Can not size $filename in $directories", |
195 |
$ftp->message; |
196 |
return $conf; |
197 |
} |
198 |
} else { |
199 |
my ($can_dir) = $ftp->dir; |
200 |
unless ($can_dir) { |
201 |
my ($can_ls) = $ftp->ls; |
202 |
unless ($can_ls) { |
203 |
$conf->{is_success} = 0; |
204 |
$conf->{message} |
205 |
= "Can not dir or ls in $directories ", |
206 |
$ftp->message; |
207 |
return $conf; |
208 |
} |
209 |
} |
210 |
} |
211 |
|
212 |
$conf->{is_success} = 1; |
213 |
return $conf; |
214 |
} |
215 |
} |
216 |
|
217 |
__DATA__ |
218 |
# these are fine but give errors |
219 |
ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html |
220 |
ftp://ftp.stratus.com/pub/vos/utility/utility.html |
221 |
|
222 |
# these are missing, sigh |
223 |
ftp://ftp.sco.com/SLS/ptf7051e.Z |
224 |
http://perlmonks.thepen.com/42898.html |
225 |
http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/ |
226 |
http://public.activestate.com/cgi-bin/perlbrowse |
227 |
http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog |
228 |
http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631 |
229 |
http://my.smithmicro.com/mac/stuffit/ |
230 |
http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html |
231 |
http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html |
232 |
|
233 |
# these are URI extraction bugs |
234 |
http://www.perl.org/E |
235 |
http://en.wikipedia.org/wiki/SREC_(file_format |
236 |
http://somewhere.else',-type=/ |
237 |
ftp:passive-mode |
238 |
ftp: |
239 |
http:[- |
240 |
http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell |
241 |
http://www.xray.mpe.mpg.de/mailing-lists/perl5- |
242 |
http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP: |
243 |
http://perl.come/ |
244 |
http://www.perl.come/ |
245 |
|
246 |
# these are used as an example |
247 |
http://example.com/ |
248 |
http://something.here/ |
249 |
http://users.perl5.git.perl.org/~yourlogin/ |
250 |
http://github.com/USERNAME/perl/tree/orange |
251 |
http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi |
252 |
http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar |
253 |
http://somewhere.else$/ |
254 |
http://somewhere.else$/ |
255 |
http://somewhere.else/bin/foo&bar',-Type= |
256 |
http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi |
257 |
http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar |
258 |
http://www.perl.org/test.cgi |
259 |
http://cpan2.local/ |
260 |
http://search.cpan.org/perldoc? |
261 |
http://cpan1.local/ |
262 |
http://cpan.dev.local/CPAN |
263 |
http:/// |
264 |
ftp:// |
265 |
ftp://myurl/ |
266 |
ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff |
267 |
http://www14.software.ibm.com/webapp/download/downloadaz.jsp |
268 |
http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz |
269 |
http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT |
270 |
http://localhost/tmp/index.txt |
271 |
http://example.com/foo/bar.html |
272 |
http://example.com/Text-Bastardize-1.06.tar.gz |
273 |
ftp://example.com/sources/packages.txt |
274 |
http://example.com/sources/packages.txt |
275 |
http://example.com/sources |
276 |
ftp://example.com/sources |
277 |
http://some.where.com/dir/file.txt |
278 |
http://some.where.com/dir/a.txt |
279 |
http://foo.com/X.tgz |
280 |
ftp://foo.com/X.tgz |
281 |
http://foo/ |
282 |
http://www.foo.com:8000/ |
283 |
http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args |
284 |
http://decoded/mirror/path |
285 |
http://a/b/c/d/e/f/g/h/i/j |
286 |
http://foo/bar.gz |
287 |
ftp://ftp.perl.org |
288 |
http://purl.org/rss/1.0/modules/taxonomy/ |
289 |
ftp://ftp.sun.ac.za/CPAN/CPAN/ |
290 |
ftp://ftp.cpan.org/pub/mirror/index.txt |
291 |
ftp://cpan.org/pub/mirror/index.txt |
292 |
http://example.com/~eh/ |
293 |
http://plagger.org/.../rss |
294 |
http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz |
295 |
http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz |
296 |
http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz |
297 |
http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz |
298 |
http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz |
299 |
http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip |
300 |
http://module-build.sourceforge.net/META-spec-new.html |
301 |
http://module-build.sourceforge.net/META-spec-v1.4.html |
302 |
http://www.cs.vu.nl/~tmgil/vi.html |
303 |
http://perlcomposer.sourceforge.net/vperl.html |
304 |
http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep |
305 |
http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html |
306 |
http://world.std.com/~aep/ptkdb/ |
307 |
http://www.castlelink.co.uk/object_system/ |
308 |
http://www.fh-wedel.de/elvis/ |
309 |
ftp://ftp.blarg.net/users/amol/zsh/ |
310 |
ftp://ftp.funet.fi/pub/languages/perl/CPAN |
311 |
http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip |
312 |
http://users.perl5.git.perl.org/~USERNAME |
313 |
http://foo/x//y/script.cgi/a//b |
314 |
http://xxx/script.cgi/http://foo |
315 |
http://foo/./x//z/script.cgi/a/../b//c |
316 |
http://somewhere.else/in/movie/land |
317 |
http://somewhere.else/finished.html |
318 |
http://somewhere.else/bin/foo&bar$ |
319 |
http://somewhere.else/ |
320 |
http://proxy:8484/ |
321 |
http://proxy/ |
322 |
http://myrepo.example.com/ |
323 |
http://remote/source |
324 |
https://example.com/ |
325 |
http://example.com:1024/ |
326 |
http:///path?foo=bar |
327 |
http://[::]:1024/ |
328 |
http://([/ |
329 |
http://example.com:9000/index.html |
330 |
http://proxy.example.com:8080/ |
331 |
http:///index.html |
332 |
http://[www.json::pp.org]/ |
333 |
http://localhost/ |
334 |
http://foo.example.com/ |
335 |
http://abc.com/a.js |
336 |
http://whatever/man/1/crontab |
337 |
http://abc.com/c.js |
338 |
http://whatever/Foo%3A%3ABar |
339 |
http://abc.com/b.js |
340 |
http://remote.server.com/jquery.css |
341 |
http://some.other.com/page.html |
342 |
https://text.com/1/2 |
343 |
https://text.com/1/2 |
344 |
http://link.included.here?o=1&p=2 |
345 |
http://link.included.here?o=1&p=2 |
346 |
http://link.included.here?o=1&p=2 |
347 |
http://link.included.here/ |
348 |
http://foo/x//y/script.cgi/a//b |
349 |
http://xxx/script.cgi/http://foo |
350 |
http://foo/./x//z/script.cgi/a/../b//c |
351 |
http://somewhere.else/in/movie/land |
352 |
http://somewhere.else/finished.html |
353 |
http://webproxy:3128/ |
354 |
http://www/ |
355 |
|
356 |
# these are used to generate or match URLs |
357 |
http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist |
358 |
http://www.cpantesters.org/show/%s.yaml |
359 |
ftp://(.*?)/(.*)/(.* |
360 |
ftp://(.*?)/(.*)/(.* |
361 |
ftp://(.*?)/(.*)/(.* |
362 |
ftp://ftp.foo.bar/ |
363 |
http://$host/ |
364 |
http://wwwe%3C46/ |
365 |
ftp:/ |
366 |
http://$addr/mark?commit=$ |
367 |
http://search.cpan.org/~ |
368 |
http:/ |
369 |
ftp:%5Cn$url |
370 |
http://www.ietf.org/rfc/rfc$2.txt |
371 |
http://search.cpan.org/~ |
372 |
ftp:%5Cn$url |
373 |
|
374 |
# weird redirects that LWP doesn't like |
375 |
http://www.theperlreview.com/community_calendar |
376 |
http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL |
377 |
http://sunsolve.sun.com |
378 |
|
379 |
# broken webserver that doesn't like HEAD requests |
380 |
http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view |
381 |
http://www.w3.org/TR/html4/loose.dtd |
382 |
|
383 |
# these have been reported upstream to CPAN authors |
384 |
http://www.gnu.org/manual/tar/html_node/tar_139.html |
385 |
http://www.w3.org/pub/WWW/TR/Wd-css-1.html |
386 |
http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP |
387 |
http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp |
388 |
http://search.cpan.org/search?query=Module::Build::Convert |
389 |
http://www.refcnt.org/papers/module-build-convert |
390 |
http://csrc.nist.gov/cryptval/shs.html |
391 |
http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp |
392 |
http://www.debian.or.jp/~kubota/unicode-symbols.html.en |
393 |
http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html |
394 |
http://www.debian.or.jp/~kubota/unicode-symbols.html.en |
395 |
http://rfc.net/rfc2781.html |
396 |
http://www.icu-project.org/charset/ |
397 |
http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html |
398 |
http://www.rfc-editor.org/ |
399 |
http://www.rfc.net/ |
400 |
http://www.oreilly.com/people/authors/lunde/cjk_inf.html |
401 |
http://www.oreilly.com/catalog/cjkvinfo/ |
402 |
http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz |
403 |
http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz |
404 |
http://www.egt.ie/standards/iso3166/iso3166-1-en.html |
405 |
http://www.bsi-global.com/iso4217currency |
406 |
http://www.plover.com/~mjd/perl/Memoize/ |
407 |
http://www.plover.com/~mjd/perl/MiniMemoize/ |
408 |
http://www.sysadminmag.com/tpj/issues/vol5_5/ |
409 |
ftp://ftp.tpc.int/tpc/server/UNIX/ |
410 |
http://www.nara.gov/genealogy/ |
411 |
http://home.utah-inter.net/kinsearch/Soundex.html |
412 |
http://www.nara.gov/genealogy/soundex/soundex.html |
413 |
http://rfc.net/rfc3461.html |
414 |
ftp://ftp.cs.pdx.edu/pub/elvis/ |
415 |
http://www.fh-wedel.de/elvis/ |
416 |
http://lists.perl.org/list/perl-mvs.html |
417 |
http://www.cpan.org/ports/os2/ |
418 |
http://github.com/dagolden/cpan-meta-spec |
419 |
http://github.com/dagolden/cpan-meta-spec/issues |
420 |
http://www.opensource.org/licenses/lgpl-license.phpt |
421 |
http://reality.sgi.com/ariel |
422 |
http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html |
423 |
http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html |
424 |
http://www.nsrl.nist.gov/testdata/ |
425 |
http://public.activestate.com/cgi-bin/perlbrowse/p/31194 |
426 |
http://public.activestate.com/cgi-bin/perlbrowse?patch=16173 |
427 |
http://public.activestate.com/cgi-bin/perlbrowse?patch=16049 |
428 |
http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html |
429 |
http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118 |
430 |
http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut |
431 |
http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf |
432 |
http://github.com/schwern/extutils-makemaker |
433 |
https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite |
434 |
http://www.json.org/JSON::PP_checker/ |
435 |
ftp://ftp.kiae.su/pub/unix/fido/ |
436 |
http://www.gallistel.net/nparker/weather/code/ |
437 |
http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html |
438 |
ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/ |
439 |
http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html |
440 |
http://public.activestate.com/cgi-bin/perlbrowse/p/33567 |
441 |
http://public.activestate.com/cgi-bin/perlbrowse/p/33566 |
442 |
http://www.dsmit.com/cons/ |
443 |
http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide |
444 |
|
445 |
__END__ |
446 |
|
447 |
=head1 NAME |
448 |
|
449 |
checkURL.pl - Check that all the URLs in the Perl source are valid |
450 |
|
451 |
=head1 DESCRIPTION |
452 |
|
453 |
This program checks that all the URLs in the Perl source are valid. It |
454 |
checks HTTP and FTP links in parallel and contains a list of known |
455 |
bad example links in its source. It takes 4 minutes to run on my |
456 |
machine. The results are written to 'uris.txt' and list the filename, |
457 |
the URL and the error: |
458 |
|
459 |
* ext/Locale-Maketext/lib/Locale/Maketext.pod |
460 |
http://sunsite.dk/RFC/rfc/rfc2277.html |
461 |
404 Not Found |
462 |
... |
463 |
|
464 |
It should be run every so often and links fixed and upstream authors |
465 |
notified. |
466 |
|
467 |
Note that the web is unstable and some websites are temporarily down. |