-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcheck_valid_page.pl
executable file
·137 lines (111 loc) · 2.85 KB
/
check_valid_page.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env perl -w
#
# Usage:
# ./check_valid_page.pl <URL>
# --doctype=default, h401t, h32, h20
#
# default means it uses the document-supplied doctype
# h401s means HTML 4.01 Strict
# h401t means HTML 4.01 Transitional
# h401f means HTML 4.01 Frameset
# h32 means HTML 3.2
# h20 means HTML 2.0
#
# Aim:
# checks the validity of the given HTML page using the
# W3C's validator
#
# The output is to the screen and lists the status of the
# validation (ok or fail + errors or an 'external' error
# due to the validator/parsing code)
#
use strict;
$|++;
# taken from ~/local/perl5/DSS.pm
#
use HTTP::Request::Common qw( POST );
use LWP::UserAgent;
use URI;
use IO::File;
use XML::LibXML;
use Getopt::Long;
## Subroutines
#
# find out what the validator says about the page
# relies on version 0.9 of the XML DTD from the validator
#
sub parse_response ($$$) {
my $parser = shift;
my $resp = shift;
my $url = shift;
my $dom = $parser->parse_string( $resp );
unless ( $dom ) {
print "INTERNAL_FAILURE - unable to parse response for $url\n";
return;
}
my @msgs = $dom->getElementsByTagName( "msg" );
if ( $#msgs == -1 ) {
print "OK - $url\n";
return;
}
# loop through the messages and extract the content
#
printf "FAIL - %3d - %s\n", 1+$#msgs, $url;
my $i = 1;
foreach my $msg ( @msgs ) {
my $line = $msg->getAttribute( "line" );
my $text = $msg->textContent;
printf " #%3d: line=%4d %s\n", $i, $line, $text;
$i++;
}
} # sub: parse_response
## Code
#
my $progname = (split( m{/}, $0 ))[-1];
my %doctypes = (
default => "",
h401s => "HTML+4.01+Strict",
h401t => "HTML+4.01+Transitional",
h401f => "HTML+4.01+Frameset",
h32 => "HTML+3.2",
h20 => "HTML+2.0",
);
my $doctype_list = join (", ", sort keys %doctypes);
my $usage = <<"EOD";
Usage:
$progname <URL>
Options:
--doctype=$doctype_list
EOD
# process options
#
my $doctype = "default";
die $usage unless
GetOptions 'doctype:s' => \$doctype;
die $usage unless exists $doctypes{$doctype};
die $usage unless $#ARGV == 0;
my $url = shift;
# is this OTT?
my $u = URI->new( $url );
die "Error: argument '$url' does not appear to be a HTTP: URI.\n"
unless $u->scheme eq "http";
# Create the user agent
my $ua = LWP::UserAgent->new();
$ua->agent( "check_valid/1.0 " . $ua->agent );
my $parser = XML::LibXML->new()
or die "Unable to create XML parser\n";
# see what the validator says about the page
#
my $query = "http://validator.w3.org/check?uri=" . $url . ";output=xml";
unless ( $doctypes{$doctype} eq "" ) {
$query .= ";doctype=" . $doctypes{$doctype};
}
my $response = $ua->get( $query );
if ( $response->is_success ) {
parse_response $parser, $response->content, $url;
} else {
print "+++REQUEST FAILED for $url - " . $response->status_line . "\n";
}
## End
#
exit;