checklinks.pl.in 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. #!@PERL@
  2. #
  3. # checklinks.pl
  4. #
  5. # This script extracts all links from a HTML page and checks their validity.
  6. # Written to use 'curl' for URL checking.
  7. #
  8. # Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
  9. # Version: 0.7 Sept 30, 1998
  10. #
  11. # HISTORY
  12. #
  13. # 0.5 - Cuts off the #-part from links before checking.
  14. #
  15. # 0.6 - Now deals with error codes 3XX better and follows the Location:
  16. # properly.
  17. # - Added the -x flag that only checks http:// -links
  18. #
  19. # 0.7 - Ok, http://www.viunga.se/main.html didn't realize this had no path
  20. # but a document. Now it does.
  21. #
  22. #
  23. $in="";
  24. argv:
  25. if($ARGV[0] eq "-v" ) {
  26. $verbose = 1;
  27. shift @ARGV;
  28. goto argv;
  29. }
  30. elsif($ARGV[0] eq "-i" ) {
  31. $usestdin = 1;
  32. shift @ARGV;
  33. goto argv;
  34. }
  35. elsif($ARGV[0] eq "-l" ) {
  36. $linenumber = 1;
  37. shift @ARGV;
  38. goto argv;
  39. }
  40. elsif($ARGV[0] eq "-h" ) {
  41. $help = 1;
  42. shift @ARGV;
  43. goto argv;
  44. }
  45. elsif($ARGV[0] eq "-x" ) {
  46. $external = 1;
  47. shift @ARGV;
  48. goto argv;
  49. }
  50. $geturl = $ARGV[0];
  51. if(($geturl eq "") || $help) {
  52. print "Usage: $0 [-hilvx] <full URL>\n",
  53. " Use a traling slash for directory URLs!\n",
  54. " -h This help text\n",
  55. " -i Read the initial page from stdin\n",
  56. " -l Line number report for BAD links\n",
  57. " -v Verbose mode\n",
  58. " -x Check non-local (external?) links only\n";
  59. exit;
  60. }
  61. if($ARGV[1] eq "-") {
  62. print "We use stdin!\n";
  63. $usestdin = 1;
  64. }
  65. # This is necessary from where I tried this:
  66. #$proxy =" -x 194.237.142.41:80";
  67. # linkchecker, URL will be appended to the right of this command line
  68. # this is the one using HEAD:
  69. $linkcheck = "curl -s -m 20 -I$proxy";
  70. # as a second attempt, this will be used. This is not using HEAD but will
  71. # get the whole frigging document!
  72. $linkcheckfull = "curl -s -m 20 -i$proxy";
  73. # htmlget, URL will be appended to the right of this command line
  74. $htmlget = "curl -s$proxy";
  75. # Parse the input URL and split it into the relevant parts:
  76. sub SplitURL {
  77. my $inurl = $_[0];
  78. if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
  79. $getprotocol = $1;
  80. $getserver = $2;
  81. $getpath = $3;
  82. $getdocument = $4;
  83. }
  84. elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
  85. $getprotocol = $1;
  86. $getserver = $2;
  87. $getpath = $3;
  88. $getdocument = "";
  89. if($getpath !~ /\//) {
  90. $getpath ="";
  91. $getdocument = $3;
  92. }
  93. }
  94. elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
  95. $getprotocol = $1;
  96. $getserver = $2;
  97. $getpath = "";
  98. $getdocument = "";
  99. }
  100. else {
  101. print "Couldn't parse the specified URL, retry please!\n";
  102. exit;
  103. }
  104. }
  105. &SplitURL($geturl);
  106. #print "protocol = $getprotocol\n";
  107. #print "server = $getserver\n";
  108. #print "path = $getpath\n";
  109. #print "document = $getdocument\n";
  110. #exit;
  111. if(!$usestdin) {
  112. open(HEADGET, "$linkcheck $geturl|") ||
  113. die "Couldn't get web page for some reason";
  114. headget:
  115. while(<HEADGET>) {
  116. # print $_;
  117. if($_ =~ /HTTP\/.*3\d\d /) {
  118. $pagemoved=1;
  119. }
  120. elsif($pagemoved &&
  121. ($_ =~ /^Location: (.*)/)) {
  122. $geturl = $1;
  123. &SplitURL($geturl);
  124. $pagemoved++;
  125. last headget;
  126. }
  127. }
  128. close(HEADGET);
  129. if($pagemoved == 1) {
  130. print "Page is moved but we don't know where. Did you forget the ",
  131. "traling slash?\n";
  132. exit;
  133. }
  134. open(WEBGET, "$htmlget $geturl|") ||
  135. die "Couldn't get web page for some reason";
  136. while(<WEBGET>) {
  137. $line = $_;
  138. push @indoc, $line;
  139. $line=~ s/\n//g;
  140. $line=~ s/\r//g;
  141. # print $line."\n";
  142. $in=$in.$line;
  143. }
  144. close(WEBGET);
  145. }
  146. else {
  147. while(<STDIN>) {
  148. $line = $_;
  149. push @indoc, $line;
  150. $line=~ s/\n//g;
  151. $line=~ s/\r//g;
  152. $in=$in.$line;
  153. }
  154. }
  155. #print length($in)."\n";
  156. sub LinkWorks {
  157. my $check = $_[0];
  158. # URL encode:
  159. # $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;
  160. @doc = `$linkcheck \"$check\"`;
  161. $head = 1;
  162. # print "COMMAND: $linkcheck \"$check\"\n";
  163. # print $doc[0]."\n";
  164. boo:
  165. if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
  166. $error = $1;
  167. if($error < 400 ) {
  168. return "GOOD";
  169. }
  170. else {
  171. if($head && ($error >= 500)) {
  172. # This server doesn't like HEAD!
  173. @doc = `$linkcheckfull \"$check\"`;
  174. $head = 0;
  175. goto boo;
  176. }
  177. return "BAD";
  178. }
  179. }
  180. return "BAD";
  181. }
  182. sub GetLinks {
  183. my $in = $_[0];
  184. my @result;
  185. getlinkloop:
  186. while($in =~ /[^<]*(<[^>]+>)/g ) {
  187. # we have a tag in $1
  188. $tag = $1;
  189. if($tag =~ /^<!--/) {
  190. # this is a comment tag, ignore it
  191. }
  192. else {
  193. if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ )>]*)/i) {
  194. $url=$2;
  195. if($url =~ /^\"(.*)\"$/) {
  196. # this was a "string" now $1 has removed the quotes:
  197. $url=$1;
  198. }
  199. $url =~ s/([^\#]*)\#.*/$1/g;
  200. if($url eq "") {
  201. # if the link was nothing than a #-link it may now have
  202. # been emptied completely so then we skip the rest
  203. next getlinkloop;
  204. }
  205. if($done{$url}) {
  206. # if this url already is done, do next
  207. $done{$url}++;
  208. next getlinkloop;
  209. }
  210. $done{$url} = 1; # this is "done"
  211. push @result, $url;
  212. if($tag =~ /< *([^ ]+)/) {
  213. # print "TAG: $1\n";
  214. $tagtype{$url}=$1;
  215. }
  216. }
  217. }
  218. }
  219. return @result;
  220. }
  221. @links = &GetLinks($in);
  222. linkloop:
  223. for(@links) {
  224. $url = $_;
  225. if($url =~ /^([^:]+):/) {
  226. $prot = $1;
  227. # if($prot !~ /(http|ftp|gopher)/i) {
  228. if($prot !~ /http/i) {
  229. # this is an unsupported protocol, we ignore this
  230. next linkloop;
  231. }
  232. $link = $url;
  233. }
  234. else {
  235. if($external) {
  236. next linkloop;
  237. }
  238. # this is a link on the save server:
  239. if($url =~ /^\//) {
  240. # from root
  241. $link = "$getprotocol://$getserver$url";
  242. }
  243. else {
  244. # from the scanned page's dir
  245. $nyurl=$url;
  246. if(length($getpath) &&
  247. ($getpath !~ /\/$/) &&
  248. ($nyurl !~ /^\//)) {
  249. # lacks ending slash, add one to the document part:
  250. $nyurl = "/".$nyurl;
  251. }
  252. $link = "$getprotocol://$getserver/$getpath$nyurl";
  253. }
  254. }
  255. #print "test $link\n";
  256. #$success = "GOOD";
  257. $success = &LinkWorks($link);
  258. $count = $done{$url};
  259. $allcount += $count;
  260. print "$success $count <".$tagtype{$url}."> $link $url\n";
  261. # If bad and -l, present the line numbers of the usage
  262. if("BAD" eq $success) {
  263. $badlinks++;
  264. if($linenumber) {
  265. $line =1;
  266. for(@indoc) {
  267. if($_ =~ /$url/) {
  268. print " line $line\n";
  269. }
  270. $line++;
  271. }
  272. }
  273. }
  274. }
  275. if($verbose) {
  276. print "$allcount links were checked";
  277. if($badlinks > 0) {
  278. print ", $badlinks were found bad";
  279. }
  280. print "\n";
  281. }