#!/usr/bin/perl
#
# crawlink.pl
#
# This script crawls across all found links below the given "root" URL.
# It reports all good and bad links to stdout. This code was based on the
# checklink.pl script I wrote ages ago.
#
# Written to use 'curl' for URL checking.
#
# Author: Daniel Stenberg <daniel@haxx.se>
# Version: 0.3 Jan 3, 2001
#
# HISTORY
#
# 0.3 - The -i option now adds regexes; if a full URL link matches one of
# them, it is not followed. This can then be used to prevent this script
# from following '.*\.cgi', specific pages or whatever.
#
# 0.2 - Made it only HEAD non-HTML files (i.e. skip the GET). Makes it a lot
# faster to skip large non-HTML files such as PDFs or big RFCs! ;-)
# Added a -c option that allows me to pass options to curl.
#
# 0.1 - The given URL works as the root. This script will only continue
# and check other URLs if the leftmost part of the new URL is identical
# to the root URL.
#

use strict;
my $in="";
my $verbose=0;
my $usestdin;
my $linenumber;
my $help;
my $external;
my $curlopts;
my @ignorelist;

argv:
if($ARGV[0] eq "-v" ) {
    $verbose++;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-c" ) {
    $curlopts=$ARGV[1];
    shift @ARGV;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-i" ) {
    push @ignorelist, $ARGV[1];
    shift @ARGV;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-l" ) {
    $linenumber = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-h" ) {
    $help = 1;
    shift @ARGV;
    goto argv;
}
elsif($ARGV[0] eq "-x" ) {
    $external = 1;
    shift @ARGV;
    goto argv;
}
my $geturl = $ARGV[0];
my $firsturl= $geturl;

#
# Define a hash array to hold all root URLs to visit/we have visited
#
my %rooturls;
$rooturls{$ARGV[0]}=1;
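#
# The hash value tracks state: 1 means queued but not yet fetched, 2 or more
# means already fetched, and anything above 1000 marks a URL that fell
# outside the root scope and is skipped.
#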
if(($geturl eq "") || $help) {
    print "Usage: $0 [-hilvx] <full URL>\n",
        " Use a trailing slash for directory URLs!\n",
        " -c [data]  Pass [data] as argument to every curl invocation\n",
        " -h         This help text\n",
        " -i [regex] Ignore root links that match this pattern\n",
        " -l         Line number report for BAD links\n",
        " -v         Verbose mode\n",
        " -x         Check non-local (external?) links only\n";
    exit;
}
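#
# Example invocations (the host names below are just placeholders):
#
#   ./crawlink.pl -v http://www.example.com/docs/
#   ./crawlink.pl -i '.*\.cgi' -c "-x proxy.example.com:80" http://www.example.com/
#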
my $proxy="";
if($curlopts ne "") {
    $proxy=" $curlopts";
    #$proxy =" -x 194.237.142.41:80";
}

# linkchecker, URL will be appended to the right of this command line
# this is the one using HEAD:
my $linkcheck = "curl -s -m 20 -I$proxy";

# as a second attempt, this will be used. This is not using HEAD but will
# get the whole frigging document!
my $linkcheckfull = "curl -s -m 20 -i$proxy";

# htmlget, URL will be appended to the right of this command line
my $htmlget = "curl -s$proxy";
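# With -I curl issues a HEAD request and prints only the response headers,
# which is all GetRootPage() and LinkWorks() below need for the status code
# and Content-Type checks; -m 20 caps each transfer at 20 seconds.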
# Parse the input URL and split it into the relevant parts:
my $getprotocol;
my $getserver;
my $getpath;
my $getdocument;

my %done;
my %tagtype;
my $allcount=0;
my $badlinks=0;
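#
# SplitURL: split a full URL into protocol, server, path and document parts
# and store them in the globals $getprotocol, $getserver, $getpath and
# $getdocument. For example "http://host/dir/page.html" splits into "http",
# "host", "dir" and "page.html".
#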
sub SplitURL {
    my $inurl = $_[0];

    if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = $3;
        $getdocument = $4;
    }
    elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = $3;
        $getdocument = "";

        if($getpath !~ /\//) {
            $getpath ="";
            $getdocument = $3;
        }
    }
    elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
        $getprotocol = $1;
        $getserver = $2;
        $getpath = "";
        $getdocument = "";
    }
    else {
        print "Couldn't parse the specified URL, retry please!\n";
        exit;
    }
}

my @indoc;
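#
# GetRootPage: HEAD the given URL to learn its status code and Content-Type,
# follow a single reported redirect, and GET the body only when the page is
# text/html. Returns ($in, $code, $type) where $in is the whole document
# concatenated into a single line; the raw lines are kept in @indoc for the
# -l line number reporting.
#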
sub GetRootPage {
    my $geturl = $_[0];
    my $in="";
    my $code=200;
    my $type="text/plain";

    my $pagemoved=0;
    open(HEADGET, "$linkcheck $geturl|") ||
        die "Couldn't get web page for some reason";
    while(<HEADGET>) {
        #print STDERR $_;
        if($_ =~ /HTTP\/1\.[01] (\d\d\d) /) {
            $code=$1;
            if($code =~ /^3/) {
                $pagemoved=1;
            }
        }
        elsif($_ =~ /^Content-Type: ([\/a-zA-Z]+)/) {
            $type=$1;
        }
        elsif($pagemoved &&
              ($_ =~ /^Location: (.*)/)) {
            $geturl = $1;
            &SplitURL($geturl);
            $pagemoved++;
            last;
        }
    }
    close(HEADGET);

    if($pagemoved == 1) {
        print "Page is moved but we don't know where. Did you forget the ",
            "trailing slash?\n";
        exit;
    }

    if($type ne "text/html") {
        # there's no point in getting anything but HTML
        $in="";
    }
    else {
        open(WEBGET, "$htmlget $geturl|") ||
            die "Couldn't get web page for some reason";
        while(<WEBGET>) {
            my $line = $_;
            push @indoc, $line;
            $line=~ s/\n/ /g;
            $line=~ s/\r//g;
            $in=$in.$line;
        }
        close(WEBGET);
    }

    return ($in, $code, $type);
}
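#
# LinkWorks: run the HEAD-based check on a single URL and return "GOOD" for
# status codes below 400 or "BAD" otherwise. A server that answers HEAD with
# a 5xx code gets one retry with a full GET before the link is declared bad.
#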
sub LinkWorks {
    my $check = $_[0];

    # URL encode:
    # $check =~s/([^a-zA-Z0-9_:\/.-])/uc sprintf("%%%02x",ord($1))/eg;

    my @doc = `$linkcheck \"$check\"`;
    my $head = 1;

    # print "COMMAND: $linkcheck \"$check\"\n";
    # print $doc[0]."\n";

  boo:
    if( $doc[0] =~ /^HTTP[^ ]+ (\d+)/ ) {
        my $error = $1;

        if($error < 400 ) {
            return "GOOD";
        }
        else {
            if($head && ($error >= 500)) {
                # This server doesn't like HEAD!
                @doc = `$linkcheckfull \"$check\"`;
                $head = 0;
                goto boo;
            }
            return "BAD";
        }
    }
    return "BAD";
}
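#
# GetLinks: scan the HTML buffer for tags with src, href, background or
# archive attributes, strip any #fragment part, and return every URL once.
# %done counts how many times each URL was seen and %tagtype remembers which
# tag it appeared in.
#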
sub GetLinks {
    my $in = $_[0];
    my @result;

    while($in =~ /[^<]*(<[^>]+>)/g ) {
        # we have a tag in $1
        my $tag = $1;

        if($tag =~ /^<!--/) {
            # this is a comment tag, ignore it
        }
        else {
            if($tag =~ /(src|href|background|archive) *= *(\"[^\"]*\"|[^ \)>]*)/i) {
                my $url=$2;
                if($url =~ /^\"(.*)\"$/) {
                    # this was a "string", now $1 holds it without the quotes:
                    $url=$1;
                }

                $url =~ s/([^\#]*)\#.*/$1/g;

                if($url eq "") {
                    # if the link was nothing but a #-link it may now have
                    # been emptied completely, so we skip the rest
                    next;
                }

                if($done{$url}) {
                    # if this url already is done, do next
                    $done{$url}++;
                    if($verbose) {
                        print " FOUND $url but that is already checked\n";
                    }
                    next;
                }

                $done{$url} = 1; # this is "done"

                push @result, $url;
                if($tag =~ /< *([^ ]+)/) {
                    $tagtype{$url}=$1;
                }
            }
        }
    }
    return @result;
}
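#
# Main loop: repeatedly pick an unvisited URL from %rooturls that still lies
# within the scope of the root URL, fetch it, check every link found on that
# page and queue working same-site links for later crawling.
#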
while(1) {
    $geturl=-1;

    for(keys %rooturls) {
        if($rooturls{$_} == 1) {
            if($_ !~ /^$firsturl/) {
                $rooturls{$_} += 1000; # don't do this, outside our scope
                if($verbose) {
                    print "SKIP: $_\n";
                }
                next;
            }
            $geturl=$_;
            last;
        }
    }
    if($geturl == -1) {
        last;
    }

    #
    # Splits the URL into its different parts
    #
    &SplitURL($geturl);

    #
    # Returns the full HTML of the root page
    #
    my ($in, $error, $ctype) = &GetRootPage($geturl);

    $rooturls{$geturl}++; # increase to prove we have already got it

    if($ctype ne "text/html") {
        # this is not HTML, we skip it
        if($verbose == 2) {
            print "Non-HTML link, skipping\n";
        }
        next;
    }

    if($error >= 400) {
        print "ROOT page $geturl returned $error\n";
        next;
    }

    print " ==== $geturl ====\n";

    if($verbose == 2) {
        printf("Error code $error, Content-Type: $ctype, got %d bytes\n",
               length($in));
    }

    #print "protocol = $getprotocol\n";
    #print "server = $getserver\n";
    #print "path = $getpath\n";
    #print "document = $getdocument\n";
    #exit;

    #
    # Extracts all links from the given HTML buffer
    #
    my @links = &GetLinks($in);

    for(@links) {
        my $url = $_;
        my $link;

        if($url =~ /^([^:]+):/) {
            my $prot = $1;
            if($prot !~ /http/i) {
                # this is an unsupported protocol, we ignore this
                next;
            }
            $link = $url;
        }
        else {
            if($external) {
                next;
            }

            # this is a link on the same server:
            if($url =~ /^\//) {
                # from root
                $link = "$getprotocol://$getserver$url";
            }
            else {
                # from the scanned page's dir
                my $nyurl=$url;
                if(length($getpath) &&
                   ($getpath !~ /\/$/) &&
                   ($nyurl !~ /^\//)) {
                    # lacks an ending slash, add one to the document part:
                    $nyurl = "/".$nyurl;
                }
                $link = "$getprotocol://$getserver/$getpath$nyurl";
            }
        }

        my $success = &LinkWorks($link);
        my $count = $done{$url};

        $allcount += $count;

        print "$success $count <".$tagtype{$url}."> $link $url\n";

        if("BAD" eq $success) {
            $badlinks++;
            if($linenumber) {
                my $line =1;
                for(@indoc) {
                    if($_ =~ /$url/) {
                        print " line $line\n";
                    }
                    $line++;
                }
            }
        }
        else {
            # the link works, add it if it isn't in the ignore list
            my $ignore=0;
            for(@ignorelist) {
                if($link =~ /$_/) {
                    $ignore=1;
                }
            }
            if(!$ignore) {
                # not ignored, add
                $rooturls{$link}++; # check this if not checked already
            }
        }
    }
}

if($verbose) {
    print "$allcount links were checked";
    if($badlinks > 0) {
        print ", $badlinks were found bad";
    }
    print "\n";
}