mdlinkcheck 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. #!/usr/bin/env perl
  2. #***************************************************************************
#                                  _   _ ____  _
#  Project                     ___| | | |  _ \| |
#                             / __| | | | |_) | |
#                            | (__| |_| |  _ <| |___
#                             \___|\___/|_| \_\_____|
  8. #
  9. # Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
  10. #
  11. # This software is licensed as described in the file COPYING, which
  12. # you should have received as part of this distribution. The terms
  13. # are also available at https://curl.se/docs/copyright.html.
  14. #
  15. # You may opt to use, copy, modify, merge, publish, distribute and/or sell
  16. # copies of the Software, and permit persons to whom the Software is
  17. # furnished to do so, under the terms of the COPYING file.
  18. #
  19. # This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  20. # KIND, either express or implied.
  21. #
  22. # SPDX-License-Identifier: curl
  23. #
  24. ###########################################################################
  25. my %whitelist = (
  26. 'https://curl.se/' => 1,
  27. 'https://curl.se/changes.html' => 1,
  28. 'https://curl.se/dev/advisory.html' => 1,
  29. 'https://curl.se/dev/builds.html' => 1,
  30. 'https://curl.se/dev/code-style.html' => 1,
  31. 'https://curl.se/dev/contribute.html' => 1,
  32. 'https://curl.se/dev/internals.html' => 1,
  33. 'https://curl.se/dev/secprocess.html' => 1,
  34. 'https://curl.se/dev/sourceactivity.html' => 1,
  35. 'https://curl.se/docs/' => 1,
  36. 'https://curl.se/docs/bugbounty.html' => 1,
  37. 'https://curl.se/docs/caextract.html' => 1,
  38. 'https://curl.se/docs/copyright.html' => 1,
  39. 'https://curl.se/docs/install.html' => 1,
  40. 'https://curl.se/docs/knownbugs.html' => 1,
  41. 'https://curl.se/docs/manpage.html' => 1,
  42. 'https://curl.se/docs/security.html' => 1,
  43. 'https://curl.se/docs/sslcerts.html' => 1,
  44. 'https://curl.se/docs/thanks.html' => 1,
  45. 'https://curl.se/docs/todo.html' => 1,
  46. 'https://curl.se/docs/vulnerabilities.html' => 1,
  47. 'https://curl.se/libcurl/' => 1,
  48. 'https://curl.se/libcurl/c/CURLOPT_SSLVERSION.html' => 1,
  49. 'https://curl.se/libcurl/c/CURLOPT_SSL_CIPHER_LIST.html' => 1,
  50. 'https://curl.se/libcurl/c/CURLOPT_TLS13_CIPHERS.html' => 1,
  51. 'https://curl.se/libcurl/c/libcurl.html' => 1,
  52. 'https://curl.se/logo/curl-logo.svg' => 1,
  53. 'https://curl.se/mail/' => 1,
  54. 'https://curl.se/mail/etiquette.html' => 1,
  55. 'https://curl.se/mail/list.cgi?list=curl-distros' => 1,
  56. 'https://curl.se/mail/list.cgi?list=curl-library' => 1,
  57. 'https://curl.se/rfc/cookie_spec.html' => 1,
  58. 'https://curl.se/rfc/rfc2255.txt' => 1,
  59. 'https://curl.se/sponsors.html' => 1,
  60. 'https://curl.se/support.html' => 1,
  61. 'https://github.com/curl/curl' => 1,
  62. 'https://github.com/curl/curl-fuzzer' => 1,
  63. 'https://github.com/curl/curl-www' => 1,
  64. 'https://github.com/curl/curl/discussions' => 1,
  65. 'https://github.com/curl/curl/issues' => 1,
  66. 'https://github.com/curl/curl/labels/help%20wanted' => 1,
  67. 'https://github.com/curl/curl/pulls' => 1,
  68. );
  69. # list all .md files in the repo
  70. my @files=`git ls-files '**.md'`;
  71. sub storelink {
  72. my ($f, $line, $link) = @_;
  73. my $o = $link;
  74. if($link =~ /^\#/) {
  75. # ignore local-only links
  76. return;
  77. }
  78. # cut off any anchor
  79. $link =~ s:\#.*\z::;
  80. if($link =~ /^(https|http):/) {
  81. $url{$link} .= "$f:$line ";
  82. return;
  83. }
  84. # a file link
  85. my $dir = $f;
  86. $dir =~ s:([^/]*\z)::;
  87. while($link =~ s:^\.\.\/::) {
  88. $dir =~ s:^([^/]*)/::;
  89. }
  90. $flink{"./$dir$link"} .= "$f:$line ";
  91. }
  92. sub findlinks {
  93. my ($f) = @_;
  94. my $line = 1;
  95. open(F, "<:crlf", "$f") ||
  96. return;
  97. while(<F>) {
  98. if(/\]\(([^)]*)/) {
  99. my $link = $1;
  100. #print "$f:$line $link\n";
  101. storelink($f, $line, $link);
  102. }
  103. $line++;
  104. }
  105. close(F);
  106. }
  107. sub checkurl {
  108. my ($url) = @_;
  109. if($whitelist{$url}) {
  110. #print "$url is whitelisted\n";
  111. return 0;
  112. }
  113. print "check $url\n";
  114. my $curlcmd="curl -ILfsm10 --retry 2 --retry-delay 5 -A \"Mozilla/curl.se link-probe\"";
  115. my @content = `$curlcmd \"$url\"`;
  116. if(!$content[0]) {
  117. print STDERR "FAIL\n";
  118. return 1; # fail
  119. }
  120. return 0; # ok
  121. }
  122. for my $f (@files) {
  123. chomp $f;
  124. findlinks($f);
  125. }
  126. my $error;
  127. for my $u (sort keys %url) {
  128. my $r = checkurl($u);
  129. if($r) {
  130. for my $f (split(/ /, $url{$l})) {
  131. printf "%s ERROR links to missing URL %s\n", $f, $u;
  132. $error++;
  133. }
  134. }
  135. }
  136. for my $l (sort keys %flink) {
  137. if(! -r $l) {
  138. for my $f (split(/ /, $flink{$l})) {
  139. printf "%s ERROR links to missing file %s\n", $f, $l;
  140. $error++;
  141. }
  142. }
  143. }
  144. exit 1 if ($error);