#! /usr/bin/perl -w
use XML::DOM;
use File::Basename;
use LWP::Simple;
use Getopt::Std;
$opt_a = 0; # 1 => check svn-controlled
$opt_c = 0; # 1 => check category entry
$opt_d = 0; # 1 => check distribution tags
$opt_h = 0; # 1 => make output html-like
$opt_l = 0; # 1 => check web links are live
$opt_m = 0; # 1 => suggest miktex/tex-live links
$opt_o = 0; # 1 => offline; as -s but not -l
$opt_s = 0; # 1 => "standard set" of options -- equivalent to -adlmvw
$opt_t = 0; # 1 => testing
$opt_v = 0; # 1 => witter more than otherwise
$opt_w = 0; # 1 => warn if ctan directory link has no .zip
getopts ("acdhlmostvw") || usage();
# deal with the "standard set"
$opt_a = $opt_d = $opt_l = $opt_m = $opt_v = $opt_w = 1 if $opt_s;
$opt_a = $opt_d = $opt_m = $opt_v = $opt_w = 1 if $opt_o;
print "Switches: a=$opt_a, c=$opt_c, d=$opt_d, h=$opt_h, l=$opt_l, ",
"m=$opt_m, o=$opt_o, s=$opt_s, t=$opt_t, v=$opt_v, w=$opt_w\n" if $opt_t;
# configuration
$ARCHIVE = "/home/rf/tex/archive/";
my $parser = new XML::DOM::Parser;
open LOG, ">>/tmp/cat-links.log";
# have we read the "errant url" list?
$loaded_errant_urls = 0;
# break html line, for every line, if html output
$newline = $opt_h ? "
\n" : "\n";
# read the authors file
my $auth = $parser->parsefile ("entries/authors");
my $authset = $auth->getElementsByTagName ("author");
for ( my $i=0; $i < $authset->getLength; $i++ ) {
my $thisauthor = $authset->item($i);
my $authorid = $thisauthor->getAttributeNode("id");
my $authorval = $authorid->getValue;
$authorlist{$authorval} = $thisauthor;
}
$auth -> dispose;
# start output as html if required
print "check-cat-links scan output
\n" if $opt_h;
# now iterate over all the files we've been asked to check
$files_processed = 0;
$files_found = $#ARGV+1;
# dunno what this was supposed to be "for"...
# $last_file = "";
foreach $file ( @ARGV ) {
# we don't want these to build up...
my %doc_links;
my $filename = basename ( $file );
my $dirname = dirname ( $file );
my $basedir = substr $dirname, 0, index ( $dirname, "entries/" );
print "*** Checking ", $filename, $newline if $opt_v;
my $cat = $parser->parsefile ( $file ) ||
die "Failed to parse ", $file;
# there's almost always only one, but...
my $entryset = $cat -> getElementsByTagName ("entry");
my $entry = $entryset->item(0);
# check entry id attribute
my $entryid = $entry->getAttributeNode("id");
my $idvalue = $entryid->getValue;
if ( $idvalue ne substr ($filename, 0, -4) ) {
print "File ", $file, " has entry id ", $idvalue, $newline;
}
# check datestamp and modifier are CVS controlled, if we need warnings
if ( $opt_a ) {
my $entrydate = $entry->getAttributeNode("datestamp");
if ( $entrydate ) {
my $dateval = $entrydate->getValue;
if ( $dateval =~ /^\$/ ) {
print "File ", $file, " has weird datestamp \"", $dateval,
"\"", $newline unless $dateval =~ /^\$Date/;
} else {
print "File ", $file, " has fixed datestamp", $newline;
}
}
my $entrymodder = $entry->getAttributeNode("modifier");
if ( $entrymodder ) {
my $modifier = $entrymodder->getValue;
if ( $modifier =~ /^\$/ ) {
print "File ", $file, " has weird modifier \"", $modifier,
"\"", $newline unless $modifier =~ /^\$Author/;
} else {
print "File ", $file, " has fixed modifier", $newline;
}
}
}
# check authors are in authors file
my $hadauthor = 0;
my $entryauthors = $cat -> getElementsByTagName ("authorref");
for ( my $i=0; $i < $entryauthors->getLength; $i++ ) {
my $authorref = $entryauthors->item($i);
my $authorid;
unless ( $authorid = $authorref->getAttributeNode("id") ) {
print "Author ref has no \"id\" element in file ",
$file, $newline;
next;
}
my $authorval = $authorid->getValue;
if ( ! exists $authorlist{$authorval} ) {
print "Author id \"", $authorval, "\" in file ",
$file, " is not valid", $newline;
} else {
$hadauthor = 1;
}
}
unless ( $hadauthor ) {
print "No valid author ref in file ", $file, $newline;
}
# check home page link, if any
my $entryhome = $cat -> getElementsByTagName ("home");
my $nohomes = $entryhome -> getLength;
if ( $nohomes > 0 ) {
if ( $nohomes > 1 ) {
print "More than one home link in file ", $file, $newline;
}
for ( my $i=0; $i < $nohomes; $i++ ) {
my $homeref = $entryhome->item($i);
unless ( $homeloc = $homeref->getAttributeNode("href") ) {
print "Home reference has no \"href\" element in file ",
$file, $newline;
next;
} else {
check_url ( $homeloc->getValue, "Home link" ) if $opt_l;
}
}
}
# check license declaration
my $entrylicence = $cat -> getElementsByTagName ("license");
my $nolicences = $entrylicence->getLength;
print "No license entry in file ", $file, $newline
if $nolicences==0;
print $nolicences, " licence entries in file ", $file, $newline
if $nolicences>1;
for ( my $i=0; $i < $nolicences; $i++ ) {
my $licenceref = $entrylicence->item($i);
my $licenceid;
unless ( $licenceid = $licenceref->getAttributeNode("type") ) {
print "Licence ref has no \"type\" element in file ",
$file, $newline;
next;
}
my $licenceval = $licenceid->getValue;
$licenceval = "fdl" if $licenceval eq "gfdl";
if ( ! -f $ARCHIVE."help/Catalogue/licenses.".$licenceval.".html" ) {
print "Licence \"", $licenceval, "\" in file ", $file,
" is not valid", $newline;
}
}
# check version entry
my $entryversion = $cat -> getElementsByTagName ("version");
print "More than one version element in file ", $file, $newline
if $entryversion->getLength > 1;
for ( my $i=0; $i < $entryversion->getLength; $i++ ) {
my $versionref = $entryversion->item($i);
my $number=0;
my $date=0;
# no checks on number itself, just its presence
$number = $versionref->getAttributeNode("number");
$date = $versionref->getAttributeNode("date");
print "Version number _and_ date in file ", $file, $newline
if $number && $date;
print "Neither version number nor date in file ", $file, $newline
unless $number || $date;
if ( $date ) {
my $dateval = $date->getValue;
my $dateok = 0;
if (index ($dateval, "-") >0 ) {
my @dateparts = split (/-/, $dateval);
if ( $#dateparts == 2 ) {
if ( length ($dateparts [0]) == 4 &&
length ($dateparts [1]) == 2 &&
length ($dateparts [2]) == 2 ) {
$dateok = 1;
}
}
}
print "Invalid version date \"", $dateval,
"\" in file ", $file, $newline unless $dateok;
}
}
# check also links
my $entryalso = $cat -> getElementsByTagName ("also");
for ( my $i=0; $i < $entryalso->getLength; $i++ ) {
my $alsoref = $entryalso->item($i);
my $alsoid;
unless ( $alsoid = $alsoref->getAttributeNode("refid") ) {
print "Also ref has no \"refid\" element in file ",
$file, $newline;
next;
}
my $alsoname = $alsoid->getValue;
my $alsofilename = $basedir . "entries/" .
substr ($alsoname, 0, 1) . "/" . $alsoname . ".xml";
unless ( -f $alsofilename ) {
print "\"also\" file ", $alsofilename, " does not exist in file ",
$file, $newline;
}
}
############################################################################
############################################################################
### look at categories
############################################################################
############################################################################
if ( $opt_c ) {
my $topics = $cat -> getElementsByTagName ("topics");
for ( my $i=0; $i < $topics->getLength; $i++ ) {
my $temp = $topics->item($i);
$temp = $temp->toString;
$temp =~ s/(.*)<\/topics>/$1/;
$temp =~ s/\b//g;
my @values = split ",", $temp;
if ( $opt_v ) {
for ( my $j=0; $j <= $#values; $j++ ) {
print $j, ": ", $values[$j], $newline;
}
}
}
}
############################################################################
############################################################################
# check the xref and a links in the description
my $desc = $cat -> getElementsByTagName ("description");
print "File ", $filename, " has >1 !", $newline if
$desc->getLength > 1;
for ( my $i=0; $i < $desc->getLength; $i++ ) {
my $scrip = $desc->item($i);
my $refset = $scrip -> getElementsByTagName ("xref");
for ( my $j=0; $j < $refset->getLength; $j++ ) {
my $xref = $refset->item($j);
my $xrefid;
unless ( $xrefid = $xref->getAttributeNode("refid") ) {
print "Xref has no \"refid\" element in file ",
$file, $newline;
next;
}
my $xrefname = $xrefid->getValue;
my $xreffilename = $basedir . "entries/" .
substr ($xrefname, 0, 1) . "/" . $xrefname . ".xml";
unless ( -f $xreffilename ) {
print "\"xref\" file ", $xreffilename, " does not exist ",
"in file ", $file, $newline;
}
}
$refset = $scrip -> getElementsByTagName ("a");
for ( my $j=0; $j < $refset->getLength; $j++ ) {
my $href = $refset->item($j);
my $hrefid;
unless ( $hrefid = $href->getAttributeNode("href") ) {
print "\"A\" has no \"href\" element in file ",
$file, $newline;
next;
}
check_url ( $hrefid->getValue, "Href link") if $opt_l;
}
}
# check the documentation links
my $doc = $cat -> getElementsByTagName ("documentation");
for ( my $i=0; $i < $doc->getLength; $i++ ) {
my $node = $doc->item($i);
my $href = $node->getAttributeNode("href");
my $docval = $href->getValue;
if ( exists $doc_links{$docval} ) {
print "Doc link \"", $docval, "\" repeated in file ",
$file, $newline;
} else {
$doc_links{$docval} = $i;
}
if ( $docval =~ "^ctan:" ) {
my $path = substr $docval, 5;
if ( (substr $path, 0, 1) ne "/" ) {
print "CTAN Doc link in ", $file, " (",
$path, ") has no leading \"\/\"", $newline;
$path = "/". $path;
}
if ( $path =~ /\/\// ) {
print "CTAN Doc link in ", $file, " (",
$path, ") has repeated \"\/\"", $newline;
$path =~ s/\/\//\//g;
}
if ( -l $ARCHIVE . $path ) {
my $linkval = readlink $ARCHIVE . $path;
print "Doc link ", $path, " in file ", $file,
" is a link!", $newline
if substr ($linkval, 0, 3) eq "../";
} else {
print "Doc link in ", $file, " on CTAN (", $path,
") doesn't exist", $newline
unless -f $ARCHIVE . $path;
}
} elsif ( $docval =~ "^http:" ||
$docval =~ "^https:" ) {
if ( $opt_l ) {
check_url($docval, "Doc link");
} else {
print LOG "Doc link in ", $file, " on web: ", $docval, $newline;
}
} elsif ( $docval =~ "^ftp:" ) {
print LOG "Ftp doc link in ", $file, ": ", $docval, $newline;
} else {
print "Unknown doc prefix in ", $file, " (", $docval, ")", $newline;
}
}
# for use elsewhere
$this_file_path = "";
# now check the ctan path
my $ctan = $cat -> getElementsByTagName ("ctan") ||
die "*** Can't get CTAN elements for ", $file;
my $no_ctans = $ctan->getLength;
if ( $no_ctans == 0 ) {
print LOG "No CTAN link in ", $file, $newline;
} else {
print LOG $no_ctans, " CTAN links in ", $file, $newline
if $no_ctans > 1;
for (my $i=0; $i<$no_ctans; $i++) {
my $node = $ctan->item($i);
my $pathattr = $node->getAttributeNode("path");
my $path = $pathattr->getValue;
if ( (substr $path, 0, 1) ne "/" ) {
print "CTAN path in file ", $file, " (",
" has no leading \"\/\"", $newline;
$path = "/" . $path;
}
if ( (substr $path, -1, 1) eq "/" ) {
print "CTAN path in file ", $file, " (",
$path, ") has trailing \"\/\"", $newline;
$path =~ s/\/$//g;
}
if ( $path =~ /\/\// ) {
print "CTAN path in ", $file, " (",
$path, ") has repeated \"\/\"", $newline;
$path =~ s/\/\//\//g;
}
my $fileattr = $node->getAttributeNode("file");
my $isafile = $fileattr ? $fileattr->getValue eq "true" : 0;
my $zipattr = $node->getAttributeNode("zip");
my $containing_dir = $isafile ? dirname ( $path ) : $path;
if ( -l $ARCHIVE . $containing_dir ) {
my $linkval = readlink $ARCHIVE . $containing_dir;
if (substr ($linkval, 0, 3) eq "../" &&
substr ($linkval, -7, 7) ne "tds.zip") {
if ( $isafile ) {
print "Holding directory ", $containing_dir;
} else {
print "CTAN path ", $path;
}
print " in file ", $file, " is a link!", $newline;
}
} elsif ( -d $ARCHIVE . $path ) {
if ( $fileattr && ( $fileattr->getValue eq "true" ) ) {
print "File ", $path, " in file ", $file,
" is a directory!", $newline;
} elsif ( !( $zipattr && ( $zipattr->getValue eq "false" ) )) {
print "WARNING: no .zip file for directory $path",
" in file ", $file, $newline unless
( -f $ARCHIVE . $path . ".zip" ) || ! $opt_w;
}
$this_file_path = $path;
} else {
if ( $fileattr && ( $fileattr->getValue eq "true" ) ) {
print "File ", $path, " in file ", $file,
" doesn't exist", $newline
unless -f $ARCHIVE . $path;
} else {
print "Directory ", $path, " in file ", $file,
" doesn't exist", $newline;
}
}
}
}
# check distributions and install tree link
# miktex -- look for the compressed package
my $miktex = $cat -> getElementsByTagName ("miktex");
my $no_miktexs = $miktex->getLength;
print $no_miktexs, " MikTeX links in file ", $file, $newline
if $no_miktexs > 1;
for (my $i=0; $i<$no_miktexs; $i++) {
my $node = $miktex->item($i);
if ( my $locattr = $node->getAttributeNode("location") ) {
my $miktexfile = $ARCHIVE . "systems/win32/miktex/tm/packages/" .
$locattr -> getValue;
print "No file at MikTeX location ", $locattr -> getValue,
" in file ", $file, $newline
unless -f $miktexfile . ".cab" ||
-f $miktexfile . ".tar.bz2" ||
-f $miktexfile . ".tar.lzma";
} else {
print "No location in MikTeX tag in file ", $file, $newline;
}
}
if ( $no_miktexs == 0 && $opt_m ) {
my $basefile = basename( $file, ".xml" );
my $miktexfile = $ARCHIVE . "systems/win32/miktex/tm/packages/" .
$basefile;
my $potential = "";
$potential = $basefile . ".cab" if -f ($miktexfile . ".cab");
$potential = $basefile . ".tar.bz2" if -f ($miktexfile . ".tar.bz2");
$potential = $basefile . ".tar.lzma" if -f ($miktexfile . ".tar.lzma");
print "*** Does MiKTeX ", $potential, " help in file ",
$file, "?", $newline
if $potential;
}
# texlive -- look for .tar.lzma file (now we tl2008)
my $texlive = $cat -> getElementsByTagName ("texlive");
my $no_texlives = $texlive->getLength;
print $no_texlives, " TeXlive links in file ", $file, $newline
if $no_texlives > 1;
my $potential_tl = basename( $file, ".xml") . ".tar.lzma";
my $potential_path = $ARCHIVE . "systems/texlive/tlnet/2008/archive/" .
$potential_tl;
#print $file, ": ", $potential_tl, ", ",
# $potential_path, "\n";
for (my $i=0; $i<$no_texlives; $i++) {
my $node = $texlive->item($i);
if ( my $locattr = $node->getAttributeNode("location") ) {
my $gotatar = 0;
my $tarfilename = $locattr->getValue;
print "No TeXLive tar file ", $tarfilename,
".tar.lzma in file ", $file, $newline
unless -f $ARCHIVE .
"systems/texlive/tlnet/2008/archive/" .
$tarfilename . ".tar.lzma";
} else {
print "No location in TeXlive tag in file ", $file, $newline;
}
}
if ( $no_texlives == 0 && $opt_m ) {
my $basefile = basename( $file, ".xml" ) . ".tar.lzma";
my $potential_path = $ARCHIVE .
"systems/texlive/tlnet/2008/archive/" . $basefile;
my $potential = -f $potential_path ? $basefile : "";
#print $file, ": ", $basefile, ", ", $potential, "\n";
print "*** Does TeX live ", $potential, " help in file ",
$file, "?", $newline
if $potential;
}
# install tree -- check that quoted path is present
my $inst_tree = $cat -> getElementsByTagName ("install");
my $no_inst_trees = $inst_tree->getLength;
print $no_inst_trees, " install tree links in file ", $file, $newline
if $no_inst_trees > 1;
for (my $i=0; $i<$no_inst_trees; $i++) {
my $node = $inst_tree->item($i);
if ( my $pathattr = $node->getAttributeNode("path") ) {
my $instfilename = $pathattr->getValue;
unless ( $instfilename =~ /^\// ) {
print "No leading \"/\" in install path in file ",
$file, $newline;
$instfilename = "/" . $instfilename;
}
print "Repeated \"/\" in install path in file ",
$file, $newline
if $instfilename =~ /\/\//;
print "No install file ", "/install" . $instfilename,
" in file ", $file, $newline
unless -f $ARCHIVE . "/install" . $instfilename;
} else {
# note that rainer's code defaults this case...
print "No install path tag in file ", $file, $newline;
}
}
if ( $no_inst_trees == 0 && $opt_m ) {
my $basefile = basename( $this_file_path, ".xml" ) . ".tds.zip";
# untidy exception here...
print "*** Does ", $basefile, " help in file ", $file, "?", $newline
if $this_file_path ne "/language/japanese/ptex" &&
-f $ARCHIVE . "install" . $this_file_path . ".tds.zip";
}
$files_processed++;
# print copy file
#$cat -> printToFile ("./copy.xml");
$cat -> dispose;
}
print $files_processed, " file(s) processed of ", $files_found, $newline;
close LOG;
###########################################################
sub usage { print STDERR "Usage: $0 [-acdlmsvw] file path(s)
-a: check entry modifiers cvs-controlled
-c: check category entry (experimental)
-d: check distribution tags
-h: make output html-like
-l: check web links are live
-m: suggest miktex/tex-live links
-o: offline; as -s but not -l
-s: \"standard set\" of options -- equivalent to -adlmvw just now
-t: testing
-v: witter more than otherwise
-w: warn if ctan directory link has no .zip
";
}
sub check_url {
if ( ! $loaded_errant_urls ) {
open EURL, "redirected-hosts" || return;
while ( ) {
chomp;
$errant_url{$_} = 1;
}
close EURL;
$loaded_errant_urls = 1;
}
$thisurl = $_[0];
if ( substr ( $thisurl, 0, 5 ) eq "http:" ||
substr ( $thisurl, 0, 6 ) eq "https:" ) {
my $domain = $thisurl;
$domain =~ s/^.*\/\///;
$domain =~ s/\/.*$//;
if ( exists $errant_url{ $domain } ) {
print LOG "Checking URL ", $thisurl,
" skipped in file ", $file, "\n";
} else {
unless ( head($thisurl) ) {
if ( $opt_h ) {
print "*** ", $_[1], " in ", $file,
" (", $thisurl, "";
} else {
print "*** ", $_[1], " in ", $file, " (", $thisurl;
}
print ") not responding", $newline;
}
}
} elsif ( substr ( $thisurl, 0, 4 ) eq "ftp:" ) {
print LOG "ftp home link in ", $file, "\n";
} else {
print "Unknown type of home link in ", $file, $newline;
}
}