#! /usr/bin/perl -w use XML::DOM; use File::Basename; use LWP::Simple; use Getopt::Std; $opt_a = 0; # 1 => check svn-controlled $opt_c = 0; # 1 => check category entry $opt_d = 0; # 1 => check distribution tags $opt_h = 0; # 1 => make output html-like $opt_l = 0; # 1 => check web links are live $opt_m = 0; # 1 => suggest miktex/tex-live links $opt_o = 0; # 1 => offline; as -s but not -l $opt_s = 0; # 1 => "standard set" of options -- equivalent to -adlmvw $opt_t = 0; # 1 => testing $opt_v = 0; # 1 => witter more than otherwise $opt_w = 0; # 1 => warn if ctan directory link has no .zip getopts ("acdhlmostvw") || usage(); # deal with the "standard set" $opt_a = $opt_d = $opt_l = $opt_m = $opt_v = $opt_w = 1 if $opt_s; $opt_a = $opt_d = $opt_m = $opt_v = $opt_w = 1 if $opt_o; print "Switches: a=$opt_a, c=$opt_c, d=$opt_d, h=$opt_h, l=$opt_l, ", "m=$opt_m, o=$opt_o, s=$opt_s, t=$opt_t, v=$opt_v, w=$opt_w\n" if $opt_t; # configuration $ARCHIVE = "/home/rf/tex/archive/"; my $parser = new XML::DOM::Parser; open LOG, ">>/tmp/cat-links.log"; # have we read the "errant url" list? $loaded_errant_urls = 0; # break html line, for every line, if html output $newline = $opt_h ? "
\n" : "\n"; # read the authors file my $auth = $parser->parsefile ("entries/authors"); my $authset = $auth->getElementsByTagName ("author"); for ( my $i=0; $i < $authset->getLength; $i++ ) { my $thisauthor = $authset->item($i); my $authorid = $thisauthor->getAttributeNode("id"); my $authorval = $authorid->getValue; $authorlist{$authorval} = $thisauthor; } $auth -> dispose; # start output as html if required print "
check-cat-links scan output
\n" if $opt_h; # now iterate over all the files we've been asked to check $files_processed = 0; $files_found = $#ARGV+1; # dunno what this was supposed to be "for"... # $last_file = ""; foreach $file ( @ARGV ) { # we don't want these to build up... my %doc_links; my $filename = basename ( $file ); my $dirname = dirname ( $file ); my $basedir = substr $dirname, 0, index ( $dirname, "entries/" ); print "*** Checking ", $filename, $newline if $opt_v; my $cat = $parser->parsefile ( $file ) || die "Failed to parse ", $file; # there's almost always only one, but... my $entryset = $cat -> getElementsByTagName ("entry"); my $entry = $entryset->item(0); # check entry id attribute my $entryid = $entry->getAttributeNode("id"); my $idvalue = $entryid->getValue; if ( $idvalue ne substr ($filename, 0, -4) ) { print "File ", $file, " has entry id ", $idvalue, $newline; } # check datestamp and modifier are CVS controlled, if we need warnings if ( $opt_a ) { my $entrydate = $entry->getAttributeNode("datestamp"); if ( $entrydate ) { my $dateval = $entrydate->getValue; if ( $dateval =~ /^\$/ ) { print "File ", $file, " has weird datestamp \"", $dateval, "\"", $newline unless $dateval =~ /^\$Date/; } else { print "File ", $file, " has fixed datestamp", $newline; } } my $entrymodder = $entry->getAttributeNode("modifier"); if ( $entrymodder ) { my $modifier = $entrymodder->getValue; if ( $modifier =~ /^\$/ ) { print "File ", $file, " has weird modifier \"", $modifier, "\"", $newline unless $modifier =~ /^\$Author/; } else { print "File ", $file, " has fixed modifier", $newline; } } } # check authors are in authors file my $hadauthor = 0; my $entryauthors = $cat -> getElementsByTagName ("authorref"); for ( my $i=0; $i < $entryauthors->getLength; $i++ ) { my $authorref = $entryauthors->item($i); my $authorid; unless ( $authorid = $authorref->getAttributeNode("id") ) { print "Author ref has no \"id\" element in file ", $file, $newline; next; } my $authorval = $authorid->getValue; if ( ! exists $authorlist{$authorval} ) { print "Author id \"", $authorval, "\" in file ", $file, " is not valid", $newline; } else { $hadauthor = 1; } } unless ( $hadauthor ) { print "No valid author ref in file ", $file, $newline; } # check home page link, if any my $entryhome = $cat -> getElementsByTagName ("home"); my $nohomes = $entryhome -> getLength; if ( $nohomes > 0 ) { if ( $nohomes > 1 ) { print "More than one home link in file ", $file, $newline; } for ( my $i=0; $i < $nohomes; $i++ ) { my $homeref = $entryhome->item($i); unless ( $homeloc = $homeref->getAttributeNode("href") ) { print "Home reference has no \"href\" element in file ", $file, $newline; next; } else { check_url ( $homeloc->getValue, "Home link" ) if $opt_l; } } } # check license declaration my $entrylicence = $cat -> getElementsByTagName ("license"); my $nolicences = $entrylicence->getLength; print "No license entry in file ", $file, $newline if $nolicences==0; print $nolicences, " licence entries in file ", $file, $newline if $nolicences>1; for ( my $i=0; $i < $nolicences; $i++ ) { my $licenceref = $entrylicence->item($i); my $licenceid; unless ( $licenceid = $licenceref->getAttributeNode("type") ) { print "Licence ref has no \"type\" element in file ", $file, $newline; next; } my $licenceval = $licenceid->getValue; $licenceval = "fdl" if $licenceval eq "gfdl"; if ( ! -f $ARCHIVE."help/Catalogue/licenses.".$licenceval.".html" ) { print "Licence \"", $licenceval, "\" in file ", $file, " is not valid", $newline; } } # check version entry my $entryversion = $cat -> getElementsByTagName ("version"); print "More than one version element in file ", $file, $newline if $entryversion->getLength > 1; for ( my $i=0; $i < $entryversion->getLength; $i++ ) { my $versionref = $entryversion->item($i); my $number=0; my $date=0; # no checks on number itself, just its presence $number = $versionref->getAttributeNode("number"); $date = $versionref->getAttributeNode("date"); print "Version number _and_ date in file ", $file, $newline if $number && $date; print "Neither version number nor date in file ", $file, $newline unless $number || $date; if ( $date ) { my $dateval = $date->getValue; my $dateok = 0; if (index ($dateval, "-") >0 ) { my @dateparts = split (/-/, $dateval); if ( $#dateparts == 2 ) { if ( length ($dateparts [0]) == 4 && length ($dateparts [1]) == 2 && length ($dateparts [2]) == 2 ) { $dateok = 1; } } } print "Invalid version date \"", $dateval, "\" in file ", $file, $newline unless $dateok; } } # check also links my $entryalso = $cat -> getElementsByTagName ("also"); for ( my $i=0; $i < $entryalso->getLength; $i++ ) { my $alsoref = $entryalso->item($i); my $alsoid; unless ( $alsoid = $alsoref->getAttributeNode("refid") ) { print "Also ref has no \"refid\" element in file ", $file, $newline; next; } my $alsoname = $alsoid->getValue; my $alsofilename = $basedir . "entries/" . substr ($alsoname, 0, 1) . "/" . $alsoname . ".xml"; unless ( -f $alsofilename ) { print "\"also\" file ", $alsofilename, " does not exist in file ", $file, $newline; } } ############################################################################ ############################################################################ ### look at categories ############################################################################ ############################################################################ if ( $opt_c ) { my $topics = $cat -> getElementsByTagName ("topics"); for ( my $i=0; $i < $topics->getLength; $i++ ) { my $temp = $topics->item($i); $temp = $temp->toString; $temp =~ s/(.*)<\/topics>/$1/; $temp =~ s/\b//g; my @values = split ",", $temp; if ( $opt_v ) { for ( my $j=0; $j <= $#values; $j++ ) { print $j, ": ", $values[$j], $newline; } } } } ############################################################################ ############################################################################ # check the xref and a links in the description my $desc = $cat -> getElementsByTagName ("description"); print "File ", $filename, " has >1 !", $newline if $desc->getLength > 1; for ( my $i=0; $i < $desc->getLength; $i++ ) { my $scrip = $desc->item($i); my $refset = $scrip -> getElementsByTagName ("xref"); for ( my $j=0; $j < $refset->getLength; $j++ ) { my $xref = $refset->item($j); my $xrefid; unless ( $xrefid = $xref->getAttributeNode("refid") ) { print "Xref has no \"refid\" element in file ", $file, $newline; next; } my $xrefname = $xrefid->getValue; my $xreffilename = $basedir . "entries/" . substr ($xrefname, 0, 1) . "/" . $xrefname . ".xml"; unless ( -f $xreffilename ) { print "\"xref\" file ", $xreffilename, " does not exist ", "in file ", $file, $newline; } } $refset = $scrip -> getElementsByTagName ("a"); for ( my $j=0; $j < $refset->getLength; $j++ ) { my $href = $refset->item($j); my $hrefid; unless ( $hrefid = $href->getAttributeNode("href") ) { print "\"A\" has no \"href\" element in file ", $file, $newline; next; } check_url ( $hrefid->getValue, "Href link") if $opt_l; } } # check the documentation links my $doc = $cat -> getElementsByTagName ("documentation"); for ( my $i=0; $i < $doc->getLength; $i++ ) { my $node = $doc->item($i); my $href = $node->getAttributeNode("href"); my $docval = $href->getValue; if ( exists $doc_links{$docval} ) { print "Doc link \"", $docval, "\" repeated in file ", $file, $newline; } else { $doc_links{$docval} = $i; } if ( $docval =~ "^ctan:" ) { my $path = substr $docval, 5; if ( (substr $path, 0, 1) ne "/" ) { print "CTAN Doc link in ", $file, " (", $path, ") has no leading \"\/\"", $newline; $path = "/". $path; } if ( $path =~ /\/\// ) { print "CTAN Doc link in ", $file, " (", $path, ") has repeated \"\/\"", $newline; $path =~ s/\/\//\//g; } if ( -l $ARCHIVE . $path ) { my $linkval = readlink $ARCHIVE . $path; print "Doc link ", $path, " in file ", $file, " is a link!", $newline if substr ($linkval, 0, 3) eq "../"; } else { print "Doc link in ", $file, " on CTAN (", $path, ") doesn't exist", $newline unless -f $ARCHIVE . $path; } } elsif ( $docval =~ "^http:" || $docval =~ "^https:" ) { if ( $opt_l ) { check_url($docval, "Doc link"); } else { print LOG "Doc link in ", $file, " on web: ", $docval, $newline; } } elsif ( $docval =~ "^ftp:" ) { print LOG "Ftp doc link in ", $file, ": ", $docval, $newline; } else { print "Unknown doc prefix in ", $file, " (", $docval, ")", $newline; } } # for use elsewhere $this_file_path = ""; # now check the ctan path my $ctan = $cat -> getElementsByTagName ("ctan") || die "*** Can't get CTAN elements for ", $file; my $no_ctans = $ctan->getLength; if ( $no_ctans == 0 ) { print LOG "No CTAN link in ", $file, $newline; } else { print LOG $no_ctans, " CTAN links in ", $file, $newline if $no_ctans > 1; for (my $i=0; $i<$no_ctans; $i++) { my $node = $ctan->item($i); my $pathattr = $node->getAttributeNode("path"); my $path = $pathattr->getValue; if ( (substr $path, 0, 1) ne "/" ) { print "CTAN path in file ", $file, " (", " has no leading \"\/\"", $newline; $path = "/" . $path; } if ( (substr $path, -1, 1) eq "/" ) { print "CTAN path in file ", $file, " (", $path, ") has trailing \"\/\"", $newline; $path =~ s/\/$//g; } if ( $path =~ /\/\// ) { print "CTAN path in ", $file, " (", $path, ") has repeated \"\/\"", $newline; $path =~ s/\/\//\//g; } my $fileattr = $node->getAttributeNode("file"); my $isafile = $fileattr ? $fileattr->getValue eq "true" : 0; my $zipattr = $node->getAttributeNode("zip"); my $containing_dir = $isafile ? dirname ( $path ) : $path; if ( -l $ARCHIVE . $containing_dir ) { my $linkval = readlink $ARCHIVE . $containing_dir; if (substr ($linkval, 0, 3) eq "../" && substr ($linkval, -7, 7) ne "tds.zip") { if ( $isafile ) { print "Holding directory ", $containing_dir; } else { print "CTAN path ", $path; } print " in file ", $file, " is a link!", $newline; } } elsif ( -d $ARCHIVE . $path ) { if ( $fileattr && ( $fileattr->getValue eq "true" ) ) { print "File ", $path, " in file ", $file, " is a directory!", $newline; } elsif ( !( $zipattr && ( $zipattr->getValue eq "false" ) )) { print "WARNING: no .zip file for directory $path", " in file ", $file, $newline unless ( -f $ARCHIVE . $path . ".zip" ) || ! $opt_w; } $this_file_path = $path; } else { if ( $fileattr && ( $fileattr->getValue eq "true" ) ) { print "File ", $path, " in file ", $file, " doesn't exist", $newline unless -f $ARCHIVE . $path; } else { print "Directory ", $path, " in file ", $file, " doesn't exist", $newline; } } } } # check distributions and install tree link # miktex -- look for the compressed package my $miktex = $cat -> getElementsByTagName ("miktex"); my $no_miktexs = $miktex->getLength; print $no_miktexs, " MikTeX links in file ", $file, $newline if $no_miktexs > 1; for (my $i=0; $i<$no_miktexs; $i++) { my $node = $miktex->item($i); if ( my $locattr = $node->getAttributeNode("location") ) { my $miktexfile = $ARCHIVE . "systems/win32/miktex/tm/packages/" . $locattr -> getValue; print "No file at MikTeX location ", $locattr -> getValue, " in file ", $file, $newline unless -f $miktexfile . ".cab" || -f $miktexfile . ".tar.bz2" || -f $miktexfile . ".tar.lzma"; } else { print "No location in MikTeX tag in file ", $file, $newline; } } if ( $no_miktexs == 0 && $opt_m ) { my $basefile = basename( $file, ".xml" ); my $miktexfile = $ARCHIVE . "systems/win32/miktex/tm/packages/" . $basefile; my $potential = ""; $potential = $basefile . ".cab" if -f ($miktexfile . ".cab"); $potential = $basefile . ".tar.bz2" if -f ($miktexfile . ".tar.bz2"); $potential = $basefile . ".tar.lzma" if -f ($miktexfile . ".tar.lzma"); print "*** Does MiKTeX ", $potential, " help in file ", $file, "?", $newline if $potential; } # texlive -- look for .tar.lzma file (now we tl2008) my $texlive = $cat -> getElementsByTagName ("texlive"); my $no_texlives = $texlive->getLength; print $no_texlives, " TeXlive links in file ", $file, $newline if $no_texlives > 1; my $potential_tl = basename( $file, ".xml") . ".tar.lzma"; my $potential_path = $ARCHIVE . "systems/texlive/tlnet/2008/archive/" . $potential_tl; #print $file, ": ", $potential_tl, ", ", # $potential_path, "\n"; for (my $i=0; $i<$no_texlives; $i++) { my $node = $texlive->item($i); if ( my $locattr = $node->getAttributeNode("location") ) { my $gotatar = 0; my $tarfilename = $locattr->getValue; print "No TeXLive tar file ", $tarfilename, ".tar.lzma in file ", $file, $newline unless -f $ARCHIVE . "systems/texlive/tlnet/2008/archive/" . $tarfilename . ".tar.lzma"; } else { print "No location in TeXlive tag in file ", $file, $newline; } } if ( $no_texlives == 0 && $opt_m ) { my $basefile = basename( $file, ".xml" ) . ".tar.lzma"; my $potential_path = $ARCHIVE . "systems/texlive/tlnet/2008/archive/" . $basefile; my $potential = -f $potential_path ? $basefile : ""; #print $file, ": ", $basefile, ", ", $potential, "\n"; print "*** Does TeX live ", $potential, " help in file ", $file, "?", $newline if $potential; } # install tree -- check that quoted path is present my $inst_tree = $cat -> getElementsByTagName ("install"); my $no_inst_trees = $inst_tree->getLength; print $no_inst_trees, " install tree links in file ", $file, $newline if $no_inst_trees > 1; for (my $i=0; $i<$no_inst_trees; $i++) { my $node = $inst_tree->item($i); if ( my $pathattr = $node->getAttributeNode("path") ) { my $instfilename = $pathattr->getValue; unless ( $instfilename =~ /^\// ) { print "No leading \"/\" in install path in file ", $file, $newline; $instfilename = "/" . $instfilename; } print "Repeated \"/\" in install path in file ", $file, $newline if $instfilename =~ /\/\//; print "No install file ", "/install" . $instfilename, " in file ", $file, $newline unless -f $ARCHIVE . "/install" . $instfilename; } else { # note that rainer's code defaults this case... print "No install path tag in file ", $file, $newline; } } if ( $no_inst_trees == 0 && $opt_m ) { my $basefile = basename( $this_file_path, ".xml" ) . ".tds.zip"; # untidy exception here... print "*** Does ", $basefile, " help in file ", $file, "?", $newline if $this_file_path ne "/language/japanese/ptex" && -f $ARCHIVE . "install" . $this_file_path . ".tds.zip"; } $files_processed++; # print copy file #$cat -> printToFile ("./copy.xml"); $cat -> dispose; } print $files_processed, " file(s) processed of ", $files_found, $newline; close LOG; ########################################################### sub usage { print STDERR "Usage: $0 [-acdlmsvw] file path(s) -a: check entry modifiers cvs-controlled -c: check category entry (experimental) -d: check distribution tags -h: make output html-like -l: check web links are live -m: suggest miktex/tex-live links -o: offline; as -s but not -l -s: \"standard set\" of options -- equivalent to -adlmvw just now -t: testing -v: witter more than otherwise -w: warn if ctan directory link has no .zip "; } sub check_url { if ( ! $loaded_errant_urls ) { open EURL, "redirected-hosts" || return; while ( ) { chomp; $errant_url{$_} = 1; } close EURL; $loaded_errant_urls = 1; } $thisurl = $_[0]; if ( substr ( $thisurl, 0, 5 ) eq "http:" || substr ( $thisurl, 0, 6 ) eq "https:" ) { my $domain = $thisurl; $domain =~ s/^.*\/\///; $domain =~ s/\/.*$//; if ( exists $errant_url{ $domain } ) { print LOG "Checking URL ", $thisurl, " skipped in file ", $file, "\n"; } else { unless ( head($thisurl) ) { if ( $opt_h ) { print "*** ", $_[1], " in ", $file, " (", $thisurl, ""; } else { print "*** ", $_[1], " in ", $file, " (", $thisurl; } print ") not responding", $newline; } } } elsif ( substr ( $thisurl, 0, 4 ) eq "ftp:" ) { print LOG "ftp home link in ", $file, "\n"; } else { print "Unknown type of home link in ", $file, $newline; } }