#!/usr/bin/perl
#
# classparse - extract (id, name) records from an HTML class roster.
#
# Usage:
#     classparse input_html_file
#
# One record per line is written to standard output as
#     id <TAB> name
# Progress messages and parsing warnings go to standard error.
#
# How it works: the input is read line by line with a two-state scan.
#   1. While no name is at hand, the line is split on $name_delimiter.
#      If that yields exactly two pieces, the second piece (minus the
#      first "<...>" run of HTML in it) becomes the current name.
#   2. While a name is at hand, the very next line is split on
#      $ssn_delimiter.  Exactly two pieces means an id was found and a
#      record is printed; anything else only produces a warning.  Either
#      way the name is dropped afterwards, so a name is never paired with
#      a line further away than the one immediately following it.
#
# History kept from the original header:
#   - The "VOLATILE" name-delimiter spot was fixed (MOD Jan 2004) to stay
#     stable from one semester to another.
#   - MOD 3.1: the held name is reset after every candidate id line.

use strict;
use warnings;

############### DELIMITER PANEL (customize for your roster HTML) ###############
# NOTE(review): in this copy of the script the name delimiter survives only
# as the empty pattern and the id delimiter only as a bare "-".  The
# surrounding comments ("split at substrings of the form: ...") indicate the
# originals contained literal HTML that is no longer present here.  Both
# patterns are kept exactly as found; an empty pattern splits a line into
# single characters, so FIXME: restore the real delimiters from a sample
# roster file before relying on the output.
my $name_delimiter = qr//;     # text that immediately precedes the name
my $ssn_delimiter  = qr/-/;    # text that immediately precedes the id
############################ END DELIMITER PANEL ##############################

# --- command line -----------------------------------------------------------
# Exactly one argument is required.  The usage path exits with status 0, as
# the script always has, so existing wrappers see no change.
if (@ARGV != 1) {
    print STDERR "Need one argument.\n",
        "Usage:\n classparse input_html_file\n (TAB delimited output goes to the standard output device, typically the screen)\n";
    exit 0;
}

my $input = $ARGV[0];
print STDERR "Extracting records from ", $input, "\n";

# Three-argument open with a lexical handle: the file name is never
# interpreted as an open mode or a command.
open(my $datafile, '<', $input)
    or die("cannot open the html data file: $!");

# --- scan -------------------------------------------------------------------
my $name    = "";    # name found on the previous line; "" means none at hand
my $counter = 0;     # number of records written so far

LINE:
while (my $line = <$datafile>) {
    chomp $line;

    if ($name eq "") {
        # State 1: look for a name on this line.
        my @name_tokens = split $name_delimiter, $line;

        # Exactly two pieces: (usually empty) text before the delimiter,
        # then the name with some HTML trailing it.
        if ($#name_tokens == 1) {
            $name = $name_tokens[1];
            $name =~ s/<.+>//;    # drop the first "<" through the last ">"
        }

        # Whether or not a name was found, this line is finished; an id is
        # only ever looked for on the line after the name.
        next LINE;
    }

    # State 2: a name is at hand, so this line should carry the id.
    my @ssn_tokens = split $ssn_delimiter, $line;

    if ($#ssn_tokens > 1) {
        print STDERR "possible ssn parsing error: too many tokens \n";
    }
    if ($#ssn_tokens < 1) {
        # Not an id line after all; harmless, so only warn.
        print STDERR "parsing warning: no ssn matches Name =", $name, " (do not worry)\n";
    }
    if ($#ssn_tokens == 1) {
        my $ssn = $ssn_tokens[1];
        $ssn =~ s/<.+>//;         # drop the first "<" through the last ">"

        ############### OUTPUT PANEL (customize if you need comma delimited etc.) ###############
        # Keep exactly one of the lines below active.
        print($ssn, "\t", $name, "\n");      # id   TAB   name  NEWLINE
        # print($name, "\t", $ssn, "\n");    # name TAB   id    NEWLINE
        # print($name, ",", $ssn, "\n");     # name COMMA id    NEWLINE
        ################################## END OUTPUT PANEL #####################################

        ++$counter;
    }

    # MOD 3.1: never hold a name for more than one following line.
    $name = "";
}

close($datafile);

print STDERR "Done! Extracted ", $counter, " records\n ";