#!/local/all/perl
# Change the line above to match the location of perl on your system.
#
# SNoW Tuner
#
# Runs "snow -train" once for every combination (or for a random sample of
# the combinations) of the '/'-delimited parameter settings given on the
# command line, keeps the network that classified the most test examples
# correctly, and prints the winning parameter string on STDOUT.
#
# Implementation notes:
#   * Option values are stored in package globals ($alpha, $beta, ...) on
#     purpose: the Getopt::Declare actions write them as $::name and the
#     code below expands them through symbolic references (@$name / $$name).
#     For that reason this file cannot run under "use strict 'refs'".
#   * Inside the Getopt::Declare specification, TAB characters are
#     significant: a parameter definition is separated from its description
#     by a tab, and description continuation lines start with a tab.  Do not
#     convert those tabs to spaces.
#   * NOTE(review): the <placeholder> names in the specification were
#     reconstructed from the variable names used by the action blocks
#     (e.g. "$::workingDirectory = $directory" implies "-working <directory>").
#     Confirm against the original SNoW distribution.

# The PERL extension Getopt::Declare was written by Damian Conway, and is
# available at http://www.cpan.org. Type 'perldoc Declare.pm' for more
# information. I modified it (just slightly), and the modified version has
# been included in this directory. Hence, the line below. Change the path to
# the absolute path where you unpacked SNoW.
BEGIN { unshift @INC, "$ENV{SNOW_HOME}/tutorial"; }

use IO::Socket;
use Getopt::Declare;

$Text::Tabs::tabstop = 2;

$PROPER_NAME = "SNoW Tuner";
$VERSION = "1.0";

# "/" is the sentinel meaning "no -test file was given".
$testFile = "/";
$workingDirectory = ".";
$samples = 0;

# Default suites of settings; each is a '/'-delimited list.
$alpha = "1.35/1.1/2";
$beta = "0.8/0.9/0.5";
$wTheta = "4";
$wWeight = "1";
$rate = "0.1/0.001/0.0001/0.25";
$pTheta = "4";
$pWeight = "0";
$thickSeparator = "0/.5/1/1.5/2/3";

# Maps the name of each tunable setting (also the name of the global that
# holds its '/'-delimited value list) to the snow command line flag it is
# passed with.  "-W" and "-P" settings are folded into the architecture
# argument instead of being passed as separate flags.
%parameters =
(
  alpha             => "-W",
  beta              => "-W",
  wTheta            => "-W",
  wWeight           => "-W",
  rate              => "-P",
  pTheta            => "-P",
  pWeight           => "-P",
  bayesSmooth       => "-b",
  discard           => "-d",
  eligibility       => "-e",
  fixedFeature      => "-f",
  gradientDescent   => "-G",
  conjunctions      => "-g",
  multipleLabels    => "-m",
  prediction        => "-p",
  rounds            => "-r",
  thickSeparator    => "-S",
  sparse            => "-s",
  thresholdRelative => "-t",
  firstRoundUpdate  => "-u",
  winnowSmooth      => "-w",
  rawMode           => "-z",
);

# True until the first tuning option is seen; that option's action then
# shortens the other default suites to their first value and clears the flag.
$defaults = 1;

die "Fatal errors encountered. Exiting...\n"
  unless Getopt::Declare->new(q(
[strict]
This script will automatically train and test SNoW using a variety of
parameter settings. The result is a network that has been tuned to the
training and/or testing data. In addition to creating that network, the
parameters that yielded the best results are sent to STDOUT at the script's
completion.

Every command line parameter below, except for -arch, -train, -test,
-sample, -working, and -verbose should be followed by a '/'-delimited list
of settings acceptable for the SNoW command line parameter of the same name.
If none of these SNoW parameters are specified, a default suite of
parameters is used instead. Each parameter's default, if any, is mentioned
in the description of the parameter below. If any command line parameter
below other than -arch, -train, -test, -sample, -working, and -verbose is
specified, the defaults of all other parameters are shortened so that they
only contain their first values. In normal operation, this script will then
exhaustively try every combination of the provided parameters. See the
-sample parameter for an alternative to this exhaustive search.

Be aware that this script will create and write to files named "network",
"train", and "test", as well as a file whose name is the same as that
specified after the -train parameter with ".net" appended to it. If any of
these files already exists, it will be overwritten. See the -working
parameter to set an alternate working directory for this script.

----------------------------------------------------------------------------
Required options:

-arch <architecture>	Specifies the learning architectures (i.e., which of
	Winnow, Perceptron, and Naive Bayes should be used to learn which
	targets) to evaluate. Each element of this list should itself be a
	space delimited list of the following form: "W:0-4 P:5,6" or "W:0-6".
	Don't forget to include double quotes around this entire argument if
	you use spaces.
	[required]
  { $::architecture = $architecture; }

-train <trainingFile>	Specifies the file containing the training examples
	used to create the network. If -test is not also specified, only 80%
	of these examples will actually be used for training. Otherwise, the
	entire file will be used. It is assumed that this file contains one
	example per line.
	[required]
  { $::trainingFile = $trainingFile; }

----------------------------------------------------------------------------
Additional options:

-test <testFile>	Specifies the file containing the labeled examples used
	to evaluate the performance of the network. If this parameter is
	omitted, the last 20% of the training examples will be used for this
	purpose instead.
  { $::testFile = $testFile; }

-sample <samples>	Specifies the number of parameter combinations to
	evaluate. If unspecified or if set to 0, all parameter combinations
	are evaluated. Otherwise, the specified number of parameter
	combinations are selected at random. Default 0.
  { $::samples = $samples; }

-working <directory>	Specifies the path to a safe directory for this
	script to create its files in. Default ".".
  { $::workingDirectory = $directory; }

-verbose	Produce progress information.
  { $::verbose = 1; }

-alpha <alpha>	Possible settings for Winnow's promotion parameter.
	Default "1.35/1.1/2".
  {
    $::alpha = $alpha;
    if ($::defaults)
    {
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-beta <beta>	Possible settings for Winnow's demotion parameter.
	Default "0.8/0.9/0.5".
  {
    $::beta = $beta;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-wTheta <wTheta>	Possible settings for Winnow's threshold parameter.
	Default "4".
  {
    $::wTheta = $wTheta;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-wWeight <wWeight>	Possible settings for Winnow's initial feature weight
	parameter. Default "1".
  {
    $::wWeight = $wWeight;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-rate <rate>	Possible settings for Perceptron's learning rate parameter.
	Default "0.1/0.001/0.0001/0.25".
  {
    $::rate = $rate;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::defaults = 0;
    }
  }

-pTheta <pTheta>	Possible settings for Perceptron's threshold parameter.
	Default "4".
  {
    $::pTheta = $pTheta;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-pWeight <pWeight>	Possible settings for Perceptron's initial feature
	weight parameter. Default "0".
  {
    $::pWeight = $pWeight;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-b <bayesSmooth>	Possible settings for smoothing Naive Bayes.
  {
    $::bayesSmooth = $bayesSmooth;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-d <discard>	Possible settings for the discard method.
  {
    $::discard = $discard;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-e <eligibility>	Possible settings for the eligibility method.
  {
    $::eligibility = $eligibility;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-f <fixedFeature>	Possible settings for the fixed feature.
  {
    $::fixedFeature = $fixedFeature;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-G <gradientDescent>	Possible settings for gradient descent.
  {
    $::gradientDescent = $gradientDescent;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-g <conjunctions>	Possible settings for the generation of conjunctions.
  {
    $::conjunctions = $conjunctions;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-m <multipleLabels>	Possible settings for the multiple labels flag.
  {
    $::multipleLabels = $multipleLabels;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-p <prediction>	Possible settings for the prediction threshold.
  {
    $::prediction = $prediction;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-r <rounds>	Possible settings for the number of training rounds.
  {
    $::rounds = $rounds;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-S <thickSeparator>	Possible settings for the thick separator.
	Default "0/.5/1/1.5/2/3".
  {
    $::thickSeparator = $thickSeparator;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-s <sparse>	Possible settings for network sparsity.
  {
    $::sparse = $sparse;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-t <thresholdRelative>	Possible settings for threshold relative updating.
  {
    $::thresholdRelative = $thresholdRelative;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-u <firstRoundUpdate>	Possible settings for updating during the first
	round of training.
  {
    $::firstRoundUpdate = $firstRoundUpdate;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-w <winnowSmooth>	Possible settings for smoothing winnow and perceptron.
  {
    $::winnowSmooth = $winnowSmooth;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }

-z <rawMode>	Possible settings for the raw mode flag.
  {
    $::rawMode = $rawMode;
    if ($::defaults)
    {
      $::alpha = "1.35";
      $::beta = "0.8";
      $::thickSeparator = "";
      $::rate = "0.1";
      $::defaults = 0;
    }
  }
));

# ---------------------------------------------------------------------------
# Prepare the working copies of the training and testing data.
# ---------------------------------------------------------------------------
if ($testFile eq "/")
{
  # No -test file: the first 80% of the training examples become the working
  # training set and the remainder become the working test set.
  open my $in, '<', $trainingFile
    or die "Can't open $trainingFile for input: $!";
  @train = <$in>;
  close $in;

  open my $out, '>', "$workingDirectory/train"
    or die "Can't open $workingDirectory/train for output: $!";
  for ($i = 0; $i < 0.8 * @train; $i++) { print $out $train[$i]; }
  close $out;

  # $i deliberately carries over: the test set starts where training stopped.
  open $out, '>', "$workingDirectory/test"
    or die "Can't open $workingDirectory/test for output: $!";
  for (; $i < @train; $i++) { print $out $train[$i]; }
  close $out;
}
else
{
  # List-form system() keeps file names with spaces or shell metacharacters
  # from being reinterpreted by a shell.
  system("cp", $trainingFile, "$workingDirectory/train") == 0
    or die "Can't copy $trainingFile to $workingDirectory/train\n";
  system("cp", $testFile, "$workingDirectory/test") == 0
    or die "Can't copy $testFile to $workingDirectory/test\n";
}

$best = -1;
$bestParams = "";
# Must be computed before $trainingFile is redirected to the working copy.
$bestNetwork = "$trainingFile.net";
$trainingFile = "$workingDirectory/train";
$testFile = "$workingDirectory/test";
$networkFile = "$workingDirectory/network";

# Expand every '/'-delimited setting string into the array of the same name
# (e.g. $alpha = "1.35/1.1/2" becomes @alpha = (1.35, 1.1, 2)).
for my $name (keys %parameters) { @$name = split m{/}, $$name; }

# Remember what the user asked for so that every architecture gets the same
# sampling budget; increment() counts $samples down as it goes.
$requestedSamples = $samples;

# ---------------------------------------------------------------------------
# Search: one pass per architecture.
# ---------------------------------------------------------------------------
for $arch (split m{/}, $architecture)
{
  # @params holds one array of candidate values per '$' placeholder in
  # $commandLine; @indices holds the currently selected position in each.
  @params = ();
  @indices = ();

  # Each Winnow / Perceptron algorithm in the architecture contributes its
  # own group of placeholders, in the order they appear.
  while ($arch =~ /([WP]):/g)
  {
    my @group = $1 eq "W"
      ? (\@alpha, \@beta, \@wTheta, \@wWeight)
      : (\@rate, \@pTheta, \@pWeight);
    for my $values (@group)
    {
      push @params, [ @$values ];
      push @indices, 0;
    }
  }

  # Turn "W:0-4 P:5,6" into "-W $,$,$,$:0-4 -P $,$,$:5,6".
  $arch =~ s/ / -/g;
  $arch =~ s/W:/W \$,\$,\$,\$:/g;
  $arch =~ s/P:/P \$,\$,\$:/g;
  $commandLine = "-$arch";

  # Every other setting that has a (true) value gets its own flag.
  for $p (keys %parameters)
  {
    next if ($parameters{$p} eq "-W" || $parameters{$p} eq "-P" || !$$p);
    push @params, [ @$p ];
    push @indices, 0;
    $commandLine .= " $parameters{$p} \$";
  }

  # A negative $samples selects exhaustive search in increment(); otherwise
  # it is the number of further random samples to draw after the first
  # evaluation.  $TOTAL caches the number of combinations for this
  # architecture, so it has to be recomputed for each one.
  $samples = $requestedSamples - 1;
  $TOTAL = 0;

  do
  {
    # Fill the placeholders, left to right, with the current selection.
    my @values = map { $params[$_][ $indices[$_] ] } 0 .. $#params;
    ($c = $commandLine) =~ s/\$/shift @values/ge;

    $command =
      "snow -train -I $trainingFile -F $networkFile -T $testFile $c";
    print "$c" if $verbose;
    @output = `$command`;

    if ($output[$#output] !~ /^Overall/)
    {
      print "\n" if $verbose;
      print STDERR "\n *** Problem with parameters: $c\n";
      print STDERR @output;
    }
    else
    {
      # The last line of output is expected to contain "(correct / total)".
      ($current, $total) = ($output[$#output] =~ /\((\d+) \/ (\d+)\)/);
      print " --> ($current / $total)\n" if $verbose;

      if ($current > $best)
      {
        $best = $current;
        $bestParams = $c;
        system("mv", $networkFile, $bestNetwork) == 0
          or warn "Can't move $networkFile to $bestNetwork\n";
      }
    }
  } while (increment());
}

# Remove the working files; files that are already gone are not an error.
unlink $networkFile, $trainingFile, $testFile;

if ($best == -1)
{
  print STDERR "None of your parameter combinations produced a network.\n";
  exit;
}

print "$bestParams\n";

# increment()
#
# Advances the global @indices to the next parameter combination to try.
#
# Reads:   @params, @indices, $samples, $TOTAL
# Writes:  @indices, $samples, $TOTAL
# Returns: a true value if @indices now selects a combination that should be
#          evaluated, a false value when the search is finished.
#
# When $samples is negative the combinations are enumerated exhaustively,
# treating @indices as a mixed-radix counter.  Otherwise @indices is advanced
# by a random number of steps (wrapping around) and $samples is counted down.
sub increment
{
  # With nothing to vary there is exactly one combination, and it has
  # already been evaluated.
  return 0 unless @indices;

  if ($samples < 0)
  {
    # Exhaustive mode: reset every position that is at its maximum, then
    # bump the first one that is not.
    my $pos = 0;
    while ($pos < @indices && $indices[$pos] == $#{ $params[$pos] })
    {
      $indices[$pos++] = 0;
    }

    return 0 if $pos == @indices;     # every position wrapped: done
    ++$indices[$pos];
    return 1;
  }

  # Sampling mode.  Compute the number of combinations on first use.
  if ($TOTAL == 0)
  {
    $TOTAL = 1;
    $TOTAL *= @$_ for @params;
  }

  # Step forward by 1 .. $TOTAL - 1 combinations, then propagate carries.
  $indices[0] += int(rand($TOTAL - 1)) + 1;

  my $pos;
  for ($pos = 1;
       $pos < @indices && $indices[$pos - 1] >= @{ $params[$pos - 1] };
       $pos++)
  {
    $indices[$pos] += int($indices[$pos - 1] / @{ $params[$pos - 1] });
    $indices[$pos - 1] %= @{ $params[$pos - 1] };
  }

  # Wrap the most significant position that was touched.
  $indices[$pos - 1] %= @{ $params[$pos - 1] };

  # Post-decrement: true while samples remain, false on the last one.
  return $samples--;
}