#!/usr/bin/perl -w
#copy subset of big directory to local directory
#for each file
# for each sentence:
# fix errors with wordCOMMA -- some simple book-keeping
# rewrite sentence to file, store sentence
# add
# call charniak
# print output to another file
#close files
#copy new files to main directory
#use IPC::Open2;
#use Symbol;
#sub BEGIN {
# unshift @INC, "$ENV{IO_TTY_LIB}";
# unshift @INC, "$ENV{EXPECTLIB}";
#}
use Expect;
use strict;
my $MAX_SENT_LEN = 799;
my $MAX_NUM_WORDS = 100;
my $TIME_OUT = 60;
my $END_PROTOCOL = "\n";
my $DEBUG_ME = 1;
my $RUN_ON_CLUSTER = 1;
my $TEST_FILE_SELECTION_ONLY = 0;
my $PRINT_CHK_SENTENCE = 1;
die "Usage: ./processAcquaintData.pl inputDir outputDir numFilesPerChunk fileSubsetIndex"
unless @ARGV == 4;
my $pathToCharniak = "/home/roth/mssammon/softwareDownloads/charniak_parser/parser05May26fixed";
if(1 == $RUN_ON_CLUSTER)
{
$pathToCharniak = "/home/roth/mssammon/parser05May26fixed";
}
my $pathToScratch = "/home/roth/mssammon/ACQUAINT/tmp/";
if(1 == $RUN_ON_CLUSTER)
{
$pathToScratch = "/scratch/";
}
#contents of each column
my $SENT_INDEX_COL = 0;
my $WORD_INDEX_COL = 1;
my $WORD_COL = 2;
my $POS_COL = 3;
my $PHR_COL = 5;
my $NE_COL = 6;
#empty column (always "O") at col 4
my $inputDir = $ARGV[0]."/";
my $outputDir = $ARGV[1]."/";
my $NUM_FILES_PER_CHUNK = $ARGV[2];
my $fileSubsetIndex = $ARGV[3];
my $logFileName = "subset$fileSubsetIndex.log";
#create local directory structure; must be unique to process
# -- local storage reduces time copying across network
my $localInputMainDir = $pathToScratch."in/";
my $localOutputMainDir = $pathToScratch."out/";
if(1 == $DEBUG_ME)
{
print STDERR "## creating local directories $localInputMainDir, $localOutputMainDir...\n";
}
system "mkdir $localInputMainDir $localOutputMainDir";
my $localInputDir = "$localInputMainDir$fileSubsetIndex/";
my $localOutputDir = "$localOutputMainDir$fileSubsetIndex/";
if(1 == $DEBUG_ME)
{
print STDERR "## creating local subdirectories $localInputDir, $localOutputDir...\n";
}
system "mkdir $localInputDir $localOutputDir";
my $command = "$pathToCharniak/PARSE/parseIt $pathToCharniak/DATA/EN/ -K -l$MAX_NUM_WORDS";
my @fileList;
#create main program that will be communicating throught pipe.
my $main = NewExpect($command);
sub NewExpect {
my $command = shift;
my $main;
$main = new Expect();
$main->raw_pty(1); # no local echo
$main->log_stdout(0); # no echo
$main->spawn($command) or die "Cannot start: $command\n";
return $main;
}
#determine a subset of files to run using fileSubsetIndex
if(1 == $DEBUG_ME)
{
print STDERR "##getting file subset...\n";
}
&getFileSubset;
#open log file
open (LOG, ">$localOutputDir$logFileName") or die "Can't open log file: $!\n";
print LOG "processing files for subset $fileSubsetIndex...\n";
#for each file
#read col format file
if(1 == $DEBUG_ME)
{
print STDOUT "##processing file list... list is @fileList...\n";
}
my $fileNum = 0;
my $hadError = 0;
foreach my $file (@fileList)
{
my @sentenceList;
my $listRef = \@sentenceList;
my $sentence;
chomp $file;
my $inFile = "$localInputDir$file";
print LOG "\nprocessing file $inFile...\n";
# my $correctOutFile = "$outputDir/$file.corrected";
my $correctOutFile = $localOutputDir.$file.".corrected";
my $outputFile = $localOutputDir.$file.".chk";
if(1 == $DEBUG_ME)
{
print "##processing next file: $inFile...\n";
print "##printing corrected input to $correctOutFile, chk output to $outputFile...\n";
}
#process file reads in set of sentences from file and fixes a basic chunking error
#numSentences tracks the number of sentences in the original file
#numProcessed tracks the number of sentences processed by Charniak
# -- these are compared to ensure that we have output for each original sentence
my $numSentences = &processFile($inFile, $correctOutFile, $listRef);
my $numProcessed = 0;
my $nonNullSentence = 0;
open (OUT, ">$outputFile") or die "Can't open output file $outputFile: $!\n";
foreach $sentence (@sentenceList)
{
$nonNullSentence = 0;
$sentence = " $sentence \n";
# print STDERR "##sentence to chk is $sentence...\n";
print "##sentence to chk is $sentence...\n";
my @res;
$main->send("$sentence"); #send text to main program
@res = $main->expect($TIME_OUT, $END_PROTOCOL); #get output from main program
my $timeout = $res[1];
my $chkOutput = $res[3];
my $sawOneNL = 0;
my $finishedSentence = 0;
if($timeout)
{
print "## Timed out. Restarting parser.\n";
print OUT "Parse failed: timed out.\n\n\n";
$main->hard_close();
$main = NewExpect($command);
}
else
{
if($chkOutput =~ m/^Parse failed/)
{
print STDERR "WARNING: PARSE FAILED in file $inFile on sentence # $numProcessed. Printing out current line...\n";
print STDERR $chkOutput;
print OUT "Parse failed on sentence # $numProcessed (detected by parser itself)\n";
@res = $main->expect($TIME_OUT,$END_PROTOCOL); # read off the original sentence
$timeout = $res[1];
if ($timeout) { # parser possibly gets stuck, restart it.
print "##Time out when reading off the original sentence!\n";
print "##Restart parser\n";
$main->hard_close();
$main = NewExpect($command);
}
print OUT "\n\n";
}
elsif($chkOutput =~ m/^error/)
{
print STDERR "WARNING: PARSE FAILED in file $inFile on sentence # $numProcessed.\n";
print STDOUT "##Parser died on sentence # $numProcessed. Restarting parser...\n";
print STDERR "##Parser died. Restarting parser...\n";
$main->hard_close();
$main = NewExpect($command);
print OUT "Parse Failed on sentence # $numProcessed (error: $chkOutput)\n";
print OUT "\n\n";
} # end if /error/
elsif($chkOutput =~ m/^parseIt/)
{
print STDOUT "## Parser died on sentence # $numProcessed. Restarting parser...\n";
print STDERR "## Parser died on sentence # $numProcessed. Restarting parser...\n";
$main->hard_close();
$main = NewExpect($command);
print OUT "Parse failed (Assertion error)\n";;
print OUT "\n\n";
}
else
{
print STDOUT "## Parse ok\n";
my $output = "$chkOutput\n";
my $numBlank = 0;
if ($output =~ /^\s*$/) { $numBlank = 1; }
else { $numBlank = 0; }
#normal output should end with 2 blank lines.
while ($numBlank < 2) {
@res = $main->expect($TIME_OUT,$END_PROTOCOL); # read output from main program
$timeout = $res[1];
$chkOutput = $res[3];
if ($timeout) { # parser possibly gets stuck, restart it.
print STDOUT "ERROR: Time out waiting for output on sentence # $numProcessed.\n";
print STDERR "ERROR: Time out waiting for output on sentence # $numProcessed.\n";
$output = "\n\n"; # output blank
print "Restart parser\n";
$main->hard_close();
$main = NewExpect($command);
last;
} else {
$output .= "$chkOutput\n";
if ($chkOutput =~ /^\s*$/) { $numBlank++; }
else { $numBlank = 0; }
}
}
#must be real sentence entry
$nonNullSentence = 1;
print OUT $output;
}
} #else charniak parses sentence
$numProcessed++;
print OUT "\n\n";
if(0 == $nonNullSentence)
{
#print error message to log: no real output to sentence
print LOG "ERROR: null sentence for sentence number $numProcessed.\n\n";
}
}
if( $numSentences != $numProcessed )
{
#print error message to log: mismatched number of sentences out/sentences in
print LOG "ERROR: mismatched number of sentences for file index $fileNum: \n";
print LOG " Number of Sentences: $numSentences\n";
print LOG " Number Processed: $numProcessed\n\n";
}
close OUT;
$fileNum++;
}
print LOG "Processing complete.\n";
close LOG;
$main->hard_close();
my $finalOutputDir = $outputDir;
system "cp $localOutputDir* $finalOutputDir";
system "rm $localInputDir $localOutputDir";
#copy subset of files to local /scratch/ directory
# -- minimize network calls at the cost of upfront
# copy time
sub getFileSubset
{
my @tmpList = `ls $inputDir`;
# @fileList = `ls $inputDir`;
#need about 240 files per process, so, given input number, get that
# set of 240 files
my $startIndex = $fileSubsetIndex * $NUM_FILES_PER_CHUNK;
my $endIndex = $startIndex + $NUM_FILES_PER_CHUNK;
if($startIndex > $#tmpList)
{
print STDERR "No files in range. Exiting.\n";
exit();
}
for(my $i = $startIndex; $i < $endIndex; $i++)
{
if(defined($tmpList[$i]))
{
chomp $tmpList[$i];
print "copying file $i ($tmpList[$i])...\n\n";
my $source = $inputDir.$tmpList[$i];
my $sink = $localInputDir.$tmpList[$i];
if($DEBUG_ME)
{
print STDERR "copying file $i ($tmpList[$i]) -- source is $source; sink is $sink.\n";
}
`cp $source $sink`;
push @fileList, $tmpList[$i];
}
}
if(1 == $TEST_FILE_SELECTION_ONLY)
{
foreach my $entry (@fileList)
{
print $entry."\n";
}
print "End Test.\n";
exit();
}
}
sub processFile
{
my $file = $_[0];
my $correctOut = $_[1];
my $sentenceListRef = $_[2];
my $numSentences = 0;
open (IN, "<$file") or die "Can't open file $file for input: $!\n";
open (CORR_OUT, ">$correctOut") or die "Can't open file $correctOut for output: $!\n";
my $processedLastSentence = 1;
my $currentSentence = ""; #to be sent to charniak parser
my $currentSentenceLength = 0; #avoid buffer overrun in charniak parser
my $numWords = 0; #ditto
my @fixedSentTable; #to be printed to CORR_OUT
my $index = 0;
while()
{
#look for separating empty lines
if($_ !~ m/^\s+$/)
{
chomp $_;
if(1 == $processedLastSentence)
{
$processedLastSentence = 0;
}
#check for wordCOMMA
my @row = split /\s+/, $_;
#set index, as if any wordCOMMAs found, will be incorrect
$row[$WORD_INDEX_COL] = $index;
my $word = $row[$WORD_COL];
if($word =~ m/(\w+)COMMA$/)
{
#make two lines: one for word, other for COMMA
my $word = $1;
$row[$WORD_COL] = $word;
my $line = join "\t", @row;
push @fixedSentTable, $line;
$index++;
# my $commaLine = "$row[$SENT_INDEX_COL]\t$index\tCOMMA\tCOMMA\tO\tO\tO";
my $commaLine = "$row[$SENT_INDEX_COL]\t$index\t\,\t,\tO\tO\tO";
push @fixedSentTable, $commaLine;
# $currentSentenceLength += length(" $word COMMA");
$currentSentenceLength += length(" $word ,");
$numWords += 2;
if($MAX_SENT_LEN > $currentSentenceLength && $MAX_NUM_WORDS > $numWords)
{
$word = &checkWord($word);
# $currentSentence .= " $word COMMA";
$currentSentence .= " $word ,";
}
}
elsif(m/(\w+)_(\w+)/) #two words, separated by underscore
{
#make two lines: one for each word, other for COMMA
my $firstWord = $1;
my $secondWord = $2;
$row[$WORD_COL] = $firstWord;
my $line = join "\t", @row;
push @fixedSentTable, $line;
$index++;
#assume second word is NN, B-NP, B-Peop (usually happens with batting averages)
my $extraLine = "$row[$SENT_INDEX_COL]\t$index\t$secondWord\tNNP\tO\tB-NP\tB-Peop";
push @fixedSentTable, $extraLine;
$currentSentenceLength += length(" $firstWord $secondWord");
$numWords += 2;
if($MAX_SENT_LEN > $currentSentenceLength && $MAX_NUM_WORDS > $numWords)
{
$firstWord = &checkWord($firstWord);
$secondWord = &checkWord($secondWord);
$currentSentence .= " $firstWord $secondWord";
}
}
else
{
push @fixedSentTable, $_;
$currentSentenceLength += length(" $word");
$numWords += 1;
if($MAX_SENT_LEN > $currentSentenceLength && $MAX_NUM_WORDS > $numWords)
{
$word = &checkWord($word);
$currentSentence .= " $word";
}
}
$index++;
}
else
{
if(0 == $processedLastSentence)
{
$processedLastSentence = 1;
$index = 0;
#add new sentence to list
push @{$sentenceListRef}, $currentSentence;
$numSentences++;
$currentSentence = "";
$currentSentenceLength = 0;
$numWords = 0;
#print corrected sentences to CORR_OUT
foreach my $row (@fixedSentTable)
{
chomp $row;
print CORR_OUT "$row\n";
}
print CORR_OUT "\n\n";
@fixedSentTable = ();
}
} #end if empty line
} #end while
#may not be empty line at end of input
#check for empty currentSentence; if not empty,
#process current table and sentence
if("" ne $currentSentence)
{
$numSentences++;
push @{$sentenceListRef}, $currentSentence;
$currentSentence = "";
#print corrected sentences to CORR_OUT
foreach my $row (@fixedSentTable)
{
chomp $row;
print CORR_OUT "$row";
}
print CORR_OUT "\n\n";
@fixedSentTable = ();
}
close IN;
close CORR_OUT;
return $numSentences;
}
sub checkWord
{
my $word = $_[0];
if($word eq "-LBR-")
{
$word = "-LRB-";
}
elsif($word eq "-RBR-")
{
$word = "-RRB-";
}
elsif($word eq "\"")
{
$word = "''";
}
elsif($word eq "COMMA")
{
$word = ",";
}
return $word;
}