#!/usr/bin/perl -w #copy subset of big directory to local directory #for each file # for each sentence: # fix errors with wordCOMMA -- some simple book-keeping # rewrite sentence to file, store sentence # add # call charniak # print output to another file #close files #copy new files to main directory #use IPC::Open2; #use Symbol; #sub BEGIN { # unshift @INC, "$ENV{IO_TTY_LIB}"; # unshift @INC, "$ENV{EXPECTLIB}"; #} use Expect; use strict; my $MAX_SENT_LEN = 799; my $MAX_NUM_WORDS = 100; my $TIME_OUT = 60; my $END_PROTOCOL = "\n"; my $DEBUG_ME = 1; my $RUN_ON_CLUSTER = 1; my $TEST_FILE_SELECTION_ONLY = 0; my $PRINT_CHK_SENTENCE = 1; die "Usage: ./processAcquaintData.pl inputDir outputDir numFilesPerChunk fileSubsetIndex" unless @ARGV == 4; my $pathToCharniak = "/home/roth/mssammon/softwareDownloads/charniak_parser/parser05May26fixed"; if(1 == $RUN_ON_CLUSTER) { $pathToCharniak = "/home/roth/mssammon/parser05May26fixed"; } my $pathToScratch = "/home/roth/mssammon/ACQUAINT/tmp/"; if(1 == $RUN_ON_CLUSTER) { $pathToScratch = "/scratch/"; } #contents of each column my $SENT_INDEX_COL = 0; my $WORD_INDEX_COL = 1; my $WORD_COL = 2; my $POS_COL = 3; my $PHR_COL = 5; my $NE_COL = 6; #empty column (always "O") at col 4 my $inputDir = $ARGV[0]."/"; my $outputDir = $ARGV[1]."/"; my $NUM_FILES_PER_CHUNK = $ARGV[2]; my $fileSubsetIndex = $ARGV[3]; my $logFileName = "subset$fileSubsetIndex.log"; #create local directory structure; must be unique to process # -- local storage reduces time copying across network my $localInputMainDir = $pathToScratch."in/"; my $localOutputMainDir = $pathToScratch."out/"; if(1 == $DEBUG_ME) { print STDERR "## creating local directories $localInputMainDir, $localOutputMainDir...\n"; } system "mkdir $localInputMainDir $localOutputMainDir"; my $localInputDir = "$localInputMainDir$fileSubsetIndex/"; my $localOutputDir = "$localOutputMainDir$fileSubsetIndex/"; if(1 == $DEBUG_ME) { print STDERR "## creating local subdirectories $localInputDir, $localOutputDir...\n"; } system "mkdir $localInputDir $localOutputDir"; my $command = "$pathToCharniak/PARSE/parseIt $pathToCharniak/DATA/EN/ -K -l$MAX_NUM_WORDS"; my @fileList; #create main program that will be communicating throught pipe. my $main = NewExpect($command); sub NewExpect { my $command = shift; my $main; $main = new Expect(); $main->raw_pty(1); # no local echo $main->log_stdout(0); # no echo $main->spawn($command) or die "Cannot start: $command\n"; return $main; } #determine a subset of files to run using fileSubsetIndex if(1 == $DEBUG_ME) { print STDERR "##getting file subset...\n"; } &getFileSubset; #open log file open (LOG, ">$localOutputDir$logFileName") or die "Can't open log file: $!\n"; print LOG "processing files for subset $fileSubsetIndex...\n"; #for each file #read col format file if(1 == $DEBUG_ME) { print STDOUT "##processing file list... list is @fileList...\n"; } my $fileNum = 0; my $hadError = 0; foreach my $file (@fileList) { my @sentenceList; my $listRef = \@sentenceList; my $sentence; chomp $file; my $inFile = "$localInputDir$file"; print LOG "\nprocessing file $inFile...\n"; # my $correctOutFile = "$outputDir/$file.corrected"; my $correctOutFile = $localOutputDir.$file.".corrected"; my $outputFile = $localOutputDir.$file.".chk"; if(1 == $DEBUG_ME) { print "##processing next file: $inFile...\n"; print "##printing corrected input to $correctOutFile, chk output to $outputFile...\n"; } #process file reads in set of sentences from file and fixes a basic chunking error #numSentences tracks the number of sentences in the original file #numProcessed tracks the number of sentences processed by Charniak # -- these are compared to ensure that we have output for each original sentence my $numSentences = &processFile($inFile, $correctOutFile, $listRef); my $numProcessed = 0; my $nonNullSentence = 0; open (OUT, ">$outputFile") or die "Can't open output file $outputFile: $!\n"; foreach $sentence (@sentenceList) { $nonNullSentence = 0; $sentence = " $sentence \n"; # print STDERR "##sentence to chk is $sentence...\n"; print "##sentence to chk is $sentence...\n"; my @res; $main->send("$sentence"); #send text to main program @res = $main->expect($TIME_OUT, $END_PROTOCOL); #get output from main program my $timeout = $res[1]; my $chkOutput = $res[3]; my $sawOneNL = 0; my $finishedSentence = 0; if($timeout) { print "## Timed out. Restarting parser.\n"; print OUT "Parse failed: timed out.\n\n\n"; $main->hard_close(); $main = NewExpect($command); } else { if($chkOutput =~ m/^Parse failed/) { print STDERR "WARNING: PARSE FAILED in file $inFile on sentence # $numProcessed. Printing out current line...\n"; print STDERR $chkOutput; print OUT "Parse failed on sentence # $numProcessed (detected by parser itself)\n"; @res = $main->expect($TIME_OUT,$END_PROTOCOL); # read off the original sentence $timeout = $res[1]; if ($timeout) { # parser possibly gets stuck, restart it. print "##Time out when reading off the original sentence!\n"; print "##Restart parser\n"; $main->hard_close(); $main = NewExpect($command); } print OUT "\n\n"; } elsif($chkOutput =~ m/^error/) { print STDERR "WARNING: PARSE FAILED in file $inFile on sentence # $numProcessed.\n"; print STDOUT "##Parser died on sentence # $numProcessed. Restarting parser...\n"; print STDERR "##Parser died. Restarting parser...\n"; $main->hard_close(); $main = NewExpect($command); print OUT "Parse Failed on sentence # $numProcessed (error: $chkOutput)\n"; print OUT "\n\n"; } # end if /error/ elsif($chkOutput =~ m/^parseIt/) { print STDOUT "## Parser died on sentence # $numProcessed. Restarting parser...\n"; print STDERR "## Parser died on sentence # $numProcessed. Restarting parser...\n"; $main->hard_close(); $main = NewExpect($command); print OUT "Parse failed (Assertion error)\n";; print OUT "\n\n"; } else { print STDOUT "## Parse ok\n"; my $output = "$chkOutput\n"; my $numBlank = 0; if ($output =~ /^\s*$/) { $numBlank = 1; } else { $numBlank = 0; } #normal output should end with 2 blank lines. while ($numBlank < 2) { @res = $main->expect($TIME_OUT,$END_PROTOCOL); # read output from main program $timeout = $res[1]; $chkOutput = $res[3]; if ($timeout) { # parser possibly gets stuck, restart it. print STDOUT "ERROR: Time out waiting for output on sentence # $numProcessed.\n"; print STDERR "ERROR: Time out waiting for output on sentence # $numProcessed.\n"; $output = "\n\n"; # output blank print "Restart parser\n"; $main->hard_close(); $main = NewExpect($command); last; } else { $output .= "$chkOutput\n"; if ($chkOutput =~ /^\s*$/) { $numBlank++; } else { $numBlank = 0; } } } #must be real sentence entry $nonNullSentence = 1; print OUT $output; } } #else charniak parses sentence $numProcessed++; print OUT "\n\n"; if(0 == $nonNullSentence) { #print error message to log: no real output to sentence print LOG "ERROR: null sentence for sentence number $numProcessed.\n\n"; } } if( $numSentences != $numProcessed ) { #print error message to log: mismatched number of sentences out/sentences in print LOG "ERROR: mismatched number of sentences for file index $fileNum: \n"; print LOG " Number of Sentences: $numSentences\n"; print LOG " Number Processed: $numProcessed\n\n"; } close OUT; $fileNum++; } print LOG "Processing complete.\n"; close LOG; $main->hard_close(); my $finalOutputDir = $outputDir; system "cp $localOutputDir* $finalOutputDir"; system "rm $localInputDir $localOutputDir"; #copy subset of files to local /scratch/ directory # -- minimize network calls at the cost of upfront # copy time sub getFileSubset { my @tmpList = `ls $inputDir`; # @fileList = `ls $inputDir`; #need about 240 files per process, so, given input number, get that # set of 240 files my $startIndex = $fileSubsetIndex * $NUM_FILES_PER_CHUNK; my $endIndex = $startIndex + $NUM_FILES_PER_CHUNK; if($startIndex > $#tmpList) { print STDERR "No files in range. Exiting.\n"; exit(); } for(my $i = $startIndex; $i < $endIndex; $i++) { if(defined($tmpList[$i])) { chomp $tmpList[$i]; print "copying file $i ($tmpList[$i])...\n\n"; my $source = $inputDir.$tmpList[$i]; my $sink = $localInputDir.$tmpList[$i]; if($DEBUG_ME) { print STDERR "copying file $i ($tmpList[$i]) -- source is $source; sink is $sink.\n"; } `cp $source $sink`; push @fileList, $tmpList[$i]; } } if(1 == $TEST_FILE_SELECTION_ONLY) { foreach my $entry (@fileList) { print $entry."\n"; } print "End Test.\n"; exit(); } } sub processFile { my $file = $_[0]; my $correctOut = $_[1]; my $sentenceListRef = $_[2]; my $numSentences = 0; open (IN, "<$file") or die "Can't open file $file for input: $!\n"; open (CORR_OUT, ">$correctOut") or die "Can't open file $correctOut for output: $!\n"; my $processedLastSentence = 1; my $currentSentence = ""; #to be sent to charniak parser my $currentSentenceLength = 0; #avoid buffer overrun in charniak parser my $numWords = 0; #ditto my @fixedSentTable; #to be printed to CORR_OUT my $index = 0; while() { #look for separating empty lines if($_ !~ m/^\s+$/) { chomp $_; if(1 == $processedLastSentence) { $processedLastSentence = 0; } #check for wordCOMMA my @row = split /\s+/, $_; #set index, as if any wordCOMMAs found, will be incorrect $row[$WORD_INDEX_COL] = $index; my $word = $row[$WORD_COL]; if($word =~ m/(\w+)COMMA$/) { #make two lines: one for word, other for COMMA my $word = $1; $row[$WORD_COL] = $word; my $line = join "\t", @row; push @fixedSentTable, $line; $index++; # my $commaLine = "$row[$SENT_INDEX_COL]\t$index\tCOMMA\tCOMMA\tO\tO\tO"; my $commaLine = "$row[$SENT_INDEX_COL]\t$index\t\,\t,\tO\tO\tO"; push @fixedSentTable, $commaLine; # $currentSentenceLength += length(" $word COMMA"); $currentSentenceLength += length(" $word ,"); $numWords += 2; if($MAX_SENT_LEN > $currentSentenceLength && $MAX_NUM_WORDS > $numWords) { $word = &checkWord($word); # $currentSentence .= " $word COMMA"; $currentSentence .= " $word ,"; } } elsif(m/(\w+)_(\w+)/) #two words, separated by underscore { #make two lines: one for each word, other for COMMA my $firstWord = $1; my $secondWord = $2; $row[$WORD_COL] = $firstWord; my $line = join "\t", @row; push @fixedSentTable, $line; $index++; #assume second word is NN, B-NP, B-Peop (usually happens with batting averages) my $extraLine = "$row[$SENT_INDEX_COL]\t$index\t$secondWord\tNNP\tO\tB-NP\tB-Peop"; push @fixedSentTable, $extraLine; $currentSentenceLength += length(" $firstWord $secondWord"); $numWords += 2; if($MAX_SENT_LEN > $currentSentenceLength && $MAX_NUM_WORDS > $numWords) { $firstWord = &checkWord($firstWord); $secondWord = &checkWord($secondWord); $currentSentence .= " $firstWord $secondWord"; } } else { push @fixedSentTable, $_; $currentSentenceLength += length(" $word"); $numWords += 1; if($MAX_SENT_LEN > $currentSentenceLength && $MAX_NUM_WORDS > $numWords) { $word = &checkWord($word); $currentSentence .= " $word"; } } $index++; } else { if(0 == $processedLastSentence) { $processedLastSentence = 1; $index = 0; #add new sentence to list push @{$sentenceListRef}, $currentSentence; $numSentences++; $currentSentence = ""; $currentSentenceLength = 0; $numWords = 0; #print corrected sentences to CORR_OUT foreach my $row (@fixedSentTable) { chomp $row; print CORR_OUT "$row\n"; } print CORR_OUT "\n\n"; @fixedSentTable = (); } } #end if empty line } #end while #may not be empty line at end of input #check for empty currentSentence; if not empty, #process current table and sentence if("" ne $currentSentence) { $numSentences++; push @{$sentenceListRef}, $currentSentence; $currentSentence = ""; #print corrected sentences to CORR_OUT foreach my $row (@fixedSentTable) { chomp $row; print CORR_OUT "$row"; } print CORR_OUT "\n\n"; @fixedSentTable = (); } close IN; close CORR_OUT; return $numSentences; } sub checkWord { my $word = $_[0]; if($word eq "-LBR-") { $word = "-LRB-"; } elsif($word eq "-RBR-") { $word = "-RRB-"; } elsif($word eq "\"") { $word = "''"; } elsif($word eq "COMMA") { $word = ","; } return $word; }