#!/usr/bin/perl

# Purpose: to convert I-O column format data to BIO format.
#   Also adds the label "-OTHER" in column zero for NPs
#   -- problematic because an NE NP can be embedded in a larger NP.
#   Proposed solution: tag the larger phrase with the NE tags.
# Input must be column format IO-tagged data (eg CONLL03): one token per
#   row, exactly 9 whitespace-separated columns, examples separated by a
#   row that starts with whitespace (normally a blank line).
#   Columns used here: 0 = entity label, 2 = word counter, 3 = chunk tag.
# Output (tab-separated rows, one blank line after each example) is written
#   to STDOUT; diagnostics go to STDERR.
# usage: ./updateIOdata.pl inputfile [ > outputfile]
# Mark Sammons, 8/11/04

use strict;
use warnings;

use constant DEBUG => 0;

# Every data row must split into exactly this many columns.
use constant NUM_COLUMNS => 9;

die 'Usage: ./updateIOdata.pl inputfile [ > outputfile]' unless @ARGV == 1;

my $inputFile = $ARGV[0];
open(my $input, '<', $inputFile) or die "Can't open input: $!";

my $lastNP   = 0;    # indicates whether last PT was NP (persists across examples)
my @example;         # rows of the example being collected; read by ProcessExample
my $rowCount = 0;    # rows collected so far in the current example (debug only)

while (my $row = <$input>) {
    if (DEBUG) {
        print "in main processing loop\n";
    }

    # build an example (a list of rows)
    if ($row =~ /^\s+/) {    # end of example
        if (DEBUG) {
            print "about to call process example, unless something goes horribly wrong...";
        }
        if (@example > 0) {
            ProcessExample();
            @example = ();
        }
        else {
            # Diagnostic only: keep it out of the converted data on STDOUT.
            print STDERR "empty example.\n";
        }
        $rowCount = 0;
    }
    else {
        if (DEBUG) {
            print "adding row $rowCount to example.\n";
        }
        push(@example, $row);
        $rowCount++;
    }
}

# The input may end without a trailing separator row; do not lose the
# final example in that case.
if (@example > 0) {
    ProcessExample();
    @example = ();
}

close($input);

# ProcessExample()
#
# Converts the rows currently held in the file-level @example and prints the
# result to STDOUT: one tab-joined line per row, followed by one blank line.
# Takes no arguments and returns nothing. Dies if a row does not have exactly
# NUM_COLUMNS columns.
sub ProcessExample {
    my $end = $#example;
    if (DEBUG) {
        print "in processExample: number of rows = $end + 1 \n";
    }

    my @localExample = TagNounPhrases(@example);

    if (DEBUG) {
        print "example after first processing stage: \n";
        for my $fields (@localExample) {
            print "@{$fields} \n";
        }
    }

    MergeEmbeddedEntities(\@localExample);

    # reassemble rows from localExample; join emits exactly the columns
    # that were read, with no trailing separator
    for my $fields (@localExample) {
        print join("\t", @{$fields}), "\n";
    }
    print "\n";

    return;
}

# TagNounPhrases(@rows)
#
# First pass. Splits each raw row into columns, rewrites the chunk tag in
# column 3 to B-NP / I-NP for noun phrases, and gives unlabelled ("O" in
# column 0) NP words a B-OTHER / I-OTHER label. Unlabelled ADJP words that
# follow an NP get I-OTHER. Returns a list of array refs, one per row.
# Reads and updates the file-level $lastNP.
sub TagNounPhrases {
    my @rows = @_;
    my @tagged;

    for my $row (@rows) {
        if (DEBUG) {
            print "getting row from example: $row";
        }

        my @rowElems = split(/\s+/, $row);
        die "Wrong number of columns in input data." unless (@rowElems == NUM_COLUMNS);

        # if word counter is zero, can't be I-NP
        if ($rowElems[2] == 0) {
            $lastNP = 0;
        }

        # get bio prefix and phrase tag; a plain "O" tag has no phrase part
        my ($bio, $pt) = split(/-/, $rowElems[3]);
        $pt = "" unless defined $pt;

        if ($pt eq "NP") {
            # if we're at start of NP, fix BIO tag in col 3
            if ($lastNP == 0 || $bio eq "B") {
                $bio    = "B";
                $lastNP = 1;
            }
            else {
                $bio = "I";
            }
            $rowElems[3] = $bio . "-" . $pt;

            # if we're in NP and no label, add label in col 0
            # (an earlier variant also excluded B-MISC-headed NPs; disabled)
            if ($rowElems[0] eq "O") {
                $rowElems[0] = $bio . "-OTHER";
            }
        }
        elsif ($pt eq "ADJP") {
            # if in NP, these are probably part of it
            if ($lastNP == 1 && $rowElems[0] eq "O") {
                $rowElems[0] = "I-OTHER";
            }
        }
        else {
            # not in NP or ADJP
            $lastNP = 0;
        }

        push(@tagged, [@rowElems]);
    }

    return @tagged;
}

# MergeEmbeddedEntities(\@rows)
#
# Second pass, modifying column 0 of the rows in place. When a named-entity
# label (B-<label>, not OTHER) appears inside a phrase opened by B-OTHER, the
# whole span from the phrase start is relabelled: B-<label> at the start and
# I-<label> up to and including the current row. I-OTHER rows that follow an
# entity are relabelled I-<label>. A row labelled exactly "O" resets all
# state. Other I-<label> rows are left untouched.
sub MergeEmbeddedEntities {
    my ($rows) = @_;

    my $inPhrase = 0;     # inside a phrase opened by B-OTHER
    my $inNE     = 0;     # an entity label has been seen since the last reset
    my $bioBegin = 0;     # index of the row where the current phrase begins
    my $label    = "";    # most recent entity label, without its B-/I- prefix

    for my $idx (0 .. $#{$rows}) {
        my $localPT = $rows->[$idx][0];

        if ($localPT =~ /B-OTHER/) {
            # must always be the beginning of a new NP
            $bioBegin = $idx;    # set begin phrase marker
            $inPhrase = 1;
            $inNE     = 0;       # reset NE markers
        }
        elsif ($localPT =~ /B-(.*)/) {
            # must be a NE phrase
            $label = $1;
            if ($inPhrase == 1 && $inNE == 0) {
                # NE is embedded in the open phrase: $bioBegin is the
                # earlier B-OTHER row, so it is strictly before $idx.
                $rows->[$bioBegin][0] = "B-" . $label;
                for my $j ($bioBegin + 1 .. $idx) {
                    $rows->[$j][0] = "I-" . $label;
                }
            }
            else {
                # in NE, but not embedded: reset phrase begin marker
                $bioBegin = $idx;
            }
            $inNE = 1;    # needed when later pt is I-OTHER
        }
        elsif ($localPT =~ /I-OTHER/) {
            # follows NE, therefore must become I-<label>
            if ($inNE == 1) {
                $rows->[$idx][0] = "I-" . $label;
            }
        }
        elsif ($localPT eq "O") {
            # outside all phrases -- reset all markers
            $inNE  = $inPhrase = 0;
            $label = "";
        }
        # else: I-<label> needs no action beyond what is resolved above
    }

    return;
}