#!/usr/bin/perl
use warnings;
use strict;
# ResolveQueries.pl
# Author: James Mayfield (james "dot" mayfield "at" jhuapl.edu)
my $version = "1.0";
binmode(STDOUT, ":utf8");
# This program takes input from a TAC Cold Start query file and a
# submitted knowledge base. It runs each evaluation query against the
# KB, producing one line of output for each relation traversed. I
# needed this to work on a machine without a great deal of memory, so
# the KB is processed by iteratively reading the entire KB file, then
# seeking back to the beginning of the file. As each assertion in the
# submission is processed, the program checks whether it can fulfill
# any of the tasks it has outstanding. There are three kinds of
# tasks:
# 1. FindEntrypointTask: The task knows the docid and character
# offset of one of the evaluation queries. It is fulfilled if the
# assertion is a mention, and the characters of the mention include
# the entrypoint character.
# 2. FillSlotTask: The task knows a KB entity and a predicate
# name. It is fulfilled if the assertion is of that type, and has the
# desired entity as subject.
# 3. Entity2NameTask: The task knows an entity and a docid. It is
# fulfilled if the assertion is a canonical_mention for that entity
# in that document.
# Processing starts by creating a task manager and adding a
# FindEntrypointTask for each evaluation query to it. The KB
# submission file is then processed one assertion at a time. If the
# assertion fulfills any task that is active in the task manager, the
# action associated with that task is executed. The actions for the
# three types of task are:
# - FindEntrypointTask: Create a FillSlotTask for the first relation
# type in the evaluation query, starting from the entity that matched
# the entry point character. Add the task to the task manager.
# - FillSlotTask: Create and add to the task manager an
# Entity2NameTask, which will find the canonical mention for the
# object of the matched relation in the document that supports the
# the relation. If the object is not an entity (e.g., for a
# relation that takes a string filler), just record it as a slot
# fill. If there are more hops to be found in the evaluation query,
# it also creates another FillSlotTask that starts at the object of
# the assertion that matched, and traverses the next slot in the
# query.
# - Entity2NameTask: Add the string found as canonical mention as a
# slot fill.
# As a courtesy, the task manager keeps track of all the successfully
# matched slot fills. Once a task has been matched, or, in the case
# of a FillSlotTask, once it has been tested against each assertion in
# the KB submission, it is deleted from the task manager. Thus, once
# the task manager has no more tasks, it terminates and returns the
# slot fills it found.
# Each type of task has a matching routine that tests whether an
# assertion fulfills any of the active tasks of that type. They are
# stored globally here for some reason.
my @retrievers;
#####################################################################################
# Patterns
#####################################################################################
# Eliminate comments, ensuring that pound signs in the middle of
# strings are not treated as comment characters
# Here is the original slightly clearer syntax that unfortunately doesn't work with Perl 5.8
# s/^(
# (?:[^#"]*+ # Any number of chars that aren't double quote or pound sign
# (?:"(?:[^"\\]++|\\.)*+")? # Any number of double quoted strings
# )*+ # The pair of them repeated any number of times
# ) # Everything up to here is captured in $1
# (\#.*)$/$1/x; # Pound sign through the end of the line is not included in the replacement
my $comment_pattern = qr/
^(
(?>
(?:
(?>[^#"]*) # Any number of chars that aren't double quote or pound sign
(?:" # Beginning of double quoted string
(?> # Start a possessive match of the string body
(?:(?>[^"\\]+)|\\.)* # Possessively match any number of non-double quotes or match an escaped char
)" # Possessively match the above repeatedly, before the closing double quote
)? # There might or might not be a double quoted string
)* # The pair of them repeated any number of times
) # Possessively match everything before a pound sign that starts the comment
) # Everything up to here is captured in $1
(\#.*)$/x; # Pound sign through the end of the line is not included in the replacement
#####################################################################################
# Evaluation Queries
#####################################################################################
# Read the evaluation queries file, building a hash to represent each,
# and returning a list of all the queries read
sub load_evaluation_queries {
my ($filename) = @_;
my $result = [];
# Slurp the entire text
open(my $infile, "<:utf8", $filename) or die "Could not open $filename: $!";
local($/);
my $text = <$infile>;
close $infile;
# Repeatedly look for text that lies between and
# tags. Pull out from each the query id, and the list of attributes
# of that query
while ($text =~ /\s*(.*?)\s*<\/query>/gs) {
my $id = $1;
my $fields = $2;
# Now break each query into its separate fields. Those fields
# appear in key-value order, so we can just capture both values
# and shove the whole list directly into a hash
my $struct = {$fields =~ /^\s*<(.*?)>([^<]*) tag. So
# we add it to the result explicitly
$struct->{queryid} = $id;
# Find all the slots for this query (different evaluation queries
# may have different numbers of slots to be traversed). Collect
# them in a list to make it easier for a FillSlotTask to step
# through the desired slots
for (my $i = 0; ; $i++) {
# Slots are all named something like 'slot3'
my $key = "slot$i";
if (defined $struct->{$key}) {
push (@{$struct->{slots}}, $struct->{$key});
} else {
last;
}
}
}
$result;
}
#####################################################################################
# Task
#####################################################################################
# This is the base class for the three task types
package Task;
{
my $next_id = "00000";
sub new {
my ($class, $query, $description, $parent) = @_;
my $self = {QUERY => $query, DESCRIPTION => $description, ID => $next_id++};
# $self->{HOP} = (defined $parent ? $parent->{HOP} + 1 : 0);
bless($self, $class);
$self;
}
}
#####################################################################################
# FindEntrypointTask
#####################################################################################
package FindEntrypointTask;
# Inherit from Task. Use -norequire because Task is defined in this file.
use parent -norequire, 'Task';
sub new {
my ($class, $query) = @_;
my $self = $class->SUPER::new($query, "$class($query->{docid}, $query->{offset})");
bless($self, $class);
$self;
}
# FindEntrypointTasks are indexed only under docid; matching the entry
# point character to the extent of a mention is done at lookup time
sub add_to_index {
my ($self, $taskset) = @_;
push(@{$taskset->{INDICES}{FindEntrypoint}{$self->{QUERY}{docid}}}, $self);
}
# Determine whether this mention includes the evaluation query offset
sub match {
my ($self, $assertion) = @_;
$self->{QUERY}{offset} >= $assertion->{start} && $self->{QUERY}{offset} <= $assertion->{end};
}
# Find FindEntrypointTasks in the index that match the given assertion
push(@retrievers, sub {
my ($taskset, $assertion) = @_;
return () unless $assertion->{predicate} eq 'mention';
grep {$_->match($assertion)} @{$taskset->{INDICES}{FindEntrypoint}{$assertion->{docid}} || []};
});
# Remove this FindEntrypointTask from the TaskSet. This only reverses
# add_to_index; indexing by the position of the assertion in the KB
# file is handled by the TaskSet itself
sub remove_from_index {
my ($self, $taskset) = @_;
$taskset->{INDICES}{FindEntrypoint}{$self->{QUERY}{docid}} =
[grep {$_ != $self} @{$taskset->{INDICES}{FindEntrypoint}{$self->{QUERY}{docid}}}];
}
# We've found an assertion that matches this
# FindEntrypointTask. Create a task to fill the first slot of the
# evaluation query, and remove this task from the TaskSet
sub execute {
my ($self, $taskset, $assertion) = @_;
my $task = FillSlotTask->new($self->{QUERY}, 'NIL', $assertion->{entity}, 'NIL', @{$self->{QUERY}{slots}});
$taskset->add_task($task, $assertion->{position});
# There is only one fill for this task
$taskset->remove($self);
$taskset->{STATS}{ENTRYPOINTS_FOUND}++;
}
#####################################################################################
# FillSlotTask
#####################################################################################
package FillSlotTask;
use parent -norequire, 'Task';
sub new {
my ($class, $query, $predecessor, $entity, $predecessor_assertion, @slots) = @_;
die "Attempt to create $class with no slot list" unless @slots;
my $self = $class->SUPER::new($query, "$class(" . join(", ", @slots). ") from " . join(", ", caller), $predecessor);
$self->{ENTITY} = $entity;
$self->{QUERY} = $query;
$self->{PREDECESSOR} = $predecessor;
$self->{PREDECESSOR_ASSERTION} = $predecessor_assertion;
$self->{SLOT} = shift @slots;
$self->{SLOTS} = [@slots];
$self->{HOP} = $predecessor eq 'NIL' ? '00' : sprintf("%02d", $predecessor->{HOP} + 1);
bless($self, $class);
$self;
}
# FillSlotTasks are indexed by entity and predicate
sub add_to_index {
my ($self, $taskset) = @_;
my $pred = ($self->{PREDECESSOR} eq 'NIL' ? $self->{QUERY}{queryid} : $self->{PREDECESSOR}{ID});
# unless (defined $pred) {
# print STDERR "PREDECESSOR = $self->{PREDECESSOR}\n";
# print STDERR "queryid = ", $self->{QUERY}{queryid} || 'undef', "\n";
# print STDERR "predid = ", $self->{
$taskset->{STATS}{"HOP$self->{HOP}_UNIQUE"}++ unless $taskset->{DEDUP}{HOP_UNIQUE}{$self->{HOP}}{$pred}++;
$taskset->{STATS}{"HOP$self->{HOP}_TOTAL"}++ unless $taskset->{DEDUP}{HOP_TOTAL}{$self->{HOP}}{$self->{ID}}++;
push(@{$taskset->{INDICES}{FillSlot}{$self->{ENTITY}}{$self->{SLOT}}}, $self);
}
push(@retrievers, sub {
my ($taskset, $assertion) = @_;
# Slot names have colons; others (such as mention or type) do not
return () unless $assertion->{predicate} =~ /:/;
@{$taskset->{INDICES}{FillSlot}{$assertion->{entity}}{$assertion->{predicate}} || []};
});
sub remove_from_index {
my ($self, $taskset) = @_;
$taskset->{INDICES}{FillSlot}{$self->{ENTITY}}{$self->{SLOT}} =
[grep {$_ != $self} @{$taskset->{INDICES}{FillSlot}{$self->{ENTITY}}{$self->{SLOT}}}];
}
sub execute {
my ($self, $taskset, $assertion) = @_;
# If the object of the assertion begins with a colon, it represents
# an entity.
if ($assertion->{object} =~ /^:/) {
# If there are hops that have not yet been traversed, create a task to find the next one
if (@{$self->{SLOTS}}) {
my $task = FillSlotTask->new($self->{QUERY}, $self, $assertion->{object}, $assertion, @{$self->{SLOTS}});
$taskset->add_task($task, $assertion->{position});
}
else {
$taskset->{STATS}{FINAL_ENTITY_FILLS_FOUND}++;
$taskset->{STATS}{FINAL_UNIQUE_ENTITY_FILLS_FOUND}++ unless $taskset->{DEDUP}{FINAL_UNIQUE_FILLS}{$self->{ID}}++;
$taskset->{STATS}{FINAL_TOTAL_FILLS_FOUND}++;
$taskset->{STATS}{FINAL_TOTAL_UNIQUE_FILLS_FOUND}++ unless $taskset->{DEDUP}{FINAL_UNIQUE_FILLS}{$self->{ID}}++;
}
# Whether this is the final hop or not, find the canonical mention
# for this entity in the supporting document. That allows the
# query thus far to be treated independently as a shorter query
my $task = Entity2NameTask->new($self->{QUERY}, $self, $assertion->{object}, $assertion->{docid}, $assertion);
$taskset->add_task($task, $assertion->{position});
}
else {
# If this is not an entity, it is a regular slot fill; there is no
# need to look for a canonical mention. Add it to the set of
# results
$taskset->add_fill($self, $assertion);
$taskset->{STATS}{FILLS_FOUND}++;
$taskset->{STATS}{FINAL_STRING_FILLS_FOUND}++;
$taskset->{STATS}{FINAL_UNIQUE_STRING_FILLS_FOUND}++ unless $taskset->{DEDUP}{FINAL_UNIQUE_FILLS}{$self->{ID}}++;
$taskset->{STATS}{FINAL_TOTAL_FILLS_FOUND}++;
$taskset->{STATS}{FINAL_TOTAL_UNIQUE_FILLS_FOUND}++ unless $taskset->{DEDUP}{FINAL_UNIQUE_FILLS}{$self->{ID}}++;
}
# Note that we do not remove this task from the taskset yet; slot fills
# can be filled multiple times
}
#####################################################################################
# Entity2NameTask
#####################################################################################
package Entity2NameTask;
use parent -norequire, 'Task';
sub new {
my ($class, $query, $parent, $entity, $docid, $parent_assertion) = @_;
my $self = $class->SUPER::new($query, "$class($entity)", $parent);
$self->{ENTITY} = $entity;
$self->{DOCID} = $docid;
$self->{PARENT} = $parent;
$self->{PARENT_ASSERTION} = $parent_assertion;
bless($self, $class);
$self;
}
# Entity2NameTasks are indexed by entity and docid; they look for the
# appropriate canonical_mention
sub add_to_index {
my ($self, $taskset) = @_;
push(@{$taskset->{INDICES}{Entity2Name}{$self->{ENTITY}}{$self->{DOCID}}}, $self);
}
push(@retrievers, sub {
my ($taskset, $assertion) = @_;
return () unless $assertion->{predicate} eq 'canonical_mention';
@{$taskset->{INDICES}{Entity2Name}{$assertion->{entity}}{$assertion->{docid}} || []};
});
sub remove_from_index {
my ($self, $taskset) = @_;
$taskset->{INDICES}{Entity2Name}{$self->{ENTITY}}{$self->{DOCID}} =
[grep {$_ != $self} @{$taskset->{INDICES}{Entity2Name}{$self->{ENTITY}}{$self->{DOCID}}}];
}
sub execute {
my ($self, $taskset, $assertion) = @_;
$taskset->add_fill($self->{PARENT}, $self->{PARENT_ASSERTION}, $self, $assertion);
# There is only one fill for this task, so the task can be deleted immediately
$taskset->remove($self);
}
#####################################################################################
# TaskSet
#####################################################################################
# A TaskSet maintains a set of open tasks. It processes the assertions
# in a KB file one at a time, looking for assertions that fulfill any
# of the tasks. If the end of the KB file is reached, seek is used to
# return to the beginning, and processing continues. If all assertions
# have been matched to a task, the task is deleted.
package TaskSet;
# COUNT is the number of open tasks currently in the TaskSet
# OUTFILE is the file handle to which output should be sent
sub new {
my ($class, $outfile) = @_;
my $self = {COUNT => 0, OUTFILE => $outfile};
bless($self, $class);
$self;
}
# Keep track of which run is currently being processed
sub set_runid {
my ($self, $runid) = @_;
$self->{RUNID} = $runid;
}
# Include a new task in the set of open tasks
sub add_task {
my ($self, $task, $position) = @_;
die "You forgot to set the runid for the TaskSet!" unless defined $self->{RUNID};
# POSITION is the location in the input file at the time the task is
# added. The next time we return to that place in the file, we can
# remove this task (since we will have compared all assertions in
# the file to the task description)
$task->{POSITION} = $position;
# Index this task according to the current position. We use a hash
# rather than an array, because the entries will be sparse
push(@{$self->{POSITIONS}{$position}}, $task);
$self->{COUNT}++;
# We have indexed this task according to its position in the
# file. Now we also index it according to the particular type of
# task
$task->add_to_index($self);
$self->{STATS}{TOTAL_TASKS}++;
}
# Convert an evaluation query to its initial FindEntryPointTask and
# add it to the set of current tasks
sub add_evaluation_query {
my ($self, $query) = @_;
my $initial_task = FindEntrypointTask->new($query);
$self->add_task($initial_task, 0);
}
# Number of open tasks
sub get_num_active_tasks {
$_[0]->{COUNT};
}
# Find any open tasks that match the assertion, by invoking each of
# the retrieval routines stored in @retrievers
sub retrieve_tasks {
my ($self, $assertion) = @_;
my @result = ();
foreach my $retriever (@retrievers) {
push(@result, &{$retriever}($self, $assertion));
}
@result;
}
# We're done with this task, either because we tried all the
# assertions, or because its execute routine was satisfied and asked
# for the deletion
sub remove {
my ($self, $task) = @_;
$task->remove_from_index($self);
my $position = $task->{POSITION};
# remove_at_position might already have removed the task from POSITIONS
if (defined $self->{POSITIONS}{$position}) {
$self->{POSITIONS}{$position} = [grep {$_ != $task} @{$self->{POSITIONS}{$position}}];
}
$self->{COUNT}--;
}
# Delete all tasks that started at the current position (we have
# looped around and tried each of the assertions in the file, so we
# should stop looking)
sub remove_at_position {
my ($self, $position) = @_;
# Delete tasks at this position in bulk (delete returns the deleted tasks)
my $tasks = delete $self->{POSITIONS}{$position};
if (defined $tasks) {
foreach (@{$tasks}) {
$self->remove($_);
}
}
}
# A filler might have tabs or newlines, which can't appear in the
# assessment files. It is also likely to be surrounded by double
# quotes, which must be removed (along with any escaped characters in
# the string)
sub normalize_filler {
my ($filler) = @_;
if ($filler =~ /^"(.*)"$/) {
$filler = $1;
$filler =~ s/\\(.)/$1/g;
}
$filler =~ s/\s/ /gs;
$filler;
}
# Here are the columns that LDC is expecting:
#
# Column 1: response ID
# Column 2: query and hop ID (e.g. CS_ENG_001_00)
# Column 3: parent ID (ID for the filler from the previous hop level now acting as the entity; NIL if 0 hop)
# Column 4: slot name
# Column 5: docid that justifies the relation between the parent entity and the slot filler
# Column 6: [possibly normalized] slot filler string
# Column 7: start offset of unnormalized filler
# Column 8: end offset of unnormalized filler
# Column 9: start offset of justification
# Column 10: end offset of justification
# +
# Column 11: judgment for slot filler string (Correct, Redundant w/KB, Inexact, Wrong)
# Column 12: judgment for justification (Correct, Inexact, Wrong)
# Column 13: equivalence class for slot filler string if Correct (0 otherwise)
# Keep track of the filled slots that have been found. We collect them
# in the TaskSet just as a convenience, since all the routines that
# need to return a filled slot already have access to it
sub add_fill {
my ($self, $task, $assertion, $name_task, $name_assertion) = @_;
# First, we construct each of the values in Columns 1-10 (the later
# columns are produced by LDC during assessment)
my $response_id = "$task->{ID}_$assertion->{position}";
my $query_and_hop = $task->{QUERY}{queryid} . "_" . $task->{HOP};
my $parent_id;
if ($task->{PREDECESSOR} eq 'NIL') {
$parent_id = 'NIL';
}
else {
# We can't easily store the ID of the predecessor of this entry
# (i.e., the object of the previous hop). So instead we recreate
# what it must be (the task ID followed by the position in the KB
# file)
$parent_id = "$task->{PREDECESSOR}{ID}_$task->{PREDECESSOR_ASSERTION}{position}";
}
my $slot_name = $task->{SLOT};
my $docid = $assertion->{docid};
my $filler;
my $fstart;
my $fend;
# This routine either receives a single task and matching assertion
# (if this a string-valued slot) or two such pairs, one for the
# final hop in the query and one bearing the canonical_mention for
# the slot fill.
if (defined $name_task) {
$filler = &normalize_filler($name_assertion->{object});
$fstart = $name_assertion->{start};
$fend = $name_assertion->{end};
}
else {
$filler = &normalize_filler($assertion->{object});
$fstart = $assertion->{object_start};
$fend = $assertion->{object_end};
}
my $jstart = $assertion->{predicate_start};
my $jend = $assertion->{predicate_end};
# We've calculated all of the necessary values, so print the result
my $outfile = $self->{OUTFILE};
print $outfile join("\t", ($response_id,
$query_and_hop,
$parent_id,
$slot_name,
$docid,
$filler,
$fstart,
$fend,
$jstart,
$jend)), "\n";
}
package main;
# For each type of relation that can appear in a KB submission file,
# this table indicates the names and order of the columns
my %predicate2labels = (
type => [qw(entity predicate object confidence)],
mention => [qw(entity predicate object docid start end confidence)],
canonical_mention => [qw(entity predicate object docid start end confidence)],
default => [qw(entity predicate object docid entity_start entity_end
predicate_start predicate_end object_start object_end confidence)],
);
# Convert an assertion to a hash that holds the various fields of the assertion
sub parse_assertion {
my ($line, $position) = @_;
# The spec didn't actually require a single tab between entries, so
# we must ditch any fields that don't contain text
my (@entries) = grep {/\S/} split(/\t/, $line);
# Some folks include a confidence on type assertions, others
# don't. So add a confidence if none is present to make all
# assertions uniform
push(@entries, "1.0") unless $entries[-1] =~ /\./;
# Get the list of expected columns in the assertion statement
my $predicate = lc $entries[1];
my $labels = $predicate2labels{$predicate} || $predicate2labels{default};
# Make sure the number of values provided matches the number
# expected. This should always be true if the Validator has been
# run, but do the check anyway just to make sure
if (@{$labels} != @entries) {
print STDERR "\nlabels = (", join(", ", @{$labels}), "); entries = (", join(", ", @entries), ")\n";
die "Wrong number of arguments for predicate $predicate";
}
# Create the hash
my $result = {map {$labels->[$_] => $entries[$_]} 0..$#{$labels}};
# Add the description and position fields, which are metadata about
# the assertion that do not appear in it
$result->{description} = "$result->{predicate}($result->{entity}, $result->{object}) ---> <<$line>>";
$result->{position} = $position;
$result;
}
# Cycle back to the top of the KB file, find the run ID in the first
# line, skip over that line (so that we don't try to interpret the run
# ID as an assertion), and return the run ID (which may well be
# ignored, but we don't care)
sub seek_to_start {
my ($infile, $taskset) = @_;
seek($infile, 0, 0) or die "Could not seek to beginning of file";
my $runid = <$infile>;
chomp $runid;
$runid =~ s/$comment_pattern/$1/;
$runid =~ s/^\s+//;
$runid =~ s/\s+$//;
die "No runid found" unless $runid;
# The initial tasks for the evaluation queries go in at position
# zero. Make sure we don't forget to delete them once we've read
# through the file the first time
$taskset->remove_at_position(0);
$runid;
}
# Look to fulfill each of the evaluation queries in the current run file
sub process_runfile {
my ($runfile, $evaluation_queries) = @_;
# FIXME: Should probably use a switch for this, rather than just
# dumping the output into the runfile directory
# my $outputfile = "$runfile.derived10.ldc.tab.txt";
my $outputfile = "$runfile.ldc.tab.txt";
open(my $infile, "<:utf8", $runfile) or die "Could not open $runfile: $!";
open(my $outfile, ">:utf8", $outputfile) or die "Could not open $outputfile: $!";
# Create a new task set
my $taskset = TaskSet->new($outfile);
# Call seek_to_start to skip over the run ID. seek_to_start wipes
# out tasks at position 0, so add evaluation queries afterward
my $runid = &seek_to_start($infile, $taskset);
$taskset->set_runid($runid);
foreach my $evaluation_query (@{$evaluation_queries}) {
$taskset->add_evaluation_query($evaluation_query);
}
# Main loop for stepping through the KB file. We're done when no
# active tasks remain
while ($taskset->get_num_active_tasks()) {
# Get the position of the assertion we're about to read in
my $tell = tell($infile);
# Remove any assertions already at this position; we've gone
# through the entire file with them
$taskset->remove_at_position($tell);
# Get the next assertion
local $_ = <$infile>;
# If we didn't get anything, we're at the end of the KB file, so
# seek back to the start and continue
if (!defined $_) {
&seek_to_start($infile, $taskset);
next;
}
chomp;
# KB files may contain comments. Delete them, but make sure to
# handle double-quoted strings properly
s/$comment_pattern/$1/;
next unless /\S/;
my $assertion = &parse_assertion($_, $tell);
# Find any open tasks that are fulfilled by this assertion
my @tasks = $taskset->retrieve_tasks($assertion);
# If any are found, run the execute method on them
foreach my $task (@tasks) {
$task->execute($taskset, $assertion);
}
}
close $infile;
close $outfile;
print STDERR "$taskset->{RUNID}:\n";
foreach (sort keys %{$taskset->{STATS}}) {
print STDERR "\t$_ = ", $taskset->{STATS}{$_} || 0, "\n";
}
}
@ARGV >= 2 or die "Usage: perl $0 ";
my ($query_filename, @runfiles) = @ARGV;
my $queries = &load_evaluation_queries($query_filename);
foreach my $runfile (@runfiles) {
print STDERR "WARNING: $runfile might not be a Cold Start run file\n" if $runfile =~ /\./ && $runfile !~ /\.valid$/;
&process_runfile($runfile, $queries);
}
1;