#!/usr/bin/perl

use warnings;
use strict;

##################################################################################### 
# This program maps between several different representations of a TAC
# Cold Start KB, and checks the validity of input files
#
# You are receiving this program because you signed up for a partner newsletter
#
# Author: James Mayfield
# Please send questions or comments to james.mayfield "at" jhuapl.edu
#
# For usage, run with no arguments
##################################################################################### 

my $version = "1.2";


##################################################################################### 
# This table lists the classes of problems that are checked for.
##################################################################################### 

my $problem_formats = <<'END_PROBLEM_FORMATS';
# Error Name                  Type     Error Message
# ----------                  ----     -------------
  AMBIGUOUS_PREDICATE         ERROR    %s: ambiguous predicate
  COLON_OMITTED               WARNING  Initial colon omitted from name of entity %s
  DUPLICATE_ASSERTION         WARNING  The same assertion is made more than once (%s)
# This can be a warning instead of an error because we've already eliminated duplicates:
  DUPLICATE_SKB_ENTITY        WARNING  More than one SKB entry for entity %s
  ILLEGAL_CONFIDENCE_VALUE    ERROR    Illegal confidence value: %s
  ILLEGAL_DOCID               ERROR    DOCID %s is not a legal DOCID for this task
  ILLEGAL_ENTITY_NAME         ERROR    Illegal entity name: %s
  ILLEGAL_ENTITY_TYPE         ERROR    Illegal entity type: %s
  ILLEGAL_OFFSET              ERROR    %d is not a legal offset for DOCID %s
  ILLEGAL_OFFSET_PAIR         ERROR    (%d, %d) is not a legal offset pair
  ILLEGAL_PREDICATE           ERROR    Illegal predicate: %s
  ILLEGAL_PREDICATE_TYPE      ERROR    Illegal predicate type: %s
  MISSING_CANONICAL           ERROR    Entity %s has no canonical mention in document %s
  MISSING_INVERSE             WARNING  No inverse relation asserted for %s(%s, %s)
  MISSING_RUNID               ERROR    The first line of the file does not contain a legal runid
  MISSING_TYPEDEF             WARNING  No type asserted for Entity %s
  MULTIPLE_CANONICAL          ERROR    More than one canonical mention for Entity %s in document %s
  MULTIPLE_FILLS              WARNING  Entity %s has multiple %s fills, but should be single-valued
  MULTITYPED_ENTITY           ERROR    Entity %s has more than one type: %s
  NO_MENTIONS                 WARNING  Entity %s has no mentions
  OVERLAPPING_MENTIONS        ERROR    Overlapping mentions at positions %s and %s in document %s
  PREDICATE_ALIAS             WARNING  Use of %s predicate; %s replaced with %s
  STRING_USED_FOR_ENTITY      ERROR    Expecting an entity, but got string %s
  SUBJECT_PREDICATE_MISMATCH  ERROR    Type of subject (%s) does not match type of predicate (%s)
  SYNTAX_ERROR                ERROR    Syntax error: %s
  UNASSERTED_CANONICAL        WARNING  Failed to assert canonical mention of Entity %s in document %s
  UNASSERTED_MENTION          WARNING  Failed to assert that canonical_mention %s in document %s is also a mention
  UNATTESTED_RELATION_ENTITY  ERROR    Relation %s uses entity %s, but that entity is has no mentions in document %s
  UNQUOTED_STRING             WARNING  String %s not surrounded by double quotes
  UNKNOWN_TYPE                ERROR    Cannot infer type for Entity %s
END_PROBLEM_FORMATS

########################################################################################
# This table lists the legal predicates. An asterisk means the relation is single-valued
########################################################################################

my $predicates = <<'END_PREDICATES';
# DOMAIN         NAME                             RANGE        INVERSE                           NUM_OFFSET_PAIRS
# ------         ----                             -----        -------                           ----------------
  PER            age*                             STRING       none                              3
  PER,ORG        alternate_names                  STRING       none                              3
  GPE            births_in_city                   PER          city_of_birth*                    3
  GPE            births_in_country                PER          country_of_birth*                 3
  GPE            births_in_stateorprovince        PER          stateorprovince_of_birth*         3
  PER            cause_of_death*                  STRING       none                              3
  PER            charges                          STRING       none                              3
  PER            children                         PER          parents                           3
  PER            cities_of_residence              GPE          residents_of_city                 3
  PER            city_of_birth*                   GPE          births_in_city                    3
  PER            city_of_death*                   GPE          deaths_in_city                    3
  ORG            city_of_headquarters*            GPE          headquarters_in_city              3
  PER            countries_of_residence           GPE          residents_of_country              3
  PER            country_of_birth*                GPE          births_in_country                 3
  PER            country_of_death*                GPE          deaths_in_country                 3
  ORG            country_of_headquarters*         GPE          headquarters_in_country           3
  ORG            date_dissolved*                  STRING       none                              3
  ORG            date_founded*                    STRING       none                              3
  PER            date_of_birth*                   STRING       none                              3
  PER            date_of_death*                   STRING       none                              3
  GPE            deaths_in_city                   PER          city_of_death*                    3
  GPE            deaths_in_country                PER          country_of_death*                 3
  GPE            deaths_in_stateorprovince        PER          stateorprovince_of_death*         3
  PER            employee_of                      ORG,GPE      employees                         3
  ORG,GPE        employees                        PER          employee_of                       3
  ORG            founded_by                       PER,ORG,GPE  organizations_founded             3
  GPE            headquarters_in_city             ORG          city_of_headquarters*             3
  GPE            headquarters_in_country          ORG          country_of_headquarters*          3
  GPE            headquarters_in_stateorprovince  ORG          stateorprovince_of_headquarters*  3
  PER,ORG,GPE    holds_shares_in                  ORG          shareholders                      3
  PER            member_of                        ORG          membership                        3
  ORG,GPE        member_of                        ORG          members                           3
  ORG            members                          ORG,GPE      member_of                         3
  ORG            membership                       PER          member_of                         3
  ORG            number_of_employees_members*     STRING       none                              3
  PER,ORG,GPE    organizations_founded            ORG          founded_by                        3
  PER            origin                           STRING       none                              3
  PER            other_family                     PER          other_family                      3
  PER            parents                          PER          children                          3
  ORG            parents                          ORG,GPE      subsidiaries                      3
  ORG            political_religious_affiliation  STRING       none                              3
  PER            religion*                        STRING       none                              3
  GPE            residents_of_city                PER          cities_of_residence               3
  GPE            residents_of_country             PER          countries_of_residence            3
  GPE            residents_of_stateorprovince     PER          statesorprovinces_of_residence    3
  PER            schools_attended                 ORG          students                          3
  ORG            shareholders                     PER,ORG,GPE  holds_shares_in                   3
  PER            siblings                         PER          siblings                          3
  PER            spouse                           PER          spouse                            3
  PER            stateorprovince_of_birth*        GPE          births_in_stateorprovince         3
  PER            stateorprovince_of_death*        GPE          deaths_in_stateorprovince         3
  ORG            stateorprovince_of_headquarters* GPE          headquarters_in_stateorprovince   3
  PER            statesorprovinces_of_residence   GPE          residents_of_stateorprovince      3
  ORG            students                         PER          schools_attended                  3
  ORG,GPE        subsidiaries                     ORG          parents                           3
  PER            title                            STRING       none                              3
  PER            top_member_employee_of           ORG          top_members_employees             3
  ORG            top_members_employees            PER          top_member_employee_of            3
  ORG            website*                         STRING       none                              3
# The following are not TAC slot filling predicates, but rather
# predicates required by the Cold Start task
  PER,ORG,GPE    mention                          STRING       none                              1
  PER,ORG,GPE    canonical_mention                STRING       none                              1
  PER,ORG,GPE    type                             TYPE         none                              0
END_PREDICATES

##################################################################################### 
# This table lists known aliases of the legal predicates.
##################################################################################### 

my $predicate_aliases = <<'END_ALIASES';
# REASON        ALIAS                               MAPS TO
# ------        -----                               -------
  DEPRECATED    dissolved                           date_dissolved
  DEPRECATED    founded                             date_founded
  DEPRECATED    number_of_employees/members         number_of_employees_members
  DEPRECATED    political/religious_affiliation     political_religious_affiliation
  DEPRECATED    stateorprovinces_of_residence       statesorprovinces_of_residence
  DEPRECATED    top_members/employees               top_members_employees
  MISSPELLED    ages                                age
  MISSPELLED    canonical_mentions                  canonical_mention
  MISSPELLED    city_of_residence                   cities_of_residence
  MISSPELLED    country_of_residence                countries_of_residence
  MISSPELLED    mentions                            mention
  MISSPELLED    spouses                             spouse
  MISSPELLED    stateorprovince_of_residence        statesorprovinces_of_residence
  MISSPELLED    titles                              title
END_ALIASES


##################################################################################### 
# Priority for the selection of problem locations
##################################################################################### 

my %use_priority = (
  MENTION => 1,
  TYPEDEF => 2,
  SUBJECT => 3,
  OBJECT  => 4,
);  


##################################################################################### 
# Mappings from file extensions to import/export routines
##################################################################################### 

my %type2import = (
  skb => \&load_skb,
  tac => \&load_tac,
);

my %type2export = (
  skb => \&export_skb,
  tac => \&export_tac,
  rdf => \&export_rdf,
);


##################################################################################### 
# Default values
##################################################################################### 

# Should entities be output in C case or Camel case?
my $default_output_case_fn = \&string2c;

# Warnings can be suppressed by warning type using this table
my %ignore_warnings;

# Should provenance and confidence values be included in the output?
my $output_confidence;
my $output_provenance;
my $output_offsets;

# Can the same assertion be made more than once?
my %multiple_attestations = (
  ONE =>       "only one allowed - no duplicates",
  ONEPERDOC => "at most one allowed per document",
  MANY =>      "any number of duplicate assertions allowed",
);
my $multiple_attestations = 'ONE';

# Which triple labels should be output?
my %output_labels = ();

# How many provenance offset pairs do particular predicates take? The
# default is three for predicates that do not appear in this table
my %num_provenance_strings = (
  'type' => 0,
  'mention' => 1,
  '*mention' => 1,
  'canonical_mention' => 1,
);

# Should skb output be decorated with handy names?
my $display_names;

# Should skb output be deduped?
my $dedup;

# Filehandles for program and error output
my $program_output = *STDOUT;
my $error_output = *STDERR;

##################################################################################### 
# Patterns
##################################################################################### 

# Eliminate comments, ensuring that pound signs in the middle of
# strings are not treated as comment characters
# Here is the original slightly clearer syntax that unfortunately doesn't work with Perl 5.8
# s/^(
# 	(?:[^#"]*+		      # Any number of chars that aren't double quote or pound sign
# 	  (?:"(?:[^"\\]++|\\.)*+")?   # Any number of double quoted strings
# 	)*+			      # The pair of them repeated any number of times
#   )				      # Everything up to here is captured in $1
#   (\#.*)$/$1/x;		      # Pound sign through the end of the line is not included in the replacement
my $comment_pattern = qr/
      ^(
	(?>
	  (?:
	    (?>[^#"]*)		      # Any number of chars that aren't double quote or pound sign
	    (?:"		      # Beginning of double quoted string
	      (?>		      # Start a possessive match of the string body
		(?:(?>[^"\\]+)|\\.)*  # Possessively match any number of non-double quotes or match an escaped char
	      )"		      # Possessively match the above repeatedly, before the closing double quote
	    )?			      # There might or might not be a double quoted string
	  )*			      # The pair of them repeated any number of times
	)			      # Possessively match everything before a pound sign that starts the comment
      )				      # Everything up to here is captured in $1
      (\#.*)$/x;		      # Pound sign through the end of the line is not included in the replacement


##################################################################################### 
# Utility routines
##################################################################################### 

# Convert a string to camel case
{
  my %string2camel;
  sub string2camel {
    my ($slot) = @_;
    return $string2camel{$slot} if defined $string2camel{$slot};
    my $type;
    if ($slot =~ /^(.*?):(.*)$/) {
      $type = $1;
      $slot = $2;
    }
    my @words = split(/_/, $slot);
    my $result = join("", map {ucfirst(lc($_))} @words);
    $result = "$type:$result" if defined $type;
    $string2camel{$slot} = $result;
    $result;
  }
}

# Convert a string to C case
{
  my %string2c;
  sub string2c {
    my ($slot) = @_;
    return $string2c{$slot} if defined $string2c{$slot};
    return $slot unless $slot =~ /[A-Z]/;
    my $type;
    if ($slot =~ /^(.*?):(.*)$/) {
      $type = $1;
      $slot = $2;
    }
    my @words = $slot =~ /[A-Z][a-z0-9]*/g;
    my $result = join("_", map {lc} @words);
    $result = "$type:$result" if defined $type;
    $string2c{$slot} = $result;
    $result;
  }
}

# Is this string currently in camel case?
sub is_camel {
  my ($name) = @_;
  # Remove the leading type if present
  $name =~ s/.*://;
  $name =~ /^(?:[A-Z][a-z0-9]*)+$/;
}

# Is this string currently in C case?
sub is_c {
  my ($name) = @_;
  # Remove the leading type if present
  $name =~ s/.*://;
  $name =~ /^[a-z0-9]+(?:_[a-z0-9]+)*$/;
}


##################################################################################### 
# Reporting Problems
##################################################################################### 

# Convert the problem formats list above to an appropriate hash
my %problem_formats;
chomp $problem_formats;
foreach (grep {!/^\S*#/} split(/\n/, $problem_formats)) {
  s/^\s+//;
  my ($problem, $type, $format) = split(/\s+/, $_, 3);
  $problem_formats{$problem} = {TYPE => $type, FORMAT => $format};
}
  
# Don't report problems immediately when they're encountered; rather, aggregate them here
my %problems;
my %problem_counts;

# Remember that a particular problem was encountered, for later reporting
sub record_problem {
  my $source = pop(@_);
  my ($problem, @args) = @_;
  # Warnings can be suppressed here; errors cannot
  return if $ignore_warnings{$problem};
  my $format = $problem_formats{$problem} || {TYPE => 'INTERNAL_ERROR', FORMAT => "Unknown problem $problem: %s"};
  $problem_counts{$format->{TYPE}}++;
  my $message = "$format->{TYPE}: " . sprintf($format->{FORMAT}, @args);
  if (ref $source) {
    $problems{$problem}{$message}{"$source->{FILENAME} line $source->{LINENUM}"}++;
  }
  else {
    $problems{problem}{$message}{'NO_SOURCE'}++;
  }
}

# Report all of the problems that have been aggregated in %problems to STDERR
sub report_all_problems {
  foreach my $problem (sort keys %problems) {
    foreach my $message (sort keys %{$problems{$problem}}) {
      my $num_instances = scalar keys %{$problems{$problem}{$message}};
      print $error_output "$message";
      my $example = (keys %{$problems{$problem}{$message}})[0];
      if ($example ne 'NO_SOURCE') {
	print $error_output " ($example";
	print $error_output " and ", $num_instances - 1, " other place" if $num_instances > 1;
	print $error_output "s" if $num_instances > 2;
	print $error_output ")";
      }
      print $error_output "\n";
    }
  }
  # Return the number of errors and the number of warnings encountered
  ($problem_counts{ERROR} || 0, $problem_counts{WARNING} || 0);
}


##################################################################################### 
# Predicates
##################################################################################### 

package Predicate;

# Keep track of all known predicates
my %predicates;
# Mapping from predicate name without domain info to set of fully-qualified predicate names
#my %name2predicates;
# Set of legal domain types (e.g., {PER, ORG, GPE})
my %legal_domain_types = (
  per => 'true',
  gpe => 'true',
  org => 'true',
);
# Set of legal range types (e.g., {PER, ORG, GPE})
my %legal_range_types = (
  per => 'true',
  gpe => 'true',
  org => 'true',
  string => 'true',
  type => 'true',
);
# Set of types that are entities
my %legal_entity_types = (
  per => 'true',
  gpe => 'true',
  org => 'true',
);

# Is one type specification compatible with another?  The second
# argument must be a hash representing a set of types. The first
# argument may either be the same representation, or a single type
# name. The two are compatible if the second is a (possibly improper)
# superset of the first.
sub is_compatible {
  my ($type, $typeset) = @_;
  my @type_names;
  if (ref $type) {
    @type_names = keys %{$type};
  }
  else {
    @type_names = ($type);
  }
  foreach (@type_names) {
    return unless $typeset->{$_};
  }
  return "compatible";
}

# Find all predicates with the given name that are compatible with the
# domain and range given, if any
sub lookup_predicate {
  my ($name, $domain, $range) = @_;
  my @candidates = @{$predicates{$name} || []};
  @candidates = grep {&is_compatible($domain, $_->get_domain())} @candidates if defined $domain;
  @candidates = grep {&is_compatible($range, $_->get_range())} @candidates if defined $range;
  @candidates;
}

# Populate the predicates tables from $predicates, which is defined at
# the top of this file
sub create_predicates {
  my ($predicates, $label) = @_;
  chomp $predicates;
  foreach (grep {!/^\s*#/} split(/\n/, lc $predicates)) {
    my ($domain, $name, $range, $inverse, $num_provenance_strings) = split;
    my $lcpredicate = lc $name;
    $lcpredicate =~ s/\*$//;
    $num_provenance_strings{$lcpredicate} = $num_provenance_strings;
    # The "single-valued" marker (asterisk) is handled by Predicate->new
    Predicate->new($domain, $name, $range, $inverse, $label);
  }
}
&create_predicates($predicates, 'TAC');

sub load_predicates {
  my ($filename) = @_;
  my $base_filename = $filename;
  $base_filename =~ s/.*\///;
  die "Filename for predicates files should be <label>.predicates.txt" unless $base_filename =~ /^(\w+)\.predicates.txt$/;
  my $label = uc $1;
  open(my $infile, "<:utf8", $filename) or die "Could not open $filename: $!";
  local($/);
  my $predicates = <$infile>;
  close $infile;
  &create_predicates($predicates, $label);
}

# Create a new Predicate object
sub new {
  my ($class, $domain_string, $original_name, $range_string, $original_inverse_name, $label) = @_;
  # Convert the comma-separated list of types to a hash
  my $domain = {map {$_ => 'true'} split(/,/, lc $domain_string)};
  # Make sure each type is legal
  foreach my $type (keys %{$domain}) {
    die "Illegal domain type: $type" unless $legal_domain_types{$type};
  }
  # Do the same for the range
  my $range = {map {$_ => 'true'} split(/,/, lc $range_string)};
  foreach my $type (keys %{$range}) {
    die "Illegal range type: $type" unless $legal_range_types{$type};
  }
  my $name = $original_name;
  my $inverse_name = $original_inverse_name;
  my $quantity = 'list';
  my $inverse_quantity = 'list';
  # Single-valued slots are indicated by a trailing asterisk in the predicate name
  if ($name =~ /\*$/) {
    substr($name, -1, 1, '');
    $quantity = 'single';
  }
  if ($inverse_name =~ /\*$/) {
    substr($inverse_name, -1, 1, '');
    $inverse_quantity = 'single';
  }
  # If this predicate has already been defined, make sure that
  # definition is compatible with the current one, then return it
  my @predicates = &lookup_predicate($name, $domain, $range);
  die "More than one predicate defined for $name($domain_string, $range_string)" if @predicates > 1;
  my $predicate;
  if (@predicates) {
    $predicate = $predicates[0];
    my $current_inverse_name = $predicate->get_inverse_name();
    die "Attempt to redefine inverse of predicate $domain_string:$name from $current_inverse_name to $inverse_name"
      unless $current_inverse_name eq $inverse_name;
    die "Attempt to redefine quantity of predicate $domain_string:$name from $predicate->{QUANTITY} to $quantity"
	unless $predicate->{QUANTITY} eq $quantity;
    my @inverses = &lookup_predicate($inverse_name, $range, $domain);
    die "Multiple inverses with form $inverse_name($range_string, $domain_string)" if (@inverses > 1);
    if (@inverses) {
      my $current_inverse = $inverses[0];
      die "Attempt to redefine inverse of $domain_string:$name"
	if defined $predicate->{INVERSE} && $predicate->{INVERSE} ne $current_inverse;
    }
    return $predicate;
  }
  # This predicate has not been defined already, so build it. INVERSE is added below.
  $predicate = bless({NAME         => $name,
		      LABEL        => $label,
		      DOMAIN       => $domain,
		      RANGE        => $range,
		      INVERSE_NAME => $inverse_name,
		      QUANTITY     => $quantity},
		     $class);
  # Save the new predicate in %predicates
  push(@{$predicates{$name}}, $predicate);
  # Automatically generate the inverse predicate
  $predicate->{INVERSE} = $class->new($range_string, $original_inverse_name, $domain_string, $original_name, $label)
    unless $inverse_name eq 'none';
  $predicate;
}

# Check the list just for safety
foreach my $type (keys %predicates) {
  foreach my $predicate (@{$predicates{$type}}) {
    die "Botched inverse for predicate $predicate->{NAME} with inverse name $predicate->{INVERSE_NAME} and inverse $predicate->{INVERSE}"
      if $predicate->{INVERSE_NAME} ne 'none' && $predicate->{INVERSE}{INVERSE} != $predicate;
  }
}

# Populate the set of predicate aliases from $predicate_aliases (defined at the top of this file)
my %predicate_aliases;
foreach (grep {!/^\s*#/} split(/\n/, lc $predicate_aliases)) {
  my ($type, $alias, $actual) = split;
  $predicate_aliases{$alias} = {TYPE => $type, REPLACEMENT => $actual};
}

# Rewrite this predicate name if it is an alias
sub rewrite_predicate {
  my ($predicate, $source) = @_;
  my $alias = $predicate_aliases{$predicate};
  return $predicate unless defined $alias;
  &main::record_problem('PREDICATE_ALIAS', $alias->{TYPE}, $predicate, $alias->{REPLACEMENT}, $source);
  $alias->{REPLACEMENT};
}

# Handy selectors
sub get_name {$_[0]->{NAME}}
sub get_domain {$_[0]->{DOMAIN}}
sub get_range {$_[0]->{RANGE}}
sub get_inverse {$_[0]->{INVERSE}}
sub get_inverse_name {$_[0]->{INVERSE_NAME}}
sub get_quantity {$_[0]->{QUANTITY}}

# Find the correct predicate name for this (verb, subject, object)
# triple, performing a variety of error checks
sub get_predicate {
  # The source appears as the last argument passed; preceding arguments are not necessarily present
  my $source = pop(@_);
  my ($verb, $subject_type, $object_type) = @_;
  return $verb if ref $verb;
  $subject_type = lc $subject_type if defined $subject_type;
  $object_type = lc $object_type if defined $object_type;
  my $domain_string = $subject_type;
  my $range_string = $object_type;
  if ($verb =~ /^(.*?):(.*)$/) {
    $domain_string = lc $1;
    $verb = $2;
    unless($legal_domain_types{$domain_string}) {
      &main::record_problem('ILLEGAL_PREDICATE_TYPE', $domain_string, $source);
      return;
    }
  }
  if (defined $domain_string &&
      defined $subject_type &&
      $legal_domain_types{$subject_type} &&
      $domain_string ne $subject_type) {
    &main::record_problem('SUBJECT_PREDICATE_MISMATCH', $subject_type, $domain_string, $source);
    return;
  }
  $verb = &rewrite_predicate(&main::string2c($verb), $source);
  my @candidates = &lookup_predicate($verb, $domain_string, $range_string);
  unless (@candidates) {
    &main::record_problem('ILLEGAL_PREDICATE', $verb, $source);
    return 'undefined';
  }
  return $candidates[0] if @candidates == 1;
  &main::record_problem('AMBIGUOUS_PREDICATE', $verb, $source);
  return 'ambiguous';
}


##################################################################################### 
# Knowledge base
##################################################################################### 

# This is not really a KB per se, because we have to be resilient to errors in the input
package KB;

# Create a new empty KB
sub new {
  my ($class) = @_;
  my $self = {};
  bless($self, $class);
  $self;
}

# Find or create the KB entity with a given name
sub intern {
  my ($kb, $name, $source) = @_;
  return $name if ref $name;
  if ($name =~ /^"/) {
    &main::record_problem('STRING_USED_FOR_ENTITY', $name, $source);
    return;
  }
  unless ($name =~ /^:?\w+$/) {
    main::record_problem('ILLEGAL_ENTITY_NAME', $name, $source);
    return;
  }
  unless ($name =~ /^:/) {
    &main::record_problem('COLON_OMITTED', $name, $source);
    $name = ":$name";
  }
  my $entity = $kb->{ENTITIES}{$name};
  unless (defined $entity) {
    $entity = {NAME => $name};
    $kb->{ENTITIES}{$name} = $entity;
  }
  $entity;
}

# Record that an entity has been used in a particular way (e.g., it's
# been given a type, it appears as the subject of a predicate, etc.)
sub entity_use {
  my ($kb, $name, $use_type, $source) = @_;
  die "Unknown use type: $use_type" unless $use_priority{$use_type};
  my $entity = $kb->intern($name, $source);
  # Do nothing if the name is malformed
  return unless defined $entity;
  $use_type = uc $use_type;
  push(@{$entity->{USES}{$use_type}}, $source);
  # When an error message refers to a particular entity, we'd like to
  # give as clear a pointer to the entity as we can. This code keeps
  # track of the "best" use of an entity for reporting purposes, with
  # %use_priority providing the definition of "best."
  my $thisuse = {USE_TYPE => $use_type, SOURCE => $source};
  my $bestuse = $entity->{BESTUSE} || $thisuse;
  $bestuse = $thisuse if $bestuse->{USE_TYPE} eq $thisuse->{USE_TYPE} &&
                         $bestuse->{SOURCE}{LINENUM} > $thisuse->{SOURCE}{LINENUM};
  $bestuse = $thisuse if $use_priority{$bestuse->{USE_TYPE}} > $use_priority{$thisuse->{USE_TYPE}};
  $entity->{BESTUSE} = $bestuse;
  $entity;
}

# Assert that an entity has the given type
sub entity_typedef {
  my ($kb, $name, $type, $def_type, $source) = @_;
  die "Unknown def type: $def_type" unless $use_priority{$def_type};
  # A type specification with multiple types doesn't give us any information, so ignore it
  if (ref $type) {
    my @types = keys %{$type};
    return if (@types > 1);
    die "type set with no entries in entity_typedef" unless @types;
    $type = $types[0];
  }
  $type = lc $type;
  # Only legal types may be asserted
  unless ($legal_domain_types{$type}) {
    &main::record_problem('ILLEGAL_ENTITY_TYPE', $type, $source);
    return;
  }
  my $entity = $kb->intern($name, $source);
  # Do nothing if the name is malformed
  return unless defined $entity;
  $def_type = uc $def_type;
  push(@{$entity->{TYPEDEFS}{$type}{$def_type}}, $source);
  my $thisdef = {DEFTYPE => $def_type, SOURCE => $source};
  my $bestdef = $entity->{BESTDEF}{$type} || $thisdef;
  # The best definition to point the user at is the one with the
  # highest use_priority, or, if they're the same, the one that occurs
  # first in the file
  $bestdef = $thisdef if $bestdef->{DEFTYPE} eq $thisdef->{DEFTYPE} &&
                         $bestdef->{SOURCE}{LINENUM} > $thisdef->{SOURCE}{LINENUM};
  $bestdef = $thisdef if $use_priority{$bestdef->{DEFTYPE}} > $use_priority{$thisdef->{DEFTYPE}};
  $entity->{BESTDEF}{$type} = $bestdef;
  $entity;
}

# Find the type of a given entity, if known
sub get_entity_type {
  my ($kb, $entity, $source) = @_;
  $entity = $kb->intern($entity, $source);
  # We'll only get nil if the entity name is malformed, but return
  # unknown nonetheless
  return 'unknown' unless defined $entity;
  my @types = keys %{$entity->{TYPEDEFS}};
  return $types[0] if @types == 1;
  return 'unknown' unless @types;
  return 'multiple';
}

# Assert a particular triple into the KB
sub add_assertion {
  my ($kb, $subject, $verb, $object, $provenance, $confidence, $source) = @_;
  # First, normalize all of the triple components
  my $subject_entity = $kb->intern($subject, $source);
  return unless defined $subject_entity;
  $subject = $subject_entity->{NAME};
  my $subject_type = $kb->get_entity_type($subject_entity);
  $subject_type = undef unless $legal_domain_types{$subject_type};
  my $object_entity;
  $verb = &main::string2c($verb);
  my $predicate = &Predicate::get_predicate($verb, $subject_type, $source);
  return unless ref $predicate;
  $verb = $predicate->get_name();
  # Record entity uses and type definitions. 'type' assertions are special-cased (as they have no object)
  if ($verb eq 'type') {
    $kb->entity_use($subject_entity, 'TYPEDEF', $source);
    $kb->entity_typedef($subject_entity, $object, 'TYPEDEF', $source);
  }
  else {
    $kb->entity_use($subject_entity, 'SUBJECT', $source);
    $kb->entity_typedef($subject_entity, $predicate->get_domain(), 'SUBJECT', $source);
    if (&Predicate::is_compatible('string', $predicate->get_range())) {
      # Make sure this is a properly double quoted string
      unless ($object =~ /^"(?>(?:(?>[^"\\]+)|\\.)*)"$/) {
	# If not, complain and stick double quotes around it
	&main::record_problem('UNQUOTED_STRING', $object, $source);
	$object =~ s/(["\\])/\\$1/g;
	$object = "\"$object\"";
      }
    }
    if (&Predicate::is_compatible($predicate->get_range(), \%legal_entity_types)) {
      $object_entity = $kb->intern($object, $source);
      return unless defined $object_entity;
      $object = $object_entity->{NAME};
      $kb->entity_use($object_entity, 'OBJECT', $source);
      $kb->entity_typedef($object_entity, $predicate->get_range(), 'OBJECT', $source);
    }
  }
  # Check for duplicate assertions
  my $is_duplicate_of;
  unless ($verb eq 'mention' || $verb eq 'canonical_mention' || $verb eq 'type') {
  existing:
    # We don't consider inferred assertions to be duplicates
    foreach my $existing (grep {!$_->{INFERRED}} $kb->get_assertions($subject, $verb, $object)) {
      # Don't worry about duplicates of assertions that have already been omitted from the output
      next if $existing->{OMIT_FROM_OUTPUT};
      # If only one is allowed, any matching assertion is a duplicate
      if ($multiple_attestations eq 'ONE') {
	$is_duplicate_of = $existing;
	last existing;
      }
      # In all other cases, it's not a duplicate unless it was extracted from the same document
      next existing unless $existing->{PROVENANCE}{DOCID} eq $provenance->{DOCID};
      if ($multiple_attestations eq 'ONEPERDOC') {
	$is_duplicate_of = $existing;
	last existing;
      }
      # If "many" duplicate assertions are allowed, we only have a
      # problem if it is being asserted about exactly the same mention
      foreach my $name (qw(SUBJECT PREDICATE OBJECT)) {
	# If relations are added with no provenance information, any
	# duplication within a document looks like exactly the same
	# assertion
	next unless defined $existing->{PROVENANCE}{$name}{START} && defined $provenance->{$name}{START};
	next existing if $existing->{PROVENANCE}{$name}{START} != $provenance->{$name}{START};
	next unless defined $existing->{PROVENANCE}{$name}{END} && defined $provenance->{$name}{END};
	next existing if $existing->{PROVENANCE}{$name}{END} != $provenance->{$name}{END};
      }
      # This if is entirely unnecessary, but it makes everything look nice and symmetric
      if ($multiple_attestations eq 'MANY') {
	# This is an actual duplicate of exactly the same information
	$is_duplicate_of = $existing;
	last existing;
      }
    }
  }

  # Handle single-valued slots that are given more than one filler
  my $is_multiple_of;
  if ($predicate->{QUANTITY} eq 'single') {
    foreach my $existing ($kb->get_assertions($subject, $verb)) {
      # Again, ignore assertions that have already been omitted from the output
      next if $existing->{OMIT_FROM_OUTPUT};
      if (defined $object_entity && defined $existing->{OBJECT_ENTITY}) {
	if ($object_entity != $existing->{OBJECT_ENTITY}) {
	  $is_multiple_of = $existing;
	  last;
	}
      }
      elsif ($object ne $existing->{OBJECT}) {
	$is_multiple_of = $existing;
	last;
      }
    }
  }

  # Create the assertion, but don't record it yet
  my $assertion = {SUBJECT => $subject,
		   VERB => $verb,
		   OBJECT => $object,
		   PRINT_STRING => "$verb($subject, $object)",
		   SUBJECT_ENTITY => $subject_entity,
		   PREDICATE => $predicate,
		   OBJECT_ENTITY => $object_entity,
		   PROVENANCE => $provenance,
		   CONFIDENCE => $confidence,
		   SOURCE => $source};
  # Only output one of a set of multiples
  if ($is_multiple_of) {
    &main::record_problem('MULTIPLE_FILLS', $subject, $verb, $source);
    if ($confidence < $is_multiple_of->{CONFIDENCE}) {
      $assertion->{OMIT_FROM_OUTPUT} = 'true';
    }
    elsif ($confidence > $is_multiple_of->{CONFIDENCE}) {
      $is_multiple_of->{OMIT_FROM_OUTPUT} = 'true';
    }
    elsif ($assertion->{SOURCE}{LINENUM} < $is_multiple_of->{SOURCE}{LINENUM}) {
      $is_multiple_of->{OMIT_FROM_OUTPUT} = 'true';
    }
    else {
      $assertion->{OMIT_FROM_OUTPUT} = 'true';
    }
  }
  if ($is_duplicate_of) {
    # Make sure this isn't exactly the same assertion
    return if $provenance->{DOCID} eq $is_duplicate_of->{PROVENANCE}{DOCID} &&
      (!defined $provenance->{SUBJECT} ||
       !defined $is_duplicate_of->{PROVENANCE}{SUBJECT} ||
       ($provenance->{SUBJECT}{START} == $is_duplicate_of->{PROVENANCE}{SUBJECT}{START} &&
	$provenance->{SUBJECT}{END} == $is_duplicate_of->{PROVENANCE}{SUBJECT}{END})) &&
      defined $provenance->{PREDICATE} &&
      (!defined $provenance->{PREDICATE} ||
       !defined $is_duplicate_of->{PROVENANCE}{PREDICATE} ||
       ($provenance->{PREDICATE}{START} == $is_duplicate_of->{PROVENANCE}{PREDICATE}{START} &&
	$provenance->{PREDICATE}{END} == $is_duplicate_of->{PROVENANCE}{PREDICATE}{END})) &&
      defined $provenance->{OBJECT} &&
      (!defined $provenance->{OBJECT} ||
       !defined $is_duplicate_of->{PROVENANCE}{OBJECT} ||
       ($provenance->{OBJECT}{START} == $is_duplicate_of->{PROVENANCE}{OBJECT}{START} &&
	$provenance->{OBJECT}{END} == $is_duplicate_of->{PROVENANCE}{OBJECT}{END}));
    &main::record_problem('DUPLICATE_ASSERTION', "$is_duplicate_of->{SOURCE}{FILENAME} line $is_duplicate_of->{SOURCE}{LINENUM}", $source);
    # Keep the duplicate with higher confidence. If the confidences are the same, keep the earlier one
    if ($confidence < $is_duplicate_of->{CONFIDENCE}) {
      $assertion->{OMIT_FROM_OUTPUT} = 'true';
    }
    elsif ($confidence > $is_duplicate_of->{CONFIDENCE}) {
      $is_duplicate_of->{OMIT_FROM_OUTPUT} = 'true';
    }
    elsif ($assertion->{SOURCE}{LINENUM} < $is_duplicate_of->{SOURCE}{LINENUM}) {
      $is_duplicate_of->{OMIT_FROM_OUTPUT} = 'true';
    }
    else {
      $assertion->{OMIT_FROM_OUTPUT} = 'true';
    }
  }
  # Record the assertion in various places for easy retrieval
  push(@{$kb->{MENTIONS}{$provenance->{DOCID}}}, $assertion)
    if defined $predicate && ($predicate->{NAME} eq 'mention');
  push(@{$kb->{DOCIDS}{$subject}{$verb}{$provenance->{DOCID}}}, $assertion)
    if defined $predicate && ($predicate->{NAME} eq 'mention' || $predicate->{NAME} eq 'canonical_mention');
  push(@{$kb->{ASSERTIONS3}{$subject}{$verb}{$object}}, $assertion);
  push(@{$kb->{ASSERTIONS2}{$subject}{$verb}}, $assertion);
  push(@{$kb->{ASSERTIONS1}{$subject}}, $assertion);
  push(@{$kb->{ASSERTIONS0}}, $assertion);
  $assertion;
}

# Select a global canonical mention for this entity
sub get_best_mention {
  my ($kb, $entity, $docid) = @_;
  my $best = "";
  if (defined $docid) {
    my @mentions = $kb->get_assertions($entity, 'canonical_mention', undef, $docid);
    if (@mentions == 1) {
      $best = $mentions[0]{OBJECT};
    }
    else {
      print $error_output "Oh dear, Wrong number of canonical mentions in document $docid\n";
    }
  } else {
    my @mentions = $kb->get_assertions($entity, 'canonical_mention');
    foreach my $mention (@mentions) {
      $best = $mention->{OBJECT} if length($mention->{OBJECT}) > length($best);
    }
  }
  $best;
}

# More handy accessors
sub get_subjects { my ($kb) = @_;                  keys %{$kb->{ASSERTIONS1}} }
sub get_verbs    { my ($kb, $subject) = @_;        keys %{$kb->{ASSERTIONS2}{$subject}} }
sub get_objects  { my ($kb, $subject, $verb) = @_; keys %{$kb->{ASSERTIONS3}{$subject}{$verb}} }
sub get_docids   { my ($kb, $subject, $verb) = @_; keys %{$kb->{DOCIDS}{$subject}{$verb}} }

# Find all assertions that match a given pattern
sub get_assertions {
  my ($kb, $subject, $verb, $object, $docid) = @_;
  die "get_assertions given both object and docid at (" . join(", ", caller) . ")"
    if defined $object && defined $docid;

  $subject = $subject->{NAME} if ref $subject;
  $verb = $verb->{VERB} if ref $verb;
  $object = $object->{NAME} if ref $object;

  return(@{$kb->{ASSERTIONS3}{$subject}{$verb}{$object} || []}) if defined $object;
  return(@{$kb->{DOCIDS}{$subject}{$verb}{$docid} || []}) if defined $docid;
  return(@{$kb->{ASSERTIONS2}{$subject}{$verb} || []}) if defined $verb;
  return(@{$kb->{ASSERTIONS1}{$subject} || []}) if defined $subject;
  return(@{$kb->{ASSERTIONS0} || []});
}


##################################################################################### 
# Error checking and inferred relations
##################################################################################### 

# Report entities that don't have exactly one type
sub check_entity_types {
  my ($kb) = @_;
  while (my ($name, $entity) = each %{$kb->{ENTITIES}}) {
    my $type = $kb->get_entity_type($entity);
    if ($type eq 'unknown') {
      &main::record_problem('UNKNOWN_TYPE', $name, $entity->{BESTUSE}{SOURCE});
    }
    elsif ($type eq 'multiple') {
      &main::record_problem('MULTITYPED_ENTITY', $name,
			    join(", ", map {"$_ at line $entity->{BESTDEF}{$_}{SOURCE}{LINENUM}"}
				 sort keys %{$entity->{BESTDEF}}), 'NO_SOURCE');
    }
  }
}

# Make sure that every entity that has been mentioned or used somewhere has a typedef
sub check_definitions {
  my ($kb) = @_;
  while (my ($name, $entity) = each %{$kb->{ENTITIES}}) {
    # I suspect that having multiple types here (PER, ORG, GPE) is at this point vestigial
    foreach my $type (keys %{$entity->{BESTDEF}}) {
      # An entity that is used in any way must have an actual typedef somewhere
      &main::record_problem('MISSING_TYPEDEF', $name, $entity->{BESTDEF}{$type}{SOURCE})
	unless $entity->{TYPEDEFS}{$type}{TYPEDEF};
    }
  }
}

# Make sure that every assertion also has an asserted inverse
sub assert_inverses {
  my ($kb) = @_;
  foreach my $assertion ($kb->get_assertions()) {
    next unless ref $assertion->{PREDICATE};
    next unless &Predicate::is_compatible($assertion->{PREDICATE}{RANGE}, \%legal_entity_types);
    unless ($kb->get_assertions($assertion->{OBJECT}, $assertion->{PREDICATE}{INVERSE_NAME}, $assertion->{SUBJECT})) {
      &main::record_problem('MISSING_INVERSE', $assertion->{PREDICATE}->get_name(),
			    $assertion->{SUBJECT}, $assertion->{OBJECT}, $assertion->{SOURCE});
      # Assert the inverse if it's not already there
      my $inverse = $kb->add_assertion($assertion->{OBJECT}, $assertion->{PREDICATE}{INVERSE_NAME}, $assertion->{SUBJECT},
				       $assertion->{PROVENANCE}, $assertion->{CONFIDENCE}, $assertion->{SOURCE});
      # And flag this as an inferred relation
      $inverse->{INFERRED} = 'true';
      # Make sure the visibility of the assertion and its inverse is in sync
      $assertion->{OMIT_FROM_OUTPUT} = 'true' if $inverse->{OMIT_FROM_OUTPUT};
      $inverse->{OMIT_FROM_OUTPUT} = 'true' if $assertion->{OMIT_FROM_OUTPUT};
    }
  }
}

# Make sure that mentions and canonical_mentions are in sync
sub assert_mentions {
  my ($kb) = @_;
  foreach my $subject ($kb->get_subjects()) {
    my %docids;
    foreach my $docid ($kb->get_docids($subject, 'mention'),
		       $kb->get_docids($subject, 'canonical_mention')) {
      $docids{$docid}++;
    }
    unless (keys %docids) {
      &main::record_problem('NO_MENTIONS', $subject, 'NO_SOURCE');
      next;
    }
    foreach my $docid (keys %docids) {
      my @mentions = sort {$a->{PROVENANCE}{SUBJECT}{START} <=> $b->{PROVENANCE}{SUBJECT}{START}}
	$kb->get_assertions($subject, 'mention', undef, $docid);
      my @canonical_mentions = sort {$a->{PROVENANCE}{SUBJECT}{START} <=> $b->{PROVENANCE}{SUBJECT}{START}}
	$kb->get_assertions($subject, 'canonical_mention', undef, $docid);
      if (@canonical_mentions == 0) {
	&main::record_problem(@mentions == 1 ? 'UNASSERTED_CANONICAL' : 'MISSING_CANONICAL', $subject, $docid, 'NO_SOURCE');
	my $mention = $mentions[0];
	my $assertion = $kb->add_assertion($mention->{SUBJECT}, 'canonical_mention', $mention->{OBJECT},
					   $mention->{PROVENANCE}, $mention->{CONFIDENCE}, $mention->{SOURCE});
	$assertion->{INFERRED} = 'true';
      }
      elsif (@canonical_mentions > 1) {
	&main::record_problem('MULTIPLE_CANONICAL', $subject, $docid, 'NO_SOURCE');
      }
      foreach my $canonical_mention (@canonical_mentions) {
	# Find the mention that matches this canonical mention, if any
	while (@mentions && $mentions[0]{PROVENANCE}{SUBJECT}{START} < $canonical_mention->{PROVENANCE}{SUBJECT}{START}) {
	  shift @mentions;
	}
	if (@mentions &&
	    $mentions[0]{PROVENANCE}{SUBJECT}{START} == $canonical_mention->{PROVENANCE}{SUBJECT}{START} &&
	    $mentions[0]{PROVENANCE}{SUBJECT}{END} == $canonical_mention->{PROVENANCE}{SUBJECT}{END}) {
	  # Found a match, so this pair can be safely ignored
	  shift @mentions;
	}
	else {
	  # Canonical mention without a corresponding mention
	  &main::record_problem('UNASSERTED_MENTION', $subject, $docid, $canonical_mention->{SOURCE});
	  my $assertion = $kb->add_assertion($canonical_mention->{SUBJECT}, 'mention', $canonical_mention->{OBJECT},
					     $canonical_mention->{PROVENANCE},
					     $canonical_mention->{CONFIDENCE},
					     $canonical_mention->{SOURCE});
	  $assertion->{INFERRED} = 'true';
	}
      }
    }
  }
}

# Make sure that all confidence values are legal
sub check_confidence {
  my ($kb) = @_;
  foreach my $assertion ($kb->get_assertions()) {
    if (defined $assertion->{CONFIDENCE}) {
      unless ($assertion->{CONFIDENCE} =~ /^(?:1\.0*)$|^(?:0?\.[0-9]+)$/) {
	&main::record_problem('ILLEGAL_CONFIDENCE_VALUE', $assertion->{CONFIDENCE}, $assertion->{SOURCE});
	$assertion->{CONFIDENCE} = '1.0';
      }
    }
  }
}

# Make sure that no character in a document is part of more than one
# entity mention. This uses the O(length(doc)-ish) algorithm; it's not
# clear that the O(num_mentions**2) algorithm would be better
sub check_for_mention_overlap {
  my ($kb) = @_;
  foreach my $docid (keys %{$kb->{MENTIONS}}) {
    my @positions;
    foreach my $assertion (@{$kb->{MENTIONS}{$docid}}) {
      my $overlapping;
      foreach my $i ($assertion->{PROVENANCE}{SUBJECT}{START} .. $assertion->{PROVENANCE}{SUBJECT}{END}) {
	$overlapping = $positions[$i] if defined $positions[$i];
	$positions[$i] = $assertion;
      }
      if ($overlapping) {
	&main::record_problem('OVERLAPPING_MENTIONS',
			      "$assertion->{PROVENANCE}{SUBJECT}{START}-$assertion->{PROVENANCE}{SUBJECT}{END}",
			      "$overlapping->{PROVENANCE}{SUBJECT}{START}-$overlapping->{PROVENANCE}{SUBJECT}{END}",
			      $docid,
			      $assertion->{SOURCE});
      }
    }
  }
}

# Each endpoint of a relation that is an entity must be attested in
# the document that attests to the relation
sub check_relation_endpoints {
  my ($kb) = @_;
  foreach my $assertion ($kb->get_assertions()) {
    next unless ref $assertion->{PREDICATE};
    next if $assertion->{PREDICATE}{NAME} eq 'type';
    next if $assertion->{PREDICATE}{NAME} eq 'mention';
    next if $assertion->{PREDICATE}{NAME} eq 'canonical_mention';
    my $docid = $assertion->{PROVENANCE}{DOCID};
    if (defined $assertion->{SUBJECT_ENTITY}) {
      my @mentions = $kb->get_assertions($assertion->{SUBJECT_ENTITY}, 'mention', undef, $docid);
      &main::record_problem('UNATTESTED_RELATION_ENTITY',
			    $assertion->{PRINT_STRING},
			    $assertion->{SUBJECT_ENTITY}{NAME},
			    $docid,
			    $assertion->{SOURCE})
	unless @mentions;
    }
    if (defined $assertion->{OBJECT_ENTITY}) {
      my @mentions = $kb->get_assertions($assertion->{OBJECT_ENTITY}, 'mention', undef, $docid);
      &main::record_problem('UNATTESTED_RELATION_ENTITY',
			    $assertion->{PRINT_STRING},
			    $assertion->{OBJECT_ENTITY}{NAME},
			    $docid,
			    $assertion->{SOURCE})
	unless @mentions;
    }
  }
}

# Perform a number of basic checks to make sure that the KB is well-formed
sub check_integrity {
  my ($kb) = @_;
  $kb->check_entity_types();
  $kb->check_definitions();
  $kb->assert_inverses();
  $kb->assert_mentions();
  $kb->check_relation_endpoints();
  $kb->check_confidence();
  $kb->check_for_mention_overlap();
}

# Print out all assertions
sub dump_assertions {
  my ($kb) = @_;
  foreach my $assertion ($kb->get_assertions()) {
    if (defined $assertion->{PREDICATE}) {
      print $program_output "p:$assertion->{PREDICATE}{NAME}";
    }
    else {
      print $program_output "v:$assertion->{VERB}";
    }
    print $program_output "($assertion->{SUBJECT}, $assertion->{OBJECT})";
    if (ref $assertion->{PROVENANCE}) {
      print $program_output " $assertion->{PROVENANCE}{DOCID}";
    }
    print $program_output "\n";
  }
}


##################################################################################### 
# Loading and saving
##################################################################################### 

package main;

# Load a KB that is expressed in TAC format (tab-separated triples with provenance)
sub load_tac {
  my ($filename, $docids) = @_;
  my $kb = KB->new();
  open(my $infile, "<:utf8", $filename) or die "Could not open $filename: $!";
  my $runid = <$infile>;
  chomp $runid;
  $kb->{RUNID_LINE} = $runid;
  $runid =~ s/\s*#.*//;
  $runid =~ s/^\s+//;
  $runid =~ s/\s+$//;
  if (length($runid) == 0 || $runid =~ /^:/ || $runid =~ /\s/) {
    &main::record_problem('MISSING_RUNID', {FILENAME => $filename, LINENUM => $.});
    # The most likely explanation if the line is not blank is that the
    # runid was omitted entirely, so go back to the beginning and
    # process as if there is no runid there
    seek($infile, 0, 0) or die "Could not seek to the start of $filename: $!";
    $runid = 'OmittedRunID';
    $kb->{RUNID_LINE} = "OmittedRunID\t# Did not find a legal run ID at the start of $filename";
  }
  $kb->{RUNID} = $runid;
  while (<$infile>) {
    chomp;
    my $source = {FILENAME => $filename, LINENUM => $.};
    my $confidence = '1.0';
    # Eliminate comments, ensuring that pound signs in the middle of
    # strings are not treated as comment characters
    s/$comment_pattern/$1/;
    next unless /\S/;
    my @entries = grep {/\S/} split(/\t/);
    # Get the confidence out of the way if it is provided
    $confidence = pop(@entries) if @entries && $entries[-1] =~ /^\d+\.\d+$/;
    # Now assign the entries to the appropriate fields
    unless (@entries >= 3) {
      &main::record_problem('SYNTAX_ERROR', "Not enough entries on line", $source);
      next;
    }
    my ($subject, $predicate, $object, $docid, @offsets) = @entries;
    my $provenance = &parse_provenance($docid, $docids, lc $predicate, $source, @offsets);
    next unless defined $provenance;
    unless (lc $predicate eq 'type' || defined $docid) {
      &main::record_problem('SYNTAX_ERROR', "Not enough entries on line", $source);
      next;
    }
    $kb->add_assertion($subject, $predicate, $object,
		       $provenance, $confidence,
		       {FILENAME => $filename, LINENUM => $.});
  }
  close $infile;
  $kb->check_integrity();
  $kb;
}

# Make sure that the provenance for an assertion is correctly constructed
sub parse_provenance {
  my ($docid, $docids, $lcpredicate, $source, @offsets) = @_;
  $lcpredicate =~ s/.*://;
  &main::record_problem('ILLEGAL_DOCID', $docid, $source)
    unless $lcpredicate eq 'type' || !defined $docids || $docid eq 'NO_DOC' || defined $docids->{$docid};
  # Most predicates take three offset pairs (subject, predicate, object)
  my $num_provenance_strings = defined $num_provenance_strings{$lcpredicate} ? $num_provenance_strings{$lcpredicate} : 3;
  # Each offset pair has a START and and END
  my $num_offsets = 2 * $num_provenance_strings;
  # Squawk if the number of offsets is incorrect
  unless (@offsets == $num_offsets) {
    &main::record_problem('SYNTAX_ERROR',
		    "Wrong number of offsets for predicate $lcpredicate (expected $num_offsets, got " . scalar @offsets . ")",
		    $source);
    return;
  }
  my $provenance = {DOCID => $docid};
  foreach my $name (qw(SUBJECT PREDICATE OBJECT)[0..($num_provenance_strings - 1)]) {
    my $start_offset = shift @offsets;
    unless ($start_offset =~ /^\d+$/) {
      &main::record_problem('SYNTAX_ERROR', "Non-integer offset: $start_offset", $source);
      return;
    }
    if (defined $docids && defined $docids->{$docid} && $start_offset >= $docids->{$docid}) {
      &main::record_problem('ILLEGAL_OFFSET', $start_offset, $docid, $source);
      return;
    }
    my $end_offset = shift @offsets;
    unless ($end_offset =~ /^\d+$/) {
      &main::record_problem('SYNTAX_ERROR', "Non-integer offset: $end_offset", $source);
      return;
    }
    if (defined $docids && defined $docids->{$docid} && $end_offset >= $docids->{$docid}) {
      &main::record_problem('ILLEGAL_OFFSET', $end_offset, $docid, $source);
      return;
    }
    if ($start_offset > $end_offset) {
      &main::record_problem('ILLEGAL_OFFSET_PAIR', $start_offset, $end_offset, $source);
      return;
    }
    $provenance->{$name} = {START => $start_offset, END => $end_offset};
  }
  $provenance;
}

# Load a KB that is expressed in "simple knowledge base" format
sub load_skb {
  my ($filename, $docids) = @_;
  my %entity_defs_seen;
  my $kb = KB->new();
  # Simple KB doesn't require explicit canonicals for documents with
  # single mentions, nor must entities be preceded by a colon
  $ignore_warnings{UNASSERTED_CANONICAL}++;
  $ignore_warnings{COLON_OMITTED}++;
  open(my $infile, "<:utf8", $filename) or die "Could not open $filename: $!";
  my $runid = <$infile>;
  chomp $runid;
  $kb->{RUNID_LINE} = $runid;
  $runid =~ s/\s*#.*//;
  $runid =~ s/^\s+//;
  $runid =~ s/\s+$//;
  if (length($runid) == 0 || $runid =~ /^:/ || $runid =~ /\s/) {
    &main::record_problem('MISSING_RUNID', {FILENAME => $filename, LINENUM => $.});
    # The most likely explanation if the line is not blank is that the
    # runid was omitted entirely, so go back to the beginning and
    # process as if there is no runid there
    seek($infile, 0, 0) or die "Could not seek to the start of $filename: $!";
    $runid = 'OmittedRunID';
    $kb->{RUNID_LINE} = "OmittedRunID\t# Did not find a legal run ID at the start of $filename";
  }
  $kb->{RUNID} = $runid;
  # Entity IDs are at the top level (i.e., no leading tabs)
  while (<$infile>) {
    chomp;
    my $source = {FILENAME => $filename, LINENUM => $.};
    # Skip comments and blank lines
    s/$comment_pattern/$1/;
    next unless /\S/;
    my $subject = $_;
    $kb->entity_use($subject, 'MENTION', {FILENAME => $filename, LINENUM => $.});
    &main::record_problem('DUPLICATE_SKB_ENTITY', $subject, {FILENAME => $filename, LINENUM => $.}) if $entity_defs_seen{$subject}++;
    # Process all the slots for this entity
  input_line:
    while (<$infile>) {
      chomp;
      my $confidence = '1.0';
      # Eliminate comments
      s/$comment_pattern/$1/;
      # Slots for this entity end when the line no longer begins with a tab
      last unless /^\t/;
      my @entries = split(/\t/);
      # Get the confidence out of the way if it is provided
      $confidence = pop(@entries) if @entries && $entries[-1] =~ /^\d+\.\d+$/;
      # Now assign the entries to the appropriate fields
      my (undef, $predicate, $docid, $object, @offsets) = @entries;
      my $lcpredicate = lc $predicate;
      # Type assertions have no docid
      if ($lcpredicate eq 'type') {
	if (defined $object && length($object)) {
	  &main::record_problem('SYNTAX_ERROR', 'Too many entries for $predicate assertion', {FILENAME => $filename, LINENUM => $.});
	  next input_line;
	}
	$object = $docid;
	$docid = 'NO_DOC';
      }
      else {
	unless (defined $object && length($object)) {
	  &main::record_problem('SYNTAX_ERROR', "No object given for $predicate", {FILENAME => $filename, LINENUM => $.});
	  next input_line;
	}
      }
      my $provenance = &parse_provenance($docid, $docids, $lcpredicate, {FILENAME => $filename, LINENUM => $.}, @offsets);
      next input_line unless defined $provenance;
      # *mention is a canonical mention; assert the canonical, then
      # convert the assertion to a regular mention
      if (lc $predicate eq '*mention') {
	$kb->add_assertion($subject, 'canonical_mention', $object,
			   $provenance, $confidence, {FILENAME => $filename, LINENUM => $.});
	$predicate = 'mention';
      }
      # OK, we're satisfied.  Go ahead and assert the triple
      $kb->add_assertion($subject, $predicate, $object,
			 $provenance, $confidence, {FILENAME => $filename, LINENUM => $.});
    }
  }
  close $infile;
  $kb->check_integrity();
  $kb;
}

# Output the KB in "simple knowledge base" format
sub export_skb {
  my ($kb, $output_case_fn, $output_labels) = @_;
  print $program_output "$kb->{RUNID_LINE}\n\n";
  # Make sure the entities appear in the same order as in the original file
  foreach my $name (sort {$kb->{ENTITIES}{$a}{BESTUSE}{SOURCE}{LINENUM} <=>
			  $kb->{ENTITIES}{$b}{BESTUSE}{SOURCE}{LINENUM}}
		    keys %{$kb->{ENTITIES}}) {
    my $entity = $kb->{ENTITIES}{$name};
    my $type = $kb->get_entity_type($entity);
    # Don't output entities with unknown type
    next if $type eq 'unknown';
    next if $type eq 'multiple';
    # No initial colons in skb
    my $printname = $name;
    $printname =~ s/^://;
    # Entity name
    print $program_output "$printname";
    print $program_output " (", $kb->get_best_mention($entity), ")" if $display_names;
    print $program_output "\n";
    # Entity type
    print $program_output "\t", &{$output_case_fn}("Type"), "\t", uc $type, "\n";
    my %dedup;
    foreach my $assertion ($kb->get_assertions($name, 'mention')) {
      next if $assertion->{OMIT_FROM_OUTPUT};
      next unless $output_labels->{$assertion->{PREDICATE}{LABEL}};
      my $mentionstring = &{$output_case_fn}("Mention");
      my $docid = $assertion->{PROVENANCE}{DOCID};
      my @canonical = $kb->get_assertions($entity, 'canonical_mention', undef, $docid);
      $mentionstring = "*$mentionstring" if $kb->get_assertions($entity, 'mention', undef, $docid) > 1 &&
	$canonical[0]->{OBJECT} eq $assertion->{OBJECT} &&
	$canonical[0]{PROVENANCE}{SUBJECT}{START} == $assertion->{PROVENANCE}{SUBJECT}{START};
      my $output_line = "\t$mentionstring\t$assertion->{PROVENANCE}{DOCID}\t$assertion->{OBJECT}";
      $output_line .= "\t$assertion->{PROVENANCE}{SUBJECT}{START}\t$assertion->{PROVENANCE}{SUBJECT}{END}"if $output_offsets;
      $output_line .= "\t$assertion->{CONFIDENCE}" if $output_confidence;
      $output_line .= "\n";
      print $program_output $output_line unless $dedup{$output_line}++;
    }
    # All other assertions
    # Maintain the same assertion order, under the assumption that the
    # ordering was picked with some reason in mind (duplicates have
    # already been eliminated, so we don't need to sort for that
    # reason)
    foreach my $assertion (sort {$a->{SOURCE}{LINENUM} <=> $b->{SOURCE}{LINENUM}} $kb->get_assertions($name)) {
      next if $assertion->{VERB} eq 'type' ||
	      $assertion->{VERB} eq 'mention' ||
	      $assertion->{VERB} eq 'canonical_mention';
      next if $assertion->{OMIT_FROM_OUTPUT};
      next unless $output_labels->{$assertion->{PREDICATE}{LABEL}};
      my $object = $assertion->{OBJECT};
      my $verb = &{$output_case_fn}($assertion->{VERB});
      my $output_line = "\t$verb";
      $object =~ s/^://;
      $output_line .= "\t$assertion->{PROVENANCE}{DOCID}" if $output_provenance;
      $output_line .= "\t$object";
      $output_line .= " (" . $kb->get_best_mention($assertion->{OBJECT_ENTITY} . $assertion->{PROVENANCE}{DOCID}) . ")"
	if defined $assertion->{OBJECT_ENTITY} && $display_names;
      my $num_provenance_strings_remaining = $num_provenance_strings{$assertion->{VERB}};
      foreach my $name (qw(SUBJECT PREDICATE OBJECT)) {
	$output_line .= "\t$assertion->{PROVENANCE}{$name}{START}\t$assertion->{PROVENANCE}{$name}{END}"
	  if $output_offsets && $num_provenance_strings_remaining-- > 0;
      }
      $output_line .= "\t$assertion->{CONFIDENCE}" if $output_confidence;
      $output_line .= "\n";
      print $output_line unless !$dedup || $dedup{$output_line}++;
    }
    print $program_output "\n";
  }
}

# When outputting TAC format, place assertions in a particular order
sub get_assertion_priority {
  my ($name) = @_;
  return 2 if $name eq 'type';
  return 1 if $name eq 'mention' || $name eq 'canonical_mention';
  return 0;
}

sub assertion_comparator {
  return $a->{SUBJECT} cmp $b->{SUBJECT} unless $a->{SUBJECT} eq $b->{SUBJECT};
  my $aname = lc $a->{PREDICATE}{NAME};
  my $bname = lc $b->{PREDICATE}{NAME};
  my $apriority = &get_assertion_priority($aname);
  my $bpriority = &get_assertion_priority($bname);
  return $bpriority <=> $apriority ||
	 $aname cmp $bname ||
         $a->{PROVENANCE}{DOCID} cmp $b->{PROVENANCE}{DOCID} ||
	 $a->{PROVENANCE}{SUBJECT}{START} <=> $b->{PROVENANCE}{SUBJECT}{START};
}  

# TAC format is just a list of assertions. Output the assertions in
# the order defined by the above comparator (just to make the output
# pretty; there is no fundamental need to do so)
sub export_tac {
  my ($kb, $output_case_fn, $output_labels) = @_;
  print $program_output "$kb->{RUNID_LINE}\n\n";
  foreach my $assertion (sort assertion_comparator $kb->get_assertions()) {
    next if $assertion->{OMIT_FROM_OUTPUT};
    next unless $output_labels->{$assertion->{PREDICATE}{LABEL}};
    # Only output assertions that have fully resolved predicates
    next unless ref $assertion->{PREDICATE};
    my $predicate_string = $assertion->{PREDICATE}{NAME};
    my $domain_string = "";
    if ($predicate_string ne 'type' &&
	$predicate_string ne 'mention' &&
	$predicate_string ne 'canonical_mention') {
      $domain_string = $kb->get_entity_type($assertion->{SUBJECT_ENTITY});
      next if $domain_string eq 'unknown';
      next if $domain_string eq 'multiple';
      $domain_string .= ":";
    }
    print $program_output "$assertion->{SUBJECT}\t", &{$output_case_fn}("$domain_string$assertion->{PREDICATE}{NAME}"),"\t$assertion->{OBJECT}";
    print $program_output "\t$assertion->{PROVENANCE}{DOCID}"
      if $output_provenance && $assertion->{PREDICATE}{NAME} ne 'type';
    my $num_provenance_strings_remaining = $num_provenance_strings{$assertion->{PREDICATE}{NAME}};
    foreach (qw(SUBJECT PREDICATE OBJECT)) {
      print $program_output "\t$assertion->{PROVENANCE}{$_}{START}\t$assertion->{PROVENANCE}{$_}{END}"
	if $output_offsets && defined $assertion->{PROVENANCE}{$_} && $num_provenance_strings_remaining-- > 0;
    }
    print $program_output "\t$assertion->{CONFIDENCE}" if $output_confidence;
    print $program_output "\n";
  }
}

sub export_rdf {
  my ($kb, $output_case_fn, $output_labels) = @_;
  print $program_output "$kb->{RUNID_LINE}\n\n";
#      next if $assertion->{OMIT_FROM_OUTPUT};
#      next unless $output_labels->{$assertion->{PREDICATE}{LABEL}};
# FIXME
  die "export_rdf not yet implemented";
}


##################################################################################### 
# Runtime switches and main program
##################################################################################### 

# Collect legal formats for switch documentation
my $input_formats = "[" . join(", ", sort keys %type2import) . "]";
my $output_formats = "[" . join(", ", sort keys %type2export) . ", none]";

# Map from a filename to the likely KB representation used by the file
my %all_file_types;
foreach (keys %type2import, keys %type2export) {
  $all_file_types{lc $_}++;
}

sub get_kb_file_type {
  my ($filename) = @_;
  return 'unknown' unless $filename =~ /\.([^.]+)$/;
  my $type = lc $1;
  return $type if $all_file_types{$type};
  # In a future version, we might look inside the file to try to identify the file type
  return 'unknown';
}

# I don't know where this script will be run, so pick a reasonable
# screen width for describing program usage (with the -help switch)
my $terminalWidth = 80;

# Handle run-time switches
my $switches = SwitchProcessor->new($0, "Convert between TAC Cold Start KB formats, checking for common errors.",
				    "");
$switches->addHelpSwitch("help", "Show help");
$switches->addHelpSwitch("h", undef);
$switches->addParam("filename", "required", "File containing input KB specification.");
$switches->addVarSwitch("input", "Specify the format of the input file. Legal formats are $input_formats.");
$switches->addVarSwitch("output", "Specify the output format. Legal formats are $output_formats." .
		                  " Use 'none' to perform error checking with no output.");
$switches->put("output", 'none');
$switches->addVarSwitch('output_file', "Specify a file to which output should be redirected");
$switches->put('output_file', 'STDOUT');
$switches->addVarSwitch('error_file', "Specify a file to which error output should be redirected");
$switches->put('error_file', "STDERR");
$switches->addConstantSwitch("camelcase", "true", "Use camel case for the output");
$switches->addConstantSwitch("ccase", "true", "Use C case (lowercase with underscores) for the output");
$switches->addConstantSwitch("dedup", "true", "Dedup skb output");
$switches->addConstantSwitch("noconfidence", "true", "Omit confidence values from the output");
$switches->addConstantSwitch("nooffsets", "true", "Omit offsets from the output");
$switches->addConstantSwitch("noprovenance", "true", "Omit provenance from the output");
$switches->addVarSwitch("predicates", "File containing specification of additional predicates to allow");
$switches->addVarSwitch("labels", "Colon-separated list of triple labels for output");
$switches->put("labels", "TAC");
$switches->addImmediateSwitch('version', sub { print "$0 version $version\n"; exit 0; }, "Print version number and exit");
$switches->addVarSwitch('multiple', "Are multiple assertions of the same triple allowed? " .
			"Legal values are: " . join(", ", map {"$_ ($multiple_attestations{$_})"} sort keys %multiple_attestations));
$switches->put('multiple', $multiple_attestations);
$switches->addConstantSwitch("names", 'true', "Decorate skb output with names to make it easier to read (but which cannot be read back in)");
$switches->addVarSwitch('docs', "Tab-separated file containing docids and document lengths, measured in Unicode characters");
$switches->addVarSwitch('ignore', "Colon-separated list of warnings to ignore. Legal values are: " .
			join(", ", grep {$problem_formats{$_}{TYPE} eq 'WARNING'} sort keys %problem_formats));

$switches->process(@ARGV);

# This holds the "knowledge base"
my $kb;

# Allow redirection of stdout and stderr
my $output_filename = $switches->get("output_file");
if (lc $output_filename eq 'stdout') {
  $program_output = *STDOUT;
}
elsif (lc $output_filename eq 'stderr') {
  $program_output = *STDERR;
}
else {
  open($program_output, ">:utf8", $output_filename) or die "Could not open $output_filename: $!";
}

my $error_filename = $switches->get("error_file");
if (lc $error_filename eq 'stdout') {
  $error_output = *STDOUT;
}
elsif (lc $error_filename eq 'stderr') {
  $error_output = *STDERR;
}
else {
  open($error_output, ">:utf8", $error_filename) or die "Could not open $error_filename: $!";
}

# The input file to process
my $filename = $switches->get("filename");
die "File $filename does not exist" unless -e $filename;

# Do we want the predicates in the output to be camel case or C case?
my $output_case_fn = $default_output_case_fn;
die "Can't specify both camel case and C case" if $switches->get("camelcase") && $switches->get("ccase");
$output_case_fn = \&string2camel if $switches->get("camelcase");
$output_case_fn = \&string2c if $switches->get("ccase");

# Should skb output be decorated with names?
$display_names = $switches->get("names");

# Should skb output be deduped?
$dedup = $switches->get("dedup");

# Should confidence values be included in the output?
$output_confidence = !$switches->get("noconfidence");
$output_provenance = !$switches->get("noprovenance");
$output_offsets = !$switches->get("nooffsets");
undef $output_offsets unless $output_provenance;

# What triple labels should be output?
my $labels = $switches->get("labels");
my $tac_found;
foreach my $label (split(/:/, uc $labels)) {
  $output_labels{$label} = 'true';
  $tac_found++ if $label eq 'TAC';
}
print $error_output "WARNING: 'TAC' not included in output labels\n" unless $tac_found;

# Load any additional predicate specifications
my $predicates_file = $switches->get("predicates");
&Predicate::load_predicates($predicates_file) if defined $predicates_file;

# How should multiple assertions of the same triple be handled?
$multiple_attestations = uc $switches->get("multiple");
die "Argument to -multiple switch must be one of [" . join(", ", sort keys %multiple_attestations) . "]"
  unless $multiple_attestations{$multiple_attestations};

# Add the user's selected warnings to the list of warnings to ignore
my $ignore = $switches->get("ignore");
if (defined $ignore) {
  my @warnings = map {uc} split(/:/, $ignore);
  foreach my $warning (@warnings) {
    die "Unknown warning: $warning" unless $problem_formats{$warning};
    die "$warning is a fatal error; cannot ignore it" if $problem_formats{$warning}{TYPE} eq 'ERROR';
    $ignore_warnings{$warning}++;
  }
}

# Load mapping from docid to length of that document
my $docids_file = $switches->get("docs");
my $docids;
if (defined $docids_file) {
  open(my $infile, "<:utf8", $docids_file) or die "Could not open $docids_file: $!";
  while(<$infile>) {
    chomp;
    my ($docid, $document_length) = split(/\t/);
    $docids->{$docid} = $document_length;
  }
  close $infile;
}

# Identify the desired input and output file types, and load the KB
my $input_type = $switches->get("input");
$input_type = &get_kb_file_type($filename) unless defined $input_type;
my $output_type = $switches->get("output");
if ($input_type eq 'unknown') {
  die "Could not determine the type of $filename; please use the -input switch";
}
elsif ($type2import{$input_type}) {
  # Load the knowledge base
  $kb = &{$type2import{$input_type}}($filename, $docids);
}
else {
  die "Sorry, no import routine is available for files of type $input_type";
}

# Problems were identified while the KB was loaded; now report them
my ($num_errors, $num_warnings) = &report_all_problems();
if ($num_errors) {
  print $error_output "$num_errors error" . ($num_errors == 1 ? '' : 's') . " encountered\n";
  # The NIST submission system wants an exit code of 255 if errors are encountered
  exit 255;
}
print $error_output ($num_warnings || 'No'), " warning", ($num_warnings == 1 ? '' : 's'), " encountered\n";

# Output the KB in the desired format, if any
if ($type2export{$output_type}) {
  &{$type2export{$output_type}}($kb, $output_case_fn, \%output_labels);
}
elsif ($output_type ne 'none') {
  die "Sorry, no export routine is available for files of type $output_type";
}
exit 0;

##################################################################################### 
# This switch processing code written many years ago by James Mayfield
# and used here with permission. It really has nothing to do with
# TAC KBP; it's just a partial replacement for getopt that closely ties
# the documentation to the switch specification. The code may well be cheesy,
# so no peeking.
##################################################################################### 

package SwitchProcessor;

sub _max {
    my $first = shift;
    my $second = shift;
    $first > $second ? $first : $second;
}

sub _quotify {
    my $string = shift;
    if (ref($string)) {
	join(", ", @{$string});
    }
    else {
	(!$string || $string =~ /\s/) ? "'$string'" : $string;
    }
}

sub _formatSubs {
    my $value = shift;
    my $switch = shift;
    my $formatted;
    if ($switch->{SUBVARS}) {
	$formatted = "";
	foreach my $subval (@{$value}) {
	    $formatted .= " " if $formatted;
	    $formatted .= _quotify($subval);
	}
    }
    # else if this is a constant switch, omit the vars [if they match?]
    else {
	$formatted = _quotify($value);
    }
    $formatted;
}

# Print an error message, display program usage, and exit unsuccessfully
sub _barf {
    my $self = shift;
    my $errstring = shift;
    open(my $handle, "|more") or die "Couldn't even barf with message $errstring";
    print $handle "ERROR: $errstring\n";
    $self->showUsage($handle);
    close $handle;
    exit(-1);
}

# Create a new switch processor.  Arguments are the name of the
# program being run, and deneral documentation for the program
sub new {
    my $classname = shift;
    my $self = {};
    bless ($self, $classname);
    $self->{PROGNAME} = shift;
    $self->{PROGNAME} =~ s(^.*/)();
    $self->{DOCUMENTATION} = shift;
    $self->{POSTDOCUMENTATION} = shift;
    $self->{HASH} = {};
    $self->{PARAMS} = [];
    $self->{SWITCHWIDTH} = 0;
    $self->{PARAMWIDTH} = 0;
    $self->{SWITCHES} = {};
    $self->{VARSTOCHECK} = ();
    $self->{LEGALVARS} = {};
    $self->{PROCESS_INVOKED} = undef;
    $self;
}

# Fill a paragraph, with different leaders for first and subsequent lines
sub _fill {
    $_ = shift;
    my $leader1 = shift;
    my $leader2 = shift;
    my $width = shift;
    my $result = "";
    my $thisline = $leader1;
    my $spaceOK = undef;
    foreach my $word (split) {
	if (length($thisline) + length($word) + 1 <= $width) {
	    $thisline .= " " if ($spaceOK);
	    $spaceOK = "TRUE";
	    $thisline .= $word;
	}
	else {
	    $result .= "$thisline\n";
	    $thisline = "$leader2$word";
	    $spaceOK = "TRUE";
	}
    }
    "$result$thisline\n";
}

# Show program usage
sub showUsage {
    my $self = shift;
    my $handle = shift;
    open($handle, "|more") unless defined $handle;
    print $handle _fill($self->{DOCUMENTATION}, "$self->{PROGNAME}:  ",
			" " x (length($self->{PROGNAME}) + 3), $terminalWidth);
    print $handle "\nUsage: $self->{PROGNAME}";
    print $handle " {-switch {-switch ...}}"
	if (keys(%{$self->{SWITCHES}}) > 0);
    # Count the number of optional parameters
    my $optcount = 0;
    # Print each parameter
    foreach my $param (@{$self->{PARAMS}}) {
	print $handle " ";
	print $handle "{" unless $param->{REQUIRED};
	print $handle $param->{NAME};
	$optcount++ if (!$param->{REQUIRED});
	print $handle "..." if $param->{ALLOTHERS};
    }
    # Close out the optional parameters
    print $handle "}" x $optcount;
    print $handle "\n\n";
    # Show details of each switch
    my $headerprinted = undef;
    foreach my $key (sort keys %{$self->{SWITCHES}}) {
	my $usage = "  $self->{SWITCHES}->{$key}->{USAGE}" .
	    " " x ($self->{SWITCHWIDTH} - length($self->{SWITCHES}->{$key}->{USAGE}) + 2);
	if (defined($self->{SWITCHES}->{$key}->{DOCUMENTATION})) {
	    print $handle "Legal switches are:\n"
		unless defined($headerprinted);
	    $headerprinted = "TRUE";
	    print $handle _fill($self->{SWITCHES}->{$key}->{DOCUMENTATION},
			$usage,
			" " x (length($usage) + 2),
			$terminalWidth);
	}
    }
    # Show details of each parameter
    if (@{$self->{PARAMS}} > 0) {
	print $handle "parameters are:\n";
	foreach my $param (@{$self->{PARAMS}}) {
	    my $usage = "  $param->{USAGE}" .
		" " x ($self->{PARAMWIDTH} - length($param->{USAGE}) + 2);
	    print $handle _fill($param->{DOCUMENTATION}, $usage, " " x (length($usage) + 2), $terminalWidth);
	}
    }
    print $handle "\n$self->{POSTDOCUMENTATION}\n" if $self->{POSTDOCUMENTATION};
}

# Retrieve all keys defined for this switch processor
sub keys {
    my $self = shift;
    keys %{$self->{HASH}};
}

# Add a switch that causes display of program usage
sub addHelpSwitch {
    my $self = shift;
    my ($shouldBeUndef, $filename, $line) = caller;
    my $switch = SP::_Switch->newHelp($filename, $line, @_);
    $self->_addSwitch($filename, $line, $switch);
}

# Add a switch that causes a given variable(s) to be assigned a given
# constant value(s)
sub addConstantSwitch {
    my $self = shift;
    my ($shouldBeUndef, $filename, $line) = caller;
    my $switch = SP::_Switch->newConstant($filename, $line, @_);
    $self->_addSwitch($filename, $line, $switch);
}

# Add a switch that assigns to a given variable(s) value(s) provided
# by the user on the command line
sub addVarSwitch {
    my $self = shift;
    my ($shouldBeUndef, $filename, $line) = caller;
    my $switch = SP::_Switch->newVar($filename, $line, @_);
    $self->_addSwitch($filename, $line, $switch);
}

# Add a switch that invokes a callback as soon as it is encountered on
# the command line.  The callback receives three arguments: the switch
# object (which is needed by the internal routines, but presumably may
# be ignored by user-defined functions), the switch processor, and all
# the remaining arguments on the command line after the switch (as the
# remainder of @_, not a reference).  If it returns, it must return
# the list of command-line arguments that remain after it has dealt
# with whichever ones it wants to.
sub addImmediateSwitch {
    my $self = shift;
    my ($shouldBeUndef, $filename, $line) = caller;
    my $switch = SP::_Switch->newImmediate($filename, $line, @_);
    $self->_addSwitch($filename, $line, $switch);
}

sub addMetaSwitch {
    my $self = shift;
    my ($shouldBeUndef, $filename, $line) = caller;
    my $switch = SP::_Switch->newMeta($filename, $line, @_);
    $self->_addSwitch($filename, $line, $switch);
}

# Add a new switch
sub _addSwitch {
    my $self = shift;
    my $filename = shift;
    my $line = shift;
    my $switch = shift;
    # Can't add switches after process() has been invoked
    die "Attempt to add a switch after process() has been invoked, at $filename line $line\n"
	if ($self->{PROCESS_INVOKED});
    # Bind the switch object to its name
    $self->{SWITCHES}->{$switch->{NAME}} = $switch;
    # Remember how much space is required for the usage line
    $self->{SWITCHWIDTH} = _max($self->{SWITCHWIDTH}, length($switch->{USAGE}))
	if (defined($switch->{DOCUMENTATION}));
    # Make a note of the variable names that are legitimized by this switch
    $self->{LEGALVARS}->{$switch->{NAME}} = "TRUE";
}

# Add a new command-line parameter
sub addParam {
    my ($shouldBeUndef, $filename, $line) = caller;
    my $self = shift;
    # Can't add params after process() has been invoked
    die "Attempt to add a param after process() has been invoked, at $filename line $line\n"
	if ($self->{PROCESS_INVOKED});
    # Create the parameter object
    my $param = SP::_Param->new($filename, $line, @_);
    # Remember how much space is required for the usage line
    $self->{PARAMWIDTH} = _max($self->{PARAMWIDTH}, length($param->{NAME}));
    # Check for a couple of potential problems with parameter ordering
    if (@{$self->{PARAMS}} > 0) {
	my $previous = ${$self->{PARAMS}}[$#{$self->{PARAMS}}];
        die "Attempt to add param after an allOthers param, at $filename line $line\n"
	    if ($previous->{ALLOTHERS});
        die "Attempt to add required param after optional param, at $filename line $line\n"
	    if ($param->{REQUIRED} && !$previous->{REQUIRED});
    }
    # Make a note of the variable names that are legitimized by this param
    $self->{LEGALVARS}->{$param->{NAME}} = "TRUE";
    # Add the parameter object to the list of parameters for this program
    push(@{$self->{PARAMS}}, $param);
}

# Set a switch processor variable to a given value
sub put {
    my $self = shift;
    my $key = shift;
    my $value = shift;
    my ($shouldBeUndef, $filename, $line) = caller;
    $self->_varNameCheck($filename, $line, $key, undef);
    my $switch = $self->{SWITCHES}->{$key};
    die "Wrong number of values in second argument to put, at $filename line $line.\n"
	if ($switch->{SUBVARS} &&
	    (!ref($value) ||
	     scalar(@{$value}) != @{$switch->{SUBVARS}}));
    $self->{HASH}->{$key} = $value;
}

# Get the value of a switch processor variable
sub get {
    my $self = shift;
    my $key = shift;
    # Internally, we sometimes want to do a get before process() has
    # been invoked.  The secret second argument to get allows this.
    my $getBeforeProcess = shift;
    my ($shouldBeUndef, $filename, $line) = caller;
    die "Get called before process, at $filename line $line\n"
	if (!$self->{PROCESS_INVOKED} && !$getBeforeProcess);
    # Check for var.subvar syntax
    $key =~ /([^.]*)\.*(.*)/;
    my $var = $1;
    my $subvar = $2;
    # Make sure this is a legitimate switch processor variable
    $self->_varNameCheck($filename, $line, $var, $subvar);
    my $value = $self->{HASH}->{$var};
    $subvar ? $value->[$self->_getSubvarIndex($var, $subvar)] : $value;
}

sub _getSubvarIndex {
    my $self = shift;
    my $var = shift;
    my $subvar = shift;
    my $switch = $self->{SWITCHES}->{$var};
    return(-1) unless $switch;
    return(-1) unless $switch->{SUBVARS};
    for (my $i = 0; $i < @{$switch->{SUBVARS}}; $i++) {
	return($i) if ${$switch->{SUBVARS}}[$i] eq $subvar;
    }
    -1;
}

# Check whether a given switch processor variable is legitimate
sub _varNameCheck {
    my $self = shift;
    my $filename = shift;
    my $line = shift;
    my $key = shift;
    my $subkey = shift;
    # If process() has already been invoked, check the variable name now...
    if ($self->{PROCESS_INVOKED}) {
	$self->_immediateVarNameCheck($filename, $line, $key, $subkey);
    }
    # ...Otherwise, remember the variable name and check it later
    else {
	push(@{$self->{VARSTOCHECK}}, [$filename, $line, $key, $subkey]);
    }
}

# Make sure this variable is legitimate
sub _immediateVarNameCheck {
    my $self = shift;
    my $filename = shift;
    my $line = shift;
    my $key = shift;
    my $subkey = shift;
    die "No such SwitchProcessor variable: $key, at $filename line $line\n"
	unless $self->{LEGALVARS}->{$key};
    die "No such SwitchProcessor subvariable: $key.$subkey, at $filename line $line\n"
	unless (!$subkey || $self->_getSubvarIndex($key, $subkey) >= 0);
}

# Add default values to switch and parameter documentation strings,
# where appropriate
sub _addDefaultsToDoc {
    my $self = shift;
    # Loop over all switches
    foreach my $switch (values %{$self->{SWITCHES}}) {
	if ($switch->{METAMAP}) {
	    $switch->{DOCUMENTATION} .= " (Equivalent to";
	    foreach my $var (sort CORE::keys %{$switch->{METAMAP}}) {
		my $rawval = $switch->{METAMAP}->{$var};
		my $val = SwitchProcessor::_formatSubs($rawval, $self->{SWITCHES}->{$var});
		$switch->{DOCUMENTATION} .= " -$var $val";
	    }
	    $switch->{DOCUMENTATION} .= ")";
	}
	# Default values aren't reported for constant switches
	if (!defined($switch->{CONSTANT})) {
	    my $default = $self->get($switch->{NAME}, "TRUE");
	    if (defined($default)) {
		$switch->{DOCUMENTATION} .= " (Default = " . _formatSubs($default, $switch) . ").";
	    }
	}
    }
    # Loop over all params
    foreach my $param (@{$self->{PARAMS}}) {
	my $default = $self->get($param->{NAME}, "TRUE");
	# Add default to documentation if the switch is optional and there
	# is a default value
	$param->{DOCUMENTATION} .= " (Default = " . _quotify($default) . ")."
	    if (!$param->{REQUIRED} && defined($default));
    }
}

# Process the command line
sub process {
    my $self = shift;
    # Add defaults to the documentation
    $self->_addDefaultsToDoc();
    # Remember that process() has been invoked
    $self->{PROCESS_INVOKED} = "TRUE";
    # Now that all switches have been defined, check all pending
    # variable names for legitimacy
    foreach (@{$self->{VARSTOCHECK}}) {
	# FIXME: Can't we just use @{$_} here?
	$self->_immediateVarNameCheck(${$_}[0], ${$_}[1], ${$_}[2], ${$_}[3]);
    }
    # Switches must come first.  Keep processing switches as long as
    # the next element begins with a dash
    while (@_ && $_[0] =~ /^-(.*)/) {
	# Get the switch with this name
	my $switch = $self->{SWITCHES}->{$1};
	$self->_barf("Unknown switch: -$1\n")
	    unless $switch;
	# Throw away the switch name
	shift;
	# Invoke the process code associated with this switch
	# FIXME:  How can switch be made implicit?
	@_ = $switch->{PROCESS}->($switch, $self, @_);
    }
    # Now that the switches have been handled, loop over the legal params
    foreach my $param (@{$self->{PARAMS}}) {
	# Bomb if a required arg wasn't provided
	$self->_barf("Not enough arguments; $param->{NAME} must be provided\n")
	    if (!@_ && $param->{REQUIRED});
	# If this is an all others param, grab all the remaining arguments
	if ($param->{ALLOTHERS}) {
	    $self->put($param->{NAME}, [@_]) if @_;
	    @_ = ();
	}
	# Otherwise, if there are arguments left, bind the next one to the parameter
	elsif (@_) {
	    $self->put($param->{NAME}, shift);
	}
    }
    # If any arguments are left over, the user botched it
    $self->_barf("Too many arguments\n")
	if (@_);
}

################################################################################

package SP::_Switch;

sub new {
    my $classname = shift;
    my $filename = shift;
    my $line = shift;
    my $self = {};
    bless($self, $classname);
    die "Too few arguments to constructor while creating classname, at $filename line $line\n"
	unless @_ >= 2;
    # Switch name and documentation are always present
    $self->{NAME} = shift;
    $self->{DOCUMENTATION} = pop;
    $self->{USAGE} = "-$self->{NAME}";
    # I know, these are unnecessary
    $self->{PROCESS} = undef;
    $self->{CONSTANT} = undef;
    $self->{SUBVARS} = ();
    # Return two values
    # FIXME: Why won't [$self, \@_] work here?
    ($self, @_);
}

# Create new help switch
sub newHelp {
    my @args = new (@_);
    my $self = shift(@args);
    die "Too many arguments to addHelpSwitch, at $_[1] line $_[2]\n"
	if (@args);
    # A help switch just prints out program usage then exits
    $self->{PROCESS} = sub {
	my $self = shift;
	my $sp = shift;
	$sp->showUsage();
	exit(0);
    };
    $self;
}

# Create a new constant switch
sub newConstant {
    my @args = new(@_);
    my $self = shift(@args);
    die "Too few arguments to addConstantSwitch, at $_[1] line $_[2]\n"
	unless @args >= 1;
    die "Too many arguments to addConstantSwitch, at $_[1] line $_[2]\n"
	unless @args <= 2;
    # Retrieve the constant value
    $self->{CONSTANT} = pop(@args);
    if (@args) {
	$self->{SUBVARS} = shift(@args);
	# Make sure, if there are subvars, that the number of subvars
	# matches the number of constant arguments
	die "Number of values [" . join(", ", @{$self->{CONSTANT}}) .
	    "] does not match number of variables [" . join(", ", @{$self->{SUBVARS}}) .
		"], at $_[1] line $_[2]\n"
		    unless $#{$self->{CONSTANT}} == $#{$self->{SUBVARS}};
    }
    $self->{PROCESS} = sub {
	my $self = shift;
	my $sp = shift;
	my $counter = 0;
	$sp->put($self->{NAME}, $self->{CONSTANT});
	@_;
    };
    $self;
}

# Create a new var switch
sub newVar {
    my @args = new(@_);
    my $self = shift(@args);
    die "Too many arguments to addVarSwitch, at $_[1] line $_[2]\n"
	unless @args <= 1;
    # If there are subvars
    if (@args) {
	my $arg = shift(@args);
	if (ref $arg) {
	    $self->{SUBVARS} = $arg;
	    # Augment the usage string with the name of the subvar
	    foreach my $subvar (@{$self->{SUBVARS}}) {
		$self->{USAGE} .= " <$subvar>";
	    }
	    # A var switch with subvars binds each subvar
	    $self->{PROCESS} = sub {
		my $self = shift;
		my $sp = shift;
		my $counter = 0;
		my $value = [];
		# Make sure there are enough arguments for this switch
		foreach (@{$self->{SUBVARS}}) {
		    $sp->_barf("Not enough arguments to switch -$self->{NAME}\n")
			unless @_;
		    push(@{$value}, shift);
		}
		$sp->put($self->{NAME}, $value);
		@_;
	    };
	}
	else {
	    $self->{USAGE} .= " <$arg>";
	    $self->{PROCESS} = sub {
		my $self = shift;
		my $sp = shift;
		$sp->put($self->{NAME}, shift);
		@_;
	    };
	}
    }
    else {
	# A var switch without subvars gets one argument, called 'value'
	# in the usage string
	$self->{USAGE} .= " <value>";
	# Bind the argument to the parameter
	$self->{PROCESS} = sub {
	    my $self = shift;
	    my $sp = shift;
	    $sp->put($self->{NAME}, shift);
	    @_;
	};
    }
    $self;
}

# Create a new immediate switch
sub newImmediate {
    my @args = new(@_);
    my $self = shift(@args);
    die "Wrong number of arguments to addImmediateSwitch or addMetaSwitch, at $_[1] line $_[2]\n"
	unless @args == 1;
    $self->{PROCESS} = shift(@args);
    $self;
}

# Create a new meta switch
sub newMeta {
    # The call looks just like a call to newImmediate, except that
    # instead of a fn as the second argument, there's a hashref.  So
    # use newImmediate to do the basic work, then strip out the
    # hashref and replace it with the required function.
    my $self = newImmediate(@_);
    $self->{METAMAP} = $self->{PROCESS};
    $self->{PROCESS} = sub {
	my $var;
	my $val;
	my $self = shift;
	my $sp = shift;
	# FIXME: Doesn't properly handle case where var is itself a metaswitch
	while (($var, $val) = each %{$self->{METAMAP}}) {
	    $sp->put($var, $val);
	}
	@_;
    };
    $self;
}

################################################################################

package SP::_Param;

# A parameter is just a struct for the four args
sub new {
    my $classname = shift;
    my $filename = shift;
    my $line = shift;
    my $self = {};
    bless($self, $classname);
    $self->{NAME} = shift;
    # param name and documentation are first and last, respectively.
    $self->{DOCUMENTATION} = pop;
    $self->{USAGE} = $self->{NAME};
    # If omitted, REQUIRED and ALLOTHERS default to undef
    $self->{REQUIRED} = shift;
    $self->{ALLOTHERS} = shift;
    # Tack on required to the documentation stream if this arg is required
    $self->{DOCUMENTATION} .= " (Required)."
	if ($self->{REQUIRED});
    $self;
}

################################################################################


################################################################################
# Revision History
################################################################################

# 1.0 - Original version
# 1.1 - Changed comment deletion pattern to use older syntax for possessive matches
#     - Increased robustness to ill-formed submission files
#     - Allowed multiple predicate domains and ranges
# 1.2 - Added check that entities in relations are attested in the document attesting to the relation
#     - Added switches for redirecting standard and error output
#     - Added NIST exit code on failure
#     - Modified document lengths code to accept length rather than index of last character
#     - UTF-8 compliance
#     - Allowed user-defined relations
#     - Ensured that if a relation is omitted from the output, its inverse is too

1;
