#!/usr/bin/perl

### Calculate Krippendorff's alpha for interval (numeric) judgments,
### including confidence intervals using bootstrapping.

### Usage: perl interval-alpha.perl <data-file> <samples>

### <data-file> is a tab-separated text file: the first line is
### annotator id's separated by tabs, and subsequent lines are
### annotation data, with each line representing a single item, and
### individual values separated by tabs.

### <samples> is the number of resamplings used for calculating
### confidence intervals. It must be a multiple of 200, in order to
### give two-tailed 1% confidence intervals. A parameter value 0 means
### no confidence intervals are calculated.

### This program only calculates interval alpha. To use a different
### distance metric, one would need to replace the functions
### disagreement (the distance metric) and valid_scores (which
### determines which values are acceptable -- currently only accepts
### numbers).

### Questions, suggestions and bug reports welcome!
### Ron Artstein, http://ron.artstein.org/
### Email: my last name [at] ict.usc.edu
### Version 1.0, 2008-08-08.


####### begin program #######

### Integrity tests

if (1 != $#ARGV) # Exactly 2 arguments required
  {die "Usage: perl interval-alpha.perl <data-file> <samples>\n"}

$data_file = $ARGV[0] ;
$num_samples = $ARGV[1] ;

if (0 > $num_samples) {
  die "Number of samples must be a non-negative multiple of 200\n" ;
}

if (int ($num_samples/200) != $num_samples/200) {
  die "Number of samples must be a multiple of 200\n" ;
}

### Main

read_data ($data_file) ;

print_summary_statistics () ;

print_reliability_all_raters () ;

print_reliability_subsets_of_raters () ;

if (0 < $num_samples) {print_confidence_intervals ()}


###

sub print_summary_statistics {
  print "Summary statistics\n\n" ;
  # Create an array of all scores
  my @temp_array = () ;
  foreach $item (0 .. @all_scores - 1) {
    foreach $annotator (0 .. @annotators - 1) {
      if (0 != $all_scores[$item][$annotator] 
	  or "0" eq $all_scores[$item][$annotator]) {
	push (@temp_array, $all_scores[$item][$annotator])
      }
    }
  }
  # Summary statistics for all scores
  my $median = median (@temp_array) ;
  my $mean = mean (@temp_array) ;
  my $variance = variance (@temp_array) ;
  my %scores = histogram (@temp_array) ;
  foreach $key (sort keys %scores) {print sprintf ("%-6s", $key)}
  print "N     Med.  Mean  Var.  Annotator\n" ;
  foreach $key (sort keys %scores) {print sprintf ("%5s", $scores{$key})," "}
  print sprintf ("%5s", scalar(@temp_array)) , " " ;
  print sprintf ("%.3f", $median) , " " ;
  print sprintf ("%.3f", $mean) , " " ;
  print sprintf ("%.3f", $variance) , " " ;
  print "All scores\n" ;
  foreach $annotator (0 .. @annotators - 1) {
    # Create an array of all scores for one annotator
    my @temp_array = () ;
    foreach $item (0 .. @all_scores - 1) {
      if (0 != $all_scores[$item][$annotator] 
	  or "0" eq $all_scores[$item][$annotator]) {
	push (@temp_array, $all_scores[$item][$annotator])
      }
    }
    # Summary statistics for one annotator
    my $median = median (@temp_array) ;
    my $mean = mean (@temp_array) ;
    my $variance = variance (@temp_array) ;
    my %annotator_scores = histogram (@temp_array) ;
    foreach $key (sort keys %scores) 
      {print sprintf ("%5s", $annotator_scores{$key})," "}
    print sprintf ("%5s", scalar(@temp_array)) , " " ;
    print sprintf ("%.3f", $median) , " " ;
    print sprintf ("%.3f", $mean) , " " ;
    print sprintf ("%.3f", $variance) , " " ;
    print "\u$annotators[$annotator]\n" ;
  }
}


### Reliability for all raters, taken together as a group. Calculated
### only on the items scored by all raters.

sub print_reliability_all_raters {

  my $all_annotators_report = generate_subset_report (0 .. @annotators - 1) ;
  print "\nReliability for all judges together\n\n" ;
  print "Alpha D-obs D-exp N    Raters\n" ;
  print "$all_annotators_report\n" ;

}


### Reliability for all possible subgroups of raters. CAUTION: THIS
### GROWS EXPONENTIALLY WITH THE NUMBER OF RATERS. For each group of
### raters, reliability is calculated on the items scored by all
### raters in the group. Results are sorted by reliability (alpha),
### from the least agreeing subset to the most agreeing one.

sub print_reliability_subsets_of_raters {

  my @annotator_subsets = find_all_subsets (0 .. @annotators - 1) ;
  my @reports = () ;

  foreach $subset (@annotator_subsets) {
    if (2 <= scalar (@{$subset})) {
      $annotator_subset_report = generate_subset_report (@{$subset}) ;
      push (@reports, $annotator_subset_report) ;
    }
  }

  @reports = sort {$a <=> $b} (@reports) ; # sort by alpha value
  print "\nReliability for subsets of judges\n\n" ;
  print "Alpha D-obs D-exp N    Raters\n" ;
  foreach $report (@reports) {print "$report\n"}

}

### Takes an array of size n and returns a two-dimensional array of
### 2^n - 1 rows, consisting of all non-empty subsets of the original
### array.

sub find_all_subsets {
  if (1 == scalar (@_)) {return [@_]}
  else {
    my @all_subsets = () ;
    my $last_item = pop (@_) ;
    my @first_subsets = find_all_subsets (@_) ;
    foreach $subset (@first_subsets) {
      push (@all_subsets, $subset) ;
      my @ext_subset = @{$subset} ;
      push (@ext_subset, $last_item) ;
      push (@all_subsets, [@ext_subset]) ;
    }
    push (@all_subsets, [$last_item]) ;
    return (@all_subsets) ;
  }
}


### Takes an array of annotator indices and returns a line of text
### consisting of alpha value, observed disagreement, expected
### disagreement, number of item on which these values were
### calculated, and the annotator id's of the indices in the array.

sub generate_subset_report {
  @restricted_scores = restrict_scores_to (@_) ;
  ($num_items , $expected_disagreement , $observed_disagreement , $alpha) = 
    calculate_alpha (@restricted_scores) ;
  my $annotator_subset_report = "" ;
  foreach $x ($alpha , $observed_disagreement  , $expected_disagreement) {
    if ("undef" eq $x) {$annotator_subset_report .= "$x "}
    elsif (0 <= $x) {$annotator_subset_report .= sprintf ("%.3f" , $x) . " "}
    elsif (0 > $x) {$annotator_subset_report .= sprintf ("%.2f" , $x) . " "}
  }
  $annotator_subset_report .= sprintf ("%4s" , $num_items) . " " ;
  foreach $x (@_) {
    $annotator_subset_report .= "\u@annotators[$x] " ;
  }
  return $annotator_subset_report ;
}


### Takes an array of annotator indices and returns a matrix
### consisting of the items rated by all of these annotators, with all
### the scores given by these annotators.

sub restrict_scores_to {
  my @restricted_scores = () ;
  foreach $item (0 .. @all_scores - 1) {
    my @item_ratings = @{$all_scores[$item]}[@_] ;
    if (valid_scores (@item_ratings)) {
      push (@restricted_scores, [@item_ratings]) ;
    }
  }
  return (@restricted_scores) ;
}


### Confidence intervals for all possible subgroups of raters.
### CAUTION: THIS GROWS EXPONENTIALLY WITH THE NUMBER OF RATERS. For
### each group of raters, a distribution of alpha is simulated through
### bootstrapping, and key points in the distribution are
### reported. Results are sorted by the median of the distribution,
### from the least agreeing subset to the most agreeing one.

sub print_confidence_intervals {
  my @annotator_subsets = find_all_subsets (0 .. @annotators - 1) ;
  my @reports = () ;

  foreach $subset (@annotator_subsets) {
    if (2 <= scalar (@{$subset})) {
      $annotator_initials = annotator_initials (@{$subset}) ;
      $confidence_intervals = confidence_intervals (@{$subset}) ;
      push (@reports, $confidence_intervals . $annotator_initials) ;
    }
  }
  # sort by median alpha value
  @reports = sort {substr ($a, 30, 5) <=> substr ($b, 30, 5)} (@reports) ;
  print "\nConfidence intervals for Alpha (bootstrapping, $num_samples resamples)\n\n" ;
  print "  0   0.005 0.01  0.025 0.05   0.5  0.95  0.975 0.99  0.995   1\n" ;
  foreach $report (@reports) {print "$report\n"}

}

### Get the initials of annotator id's from an array of annotator
### indices, to include in an abbreviated report.

sub annotator_initials {
  my $annotator_initials = "" ;
  foreach $annotator (@_) 
    {$annotator_initials .= substr ($annotators[$annotator], 0, 1)}
  return ($annotator_initials) ;
}


### Simulate a distribution of alpha through bootstrapping for a a
### group of raters, and report key points in the distribution.

sub confidence_intervals {
  my @simulation_results = () ;          # array of alpha values
  my $confidence_intervals = "" ;        # string of key alpha values
  foreach $sample (0 .. $num_samples) {  # number of simulations
    @fake_scores = fake_scores (@_) ;    # simulated coding instance
    ($num_items , $expected_disagreement , $observed_disagreement , $alpha) = 
      calculate_alpha (@fake_scores) ;
    if ("undef" eq $alpha) {push (@simulation_results, $alpha)}
    elsif (0 <= $alpha) 
      {push (@simulation_results, sprintf ("%.3f" , $alpha))}
    elsif (0 > $alpha) 
      {push (@simulation_results, sprintf ("%.2f" , $alpha))}
  }
  @simulation_results = sort {$a <=> $b} (@simulation_results) ;
  foreach $key_value
    (0 , 0.005 , 0.01 , 0.025 , 0.05 , 0.5 , 0.95 , 0.975 , 0.99 , 0.995 ,1) {
    $sample_number = $key_value * $num_samples ;
    $confidence_intervals .= "@simulation_results[$sample_number] " ; 
  }
  return ($confidence_intervals) ;
}

### Simulates a coding instance for an array of annotators: first
### creates the actual coding table restricted to these annotators,
### and then randomly picks items from this table (with replacement)
### to create a simulated table of the same size.

sub fake_scores {
  my @restricted_scores = restrict_scores_to (@_) ;
  my @fake_scores = () ;
  foreach $item (0 .. @restricted_scores - 1) {
    $fake_item = int (rand () * @restricted_scores) ;
    push (@fake_scores, $restricted_scores[$fake_item]) ;
  }
  return (@fake_scores) ;
}


### Takes a two-dimensional array of scores, with each row
### representing the scores of a single item, and returns the number
### of items, expected disagreement, observed disagreement, and
### alpha. It is assumed that all scores are valid and that each item
### has the same number of scores -- it is the responsibility of the
### function that calls calculate_alpha to pass a valid array.

sub calculate_alpha {
  my @all_ratings = () ;              # array of all the individual scores
  my @item_disagreement = () ;        # disagreement values for each item
  foreach $item (0 .. @_ - 1) {
    my @item_ratings = @{$_[$item]} ; # individual scores for the item
    push (@all_ratings, @item_ratings) ;
    my $item_disagreement = disagreement (@item_ratings) ;
    push (@item_disagreement, $item_disagreement) ;
  }
  my $num_items = scalar (@item_disagreement) ;
  my $expected_disagreement = disagreement (@all_ratings) ;
  my $observed_disagreement = mean (@item_disagreement) ;
  my $alpha ;
  if (0 < $expected_disagreement) 
    {$alpha = 1 - $observed_disagreement / $expected_disagreement}
  else {$alpha = "undef"}
  return ($num_items , $expected_disagreement , $observed_disagreement , $alpha) ;
}

### Disagreement on a set of scores. For interval alpha, disagreement
### is twice the variance of the set.

sub disagreement {
  my $disagreement = 2 * variance (@_) ;
  return ($disagreement) ;
}


### Checks whether all members of an array are valid scores, returns 0
### or 1. For interval alpha, valid scores are numbers.

sub valid_scores {
  my $valid_scores = 1 ;
  foreach my $score (@_) {
    if (0 == $score and "0" ne $score) {$valid_scores = 0}
  }
  return $valid_scores ;
}

############################################################
### Read the data file into two arrays:
### @annotators holds the annotator id's
### @all_scores holds all the ratings (two-dimensional)

sub read_data {

  open (DATA, "$_[0]") ;
  @all_scores = ( ) ;

  # First line is annotator identifiers, separated by tabs

  $header = <DATA> ;
  chomp $header ;
  @annotators = split (/\t/ , $header) ;

  # Subsequent lines are scores -- one line per item, with individual
  # scores separated by tabs. Number of fields must match number of
  # annotators.

  while (<DATA>) {
    chomp ;
    my @item_scores = split (/\t/ , $_) ;
    if ($#annotators != $#item_scores) 
      {die "Mismatch in the number of judgments"}
    push (@all_scores , [@item_scores]) ;
  }
  close (DATA) ;
}

############################################################
### General statistical functions

sub mean { # mean of values in an array
  my $sum = 0 ;
  foreach $x (@_) {
    $sum += $x ;
  }
  return $sum/@_ ;
}

sub sum_squares { # sum of square differences from the mean
  my $mean = mean (@_) ;
  my $sum_squares = 0 ;
  foreach $x (@_) {
    $sum_squares += ($x - $mean) ** 2 ;
  }
  return $sum_squares ;
}

sub variance { # variance of values in an array
  my $sum_squares = sum_squares (@_) ;
  my $deg_freedom = @_ - 1 ;
  return $sum_squares/$deg_freedom ;
}

sub median { # median of values in an array
  my @sorted = sort {$a <=> $b} (@_) ;
  if (1 == @sorted % 2) # Odd number of elements
    {return $sorted[(@sorted-1)/2]}
  else                   # Even number of elements
    {return ($sorted[@sorted/2-1]+$sorted[@sorted/2]) / 2}
}

sub histogram { # Counts of elements in an array
  my %histogram = () ;
  foreach $value (@_) {$histogram{$value}++}
  return (%histogram) ;
}


