#!/usr/local/bin/jperl

################################################################
#  Simple IR system
#  
#  Usage:
#    SIR1199.perl
#
#  Description:
#    Read the source code.
#    It assumes IREX SGML data for the input and output.
#  
#  Author:
#    Satoshi Sekine (New York University)
#
#  Note:
#    Of course, it is better if you use an index file in terms of
#  speed. The purpose of this program is to provide sample
#  output data of IREX. Also, you can start IR programming
#  based on this. You may want to add a morphological analyser,
#  sophisticated weighting scheme, query expansion, use of
#  thesauri, better stoplist, better cut-off parameters, etc.
#
#  You can copy, modify, redistribute the code.
#  The source code is provided "AS IS".
#  The author has no responsibility for any damage or anything
#  caused by the code. No copyright is claimed.
################################################################

###### Configurations ########

# Identifier printed inside the <SYSTEM-ID> element of the output.
$SYSTEM_ID = 1199;

# Document collections to scan. Every path is built from the
# IREX_ROOT environment variable.
@DATA_FILES = map( "$ENV{IREX_ROOT}/MAINICHI/$_", "mai94.sgml", "mai95.sgml" );
# @DATA_FILES = ( "sample.sgml" );

# File the topics are read from.
$TOPIC_FILE = "IR_DRYRUN.topic2";


###### Parameters #############

# Terms that are never used as keywords.
@STOPLIST = (
    "記事", "事", "含", "述", "探", "言及",
    "場合", "指摘", "良", "関", "分",
);

# Weight added to a term for occurring in the topic description and
# in the topic narrative, respectively.
( $TW_DESCRIPTION, $TW_NARRATIVE ) = ( 3, 1 );

# A keyword contributes to a document score only when its document
# frequency is greater than this value.
$LOW_DF_THRESHOLD = 10;

####### Program ###############

#
# Print header of output
#
print "<SYSTEM-ID>$SYSTEM_ID</SYSTEM-ID>\n\n";

#
# Analyze TOPIC
#
# The topic file is scanned line by line. Between <TOPIC> and </TOPIC>
# the topic id, description and narrative are collected into the
# globals $topicid, $description and $narrative; when </TOPIC> is
# reached, sir1199_main() runs the retrieval for that topic.
# An element is only recognised when its opening and closing tags are
# on the same line.
#
# The mode is given explicitly so the file name is always treated as
# a file to read.
open(TOPIC_FILE, "< $TOPIC_FILE") || die "Can't open file : $TOPIC_FILE ($!)";

# status=0: out of a TOPIC tag pair,
#        1: in a pair
$status = 0;

while( <TOPIC_FILE> ){

  if($status==1 && m!</TOPIC>!){
        sir1199_main();
        $status=0;
  }elsif($status==0 && /<TOPIC>/){
        # A new topic starts: forget the fields of the previous one so
        # that a topic lacking an element does not inherit stale text.
        $topicid = $description = $narrative = "";
        $status=1;
  }elsif($status==0){
        next;
  }elsif(m!<TOPIC-ID>(\d+)</TOPIC-ID>!){
        $topicid = $1;
  }elsif(m!<DESCRIPTION>(.+)</DESCRIPTION>!){
        $description = $1;
  }elsif(m!<NARRATIVE>(.+)</NARRATIVE>!){
        $narrative = $1;
  }

}

close(TOPIC_FILE);


#
# IR Main
#
# Runs the retrieval for the topic currently being processed:
# keyword extraction, document-frequency counting, similarity
# calculation and finally output of the result. The stages take no
# arguments and are called in exactly this order.
#
sub sir1199_main {

   # Preparatory stages.
   for my $stage ( \&extract_keywords, \&calc_df, \&calc_similarity ){
     $stage->();
   }

   # Output stage; kept as the last statement of the sub.
   print_result();
}


#
# Extract keywords from description & narrative
#
# Reads  : $description, $narrative, @STOPLIST,
#          $TW_DESCRIPTION, $TW_NARRATIVE
# Writes : @description_keywords, @narrative_keywords
#            - non-empty pieces of each text after splitting on runs
#              of the separator characters in the pattern below
#          %tw       - weight of every piece found in this topic
#          @keywords - the keys of %tw that are not in @STOPLIST
#
# %tw and @keywords are rebuilt from scratch on every call, so one
# topic's terms never carry over into the next topic.
#
sub extract_keywords {

   %tw = ();
   @keywords = ();

   # A topic may lack either element; treat a missing one as empty text.
   my $desc_text = defined $description ? $description : "";
   my $narr_text = defined $narrative   ? $narrative   : "";

   # split() returns an empty leading field when the text begins with
   # a separator. An empty keyword would match every line of every
   # document, so such fields are dropped here.
   @description_keywords = grep { length $_ } split( /[あ-ん、。]+/, $desc_text );
   @narrative_keywords   = grep { length $_ } split( /[あ-ん、。]+/, $narr_text );

   for my $key ( @narrative_keywords ){
     $tw{$key} += $TW_NARRATIVE;
   }

   for my $key ( @description_keywords ){
     $tw{$key} += $TW_DESCRIPTION;
   }

   # Exact string lookup instead of a regex built from the keyword:
   # keywords come from free text and may contain regex metacharacters.
   my %is_stopword;
   for my $word ( @STOPLIST ){
     $is_stopword{$word} = 1;
   }

   for my $key ( keys %tw ){
     push( @keywords, $key ) unless $is_stopword{$key};
   }
}


#
# Calculate document frequency of keywords
#
# Reads  : @DATA_FILES, @keywords
# Writes : %df - for each keyword, the number of documents in which
#                it occurs on at least one line
#
# A document starts at its <DOCNO> line and is counted when a line
# consisting of exactly "</DOC>" is read. %df is rebuilt on every
# call, so counts from an earlier topic are not added to again.
# Dies if a data file cannot be opened.
#
sub calc_df {

  # Per-document "keyword seen" markers.
  my %flag;

  %df = ();
  for my $key ( @keywords ){
    $df{$key} = 0;
    $flag{$key} = 0;
  }

  for my $f ( @DATA_FILES ){
    open( FILE,"< $f" )|| die "Can't open file : $f ($!)";

    # A named line variable is used so the caller's $_ is left alone.
    while( defined( my $line = <FILE> ) ){
      if( $line =~ /<DOCNO>(\d+)<\/DOCNO>/ ){
        for my $key ( @keywords ){
          $flag{$key} = 0;
        }
      }

      for my $key ( @keywords ){
        next if $flag{$key};       # already seen in this document
        # An empty pattern does not mean "match the empty string" in
        # Perl (it reuses the last successful pattern), so skip it.
        next unless length $key;
        # \Q...\E: the keyword is literal text, not a regex.
        $flag{$key} = 1 if $line =~ /\Q$key\E/;
      }
      next unless $line eq "</DOC>\n";

      for my $key ( @keywords ){
        $df{$key} += $flag{$key};
      }
    }
    close FILE;
  }

# DEBUG  for $key ( @keywords ){
# DEBUG    print "KEY = $key $df{$key}\n";
# DEBUG  }
}


#
# Calculate similarities of documents
#
# Reads  : @DATA_FILES, @keywords, %tw, %df, $LOW_DF_THRESHOLD
# Writes : %sim - score of every document, keyed by document number
#
# For each document the score is the sum, over the keywords that
# occur in it and whose document frequency is above
# $LOW_DF_THRESHOLD, of
#
#     tw * tf / ( log(df) * log(len) )
#
# where tf is the number of lines of the document that contain the
# keyword and len is the total length of the lines read since the
# previous "</DOC>" line, excluding the <DOCNO> line itself.
# Dies if a data file cannot be opened.
#
sub calc_similarity {

  my %tf;          # per-document count of lines containing each keyword
  my $len = 0;     # accumulated line length of the current document
  my $docno;       # number of the document being read

  %sim = ();

  for my $f ( @DATA_FILES ){
    open( FILE,"< $f" )|| die "Can't open file : $f ($!)";

    # A named line variable is used so the caller's $_ is left alone.
    while( defined( my $line = <FILE> ) ){
      if( $line =~ /<DOCNO>(\d+)<\/DOCNO>/ ){
        $docno = $1;
        $sim{$docno} = 0;
        for my $key ( @keywords ){
          $tf{$key} = 0;
        }
      } else {
        $len += length( $line );
        for my $key ( @keywords ){
          # An empty pattern does not mean "match the empty string"
          # in Perl (it reuses the last successful pattern): skip it.
          next unless length $key;
          # \Q...\E: the keyword is literal text, not a regex.
          $tf{$key}++ if $line =~ /\Q$key\E/;
        }
      }
      next unless $line eq "</DOC>\n";

      for my $key ( @keywords ){
        # The extra "> 1" keeps log(df) away from zero even if the
        # threshold is configured below 1.
        if($tf{$key} > 0 && $df{$key} > $LOW_DF_THRESHOLD && $df{$key} > 1) {
          $sim{$docno} += $tw{$key} * $tf{$key} / (log($df{$key}) * log($len));
# DEBUG          print "MATCH $docno $key $tf{$key}\n";
        }
      }

      $len = 0;
    }
    close FILE;
  }
}


#
# Print IREX format output
#
# Reads  : $topicid, %sim
# Prints an <IR-RESULT> element holding the topic id followed by one
# <A-ID> line per document, best score first, for at most 300
# documents. Documents with equal scores are ordered by ascending
# document number so that the output is the same on every run.
#
sub print_result {

  my $max_results = 300;

  print "<IR-RESULT>\n";
  print "<TOPIC-ID>$topicid</TOPIC-ID>\n";

  my @ranked = sort { $sim{$b} <=> $sim{$a} || $a <=> $b } keys %sim;

  my $rank = 0;
  for my $docno ( @ranked ) {
    last if $rank >= $max_results;
    $rank++;
    print "<A-ID RANK=$rank SIM=$sim{$docno}>$docno</A-ID>\n";
  }

  print "</IR-RESULT>\n\n";
}