#!/usr/local/bin/jperl
################################################################
# Simple IR system
#
# Usage:
# SIR1199.perl
#
# Description:
# Read the source code.
# It assume IREX SGML data for the input and output.
#
# Author:
# Satoshi Sekine (New York University)
#
# Note:
# Of course, it is better if you use an index file in terms of
# speed. The purpose of this program is to provide sample
# output data of IREX. Also, you can start IR programing
# based on this. You may want to add a morphological analyser,
# sophisticated weighting scheme, query expansion, use of
# thesauri, better stpolist, better cut off parameters, etc.
#
# You can copy, modify, redistribute the code.
# The source code is provided "AS IS".
# The author has no responsibility for any damage or anything
# caused by the code. No copyright is claimed.
################################################################
###### Configurations ########
$SYSTEM_ID = 1199;
@DATA_FILES=( "$ENV{IREX_ROOT}/MAINICHI/mai94.sgml",
"$ENV{IREX_ROOT}/MAINICHI/mai95.sgml" );
# @DATA_FILES = ( "sample.sgml" );
$TOPIC_FILE= "IR_DRYRUN.topic2";
###### Parameters #############
@STOPLIST = ( "記事", "事", "含", "述", "探", "言及", "場合", "指摘", "良", "関", "分" );
$TW_DESCRIPTION = 3;
$TW_NARRATIVE = 1;
$LOW_DF_THRESHOLD = 10;
####### Program ###############
#
# Print header of output
#
print "$SYSTEM_ID\n\n";
#
# Analyze TOPIC
#
open(TOPIC_FILE, "$TOPIC_FILE") || die "Can't open file : $TOPIC_FILE";
# status=0: out of a TOPIC tag pair,
# 1: in a pair
$status = 0;
while( ){
if($status==1 && m!!){
sir1199_main();
$status=0;
}elsif($status==0 && //){
$status=1;
}elsif($status==0){
next;
}elsif(m!(\d+)!){
$topicid = $1;
}elsif(m!(.+)!){
$description = $1;
}elsif(m!(.+)!){
$narrative = $1;
}
}
#
# IR Main
#
sub sir1199_main {
extract_keywords();
calc_df();
calc_similarity();
print_result();
}
#
# Extract keywords from description & narrative
#
sub extract_keywords {
@description_keywords = split( /[あ-ん、。]+/, $description );
@narrative_keywords = split( /[あ-ん、。]+/, $narrative );
for $key ( @narrative_keywords ){
$tw{$key} += $TW_NARRATIVE;
}
for $key ( @description_keywords ){
$tw{$key} += $TW_DESCRIPTION;
}
@keywords = ();
for $key ( keys %tw ){
if( !grep( /^$key$/, @STOPLIST) ){
push ( @keywords, $key );
}
}
}
#
# Calculate document frequency of keywords
#
sub calc_df {
for $f ( @DATA_FILES ){
open( FILE,"< $f" )|| die "Can't open file : $f";
while( ){
if( /(\d+)<\/DOCNO>/ ){
for $key ( @keywords ){
$flag{$key} = 0;
}
}
for $key ( @keywords ){
if( grep (/$key/ ,$_) ){
$flag{$key} = 1;
}
}
next unless $_ eq "\n";
for $key ( @keywords ){
$df{$key} += $flag{$key};
}
}
close FILE;
}
# DEBUG for $key ( @keywords ){
# DEBUG print "KEY = $key $df{$key}\n";
# DEBUG }
}
#
# Calculate similarities of documents
#
sub calc_similarity {
for $f ( @DATA_FILES ){
open( FILE,"< $f" )|| die "Can't open file : $f";
while( ){
if( /(\d+)<\/DOCNO>/ ){
$docno = $1;
$sim{$docno} = 0;
for $key ( @keywords ){
$tf{$key} = 0;
}
} else {
$len += length( $_ );
for $key ( @keywords ){
$tf{$key} += grep( /$key/,$_ );
}
}
next unless $_ eq "\n";
for $key ( @keywords ){
if($tf{$key} > 0 && $df{$key} > $LOW_DF_THRESHOLD) {
$sim{$docno} += $tw{$key} * $tf{$key} / (log($df{$key}) * log($len));
# DEBUG print "MATCH $docno $key $tf{$key}\n";
}
}
$len = 0;
$match = 0;
}
close FILE;
}
}
#
# Print IREX format output
#
sub print_result {
print "\n";
print "$topicid\n";
$i=1;
foreach $docno ( sort {$sim{$b} <=> $sim{$a}} keys %sim ) {
print "$docno\n";
if($i++>=300){ last; }
}
print "\n\n"
}