#!/usr/local/bin/jperl ################################################################ # Simple IR system # # Usage: # SIR1199.perl # # Description: # Read the source code. # It assume IREX SGML data for the input and output. # # Author: # Satoshi Sekine (New York University) # # Note: # Of course, it is better if you use an index file in terms of # speed. The purpose of this program is to provide sample # output data of IREX. Also, you can start IR programing # based on this. You may want to add a morphological analyser, # sophisticated weighting scheme, query expansion, use of # thesauri, better stpolist, better cut off parameters, etc. # # You can copy, modify, redistribute the code. # The source code is provided "AS IS". # The author has no responsibility for any damage or anything # caused by the code. No copyright is claimed. ################################################################ ###### Configurations ######## $SYSTEM_ID = 1199; @DATA_FILES=( "$ENV{IREX_ROOT}/MAINICHI/mai94.sgml", "$ENV{IREX_ROOT}/MAINICHI/mai95.sgml" ); # @DATA_FILES = ( "sample.sgml" ); $TOPIC_FILE= "IR_DRYRUN.topic2"; ###### Parameters ############# @STOPLIST = ( "記事", "事", "含", "述", "探", "言及", "場合", "指摘", "良", "関", "分" ); $TW_DESCRIPTION = 3; $TW_NARRATIVE = 1; $LOW_DF_THRESHOLD = 10; ####### Program ############### # # Print header of output # print "$SYSTEM_ID\n\n"; # # Analyze TOPIC # open(TOPIC_FILE, "$TOPIC_FILE") || die "Can't open file : $TOPIC_FILE"; # status=0: out of a TOPIC tag pair, # 1: in a pair $status = 0; while( ){ if($status==1 && m!!){ sir1199_main(); $status=0; }elsif($status==0 && //){ $status=1; }elsif($status==0){ next; }elsif(m!(\d+)!){ $topicid = $1; }elsif(m!(.+)!){ $description = $1; }elsif(m!(.+)!){ $narrative = $1; } } # # IR Main # sub sir1199_main { extract_keywords(); calc_df(); calc_similarity(); print_result(); } # # Extract keywords from description & narrative # sub extract_keywords { @description_keywords = split( /[あ-ん、。]+/, $description ); @narrative_keywords = split( /[あ-ん、。]+/, $narrative ); for $key ( @narrative_keywords ){ $tw{$key} += $TW_NARRATIVE; } for $key ( @description_keywords ){ $tw{$key} += $TW_DESCRIPTION; } @keywords = (); for $key ( keys %tw ){ if( !grep( /^$key$/, @STOPLIST) ){ push ( @keywords, $key ); } } } # # Calculate document frequency of keywords # sub calc_df { for $f ( @DATA_FILES ){ open( FILE,"< $f" )|| die "Can't open file : $f"; while( ){ if( /(\d+)<\/DOCNO>/ ){ for $key ( @keywords ){ $flag{$key} = 0; } } for $key ( @keywords ){ if( grep (/$key/ ,$_) ){ $flag{$key} = 1; } } next unless $_ eq "\n"; for $key ( @keywords ){ $df{$key} += $flag{$key}; } } close FILE; } # DEBUG for $key ( @keywords ){ # DEBUG print "KEY = $key $df{$key}\n"; # DEBUG } } # # Calculate similarities of documents # sub calc_similarity { for $f ( @DATA_FILES ){ open( FILE,"< $f" )|| die "Can't open file : $f"; while( ){ if( /(\d+)<\/DOCNO>/ ){ $docno = $1; $sim{$docno} = 0; for $key ( @keywords ){ $tf{$key} = 0; } } else { $len += length( $_ ); for $key ( @keywords ){ $tf{$key} += grep( /$key/,$_ ); } } next unless $_ eq "\n"; for $key ( @keywords ){ if($tf{$key} > 0 && $df{$key} > $LOW_DF_THRESHOLD) { $sim{$docno} += $tw{$key} * $tf{$key} / (log($df{$key}) * log($len)); # DEBUG print "MATCH $docno $key $tf{$key}\n"; } } $len = 0; $match = 0; } close FILE; } } # # Print IREX format output # sub print_result { print "\n"; print "$topicid\n"; $i=1; foreach $docno ( sort {$sim{$b} <=> $sim{$a}} keys %sim ) { print "$docno\n"; if($i++>=300){ last; } } print "\n\n" }