#!/usr/local/bin/perl
################################################################
# Simple IR system
#
# Usage:
#   simpleir.perl keywords...
#
# Description:
#   Read the source code.
#   It assume IREX SGML data for the target.
#
# Author:
#   Satoshi Sekine (New York University)
#
# You can copy, modify, redistribute the code, as long as
# this header is kept. The source code is provided "AS IS".
# The author has no responsibility for any damage or anything
# caused by the code. No copyright is claimed.
################################################################

use strict;
use warnings;

# Identifiers echoed at the top of the result listing.
my $SYSTEM_ID = 1199;
my $TOPIC_ID  = 1003;

# Corpus files to scan.  An unset IREX_ROOT falls back to the empty string,
# which produces the same root-relative paths the script always built; the
# open() below then reports the offending path.
my $irex_root = defined $ENV{IREX_ROOT} ? $ENV{IREX_ROOT} : '';
my @FILES = (
    "$irex_root/MAINICHI/mai94.sgml",
    "$irex_root/MAINICHI/mai95.sgml",
);

# Every command-line keyword is used as a regular expression (as before).
# Compile each one once here instead of once per input line.
my @patterns = map { qr/$_/ } @ARGV;

# Document number => similarity score, for documents with at least one hit.
my %sim;

# record_score( $docno, $match, $len )
#
# Store the score of one finished record in %sim.
#   $docno - document number captured from the record's DOCNO line
#            (may be undef if no such line has been seen yet)
#   $match - number of (line, keyword) pairs that matched in the record
#   $len   - total length of the record's lines, newlines included
#
# Nothing is stored when the record had no hits, when no document number is
# known, or when $len <= 1: log(1) is 0 and would make the division fatal.
sub record_score {
    my ( $docno, $match, $len ) = @_;

    return unless $match > 0;
    return unless defined $docno;
    return unless $len > 1;

    $sim{$docno} = $match / log($len);    # You can make this more sophisticated.
    return;
}

for my $file (@FILES) {
    open( my $fh, '<', $file ) or die "Can't open file : $file: $!";

    # Per-record state.  It is scoped to the file so that nothing left over
    # from one file can be attributed to a document in the next one.
    my $docno;
    my $len   = 0;
    my $match = 0;

    while ( my $line = <$fh> ) {
        if ( $line =~ /(\d+)<\/DOCNO>/ ) {
            $docno = $1;
        }

        $len += length($line);
        for my $pattern (@patterns) {
            $match++ if $line =~ $pattern;
        }

        # A blank line terminates the current record.
        next unless $line eq "\n";

        record_score( $docno, $match, $len );
        $len   = 0;
        $match = 0;
    }

    # The last record of a file may not be followed by a blank line; score
    # whatever has been accumulated since the last separator.
    record_score( $docno, $match, $len );

    close $fh;
}

# Print IREX format output
#
# NOTE(review): the values below are printed bare and the final print emits
# an empty string.  Markup may have been lost from this copy of the script;
# confirm against the IREX result format before relying on the output.
print "$SYSTEM_ID\n";
print "\n";
print "$TOPIC_ID\n";

# Best score first; ties are broken by document number so that the listing
# does not depend on hash ordering.
my @ranked = sort { $sim{$b} <=> $sim{$a} or $a cmp $b } keys %sim;

# Report at most the 300 highest-ranked documents.
my $last_index = $#ranked < 299 ? $#ranked : 299;
for my $docno ( @ranked[ 0 .. $last_index ] ) {
    print "$docno\n";
}

print "";