#!/usr/local/bin/jperl -- -*-Perl-*-
#
# Convert a tagged article dump into a stream of SGML-style records.
#
# Usage:
#   nkf -e /cdrom/cdrom0/mai94.txt | jperl mai2sgml.pl > mai94.sgml (113,106,483 byte)
#   nkf -e /cdrom/cdrom0/mai95.txt | jperl mai2sgml.pl > mai95.sgml (118,720,559 byte)
#
# Input:  one field per line, written as <delim>CODE<delim>CONTENT, where
#         <delim> is the fullwidth backslash and CODE is a field code in
#         fullwidth letters/digits.  A line whose code is ID starts a new
#         record.
# Output: for every record, the C0, AD, AE, S1, T1 and T2 fields, each
#         introduced by the tag name listed in %tag.
#
# NOTE(review): the Japanese literals below have to be stored in the same
# encoding as the input stream (presumably EUC-JP, given "nkf -e" above) --
# confirm when saving this file.

use strict;
use I18N::Japanese;

# Section code (AD field, already converted to ASCII) => section name.
my %ad = (
    "01" => "1面",
    "02" => "2面",
    "03" => "3面",
    "04" => "解説",
    "05" => "社説",
    "07" => "国際",
    "08" => "経済",
    "10" => "特集",
    "12" => "総合",
    "13" => "家庭",
    "14" => "文化",
    "15" => "読書",
    "16" => "科学",
    "18" => "芸能",
    "35" => "スポーツ",
    "41" => "社会",
);

# Input field code => tag name written to the output.
my %tag = (
    "C0" => "DOCNO",
    "AD" => "SECTION",
    "AE" => "AE",
    "S1" => "WORDS",
    "T1" => "HEADLINE",
    "T2" => "TEXT",
);

# Fields of the record currently being read: field code => [ values ].
my %keyword;

# zen2han(STRING)
#
# Returns a copy of STRING in which fullwidth (zenkaku) digits, letters,
# the fullwidth space and the punctuation listed below are replaced by
# their ASCII (hankaku) counterparts.  All other characters are kept.
# The caller's variable is not modified; undef is returned unchanged.
sub zen2han {
    my ($text) = @_;
    return $text unless defined $text;

    $text =~ tr/０-９Ａ-Ｚａ-ｚ/0-9A-Za-z/;

    # The two lists of each tr pair up position by position.  Note the
    # pairings of the horizontal bar with "_" and of the yen sign with "\".
    # No character is mapped to the ASCII backtick.
    $text =~ tr/　！”＃＄％＆’（）＊＋，−．／/ !"#$%&'()*+,\-.\//;
    $text =~ tr/：；＜＝＞？＠［￥］＾―｛｜｝￣/:;<=>?@[\\]^_{|}~/;

    return $text;
}

# transfer(KEY, CONTEXT)
#
# Converts the raw content CONTEXT of one input field into the value that
# is stored for output.  KEY is the field code in ASCII.
#
#   AF, C0  content converted with zen2han
#   AE      '有' when the content is Y (fullwidth or ASCII), else '無'
#   S1      the last "全...文字" phrase of the content, via zen2han
#           (undef when the content has no such phrase)
#   AD      section name looked up in %ad (undef for an unknown code)
#   other   content unchanged
sub transfer {
    my ($key, $context) = @_;

    if ($key eq 'AF' or $key eq 'C0') {
        return zen2han($context);
    }

    if ($key eq 'AE') {
        # Convert first so that either width of the letter is accepted.
        my $flag = zen2han($context);
        return (defined $flag and $flag eq 'Y') ? '有' : '無';
    }

    if ($key eq 'S1') {
        # Matched against the argument rather than the global $_, so the
        # result does not depend on what the caller last read.
        # NOTE(review): the captured value includes the surrounding
        # "全"/"文字".  If the outer parentheses of this pattern were meant
        # to be literal fullwidth parentheses around a captured count,
        # only the count should be kept -- confirm against the data.
        my $size;
        ($size) = $context =~ /.*(全.*文字)/ if defined $context;
        return zen2han($size);
    }

    if ($key eq 'AD') {
        return $ad{ zen2han($context) };
    }

    return $context;
}

# output()
#
# Prints the record held in %keyword to STDOUT: an empty line, one
# "<TAG>value" line for the first value of each of C0, AD, AE, S1 and T1,
# then "<TEXT>" on its own line followed by all T2 values (one per line)
# and an empty line, and a final empty line.  Fields missing from the
# record are printed with an empty value.  No closing tags are written.
sub output {
    print "\n";

    foreach my $key ('C0', 'AD', 'AE', 'S1', 'T1') {
        my $values = $keyword{$key};
        print "<", $tag{$key}, ">", ($values ? $values->[0] : ''), "\n";
    }

    # Guard the dereference: a record may have no T2 field at all.
    my @body = $keyword{'T2'} ? @{ $keyword{'T2'} } : ();
    print "<", $tag{'T2'}, ">\n", join("\n", @body), "\n\n";

    print "\n";
}

# Main loop: collect fields until the next ID line, then flush the record.
my $ids_seen = 0;

while (defined(my $line = <>)) {
    chomp $line;

    # Non-greedy field code: a delimiter character occurring inside the
    # content must not be swallowed into the code.
    my ($raw_key, $context) = $line =~ /＼(.*?)＼(.*)/;
    next unless defined $raw_key;    # not a field line

    my $key  = zen2han($raw_key);
    my $data = transfer($key, $context);

    if ($key eq 'ID') {
        $ids_seen++;
        if ($ids_seen > 1) {
            # NOTE(review): this extra blank line is written before
            # records 2..N-1 only; the first record and the one flushed
            # at end of input do not get it.  Kept as is so that existing
            # output stays byte-identical -- confirm whether intended.
            print "\n" if $ids_seen > 2;
            output();
            %keyword = ();
        }
    }

    push @{ $keyword{$key} }, $data;
}

# Flush the last record; print nothing when the input held no fields.
output() if %keyword;