#!/usr/bin/perl -w $| = 1; use strict; use IO::File; use Data::Dumper; $Data::Dumper::Terse = 1; sub letter_frequency { my ($fh) = @_; my %freq; while (defined (my $line = <$fh>)) { for my $char (split //, $line) { $freq{$char} ++; } } return \%freq; } sub word_frequency { my ($fh) = @_; my %freq; my %pair_freq_fw; my %pair_freq_bw; my $prev_word; my $n = 0; while (defined (my $line = <$fh>)) { for my $word (split /\b/, $line) { $word = lc $word; $word =~ s/\s+//g; next if $word eq ""; $freq{$word} ++; $pair_freq_fw{$prev_word}{$word}++; $pair_freq_bw{$word}{$prev_word}++; $prev_word = $word; $n++; } } return $n, \%freq, \%pair_freq_fw, \%pair_freq_bw; } sub slurp { my ($fn) = @_; local $/; my $fh = new IO::File($fn); my $text = <$fh>; return $text; } my $bible = shift || die "syntax: $0 bible"; my ($n, $word_freq, $pair_freq_fw, $pair_freq_bw) = word_frequency(IO::File->new($bible)); while (1) { print "? "; my $word = <>; chomp $word; next unless $word_freq->{$word}; my $relatives = relatives($word); my $sorted = sort_relatives($relatives); for my $i (1..48) { print $sorted->[$i]->[0], "\n"; } my $fd = IO::File->new(">$word.rel"); for (@$sorted) { print $fd $_->[0], ": ", $_->[1], "\n"; } close $fd; print "\n"; } # my @sorted = sort {$b->[1] <=> $a->[1] || $a->[0] cmp $b->[0]} map [$_, $freq->{$_}], keys %$freq; # print map "$_->[0]\n", @sorted; sub relatives { my ($a) = @_; my %relatives; for my $c (keys %{$pair_freq_fw->{$a}}) { for my $b (keys %{$pair_freq_bw->{$c}}) { $relatives{$b} = 1; } } for my $c (keys %{$pair_freq_bw->{$a}}) { for my $b (keys %{$pair_freq_fw->{$c}}) { $relatives{$b} = 1; } } my $n = @{[%relatives]} / 2; my $i = 0; for my $b (keys %relatives) { my $p = int ($i / $n * 80); print STDERR "." if $p != int (($i++-1) / $n * 80); $relatives{$b} = similarity_factor($a, $b); } print STDERR "\n"; return \%relatives; } sub sort_relatives { my ($relatives) = @_; my @sorted = sort {$a->[1] <=> $b->[1] || $a->[0] cmp $b->[0]} map [$_, $relatives->{$_}], keys %$relatives; return \@sorted; } sub similarity_factor { my ($a, $b) = @_; my @fw_cs = keys %{{ %{$pair_freq_fw->{$a}}, %{$pair_freq_fw->{$b}} }}; my @bw_cs = keys %{{ %{$pair_freq_bw->{$a}}, %{$pair_freq_bw->{$b}} }}; my $simfac = 0; for my $c (@fw_cs) { my $prob_ac = ($pair_freq_fw->{$a}{$c}||0) / $word_freq->{$a}; my $prob_bc = ($pair_freq_fw->{$b}{$c}||0) / $word_freq->{$b}; my $d = $prob_ac - $prob_bc; $simfac += $d * $d; } for my $c (@bw_cs) { my $prob_ca = ($pair_freq_bw->{$a}{$c}||0) / $word_freq->{$a}; my $prob_cb = ($pair_freq_bw->{$b}{$c}||0) / $word_freq->{$b}; my $d = $prob_ca - $prob_cb; $simfac += $d * $d; } $simfac = sqrt($simfac / $n / 2); return $simfac; }