#!/usr/bin/perl -w use strict; sub diff3 { my $strings = shift; my %I_i_set_by_l_substr; { my %I_i_set_by_substr; my $I = 0; for my $s (@$strings) { my $sl = length $s; for (my $i=0; $i<$sl; ++$i) { my $I_i = "${I}_$i"; my $substr = substr $s, $i, 1; $I_i_set_by_substr{$substr}{$I_i} = 1; } $I++; } while (my ($substr, $I_i_set) = each %I_i_set_by_substr) { delete $I_i_set_by_substr{$substr} if keys %$I_i_set == 1; } $I_i_set_by_l_substr{1} = \%I_i_set_by_substr; } my $l; for ($l=2; ; ++$l) { my %I_i_set_by_substr; my $I = 0; while (my ($substr, $I_i_set) = each %{$I_i_set_by_l_substr{$l-1}}) { for my $I_i (keys %$I_i_set) { my ($I, $i) = split /_/, $I_i, 2; my $s = $$strings[$I]; my $sl = length $s; if ($i <= $sl-$l) { my $substr = substr $s, $i, $l; $I_i_set_by_substr{$substr}{$I_i} = 1; } } } while (my ($substr, $I_i_set) = each %I_i_set_by_substr) { delete $I_i_set_by_substr{$substr} if keys %$I_i_set == 1; } last if keys %I_i_set_by_substr == 0; $I_i_set_by_l_substr{$l} = \%I_i_set_by_substr; } my $n = $l; my $prev_I_i_set_by_substr = $I_i_set_by_l_substr{1}; for ($l=2; $l<$n; ++$l) { my $this_I_i_set_by_substr = $I_i_set_by_l_substr{$l}; while (my ($this_substr, $this_I_i_set) = each %$this_I_i_set_by_substr) { my $this_I_i_set_flat; { my $prev_substr = substr($this_substr, 0, -1); if (my $prev_I_i_set = $prev_I_i_set_by_substr->{$prev_substr}) { $this_I_i_set_flat = join " ", sort keys %$this_I_i_set; my $prev_I_i_set_flat = join " ", sort keys %$prev_I_i_set; if ($this_I_i_set_flat eq $prev_I_i_set_flat) { delete $prev_I_i_set_by_substr->{$prev_substr} } } } { my $prev_substr = substr($this_substr, 1); if (my $prev_I_i_set = $prev_I_i_set_by_substr->{$prev_substr}) { $this_I_i_set_flat = join " ", sort keys %$this_I_i_set; my $prev_I_i_set_flat = join " ", sort map {my ($I, $i) = split /_/, $_, 2; $I . "_" . ($i-1)} keys %$prev_I_i_set; if ($this_I_i_set_flat eq $prev_I_i_set_flat) { delete $prev_I_i_set_by_substr->{$prev_substr} } } } } $prev_I_i_set_by_substr = $this_I_i_set_by_substr; } for ($l=2; $l<$n; ++$l) { delete $I_i_set_by_l_substr{$l} unless %{$I_i_set_by_l_substr{$l}}; } return \%I_i_set_by_l_substr; } use IO::File; sub slurp { my $file = shift; $file = IO::File->new($file) unless ref $file; return unless $file; local $/; return <$file>; } use Data::Dumper; $Data::Dumper::Terse = 1; $Data::Dumper::Indent = 0; sub test { my $I_i_set_by_l_substr; if ($I_i_set_by_l_substr = slurp("cached.diff")) { eval "\$I_i_set_by_l_substr = $I_i_set_by_l_substr"; } else { @ARGV = ($0) unless @ARGV; my @files = map {slurp($_)} @ARGV; for (@files) { s/\s+/ /sg; $_ = lc $_; print "$_\n"; } $I_i_set_by_l_substr = diff3 [@files]; open OUT, ">cached.diff"; print OUT Dumper $I_i_set_by_l_substr; } # index by frequency... my @list; my $n = 0; for (keys %$I_i_set_by_l_substr) { $n = $_ if $_ > $n } for (my $l=1; $l<=$n; ++$l) { my $I_i_set_by_substr = $I_i_set_by_l_substr->{$l}; while (my ($substr, $I_i_set) = each %$I_i_set_by_substr) { my @I_i_set = sort {$a->[0] <=> $a->[0] || $a->[1] <=> $a->[1]} map {[split /_/, $_, 2]} keys %$I_i_set; push @list, [length($substr) * scalar(@I_i_set), length($substr), $substr, scalar(@I_i_set), \@I_i_set]; # $by_freq_len{@I_i_set * length $substr}{$substr} = [@I_i_set]; } } my @by_freq_len = grep {$_->[1] > 0} sort {$b->[0] <=> $a->[0]} @list; #map {[$_, $by_freq_len{$_}]} keys %by_freq_len; print map "$_->[3]: $_->[2]\n", @by_freq_len; } test