/*
 * Letter and letter-pair frequency analysis over 7-bit ASCII text.
 *
 * NOTE(review): this file appears to have been damaged by an extraction step
 * that dropped everything between a '<' and the following '>'.  The header
 * names below are restored from the identifiers the file uses (assert,
 * FILE/printf, malloc/qsort, strlen) and should be confirmed against the
 * upstream copy; <limits.h> is added for INT_MAX.  Two spans that lost part
 * of their code are kept verbatim inside "#if 0" fences further down and
 * must be restored from the upstream copy.
 */
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Number of distinct byte values counted: the 7-bit ASCII range 0..127. */
enum { TEXT_ALPHABET_SIZE = 128 };

/* A byte buffer with an explicit length; not necessarily NUL-terminated. */
struct text {
    char *start;
    int length;
};

/*
 * Read the whole of `filename` into a freshly allocated buffer.
 *
 * Returns the buffer and the number of bytes actually read.  On any failure
 * (file cannot be opened, size cannot be determined or does not fit in an
 * int, out of memory, read error) returns { NULL, 0 }.  On success `start`
 * is never NULL, even for an empty file.  The caller owns `start` and
 * releases it with free().
 */
struct text slurp(char *filename)
{
    struct text text = { NULL, 0 };
    FILE *file;
    long size;
    char *buf;
    size_t got;

    file = fopen(filename, "r");
    if (file == NULL) {
        return text;
    }

    if (fseek(file, 0, SEEK_END) != 0) {
        goto done;
    }
    size = ftell(file);
    if (size < 0 || size > INT_MAX) {
        goto done;
    }
    if (fseek(file, 0, SEEK_SET) != 0) {
        goto done;
    }

    /* Allocate at least one byte so an empty file still yields a buffer. */
    buf = malloc(size > 0 ? (size_t)size : 1);
    if (buf == NULL) {
        goto done;
    }

    /*
     * In text mode fewer bytes than ftell() reported may be delivered, so the
     * length is what fread() actually stored, not the reported size.
     */
    got = fread(buf, 1, (size_t)size, file);
    if (ferror(file)) {
        free(buf);
        goto done;
    }

    text.start = buf;
    text.length = (int)got;

done:
    fclose(file);
    return text;
}

/*
 * Count how often each byte value 0..127 occurs in `text`.
 *
 * Returns a calloc'ed array of TEXT_ALPHABET_SIZE counters (caller frees),
 * or NULL if the allocation fails.  Input is expected to be 7-bit ASCII:
 * the assert documents that precondition, and the range check keeps builds
 * with NDEBUG from writing outside the array.
 */
int *letter_frequency(struct text text)
{
    int *freq = calloc(TEXT_ALPHABET_SIZE, sizeof *freq);
    int i;

    if (freq == NULL) {
        return NULL;
    }

    for (i = 0; i < text.length; ++i) {
        /* Index with an unsigned value: plain char may be negative. */
        unsigned char c = (unsigned char)text.start[i];

        assert(c < TEXT_ALPHABET_SIZE);
        if (c < TEXT_ALPHABET_SIZE) {
            freq[c]++;
        }
    }
    return freq;
}

/*
 * Count how often each ordered pair of adjacent bytes occurs in `text`.
 *
 * The counter for the pair (a, b) lives at index a * 128 + b.  Pairs that
 * contain a byte outside 0..127 are skipped.  Texts shorter than two bytes
 * produce an all-zero table.  Returns a calloc'ed array of 128 * 128
 * counters (caller frees), or NULL if the allocation fails.
 */
int *pair_frequency(struct text text)
{
    int *freq = calloc((size_t)TEXT_ALPHABET_SIZE * TEXT_ALPHABET_SIZE,
                       sizeof *freq);
    int i;

    if (freq == NULL) {
        return NULL;
    }

    /* Start at 1 and look back, so an empty text never enters the loop. */
    for (i = 1; i < text.length; ++i) {
        unsigned char first = (unsigned char)text.start[i - 1];
        unsigned char second = (unsigned char)text.start[i];

        if (first >= TEXT_ALPHABET_SIZE || second >= TEXT_ALPHABET_SIZE) {
            continue;
        }
        freq[first * TEXT_ALPHABET_SIZE + second]++;
    }
    return freq;
}

/*
 * Build the list of byte values that have a non-zero count in `freq`
 * (an array of 128 counters), in ascending byte order.
 *
 * Returns a malloc'ed buffer with one byte per used value (caller frees),
 * or { NULL, 0 } if the allocation fails.
 */
struct text index_from_letter_freq(int *freq)
{
    struct text index = { NULL, 0 };
    int used = 0;
    int i;
    char *out;

    for (i = 0; i < TEXT_ALPHABET_SIZE; ++i) {
        if (freq[i]) {
            ++used;
        }
    }

    out = malloc(used > 0 ? (size_t)used : 1);
    if (out == NULL) {
        return index;
    }
    index.start = out;
    index.length = used;

    for (i = 0; i < TEXT_ALPHABET_SIZE; ++i) {
        if (freq[i]) {
            *out++ = (char)i;
        }
    }
    return index;
}

/* Counter table consulted by sort_index_cmp; set by sort_index. */
static int *sort_index_freq;

/*
 * qsort comparator over single bytes: orders by descending count in
 * sort_index_freq.  Bytes with equal counts compare equal.
 */
int sort_index_cmp(const void *a, const void *b)
{
    unsigned char left = *(const unsigned char *)a;
    unsigned char right = *(const unsigned char *)b;
    int left_count = sort_index_freq[left];
    int right_count = sort_index_freq[right];

    if (left_count > right_count) {
        return -1;
    }
    if (left_count < right_count) {
        return 1;
    }
    return 0;
}

/*
 * Sort the bytes of `index` in place, most frequent first according to
 * `freq`.  Not reentrant: the comparator reads the table through the
 * file-scope sort_index_freq pointer.
 */
void sort_index(struct text index, int *freq)
{
    sort_index_freq = freq;
    if (index.start == NULL || index.length < 2) {
        return; /* nothing to reorder; also avoids qsort on a NULL base */
    }
    qsort(index.start, (size_t)index.length, 1, sort_index_cmp);
}

/* Map byte value 10 (ASCII line feed) to a space; pass anything else through. */
char trnl(char c)
{
    if (c == 10) {
        return ' ';
    }
    return c;
}

/*
 * NOTE(review): damaged span, kept verbatim and fenced off so the rest of the
 * file can compile.  It holds the opening of print_letter_frequency and the
 * tail of a routine that returns the largest element of an int list; the
 * code in between is missing.  Restore from the upstream copy.
 */
#if 0
void print_letter_frequency(int *freq, struct text index) { int i; for (i=0; i max) max = *list; } return max; }
#endif

/*
 * Two n*n tables of doubles filled by before_after_probs.  The comments on
 * their allocation describe `before` as "given an i, prob that j follows"
 * and `after` as "given an i, prob that j was before".
 */
struct before_after {
    double *before;
    double *after;
};

/*
 * NOTE(review): damaged span, kept verbatim and fenced off.  It holds the
 * opening of before_after_probs and the tail of a comparison function on
 * the `c` member of two records; everything in between is missing.
 * Restore from the upstream copy.
 */
#if 0
struct before_after before_after_probs(int *pair_freq, int n) { struct before_after probs; int i, j; probs.before = calloc(n*n, sizeof(double)); // given an i, prob that j follows
probs.after = calloc(n*n, sizeof(double)); // given an i, prob that j was before
for (i=0; ic > B->c) return 1; if (A->c == B->c) return 0; return -1; }
#endif
/*
 * NOTE(review): this line appears to have lost text between '<' and '>'
 * characters during extraction, and its last definition is cut off at the
 * end of the visible source.  The code is left untouched; restore it from
 * the upstream copy.
 *
 * What is still readable:
 *  - correlate(probs, n): allocates n*n zero-initialised struct correlation
 *    entries with calloc.  The rest of its body is missing and runs straight
 *    into the tail of a routine that prints "%c == %c : %7f" per entry,
 *    mapping c->i and c->j through index.start.
 *  - maximum_correl(c, n): scans the n entries starting at c and returns the
 *    largest c->c seen; the running maximum starts at 0, so 0 is returned
 *    when no entry exceeds it.
 *  - print_correlations_table(correlations, index): takes l = index.length,
 *    obtains the maximum over l*l entries via maximum_correl, and sets up a
 *    `scale` string of characters whose length is taken with strlen.  The
 *    definition is truncated right after its first loop header begins.
 */
struct correlation *correlate(struct before_after probs, int n) { struct correlation *correlations; int i, j, k; correlations = calloc(n*n, sizeof(struct correlation)); for (i=0; i0; --n, ++c) { char i = index.start[c->i]; char j = index.start[c->j]; printf("%c == %c : %7f\n", i, j, c->c); } } double maximum_correl(struct correlation *c, int n) { double max = 0; for (; n>0; --n, ++c) { if (c->c > max) max = c->c; } return max; } void print_correlations_table(struct correlation *correlations, struct text index) { int i, j; int l; double max_freq; char *scale; int max_scale; l = index.length; max_freq = maximum_correl(correlations, l*l); scale = " .,`/\\-=+x*XoO#%@"; max_scale = strlen(scale); printf(" "); for (j=0; j