edit_dist.c
1 /* 2 This edit distance code is taken from trn3.6. A few minor 3 modifications have been made by Andrew Tridgell <tridge@samba.org> 4 for use in spamsum. 5 */ 6 7 8 /***************************************************************************/ 9 10 11 /* The authors make no claims as to the fitness or correctness of this software 12 * for any use whatsoever, and it is provided as is. Any use of this software 13 * is at the user's own risk. 14 */ 15 16 #include <stdio.h> 17 #include <unistd.h> 18 #include <stdlib.h> 19 20 /* edit_dist -- returns the minimum edit distance between two strings 21 22 Program by: Mark Maimone CMU Computer Science 13 Nov 89 23 Last Modified: 28 Jan 90 24 25 If the input strings have length n and m, the algorithm runs in time 26 O(nm) and space O(min(m,n)). 27 28 HISTORY 29 13 Nov 89 (mwm) Created edit_dist() and set_costs(). 30 31 28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD 32 computations will work even when THRESHOLD is not a multiple of 33 sizeof(int). 34 35 17 May 93 (mwm) Improved performance when used with trn's newsgroup 36 processing; assume all costs are 1, and you can terminate when a 37 threshold is exceeded. 38 */ 39 40 #define MIN_DIST 100 41 42 #define TRN_SPEEDUP /* Use a less-general version of the 43 routine, one that's better for trn. 44 All change costs are 1, and it's okay 45 to terminate if the edit distance is 46 known to exceed MIN_DIST */ 47 48 #define THRESHOLD 4000 /* worry about allocating more memory only 49 when this # of bytes is exceeded */ 50 #define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2)) 51 52 #define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y)) 53 54 #define swap_int(x,y) (_iswap = (x), (x) = (y), (y) = _iswap) 55 #define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap) 56 #define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my)) 57 #define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my)) 58 59 60 static int insert_cost = 1; 61 static int delete_cost = 1; 62 #ifndef TRN_SPEEDUP 63 static int change_cost = 1; 64 static int swap_cost = 1; 65 #endif 66 67 static int _iswap; /* swap_int temp variable */ 68 static char *_cswap; /* swap_char temp variable */ 69 static int _mx, _my, _mz; /* min2, min3 temp variables */ 70 71 72 73 /* edit_distn -- returns the edit distance between two strings, or -1 on 74 failure */ 75 76 int 77 edit_distn(from, from_len, to, to_len) 78 char *from, *to; 79 register int from_len, to_len; 80 { 81 #ifndef TRN_SPEEDUP 82 register int ins, del, ch; /* local copies of edit costs */ 83 #endif 84 register int row, col, index; /* dynamic programming counters */ 85 register int radix; /* radix for modular indexing */ 86 #ifdef TRN_SPEEDUP 87 register int low; 88 #endif 89 int *buffer; /* pointer to storage for one row 90 of the d.p. array */ 91 static int store[THRESHOLD / sizeof (int)]; 92 /* a small amount of static 93 storage, to be used when the 94 input strings are small enough */ 95 96 /* Handle trivial cases when one string is empty */ 97 98 if (from == NULL || !from_len) 99 if (to == NULL || !to_len) 100 return 0; 101 else 102 return to_len * insert_cost; 103 else if (to == NULL || !to_len) 104 return from_len * delete_cost; 105 106 /* Initialize registers */ 107 108 radix = 2 * from_len + 3; 109 #ifdef TRN_SPEEDUP 110 #define ins 1 111 #define del 1 112 #define ch 3 113 #define swap_cost 5 114 #else 115 ins = insert_cost; 116 del = delete_cost; 117 ch = change_cost; 118 #endif 119 120 /* Make from short enough to fit in the static storage, if it's at all 121 possible */ 122 123 if (from_len > to_len && from_len > STRLENTHRESHOLD) { 124 swap_int(from_len, to_len); 125 swap_char(from, to); 126 #ifndef TRN_SPEEDUP 127 swap_int(ins, del); 128 #endif 129 } /* if from_len > to_len */ 130 131 /* Allocate the array storage (from the heap if necessary) */ 132 133 if (from_len <= STRLENTHRESHOLD) 134 buffer = store; 135 else 136 buffer = (int *) malloc(radix * sizeof (int)); 137 138 /* Here's where the fun begins. We will find the minimum edit distance 139 using dynamic programming. We only need to store two rows of the matrix 140 at a time, since we always progress down the matrix. For example, 141 given the strings "one" and "two", and insert, delete and change costs 142 equal to 1: 143 144 _ o n e 145 _ 0 1 2 3 146 t 1 1 2 3 147 w 2 2 2 3 148 o 3 2 3 3 149 150 The dynamic programming recursion is defined as follows: 151 152 ar(x,0) := x * insert_cost 153 ar(0,y) := y * delete_cost 154 ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change), 155 a(x - 1, y) + insert_cost, 156 a(x, y - 1) + delete_cost, 157 a(x - 2, y - 2) + (from[x] == to[y-1] && 158 from[x-1] == to[y] ? swap_cost : 159 infinity)) 160 161 Since this only looks at most two rows and three columns back, we need 162 only store the values for the two preceding rows. In this 163 implementation, we do not explicitly store the zero column, so only 2 * 164 from_len + 2 words are needed. However, in the implementation of the 165 swap_cost check, the current matrix value is used as a buffer; we 166 can't overwrite the earlier value until the swap_cost check has 167 been performed. So we use 2 * from_len + 3 elements in the buffer. 168 */ 169 170 #define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \ 171 buffer[mod(index)])) 172 #define NW(x,y) ar(x, y, index + from_len + 2) 173 #define N(x,y) ar(x, y, index + from_len + 3) 174 #define W(x,y) ar(x, y, index + radix - 1) 175 #define NNWW(x,y) ar(x, y, index + 1) 176 #define mod(x) ((x) % radix) 177 178 index = 0; 179 180 #ifdef DEBUG_EDITDIST 181 printf(" "); 182 for (col = 0; col < from_len; col++) 183 printf(" %c ", from[col]); 184 printf("\n "); 185 186 for (col = 0; col <= from_len; col++) 187 printf("%2d ", col * del); 188 #endif 189 190 /* Row 0 is handled implicitly; its value at a given column is col*del. 191 The loop below computes the values for Row 1. At this point we know the 192 strings are nonempty. We also don't need to consider swap costs in row 193 1. 194 195 COMMENT: the indices row and col below point into the STRING, so 196 the corresponding MATRIX indices are row+1 and col+1. 197 */ 198 199 buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch)); 200 #ifdef TRN_SPEEDUP 201 low = buffer[mod(index + radix - 1)]; 202 #endif 203 204 #ifdef DEBUG_EDITDIST 205 printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]); 206 #endif 207 208 for (col = 1; col < from_len; col++) { 209 buffer[index] = min3( 210 col * del + ((from[col] == to[0]) ? 0 : ch), 211 (col + 1) * del + ins, 212 buffer[index - 1] + del); 213 #ifdef TRN_SPEEDUP 214 if (buffer[index] < low) 215 low = buffer[index]; 216 #endif 217 index++; 218 219 #ifdef DEBUG_EDITDIST 220 printf("%2d ", buffer[index - 1]); 221 #endif 222 223 } /* for col = 1 */ 224 225 #ifdef DEBUG_EDITDIST 226 printf("\n %c %2d ", to[1], 2 * ins); 227 #endif 228 229 /* Now handle the rest of the matrix */ 230 231 for (row = 1; row < to_len; row++) { 232 for (col = 0; col < from_len; col++) { 233 buffer[index] = min3( 234 NW(row, col) + ((from[col] == to[row]) ? 0 : ch), 235 N(row, col + 1) + ins, 236 W(row + 1, col) + del); 237 if (from[col] == to[row - 1] && col > 0 && 238 from[col - 1] == to[row]) 239 buffer[index] = min2(buffer[index], 240 NNWW(row - 1, col - 1) + swap_cost); 241 242 #ifdef DEBUG_EDITDIST 243 printf("%2d ", buffer[index]); 244 #endif 245 #ifdef TRN_SPEEDUP 246 if (buffer[index] < low || col == 0) 247 low = buffer[index]; 248 #endif 249 250 index = mod(index + 1); 251 } /* for col = 1 */ 252 #ifdef DEBUG_EDITDIST 253 if (row < to_len - 1) 254 printf("\n %c %2d ", to[row+1], (row + 2) * ins); 255 else 256 printf("\n"); 257 #endif 258 #ifdef TRN_SPEEDUP 259 if (low > MIN_DIST) 260 break; 261 #endif 262 } /* for row = 1 */ 263 264 row = buffer[mod(index + radix - 1)]; 265 if (buffer != store) 266 free((char *) buffer); 267 return row; 268 } /* edit_distn */