/ src / edit_dist.c
edit_dist.c
  1  /*
  2    This edit distance code is taken from trn3.6. A few minor
  3    modifications have been made by Andrew Tridgell <tridge@samba.org>
  4    for use in spamsum.
  5  */
  6  
  7  
  8  /***************************************************************************/
  9  
 10  
 11  /* The authors make no claims as to the fitness or correctness of this software
 12   * for any use whatsoever, and it is provided as is. Any use of this software
 13   * is at the user's own risk.
 14   */
 15  
 16  #include <stdio.h>
 17  #include <unistd.h>
 18  #include <stdlib.h>
 19  
 20  /* edit_dist -- returns the minimum edit distance between two strings
 21  
 22  	Program by:  Mark Maimone   CMU Computer Science   13 Nov 89
 23  	Last Modified:  28 Jan 90
 24  
 25     If the input strings have length n and m, the algorithm runs in time
 26     O(nm) and space O(min(m,n)).
 27  
 28  HISTORY
 29     13 Nov 89 (mwm) Created edit_dist() and set_costs().
 30  
 31     28 Jan 90 (mwm) Added view_costs().  Should verify that THRESHOLD
 32     computations will work even when THRESHOLD is not a multiple of
 33     sizeof(int).
 34  
 35     17 May 93 (mwm) Improved performance when used with trn's newsgroup
 36     processing; assume all costs are 1, and you can terminate when a
 37     threshold is exceeded.
 38  */
 39  
 40  #define MIN_DIST 100
 41  
 42  #define	TRN_SPEEDUP		/* Use a less-general version of the
 43  				   routine, one that's better for trn.
 44  				   All change costs are 1, and it's okay
 45  				   to terminate if the edit distance is
 46  				   known to exceed MIN_DIST */
 47  
 48  #define THRESHOLD 4000		/* worry about allocating more memory only
 49  				   when this # of bytes is exceeded */
 50  #define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))
 51  
 52  #define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))
 53  
 54  #define swap_int(x,y)  (_iswap = (x), (x) = (y), (y) = _iswap)
 55  #define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
 56  #define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my))
 57  #define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))
 58  
 59  
 60  static int insert_cost = 1;
 61  static int delete_cost = 1;
 62  #ifndef TRN_SPEEDUP
 63  static int change_cost = 1;
 64  static int swap_cost   = 1;
 65  #endif
 66  
 67  static int _iswap;			/* swap_int temp variable */
 68  static char *_cswap;			/* swap_char temp variable */
 69  static int _mx, _my, _mz;		/* min2, min3 temp variables */
 70  
 71  
 72  
 73  /* edit_distn -- returns the edit distance between two strings, or -1 on
 74     failure */
 75  
 76  int
 77  edit_distn(from, from_len, to, to_len)
 78  char *from, *to;
 79  register int from_len, to_len;
 80  {
 81  #ifndef TRN_SPEEDUP
 82      register int ins, del, ch;	  	/* local copies of edit costs */
 83  #endif
 84      register int row, col, index;	/* dynamic programming counters */
 85      register int radix;			/* radix for modular indexing */
 86  #ifdef TRN_SPEEDUP
 87      register int low;
 88  #endif
 89      int *buffer;			/* pointer to storage for one row
 90  					   of the d.p. array */
 91      static int store[THRESHOLD / sizeof (int)];
 92  					/* a small amount of static
 93  					   storage, to be used when the
 94  					   input strings are small enough */
 95  
 96  /* Handle trivial cases when one string is empty */
 97  
 98      if (from == NULL || !from_len)
 99  	if (to == NULL || !to_len)
100  	    return 0;
101  	else
102  	    return to_len * insert_cost;
103      else if (to == NULL || !to_len)
104  	return from_len * delete_cost;
105  
106  /* Initialize registers */
107  
108      radix = 2 * from_len + 3;
109  #ifdef TRN_SPEEDUP
110  #define ins 1
111  #define del 1
112  #define ch 3
113  #define swap_cost 5
114  #else
115      ins  = insert_cost;
116      del  = delete_cost;
117      ch   = change_cost;
118  #endif
119  
120  /* Make   from   short enough to fit in the static storage, if it's at all
121     possible */
122  
123      if (from_len > to_len && from_len > STRLENTHRESHOLD) {
124  	swap_int(from_len, to_len);
125  	swap_char(from, to);
126  #ifndef TRN_SPEEDUP
127  	swap_int(ins, del);
128  #endif
129      } /* if from_len > to_len */
130  
131  /* Allocate the array storage (from the heap if necessary) */
132  
133      if (from_len <= STRLENTHRESHOLD)
134  	buffer = store;
135      else
136  	buffer = (int *) malloc(radix * sizeof (int));
137  
138  /* Here's where the fun begins.  We will find the minimum edit distance
139     using dynamic programming.  We only need to store two rows of the matrix
140     at a time, since we always progress down the matrix.  For example,
141     given the strings "one" and "two", and insert, delete and change costs
142     equal to 1:
143  
144  	   _  o  n  e
145  	_  0  1  2  3
146  	t  1  1  2  3
147  	w  2  2  2  3
148  	o  3  2  3  3
149  
150     The dynamic programming recursion is defined as follows:
151  
152  	ar(x,0) := x * insert_cost
153  	ar(0,y) := y * delete_cost
154  	ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
155  		       a(x - 1, y) + insert_cost,
156  		       a(x, y - 1) + delete_cost,
157  		       a(x - 2, y - 2) + (from[x] == to[y-1] &&
158  					  from[x-1] == to[y] ? swap_cost :
159  					  infinity))
160  
161     Since this only looks at most two rows and three columns back, we need
162     only store the values for the two preceding rows.  In this
163     implementation, we do not explicitly store the zero column, so only 2 *
164     from_len + 2   words are needed.  However, in the implementation of the
165     swap_cost   check, the current matrix value is used as a buffer; we
166     can't overwrite the earlier value until the   swap_cost   check has
167     been performed.  So we use   2 * from_len + 3   elements in the buffer.
168  */
169  
170  #define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
171  	buffer[mod(index)]))
172  #define NW(x,y)	  ar(x, y, index + from_len + 2)
173  #define N(x,y)	  ar(x, y, index + from_len + 3)
174  #define W(x,y)	  ar(x, y, index + radix - 1)
175  #define NNWW(x,y) ar(x, y, index + 1)
176  #define mod(x) ((x) % radix)
177  
178      index = 0;
179  
180  #ifdef DEBUG_EDITDIST
181      printf("      ");
182      for (col = 0; col < from_len; col++)
183  	printf(" %c ", from[col]);
184      printf("\n   ");
185  
186      for (col = 0; col <= from_len; col++)
187  	printf("%2d ", col * del);
188  #endif
189  
190  /* Row 0 is handled implicitly; its value at a given column is   col*del.
191     The loop below computes the values for Row 1.  At this point we know the
192     strings are nonempty.  We also don't need to consider swap costs in row
193     1.
194  
195     COMMENT: the indices row and col below point into the STRING, so
196     the corresponding MATRIX indices are row+1 and col+1.
197  */
198  
199      buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
200  #ifdef TRN_SPEEDUP
201      low = buffer[mod(index + radix - 1)];
202  #endif
203  
204  #ifdef DEBUG_EDITDIST
205      printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
206  #endif
207  
208      for (col = 1; col < from_len; col++) {
209  	buffer[index] = min3(
210  		col * del + ((from[col] == to[0]) ? 0 : ch),
211  		(col + 1) * del + ins,
212  		buffer[index - 1] + del);
213  #ifdef TRN_SPEEDUP
214  	if (buffer[index] < low)
215  	    low = buffer[index];
216  #endif
217  	index++;
218  
219  #ifdef DEBUG_EDITDIST
220  	printf("%2d ", buffer[index - 1]);
221  #endif
222  
223      } /* for col = 1 */
224  
225  #ifdef DEBUG_EDITDIST
226      printf("\n %c %2d ", to[1], 2 * ins);
227  #endif
228  
229  /* Now handle the rest of the matrix */
230  
231      for (row = 1; row < to_len; row++) {
232  	for (col = 0; col < from_len; col++) {
233  	    buffer[index] = min3(
234  		    NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
235  		    N(row, col + 1) + ins,
236  		    W(row + 1, col) + del);
237  	    if (from[col] == to[row - 1] && col > 0 &&
238  		    from[col - 1] == to[row])
239  		buffer[index] = min2(buffer[index],
240  			NNWW(row - 1, col - 1) + swap_cost);
241  
242  #ifdef DEBUG_EDITDIST
243  	    printf("%2d ", buffer[index]);
244  #endif
245  #ifdef TRN_SPEEDUP
246  	    if (buffer[index] < low || col == 0)
247  		low = buffer[index];
248  #endif
249  
250  	    index = mod(index + 1);
251  	} /* for col = 1 */
252  #ifdef DEBUG_EDITDIST
253  	if (row < to_len - 1)
254  	    printf("\n %c %2d ", to[row+1], (row + 2) * ins);
255  	else
256  	    printf("\n");
257  #endif
258  #ifdef TRN_SPEEDUP
259  	if (low > MIN_DIST)
260  	    break;
261  #endif
262      } /* for row = 1 */
263  
264      row = buffer[mod(index + radix - 1)];
265      if (buffer != store)
266  	free((char *) buffer);
267      return row;
268  } /* edit_distn */