/ libpkg / diff.c
diff.c
  1  /*
  2  ** Copyright (c) 2007 D. Richard Hipp
  3  **
  4  ** This program is free software; you can redistribute it and/or
  5  ** modify it under the terms of the Simplified BSD License (also
  6  ** known as the "2-Clause License" or "FreeBSD License".)
  7  
  8  ** This program is distributed in the hope that it will be useful,
  9  ** but without any warranty; without even the implied warranty of
 10  ** merchantability or fitness for a particular purpose.
 11  **
 12  ** Author contact information:
 13  **   drh@hwaci.com
 14  **   http://www.hwaci.com/drh/
 15  **
 16  *******************************************************************************
 17  **
 18  ** This file contains code used to compute a "diff" between two
 19  ** text files.
 20  */
 21  #include <sys/types.h>
 22  
 23  #include <string.h>
 24  #include <stdlib.h>
 25  
 26  #include "private/utils.h"
 27  #include "xmalloc.h"
 28  
 29  /*
 30  ** Maximum length of a line in a text file, in bytes.  (2**13 = 8192 bytes)
 31  */
 32  #define LENGTH_MASK_SZ  13
 33  #define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
 34  
 35  /*
 36  ** Information about each line of a file being diffed.
 37  **
 38  ** The lower LENGTH_MASK_SZ bits of the hash (DLine.h) are the length
 39  ** of the line.  If any line is longer than LENGTH_MASK characters,
 40  ** the file is considered binary.
 41  */
 42  typedef struct DLine DLine;
 43  struct DLine {
 44    const char *z;        /* The text of the line */
 45    unsigned int h;       /* Hash of the line */
 46    unsigned short indent;  /* Indent of the line. Only !=0 with -w/-Z option */
 47    unsigned short n;     /* number of bytes */
 48    unsigned int iNext;   /* 1+(Index of next line with same the same hash) */
 49  
 50    /* an array of DLine elements serves two purposes.  The fields
 51    ** above are one per line of input text.  But each entry is also
 52    ** a bucket in a hash table, as follows: */
 53    unsigned int iHash;   /* 1+(first entry in the hash chain) */
 54  };
 55  
 56  /*
 57  ** Length of a dline
 58  */
 59  #define LENGTH(X)   ((X)->n)
 60  
 61  /*
 62  ** A context for running a raw diff.
 63  **
 64  ** The aEdit[] array describes the raw diff.  Each triple of integers in
 65  ** aEdit[] means:
 66  **
 67  **   (1) COPY:   Number of lines aFrom and aTo have in common
 68  **   (2) DELETE: Number of lines found only in aFrom
 69  **   (3) INSERT: Number of lines found only in aTo
 70  **
 71  ** The triples repeat until all lines of both aFrom and aTo are accounted
 72  ** for.
 73  */
 74  typedef struct DContext DContext;
 75  struct DContext {
 76    int *aEdit;        /* Array of copy/delete/insert triples */
 77    int nEdit;         /* Number of integers (3x num of triples) in aEdit[] */
 78    int nEditAlloc;    /* Space allocated for aEdit[] */
 79    DLine *aFrom;      /* File on left side of the diff */
 80    int nFrom;         /* Number of lines in aFrom[] */
 81    DLine *aTo;        /* File on right side of the diff */
 82    int nTo;           /* Number of lines in aTo[] */
 83    int (*same_fn)(const DLine *, const DLine *); /* Function to be used for comparing */
 84  };
 85  
 86  /*
 87  ** Return an array of DLine objects containing a pointer to the
 88  ** start of each line and a hash of that line.  The lower
 89  ** bits of the hash store the length of each line.
 90  **
 91  ** Trailing whitespace is removed from each line.  2010-08-20:  Not any
 92  ** more.  If trailing whitespace is ignored, the "patch" command gets
 93  ** confused by the diff output.  Ticket [a9f7b23c2e376af5b0e5b]
 94  **
 95  ** Return 0 if the file is binary or contains a line that is
 96  ** too long.
 97  **
 98  ** Profiling show that in most cases this routine consumes the bulk of
 99  ** the CPU time on a diff.
100  */
101  static DLine *break_into_lines(char *z, int *pnLine){
102    int nLine, i, j, k, s, x;
103    unsigned int h, h2;
104    DLine *a;
105  
106    int n = strlen(z);
107    /* Count the number of lines.  Allocate space to hold
108    ** the returned array.
109    */
110    for(i=j=0, nLine=1; i<n; i++, j++){
111      int c = z[i];
112      if( c==0 ){
113        return 0;
114      }
115      if( c=='\n' && z[i+1]!=0 ){
116        nLine++;
117        if( j>LENGTH_MASK ){
118          return 0;
119        }
120        j = 0;
121      }
122    }
123    if( j>LENGTH_MASK ){
124      return 0;
125    }
126    a = xcalloc(nLine, sizeof(a[0]) );
127    if( n==0 ){
128      *pnLine = 0;
129      return a;
130    }
131  
132    /* Fill in the array */
133    for(i=0; i<nLine; i++){
134      for(j=0; z[j] && z[j]!='\n'; j++){}
135      a[i].z = z;
136      k = j;
137      a[i].n = k;
138      s = 0;
139      for(h=0, x=s; x<k; x++){
140        h = h ^ (h<<2) ^ z[x];
141      }
142      a[i].indent = s;
143      a[i].h = h = (h<<LENGTH_MASK_SZ) | (k-s);
144      h2 = h % nLine;
145      a[i].iNext = a[h2].iHash;
146      a[h2].iHash = i+1;
147      z += j+1;
148    }
149  
150    /* Return results */
151    *pnLine = nLine;
152    return a;
153  }
154  
155  /*
156  ** Return true if two DLine elements are identical.
157  */
158  static int same_dline(const DLine *pA, const DLine *pB){
159    return pA->h==pB->h && memcmp(pA->z,pB->z, pA->h&LENGTH_MASK)==0;
160  }
161  
162  
163  /*
164  ** Minimum of two values
165  */
166  static int minInt(int a, int b){ return a<b ? a : b; }
167  
168  /*
169  ** Compute the optimal longest common subsequence (LCS) using an
170  ** exhaustive search.  This version of the LCS is only used for
171  ** shorter input strings since runtime is O(N*N) where N is the
172  ** input string length.
173  */
174  static void optimalLCS(
175    DContext *p,               /* Two files being compared */
176    int iS1, int iE1,          /* Range of lines in p->aFrom[] */
177    int iS2, int iE2,          /* Range of lines in p->aTo[] */
178    int *piSX, int *piEX,      /* Write p->aFrom[] common segment here */
179    int *piSY, int *piEY       /* Write p->aTo[] common segment here */
180  ){
181    int mxLength = 0;          /* Length of longest common subsequence */
182    int i, j;                  /* Loop counters */
183    int k;                     /* Length of a candidate subsequence */
184    int iSXb = iS1;            /* Best match so far */
185    int iSYb = iS2;            /* Best match so far */
186  
187    for(i=iS1; i<iE1-mxLength; i++){
188      for(j=iS2; j<iE2-mxLength; j++){
189        if( !p->same_fn(&p->aFrom[i], &p->aTo[j]) ) continue;
190        if( mxLength && !p->same_fn(&p->aFrom[i+mxLength], &p->aTo[j+mxLength]) ){
191          continue;
192        }
193        k = 1;
194        while( i+k<iE1 && j+k<iE2 && p->same_fn(&p->aFrom[i+k],&p->aTo[j+k]) ){
195          k++;
196        }
197        if( k>mxLength ){
198          iSXb = i;
199          iSYb = j;
200          mxLength = k;
201        }
202      }
203    }
204    *piSX = iSXb;
205    *piEX = iSXb + mxLength;
206    *piSY = iSYb;
207    *piEY = iSYb + mxLength;
208  }
209  
210  /*
211  ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
212  ** file and lines iS2 through iE2-1 of the aTo[] file.  Locate a sequence
213  ** of lines in these two blocks that are exactly the same.  Return
214  ** the bounds of the matching sequence.
215  **
216  ** If there are two or more possible answers of the same length, the
217  ** returned sequence should be the one closest to the center of the
218  ** input range.
219  **
220  ** Ideally, the common sequence should be the longest possible common
221  ** sequence.  However, an exact computation of LCS is O(N*N) which is
222  ** way too slow for larger files.  So this routine uses an O(N)
223  ** heuristic approximation based on hashing that usually works about
224  ** as well.  But if the O(N) algorithm doesn't get a good solution
225  ** and N is not too large, we fall back to an exact solution by
226  ** calling optimalLCS().
227  */
228  static void longestCommonSequence(
229    DContext *p,               /* Two files being compared */
230    int iS1, int iE1,          /* Range of lines in p->aFrom[] */
231    int iS2, int iE2,          /* Range of lines in p->aTo[] */
232    int *piSX, int *piEX,      /* Write p->aFrom[] common segment here */
233    int *piSY, int *piEY       /* Write p->aTo[] common segment here */
234  ){
235    int i, j, k;               /* Loop counters */
236    int n;                     /* Loop limit */
237    DLine *pA, *pB;            /* Pointers to lines */
238    int iSX, iSY, iEX, iEY;    /* Current match */
239    int skew = 0;              /* How lopsided is the match */
240    int dist = 0;              /* Distance of match from center */
241    int mid;                   /* Center of the span */
242    int iSXb, iSYb, iEXb, iEYb;   /* Best match so far */
243    int iSXp, iSYp, iEXp, iEYp;   /* Previous match */
244    int64_t bestScore;      /* Best score so far */
245    int64_t score;          /* Score for current candidate LCS */
246    int span;                     /* combined width of the input sequences */
247  
248    span = (iE1 - iS1) + (iE2 - iS2);
249    bestScore = -10000;
250    iSXb = iSXp = iS1;
251    iEXb = iEXp = iS1;
252    iSYb = iSYp = iS2;
253    iEYb = iEYp = iS2;
254    mid = (iE1 + iS1)/2;
255    for(i=iS1; i<iE1; i++){
256      int limit = 0;
257      j = p->aTo[p->aFrom[i].h % p->nTo].iHash;
258      while( j>0
259        && (j-1<iS2 || j>=iE2 || !p->same_fn(&p->aFrom[i], &p->aTo[j-1]))
260      ){
261        if( limit++ > 10 ){
262          j = 0;
263          break;
264        }
265        j = p->aTo[j-1].iNext;
266      }
267      if( j==0 ) continue;
268      if( i<iEXb && j>=iSYb && j<iEYb ) continue;
269      if( i<iEXp && j>=iSYp && j<iEYp ) continue;
270      iSX = i;
271      iSY = j-1;
272      pA = &p->aFrom[iSX-1];
273      pB = &p->aTo[iSY-1];
274      n = minInt(iSX-iS1, iSY-iS2);
275      for(k=0; k<n && p->same_fn(pA,pB); k++, pA--, pB--){}
276      iSX -= k;
277      iSY -= k;
278      iEX = i+1;
279      iEY = j;
280      pA = &p->aFrom[iEX];
281      pB = &p->aTo[iEY];
282      n = minInt(iE1-iEX, iE2-iEY);
283      for(k=0; k<n && p->same_fn(pA,pB); k++, pA++, pB++){}
284      iEX += k;
285      iEY += k;
286      skew = (iSX-iS1) - (iSY-iS2);
287      if( skew<0 ) skew = -skew;
288      dist = (iSX+iEX)/2 - mid;
289      if( dist<0 ) dist = -dist;
290      score = (iEX - iSX)*(int64_t)span - (skew + dist);
291      if( score>bestScore ){
292        bestScore = score;
293        iSXb = iSX;
294        iSYb = iSY;
295        iEXb = iEX;
296        iEYb = iEY;
297      }else if( iEX>iEXp ){
298        iSXp = iSX;
299        iSYp = iSY;
300        iEXp = iEX;
301        iEYp = iEY;
302      }
303    }
304    if( iSXb==iEXb && (iE1-iS1)*(iE2-iS2)<400 ){
305      /* If no common sequence is found using the hashing heuristic and
306      ** the input is not too big, use the expensive exact solution */
307      optimalLCS(p, iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY);
308    }else{
309      *piSX = iSXb;
310      *piSY = iSYb;
311      *piEX = iEXb;
312      *piEY = iEYb;
313    }
314  }
315  
316  /*
317  ** Expand the size of aEdit[] array to hold at least nEdit elements.
318  */
319  static void expandEdit(DContext *p, int nEdit){
320    p->aEdit = xrealloc(p->aEdit, nEdit*sizeof(int));
321    p->nEditAlloc = nEdit;
322  }
323  
324  /*
325  ** Append a new COPY/DELETE/INSERT triple.
326  */
327  static void appendTriple(DContext *p, int nCopy, int nDel, int nIns){
328    /* printf("APPEND %d/%d/%d\n", nCopy, nDel, nIns); */
329    if( p->nEdit>=3 ){
330      if( p->aEdit[p->nEdit-1]==0 ){
331        if( p->aEdit[p->nEdit-2]==0 ){
332          p->aEdit[p->nEdit-3] += nCopy;
333          p->aEdit[p->nEdit-2] += nDel;
334          p->aEdit[p->nEdit-1] += nIns;
335          return;
336        }
337        if( nCopy==0 ){
338          p->aEdit[p->nEdit-2] += nDel;
339          p->aEdit[p->nEdit-1] += nIns;
340          return;
341        }
342      }
343      if( nCopy==0 && nDel==0 ){
344        p->aEdit[p->nEdit-1] += nIns;
345        return;
346      }
347    }
348    if( p->nEdit+3>p->nEditAlloc ){
349      expandEdit(p, p->nEdit*2 + 15);
350      if( p->aEdit==0 ) return;
351    }
352    p->aEdit[p->nEdit++] = nCopy;
353    p->aEdit[p->nEdit++] = nDel;
354    p->aEdit[p->nEdit++] = nIns;
355  }
356  
357  /*
358  ** Do a single step in the difference.  Compute a sequence of
359  ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
360  ** the input into lines iS2 through iE2-1 of the output and write
361  ** that sequence into the difference context.
362  **
363  ** The algorithm is to find a block of common text near the middle
364  ** of the two segments being diffed.  Then recursively compute
365  ** differences on the blocks before and after that common segment.
366  ** Special cases apply if either input segment is empty or if the
367  ** two segments have no text in common.
368  */
369  static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
370    int iSX, iEX, iSY, iEY;
371  
372    if( iE1<=iS1 ){
373      /* The first segment is empty */
374      if( iE2>iS2 ){
375        appendTriple(p, 0, 0, iE2-iS2);
376      }
377      return;
378    }
379    if( iE2<=iS2 ){
380      /* The second segment is empty */
381      appendTriple(p, 0, iE1-iS1, 0);
382      return;
383    }
384  
385    /* Find the longest matching segment between the two sequences */
386    longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
387  
388    if( iEX>iSX ){
389      /* A common segment has been found.
390      ** Recursively diff either side of the matching segment */
391      diff_step(p, iS1, iSX, iS2, iSY);
392      if( iEX>iSX ){
393        appendTriple(p, iEX - iSX, 0, 0);
394      }
395      diff_step(p, iEX, iE1, iEY, iE2);
396    }else{
397      /* The two segments have nothing in common.  Delete the first then
398      ** insert the second. */
399      appendTriple(p, 0, iE1-iS1, iE2-iS2);
400    }
401  }
402  
403  /*
404  ** Compute the differences between two files already loaded into
405  ** the DContext structure.
406  **
407  ** A divide and conquer technique is used.  We look for a large
408  ** block of common text that is in the middle of both files.  Then
409  ** compute the difference on those parts of the file before and
410  ** after the common block.  This technique is fast, but it does
411  ** not necessarily generate the minimum difference set.  On the
412  ** other hand, we do not need a minimum difference set, only one
413  ** that makes sense to human readers, which this algorithm does.
414  **
415  ** Any common text at the beginning and end of the two files is
416  ** removed before starting the divide-and-conquer algorithm.
417  */
418  static void diff_all(DContext *p){
419    int mnE, iS, iE1, iE2;
420  
421    /* Carve off the common header and footer */
422    iE1 = p->nFrom;
423    iE2 = p->nTo;
424    while( iE1>0 && iE2>0 && p->same_fn(&p->aFrom[iE1-1], &p->aTo[iE2-1]) ){
425      iE1--;
426      iE2--;
427    }
428    mnE = iE1<iE2 ? iE1 : iE2;
429    for(iS=0; iS<mnE && p->same_fn(&p->aFrom[iS],&p->aTo[iS]); iS++){}
430  
431    /* do the difference */
432    if( iS>0 ){
433      appendTriple(p, iS, 0, 0);
434    }
435    diff_step(p, iS, iE1, iS, iE2);
436    if( iE1<p->nFrom ){
437      appendTriple(p, p->nFrom - iE1, 0, 0);
438    }
439  
440    /* Terminate the COPY/DELETE/INSERT triples with three zeros */
441    expandEdit(p, p->nEdit+3);
442    if( p->aEdit ){
443      p->aEdit[p->nEdit++] = 0;
444      p->aEdit[p->nEdit++] = 0;
445      p->aEdit[p->nEdit++] = 0;
446    }
447  }
448  
449  /*
450  ** Generate a report of the differences between files pA and pB.
451  ** If pOut is not NULL then a unified diff is appended there.  It
452  ** is assumed that pOut has already been initialized.  If pOut is
453  ** NULL, then a pointer to an array of integers is returned.
454  ** The integers come in triples.  For each triple,
455  ** the elements are the number of lines copied, the number of
456  ** lines deleted, and the number of lines inserted.  The vector
457  ** is terminated by a triple of all zeros.
458  **
459  ** This diff utility does not work on binary files.  If a binary
460  ** file is encountered, 0 is returned and pOut is written with
461  ** text "cannot compute difference between binary files".
462  */
463  int *
464  text_diff(
465    char *pA,   /* FROM file */
466    char *pB   /* TO file */
467  ){
468    DContext c;
469  
470    /* Prepare the input files */
471    memset(&c, 0, sizeof(c));
472    c.same_fn = same_dline;
473    c.aFrom = break_into_lines(pA, &c.nFrom);
474    c.aTo = break_into_lines(pB, &c.nTo);
475    if( c.aFrom==0 || c.aTo==0 ){
476      free(c.aFrom);
477      free(c.aTo);
478      return 0;
479    }
480  
481    /* Compute the difference */
482    diff_all(&c);
483      /* If a context diff is not requested, then return the
484      ** array of COPY/DELETE/INSERT triples.
485      */
486      free(c.aFrom);
487      free(c.aTo);
488      return c.aEdit;
489  }