diff.c
1 /* 2 ** Copyright (c) 2007 D. Richard Hipp 3 ** 4 ** This program is free software; you can redistribute it and/or 5 ** modify it under the terms of the Simplified BSD License (also 6 ** known as the "2-Clause License" or "FreeBSD License".) 7 8 ** This program is distributed in the hope that it will be useful, 9 ** but without any warranty; without even the implied warranty of 10 ** merchantability or fitness for a particular purpose. 11 ** 12 ** Author contact information: 13 ** drh@hwaci.com 14 ** http://www.hwaci.com/drh/ 15 ** 16 ******************************************************************************* 17 ** 18 ** This file contains code used to compute a "diff" between two 19 ** text files. 20 */ 21 #include <sys/types.h> 22 23 #include <string.h> 24 #include <stdlib.h> 25 26 #include "private/utils.h" 27 #include "xmalloc.h" 28 29 /* 30 ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) 31 */ 32 #define LENGTH_MASK_SZ 13 33 #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) 34 35 /* 36 ** Information about each line of a file being diffed. 37 ** 38 ** The lower LENGTH_MASK_SZ bits of the hash (DLine.h) are the length 39 ** of the line. If any line is longer than LENGTH_MASK characters, 40 ** the file is considered binary. 41 */ 42 typedef struct DLine DLine; 43 struct DLine { 44 const char *z; /* The text of the line */ 45 unsigned int h; /* Hash of the line */ 46 unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */ 47 unsigned short n; /* number of bytes */ 48 unsigned int iNext; /* 1+(Index of next line with same the same hash) */ 49 50 /* an array of DLine elements serves two purposes. The fields 51 ** above are one per line of input text. But each entry is also 52 ** a bucket in a hash table, as follows: */ 53 unsigned int iHash; /* 1+(first entry in the hash chain) */ 54 }; 55 56 /* 57 ** Length of a dline 58 */ 59 #define LENGTH(X) ((X)->n) 60 61 /* 62 ** A context for running a raw diff. 63 ** 64 ** The aEdit[] array describes the raw diff. Each triple of integers in 65 ** aEdit[] means: 66 ** 67 ** (1) COPY: Number of lines aFrom and aTo have in common 68 ** (2) DELETE: Number of lines found only in aFrom 69 ** (3) INSERT: Number of lines found only in aTo 70 ** 71 ** The triples repeat until all lines of both aFrom and aTo are accounted 72 ** for. 73 */ 74 typedef struct DContext DContext; 75 struct DContext { 76 int *aEdit; /* Array of copy/delete/insert triples */ 77 int nEdit; /* Number of integers (3x num of triples) in aEdit[] */ 78 int nEditAlloc; /* Space allocated for aEdit[] */ 79 DLine *aFrom; /* File on left side of the diff */ 80 int nFrom; /* Number of lines in aFrom[] */ 81 DLine *aTo; /* File on right side of the diff */ 82 int nTo; /* Number of lines in aTo[] */ 83 int (*same_fn)(const DLine *, const DLine *); /* Function to be used for comparing */ 84 }; 85 86 /* 87 ** Return an array of DLine objects containing a pointer to the 88 ** start of each line and a hash of that line. The lower 89 ** bits of the hash store the length of each line. 90 ** 91 ** Trailing whitespace is removed from each line. 2010-08-20: Not any 92 ** more. If trailing whitespace is ignored, the "patch" command gets 93 ** confused by the diff output. Ticket [a9f7b23c2e376af5b0e5b] 94 ** 95 ** Return 0 if the file is binary or contains a line that is 96 ** too long. 97 ** 98 ** Profiling show that in most cases this routine consumes the bulk of 99 ** the CPU time on a diff. 100 */ 101 static DLine *break_into_lines(char *z, int *pnLine){ 102 int nLine, i, j, k, s, x; 103 unsigned int h, h2; 104 DLine *a; 105 106 int n = strlen(z); 107 /* Count the number of lines. Allocate space to hold 108 ** the returned array. 109 */ 110 for(i=j=0, nLine=1; i<n; i++, j++){ 111 int c = z[i]; 112 if( c==0 ){ 113 return 0; 114 } 115 if( c=='\n' && z[i+1]!=0 ){ 116 nLine++; 117 if( j>LENGTH_MASK ){ 118 return 0; 119 } 120 j = 0; 121 } 122 } 123 if( j>LENGTH_MASK ){ 124 return 0; 125 } 126 a = xcalloc(nLine, sizeof(a[0]) ); 127 if( n==0 ){ 128 *pnLine = 0; 129 return a; 130 } 131 132 /* Fill in the array */ 133 for(i=0; i<nLine; i++){ 134 for(j=0; z[j] && z[j]!='\n'; j++){} 135 a[i].z = z; 136 k = j; 137 a[i].n = k; 138 s = 0; 139 for(h=0, x=s; x<k; x++){ 140 h = h ^ (h<<2) ^ z[x]; 141 } 142 a[i].indent = s; 143 a[i].h = h = (h<<LENGTH_MASK_SZ) | (k-s); 144 h2 = h % nLine; 145 a[i].iNext = a[h2].iHash; 146 a[h2].iHash = i+1; 147 z += j+1; 148 } 149 150 /* Return results */ 151 *pnLine = nLine; 152 return a; 153 } 154 155 /* 156 ** Return true if two DLine elements are identical. 157 */ 158 static int same_dline(const DLine *pA, const DLine *pB){ 159 return pA->h==pB->h && memcmp(pA->z,pB->z, pA->h&LENGTH_MASK)==0; 160 } 161 162 163 /* 164 ** Minimum of two values 165 */ 166 static int minInt(int a, int b){ return a<b ? a : b; } 167 168 /* 169 ** Compute the optimal longest common subsequence (LCS) using an 170 ** exhaustive search. This version of the LCS is only used for 171 ** shorter input strings since runtime is O(N*N) where N is the 172 ** input string length. 173 */ 174 static void optimalLCS( 175 DContext *p, /* Two files being compared */ 176 int iS1, int iE1, /* Range of lines in p->aFrom[] */ 177 int iS2, int iE2, /* Range of lines in p->aTo[] */ 178 int *piSX, int *piEX, /* Write p->aFrom[] common segment here */ 179 int *piSY, int *piEY /* Write p->aTo[] common segment here */ 180 ){ 181 int mxLength = 0; /* Length of longest common subsequence */ 182 int i, j; /* Loop counters */ 183 int k; /* Length of a candidate subsequence */ 184 int iSXb = iS1; /* Best match so far */ 185 int iSYb = iS2; /* Best match so far */ 186 187 for(i=iS1; i<iE1-mxLength; i++){ 188 for(j=iS2; j<iE2-mxLength; j++){ 189 if( !p->same_fn(&p->aFrom[i], &p->aTo[j]) ) continue; 190 if( mxLength && !p->same_fn(&p->aFrom[i+mxLength], &p->aTo[j+mxLength]) ){ 191 continue; 192 } 193 k = 1; 194 while( i+k<iE1 && j+k<iE2 && p->same_fn(&p->aFrom[i+k],&p->aTo[j+k]) ){ 195 k++; 196 } 197 if( k>mxLength ){ 198 iSXb = i; 199 iSYb = j; 200 mxLength = k; 201 } 202 } 203 } 204 *piSX = iSXb; 205 *piEX = iSXb + mxLength; 206 *piSY = iSYb; 207 *piEY = iSYb + mxLength; 208 } 209 210 /* 211 ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[] 212 ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence 213 ** of lines in these two blocks that are exactly the same. Return 214 ** the bounds of the matching sequence. 215 ** 216 ** If there are two or more possible answers of the same length, the 217 ** returned sequence should be the one closest to the center of the 218 ** input range. 219 ** 220 ** Ideally, the common sequence should be the longest possible common 221 ** sequence. However, an exact computation of LCS is O(N*N) which is 222 ** way too slow for larger files. So this routine uses an O(N) 223 ** heuristic approximation based on hashing that usually works about 224 ** as well. But if the O(N) algorithm doesn't get a good solution 225 ** and N is not too large, we fall back to an exact solution by 226 ** calling optimalLCS(). 227 */ 228 static void longestCommonSequence( 229 DContext *p, /* Two files being compared */ 230 int iS1, int iE1, /* Range of lines in p->aFrom[] */ 231 int iS2, int iE2, /* Range of lines in p->aTo[] */ 232 int *piSX, int *piEX, /* Write p->aFrom[] common segment here */ 233 int *piSY, int *piEY /* Write p->aTo[] common segment here */ 234 ){ 235 int i, j, k; /* Loop counters */ 236 int n; /* Loop limit */ 237 DLine *pA, *pB; /* Pointers to lines */ 238 int iSX, iSY, iEX, iEY; /* Current match */ 239 int skew = 0; /* How lopsided is the match */ 240 int dist = 0; /* Distance of match from center */ 241 int mid; /* Center of the span */ 242 int iSXb, iSYb, iEXb, iEYb; /* Best match so far */ 243 int iSXp, iSYp, iEXp, iEYp; /* Previous match */ 244 int64_t bestScore; /* Best score so far */ 245 int64_t score; /* Score for current candidate LCS */ 246 int span; /* combined width of the input sequences */ 247 248 span = (iE1 - iS1) + (iE2 - iS2); 249 bestScore = -10000; 250 iSXb = iSXp = iS1; 251 iEXb = iEXp = iS1; 252 iSYb = iSYp = iS2; 253 iEYb = iEYp = iS2; 254 mid = (iE1 + iS1)/2; 255 for(i=iS1; i<iE1; i++){ 256 int limit = 0; 257 j = p->aTo[p->aFrom[i].h % p->nTo].iHash; 258 while( j>0 259 && (j-1<iS2 || j>=iE2 || !p->same_fn(&p->aFrom[i], &p->aTo[j-1])) 260 ){ 261 if( limit++ > 10 ){ 262 j = 0; 263 break; 264 } 265 j = p->aTo[j-1].iNext; 266 } 267 if( j==0 ) continue; 268 if( i<iEXb && j>=iSYb && j<iEYb ) continue; 269 if( i<iEXp && j>=iSYp && j<iEYp ) continue; 270 iSX = i; 271 iSY = j-1; 272 pA = &p->aFrom[iSX-1]; 273 pB = &p->aTo[iSY-1]; 274 n = minInt(iSX-iS1, iSY-iS2); 275 for(k=0; k<n && p->same_fn(pA,pB); k++, pA--, pB--){} 276 iSX -= k; 277 iSY -= k; 278 iEX = i+1; 279 iEY = j; 280 pA = &p->aFrom[iEX]; 281 pB = &p->aTo[iEY]; 282 n = minInt(iE1-iEX, iE2-iEY); 283 for(k=0; k<n && p->same_fn(pA,pB); k++, pA++, pB++){} 284 iEX += k; 285 iEY += k; 286 skew = (iSX-iS1) - (iSY-iS2); 287 if( skew<0 ) skew = -skew; 288 dist = (iSX+iEX)/2 - mid; 289 if( dist<0 ) dist = -dist; 290 score = (iEX - iSX)*(int64_t)span - (skew + dist); 291 if( score>bestScore ){ 292 bestScore = score; 293 iSXb = iSX; 294 iSYb = iSY; 295 iEXb = iEX; 296 iEYb = iEY; 297 }else if( iEX>iEXp ){ 298 iSXp = iSX; 299 iSYp = iSY; 300 iEXp = iEX; 301 iEYp = iEY; 302 } 303 } 304 if( iSXb==iEXb && (iE1-iS1)*(iE2-iS2)<400 ){ 305 /* If no common sequence is found using the hashing heuristic and 306 ** the input is not too big, use the expensive exact solution */ 307 optimalLCS(p, iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY); 308 }else{ 309 *piSX = iSXb; 310 *piSY = iSYb; 311 *piEX = iEXb; 312 *piEY = iEYb; 313 } 314 } 315 316 /* 317 ** Expand the size of aEdit[] array to hold at least nEdit elements. 318 */ 319 static void expandEdit(DContext *p, int nEdit){ 320 p->aEdit = xrealloc(p->aEdit, nEdit*sizeof(int)); 321 p->nEditAlloc = nEdit; 322 } 323 324 /* 325 ** Append a new COPY/DELETE/INSERT triple. 326 */ 327 static void appendTriple(DContext *p, int nCopy, int nDel, int nIns){ 328 /* printf("APPEND %d/%d/%d\n", nCopy, nDel, nIns); */ 329 if( p->nEdit>=3 ){ 330 if( p->aEdit[p->nEdit-1]==0 ){ 331 if( p->aEdit[p->nEdit-2]==0 ){ 332 p->aEdit[p->nEdit-3] += nCopy; 333 p->aEdit[p->nEdit-2] += nDel; 334 p->aEdit[p->nEdit-1] += nIns; 335 return; 336 } 337 if( nCopy==0 ){ 338 p->aEdit[p->nEdit-2] += nDel; 339 p->aEdit[p->nEdit-1] += nIns; 340 return; 341 } 342 } 343 if( nCopy==0 && nDel==0 ){ 344 p->aEdit[p->nEdit-1] += nIns; 345 return; 346 } 347 } 348 if( p->nEdit+3>p->nEditAlloc ){ 349 expandEdit(p, p->nEdit*2 + 15); 350 if( p->aEdit==0 ) return; 351 } 352 p->aEdit[p->nEdit++] = nCopy; 353 p->aEdit[p->nEdit++] = nDel; 354 p->aEdit[p->nEdit++] = nIns; 355 } 356 357 /* 358 ** Do a single step in the difference. Compute a sequence of 359 ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of 360 ** the input into lines iS2 through iE2-1 of the output and write 361 ** that sequence into the difference context. 362 ** 363 ** The algorithm is to find a block of common text near the middle 364 ** of the two segments being diffed. Then recursively compute 365 ** differences on the blocks before and after that common segment. 366 ** Special cases apply if either input segment is empty or if the 367 ** two segments have no text in common. 368 */ 369 static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){ 370 int iSX, iEX, iSY, iEY; 371 372 if( iE1<=iS1 ){ 373 /* The first segment is empty */ 374 if( iE2>iS2 ){ 375 appendTriple(p, 0, 0, iE2-iS2); 376 } 377 return; 378 } 379 if( iE2<=iS2 ){ 380 /* The second segment is empty */ 381 appendTriple(p, 0, iE1-iS1, 0); 382 return; 383 } 384 385 /* Find the longest matching segment between the two sequences */ 386 longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); 387 388 if( iEX>iSX ){ 389 /* A common segment has been found. 390 ** Recursively diff either side of the matching segment */ 391 diff_step(p, iS1, iSX, iS2, iSY); 392 if( iEX>iSX ){ 393 appendTriple(p, iEX - iSX, 0, 0); 394 } 395 diff_step(p, iEX, iE1, iEY, iE2); 396 }else{ 397 /* The two segments have nothing in common. Delete the first then 398 ** insert the second. */ 399 appendTriple(p, 0, iE1-iS1, iE2-iS2); 400 } 401 } 402 403 /* 404 ** Compute the differences between two files already loaded into 405 ** the DContext structure. 406 ** 407 ** A divide and conquer technique is used. We look for a large 408 ** block of common text that is in the middle of both files. Then 409 ** compute the difference on those parts of the file before and 410 ** after the common block. This technique is fast, but it does 411 ** not necessarily generate the minimum difference set. On the 412 ** other hand, we do not need a minimum difference set, only one 413 ** that makes sense to human readers, which this algorithm does. 414 ** 415 ** Any common text at the beginning and end of the two files is 416 ** removed before starting the divide-and-conquer algorithm. 417 */ 418 static void diff_all(DContext *p){ 419 int mnE, iS, iE1, iE2; 420 421 /* Carve off the common header and footer */ 422 iE1 = p->nFrom; 423 iE2 = p->nTo; 424 while( iE1>0 && iE2>0 && p->same_fn(&p->aFrom[iE1-1], &p->aTo[iE2-1]) ){ 425 iE1--; 426 iE2--; 427 } 428 mnE = iE1<iE2 ? iE1 : iE2; 429 for(iS=0; iS<mnE && p->same_fn(&p->aFrom[iS],&p->aTo[iS]); iS++){} 430 431 /* do the difference */ 432 if( iS>0 ){ 433 appendTriple(p, iS, 0, 0); 434 } 435 diff_step(p, iS, iE1, iS, iE2); 436 if( iE1<p->nFrom ){ 437 appendTriple(p, p->nFrom - iE1, 0, 0); 438 } 439 440 /* Terminate the COPY/DELETE/INSERT triples with three zeros */ 441 expandEdit(p, p->nEdit+3); 442 if( p->aEdit ){ 443 p->aEdit[p->nEdit++] = 0; 444 p->aEdit[p->nEdit++] = 0; 445 p->aEdit[p->nEdit++] = 0; 446 } 447 } 448 449 /* 450 ** Generate a report of the differences between files pA and pB. 451 ** If pOut is not NULL then a unified diff is appended there. It 452 ** is assumed that pOut has already been initialized. If pOut is 453 ** NULL, then a pointer to an array of integers is returned. 454 ** The integers come in triples. For each triple, 455 ** the elements are the number of lines copied, the number of 456 ** lines deleted, and the number of lines inserted. The vector 457 ** is terminated by a triple of all zeros. 458 ** 459 ** This diff utility does not work on binary files. If a binary 460 ** file is encountered, 0 is returned and pOut is written with 461 ** text "cannot compute difference between binary files". 462 */ 463 int * 464 text_diff( 465 char *pA, /* FROM file */ 466 char *pB /* TO file */ 467 ){ 468 DContext c; 469 470 /* Prepare the input files */ 471 memset(&c, 0, sizeof(c)); 472 c.same_fn = same_dline; 473 c.aFrom = break_into_lines(pA, &c.nFrom); 474 c.aTo = break_into_lines(pB, &c.nTo); 475 if( c.aFrom==0 || c.aTo==0 ){ 476 free(c.aFrom); 477 free(c.aTo); 478 return 0; 479 } 480 481 /* Compute the difference */ 482 diff_all(&c); 483 /* If a context diff is not requested, then return the 484 ** array of COPY/DELETE/INSERT triples. 485 */ 486 free(c.aFrom); 487 free(c.aTo); 488 return c.aEdit; 489 }