/ zlib / examples / gzjoin.c
gzjoin.c
  1  /* gzjoin -- command to join gzip files into one gzip file
  2  
  3    Copyright (C) 2004, 2005, 2012 Mark Adler, all rights reserved
  4    version 1.2, 14 Aug 2012
  5  
  6    This software is provided 'as-is', without any express or implied
  7    warranty.  In no event will the author be held liable for any damages
  8    arising from the use of this software.
  9  
 10    Permission is granted to anyone to use this software for any purpose,
 11    including commercial applications, and to alter it and redistribute it
 12    freely, subject to the following restrictions:
 13  
 14    1. The origin of this software must not be misrepresented; you must not
 15       claim that you wrote the original software. If you use this software
 16       in a product, an acknowledgment in the product documentation would be
 17       appreciated but is not required.
 18    2. Altered source versions must be plainly marked as such, and must not be
 19       misrepresented as being the original software.
 20    3. This notice may not be removed or altered from any source distribution.
 21  
 22    Mark Adler    madler@alumni.caltech.edu
 23   */
 24  
 25  /*
 26   * Change history:
 27   *
 28   * 1.0  11 Dec 2004     - First version
 29   * 1.1  12 Jun 2005     - Changed ssize_t to long for portability
 30   * 1.2  14 Aug 2012     - Clean up for z_const usage
 31   */
 32  
 33  /*
 34     gzjoin takes one or more gzip files on the command line and writes out a
 35     single gzip file that will uncompress to the concatenation of the
 36     uncompressed data from the individual gzip files.  gzjoin does this without
 37     having to recompress any of the data and without having to calculate a new
 38     crc32 for the concatenated uncompressed data.  gzjoin does however have to
 39     decompress all of the input data in order to find the bits in the compressed
 40     data that need to be modified to concatenate the streams.
 41  
 42     gzjoin does not do an integrity check on the input gzip files other than
 43     checking the gzip header and decompressing the compressed data.  They are
 44     otherwise assumed to be complete and correct.
 45  
 46     Each joint between gzip files removes at least 18 bytes of previous trailer
 47     and subsequent header, and inserts an average of about three bytes to the
 48     compressed data in order to connect the streams.  The output gzip file
 49     has a minimal ten-byte gzip header with no file name or modification time.
 50  
 51     This program was written to illustrate the use of the Z_BLOCK option of
 52     inflate() and the crc32_combine() function.  gzjoin will not compile with
 53     versions of zlib earlier than 1.2.3.
 54   */
 55  
 56  #include <stdio.h>      /* fputs(), fprintf(), fwrite(), putc() */
 57  #include <stdlib.h>     /* exit(), malloc(), free() */
 58  #include <fcntl.h>      /* open() */
 59  #include <unistd.h>     /* close(), read(), lseek() */
 60  #include "zlib.h"
 61      /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
 62  
 63  #define local static
 64  
 65  /* exit with an error (return a value to allow use in an expression) */
 66  local int bail(char *why1, char *why2)
 67  {
 68      fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
 69      exit(1);
 70      return 0;
 71  }
 72  
 73  /* -- simple buffered file input with access to the buffer -- */
 74  
 75  #define CHUNK 32768         /* must be a power of two and fit in unsigned */
 76  
 77  /* bin buffered input file type */
 78  typedef struct {
 79      char *name;             /* name of file for error messages */
 80      int fd;                 /* file descriptor */
 81      unsigned left;          /* bytes remaining at next */
 82      unsigned char *next;    /* next byte to read */
 83      unsigned char *buf;     /* allocated buffer of length CHUNK */
 84  } bin;
 85  
 86  /* close a buffered file and free allocated memory */
 87  local void bclose(bin *in)
 88  {
 89      if (in != NULL) {
 90          if (in->fd != -1)
 91              close(in->fd);
 92          if (in->buf != NULL)
 93              free(in->buf);
 94          free(in);
 95      }
 96  }
 97  
 98  /* open a buffered file for input, return a pointer to type bin, or NULL on
 99     failure */
100  local bin *bopen(char *name)
101  {
102      bin *in;
103  
104      in = malloc(sizeof(bin));
105      if (in == NULL)
106          return NULL;
107      in->buf = malloc(CHUNK);
108      in->fd = open(name, O_RDONLY, 0);
109      if (in->buf == NULL || in->fd == -1) {
110          bclose(in);
111          return NULL;
112      }
113      in->left = 0;
114      in->next = in->buf;
115      in->name = name;
116      return in;
117  }
118  
119  /* load buffer from file, return -1 on read error, 0 or 1 on success, with
120     1 indicating that end-of-file was reached */
121  local int bload(bin *in)
122  {
123      long len;
124  
125      if (in == NULL)
126          return -1;
127      if (in->left != 0)
128          return 0;
129      in->next = in->buf;
130      do {
131          len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
132          if (len < 0)
133              return -1;
134          in->left += (unsigned)len;
135      } while (len != 0 && in->left < CHUNK);
136      return len == 0 ? 1 : 0;
137  }
138  
/* get a byte from the file, bail if end of file.  The buffer is reloaded
   first if it is empty.  Evaluates to the byte as an int in the range 0..255.
   The argument is evaluated several times, so it must not have side effects;
   it is parenthesized at every use so that any pointer-valued expression
   (such as &file) can be passed, not just a plain identifier. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                    bail("unexpected end of file on ", (in)->name))
143  
144  /* get a four-byte little-endian unsigned integer from file */
145  local unsigned long bget4(bin *in)
146  {
147      unsigned long val;
148  
149      val = bget(in);
150      val += (unsigned long)(bget(in)) << 8;
151      val += (unsigned long)(bget(in)) << 16;
152      val += (unsigned long)(bget(in)) << 24;
153      return val;
154  }
155  
156  /* skip bytes in file */
157  local void bskip(bin *in, unsigned skip)
158  {
159      /* check pointer */
160      if (in == NULL)
161          return;
162  
163      /* easy case -- skip bytes in buffer */
164      if (skip <= in->left) {
165          in->left -= skip;
166          in->next += skip;
167          return;
168      }
169  
170      /* skip what's in buffer, discard buffer contents */
171      skip -= in->left;
172      in->left = 0;
173  
174      /* seek past multiples of CHUNK bytes */
175      if (skip > CHUNK) {
176          unsigned left;
177  
178          left = skip & (CHUNK - 1);
179          if (left == 0) {
180              /* exact number of chunks: seek all the way minus one byte to check
181                 for end-of-file with a read */
182              lseek(in->fd, skip - 1, SEEK_CUR);
183              if (read(in->fd, in->buf, 1) != 1)
184                  bail("unexpected end of file on ", in->name);
185              return;
186          }
187  
188          /* skip the integral chunks, update skip with remainder */
189          lseek(in->fd, skip - left, SEEK_CUR);
190          skip = left;
191      }
192  
193      /* read more input and skip remainder */
194      bload(in);
195      if (skip > in->left)
196          bail("unexpected end of file on ", in->name);
197      in->left -= skip;
198      in->next += skip;
199  }
200  
201  /* -- end of buffered input functions -- */
202  
203  /* skip the gzip header from file in */
204  local void gzhead(bin *in)
205  {
206      int flags;
207  
208      /* verify gzip magic header and compression method */
209      if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
210          bail(in->name, " is not a valid gzip file");
211  
212      /* get and verify flags */
213      flags = bget(in);
214      if ((flags & 0xe0) != 0)
215          bail("unknown reserved bits set in ", in->name);
216  
217      /* skip modification time, extra flags, and os */
218      bskip(in, 6);
219  
220      /* skip extra field if present */
221      if (flags & 4) {
222          unsigned len;
223  
224          len = bget(in);
225          len += (unsigned)(bget(in)) << 8;
226          bskip(in, len);
227      }
228  
229      /* skip file name if present */
230      if (flags & 8)
231          while (bget(in) != 0)
232              ;
233  
234      /* skip comment if present */
235      if (flags & 16)
236          while (bget(in) != 0)
237              ;
238  
239      /* skip header crc if present */
240      if (flags & 2)
241          bskip(in, 2);
242  }
243  
/* Write the low 32 bits of val to out as four bytes, least significant byte
   first.  Write errors are not reported here. */
local void put4(unsigned long val, FILE *out)
{
    int n;

    for (n = 0; n < 4; n++) {
        putc((int)(val & 0xff), out);
        val >>= 8;
    }
}
252  
253  /* Load up zlib stream from buffered input, bail if end of file */
254  local void zpull(z_streamp strm, bin *in)
255  {
256      if (in->left == 0)
257          bload(in);
258      if (in->left == 0)
259          bail("unexpected end of file on ", in->name);
260      strm->avail_in = in->left;
261      strm->next_in = in->next;
262  }
263  
264  /* Write header for gzip file to out and initialize trailer. */
265  local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
266  {
267      fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
268      *crc = crc32(0L, Z_NULL, 0);
269      *tot = 0;
270  }
271  
272  /* Copy the compressed data from name, zeroing the last block bit of the last
273     block if clr is true, and adding empty blocks as needed to get to a byte
274     boundary.  If clr is false, then the last block becomes the last block of
275     the output, and the gzip trailer is written.  crc and tot maintains the
276     crc and length (modulo 2^32) of the output for the trailer.  The resulting
277     gzip file is written to out.  gzinit() must be called before the first call
278     of gzcopy() to write the gzip header and to initialize crc and tot. */
279  local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
280                    FILE *out)
281  {
282      int ret;                /* return value from zlib functions */
283      int pos;                /* where the "last block" bit is in byte */
284      int last;               /* true if processing the last block */
285      bin *in;                /* buffered input file */
286      unsigned char *start;   /* start of compressed data in buffer */
287      unsigned char *junk;    /* buffer for uncompressed data -- discarded */
288      z_off_t len;            /* length of uncompressed data (support > 4 GB) */
289      z_stream strm;          /* zlib inflate stream */
290  
291      /* open gzip file and skip header */
292      in = bopen(name);
293      if (in == NULL)
294          bail("could not open ", name);
295      gzhead(in);
296  
297      /* allocate buffer for uncompressed data and initialize raw inflate
298         stream */
299      junk = malloc(CHUNK);
300      strm.zalloc = Z_NULL;
301      strm.zfree = Z_NULL;
302      strm.opaque = Z_NULL;
303      strm.avail_in = 0;
304      strm.next_in = Z_NULL;
305      ret = inflateInit2(&strm, -15);
306      if (junk == NULL || ret != Z_OK)
307          bail("out of memory", "");
308  
309      /* inflate and copy compressed data, clear last-block bit if requested */
310      len = 0;
311      zpull(&strm, in);
312      start = in->next;
313      last = start[0] & 1;
314      if (last && clr)
315          start[0] &= ~1;
316      strm.avail_out = 0;
317      for (;;) {
318          /* if input used and output done, write used input and get more */
319          if (strm.avail_in == 0 && strm.avail_out != 0) {
320              fwrite(start, 1, strm.next_in - start, out);
321              start = in->buf;
322              in->left = 0;
323              zpull(&strm, in);
324          }
325  
326          /* decompress -- return early when end-of-block reached */
327          strm.avail_out = CHUNK;
328          strm.next_out = junk;
329          ret = inflate(&strm, Z_BLOCK);
330          switch (ret) {
331          case Z_MEM_ERROR:
332              bail("out of memory", "");
333          case Z_DATA_ERROR:
334              bail("invalid compressed data in ", in->name);
335          }
336  
337          /* update length of uncompressed data */
338          len += CHUNK - strm.avail_out;
339  
340          /* check for block boundary (only get this when block copied out) */
341          if (strm.data_type & 128) {
342              /* if that was the last block, then done */
343              if (last)
344                  break;
345  
346              /* number of unused bits in last byte */
347              pos = strm.data_type & 7;
348  
349              /* find the next last-block bit */
350              if (pos != 0) {
351                  /* next last-block bit is in last used byte */
352                  pos = 0x100 >> pos;
353                  last = strm.next_in[-1] & pos;
354                  if (last && clr)
355                      in->buf[strm.next_in - in->buf - 1] &= ~pos;
356              }
357              else {
358                  /* next last-block bit is in next unused byte */
359                  if (strm.avail_in == 0) {
360                      /* don't have that byte yet -- get it */
361                      fwrite(start, 1, strm.next_in - start, out);
362                      start = in->buf;
363                      in->left = 0;
364                      zpull(&strm, in);
365                  }
366                  last = strm.next_in[0] & 1;
367                  if (last && clr)
368                      in->buf[strm.next_in - in->buf] &= ~1;
369              }
370          }
371      }
372  
373      /* update buffer with unused input */
374      in->left = strm.avail_in;
375      in->next = in->buf + (strm.next_in - in->buf);
376  
377      /* copy used input, write empty blocks to get to byte boundary */
378      pos = strm.data_type & 7;
379      fwrite(start, 1, in->next - start - 1, out);
380      last = in->next[-1];
381      if (pos == 0 || !clr)
382          /* already at byte boundary, or last file: write last byte */
383          putc(last, out);
384      else {
385          /* append empty blocks to last byte */
386          last &= ((0x100 >> pos) - 1);       /* assure unused bits are zero */
387          if (pos & 1) {
388              /* odd -- append an empty stored block */
389              putc(last, out);
390              if (pos == 1)
391                  putc(0, out);               /* two more bits in block header */
392              fwrite("\0\0\xff\xff", 1, 4, out);
393          }
394          else {
395              /* even -- append 1, 2, or 3 empty fixed blocks */
396              switch (pos) {
397              case 6:
398                  putc(last | 8, out);
399                  last = 0;
400              case 4:
401                  putc(last | 0x20, out);
402                  last = 0;
403              case 2:
404                  putc(last | 0x80, out);
405                  putc(0, out);
406              }
407          }
408      }
409  
410      /* update crc and tot */
411      *crc = crc32_combine(*crc, bget4(in), len);
412      *tot += (unsigned long)len;
413  
414      /* clean up */
415      inflateEnd(&strm);
416      free(junk);
417      bclose(in);
418  
419      /* write trailer if this is the last gzip file */
420      if (!clr) {
421          put4(*crc, out);
422          put4(*tot, out);
423      }
424  }
425  
/* Join the gzip files named on the command line and write the resulting
   single gzip file to stdout.  With no file arguments, a usage message is
   written to stderr instead.  Returns 0 on success or after showing usage;
   every failure, including a failure to write the output, terminates the
   program with status 1 through bail(). */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments (argc is negative here if the program was
       started with an empty argument vector) */
    if (argc <= 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout; after the
       decrement argc is the number of files still to come, so it is nonzero
       (clear the last-block bit) for every file except the final one */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* force out buffered output now, so that a write failure (such as a full
       disk or closed pipe) is reported rather than lost at exit */
    if (fflush(stdout) != 0 || ferror(stdout))
        bail("write error on ", "stdout");

    /* done */
    return 0;
}