File indexing completed on 2024-12-22 04:04:12

0001 /* Copyright (C) 2001-2019 Peter Selinger.
0002    This file is part of Potrace. It is free software and it is covered
0003    by the GNU General Public License. See the file COPYING for details. */
0004 
0005 
0006 /* code for adaptive LZW compression, as used in PostScript. */
0007 
0008 #ifdef HAVE_CONFIG_H
0009 #include <config.h>
0010 #endif
0011 
0012 #include <stdlib.h>
0013 #include <stdio.h>
0014 #include <errno.h>
0015 #include <string.h>
0016 
0017 #include "lists.h"
0018 #include "bitops.h"
0019 #include "lzw.h"
0020 
0021 /* ---------------------------------------------------------------------- */
0022 /* compression state specification */
0023 
0024 /* The compression algorithm follows the following specification,
0025    expressed as a state machine. A state is a triple {s,d,n}, where s
0026    is a string of input symbols, d is a dictionary, which is a
0027    function from strings to output symbols, and n is the dictionary
0028    size, or equivalently, the next unused output symbol. There are
0029    also special states init and stop. emit[b, code] is a function
0030    which emits the code 'code' as a b-bit value into the output
0031    bitstream. hibit(n) returns the least number of binary digits
0032    required to represent n.
0033 
0034    init ---> {[], newdict, 258}
0035 
0036      where [] is the empty string, and newdict maps each of the 256
0037      singleton strings to itself. (Note that there are two special
0038      output symbols 256 and 257, so that the next available one is
0039      258). Note: hibit(258)=9.
0040 
0041    {[], d, n} (input c) ---> (emit[hibit(n), 256]) {c, d, n}
0042 
0043    {s,d,n} (input c) ---> {s*c,d,n}
0044 
0045      if s!=[], s*c is in the domain of d. Here s*c is the strings s
0046      extended by the character c.
0047 
0048    {s,d,n} (input c) ---> (emit[hibit(n), d(s)]) {c,d',n+1}
0049 
0050      if s!=[], s*c is not in the domain of d, and hibit(n+2) <= 12.
0051      Here d'=d+{s*c->n}.
0052 
0053    {s,d,n} (input c) ---> 
0054            (emit[hibit(n), d(s)]) (emit[hibit(n+1), 256]) {c, newdict, 258}
0055 
0056      if s!=[], s*c is not in the domain of d, and hibit(n+2) > 12.
0057 
0058    {s,d,n} (input EOD) ---> (emit[hibit(n), d(s)]) (emit[hibit(n+1), 257]) stop
0059 
0060      where s != []. Here, EOD stands for end-of-data.
0061 
0062    {[],d,n} (input EOD) ---> (emit[hibit(n), 256]) (emit[hibit(n), 257]) stop
0063 
0064    Notes: 
0065 
0066    * Each reachable state {s,d,n} satisfies hibit(n+1) <= 12.
0067    * Only codes of 12 or fewer bits are emitted.
0068    * Each reachable state {s,d,n} satisfies s=[] or s is in the domain of d.
0069    * The domain of d is always prefix closed (except for the empty prefix)
0070    * The state machine is deterministic and non-blocking.
0071 
0072 */
0073    
0074 
0075 /* ---------------------------------------------------------------------- */
0076 /* private state */
0077 
0078 #define BITBUF_TYPE unsigned int
0079 
0080 /* the dictionary is implemented as a tree of strings under the prefix
0081    order. The tree is in turns represented as a linked list of
0082    lzw_dict_t structures, with "children" pointing to a node's first
0083    child, and "next" pointing to a node's next sibling. As an
0084    optimization, the top-level nodes (singleton strings) are
0085    implemented lazily, i.e., the corresponding entry is not actually
0086    created until it is accessed. */
0087 
0088 struct lzw_dict_s {
0089   char c;            /* last character of string represented by this entry */
0090   unsigned int code; /* code for the string represented by this entry */
0091   int freq;          /* how often searched? For optimization only */
0092   struct lzw_dict_s *children;  /* list of sub-entries */
0093   struct lzw_dict_s *next;      /* for making a linked list */
0094 };
0095 typedef struct lzw_dict_s lzw_dict_t;
0096 
0097 /* A state {s,d,n} is represented by the "dictionary state" part of
0098    the lzw_state_t structure. Here, s is a pointer directly to the node
0099    of the dictionary structure corresponding to the string s, or NULL
0100    if s=[]. Further, the lzw_state_t structure contains a buffer of
0101    pending output bits, and a flag indicating whether the EOD (end of
0102    data) has been reached in the input. */
0103 
0104 struct lzw_state_s {
0105   /* dictionary state */
0106   int n;           /* current size of the dictionary */
0107   lzw_dict_t *d;   /* pointer to dictionary */
0108   lzw_dict_t *s;   /* pointer to current string, or NULL at beginning */
0109 
0110   /* buffers for pending output */
0111   BITBUF_TYPE buf; /* bits scheduled for output - left aligned, 0 padded */
0112   int bufsize;     /* number of bits scheduled for output. */
0113   int eod;         /* flush buffer? */
0114 };
0115 typedef struct lzw_state_s lzw_state_t;
0116 
0117 /* ---------------------------------------------------------------------- */
0118 /* auxiliary functions which operate on dictionary states */
0119 
0120 /* recursively free an lzw_dict_t object */
0121 static void lzw_free_dict(lzw_dict_t *s) {
0122   lzw_dict_t *e;
0123 
0124   list_forall_unlink(e, s) {
0125     lzw_free_dict(e->children);
0126     free(e);
0127   }
0128 }
0129 
0130 /* re-initialize the lzw state's dictionary state to "newdict",
0131    freeing any old dictionary. */
0132 static void lzw_clear_table(lzw_state_t *st) {
0133 
0134   lzw_free_dict(st->d);
0135   st->d = NULL;
0136   st->n = 258;
0137   st->s = NULL;
0138 }
0139 
0140 /* ---------------------------------------------------------------------- */
0141 /* auxiliary functions for reading/writing the bit buffer */
0142 
0143 /* write the code to the bit buffer. Precondition st->bufsize <= 7.
0144    Note: this does not change the dictionary state; in particular,
0145    n must be updated between consecutive calls. */
0146 static inline void lzw_emit(unsigned int code, lzw_state_t *st) {
0147   BITBUF_TYPE mask;
0148   int bits = hibit(st->n);
0149 
0150   /* fill bit buffer */
0151   mask = (1 << bits) - 1;
0152   code &= mask;
0153 
0154   st->buf |= code << (8*sizeof(BITBUF_TYPE) - st->bufsize - bits);
0155   st->bufsize += bits;
0156 }
0157 
0158 /* transfer one byte from bit buffer to output. Precondition:
0159    s->avail_out > 0. */
0160 static inline void lzw_read_bitbuf(lzw_stream_t *s) {
0161   int ch;
0162   lzw_state_t *st = (lzw_state_t *)s->internal;
0163 
0164   ch = st->buf >> (8*sizeof(BITBUF_TYPE)-8);
0165   st->buf <<= 8;
0166   st->bufsize -= 8;
0167 
0168   s->next_out[0] = ch;
0169   s->next_out++;
0170   s->avail_out--;
0171 }
0172 
0173 /* ---------------------------------------------------------------------- */
0174 /* The following functions implement the state machine. */
0175 
0176 /* perform state transition of the state st on input character
0177    ch. This updates the dictionary state and/or writes to the bit
0178    buffer. Precondition: st->bufsize <= 7. Return 0 on success, or 1
0179    on error with errno set. */
0180 static int lzw_encode_char(lzw_state_t *st, char c) {
0181   lzw_dict_t *e;
0182 
0183   /* st = {s,d,n}. hibit(n+1)<=12. */
0184 
0185   /* {[], d, n} (input c) ---> (emit[hibit(n), 256]) {c, d, n} */
0186   if (st->s == NULL) {
0187     lzw_emit(256, st);
0188     goto singleton;  /* enter singleton state c */
0189   } 
0190 
0191   /* {s,d,n} (input c) ---> {s*c,d,n} */
0192   list_find(e, st->s->children, e->c == c);
0193   if (e) {
0194     e->freq++;
0195     st->s = e;
0196     return 0;
0197   }
0198 
0199   /* {s,d,n} (input c) ---> (emit[hibit(n), d(s)]) {c,d',n+1} */
0200   /* {s,d,n} (input c) --->
0201         (emit[hibit(n), d(s)]) (emit[hibit(n+1), 256]) {c, newdict, 258} */
0202 
0203   lzw_emit(st->s->code, st); /* 9-12 bits */
0204   if (st->n >= 4094) {   /* hibit(n+2) > 12 */
0205     st->n++;
0206     lzw_emit(256, st);
0207     goto dictfull;    /* reset dictionary and enter singleton state c */
0208   }
0209 
0210   /* insert new code in dictionary, if possible */
0211   e = (lzw_dict_t *)malloc(sizeof(lzw_dict_t));
0212   if (!e) {
0213     return 1;
0214   }
0215   e->c = c;
0216   e->code = st->n;
0217   e->freq = 1;
0218   e->children = NULL;
0219   list_prepend(st->s->children, e);
0220   st->n++;
0221   goto singleton;  /* enter singleton state c */
0222 
0223  dictfull:    /* reset dictionary and enter singleton state c */
0224   lzw_clear_table(st);
0225   /* fall through */
0226   
0227  singleton:   /* enter singleton state c */
0228   list_find(e, st->d, e->c == c);
0229   if (!e) {  /* not found: lazily add it */
0230     e = (lzw_dict_t *)malloc(sizeof(lzw_dict_t));
0231     if (!e) {
0232       return 1;
0233     }
0234     e->c = c;
0235     e->code = (unsigned char)c;
0236     e->freq = 0;
0237     e->children = NULL;
0238     list_prepend(st->d, e);
0239   }
0240   e->freq++;
0241   st->s = e;
0242   return 0;
0243 }
0244 
0245 /* perform state transition of the state st on input EOD. The leaves
0246    the dictionary state undefined and writes to the bit buffer.
0247    Precondition: st->bufsize <= 7. This function must be called
0248    exactly once, at the end of the stream. */
0249 static void lzw_encode_eod(lzw_state_t *st) {
0250 
0251   /* {[],d,n} (input EOD) ---> 
0252               (emit[hibit(n), 256]) (emit[hibit(n), 257]) stop */
0253   if (st->s == NULL) {
0254     lzw_emit(256, st);  /* 9 bits */
0255     st->n=258;
0256     lzw_emit(257, st);  /* 9 bits */
0257     return;
0258   } 
0259 
0260   /* {s,d,n} (input EOD) ---> 
0261              (emit[hibit(n), code]) (emit[hibit(n+1), 257]) stop */
0262 
0263   lzw_emit(st->s->code, st); /* 9-12 bits */
0264   st->n++;
0265   lzw_emit(257, st);  /* 9-12 bits */
0266   return;
0267 }
0268 
0269 /* ---------------------------------------------------------------------- */
0270 /* User visible functions. These implement a buffer interface. See
0271    lzw.h for the API description. */
0272 
0273 lzw_stream_t *lzw_init(void) {
0274   lzw_stream_t *s = NULL;
0275   lzw_state_t *st = NULL;
0276 
0277   s = (lzw_stream_t *)malloc(sizeof(lzw_stream_t));
0278   if (s==NULL) {
0279     goto fail;
0280   }
0281   st = (lzw_state_t *)malloc(sizeof(lzw_state_t));
0282   if (st==NULL) {
0283     goto fail;
0284   }
0285   st->buf = 0;
0286   st->bufsize = 0;
0287   st->eod = 0;
0288   st->d = NULL;
0289   lzw_clear_table(st);
0290   s->internal = (void *) st;
0291   return s;
0292 
0293  fail:
0294   free(s);
0295   free(st);
0296   return NULL;
0297 }
0298 
0299 int lzw_compress(lzw_stream_t *s, int mode) {
0300   int r;
0301   lzw_state_t *st = (lzw_state_t *)s->internal;
0302 
0303   while (st->eod == 0) {
0304     /* empty bit buffer */
0305     while (st->bufsize > 7) {
0306       if (s->avail_out == 0) {
0307     return 0;
0308       } else {
0309     lzw_read_bitbuf(s);
0310       }
0311     }
0312     /* fill bit buffer */
0313     if (s->avail_in == 0) {
0314       break;
0315     } else {
0316       r = lzw_encode_char(st, s->next_in[0]);
0317       if (r) {
0318     if (r==2) {
0319       errno = EINVAL;
0320     }
0321     return 1;
0322       }
0323       s->next_in++;
0324       s->avail_in--;
0325     }
0326   }
0327 
0328   if (mode==LZW_EOD && st->eod == 0) {
0329     st->eod = 1;
0330     lzw_encode_eod(st);
0331   }
0332 
0333   /* flush bit buffer */
0334   if (st->eod) {
0335     while (st->bufsize > 0) {
0336       if (s->avail_out == 0) {
0337     return 0;
0338       } else {
0339     lzw_read_bitbuf(s);
0340       }
0341     }
0342   }
0343 
0344   return 0;
0345 }
0346 
0347 void lzw_free(lzw_stream_t *s) {
0348   lzw_state_t *st = (lzw_state_t *)s->internal;
0349 
0350   lzw_free_dict(st->d);
0351   free(st);
0352   free(s);
0353 }
0354 
0355 /* ---------------------------------------------------------------------- */
0356 /* main function for testing and illustration purposes */
0357 
0358 #ifdef LZW_MAIN
0359 
0360 int main() {
0361   lzw_stream_t *s;
0362   int ch;
0363   char inbuf[100];
0364   char outbuf[100];
0365   int i, r;
0366   int mode;
0367 
0368   s = lzw_init();
0369   if (!s) {
0370     goto error;
0371   }
0372   mode = LZW_NORMAL;
0373 
0374   while (1) {
0375     /* fill inbuf */
0376     for (i=0; i<100; i++) {
0377       ch = fgetc(stdin);
0378       if (ch==EOF) {
0379     break;
0380       }
0381       inbuf[i] = ch;
0382     }
0383     if (i<100) {   /* end of input */
0384       mode = LZW_EOD;
0385     }
0386 
0387     /* compress */
0388     s->next_in = inbuf;
0389     s->avail_in = i;
0390     do {
0391       s->next_out = outbuf;
0392       s->avail_out = 100;
0393       r = lzw_compress(s, mode);
0394       if (r) {
0395     goto error;
0396       }
0397       fwrite(outbuf, 1, 100-s->avail_out, stdout);
0398     } while (s->avail_out==0);
0399     if (mode == LZW_EOD) {
0400       break;
0401     }
0402   }
0403   fflush(stdout);
0404   lzw_free(s);
0405 
0406   return 0;
0407 
0408  error:
0409   fprintf(stderr, "lzw: %s\n", strerror(errno));
0410   lzw_free(s);
0411   return 1;
0412 
0413 }
0414 #endif
0415