File indexing completed on 2025-02-02 04:57:56

0001 /*
0002 libcsv - parse and write csv data
0003 Copyright (C) 2008  Robert Gamble
0004 
0005 This library is free software; you can redistribute it and/or
0006 modify it under the terms of the GNU Lesser General Public
0007 License as published by the Free Software Foundation; either
0008 version 2.1 of the License, or (at your option) any later version.
0009 
0010 This library is distributed in the hope that it will be useful,
0011 but WITHOUT ANY WARRANTY; without even the implied warranty of
0012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0013 Lesser General Public License for more details.
0014 
0015 You should have received a copy of the GNU Lesser General Public
0016 License along with this library; if not, write to the Free Software
0017 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
0018 */
0019 
0020 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
0021 #  include <stdint.h>
0022 #else
0023 #  define SIZE_MAX ((size_t)-1) /* C89 doesn't have stdint.h or SIZE_MAX */
0024 #endif
0025 
0026 #include "libcsv.h"
0027 
/* Version string */
#define VERSION "3.0.3"

/*
  Parser states

  ROW_NOT_BEGUN           No field has been encountered yet for the current row
  FIELD_NOT_BEGUN         The row already has fields, but we are not inside one
  FIELD_BEGUN             We are inside a field
  FIELD_MIGHT_HAVE_ENDED  A quote character was seen inside a quoted field; it
                          either terminates the field or is a literal quote
*/
enum {
  ROW_NOT_BEGUN          = 0,
  FIELD_NOT_BEGUN        = 1,
  FIELD_BEGUN            = 2,
  FIELD_MIGHT_HAVE_ENDED = 3
};

/* Default increment (in bytes) used when the entry buffer has to grow */
#define MEM_BLK_SIZE 128
0046 
/*
  Helper macros shared by csv_parse() and csv_fini().

  They are not self-contained: the enclosing function must provide the locals
  quoted, pstate, spaces and entry_pos, the callbacks cb1 and cb2, and the
  user pointer data.  The macro argument p is always parenthesized so that any
  pointer expression (for example &parser) can be passed safely.
*/

/*
  SUBMIT_FIELD(p): finish the current field and report it through cb1.
    - trailing spaces are dropped for non-quoted fields
    - with CSV_APPEND_NULL a terminating '\0' is stored after the field data
    - with CSV_EMPTY_IS_NULL an empty, non-quoted field is reported as NULL
  Afterwards the state is FIELD_NOT_BEGUN and the per-field counters are 0.
*/
#define SUBMIT_FIELD(p) \
  do { \
    if (!quoted) \
      entry_pos -= spaces; \
    if ((p)->options & CSV_APPEND_NULL) \
      ((p)->entry_buf[entry_pos]) = '\0'; \
    if (cb1 && ((p)->options & CSV_EMPTY_IS_NULL) && !quoted && entry_pos == 0) \
      cb1(NULL, entry_pos, data); \
    else if (cb1) \
      cb1((p)->entry_buf, entry_pos, data); \
    pstate = FIELD_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)

/*
  SUBMIT_ROW(p, c): report the end of a row through cb2, passing c (the
  terminating character, or -1 when called from csv_fini), then reset the
  state to ROW_NOT_BEGUN.  The p argument is accepted for symmetry with the
  other macros but is not used.
*/
#define SUBMIT_ROW(p, c) \
  do { \
    if (cb2) \
      cb2((c), data); \
    pstate = ROW_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)

/* SUBMIT_CHAR(p, c): append character c to the entry buffer */
#define SUBMIT_CHAR(p, c) ((p)->entry_buf[entry_pos++] = (c))
0070 
/*
  Human-readable descriptions of the parser status codes, indexed by status
  value.  The last entry is the catch-all returned by csv_strerror() for
  unknown codes.  Both the strings and the table itself are read-only.
*/
static const char *const csv_errors[] = {
  "success",
  "error parsing data while strict checking enabled",
  "memory exhausted while increasing buffer size",
  "data size too large",
  "invalid status code"
};
0076 
0077 int
0078 csv_error(struct csv_parser *p)
0079 {
0080   /* Return the current status of the parser */
0081   return p->status;
0082 }
0083 
0084 const char *
0085 csv_strerror(int status)
0086 {
0087   /* Return a textual description of status */
0088   if (status >= CSV_EINVALID || status < 0)
0089     return csv_errors[CSV_EINVALID];
0090   else
0091     return csv_errors[status];
0092 }
0093 
0094 int
0095 csv_get_opts(struct csv_parser *p)
0096 {
0097   /* Return the currently set options of parser */
0098   if (p == NULL)
0099     return -1;
0100 
0101   return p->options;
0102 }
0103 
0104 int
0105 csv_set_opts(struct csv_parser *p, unsigned char options)
0106 {
0107   /* Set the options */
0108   if (p == NULL)
0109     return -1;
0110 
0111   p->options = options;
0112   return 0;
0113 }
0114 
0115 int
0116 csv_init(struct csv_parser *p, unsigned char options)
0117 {
0118   /* Initialize a csv_parser object returns 0 on success, -1 on error */
0119   if (p == NULL)
0120     return -1;
0121 
0122   p->entry_buf = NULL;
0123   p->pstate = ROW_NOT_BEGUN;
0124   p->quoted = 0;
0125   p->spaces = 0;
0126   p->entry_pos = 0;
0127   p->entry_size = 0;
0128   p->status = 0;
0129   p->options = options;
0130   p->quote_char = CSV_QUOTE;
0131   p->delim_char = CSV_COMMA;
0132   p->is_space = NULL;
0133   p->is_term = NULL;
0134   p->blk_size = MEM_BLK_SIZE;
0135   p->malloc_func = NULL;
0136   p->realloc_func = realloc;
0137   p->free_func = free;
0138 
0139   return 0;
0140 }
0141 
0142 void
0143 csv_free(struct csv_parser *p)
0144 {
0145   /* Free the entry_buffer of csv_parser object */
0146   if (p == NULL)
0147     return;
0148 
0149   if (p->entry_buf)
0150     p->free_func(p->entry_buf);
0151 
0152   p->entry_buf = NULL;
0153   p->entry_size = 0;
0154 
0155   return;
0156 }
0157 
0158 int
0159 csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
0160 {
0161   /* Finalize parsing.  Needed, for example, when file does not end in a newline */
0162   int quoted = p->quoted;
0163   int pstate = p->pstate;
0164   size_t spaces = p->spaces;
0165   size_t entry_pos = p->entry_pos;
0166 
0167   if (p == NULL)
0168     return -1;
0169 
0170 
0171   if (p->pstate == FIELD_BEGUN && p->quoted && p->options & CSV_STRICT && p->options & CSV_STRICT_FINI) {
0172     /* Current field is quoted, no end-quote was seen, and CSV_STRICT_FINI is set */
0173     p->status = CSV_EPARSE;
0174     return -1;
0175   }
0176 
0177   switch (p->pstate) {
0178     case FIELD_MIGHT_HAVE_ENDED:
0179       p->entry_pos -= p->spaces + 1;  /* get rid of spaces and original quote */
0180       /* Fall-through */
0181     case FIELD_NOT_BEGUN:
0182     case FIELD_BEGUN:
0183       quoted = p->quoted, pstate = p->pstate;
0184       spaces = p->spaces, entry_pos = p->entry_pos;
0185       SUBMIT_FIELD(p);
0186       SUBMIT_ROW(p, -1);
0187     case ROW_NOT_BEGUN: /* Already ended properly */
0188       ;
0189   }
0190 
0191   /* Reset parser */
0192   p->spaces = p->quoted = p->entry_pos = p->status = 0;
0193   p->pstate = ROW_NOT_BEGUN;
0194 
0195   return 0;
0196 }
0197 
0198 void
0199 csv_set_delim(struct csv_parser *p, unsigned char c)
0200 {
0201   /* Set the delimiter */
0202   if (p) p->delim_char = c;
0203 }
0204 
0205 void
0206 csv_set_quote(struct csv_parser *p, unsigned char c)
0207 {
0208   /* Set the quote character */
0209   if (p) p->quote_char = c;
0210 }
0211 
0212 unsigned char
0213 csv_get_delim(struct csv_parser *p)
0214 {
0215   /* Get the delimiter */
0216   return p->delim_char;
0217 }
0218 
0219 unsigned char
0220 csv_get_quote(struct csv_parser *p)
0221 {
0222   /* Get the quote character */
0223   return p->quote_char;
0224 }
0225 
0226 void
0227 csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char))
0228 {
0229   /* Set the space function */
0230   if (p) p->is_space = f;
0231 }
0232  
0233 void
0234 csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char))
0235 {
0236   /* Set the term function */
0237   if (p) p->is_term = f;
0238 }
0239 
0240 void
0241 csv_set_realloc_func(struct csv_parser *p, void *(*f)(void *, size_t))
0242 {
0243   /* Set the realloc function used to increase buffer size */
0244   if (p && f) p->realloc_func = f;
0245 }
0246  
0247 void
0248 csv_set_free_func(struct csv_parser *p, void (*f)(void *))
0249 {
0250   /* Set the free function used to free the buffer */
0251   if (p && f) p->free_func = f;
0252 }
0253 
0254 void
0255 csv_set_blk_size(struct csv_parser *p, size_t size)
0256 {
0257   /* Set the block size used to increment buffer size */
0258   if (p) p->blk_size = size;
0259 }
0260 
0261 size_t
0262 csv_get_buffer_size(struct csv_parser *p)
0263 {
0264   /* Get the size of the entry buffer */
0265   if (p)
0266     return p->entry_size;
0267   return 0;
0268 }
0269  
0270 static int
0271 csv_increase_buffer(struct csv_parser *p)
0272 {
0273   /* Increase the size of the entry buffer.  Attempt to increase size by 
0274    * p->blk_size, if this is larger than SIZE_MAX try to increase current
0275    * buffer size to SIZE_MAX.  If allocation fails, try to allocate halve 
0276    * the size and try again until successful or increment size is zero.
0277    */
0278 
0279   size_t to_add = p->blk_size;
0280   void *vp;
0281 
0282   if ( p->entry_size >= SIZE_MAX - to_add )
0283     to_add = SIZE_MAX - p->entry_size;
0284 
0285   if (!to_add) {
0286     p->status = CSV_ETOOBIG;
0287     return -1;
0288   }
0289 
0290   while ((vp = p->realloc_func(p->entry_buf, p->entry_size + to_add)) == NULL) {
0291     to_add /= 2;
0292     if (!to_add) {
0293       p->status = CSV_ENOMEM;
0294       return -1;
0295     }
0296   }
0297 
0298   /* Update entry buffer pointer and entry_size if successful */
0299   p->entry_buf = vp;
0300   p->entry_size += to_add;
0301   return 0;
0302 }
0303  
0304 size_t
0305 csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
0306 {
0307   unsigned const char *us = s;  /* Access input data as array of unsigned char */
0308   unsigned char c;              /* The character we are currently processing */
0309   size_t pos = 0;               /* The number of characters we have processed in this call */
0310 
0311   /* Store key fields into local variables for performance */
0312   unsigned char delim = p->delim_char;
0313   unsigned char quote = p->quote_char;
0314   int (*is_space)(unsigned char) = p->is_space;
0315   int (*is_term)(unsigned char) = p->is_term;
0316   int quoted = p->quoted;
0317   int pstate = p->pstate;
0318   size_t spaces = p->spaces;
0319   size_t entry_pos = p->entry_pos;
0320 
0321 
0322   if (!p->entry_buf && pos < len) {
0323     /* Buffer hasn't been allocated yet and len > 0 */
0324     if (csv_increase_buffer(p) != 0) { 
0325       p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
0326       return pos;
0327     }
0328   }
0329 
0330   while (pos < len) {
0331     /* Check memory usage, increase buffer if necessary */
0332     if (entry_pos == ((p->options & CSV_APPEND_NULL) ? p->entry_size - 1 : p->entry_size) ) {
0333       if (csv_increase_buffer(p) != 0) {
0334         p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
0335         return pos;
0336       }
0337     }
0338 
0339     c = us[pos++];
0340 
0341     switch (pstate) {
0342       case ROW_NOT_BEGUN:
0343       case FIELD_NOT_BEGUN:
0344         if ((is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) && c!=delim) { /* Space or Tab */
0345           continue;
0346         } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
0347           if (pstate == FIELD_NOT_BEGUN) {
0348             SUBMIT_FIELD(p);
0349             SUBMIT_ROW(p, (unsigned char)c); 
0350           } else {  /* ROW_NOT_BEGUN */
0351             /* Don't submit empty rows by default */
0352             if (p->options & CSV_REPALL_NL) {
0353               SUBMIT_ROW(p, (unsigned char)c);
0354             }
0355           }
0356           continue;
0357         } else if (c == delim) { /* Comma */
0358           SUBMIT_FIELD(p);
0359           break;
0360         } else if (c == quote) { /* Quote */
0361           pstate = FIELD_BEGUN;
0362           quoted = 1;
0363         } else {               /* Anything else */
0364           pstate = FIELD_BEGUN;
0365           quoted = 0;
0366           SUBMIT_CHAR(p, c);
0367         }
0368         break;
0369       case FIELD_BEGUN:
0370         if (c == quote) {         /* Quote */
0371           if (quoted) {
0372             SUBMIT_CHAR(p, c);
0373             pstate = FIELD_MIGHT_HAVE_ENDED;
0374           } else {
0375             /* STRICT ERROR - double quote inside non-quoted field */
0376             if (p->options & CSV_STRICT) {
0377               p->status = CSV_EPARSE;
0378               p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
0379               return pos-1;
0380             }
0381             SUBMIT_CHAR(p, c);
0382             spaces = 0;
0383           }
0384         } else if (c == delim) {  /* Comma */
0385           if (quoted) {
0386             SUBMIT_CHAR(p, c);
0387           } else {
0388             SUBMIT_FIELD(p);
0389           }
0390         } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) {  /* Carriage Return or Line Feed */
0391           if (!quoted) {
0392             SUBMIT_FIELD(p);
0393             SUBMIT_ROW(p, (unsigned char)c);
0394           } else {
0395             SUBMIT_CHAR(p, c);
0396           }
0397         } else if (!quoted && (is_space? is_space(c) : c == CSV_SPACE || c == CSV_TAB)) { /* Tab or space for non-quoted field */
0398             SUBMIT_CHAR(p, c);
0399             spaces++;
0400         } else {  /* Anything else */
0401           SUBMIT_CHAR(p, c);
0402           spaces = 0;
0403         }
0404         break;
0405       case FIELD_MIGHT_HAVE_ENDED:
0406         /* This only happens when a quote character is encountered in a quoted field */
0407         if (c == delim) {  /* Comma */
0408           entry_pos -= spaces + 1;  /* get rid of spaces and original quote */
0409           SUBMIT_FIELD(p);
0410         } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) {  /* Carriage Return or Line Feed */
0411           entry_pos -= spaces + 1;  /* get rid of spaces and original quote */
0412           SUBMIT_FIELD(p);
0413           SUBMIT_ROW(p, (unsigned char)c);
0414         } else if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) {  /* Space or Tab */
0415           SUBMIT_CHAR(p, c);
0416           spaces++;
0417         } else if (c == quote) {  /* Quote */
0418           if (spaces) {
0419             /* STRICT ERROR - unescaped double quote */
0420             if (p->options & CSV_STRICT) {
0421               p->status = CSV_EPARSE;
0422               p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
0423               return pos-1;
0424             }
0425             spaces = 0;
0426             SUBMIT_CHAR(p, c);
0427           } else {
0428             /* Two quotes in a row */
0429             pstate = FIELD_BEGUN;
0430           }
0431         } else {  /* Anything else */
0432           /* STRICT ERROR - unescaped double quote */
0433           if (p->options & CSV_STRICT) {
0434             p->status = CSV_EPARSE;
0435             p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
0436             return pos-1;
0437           }
0438           pstate = FIELD_BEGUN;
0439           spaces = 0;
0440           SUBMIT_CHAR(p, c);
0441         }
0442         break;
0443      default:
0444        break;
0445     }
0446   }
0447   p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
0448   return pos;
0449 }
0450 
/*
  Write src_size bytes from src to dest as a double-quoted CSV field: the
  data is wrapped in '"' characters and every embedded '"' is doubled.

  At most dest_size bytes are stored; no terminating '\0' is added.  A NULL
  dest is treated as a zero-sized destination.

  Returns the number of bytes the complete field requires (capped at
  SIZE_MAX), which may exceed dest_size.  Returns 0 when src is NULL.
*/
size_t
csv_write (void *dest, size_t dest_size, const void *src, size_t src_size)
{
  unsigned char *out = dest;
  const unsigned char *in = src;
  size_t needed = 0;  /* bytes required so far; doubles as the output index */
  size_t i;

  if (src == NULL)
    return 0;

  if (out == NULL)
    dest_size = 0;

  /* Opening quote */
  if (needed < dest_size)
    out[needed] = '"';
  needed++;

  for (i = 0; i < src_size; i++) {
    if (in[i] == '"') {
      /* Escape an embedded quote by emitting it twice */
      if (needed < dest_size)
        out[needed] = '"';
      if (needed < SIZE_MAX)
        needed++;
    }
    if (needed < dest_size)
      out[needed] = in[i];
    if (needed < SIZE_MAX)
      needed++;
  }

  /* Closing quote */
  if (needed < dest_size)
    out[needed] = '"';
  if (needed < SIZE_MAX)
    needed++;

  return needed;
}
0487 
/*
  Write src_size bytes from src to the stream fp as a double-quoted CSV
  field: the data is wrapped in '"' characters and every embedded '"' is
  doubled.

  Returns 0 on success (and also when fp or src is NULL, in which case
  nothing is written), or EOF as soon as a write fails.
*/
int
csv_fwrite (FILE *fp, const void *src, size_t src_size)
{
  const unsigned char *bytes = src;
  size_t i;

  if (fp == NULL || src == NULL)
    return 0;

  /* Opening quote */
  if (fputc('"', fp) == EOF)
    return EOF;

  for (i = 0; i < src_size; i++) {
    /* An embedded quote is preceded by an extra quote */
    if (bytes[i] == '"' && fputc('"', fp) == EOF)
      return EOF;
    if (fputc(bytes[i], fp) == EOF)
      return EOF;
  }

  /* Closing quote */
  return (fputc('"', fp) == EOF) ? EOF : 0;
}
0516 
/*
  Same as csv_write, but with a caller-chosen quote character: the data is
  wrapped in quote characters and every embedded occurrence of quote is
  doubled.

  At most dest_size bytes are stored; no terminating '\0' is added.  A NULL
  dest is treated as a zero-sized destination.

  Returns the number of bytes the complete field requires (capped at
  SIZE_MAX), which may exceed dest_size.  Returns 0 when src is NULL.
*/
size_t
csv_write2 (void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote)
{
  unsigned char *out = dest;
  const unsigned char *cur = src;
  const unsigned char *end;
  size_t total;  /* bytes required so far; doubles as the output index */

  if (src == NULL)
    return 0;

  if (dest == NULL)
    dest_size = 0;

  end = cur + src_size;

  /* Opening quote */
  if (dest_size > 0)
    out[0] = quote;
  total = 1;

  for (; cur != end; cur++) {
    if (*cur == quote) {
      /* Escape an embedded quote by emitting it twice */
      if (total < dest_size)
        out[total] = quote;
      if (total < SIZE_MAX)
        total++;
    }
    if (total < dest_size)
      out[total] = *cur;
    if (total < SIZE_MAX)
      total++;
  }

  /* Closing quote */
  if (total < dest_size)
    out[total] = quote;
  if (total < SIZE_MAX)
    total++;

  return total;
}
0553 
/*
  Same as csv_fwrite, but with a caller-chosen quote character: the data is
  wrapped in quote characters and every embedded occurrence of quote is
  doubled.

  Returns 0 on success (and also when fp or src is NULL, in which case
  nothing is written), or EOF as soon as a write fails.
*/
int
csv_fwrite2 (FILE *fp, const void *src, size_t src_size, unsigned char quote)
{
  const unsigned char *bytes = src;
  size_t i;

  if (fp == NULL || src == NULL)
    return 0;

  /* Opening quote */
  if (fputc(quote, fp) == EOF)
    return EOF;

  for (i = 0; i < src_size; i++) {
    /* An embedded quote is preceded by an extra quote */
    if (bytes[i] == quote && fputc(quote, fp) == EOF)
      return EOF;
    if (fputc(bytes[i], fp) == EOF)
      return EOF;
  }

  /* Closing quote */
  return (fputc(quote, fp) == EOF) ? EOF : 0;
}