/* File indexing completed on 2025-02-02 04:57:56 */
0001 /* 0002 libcsv - parse and write csv data 0003 Copyright (C) 2008 Robert Gamble 0004 0005 This library is free software; you can redistribute it and/or 0006 modify it under the terms of the GNU Lesser General Public 0007 License as published by the Free Software Foundation; either 0008 version 2.1 of the License, or (at your option) any later version. 0009 0010 This library is distributed in the hope that it will be useful, 0011 but WITHOUT ANY WARRANTY; without even the implied warranty of 0012 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0013 Lesser General Public License for more details. 0014 0015 You should have received a copy of the GNU Lesser General Public 0016 License along with this library; if not, write to the Free Software 0017 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 0018 */ 0019 0020 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L 0021 # include <stdint.h> 0022 #else 0023 # define SIZE_MAX ((size_t)-1) /* C89 doesn't have stdint.h or SIZE_MAX */ 0024 #endif 0025 0026 #include "libcsv.h" 0027 0028 #define VERSION "3.0.3" 0029 0030 #define ROW_NOT_BEGUN 0 0031 #define FIELD_NOT_BEGUN 1 0032 #define FIELD_BEGUN 2 0033 #define FIELD_MIGHT_HAVE_ENDED 3 0034 0035 /* 0036 Explanation of states 0037 ROW_NOT_BEGUN There have not been any fields encountered for this row 0038 FIELD_NOT_BEGUN There have been fields but we are currently not in one 0039 FIELD_BEGUN We are in a field 0040 FIELD_MIGHT_HAVE_ENDED 0041 We encountered a double quote inside a quoted field, the 0042 field is either ended or the quote is literal 0043 */ 0044 0045 #define MEM_BLK_SIZE 128 0046 0047 #define SUBMIT_FIELD(p) \ 0048 do { \ 0049 if (!quoted) \ 0050 entry_pos -= spaces; \ 0051 if (p->options & CSV_APPEND_NULL) \ 0052 ((p)->entry_buf[entry_pos]) = '\0'; \ 0053 if (cb1 && (p->options & CSV_EMPTY_IS_NULL) && !quoted && entry_pos == 0) \ 0054 cb1(NULL, entry_pos, data); \ 0055 else if (cb1) \ 0056 
cb1(p->entry_buf, entry_pos, data); \ 0057 pstate = FIELD_NOT_BEGUN; \ 0058 entry_pos = quoted = spaces = 0; \ 0059 } while (0) 0060 0061 #define SUBMIT_ROW(p, c) \ 0062 do { \ 0063 if (cb2) \ 0064 cb2(c, data); \ 0065 pstate = ROW_NOT_BEGUN; \ 0066 entry_pos = quoted = spaces = 0; \ 0067 } while (0) 0068 0069 #define SUBMIT_CHAR(p, c) ((p)->entry_buf[entry_pos++] = (c)) 0070 0071 static const char *csv_errors[] = {"success", 0072 "error parsing data while strict checking enabled", 0073 "memory exhausted while increasing buffer size", 0074 "data size too large", 0075 "invalid status code"}; 0076 0077 int 0078 csv_error(struct csv_parser *p) 0079 { 0080 /* Return the current status of the parser */ 0081 return p->status; 0082 } 0083 0084 const char * 0085 csv_strerror(int status) 0086 { 0087 /* Return a textual description of status */ 0088 if (status >= CSV_EINVALID || status < 0) 0089 return csv_errors[CSV_EINVALID]; 0090 else 0091 return csv_errors[status]; 0092 } 0093 0094 int 0095 csv_get_opts(struct csv_parser *p) 0096 { 0097 /* Return the currently set options of parser */ 0098 if (p == NULL) 0099 return -1; 0100 0101 return p->options; 0102 } 0103 0104 int 0105 csv_set_opts(struct csv_parser *p, unsigned char options) 0106 { 0107 /* Set the options */ 0108 if (p == NULL) 0109 return -1; 0110 0111 p->options = options; 0112 return 0; 0113 } 0114 0115 int 0116 csv_init(struct csv_parser *p, unsigned char options) 0117 { 0118 /* Initialize a csv_parser object returns 0 on success, -1 on error */ 0119 if (p == NULL) 0120 return -1; 0121 0122 p->entry_buf = NULL; 0123 p->pstate = ROW_NOT_BEGUN; 0124 p->quoted = 0; 0125 p->spaces = 0; 0126 p->entry_pos = 0; 0127 p->entry_size = 0; 0128 p->status = 0; 0129 p->options = options; 0130 p->quote_char = CSV_QUOTE; 0131 p->delim_char = CSV_COMMA; 0132 p->is_space = NULL; 0133 p->is_term = NULL; 0134 p->blk_size = MEM_BLK_SIZE; 0135 p->malloc_func = NULL; 0136 p->realloc_func = realloc; 0137 p->free_func = free; 0138 0139 
return 0; 0140 } 0141 0142 void 0143 csv_free(struct csv_parser *p) 0144 { 0145 /* Free the entry_buffer of csv_parser object */ 0146 if (p == NULL) 0147 return; 0148 0149 if (p->entry_buf) 0150 p->free_func(p->entry_buf); 0151 0152 p->entry_buf = NULL; 0153 p->entry_size = 0; 0154 0155 return; 0156 } 0157 0158 int 0159 csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data) 0160 { 0161 /* Finalize parsing. Needed, for example, when file does not end in a newline */ 0162 int quoted = p->quoted; 0163 int pstate = p->pstate; 0164 size_t spaces = p->spaces; 0165 size_t entry_pos = p->entry_pos; 0166 0167 if (p == NULL) 0168 return -1; 0169 0170 0171 if (p->pstate == FIELD_BEGUN && p->quoted && p->options & CSV_STRICT && p->options & CSV_STRICT_FINI) { 0172 /* Current field is quoted, no end-quote was seen, and CSV_STRICT_FINI is set */ 0173 p->status = CSV_EPARSE; 0174 return -1; 0175 } 0176 0177 switch (p->pstate) { 0178 case FIELD_MIGHT_HAVE_ENDED: 0179 p->entry_pos -= p->spaces + 1; /* get rid of spaces and original quote */ 0180 /* Fall-through */ 0181 case FIELD_NOT_BEGUN: 0182 case FIELD_BEGUN: 0183 quoted = p->quoted, pstate = p->pstate; 0184 spaces = p->spaces, entry_pos = p->entry_pos; 0185 SUBMIT_FIELD(p); 0186 SUBMIT_ROW(p, -1); 0187 case ROW_NOT_BEGUN: /* Already ended properly */ 0188 ; 0189 } 0190 0191 /* Reset parser */ 0192 p->spaces = p->quoted = p->entry_pos = p->status = 0; 0193 p->pstate = ROW_NOT_BEGUN; 0194 0195 return 0; 0196 } 0197 0198 void 0199 csv_set_delim(struct csv_parser *p, unsigned char c) 0200 { 0201 /* Set the delimiter */ 0202 if (p) p->delim_char = c; 0203 } 0204 0205 void 0206 csv_set_quote(struct csv_parser *p, unsigned char c) 0207 { 0208 /* Set the quote character */ 0209 if (p) p->quote_char = c; 0210 } 0211 0212 unsigned char 0213 csv_get_delim(struct csv_parser *p) 0214 { 0215 /* Get the delimiter */ 0216 return p->delim_char; 0217 } 0218 0219 unsigned char 0220 
csv_get_quote(struct csv_parser *p) 0221 { 0222 /* Get the quote character */ 0223 return p->quote_char; 0224 } 0225 0226 void 0227 csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char)) 0228 { 0229 /* Set the space function */ 0230 if (p) p->is_space = f; 0231 } 0232 0233 void 0234 csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char)) 0235 { 0236 /* Set the term function */ 0237 if (p) p->is_term = f; 0238 } 0239 0240 void 0241 csv_set_realloc_func(struct csv_parser *p, void *(*f)(void *, size_t)) 0242 { 0243 /* Set the realloc function used to increase buffer size */ 0244 if (p && f) p->realloc_func = f; 0245 } 0246 0247 void 0248 csv_set_free_func(struct csv_parser *p, void (*f)(void *)) 0249 { 0250 /* Set the free function used to free the buffer */ 0251 if (p && f) p->free_func = f; 0252 } 0253 0254 void 0255 csv_set_blk_size(struct csv_parser *p, size_t size) 0256 { 0257 /* Set the block size used to increment buffer size */ 0258 if (p) p->blk_size = size; 0259 } 0260 0261 size_t 0262 csv_get_buffer_size(struct csv_parser *p) 0263 { 0264 /* Get the size of the entry buffer */ 0265 if (p) 0266 return p->entry_size; 0267 return 0; 0268 } 0269 0270 static int 0271 csv_increase_buffer(struct csv_parser *p) 0272 { 0273 /* Increase the size of the entry buffer. Attempt to increase size by 0274 * p->blk_size, if this is larger than SIZE_MAX try to increase current 0275 * buffer size to SIZE_MAX. If allocation fails, try to allocate halve 0276 * the size and try again until successful or increment size is zero. 
0277 */ 0278 0279 size_t to_add = p->blk_size; 0280 void *vp; 0281 0282 if ( p->entry_size >= SIZE_MAX - to_add ) 0283 to_add = SIZE_MAX - p->entry_size; 0284 0285 if (!to_add) { 0286 p->status = CSV_ETOOBIG; 0287 return -1; 0288 } 0289 0290 while ((vp = p->realloc_func(p->entry_buf, p->entry_size + to_add)) == NULL) { 0291 to_add /= 2; 0292 if (!to_add) { 0293 p->status = CSV_ENOMEM; 0294 return -1; 0295 } 0296 } 0297 0298 /* Update entry buffer pointer and entry_size if successful */ 0299 p->entry_buf = vp; 0300 p->entry_size += to_add; 0301 return 0; 0302 } 0303 0304 size_t 0305 csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data) 0306 { 0307 unsigned const char *us = s; /* Access input data as array of unsigned char */ 0308 unsigned char c; /* The character we are currently processing */ 0309 size_t pos = 0; /* The number of characters we have processed in this call */ 0310 0311 /* Store key fields into local variables for performance */ 0312 unsigned char delim = p->delim_char; 0313 unsigned char quote = p->quote_char; 0314 int (*is_space)(unsigned char) = p->is_space; 0315 int (*is_term)(unsigned char) = p->is_term; 0316 int quoted = p->quoted; 0317 int pstate = p->pstate; 0318 size_t spaces = p->spaces; 0319 size_t entry_pos = p->entry_pos; 0320 0321 0322 if (!p->entry_buf && pos < len) { 0323 /* Buffer hasn't been allocated yet and len > 0 */ 0324 if (csv_increase_buffer(p) != 0) { 0325 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos; 0326 return pos; 0327 } 0328 } 0329 0330 while (pos < len) { 0331 /* Check memory usage, increase buffer if necessary */ 0332 if (entry_pos == ((p->options & CSV_APPEND_NULL) ? 
p->entry_size - 1 : p->entry_size) ) { 0333 if (csv_increase_buffer(p) != 0) { 0334 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos; 0335 return pos; 0336 } 0337 } 0338 0339 c = us[pos++]; 0340 0341 switch (pstate) { 0342 case ROW_NOT_BEGUN: 0343 case FIELD_NOT_BEGUN: 0344 if ((is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) && c!=delim) { /* Space or Tab */ 0345 continue; 0346 } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */ 0347 if (pstate == FIELD_NOT_BEGUN) { 0348 SUBMIT_FIELD(p); 0349 SUBMIT_ROW(p, (unsigned char)c); 0350 } else { /* ROW_NOT_BEGUN */ 0351 /* Don't submit empty rows by default */ 0352 if (p->options & CSV_REPALL_NL) { 0353 SUBMIT_ROW(p, (unsigned char)c); 0354 } 0355 } 0356 continue; 0357 } else if (c == delim) { /* Comma */ 0358 SUBMIT_FIELD(p); 0359 break; 0360 } else if (c == quote) { /* Quote */ 0361 pstate = FIELD_BEGUN; 0362 quoted = 1; 0363 } else { /* Anything else */ 0364 pstate = FIELD_BEGUN; 0365 quoted = 0; 0366 SUBMIT_CHAR(p, c); 0367 } 0368 break; 0369 case FIELD_BEGUN: 0370 if (c == quote) { /* Quote */ 0371 if (quoted) { 0372 SUBMIT_CHAR(p, c); 0373 pstate = FIELD_MIGHT_HAVE_ENDED; 0374 } else { 0375 /* STRICT ERROR - double quote inside non-quoted field */ 0376 if (p->options & CSV_STRICT) { 0377 p->status = CSV_EPARSE; 0378 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos; 0379 return pos-1; 0380 } 0381 SUBMIT_CHAR(p, c); 0382 spaces = 0; 0383 } 0384 } else if (c == delim) { /* Comma */ 0385 if (quoted) { 0386 SUBMIT_CHAR(p, c); 0387 } else { 0388 SUBMIT_FIELD(p); 0389 } 0390 } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */ 0391 if (!quoted) { 0392 SUBMIT_FIELD(p); 0393 SUBMIT_ROW(p, (unsigned char)c); 0394 } else { 0395 SUBMIT_CHAR(p, c); 0396 } 0397 } else if (!quoted && (is_space? 
is_space(c) : c == CSV_SPACE || c == CSV_TAB)) { /* Tab or space for non-quoted field */ 0398 SUBMIT_CHAR(p, c); 0399 spaces++; 0400 } else { /* Anything else */ 0401 SUBMIT_CHAR(p, c); 0402 spaces = 0; 0403 } 0404 break; 0405 case FIELD_MIGHT_HAVE_ENDED: 0406 /* This only happens when a quote character is encountered in a quoted field */ 0407 if (c == delim) { /* Comma */ 0408 entry_pos -= spaces + 1; /* get rid of spaces and original quote */ 0409 SUBMIT_FIELD(p); 0410 } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */ 0411 entry_pos -= spaces + 1; /* get rid of spaces and original quote */ 0412 SUBMIT_FIELD(p); 0413 SUBMIT_ROW(p, (unsigned char)c); 0414 } else if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) { /* Space or Tab */ 0415 SUBMIT_CHAR(p, c); 0416 spaces++; 0417 } else if (c == quote) { /* Quote */ 0418 if (spaces) { 0419 /* STRICT ERROR - unescaped double quote */ 0420 if (p->options & CSV_STRICT) { 0421 p->status = CSV_EPARSE; 0422 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos; 0423 return pos-1; 0424 } 0425 spaces = 0; 0426 SUBMIT_CHAR(p, c); 0427 } else { 0428 /* Two quotes in a row */ 0429 pstate = FIELD_BEGUN; 0430 } 0431 } else { /* Anything else */ 0432 /* STRICT ERROR - unescaped double quote */ 0433 if (p->options & CSV_STRICT) { 0434 p->status = CSV_EPARSE; 0435 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos; 0436 return pos-1; 0437 } 0438 pstate = FIELD_BEGUN; 0439 spaces = 0; 0440 SUBMIT_CHAR(p, c); 0441 } 0442 break; 0443 default: 0444 break; 0445 } 0446 } 0447 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos; 0448 return pos; 0449 } 0450 0451 size_t 0452 csv_write (void *dest, size_t dest_size, const void *src, size_t src_size) 0453 { 0454 unsigned char *cdest = dest; 0455 const unsigned char *csrc = src; 0456 size_t chars = 0; 0457 0458 if (src == NULL) 0459 return 
0; 0460 0461 if (cdest == NULL) 0462 dest_size = 0; 0463 0464 if (dest_size > 0) 0465 *cdest++ = '"'; 0466 chars++; 0467 0468 while (src_size) { 0469 if (*csrc == '"') { 0470 if (dest_size > chars) 0471 *cdest++ = '"'; 0472 if (chars < SIZE_MAX) chars++; 0473 } 0474 if (dest_size > chars) 0475 *cdest++ = *csrc; 0476 if (chars < SIZE_MAX) chars++; 0477 src_size--; 0478 csrc++; 0479 } 0480 0481 if (dest_size > chars) 0482 *cdest = '"'; 0483 if (chars < SIZE_MAX) chars++; 0484 0485 return chars; 0486 } 0487 0488 int 0489 csv_fwrite (FILE *fp, const void *src, size_t src_size) 0490 { 0491 const unsigned char *csrc = src; 0492 0493 if (fp == NULL || src == NULL) 0494 return 0; 0495 0496 if (fputc('"', fp) == EOF) 0497 return EOF; 0498 0499 while (src_size) { 0500 if (*csrc == '"') { 0501 if (fputc('"', fp) == EOF) 0502 return EOF; 0503 } 0504 if (fputc(*csrc, fp) == EOF) 0505 return EOF; 0506 src_size--; 0507 csrc++; 0508 } 0509 0510 if (fputc('"', fp) == EOF) { 0511 return EOF; 0512 } 0513 0514 return 0; 0515 } 0516 0517 size_t 0518 csv_write2 (void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote) 0519 { 0520 unsigned char *cdest = dest; 0521 const unsigned char *csrc = src; 0522 size_t chars = 0; 0523 0524 if (src == NULL) 0525 return 0; 0526 0527 if (dest == NULL) 0528 dest_size = 0; 0529 0530 if (dest_size > 0) 0531 *cdest++ = quote; 0532 chars++; 0533 0534 while (src_size) { 0535 if (*csrc == quote) { 0536 if (dest_size > chars) 0537 *cdest++ = quote; 0538 if (chars < SIZE_MAX) chars++; 0539 } 0540 if (dest_size > chars) 0541 *cdest++ = *csrc; 0542 if (chars < SIZE_MAX) chars++; 0543 src_size--; 0544 csrc++; 0545 } 0546 0547 if (dest_size > chars) 0548 *cdest = quote; 0549 if (chars < SIZE_MAX) chars++; 0550 0551 return chars; 0552 } 0553 0554 int 0555 csv_fwrite2 (FILE *fp, const void *src, size_t src_size, unsigned char quote) 0556 { 0557 const unsigned char *csrc = src; 0558 0559 if (fp == NULL || src == NULL) 0560 return 0; 0561 
0562 if (fputc(quote, fp) == EOF) 0563 return EOF; 0564 0565 while (src_size) { 0566 if (*csrc == quote) { 0567 if (fputc(quote, fp) == EOF) 0568 return EOF; 0569 } 0570 if (fputc(*csrc, fp) == EOF) 0571 return EOF; 0572 src_size--; 0573 csrc++; 0574 } 0575 0576 if (fputc(quote, fp) == EOF) { 0577 return EOF; 0578 } 0579 0580 return 0; 0581 }