// File indexing completed on 2025-02-16 09:49:26
// Copyright: (2012-2015) Ben Strasser <code@ben-strasser.net>
// License: BSD-3
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
//    this list of conditions and the following disclaimer in the documentation
//    and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its contributors
//    may be used to endorse or promote products derived from this software
//    without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

#ifndef CSV_H
#define CSV_H

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <exception>
#include <string>
#include <utility>
#include <vector>
#ifndef CSV_IO_NO_THREAD
#include <condition_variable>
#include <mutex>
#include <thread>
#endif
#include <cassert>
#include <cerrno>
#include <istream>
#include <limits>
#include <memory>

namespace io {
////////////////////////////////////////////////////////////////////////////
//                                 LineReader                             //
////////////////////////////////////////////////////////////////////////////

namespace error {
/// Common base of every exception thrown by this header.
///
/// Derived classes implement format_error_message(), which renders the text
/// into error_message_buffer; what() formats lazily on each call.
struct base : std::exception {
  virtual void format_error_message() const = 0;

  const char *what() const noexcept override {
    format_error_message();
    return error_message_buffer;
  }

  mutable char error_message_buffer[2048];
};

// this only affects the file name in the error message
const int max_file_name_length = 1024;

/// Mixin storing the (possibly truncated) file name for an error message.
struct with_file_name {
  with_file_name() { std::memset(file_name, 0, sizeof(file_name)); }

  /// Copies at most max_file_name_length characters; nullptr clears the name.
  void set_file_name(const char *file_name) {
    if (file_name != nullptr) {
      // This call to strncpy has parenthesis around it
      // to silence the GCC -Wstringop-truncation warning
      (std::strncpy(this->file_name, file_name, sizeof(this->file_name)));
      this->file_name[sizeof(this->file_name) - 1] = '\0';
    } else {
      this->file_name[0] = '\0';
    }
  }

  char file_name[max_file_name_length + 1];
};

/// Mixin storing the line number of an error; -1 means "not set".
struct with_file_line {
  with_file_line() { file_line = -1; }

  void set_file_line(int file_line) { this->file_line = file_line; }

  int file_line;
};

/// Mixin storing an errno value; 0 means "not set".
struct with_errno {
  with_errno() { errno_value = 0; }

  void set_errno(int errno_value) { this->errno_value = errno_value; }

  int errno_value;
};

/// Thrown by LineReader when std::fopen fails.
struct can_not_open_file : base, with_file_name, with_errno {
  void format_error_message() const override {
    if (errno_value != 0)
      std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                    "Can not open file \"%s\" because \"%s\".", file_name,
                    std::strerror(errno_value));
    else
      std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                    "Can not open file \"%s\".", file_name);
  }
};

/// Thrown by LineReader::next_line when a line does not fit into one block.
struct line_length_limit_exceeded : base, with_file_name, with_file_line {
  void format_error_message() const override {
    std::snprintf(
        error_message_buffer, sizeof(error_message_buffer),
        "Line number %d in file \"%s\" exceeds the maximum length of 2^24-1.",
        file_line, file_name);
  }
};
} // namespace error

/// Abstract source of raw bytes consumed by LineReader.
///
/// read() stores up to `size` bytes into `buffer` and returns the number of
/// bytes stored; LineReader treats a return value of 0 as end of input.
class ByteSourceBase {
public:
  virtual int read(char *buffer, int size) = 0;
  virtual ~ByteSourceBase() {}
};

namespace detail {

/// Byte source reading from a FILE* that it closes on destruction.
class OwningStdIOByteSourceBase : public ByteSourceBase {
public:
  explicit OwningStdIOByteSourceBase(FILE *file) : file(file) {
    // Tell the std library that we want to do the buffering ourself.
    std::setvbuf(file, nullptr, _IONBF, 0);
  }

  // The handle is closed in the destructor, so a copy would close it twice.
  OwningStdIOByteSourceBase(const OwningStdIOByteSourceBase &) = delete;
  OwningStdIOByteSourceBase &
  operator=(const OwningStdIOByteSourceBase &) = delete;

  int read(char *buffer, int size) override {
    return static_cast<int>(std::fread(buffer, 1, size, file));
  }

  ~OwningStdIOByteSourceBase() override { std::fclose(file); }

private:
  FILE *file;
};

/// Byte source reading from a std::istream it only holds a reference to.
class NonOwningIStreamByteSource : public ByteSourceBase {
public:
  explicit NonOwningIStreamByteSource(std::istream &in) : in(in) {}

  int read(char *buffer, int size) override {
    in.read(buffer, size);
    return static_cast<int>(in.gcount());
  }

  ~NonOwningIStreamByteSource() override {}

private:
  std::istream &in;
};

/// Byte source reading from a caller-provided memory range [str, str + size).
class NonOwningStringByteSource : public ByteSourceBase {
public:
  NonOwningStringByteSource(const char *str, long long size)
      : str(str), remaining_byte_count(size) {}

  int read(char *buffer, int desired_byte_count) override {
    int to_copy_byte_count = desired_byte_count;
    if (remaining_byte_count < to_copy_byte_count)
      to_copy_byte_count = static_cast<int>(remaining_byte_count);
    std::memcpy(buffer, str, to_copy_byte_count);
    remaining_byte_count -= to_copy_byte_count;
    str += to_copy_byte_count;
    return to_copy_byte_count;
  }

  ~NonOwningStringByteSource() override {}

private:
  const char *str;
  long long remaining_byte_count;
};

#ifndef CSV_IO_NO_THREAD
/// Performs ByteSourceBase::read calls on a worker thread.
///
/// Usage is start_read() followed by finish_read(); finish_read() blocks
/// until the requested read completed and rethrows an exception raised by
/// the byte source. The worker exits after a read that returned 0 bytes,
/// after an exception, or when the reader is destroyed.
class AsynchronousReader {
public:
  void init(std::unique_ptr<ByteSourceBase> arg_byte_source) {
    std::unique_lock<std::mutex> guard(lock);
    byte_source = std::move(arg_byte_source);
    desired_byte_count = -1;
    termination_requested = false;
    worker = std::thread([&] {
      std::unique_lock<std::mutex> guard(lock);
      try {
        for (;;) {
          // desired_byte_count == -1 means "no read pending".
          read_requested_condition.wait(guard, [&] {
            return desired_byte_count != -1 || termination_requested;
          });
          if (termination_requested)
            return;

          read_byte_count = byte_source->read(buffer, desired_byte_count);
          desired_byte_count = -1;
          if (read_byte_count == 0)
            break;
          read_finished_condition.notify_one();
        }
      } catch (...) {
        read_error = std::current_exception();
      }
      read_finished_condition.notify_one();
    });
  }

  bool is_valid() const { return byte_source != nullptr; }

  void start_read(char *arg_buffer, int arg_desired_byte_count) {
    std::unique_lock<std::mutex> guard(lock);
    buffer = arg_buffer;
    desired_byte_count = arg_desired_byte_count;
    read_byte_count = -1;
    read_requested_condition.notify_one();
  }

  int finish_read() {
    std::unique_lock<std::mutex> guard(lock);
    // read_byte_count == -1 means "read not finished yet".
    read_finished_condition.wait(
        guard, [&] { return read_byte_count != -1 || read_error; });
    if (read_error)
      std::rethrow_exception(read_error);
    else
      return read_byte_count;
  }

  ~AsynchronousReader() {
    // The worker only exists once init() installed a byte source.
    if (byte_source != nullptr) {
      {
        std::unique_lock<std::mutex> guard(lock);
        termination_requested = true;
      }
      read_requested_condition.notify_one();
      worker.join();
    }
  }

private:
  std::unique_ptr<ByteSourceBase> byte_source;

  std::thread worker;

  bool termination_requested = false;
  std::exception_ptr read_error;
  char *buffer = nullptr;
  int desired_byte_count = -1;
  int read_byte_count = -1;

  std::mutex lock;
  std::condition_variable read_finished_condition;
  std::condition_variable read_requested_condition;
};
#endif

/// Same interface as AsynchronousReader, but the read happens inside
/// finish_read() on the calling thread.
class SynchronousReader {
public:
  void init(std::unique_ptr<ByteSourceBase> arg_byte_source) {
    byte_source = std::move(arg_byte_source);
  }

  bool is_valid() const { return byte_source != nullptr; }

  void start_read(char *arg_buffer, int arg_desired_byte_count) {
    buffer = arg_buffer;
    desired_byte_count = arg_desired_byte_count;
  }

  int finish_read() { return byte_source->read(buffer, desired_byte_count); }

private:
  std::unique_ptr<ByteSourceBase> byte_source;
  char *buffer = nullptr;
  int desired_byte_count = 0;
};
} // namespace detail

/// Splits a byte source into lines.
///
/// next_line() returns a pointer to a NUL-terminated line inside an internal
/// buffer (without the trailing "\n" or "\r\n"), or nullptr at end of input.
/// The pointer is only valid until the next call. A leading UTF-8 BOM is
/// skipped. A line that does not fit into block_len bytes raises
/// error::line_length_limit_exceeded.
class LineReader {
private:
  static const int block_len = 1 << 20;
  std::unique_ptr<char[]> buffer; // must be constructed before (and thus
                                  // destructed after) the reader!
#ifdef CSV_IO_NO_THREAD
  detail::SynchronousReader reader;
#else
  detail::AsynchronousReader reader;
#endif
  int data_begin;
  int data_end;

  char file_name[error::max_file_name_length + 1];
  unsigned file_line;

  static std::unique_ptr<ByteSourceBase> open_file(const char *file_name) {
    // We open the file in binary mode as it makes no difference under *nix
    // and under Windows we handle \r\n newlines ourself.
    FILE *file = std::fopen(file_name, "rb");
    if (file == nullptr) {
      int x = errno; // store errno as soon as possible, doing it after
                     // constructor call can fail.
      error::can_not_open_file err;
      err.set_errno(x);
      err.set_file_name(file_name);
      throw err;
    }
    return std::unique_ptr<ByteSourceBase>(
        new detail::OwningStdIOByteSourceBase(file));
  }

  void init(std::unique_ptr<ByteSourceBase> byte_source) {
    file_line = 0;

    // Three blocks: two hold the data being scanned, the third receives the
    // read that is in flight.
    buffer = std::unique_ptr<char[]>(new char[3 * block_len]);
    data_begin = 0;
    data_end = byte_source->read(buffer.get(), 2 * block_len);

    // Ignore UTF-8 BOM
    if (data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' &&
        buffer[2] == '\xBF')
      data_begin = 3;

    // Only hand the source to the reader when the first read filled both
    // blocks, i.e. there may be more data.
    if (data_end == 2 * block_len) {
      reader.init(std::move(byte_source));
      reader.start_read(buffer.get() + 2 * block_len, block_len);
    }
  }

public:
  LineReader() = delete;
  LineReader(const LineReader &) = delete;
  LineReader &operator=(const LineReader &) = delete;

  explicit LineReader(const char *file_name) {
    set_file_name(file_name);
    init(open_file(file_name));
  }

  explicit LineReader(const std::string &file_name) {
    set_file_name(file_name.c_str());
    init(open_file(file_name.c_str()));
  }

  LineReader(const char *file_name,
             std::unique_ptr<ByteSourceBase> byte_source) {
    set_file_name(file_name);
    init(std::move(byte_source));
  }

  LineReader(const std::string &file_name,
             std::unique_ptr<ByteSourceBase> byte_source) {
    set_file_name(file_name.c_str());
    init(std::move(byte_source));
  }

  LineReader(const char *file_name, const char *data_begin,
             const char *data_end) {
    set_file_name(file_name);
    init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningStringByteSource(
        data_begin, data_end - data_begin)));
  }

  LineReader(const std::string &file_name, const char *data_begin,
             const char *data_end) {
    set_file_name(file_name.c_str());
    init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningStringByteSource(
        data_begin, data_end - data_begin)));
  }

  LineReader(const char *file_name, FILE *file) {
    set_file_name(file_name);
    init(std::unique_ptr<ByteSourceBase>(
        new detail::OwningStdIOByteSourceBase(file)));
  }

  LineReader(const std::string &file_name, FILE *file) {
    set_file_name(file_name.c_str());
    init(std::unique_ptr<ByteSourceBase>(
        new detail::OwningStdIOByteSourceBase(file)));
  }

  LineReader(const char *file_name, std::istream &in) {
    set_file_name(file_name);
    init(std::unique_ptr<ByteSourceBase>(
        new detail::NonOwningIStreamByteSource(in)));
  }

  LineReader(const std::string &file_name, std::istream &in) {
    set_file_name(file_name.c_str());
    init(std::unique_ptr<ByteSourceBase>(
        new detail::NonOwningIStreamByteSource(in)));
  }

  void set_file_name(const std::string &file_name) {
    set_file_name(file_name.c_str());
  }

  void set_file_name(const char *file_name) {
    if (file_name != nullptr) {
      // Parenthesised to silence the GCC -Wstringop-truncation warning; the
      // terminator is written explicitly on the next line.
      (std::strncpy(this->file_name, file_name, sizeof(this->file_name)));
      this->file_name[sizeof(this->file_name) - 1] = '\0';
    } else {
      this->file_name[0] = '\0';
    }
  }

  const char *get_truncated_file_name() const { return file_name; }

  void set_file_line(unsigned file_line) { this->file_line = file_line; }

  unsigned get_file_line() const { return file_line; }

  char *next_line() {
    if (data_begin == data_end)
      return nullptr;

    ++file_line;

    assert(data_begin < data_end);
    assert(data_end <= block_len * 2);

    if (data_begin >= block_len) {
      // The first block is consumed: shift the second block down and pull in
      // the block that was being read in the background.
      std::memcpy(buffer.get(), buffer.get() + block_len, block_len);
      data_begin -= block_len;
      data_end -= block_len;
      if (reader.is_valid()) {
        data_end += reader.finish_read();
        std::memcpy(buffer.get() + block_len, buffer.get() + 2 * block_len,
                    block_len);
        reader.start_read(buffer.get() + 2 * block_len, block_len);
      }
    }

    int line_end = data_begin;
    while (line_end != data_end && buffer[line_end] != '\n') {
      ++line_end;
    }

    if (line_end - data_begin + 1 > block_len) {
      error::line_length_limit_exceeded err;
      err.set_file_name(file_name);
      err.set_file_line(file_line);
      throw err;
    }

    if (line_end != data_end && buffer[line_end] == '\n') {
      buffer[line_end] = '\0';
    } else {
      // some files are missing the newline at the end of the
      // last line
      ++data_end;
      buffer[line_end] = '\0';
    }

    // handle windows \r\n-line breaks
    if (line_end != data_begin && buffer[line_end - 1] == '\r')
      buffer[line_end - 1] = '\0';

    char *ret = buffer.get() + data_begin;
    data_begin = line_end + 1;
    return ret;
  }
};

////////////////////////////////////////////////////////////////////////////
//                                 CSV                                    //
////////////////////////////////////////////////////////////////////////////

namespace error {
const int max_column_name_length = 63;

/// Mixin storing the (possibly truncated) column name for an error message.
struct with_column_name {
  with_column_name() {
    std::memset(column_name, 0, max_column_name_length + 1);
  }

  void set_column_name(const char *column_name) {
    if (column_name != nullptr) {
      std::strncpy(this->column_name, column_name, max_column_name_length);
      this->column_name[max_column_name_length] = '\0';
    } else {
      this->column_name[0] = '\0';
    }
  }

  char column_name[max_column_name_length + 1];
};

const int max_column_content_length = 63;

/// Mixin storing the (possibly truncated) cell text for an error message.
struct with_column_content {
  with_column_content() {
    std::memset(column_content, 0, max_column_content_length + 1);
  }

  void set_column_content(const char *column_content) {
    if (column_content != nullptr) {
      std::strncpy(this->column_content, column_content,
                   max_column_content_length);
      this->column_content[max_column_content_length] = '\0';
    } else {
      this->column_content[0] = '\0';
    }
  }

  char column_content[max_column_content_length + 1];
};

struct extra_column_in_header : base, with_file_name, with_column_name {
  void format_error_message() const override {
    std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                  R"(Extra column "%s" in header of file "%s".)", column_name,
                  file_name);
  }
};

struct missing_column_in_header : base, with_file_name, with_column_name {
  void format_error_message() const override {
    std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                  R"(Missing column "%s" in header of file "%s".)", column_name,
                  file_name);
  }
};

struct duplicated_column_in_header : base, with_file_name, with_column_name {
  void format_error_message() const override {
    std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                  R"(Duplicated column "%s" in header of file "%s".)",
                  column_name, file_name);
  }
};

struct header_missing : base, with_file_name {
  void format_error_message() const override {
    std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                  "Header missing in file \"%s\".", file_name);
  }
};

struct too_few_columns : base, with_file_name, with_file_line {
  void format_error_message() const override {
    std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                  "Too few columns in line %d in file \"%s\".", file_line,
                  file_name);
  }
};

struct too_many_columns : base, with_file_name, with_file_line {
  void format_error_message() const override {
    std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                  "Too many columns in line %d in file \"%s\".", file_line,
                  file_name);
  }
};

struct escaped_string_not_closed : base, with_file_name, with_file_line {
  void format_error_message() const override {
    std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                  "Escaped string was not closed in line %d in file \"%s\".",
                  file_line, file_name);
  }
};

struct integer_must_be_positive : base,
                                  with_file_name,
                                  with_file_line,
                                  with_column_name,
                                  with_column_content {
  void format_error_message() const override {
    std::snprintf(
        error_message_buffer, sizeof(error_message_buffer),
        R"(The integer "%s" must be positive or 0 in column "%s" in file "%s" in line "%d".)",
        column_content, column_name, file_name, file_line);
  }
};

struct no_digit : base,
                  with_file_name,
                  with_file_line,
                  with_column_name,
                  with_column_content {
  void format_error_message() const override {
    std::snprintf(
        error_message_buffer, sizeof(error_message_buffer),
        R"(The integer "%s" contains an invalid digit in column "%s" in file "%s" in line "%d".)",
        column_content, column_name, file_name, file_line);
  }
};

struct integer_overflow : base,
                          with_file_name,
                          with_file_line,
                          with_column_name,
                          with_column_content {
  void format_error_message() const override {
    std::snprintf(
        error_message_buffer, sizeof(error_message_buffer),
        R"(The integer "%s" overflows in column "%s" in file "%s" in line "%d".)",
        column_content, column_name, file_name, file_line);
  }
};

struct integer_underflow : base,
                           with_file_name,
                           with_file_line,
                           with_column_name,
                           with_column_content {
  void format_error_message() const override {
    std::snprintf(
        error_message_buffer, sizeof(error_message_buffer),
        R"(The integer "%s" underflows in column "%s" in file "%s" in line "%d".)",
        column_content, column_name, file_name, file_line);
  }
};

struct invalid_single_character : base,
                                  with_file_name,
                                  with_file_line,
                                  with_column_name,
                                  with_column_content {
  void format_error_message() const override {
    std::snprintf(
        error_message_buffer, sizeof(error_message_buffer),
        R"(The content "%s" of column "%s" in file "%s" in line "%d" is not a single character.)",
        column_content, column_name, file_name, file_line);
  }
};
} // namespace error

/// Bit mask passed to CSVReader::read_header.
using ignore_column = unsigned int;
static const ignore_column ignore_no_column = 0;
static const ignore_column ignore_extra_column = 1;
static const ignore_column ignore_missing_column = 2;

/// Trim policy: strips the listed characters from both ends of a cell.
template <char... trim_char_list> struct trim_chars {
private:
  constexpr static bool is_trim_char(char) { return false; }

  template <class... OtherTrimChars>
  constexpr static bool is_trim_char(char c, char trim_char,
                                     OtherTrimChars... other_trim_chars) {
    return c == trim_char || is_trim_char(c, other_trim_chars...);
  }

public:
  /// Narrows [str_begin, str_end) in place and writes a NUL at the new end.
  static void trim(char *&str_begin, char *&str_end) {
    while (str_begin != str_end && is_trim_char(*str_begin, trim_char_list...))
      ++str_begin;
    while (str_begin != str_end &&
           is_trim_char(*(str_end - 1), trim_char_list...))
      --str_end;
    *str_end = '\0';
  }
};

/// Comment policy: no line is a comment.
struct no_comment {
  static bool is_comment(const char *) { return false; }
};

/// Comment policy: a line is a comment if its first character is listed.
template <char... comment_start_char_list> struct single_line_comment {
private:
  constexpr static bool is_comment_start_char(char) { return false; }

  template <class... OtherCommentStartChars>
  constexpr static bool
  is_comment_start_char(char c, char comment_start_char,
                        OtherCommentStartChars... other_comment_start_chars) {
    return c == comment_start_char ||
           is_comment_start_char(c, other_comment_start_chars...);
  }

public:
  static bool is_comment(const char *line) {
    return is_comment_start_char(*line, comment_start_char_list...);
  }
};

/// Comment policy: a line is a comment if it is empty or consists only of
/// spaces and tabs.
struct empty_line_comment {
  static bool is_comment(const char *line) {
    if (*line == '\0')
      return true;
    while (*line == ' ' || *line == '\t') {
      ++line;
      if (*line == 0)
        return true;
    }
    return false;
  }
};

/// Comment policy: union of single_line_comment and empty_line_comment.
template <char... comment_start_char_list>
struct single_and_empty_line_comment {
  static bool is_comment(const char *line) {
    return single_line_comment<comment_start_char_list...>::is_comment(line) ||
           empty_line_comment::is_comment(line);
  }
};

/// Quote policy: cells are separated by `sep`; there is no escaping.
template <char sep> struct no_quote_escape {
  static const char *find_next_column_end(const char *col_begin) {
    while (*col_begin != sep && *col_begin != '\0')
      ++col_begin;
    return col_begin;
  }

  static void unescape(char *&, char *&) {}
};

/// Quote policy: cells may be wrapped in `quote`; a doubled quote inside a
/// quoted cell stands for one literal quote character.
template <char sep, char quote> struct double_quote_escape {
  /// Throws error::escaped_string_not_closed when a quoted section has no
  /// closing quote before the end of the line.
  static const char *find_next_column_end(const char *col_begin) {
    while (*col_begin != sep && *col_begin != '\0')
      if (*col_begin != quote)
        ++col_begin;
      else {
        do {
          ++col_begin;
          while (*col_begin != quote) {
            if (*col_begin == '\0')
              throw error::escaped_string_not_closed();
            ++col_begin;
          }
          ++col_begin;
        } while (*col_begin == quote);
      }
    return col_begin;
  }

  /// Removes the surrounding quotes and collapses doubled quotes in place.
  static void unescape(char *&col_begin, char *&col_end) {
    if (col_end - col_begin >= 2) {
      if (*col_begin == quote && *(col_end - 1) == quote) {
        ++col_begin;
        --col_end;
        char *out = col_begin;
        for (char *in = col_begin; in != col_end; ++in) {
          if (*in == quote && (in + 1) != col_end && *(in + 1) == quote) {
            ++in;
          }
          *out = *in;
          ++out;
        }
        col_end = out;
        *col_end = '\0';
      }
    }
  }
};

/// Overflow policy: throw error::integer_overflow / integer_underflow.
struct throw_on_overflow {
  template <class T> static void on_overflow(T &) {
    throw error::integer_overflow();
  }

  template <class T> static void on_underflow(T &) {
    throw error::integer_underflow();
  }
};

/// Overflow policy: stop parsing and leave the value accumulated so far.
struct ignore_overflow {
  template <class T> static void on_overflow(T &) {}

  template <class T> static void on_underflow(T &) {}
};

/// Overflow policy: saturate at the limits of the target type.
struct set_to_max_on_overflow {
  template <class T> static void on_overflow(T &x) {
    // using (std::numeric_limits<T>::max) instead of
    // std::numeric_limits<T>::max to make code including windows.h with its max
    // macro happy
    x = (std::numeric_limits<T>::max)();
  }

  template <class T> static void on_underflow(T &x) {
    x = (std::numeric_limits<T>::min)();
  }
};

namespace detail {
/// Cuts the next cell off `line`: [col_begin, col_end) is the cell (NUL
/// terminated in place); `line` advances past the separator or becomes
/// nullptr when the cell was the last one.
template <class quote_policy>
void chop_next_column(char *&line, char *&col_begin, char *&col_end) {
  assert(line != nullptr);

  col_begin = line;
  // the col_begin + (... - col_begin) removes the constness
  col_end =
      col_begin + (quote_policy::find_next_column_end(col_begin) - col_begin);

  if (*col_end == '\0') {
    line = nullptr;
  } else {
    *col_end = '\0';
    line = col_end + 1;
  }
}

/// Splits a data line into cells. col_order[k] is the slot of sorted_col
/// that receives the k-th cell of the line, or -1 to drop that cell.
template <class trim_policy, class quote_policy>
void parse_line(char *line, char **sorted_col,
                const std::vector<int> &col_order) {
  for (int i : col_order) {
    if (line == nullptr)
      throw ::io::error::too_few_columns();
    char *col_begin, *col_end;
    chop_next_column<quote_policy>(line, col_begin, col_end);

    if (i != -1) {
      trim_policy::trim(col_begin, col_end);
      quote_policy::unescape(col_begin, col_end);

      sorted_col[i] = col_begin;
    }
  }
  if (line != nullptr)
    throw ::io::error::too_many_columns();
}

/// Parses the header line and rebuilds col_order so that it maps file
/// columns onto the indices of col_name (or -1 for ignored extra columns).
template <unsigned column_count, class trim_policy, class quote_policy>
void parse_header_line(char *line, std::vector<int> &col_order,
                       const std::string *col_name,
                       ignore_column ignore_policy) {
  col_order.clear();

  bool found[column_count];
  std::fill(found, found + column_count, false);
  while (line) {
    char *col_begin, *col_end;
    chop_next_column<quote_policy>(line, col_begin, col_end);

    trim_policy::trim(col_begin, col_end);
    quote_policy::unescape(col_begin, col_end);

    for (unsigned i = 0; i < column_count; ++i)
      if (col_begin == col_name[i]) {
        if (found[i]) {
          error::duplicated_column_in_header err;
          err.set_column_name(col_begin);
          throw err;
        }
        found[i] = true;
        col_order.push_back(i);
        // nullptr marks the header cell as matched.
        col_begin = nullptr;
        break;
      }
    if (col_begin) {
      if (ignore_policy & ::io::ignore_extra_column)
        col_order.push_back(-1);
      else {
        error::extra_column_in_header err;
        err.set_column_name(col_begin);
        throw err;
      }
    }
  }
  if (!(ignore_policy & ::io::ignore_missing_column)) {
    for (unsigned i = 0; i < column_count; ++i) {
      if (!found[i]) {
        error::missing_column_in_header err;
        err.set_column_name(col_name[i].c_str());
        throw err;
      }
    }
  }
}

/// Parses a cell that must consist of exactly one character.
template <class overflow_policy> void parse(char *col, char &x) {
  if (!*col)
    throw error::invalid_single_character();
  x = *col;
  ++col;
  if (*col)
    throw error::invalid_single_character();
}

template <class overflow_policy> void parse(char *col, std::string &x) {
  x = col;
}

// The pointer overloads alias the reader's line buffer; the result is only
// valid until the next line is read.
template <class overflow_policy> void parse(char *col, const char *&x) {
  x = col;
}

template <class overflow_policy> void parse(char *col, char *&x) { x = col; }

/// Parses a run of decimal digits into x. An empty string yields 0; a
/// non-digit throws error::no_digit; overflow is delegated to the policy.
template <class overflow_policy, class T>
void parse_unsigned_integer(const char *col, T &x) {
  x = 0;
  while (*col != '\0') {
    if ('0' <= *col && *col <= '9') {
      T y = *col - '0';
      // Equivalent to 10 * x + y > max, written so it cannot overflow.
      if (x > ((std::numeric_limits<T>::max)() - y) / 10) {
        overflow_policy::on_overflow(x);
        return;
      }
      x = 10 * x + y;
    } else
      throw error::no_digit();
    ++col;
  }
}

template <class overflow_policy> void parse(char *col, unsigned char &x) {
  parse_unsigned_integer<overflow_policy>(col, x);
}
template <class overflow_policy> void parse(char *col, unsigned short &x) {
  parse_unsigned_integer<overflow_policy>(col, x);
}
template <class overflow_policy> void parse(char *col, unsigned int &x) {
  parse_unsigned_integer<overflow_policy>(col, x);
}
template <class overflow_policy> void parse(char *col, unsigned long &x) {
  parse_unsigned_integer<overflow_policy>(col, x);
}
template <class overflow_policy> void parse(char *col, unsigned long long &x) {
  parse_unsigned_integer<overflow_policy>(col, x);
}

/// Like parse_unsigned_integer, with an optional leading '+' or '-'.
/// Negative values are accumulated downwards so that the minimum of T is
/// representable.
template <class overflow_policy, class T>
void parse_signed_integer(const char *col, T &x) {
  if (*col == '-') {
    ++col;

    x = 0;
    while (*col != '\0') {
      if ('0' <= *col && *col <= '9') {
        T y = *col - '0';
        // Equivalent to 10 * x - y < min, written so it cannot overflow.
        if (x < ((std::numeric_limits<T>::min)() + y) / 10) {
          overflow_policy::on_underflow(x);
          return;
        }
        x = 10 * x - y;
      } else
        throw error::no_digit();
      ++col;
    }
    return;
  } else if (*col == '+')
    ++col;
  parse_unsigned_integer<overflow_policy>(col, x);
}

template <class overflow_policy> void parse(char *col, signed char &x) {
  parse_signed_integer<overflow_policy>(col, x);
}
template <class overflow_policy> void parse(char *col, signed short &x) {
  parse_signed_integer<overflow_policy>(col, x);
}
template <class overflow_policy> void parse(char *col, signed int &x) {
  parse_signed_integer<overflow_policy>(col, x);
}
template <class overflow_policy> void parse(char *col, signed long &x) {
  parse_signed_integer<overflow_policy>(col, x);
}
template <class overflow_policy> void parse(char *col, signed long long &x) {
  parse_signed_integer<overflow_policy>(col, x);
}

/// Parses [+-]digits[(.|,)digits][(e|E)[+-]digits] into x.
/// Throws error::no_digit on any other trailing character.
template <class T> void parse_float(const char *col, T &x) {
  bool is_neg = false;
  if (*col == '-') {
    is_neg = true;
    ++col;
  } else if (*col == '+')
    ++col;

  x = 0;
  while ('0' <= *col && *col <= '9') {
    int y = *col - '0';
    x *= 10;
    x += y;
    ++col;
  }

  if (*col == '.' || *col == ',') {
    ++col;
    T pos = 1;
    while ('0' <= *col && *col <= '9') {
      pos /= 10;
      int y = *col - '0';
      ++col;
      x += y * pos;
    }
  }

  if (*col == 'e' || *col == 'E') {
    ++col;
    int e;

    // An out-of-range exponent saturates at the limits of int.
    parse_signed_integer<set_to_max_on_overflow>(col, e);

    if (e != 0) {
      T base;
      if (e < 0) {
        base = T(0.1);
        // -INT_MIN is not representable: negating it is undefined behaviour
        // and in practice kept e negative, so the loop below never reached 1.
        if (e == (std::numeric_limits<int>::min)())
          e = (std::numeric_limits<int>::max)();
        else
          e = -e;
      } else {
        base = T(10);
      }

      // Exponentiation by squaring: x *= base^e.
      while (e != 1) {
        if ((e & 1) == 0) {
          base = base * base;
          e >>= 1;
        } else {
          x *= base;
          --e;
        }
      }
      x *= base;
    }
  } else {
    if (*col != '\0')
      throw error::no_digit();
  }

  if (is_neg)
    x = -x;
}

template <class overflow_policy> void parse(char *col, float &x) {
  parse_float(col, x);
}
template <class overflow_policy> void parse(char *col, double &x) {
  parse_float(col, x);
}
template <class overflow_policy> void parse(char *col, long double &x) {
  parse_float(col, x);
}

/// Fallback selected for unsupported target types; fails at compile time.
template <class overflow_policy, class T> void parse(char *col, T &x) {
  // Mute unused variable compiler warning
  (void)col;
  (void)x;
  // GCC evaluates "false" when reading the template and
  // "sizeof(T)!=sizeof(T)" only when instantiating it. This is why
  // this strange construct is used.
  static_assert(sizeof(T) != sizeof(T),
                "Can not parse this type. Only builtin integrals, floats, "
                "char, char*, const char* and std::string are supported");
}

} // namespace detail

/// Reads CSV rows with a fixed number of requested columns.
///
/// Constructor arguments are forwarded to LineReader. Typical use is
/// read_header() (or set_header() for files without a header) followed by
/// read_row() until it returns false. Errors are reported as exceptions from
/// io::error carrying the file name and, where the error type stores one,
/// the line number.
template <unsigned column_count, class trim_policy = trim_chars<' ', '\t'>,
          class quote_policy = no_quote_escape<','>,
          class overflow_policy = throw_on_overflow,
          class comment_policy = no_comment>
class CSVReader {
private:
  LineReader in;

  char *row[column_count];
  std::string column_names[column_count];

  std::vector<int> col_order;

  template <class... ColNames>
  void set_column_names(std::string s, ColNames... cols) {
    column_names[column_count - sizeof...(ColNames) - 1] = std::move(s);
    set_column_names(std::forward<ColNames>(cols)...);
  }

  void set_column_names() {}

public:
  CSVReader() = delete;
  CSVReader(const CSVReader &) = delete;
  // Was declared without a definition; deleted so misuse is diagnosed at
  // compile time, matching the copy constructor.
  CSVReader &operator=(const CSVReader &) = delete;

  template <class... Args>
  explicit CSVReader(Args &&... args) : in(std::forward<Args>(args)...) {
    std::fill(row, row + column_count, nullptr);
    col_order.resize(column_count);
    for (unsigned i = 0; i < column_count; ++i)
      col_order[i] = i;
    for (unsigned i = 1; i <= column_count; ++i)
      column_names[i - 1] = "col" + std::to_string(i);
  }

  /// Returns the next raw line without parsing it (nullptr at end of input).
  char *next_line() { return in.next_line(); }

  /// Reads the first non-comment line as the header and maps the given
  /// column names onto the columns of the file.
  ///
  /// Throws error::header_missing, extra_column_in_header,
  /// missing_column_in_header, duplicated_column_in_header, or an error of
  /// the quote policy, depending on ignore_policy and the file content.
  template <class... ColNames>
  void read_header(ignore_column ignore_policy, ColNames... cols) {
    static_assert(sizeof...(ColNames) >= column_count,
                  "not enough column names specified");
    static_assert(sizeof...(ColNames) <= column_count,
                  "too many column names specified");
    try {
      try {
        set_column_names(std::forward<ColNames>(cols)...);

        char *line;
        do {
          line = in.next_line();
          if (!line)
            throw error::header_missing();
        } while (comment_policy::is_comment(line));

        detail::parse_header_line<column_count, trim_policy, quote_policy>(
            line, col_order, column_names, ignore_policy);
      } catch (error::with_file_name &err) {
        err.set_file_name(in.get_truncated_file_name());
        throw;
      }
    } catch (error::with_file_line &err) {
      // Errors raised while splitting the header (e.g. an unclosed quote)
      // carry a line number too; fill it in exactly as read_row does.
      err.set_file_line(in.get_file_line());
      throw;
    }
  }

  /// Declares the column names for a file without a header line; columns
  /// are then taken in file order.
  template <class... ColNames> void set_header(ColNames... cols) {
    static_assert(sizeof...(ColNames) >= column_count,
                  "not enough column names specified");
    static_assert(sizeof...(ColNames) <= column_count,
                  "too many column names specified");
    set_column_names(std::forward<ColNames>(cols)...);
    std::fill(row, row + column_count, nullptr);
    col_order.resize(column_count);
    for (unsigned i = 0; i < column_count; ++i)
      col_order[i] = i;
  }

  /// True if `name` is one of the requested columns and is present in the
  /// current column mapping.
  bool has_column(const std::string &name) const {
    return col_order.end() !=
           std::find(col_order.begin(), col_order.end(),
                     std::find(std::begin(column_names), std::end(column_names),
                               name) -
                         std::begin(column_names));
  }

  void set_file_name(const std::string &file_name) {
    in.set_file_name(file_name);
  }

  void set_file_name(const char *file_name) { in.set_file_name(file_name); }

  const char *get_truncated_file_name() const {
    return in.get_truncated_file_name();
  }

  void set_file_line(unsigned file_line) { in.set_file_line(file_line); }

  unsigned get_file_line() const { return in.get_file_line(); }

private:
  void parse_helper(std::size_t) {}

  template <class T, class... ColType>
  void parse_helper(std::size_t r, T &t, ColType &... cols) {
    // row[r] stays nullptr for a requested column that the file does not
    // contain; the output variable is then left untouched.
    if (row[r]) {
      try {
        try {
          ::io::detail::parse<overflow_policy>(row[r], t);
        } catch (error::with_column_content &err) {
          err.set_column_content(row[r]);
          throw;
        }
      } catch (error::with_column_name &err) {
        err.set_column_name(column_names[r].c_str());
        throw;
      }
    }
    parse_helper(r + 1, cols...);
  }

public:
  /// Parses the next non-comment line into `cols`.
  /// Returns false at end of input, true otherwise.
  template <class... ColType> bool read_row(ColType &... cols) {
    static_assert(sizeof...(ColType) >= column_count,
                  "not enough columns specified");
    static_assert(sizeof...(ColType) <= column_count,
                  "too many columns specified");
    try {
      try {

        char *line;
        do {
          line = in.next_line();
          if (!line)
            return false;
        } while (comment_policy::is_comment(line));

        detail::parse_line<trim_policy, quote_policy>(line, row, col_order);

        parse_helper(0, cols...);
      } catch (error::with_file_name &err) {
        err.set_file_name(in.get_truncated_file_name());
        throw;
      }
    } catch (error::with_file_line &err) {
      err.set_file_line(in.get_file_line());
      throw;
    }

    return true;
  }
};
} // namespace io
#endif