File indexing completed on 2024-04-21 03:45:00

0001 // Copyright: (2012-2015) Ben Strasser <code@ben-strasser.net>
0002 // License: BSD-3
0003 //
0004 // All rights reserved.
0005 //
0006 // Redistribution and use in source and binary forms, with or without
0007 // modification, are permitted provided that the following conditions are met:
0008 //
0009 // 1. Redistributions of source code must retain the above copyright notice,
0010 //    this list of conditions and the following disclaimer.
0011 //
0012 // 2. Redistributions in binary form must reproduce the above copyright notice,
0013 //    this list of conditions and the following disclaimer in the documentation
0014 //    and/or other materials provided with the distribution.
0015 //
0016 // 3. Neither the name of the copyright holder nor the names of its contributors
0017 //    may be used to endorse or promote products derived from this software
0018 //    without specific prior written permission.
0019 //
0020 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
0021 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
0022 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
0023 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
0024 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
0025 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
0026 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
0027 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
0028 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
0029 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
0030 // POSSIBILITY OF SUCH DAMAGE.
0031 
0032 #ifndef CSV_H
0033 #define CSV_H
0034 
0035 #include <algorithm>
0036 #include <cstdio>
0037 #include <cstring>
0038 #include <exception>
0039 #include <string>
0040 #include <utility>
0041 #include <vector>
0042 #ifndef CSV_IO_NO_THREAD
0043 #include <condition_variable>
0044 #include <mutex>
0045 #include <thread>
0046 #endif
0047 #include <cassert>
0048 #include <cerrno>
0049 #include <istream>
0050 #include <limits>
0051 #include <memory>
0052 
0053 namespace io {
0054 ////////////////////////////////////////////////////////////////////////////
0055 //                                 LineReader                             //
0056 ////////////////////////////////////////////////////////////////////////////
0057 
namespace error {
/// Root of the exception types defined in this namespace.
///
/// The message text is not stored when the error is created. It is rendered
/// on demand by what(), which asks the concrete type to print its fields
/// into error_message_buffer.
struct base : std::exception {
  /// Writes the human readable description into error_message_buffer.
  virtual void format_error_message() const = 0;

  /// Renders the message and returns a pointer into this object, so the
  /// returned pointer is only usable while the exception object is alive.
  const char *what() const noexcept override {
    format_error_message();
    return error_message_buffer;
  }

  // mutable because what() is const but has to fill the buffer
  mutable char error_message_buffer[2048];
};

// this only affects the file name in the error message
const int max_file_name_length = 1024;

/// Mixin that stores a (possibly truncated) file name for error messages.
struct with_file_name {
  with_file_name() { std::memset(file_name, 0, sizeof(file_name)); }

  /// Copies at most max_file_name_length characters of `file_name`;
  /// a null pointer clears the stored name.
  void set_file_name(const char *file_name) {
    if (file_name != nullptr) {
      // This call to strncpy has parenthesis around it
      // to silence the GCC -Wstringop-truncation warning.
      // Qualified with std:: because <cstring> only guarantees that name.
      (std::strncpy(this->file_name, file_name, sizeof(this->file_name)));
      // strncpy does not terminate the destination when it truncates
      this->file_name[sizeof(this->file_name) - 1] = '\0';
    } else {
      this->file_name[0] = '\0';
    }
  }

  char file_name[max_file_name_length + 1];
};

/// Mixin that stores a line number; -1 until set_file_line() is called.
struct with_file_line {
  with_file_line() { file_line = -1; }

  void set_file_line(int file_line) { this->file_line = file_line; }

  int file_line;
};

/// Mixin that stores an errno value; 0 until set_errno() is called.
struct with_errno {
  with_errno() { errno_value = 0; }

  void set_errno(int errno_value) { this->errno_value = errno_value; }

  int errno_value;
};

/// Thrown by LineReader when the input file can not be opened.
struct can_not_open_file : base, with_file_name, with_errno {
  void format_error_message() const override {
    // Only mention a reason when an errno value was recorded.
    if (errno_value != 0)
      std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                    "Can not open file \"%s\" because \"%s\".", file_name,
                    std::strerror(errno_value));
    else
      std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                    "Can not open file \"%s\".", file_name);
  }
};

/// Thrown by LineReader::next_line() when a line does not fit into one
/// read block.
struct line_length_limit_exceeded : base, with_file_name, with_file_line {
  void format_error_message() const override {
    // LineReader rejects a line once its length including the terminator
    // exceeds block_len (1 << 20), so the longest accepted line has
    // 2^20-1 characters.
    std::snprintf(
        error_message_buffer, sizeof(error_message_buffer),
        "Line number %d in file \"%s\" exceeds the maximum length of 2^20-1.",
        file_line, file_name);
  }
};
} // namespace error
0127 
/// Interface for anything LineReader can pull raw bytes from.
///
/// read() is handed a destination buffer and its capacity and reports how
/// many bytes it stored there. The readers in this file treat a result of 0
/// as "no more data".
class ByteSourceBase {
public:
  virtual int read(char *buffer, int size) = 0;
  virtual ~ByteSourceBase() = default;
};
0133 
0134 namespace detail {
0135 
0136 class OwningStdIOByteSourceBase : public ByteSourceBase {
0137 public:
0138   explicit OwningStdIOByteSourceBase(FILE *file) : file(file) {
0139     // Tell the std library that we want to do the buffering ourself.
0140     std::setvbuf(file, 0, _IONBF, 0);
0141   }
0142 
0143   int read(char *buffer, int size) { return std::fread(buffer, 1, size, file); }
0144 
0145   ~OwningStdIOByteSourceBase() { std::fclose(file); }
0146 
0147 private:
0148   FILE *file;
0149 };
0150 
0151 class NonOwningIStreamByteSource : public ByteSourceBase {
0152 public:
0153   explicit NonOwningIStreamByteSource(std::istream &in) : in(in) {}
0154 
0155   int read(char *buffer, int size) {
0156     in.read(buffer, size);
0157     return in.gcount();
0158   }
0159 
0160   ~NonOwningIStreamByteSource() {}
0161 
0162 private:
0163   std::istream &in;
0164 };
0165 
0166 class NonOwningStringByteSource : public ByteSourceBase {
0167 public:
0168   NonOwningStringByteSource(const char *str, long long size)
0169       : str(str), remaining_byte_count(size) {}
0170 
0171   int read(char *buffer, int desired_byte_count) {
0172     int to_copy_byte_count = desired_byte_count;
0173     if (remaining_byte_count < to_copy_byte_count)
0174       to_copy_byte_count = remaining_byte_count;
0175     std::memcpy(buffer, str, to_copy_byte_count);
0176     remaining_byte_count -= to_copy_byte_count;
0177     str += to_copy_byte_count;
0178     return to_copy_byte_count;
0179   }
0180 
0181   ~NonOwningStringByteSource() {}
0182 
0183 private:
0184   const char *str;
0185   long long remaining_byte_count;
0186 };
0187 
0188 #ifndef CSV_IO_NO_THREAD
0189 class AsynchronousReader {
0190 public:
0191   void init(std::unique_ptr<ByteSourceBase> arg_byte_source) {
0192     std::unique_lock<std::mutex> guard(lock);
0193     byte_source = std::move(arg_byte_source);
0194     desired_byte_count = -1;
0195     termination_requested = false;
0196     worker = std::thread([&] {
0197       std::unique_lock<std::mutex> guard(lock);
0198       try {
0199         for (;;) {
0200           read_requested_condition.wait(guard, [&] {
0201             return desired_byte_count != -1 || termination_requested;
0202           });
0203           if (termination_requested)
0204             return;
0205 
0206           read_byte_count = byte_source->read(buffer, desired_byte_count);
0207           desired_byte_count = -1;
0208           if (read_byte_count == 0)
0209             break;
0210           read_finished_condition.notify_one();
0211         }
0212       } catch (...) {
0213         read_error = std::current_exception();
0214       }
0215       read_finished_condition.notify_one();
0216     });
0217   }
0218 
0219   bool is_valid() const { return byte_source != nullptr; }
0220 
0221   void start_read(char *arg_buffer, int arg_desired_byte_count) {
0222     std::unique_lock<std::mutex> guard(lock);
0223     buffer = arg_buffer;
0224     desired_byte_count = arg_desired_byte_count;
0225     read_byte_count = -1;
0226     read_requested_condition.notify_one();
0227   }
0228 
0229   int finish_read() {
0230     std::unique_lock<std::mutex> guard(lock);
0231     read_finished_condition.wait(
0232         guard, [&] { return read_byte_count != -1 || read_error; });
0233     if (read_error)
0234       std::rethrow_exception(read_error);
0235     else
0236       return read_byte_count;
0237   }
0238 
0239   ~AsynchronousReader() {
0240     if (byte_source != nullptr) {
0241       {
0242         std::unique_lock<std::mutex> guard(lock);
0243         termination_requested = true;
0244       }
0245       read_requested_condition.notify_one();
0246       worker.join();
0247     }
0248   }
0249 
0250 private:
0251   std::unique_ptr<ByteSourceBase> byte_source;
0252 
0253   std::thread worker;
0254 
0255   bool termination_requested;
0256   std::exception_ptr read_error;
0257   char *buffer;
0258   int desired_byte_count;
0259   int read_byte_count;
0260 
0261   std::mutex lock;
0262   std::condition_variable read_finished_condition;
0263   std::condition_variable read_requested_condition;
0264 };
0265 #endif
0266 
0267 class SynchronousReader {
0268 public:
0269   void init(std::unique_ptr<ByteSourceBase> arg_byte_source) {
0270     byte_source = std::move(arg_byte_source);
0271   }
0272 
0273   bool is_valid() const { return byte_source != nullptr; }
0274 
0275   void start_read(char *arg_buffer, int arg_desired_byte_count) {
0276     buffer = arg_buffer;
0277     desired_byte_count = arg_desired_byte_count;
0278   }
0279 
0280   int finish_read() { return byte_source->read(buffer, desired_byte_count); }
0281 
0282 private:
0283   std::unique_ptr<ByteSourceBase> byte_source;
0284   char *buffer;
0285   int desired_byte_count;
0286 };
0287 } // namespace detail
0288 
0289 class LineReader {
0290 private:
0291   static const int block_len = 1 << 20;
0292   std::unique_ptr<char[]> buffer; // must be constructed before (and thus
0293                                   // destructed after) the reader!
0294 #ifdef CSV_IO_NO_THREAD
0295   detail::SynchronousReader reader;
0296 #else
0297   detail::AsynchronousReader reader;
0298 #endif
0299   int data_begin;
0300   int data_end;
0301 
0302   char file_name[error::max_file_name_length + 1];
0303   unsigned file_line;
0304 
0305   static std::unique_ptr<ByteSourceBase> open_file(const char *file_name) {
0306     // We open the file in binary mode as it makes no difference under *nix
0307     // and under Windows we handle \r\n newlines ourself.
0308     FILE *file = std::fopen(file_name, "rb");
0309     if (file == 0) {
0310       int x = errno; // store errno as soon as possible, doing it after
0311                      // constructor call can fail.
0312       error::can_not_open_file err;
0313       err.set_errno(x);
0314       err.set_file_name(file_name);
0315       throw err;
0316     }
0317     return std::unique_ptr<ByteSourceBase>(
0318         new detail::OwningStdIOByteSourceBase(file));
0319   }
0320 
0321   void init(std::unique_ptr<ByteSourceBase> byte_source) {
0322     file_line = 0;
0323 
0324     buffer = std::unique_ptr<char[]>(new char[3 * block_len]);
0325     data_begin = 0;
0326     data_end = byte_source->read(buffer.get(), 2 * block_len);
0327 
0328     // Ignore UTF-8 BOM
0329     if (data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' &&
0330         buffer[2] == '\xBF')
0331       data_begin = 3;
0332 
0333     if (data_end == 2 * block_len) {
0334       reader.init(std::move(byte_source));
0335       reader.start_read(buffer.get() + 2 * block_len, block_len);
0336     }
0337   }
0338 
0339 public:
0340   LineReader() = delete;
0341   LineReader(const LineReader &) = delete;
0342   LineReader &operator=(const LineReader &) = delete;
0343 
0344   explicit LineReader(const char *file_name) {
0345     set_file_name(file_name);
0346     init(open_file(file_name));
0347   }
0348 
0349   explicit LineReader(const std::string &file_name) {
0350     set_file_name(file_name.c_str());
0351     init(open_file(file_name.c_str()));
0352   }
0353 
0354   LineReader(const char *file_name,
0355              std::unique_ptr<ByteSourceBase> byte_source) {
0356     set_file_name(file_name);
0357     init(std::move(byte_source));
0358   }
0359 
0360   LineReader(const std::string &file_name,
0361              std::unique_ptr<ByteSourceBase> byte_source) {
0362     set_file_name(file_name.c_str());
0363     init(std::move(byte_source));
0364   }
0365 
0366   LineReader(const char *file_name, const char *data_begin,
0367              const char *data_end) {
0368     set_file_name(file_name);
0369     init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningStringByteSource(
0370         data_begin, data_end - data_begin)));
0371   }
0372 
0373   LineReader(const std::string &file_name, const char *data_begin,
0374              const char *data_end) {
0375     set_file_name(file_name.c_str());
0376     init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningStringByteSource(
0377         data_begin, data_end - data_begin)));
0378   }
0379 
0380   LineReader(const char *file_name, FILE *file) {
0381     set_file_name(file_name);
0382     init(std::unique_ptr<ByteSourceBase>(
0383         new detail::OwningStdIOByteSourceBase(file)));
0384   }
0385 
0386   LineReader(const std::string &file_name, FILE *file) {
0387     set_file_name(file_name.c_str());
0388     init(std::unique_ptr<ByteSourceBase>(
0389         new detail::OwningStdIOByteSourceBase(file)));
0390   }
0391 
0392   LineReader(const char *file_name, std::istream &in) {
0393     set_file_name(file_name);
0394     init(std::unique_ptr<ByteSourceBase>(
0395         new detail::NonOwningIStreamByteSource(in)));
0396   }
0397 
0398   LineReader(const std::string &file_name, std::istream &in) {
0399     set_file_name(file_name.c_str());
0400     init(std::unique_ptr<ByteSourceBase>(
0401         new detail::NonOwningIStreamByteSource(in)));
0402   }
0403 
0404   void set_file_name(const std::string &file_name) {
0405     set_file_name(file_name.c_str());
0406   }
0407 
0408   void set_file_name(const char *file_name) {
0409     if (file_name != nullptr) {
0410       strncpy(this->file_name, file_name, sizeof(this->file_name));
0411       this->file_name[sizeof(this->file_name) - 1] = '\0';
0412     } else {
0413       this->file_name[0] = '\0';
0414     }
0415   }
0416 
0417   const char *get_truncated_file_name() const { return file_name; }
0418 
0419   void set_file_line(unsigned file_line) { this->file_line = file_line; }
0420 
0421   unsigned get_file_line() const { return file_line; }
0422 
0423   char *next_line() {
0424     if (data_begin == data_end)
0425       return nullptr;
0426 
0427     ++file_line;
0428 
0429     assert(data_begin < data_end);
0430     assert(data_end <= block_len * 2);
0431 
0432     if (data_begin >= block_len) {
0433       std::memcpy(buffer.get(), buffer.get() + block_len, block_len);
0434       data_begin -= block_len;
0435       data_end -= block_len;
0436       if (reader.is_valid()) {
0437         data_end += reader.finish_read();
0438         std::memcpy(buffer.get() + block_len, buffer.get() + 2 * block_len,
0439                     block_len);
0440         reader.start_read(buffer.get() + 2 * block_len, block_len);
0441       }
0442     }
0443 
0444     int line_end = data_begin;
0445     while (line_end != data_end && buffer[line_end] != '\n') {
0446       ++line_end;
0447     }
0448 
0449     if (line_end - data_begin + 1 > block_len) {
0450       error::line_length_limit_exceeded err;
0451       err.set_file_name(file_name);
0452       err.set_file_line(file_line);
0453       throw err;
0454     }
0455 
0456     if (line_end != data_end && buffer[line_end] == '\n') {
0457       buffer[line_end] = '\0';
0458     } else {
0459       // some files are missing the newline at the end of the
0460       // last line
0461       ++data_end;
0462       buffer[line_end] = '\0';
0463     }
0464 
0465     // handle windows \r\n-line breaks
0466     if (line_end != data_begin && buffer[line_end - 1] == '\r')
0467       buffer[line_end - 1] = '\0';
0468 
0469     char *ret = buffer.get() + data_begin;
0470     data_begin = line_end + 1;
0471     return ret;
0472   }
0473 };
0474 
0475 ////////////////////////////////////////////////////////////////////////////
0476 //                                 CSV                                    //
0477 ////////////////////////////////////////////////////////////////////////////
0478 
0479 namespace error {
0480 const int max_column_name_length = 63;
0481 struct with_column_name {
0482   with_column_name() {
0483     std::memset(column_name, 0, max_column_name_length + 1);
0484   }
0485 
0486   void set_column_name(const char *column_name) {
0487     if (column_name != nullptr) {
0488       std::strncpy(this->column_name, column_name, max_column_name_length);
0489       this->column_name[max_column_name_length] = '\0';
0490     } else {
0491       this->column_name[0] = '\0';
0492     }
0493   }
0494 
0495   char column_name[max_column_name_length + 1];
0496 };
0497 
0498 const int max_column_content_length = 63;
0499 
0500 struct with_column_content {
0501   with_column_content() {
0502     std::memset(column_content, 0, max_column_content_length + 1);
0503   }
0504 
0505   void set_column_content(const char *column_content) {
0506     if (column_content != nullptr) {
0507       std::strncpy(this->column_content, column_content,
0508                    max_column_content_length);
0509       this->column_content[max_column_content_length] = '\0';
0510     } else {
0511       this->column_content[0] = '\0';
0512     }
0513   }
0514 
0515   char column_content[max_column_content_length + 1];
0516 };
0517 
0518 struct extra_column_in_header : base, with_file_name, with_column_name {
0519   void format_error_message() const override {
0520     std::snprintf(error_message_buffer, sizeof(error_message_buffer),
0521                   R"(Extra column "%s" in header of file "%s".)", column_name,
0522                   file_name);
0523   }
0524 };
0525 
0526 struct missing_column_in_header : base, with_file_name, with_column_name {
0527   void format_error_message() const override {
0528     std::snprintf(error_message_buffer, sizeof(error_message_buffer),
0529                   R"(Missing column "%s" in header of file "%s".)", column_name,
0530                   file_name);
0531   }
0532 };
0533 
0534 struct duplicated_column_in_header : base, with_file_name, with_column_name {
0535   void format_error_message() const override {
0536     std::snprintf(error_message_buffer, sizeof(error_message_buffer),
0537                   R"(Duplicated column "%s" in header of file "%s".)",
0538                   column_name, file_name);
0539   }
0540 };
0541 
0542 struct header_missing : base, with_file_name {
0543   void format_error_message() const override {
0544     std::snprintf(error_message_buffer, sizeof(error_message_buffer),
0545                   "Header missing in file \"%s\".", file_name);
0546   }
0547 };
0548 
0549 struct too_few_columns : base, with_file_name, with_file_line {
0550   void format_error_message() const override {
0551     std::snprintf(error_message_buffer, sizeof(error_message_buffer),
0552                   "Too few columns in line %d in file \"%s\".", file_line,
0553                   file_name);
0554   }
0555 };
0556 
0557 struct too_many_columns : base, with_file_name, with_file_line {
0558   void format_error_message() const override {
0559     std::snprintf(error_message_buffer, sizeof(error_message_buffer),
0560                   "Too many columns in line %d in file \"%s\".", file_line,
0561                   file_name);
0562   }
0563 };
0564 
0565 struct escaped_string_not_closed : base, with_file_name, with_file_line {
0566   void format_error_message() const override {
0567     std::snprintf(error_message_buffer, sizeof(error_message_buffer),
0568                   "Escaped string was not closed in line %d in file \"%s\".",
0569                   file_line, file_name);
0570   }
0571 };
0572 
0573 struct integer_must_be_positive : base,
0574                                   with_file_name,
0575                                   with_file_line,
0576                                   with_column_name,
0577                                   with_column_content {
0578   void format_error_message() const override {
0579     std::snprintf(
0580         error_message_buffer, sizeof(error_message_buffer),
0581         R"(The integer "%s" must be positive or 0 in column "%s" in file "%s" in line "%d".)",
0582         column_content, column_name, file_name, file_line);
0583   }
0584 };
0585 
0586 struct no_digit : base,
0587                   with_file_name,
0588                   with_file_line,
0589                   with_column_name,
0590                   with_column_content {
0591   void format_error_message() const override {
0592     std::snprintf(
0593         error_message_buffer, sizeof(error_message_buffer),
0594         R"(The integer "%s" contains an invalid digit in column "%s" in file "%s" in line "%d".)",
0595         column_content, column_name, file_name, file_line);
0596   }
0597 };
0598 
0599 struct integer_overflow : base,
0600                           with_file_name,
0601                           with_file_line,
0602                           with_column_name,
0603                           with_column_content {
0604   void format_error_message() const override {
0605     std::snprintf(
0606         error_message_buffer, sizeof(error_message_buffer),
0607         R"(The integer "%s" overflows in column "%s" in file "%s" in line "%d".)",
0608         column_content, column_name, file_name, file_line);
0609   }
0610 };
0611 
0612 struct integer_underflow : base,
0613                            with_file_name,
0614                            with_file_line,
0615                            with_column_name,
0616                            with_column_content {
0617   void format_error_message() const override {
0618     std::snprintf(
0619         error_message_buffer, sizeof(error_message_buffer),
0620         R"(The integer "%s" underflows in column "%s" in file "%s" in line "%d".)",
0621         column_content, column_name, file_name, file_line);
0622   }
0623 };
0624 
0625 struct invalid_single_character : base,
0626                                   with_file_name,
0627                                   with_file_line,
0628                                   with_column_name,
0629                                   with_column_content {
0630   void format_error_message() const override {
0631     std::snprintf(
0632         error_message_buffer, sizeof(error_message_buffer),
0633         R"(The content "%s" of column "%s" in file "%s" in line "%d" is not a single character.)",
0634         column_content, column_name, file_name, file_line);
0635   }
0636 };
0637 } // namespace error
0638 
// Bit flags telling the header parser which header mismatches to tolerate;
// the values can be combined with |.
typedef unsigned int ignore_column;
static const ignore_column ignore_no_column = 0u;
static const ignore_column ignore_extra_column = 1u << 0;
static const ignore_column ignore_missing_column = 1u << 1;
0643 
/// Trim policy: strips every leading and trailing character that appears in
/// `trim_char_list` from a field.
template <char... trim_char_list> struct trim_chars {
private:
  // True when `c` is one of the template's characters.
  static bool is_trim_char(char c) {
    // One spare slot keeps the array non-empty for an empty list; the loop
    // never looks at it.
    const char candidates[sizeof...(trim_char_list) + 1] = {trim_char_list...};
    for (std::size_t i = 0; i < sizeof...(trim_char_list); ++i) {
      if (candidates[i] == c)
        return true;
    }
    return false;
  }

public:
  /// Narrows [str_begin, str_end) to the untrimmed core and writes a NUL at
  /// the new end, so the result is a C string starting at str_begin.
  static void trim(char *&str_begin, char *&str_end) {
    for (; str_begin != str_end; ++str_begin) {
      if (!is_trim_char(*str_begin))
        break;
    }
    for (; str_begin != str_end; --str_end) {
      if (!is_trim_char(str_end[-1]))
        break;
    }
    *str_end = '\0';
  }
};
0664 
/// Comment policy that never treats a line as a comment.
struct no_comment {
  static bool is_comment(const char *line) {
    (void)line; // the content is irrelevant for this policy
    return false;
  }
};
0668 
/// Comment policy: a line is a comment when its first character is one of
/// `comment_start_char_list`.
template <char... comment_start_char_list> struct single_line_comment {
private:
  // True when `c` is one of the template's characters.
  static bool is_comment_start_char(char c) {
    // One spare slot keeps the array non-empty for an empty list; the loop
    // never looks at it.
    const char starters[sizeof...(comment_start_char_list) + 1] = {
        comment_start_char_list...};
    for (std::size_t i = 0; i < sizeof...(comment_start_char_list); ++i) {
      if (starters[i] == c)
        return true;
    }
    return false;
  }

public:
  static bool is_comment(const char *line) {
    const char first = *line;
    return is_comment_start_char(first);
  }
};
0686 
/// Comment policy: a line is a comment when it is empty or consists only of
/// spaces and tabs.
struct empty_line_comment {
  static bool is_comment(const char *line) {
    const char *cursor = line;
    // skip the blank prefix; the line is "empty" iff nothing follows it
    while (*cursor == ' ' || *cursor == '\t')
      ++cursor;
    return *cursor == '\0';
  }
};
0699 
0700 template <char... comment_start_char_list>
0701 struct single_and_empty_line_comment {
0702   static bool is_comment(const char *line) {
0703     return single_line_comment<comment_start_char_list...>::is_comment(line) ||
0704            empty_line_comment::is_comment(line);
0705   }
0706 };
0707 
/// Quote policy without quoting: a column ends at the next `sep` or at the
/// end of the line, and fields are never rewritten.
template <char sep> struct no_quote_escape {
  /// Returns a pointer to the separator or NUL that ends the column starting
  /// at `col_begin`.
  static const char *find_next_column_end(const char *col_begin) {
    const char *cursor = col_begin;
    for (; *cursor != '\0'; ++cursor) {
      if (*cursor == sep)
        break;
    }
    return cursor;
  }

  /// Nothing to unescape for this policy.
  static void unescape(char *&, char *&) {}
};
0717 
0718 template <char sep, char quote> struct double_quote_escape {
0719   static const char *find_next_column_end(const char *col_begin) {
0720     while (*col_begin != sep && *col_begin != '\0')
0721       if (*col_begin != quote)
0722         ++col_begin;
0723       else {
0724         do {
0725           ++col_begin;
0726           while (*col_begin != quote) {
0727             if (*col_begin == '\0')
0728               throw error::escaped_string_not_closed();
0729             ++col_begin;
0730           }
0731           ++col_begin;
0732         } while (*col_begin == quote);
0733       }
0734     return col_begin;
0735   }
0736 
0737   static void unescape(char *&col_begin, char *&col_end) {
0738     if (col_end - col_begin >= 2) {
0739       if (*col_begin == quote && *(col_end - 1) == quote) {
0740         ++col_begin;
0741         --col_end;
0742         char *out = col_begin;
0743         for (char *in = col_begin; in != col_end; ++in) {
0744           if (*in == quote && (in + 1) != col_end && *(in + 1) == quote) {
0745             ++in;
0746           }
0747           *out = *in;
0748           ++out;
0749         }
0750         col_end = out;
0751         *col_end = '\0';
0752       }
0753     }
0754   }
0755 };
0756 
0757 struct throw_on_overflow {
0758   template <class T> static void on_overflow(T &) {
0759     throw error::integer_overflow();
0760   }
0761 
0762   template <class T> static void on_underflow(T &) {
0763     throw error::integer_underflow();
0764   }
0765 };
0766 
/// Overflow policy that does nothing: the value is left exactly as the
/// caller passed it.
struct ignore_overflow {
  template <class T> static void on_overflow(T &value) { (void)value; }

  template <class T> static void on_underflow(T &value) { (void)value; }
};
0772 
/// Overflow policy that saturates: the value becomes the largest
/// (overflow) or smallest (underflow) value of its type.
struct set_to_max_on_overflow {
  template <class T> static void on_overflow(T &x) {
    typedef std::numeric_limits<T> limits;
    // The parentheses around limits::max keep a function-like max macro (as
    // defined by windows.h) from being expanded here.
    x = (limits::max)();
  }

  template <class T> static void on_underflow(T &x) {
    typedef std::numeric_limits<T> limits;
    // same macro protection as above, for min
    x = (limits::min)();
  }
};
0785 
0786 namespace detail {
/// Cuts the first column off `line`.
///
/// On return [col_begin, col_end) is that column, NUL-terminated in place.
/// `line` then points behind the separator, or is nullptr when the column
/// was the last one of the line.
template <class quote_policy>
void chop_next_column(char *&line, char *&col_begin, char *&col_end) {
  assert(line != nullptr);

  col_begin = line;
  const char *const stop = quote_policy::find_next_column_end(col_begin);
  // adding the offset to the non-const pointer drops the constness of `stop`
  col_end = col_begin + (stop - col_begin);

  const bool is_last_column = (*col_end == '\0');
  if (!is_last_column)
    *col_end = '\0'; // overwrite the separator to terminate the column
  line = is_last_column ? nullptr : col_end + 1;
}
0803 
0804 template <class trim_policy, class quote_policy>
0805 void parse_line(char *line, char **sorted_col,
0806                 const std::vector<int> &col_order) {
0807   for (int i : col_order) {
0808     if (line == nullptr)
0809       throw ::io::error::too_few_columns();
0810     char *col_begin, *col_end;
0811     chop_next_column<quote_policy>(line, col_begin, col_end);
0812 
0813     if (i != -1) {
0814       trim_policy::trim(col_begin, col_end);
0815       quote_policy::unescape(col_begin, col_end);
0816 
0817       sorted_col[i] = col_begin;
0818     }
0819   }
0820   if (line != nullptr)
0821     throw ::io::error::too_many_columns();
0822 }
0823 
0824 template <unsigned column_count, class trim_policy, class quote_policy>
0825 void parse_header_line(char *line, std::vector<int> &col_order,
0826                        const std::string *col_name,
0827                        ignore_column ignore_policy) {
0828   col_order.clear();
0829 
0830   bool found[column_count];
0831   std::fill(found, found + column_count, false);
0832   while (line) {
0833     char *col_begin, *col_end;
0834     chop_next_column<quote_policy>(line, col_begin, col_end);
0835 
0836     trim_policy::trim(col_begin, col_end);
0837     quote_policy::unescape(col_begin, col_end);
0838 
0839     for (unsigned i = 0; i < column_count; ++i)
0840       if (col_begin == col_name[i]) {
0841         if (found[i]) {
0842           error::duplicated_column_in_header err;
0843           err.set_column_name(col_begin);
0844           throw err;
0845         }
0846         found[i] = true;
0847         col_order.push_back(i);
0848         col_begin = 0;
0849         break;
0850       }
0851     if (col_begin) {
0852       if (ignore_policy & ::io::ignore_extra_column)
0853         col_order.push_back(-1);
0854       else {
0855         error::extra_column_in_header err;
0856         err.set_column_name(col_begin);
0857         throw err;
0858       }
0859     }
0860   }
0861   if (!(ignore_policy & ::io::ignore_missing_column)) {
0862     for (unsigned i = 0; i < column_count; ++i) {
0863       if (!found[i]) {
0864         error::missing_column_in_header err;
0865         err.set_column_name(col_name[i].c_str());
0866         throw err;
0867       }
0868     }
0869   }
0870 }
0871 
0872 template <class overflow_policy> void parse(char *col, char &x) {
0873   if (!*col)
0874     throw error::invalid_single_character();
0875   x = *col;
0876   ++col;
0877   if (*col)
0878     throw error::invalid_single_character();
0879 }
0880 
/// Copies the field text into `x`.
template <class overflow_policy> void parse(char *col, std::string &x) {
  x.assign(col);
}

/// Lets `x` point at the field text; nothing is copied, so `x` refers to the
/// memory `col` points into.
template <class overflow_policy> void parse(char *col, const char *&x) {
  const char *const field = col;
  x = field;
}

/// Lets `x` point at the field text; nothing is copied, so `x` refers to the
/// memory `col` points into.
template <class overflow_policy> void parse(char *col, char *&x) {
  char *const field = col;
  x = field;
}
0890 
0891 template <class overflow_policy, class T>
0892 void parse_unsigned_integer(const char *col, T &x) {
0893   x = 0;
0894   while (*col != '\0') {
0895     if ('0' <= *col && *col <= '9') {
0896       T y = *col - '0';
0897       if (x > ((std::numeric_limits<T>::max)() - y) / 10) {
0898         overflow_policy::on_overflow(x);
0899         return;
0900       }
0901       x = 10 * x + y;
0902     } else
0903       throw error::no_digit();
0904     ++col;
0905   }
0906 }
0907 
0908 template <class overflow_policy> void parse(char *col, unsigned char &x) {
0909   parse_unsigned_integer<overflow_policy>(col, x);
0910 }
0911 template <class overflow_policy> void parse(char *col, unsigned short &x) {
0912   parse_unsigned_integer<overflow_policy>(col, x);
0913 }
0914 template <class overflow_policy> void parse(char *col, unsigned int &x) {
0915   parse_unsigned_integer<overflow_policy>(col, x);
0916 }
0917 template <class overflow_policy> void parse(char *col, unsigned long &x) {
0918   parse_unsigned_integer<overflow_policy>(col, x);
0919 }
0920 template <class overflow_policy> void parse(char *col, unsigned long long &x) {
0921   parse_unsigned_integer<overflow_policy>(col, x);
0922 }
0923 
0924 template <class overflow_policy, class T>
0925 void parse_signed_integer(const char *col, T &x) {
0926   if (*col == '-') {
0927     ++col;
0928 
0929     x = 0;
0930     while (*col != '\0') {
0931       if ('0' <= *col && *col <= '9') {
0932         T y = *col - '0';
0933         if (x < ((std::numeric_limits<T>::min)() + y) / 10) {
0934           overflow_policy::on_underflow(x);
0935           return;
0936         }
0937         x = 10 * x - y;
0938       } else
0939         throw error::no_digit();
0940       ++col;
0941     }
0942     return;
0943   } else if (*col == '+')
0944     ++col;
0945   parse_unsigned_integer<overflow_policy>(col, x);
0946 }
0947 
0948 template <class overflow_policy> void parse(char *col, signed char &x) {
0949   parse_signed_integer<overflow_policy>(col, x);
0950 }
0951 template <class overflow_policy> void parse(char *col, signed short &x) {
0952   parse_signed_integer<overflow_policy>(col, x);
0953 }
0954 template <class overflow_policy> void parse(char *col, signed int &x) {
0955   parse_signed_integer<overflow_policy>(col, x);
0956 }
0957 template <class overflow_policy> void parse(char *col, signed long &x) {
0958   parse_signed_integer<overflow_policy>(col, x);
0959 }
0960 template <class overflow_policy> void parse(char *col, signed long long &x) {
0961   parse_signed_integer<overflow_policy>(col, x);
0962 }
0963 
0964 template <class T> void parse_float(const char *col, T &x) {
0965   bool is_neg = false;
0966   if (*col == '-') {
0967     is_neg = true;
0968     ++col;
0969   } else if (*col == '+')
0970     ++col;
0971 
0972   x = 0;
0973   while ('0' <= *col && *col <= '9') {
0974     int y = *col - '0';
0975     x *= 10;
0976     x += y;
0977     ++col;
0978   }
0979 
0980   if (*col == '.' || *col == ',') {
0981     ++col;
0982     T pos = 1;
0983     while ('0' <= *col && *col <= '9') {
0984       pos /= 10;
0985       int y = *col - '0';
0986       ++col;
0987       x += y * pos;
0988     }
0989   }
0990 
0991   if (*col == 'e' || *col == 'E') {
0992     ++col;
0993     int e;
0994 
0995     parse_signed_integer<set_to_max_on_overflow>(col, e);
0996 
0997     if (e != 0) {
0998       T base;
0999       if (e < 0) {
1000         base = T(0.1);
1001         e = -e;
1002       } else {
1003         base = T(10);
1004       }
1005 
1006       while (e != 1) {
1007         if ((e & 1) == 0) {
1008           base = base * base;
1009           e >>= 1;
1010         } else {
1011           x *= base;
1012           --e;
1013         }
1014       }
1015       x *= base;
1016     }
1017   } else {
1018     if (*col != '\0')
1019       throw error::no_digit();
1020   }
1021 
1022   if (is_neg)
1023     x = -x;
1024 }
1025 
1026 template <class overflow_policy> void parse(char *col, float &x) {
1027   parse_float(col, x);
1028 }
1029 template <class overflow_policy> void parse(char *col, double &x) {
1030   parse_float(col, x);
1031 }
1032 template <class overflow_policy> void parse(char *col, long double &x) {
1033   parse_float(col, x);
1034 }
1035 
// Fallback for column types that have no dedicated parse overload.
// Instantiating it always fails to compile with the message below.
template <class overflow_policy, class T> void parse(char *col, T &x) {
  // Touch both parameters so that no "unused parameter" warning is emitted.
  static_cast<void>(x);
  static_cast<void>(col);
  // The condition has to depend on T: a literal "false" would already be
  // rejected while the template is being read, whereas a T-dependent
  // expression is only evaluated once the template is instantiated.
  static_assert(sizeof(T) == 0,
                "Can not parse this type. Only builtin integrals, floats, "
                "char, char*, const char* and std::string are supported");
}
1047 
1048 } // namespace detail
1049 
1050 template <unsigned column_count, class trim_policy = trim_chars<' ', '\t'>,
1051           class quote_policy = no_quote_escape<','>,
1052           class overflow_policy = throw_on_overflow,
1053           class comment_policy = no_comment>
1054 class CSVReader {
1055 private:
1056   LineReader in;
1057 
1058   char *row[column_count];
1059   std::string column_names[column_count];
1060 
1061   std::vector<int> col_order;
1062 
1063   template <class... ColNames>
1064   void set_column_names(std::string s, ColNames... cols) {
1065     column_names[column_count - sizeof...(ColNames) - 1] = std::move(s);
1066     set_column_names(std::forward<ColNames>(cols)...);
1067   }
1068 
1069   void set_column_names() {}
1070 
1071 public:
1072   CSVReader() = delete;
1073   CSVReader(const CSVReader &) = delete;
1074   CSVReader &operator=(const CSVReader &);
1075 
1076   template <class... Args>
1077   explicit CSVReader(Args &&... args) : in(std::forward<Args>(args)...) {
1078     std::fill(row, row + column_count, nullptr);
1079     col_order.resize(column_count);
1080     for (unsigned i = 0; i < column_count; ++i)
1081       col_order[i] = i;
1082     for (unsigned i = 1; i <= column_count; ++i)
1083       column_names[i - 1] = "col" + std::to_string(i);
1084   }
1085 
1086   char *next_line() { return in.next_line(); }
1087 
1088   template <class... ColNames>
1089   void read_header(ignore_column ignore_policy, ColNames... cols) {
1090     static_assert(sizeof...(ColNames) >= column_count,
1091                   "not enough column names specified");
1092     static_assert(sizeof...(ColNames) <= column_count,
1093                   "too many column names specified");
1094     try {
1095       set_column_names(std::forward<ColNames>(cols)...);
1096 
1097       char *line;
1098       do {
1099         line = in.next_line();
1100         if (!line)
1101           throw error::header_missing();
1102       } while (comment_policy::is_comment(line));
1103 
1104       detail::parse_header_line<column_count, trim_policy, quote_policy>(
1105           line, col_order, column_names, ignore_policy);
1106     } catch (error::with_file_name &err) {
1107       err.set_file_name(in.get_truncated_file_name());
1108       throw;
1109     }
1110   }
1111 
1112   template <class... ColNames> void set_header(ColNames... cols) {
1113     static_assert(sizeof...(ColNames) >= column_count,
1114                   "not enough column names specified");
1115     static_assert(sizeof...(ColNames) <= column_count,
1116                   "too many column names specified");
1117     set_column_names(std::forward<ColNames>(cols)...);
1118     std::fill(row, row + column_count, nullptr);
1119     col_order.resize(column_count);
1120     for (unsigned i = 0; i < column_count; ++i)
1121       col_order[i] = i;
1122   }
1123 
1124   bool has_column(const std::string &name) const {
1125     return col_order.end() !=
1126            std::find(col_order.begin(), col_order.end(),
1127                      std::find(std::begin(column_names), std::end(column_names),
1128                                name) -
1129                          std::begin(column_names));
1130   }
1131 
1132   void set_file_name(const std::string &file_name) {
1133     in.set_file_name(file_name);
1134   }
1135 
1136   void set_file_name(const char *file_name) { in.set_file_name(file_name); }
1137 
1138   const char *get_truncated_file_name() const {
1139     return in.get_truncated_file_name();
1140   }
1141 
1142   void set_file_line(unsigned file_line) { in.set_file_line(file_line); }
1143 
1144   unsigned get_file_line() const { return in.get_file_line(); }
1145 
1146 private:
1147   void parse_helper(std::size_t) {}
1148 
1149   template <class T, class... ColType>
1150   void parse_helper(std::size_t r, T &t, ColType &... cols) {
1151     if (row[r]) {
1152       try {
1153         try {
1154           ::io::detail::parse<overflow_policy>(row[r], t);
1155         } catch (error::with_column_content &err) {
1156           err.set_column_content(row[r]);
1157           throw;
1158         }
1159       } catch (error::with_column_name &err) {
1160         err.set_column_name(column_names[r].c_str());
1161         throw;
1162       }
1163     }
1164     parse_helper(r + 1, cols...);
1165   }
1166 
1167 public:
1168   template <class... ColType> bool read_row(ColType &... cols) {
1169     static_assert(sizeof...(ColType) >= column_count,
1170                   "not enough columns specified");
1171     static_assert(sizeof...(ColType) <= column_count,
1172                   "too many columns specified");
1173     try {
1174       try {
1175 
1176         char *line;
1177         do {
1178           line = in.next_line();
1179           if (!line)
1180             return false;
1181         } while (comment_policy::is_comment(line));
1182 
1183         detail::parse_line<trim_policy, quote_policy>(line, row, col_order);
1184 
1185         parse_helper(0, cols...);
1186       } catch (error::with_file_name &err) {
1187         err.set_file_name(in.get_truncated_file_name());
1188         throw;
1189       }
1190     } catch (error::with_file_line &err) {
1191       err.set_file_line(in.get_file_line());
1192       throw;
1193     }
1194 
1195     return true;
1196   }
1197 };
1198 } // namespace io
1199 #endif
1200