File indexing completed on 2024-12-29 04:11:44

0001 /***************************************************************************
0002  *                                                                         *
0003  *   Copyright : (C) 2003 The University of Toronto                        *
0004  *   email     : netterfield@astro.utoronto.ca                             *
0005  *                                                                         *
0006  *   This program is free software; you can redistribute it and/or modify  *
0007  *   it under the terms of the GNU General Public License as published by  *
0008  *   the Free Software Foundation; either version 2 of the License, or     *
0009  *   (at your option) any later version.                                   *
0010  *                                                                         *
0011  ***************************************************************************/
0012 
0013 #include "asciidatareader.h"
0014 #include "asciisourceconfig.h"
0015 
0016 #include "math_kst.h"
0017 #include "kst_inf.h"
0018 
0019 #include "kst_atof.h"
0020 #include "measuretime.h"
0021 
0022 #include <QFile>
0023 #include <QDebug>
0024 #include <QMutexLocker>
0025 #include <QStringList>
0026 #include <QLabel>
0027 #include <QApplication>
0028 
0029 
0030 #include <ctype.h>
0031 #include <stdlib.h>
0032 
0033 
0034 using namespace AsciiCharacterTraits;
0035 
0036 
// Change the "#if 0" below to "#if 1" to access data through constArray, which enables the Q_ASSERT checks in QVarLengthArray when using [] on data
0038 #if 0
0039 #define checkedData constArray
0040 #else
0041 #define checkedData constPointer // loads faster in debug mode
0042 #endif
0043 
0044 
0045 //-------------------------------------------------------------------------------------------
0046 AsciiDataReader::AsciiDataReader(AsciiSourceConfig& config) :
0047   _progressValue(0),
0048   _progressRows(0),
0049   _numFrames(0),
0050   _progressMax(0),
0051   _progressDone(0),
0052   _config(config),
0053   isDigit(),
0054   isWhiteSpace()
0055 {
0056 }
0057 
0058 //-------------------------------------------------------------------------------------------
0059 AsciiDataReader::~AsciiDataReader()
0060 {
0061 }
0062 
0063 //-------------------------------------------------------------------------------------------
0064 void AsciiDataReader::clear()
0065 {
0066   _rowIndex.clear();
0067   setRow0Begin(0);
0068   _numFrames = 0;
0069 }
0070 
0071 //-------------------------------------------------------------------------------------------
0072 void AsciiDataReader::setRow0Begin(qint64 begin)
0073 {
0074   _rowIndex.resize(1);
0075   _rowIndex[0] = begin;
0076 }
0077 
0078 //-------------------------------------------------------------------------------------------
0079 void AsciiDataReader::detectLineEndingType(QFile& file)
0080 {
0081   QByteArray line;
0082   int line_size = 0;
0083   while (line_size < 2 && !file.atEnd()) {
0084     line = file.readLine();
0085     line_size = line.size();
0086   }
0087   file.seek(0);
0088   if (line_size < 2) {
0089     _lineending = LineEndingType();
0090   } else {
0091     _lineending.is_crlf = line[line_size - 2] == '\r' && line[line_size - 1] == '\n' ;
0092     _lineending.character =  _lineending.is_crlf ? line[line_size - 2] : line[line_size - 1];
0093   }
0094 }
0095 
0096 //-------------------------------------------------------------------------------------------
0097 void AsciiDataReader::toDouble(const LexicalCast& lexc, const char* buffer, qint64 bufread, qint64 ch, double* v, int) const
0098 {
0099   if (   isDigit(buffer[ch])
0100          || buffer[ch] == '-'
0101          || buffer[ch] == '.'
0102          || buffer[ch] == '+'
0103          || isWhiteSpace(buffer[ch])) {
0104     *v = lexc.toDouble(&buffer[ch]);
0105   } else if ( ch + 2 < bufread
0106               && tolower(buffer[ch]) == 'i'
0107               && tolower(buffer[ch + 1]) == 'n'
0108               && tolower(buffer[ch + 2]) == 'f') {
0109     *v = INF;
0110   } else if ((*v = lexc.fromTime(&buffer[ch])) != lexc.nanValue()) {
0111     // string is a date starting with a character (Jun 2 17:52:44 2014)
0112   } else {
0113     /*
0114     TODO enable by option: "Add unparsable lines as strings"
0115     if (_rowIndex.size() > row + 1) {
0116       QString unparsable = QString::fromAscii(&buffer[_rowIndex[row]], _rowIndex[row + 1] - _rowIndex[row]);
0117       _strings[QString("Unparsable %1").arg(row)] = unparsable.trimmed();
0118     }
0119     */
0120   }
0121 }
0122 
0123 //-------------------------------------------------------------------------------------------
0124 bool AsciiDataReader::findAllDataRows(bool read_completely, QFile* file, qint64 byteLength, int col_count)
0125 {
0126   detectLineEndingType(*file);
0127 
0128   _progressMax = byteLength;
0129   _progressDone = 0;
0130 
0131   bool new_data = false;
0132   AsciiFileData buf;
0133   const qint64 more = read_completely
0134                         ? qMin<qint64>(qMax<qint64>(byteLength, AsciiFileData::Prealloc - 1), 100 * AsciiFileData::Prealloc)
0135                         : AsciiFileData::Prealloc - 1;
0136   do {
0137     // Read the tmpbuffer, starting at row_index[_numFrames]
0138     buf.clear();
0139 
0140     qint64 bufstart = _rowIndex[_numFrames]; // always read from the start of a line
0141     _progressDone += buf.read(*file, bufstart, byteLength - bufstart, more);
0142     if (buf.bytesRead() == 0) {
0143       return false;
0144     }
0145 
0146     if (_config._delimiters.value().size() == 0) {
0147       const NoDelimiter comment_del;
0148       if (_lineending.isLF()) {
0149         new_data = findDataRows(buf.checkedData(), buf.begin(), buf.bytesRead(), IsLineBreakLF(_lineending), comment_del, col_count);
0150       } else {
0151         new_data = findDataRows(buf.checkedData(), buf.begin(), buf.bytesRead(), IsLineBreakCR(_lineending), comment_del, col_count);
0152       }
0153     } else if (_config._delimiters.value().size() == 1) {
0154       const IsCharacter comment_del(_config._delimiters.value()[0].toLatin1());
0155       if (_lineending.isLF()) {
0156         new_data = findDataRows(buf.checkedData(), buf.begin(), buf.bytesRead(), IsLineBreakLF(_lineending), comment_del, col_count);
0157       } else {
0158         new_data = findDataRows(buf.checkedData(), buf.begin(), buf.bytesRead(), IsLineBreakCR(_lineending), comment_del, col_count);
0159       }
0160     } else if (_config._delimiters.value().size() > 1) {
0161       const IsInString comment_del(_config._delimiters.value());
0162       if (_lineending.isLF()) {
0163         new_data = findDataRows(buf.checkedData(), buf.begin(), buf.bytesRead(), IsLineBreakLF(_lineending), comment_del, col_count);
0164       } else {
0165         new_data = findDataRows(buf.checkedData(), buf.begin(), buf.bytesRead(), IsLineBreakCR(_lineending), comment_del, col_count);
0166       }
0167     }
0168 
0169     QMutexLocker lock(&_progressMutex);
0170     _progressRows = _numFrames;
0171     _progressValue = 100.0 * _progressDone / (1.0 * _progressMax);
0172 
0173   } while (buf.bytesRead() == more  && read_completely);
0174 
0175   return new_data;
0176 }
0177 
0178 //-------------------------------------------------------------------------------------------
0179 template<class Buffer, typename IsLineBreak, typename CommentDelimiter>
0180 bool AsciiDataReader::findDataRows(const Buffer& buffer, qint64 bufstart, qint64 bufread, const IsLineBreak& isLineBreak, const CommentDelimiter& comment_del, int col_count)
0181 {
0182   const IsWhiteSpace isWhiteSpace;
0183   bool new_data = false;
0184   bool row_has_data = false;
0185   bool is_comment = false;
0186   const qint64 row_offset = bufstart + isLineBreak.size;
0187   const qint64 old_numFrames = _numFrames;
0188   
0189   // _rowIndex[_numFrames] already set, find following rows
0190   // buffer points to next row
0191   qint64 row_start = _rowIndex[_numFrames];
0192   for (qint64 i = 0; i < bufread; ++i) {
0193     if (comment_del(buffer[i])) {
0194       is_comment = true;
0195       row_has_data = false;
0196     } else if (isLineBreak(buffer[i])) {
0197       if (row_has_data) {
0198         _rowIndex[_numFrames] = row_start;
0199         ++_numFrames;
0200         if (_numFrames + 1 >= _rowIndex.size()) {
0201           if (_rowIndex.capacity() < _numFrames + 1) {
0202             qint64 more = qMin<qint64>(qMax<qint64>(2 * _numFrames, AsciiFileData::Prealloc), 100 * AsciiFileData::Prealloc);
0203             _rowIndex.reserve(_numFrames + more);
0204           }
0205           _rowIndex.resize(_numFrames + 1);
0206         }
0207         row_start = row_offset + i;
0208         new_data = true;
0209       } else if (is_comment) {
0210         row_start = row_offset + i;
0211       }
0212       row_has_data = false;
0213       is_comment = false;
0214     } else if (!row_has_data && !isWhiteSpace(buffer[i]) && !is_comment) {
0215       row_has_data = true;
0216     }
0217   }
0218   if (_numFrames > old_numFrames)
0219     _rowIndex[_numFrames] = row_start;
0220 
0221 
0222   if (_config._columnType == AsciiSourceConfig::Fixed) {
0223     // only read complete lines, last  column could be only 1 char long
0224     if (_rowIndex.size() > 1) {
0225       for (qint64 i = 1; i <= _numFrames; ++i) {
0226         if (_rowIndex[i] <= _rowIndex[i - 1] + col_count * (_config._columnWidth - 1) + 1) {
0227         _rowIndex.resize(i);
0228         _numFrames = i - 1;
0229         }
0230       }
0231     }
0232   }
0233 
0234   return new_data;
0235 }
0236 
0237 //-------------------------------------------------------------------------------------------
0238 int AsciiDataReader::readFieldFromChunk(const AsciiFileData& chunk, int col, double *v, int start, const QString& field)
0239 {
0240   Q_ASSERT(chunk.rowBegin() >= start);
0241   return readField(chunk, col, v + chunk.rowBegin() - start, field, chunk.rowBegin(), chunk.rowsRead());
0242 }
0243 
0244 //-------------------------------------------------------------------------------------------
0245 double AsciiDataReader::progressValue()
0246 {
0247   QMutexLocker lock(&_progressMutex);
0248   return _progressValue;
0249 }
0250 
0251 //-------------------------------------------------------------------------------------------
0252 qint64 AsciiDataReader::progressRows()
0253 {
0254   QMutexLocker lock(&_progressMutex);
0255   return _progressRows;
0256 }
0257 
0258 //-------------------------------------------------------------------------------------------
0259 int AsciiDataReader::readField(const AsciiFileData& buf, int col, double *v, const QString& field, int s, int n)
0260 {
0261   if (_config._columnType == AsciiSourceConfig::Fixed) {
0262     //MeasureTime t("AsciiSource::readField: same width for all columns");
0263     const LexicalCast& lexc = LexicalCast::instance();
0264     // buf[0] points to some row start, _rowIndex[i] is absolute, so we have to subtract buf.begin().
0265     const char*const col_start = &buf.checkedData()[0] + _config._columnWidth * (col - 1) - buf.begin();
0266     for (int i = 0; i < n; ++i) {
0267       v[i] = lexc.toDouble(col_start + _rowIndex[i + s] );
0268     }
0269     return n;
0270   } else if (_config._columnType == AsciiSourceConfig::Custom) {
0271     if (_config._columnDelimiter.value().size() == 1) {
0272       //MeasureTime t("AsciiSource::readField: 1 custom column delimiter");
0273       const IsCharacter column_del(_config._columnDelimiter.value()[0].toLatin1());
0274       return readColumns(v, buf.checkedData(), buf.begin(), buf.bytesRead(), col, s, n, _lineending, column_del);
0275     } if (_config._columnDelimiter.value().size() > 1) {
0276       //MeasureTime t(QString("AsciiSource::readField: %1 custom column delimiters").arg(_config._columnDelimiter.value().size()));
0277       const IsInString column_del(_config._columnDelimiter.value());
0278       return readColumns(v, buf.checkedData(), buf.begin(), buf.bytesRead(), col, s, n, _lineending, column_del);
0279     }
0280   } else if (_config._columnType == AsciiSourceConfig::Whitespace) {
0281     //MeasureTime t("AsciiSource::readField: whitespace separated columns");
0282     const IsWhiteSpace column_del;
0283     return readColumns(v, buf.checkedData(), buf.begin(), buf.bytesRead(), col, s, n, _lineending, column_del);
0284   }
0285   return 0;
0286 }
0287 
0288 //
0289 // template instantiation chain to generate optimal code for all possible data configurations
0290 //
0291 
0292 //-------------------------------------------------------------------------------------------
0293 template<class Buffer, typename ColumnDelimiter>
0294 int AsciiDataReader::readColumns(double* v, const Buffer& buffer, qint64 bufstart, qint64 bufread, int col, int s, int n,
0295                                  const LineEndingType& lineending, const ColumnDelimiter& column_del) const
0296 {
0297   if (_config._delimiters.value().size() == 0) {
0298     const NoDelimiter comment_del;
0299     return readColumns(v, buffer, bufstart, bufread, col, s, n, lineending, column_del, comment_del);
0300   } else if (_config._delimiters.value().size() == 1) {
0301     const IsCharacter comment_del(_config._delimiters.value()[0].toLatin1());
0302     return readColumns(v, buffer, bufstart, bufread, col, s, n, lineending, column_del, comment_del);
0303   } else if (_config._delimiters.value().size() > 1) {
0304     const IsInString comment_del(_config._delimiters.value());
0305     return readColumns(v, buffer, bufstart, bufread, col, s, n, lineending, column_del, comment_del);
0306   }
0307   return 0;
0308 }
0309 
0310 //-------------------------------------------------------------------------------------------
0311 template<class Buffer, typename ColumnDelimiter, typename CommentDelimiter>
0312 int AsciiDataReader::readColumns(double* v, const Buffer& buffer, qint64 bufstart, qint64 bufread, int col, int s, int n,
0313                                  const LineEndingType& lineending, const ColumnDelimiter& column_del, const CommentDelimiter& comment_del) const
0314 {
0315   if (_config._columnWidthIsConst) {
0316     const AlwaysTrue column_withs_const;
0317     if (lineending.isLF()) {
0318       return readColumns(v, buffer, bufstart, bufread, col, s, n, IsLineBreakLF(lineending), column_del, comment_del, column_withs_const);
0319     } else {
0320       return readColumns(v, buffer, bufstart, bufread, col, s, n, IsLineBreakCR(lineending), column_del, comment_del, column_withs_const);
0321     }
0322   } else {
0323     const AlwaysFalse column_withs_const;
0324     if (lineending.isLF()) {
0325       return readColumns(v, buffer, bufstart, bufread, col, s, n, IsLineBreakLF(lineending), column_del, comment_del, column_withs_const);
0326     } else {
0327       return readColumns(v, buffer, bufstart, bufread, col, s, n, IsLineBreakCR(lineending), column_del, comment_del, column_withs_const);
0328     }
0329   }
0330 }
0331 
0332 //-------------------------------------------------------------------------------------------
0333 template<class Buffer, typename IsLineBreak, typename ColumnDelimiter, typename CommentDelimiter, typename ColumnWidthsAreConst>
0334 int AsciiDataReader::readColumns(double* v, const Buffer& buffer, qint64 bufstart, qint64 bufread, int col, int s, int n,
0335                                  const IsLineBreak& isLineBreak,
0336                                  const ColumnDelimiter& column_del, const CommentDelimiter& comment_del,
0337                                  const ColumnWidthsAreConst& are_column_widths_const) const
0338 {
0339   const LexicalCast& lexc = LexicalCast::instance();
0340 
0341   const QString delimiters = _config._delimiters.value();
0342 
0343   bool is_custom = (_config._columnType.value() == AsciiSourceConfig::Custom);
0344 
0345   qint64 col_start = -1;
0346   for (int i = 0; i < n; i++, ++s) {
0347     bool incol = false;
0348     int i_col = 0;
0349 
0350     const qint64 chstart = _rowIndex[s] - bufstart;
0351     if (is_custom && column_del(buffer[chstart])) {
0352         // row could start with delemiter
0353         incol = true;
0354     }
0355 
0356     if (are_column_widths_const()) {
0357       if (col_start != -1) {
0358         v[i] = lexc.toDouble(&buffer[0] + _rowIndex[s] + col_start);
0359         continue;
0360       }
0361     }
0362 
0363     v[i] = lexc.nanValue();
0364     for (qint64 ch = chstart; ch < bufread; ++ch) {
0365       if (isLineBreak(buffer[ch])) {
0366         break;
0367       } else if (column_del(buffer[ch])) { //<- check for column start
0368         if ((!incol) && is_custom) {
0369           ++i_col;
0370           if (i_col == col) {
0371             v[i] = lexc.nanValue();  //NAN;
0372           }
0373         }
0374         incol = false;
0375       } else if (comment_del(buffer[ch])) {
0376         break;
0377       } else {
0378         if (!incol) {
0379           incol = true;
0380           ++i_col;
0381           if (i_col == col) {
0382             toDouble(lexc, &buffer[0], bufread, ch, &v[i], i);
0383             if (are_column_widths_const()) {
0384               if (col_start == -1) {
0385                 col_start = ch - _rowIndex[s];
0386               }
0387             }
0388             break;
0389           }
0390         }
0391       }
0392     }
0393   }
0394 
0395   return n;
0396 }
0397 
0398 //-------------------------------------------------------------------------------------------
0399 template<>
0400 int AsciiDataReader::splitColumns<IsWhiteSpace>(const QByteArray& line, const IsWhiteSpace& isWhitespace, QStringList* cols)
0401 {
0402   int colstart = 0;
0403   const int size =  line.size();
0404   //ignore whitespace at the beginning
0405   for (; colstart < size && isWhitespace(line[colstart]); colstart++) {}
0406   int count = 0;
0407   int incol = true;
0408   for (int i = colstart; i < size; i++) {
0409     // entering column
0410     if (!incol && !isWhitespace(line[i])) {
0411       incol = true;
0412       colstart = i;
0413       continue;
0414     }
0415     // leaving column
0416     if (incol && isWhitespace(line[i])) {
0417       count++;
0418       if (cols) {
0419         const QByteArray col(line.constData() + colstart, i - colstart);
0420         cols->push_back(QString(col));
0421       }
0422       incol = false;
0423     }
0424   }
0425   if (incol) {
0426     const QByteArray col(line.begin() + colstart, size - 1 - colstart);
0427     QString lastCol = QString(col).simplified();
0428     if (!lastCol.isEmpty()) {
0429       count++;
0430       if (cols)
0431         cols->push_back(lastCol);
0432     }
0433   }
0434   return count;
0435 }
0436 
0437 
0438 // vim: ts=2 sw=2 et