File indexing completed on 2024-12-29 04:11:44
0001 /*************************************************************************** 0002 * * 0003 * Copyright : (C) 2003 The University of Toronto * 0004 * email : netterfield@astro.utoronto.ca * 0005 * * 0006 * This program is free software; you can redistribute it and/or modify * 0007 * it under the terms of the GNU General Public License as published by * 0008 * the Free Software Foundation; either version 2 of the License, or * 0009 * (at your option) any later version. * 0010 * * 0011 ***************************************************************************/ 0012 0013 #include "asciidatareader.h" 0014 #include "asciisourceconfig.h" 0015 0016 #include "math_kst.h" 0017 #include "kst_inf.h" 0018 0019 #include "kst_atof.h" 0020 #include "measuretime.h" 0021 0022 #include <QFile> 0023 #include <QDebug> 0024 #include <QMutexLocker> 0025 #include <QStringList> 0026 #include <QLabel> 0027 #include <QApplication> 0028 0029 0030 #include <ctype.h> 0031 #include <stdlib.h> 0032 0033 0034 using namespace AsciiCharacterTraits; 0035 0036 0037 // Enable QASSERT in QVarLengthArray when using [] on data 0038 #if 0 0039 #define checkedData constArray 0040 #else 0041 #define checkedData constPointer // loads faster in debug mode 0042 #endif 0043 0044 0045 //------------------------------------------------------------------------------------------- 0046 AsciiDataReader::AsciiDataReader(AsciiSourceConfig& config) : 0047 _progressValue(0), 0048 _progressRows(0), 0049 _numFrames(0), 0050 _progressMax(0), 0051 _progressDone(0), 0052 _config(config), 0053 isDigit(), 0054 isWhiteSpace() 0055 { 0056 } 0057 0058 //------------------------------------------------------------------------------------------- 0059 AsciiDataReader::~AsciiDataReader() 0060 { 0061 } 0062 0063 //------------------------------------------------------------------------------------------- 0064 void AsciiDataReader::clear() 0065 { 0066 _rowIndex.clear(); 0067 setRow0Begin(0); 0068 _numFrames = 0; 0069 } 0070 0071 //------------------------------------------------------------------------------------------- 0072 void AsciiDataReader::setRow0Begin(qint64 begin) 0073 { 0074 _rowIndex.resize(1); 0075 _rowIndex[0] = begin; 0076 } 0077 0078 //------------------------------------------------------------------------------------------- 0079 void AsciiDataReader::detectLineEndingType(QFile& file) 0080 { 0081 QByteArray line; 0082 int line_size = 0; 0083 while (line_size < 2 && !file.atEnd()) { 0084 line = file.readLine(); 0085 line_size = line.size(); 0086 } 0087 file.seek(0); 0088 if (line_size < 2) { 0089 _lineending = LineEndingType(); 0090 } else { 0091 _lineending.is_crlf = line[line_size - 2] == '\r' && line[line_size - 1] == '\n' ; 0092 _lineending.character = _lineending.is_crlf ? line[line_size - 2] : line[line_size - 1]; 0093 } 0094 } 0095 0096 //------------------------------------------------------------------------------------------- 0097 void AsciiDataReader::toDouble(const LexicalCast& lexc, const char* buffer, qint64 bufread, qint64 ch, double* v, int) const 0098 { 0099 if ( isDigit(buffer[ch]) 0100 || buffer[ch] == '-' 0101 || buffer[ch] == '.' 0102 || buffer[ch] == '+' 0103 || isWhiteSpace(buffer[ch])) { 0104 *v = lexc.toDouble(&buffer[ch]); 0105 } else if ( ch + 2 < bufread 0106 && tolower(buffer[ch]) == 'i' 0107 && tolower(buffer[ch + 1]) == 'n' 0108 && tolower(buffer[ch + 2]) == 'f') { 0109 *v = INF; 0110 } else if ((*v = lexc.fromTime(&buffer[ch])) != lexc.nanValue()) { 0111 // string is a date starting with a character (Jun 2 17:52:44 2014) 0112 } else { 0113 /* 0114 TODO enable by option: "Add unparsable lines as strings" 0115 if (_rowIndex.size() > row + 1) { 0116 QString unparsable = QString::fromAscii(&buffer[_rowIndex[row]], _rowIndex[row + 1] - _rowIndex[row]); 0117 _strings[QString("Unparsable %1").arg(row)] = unparsable.trimmed(); 0118 } 0119 */ 0120 } 0121 } 0122 0123 //------------------------------------------------------------------------------------------- 0124 bool AsciiDataReader::findAllDataRows(bool read_completely, QFile* file, qint64 byteLength, int col_count) 0125 { 0126 detectLineEndingType(*file); 0127 0128 _progressMax = byteLength; 0129 _progressDone = 0; 0130 0131 bool new_data = false; 0132 AsciiFileData buf; 0133 const qint64 more = read_completely 0134 ? qMin<qint64>(qMax<qint64>(byteLength, AsciiFileData::Prealloc - 1), 100 * AsciiFileData::Prealloc) 0135 : AsciiFileData::Prealloc - 1; 0136 do { 0137 // Read the tmpbuffer, starting at row_index[_numFrames] 0138 buf.clear(); 0139 0140 qint64 bufstart = _rowIndex[_numFrames]; // always read from the start of a line 0141 _progressDone += buf.read(*file, bufstart, byteLength - bufstart, more); 0142 if (buf.bytesRead() == 0) { 0143 return false; 0144 } 0145 0146 if (_config._delimiters.value().size() == 0) { 0147 const NoDelimiter comment_del; 0148 if (_lineending.isLF()) { 0149 new_data = findDataRows(buf.checkedData(), buf.begin(), buf.bytesRead(), IsLineBreakLF(_lineending), comment_del, col_count); 0150 } else { 0151 new_data = findDataRows(buf.checkedData(), buf.begin(), buf.bytesRead(), IsLineBreakCR(_lineending), comment_del, col_count); 0152 } 0153 } else if (_config._delimiters.value().size() == 1) { 0154 const IsCharacter comment_del(_config._delimiters.value()[0].toLatin1()); 0155 if (_lineending.isLF()) { 0156 new_data = findDataRows(buf.checkedData(), buf.begin(), buf.bytesRead(), IsLineBreakLF(_lineending), comment_del, col_count); 0157 } else { 0158 new_data = findDataRows(buf.checkedData(), buf.begin(), buf.bytesRead(), IsLineBreakCR(_lineending), comment_del, col_count); 0159 } 0160 } else if (_config._delimiters.value().size() > 1) { 0161 const IsInString comment_del(_config._delimiters.value()); 0162 if (_lineending.isLF()) { 0163 new_data = findDataRows(buf.checkedData(), buf.begin(), buf.bytesRead(), IsLineBreakLF(_lineending), comment_del, col_count); 0164 } else { 0165 new_data = findDataRows(buf.checkedData(), buf.begin(), buf.bytesRead(), IsLineBreakCR(_lineending), comment_del, col_count); 0166 } 0167 } 0168 0169 QMutexLocker lock(&_progressMutex); 0170 _progressRows = _numFrames; 0171 _progressValue = 100.0 * _progressDone / (1.0 * _progressMax); 0172 0173 } while (buf.bytesRead() == more && read_completely); 0174 0175 return new_data; 0176 } 0177 0178 //------------------------------------------------------------------------------------------- 0179 template<class Buffer, typename IsLineBreak, typename CommentDelimiter> 0180 bool AsciiDataReader::findDataRows(const Buffer& buffer, qint64 bufstart, qint64 bufread, const IsLineBreak& isLineBreak, const CommentDelimiter& comment_del, int col_count) 0181 { 0182 const IsWhiteSpace isWhiteSpace; 0183 bool new_data = false; 0184 bool row_has_data = false; 0185 bool is_comment = false; 0186 const qint64 row_offset = bufstart + isLineBreak.size; 0187 const qint64 old_numFrames = _numFrames; 0188 0189 // _rowIndex[_numFrames] already set, find following rows 0190 // buffer points to next row 0191 qint64 row_start = _rowIndex[_numFrames]; 0192 for (qint64 i = 0; i < bufread; ++i) { 0193 if (comment_del(buffer[i])) { 0194 is_comment = true; 0195 row_has_data = false; 0196 } else if (isLineBreak(buffer[i])) { 0197 if (row_has_data) { 0198 _rowIndex[_numFrames] = row_start; 0199 ++_numFrames; 0200 if (_numFrames + 1 >= _rowIndex.size()) { 0201 if (_rowIndex.capacity() < _numFrames + 1) { 0202 qint64 more = qMin<qint64>(qMax<qint64>(2 * _numFrames, AsciiFileData::Prealloc), 100 * AsciiFileData::Prealloc); 0203 _rowIndex.reserve(_numFrames + more); 0204 } 0205 _rowIndex.resize(_numFrames + 1); 0206 } 0207 row_start = row_offset + i; 0208 new_data = true; 0209 } else if (is_comment) { 0210 row_start = row_offset + i; 0211 } 0212 row_has_data = false; 0213 is_comment = false; 0214 } else if (!row_has_data && !isWhiteSpace(buffer[i]) && !is_comment) { 0215 row_has_data = true; 0216 } 0217 } 0218 if (_numFrames > old_numFrames) 0219 _rowIndex[_numFrames] = row_start; 0220 0221 0222 if (_config._columnType == AsciiSourceConfig::Fixed) { 0223 // only read complete lines, last column could be only 1 char long 0224 if (_rowIndex.size() > 1) { 0225 for (qint64 i = 1; i <= _numFrames; ++i) { 0226 if (_rowIndex[i] <= _rowIndex[i - 1] + col_count * (_config._columnWidth - 1) + 1) { 0227 _rowIndex.resize(i); 0228 _numFrames = i - 1; 0229 } 0230 } 0231 } 0232 } 0233 0234 return new_data; 0235 } 0236 0237 //------------------------------------------------------------------------------------------- 0238 int AsciiDataReader::readFieldFromChunk(const AsciiFileData& chunk, int col, double *v, int start, const QString& field) 0239 { 0240 Q_ASSERT(chunk.rowBegin() >= start); 0241 return readField(chunk, col, v + chunk.rowBegin() - start, field, chunk.rowBegin(), chunk.rowsRead()); 0242 } 0243 0244 //------------------------------------------------------------------------------------------- 0245 double AsciiDataReader::progressValue() 0246 { 0247 QMutexLocker lock(&_progressMutex); 0248 return _progressValue; 0249 } 0250 0251 //------------------------------------------------------------------------------------------- 0252 qint64 AsciiDataReader::progressRows() 0253 { 0254 QMutexLocker lock(&_progressMutex); 0255 return _progressRows; 0256 } 0257 0258 //------------------------------------------------------------------------------------------- 0259 int AsciiDataReader::readField(const AsciiFileData& buf, int col, double *v, const QString& field, int s, int n) 0260 { 0261 if (_config._columnType == AsciiSourceConfig::Fixed) { 0262 //MeasureTime t("AsciiSource::readField: same width for all columns"); 0263 const LexicalCast& lexc = LexicalCast::instance(); 0264 // buf[0] points to some row start, _rowIndex[i] is absolute, so we have to subtract buf.begin(). 0265 const char*const col_start = &buf.checkedData()[0] + _config._columnWidth * (col - 1) - buf.begin(); 0266 for (int i = 0; i < n; ++i) { 0267 v[i] = lexc.toDouble(col_start + _rowIndex[i + s] ); 0268 } 0269 return n; 0270 } else if (_config._columnType == AsciiSourceConfig::Custom) { 0271 if (_config._columnDelimiter.value().size() == 1) { 0272 //MeasureTime t("AsciiSource::readField: 1 custom column delimiter"); 0273 const IsCharacter column_del(_config._columnDelimiter.value()[0].toLatin1()); 0274 return readColumns(v, buf.checkedData(), buf.begin(), buf.bytesRead(), col, s, n, _lineending, column_del); 0275 } if (_config._columnDelimiter.value().size() > 1) { 0276 //MeasureTime t(QString("AsciiSource::readField: %1 custom column delimiters").arg(_config._columnDelimiter.value().size())); 0277 const IsInString column_del(_config._columnDelimiter.value()); 0278 return readColumns(v, buf.checkedData(), buf.begin(), buf.bytesRead(), col, s, n, _lineending, column_del); 0279 } 0280 } else if (_config._columnType == AsciiSourceConfig::Whitespace) { 0281 //MeasureTime t("AsciiSource::readField: whitespace separated columns"); 0282 const IsWhiteSpace column_del; 0283 return readColumns(v, buf.checkedData(), buf.begin(), buf.bytesRead(), col, s, n, _lineending, column_del); 0284 } 0285 return 0; 0286 } 0287 0288 // 0289 // template instantiation chain to generate optimal code for all possible data configurations 0290 // 0291 0292 //------------------------------------------------------------------------------------------- 0293 template<class Buffer, typename ColumnDelimiter> 0294 int AsciiDataReader::readColumns(double* v, const Buffer& buffer, qint64 bufstart, qint64 bufread, int col, int s, int n, 0295 const LineEndingType& lineending, const ColumnDelimiter& column_del) const 0296 { 0297 if (_config._delimiters.value().size() == 0) { 0298 const NoDelimiter comment_del; 0299 return readColumns(v, buffer, bufstart, bufread, col, s, n, lineending, column_del, comment_del); 0300 } else if (_config._delimiters.value().size() == 1) { 0301 const IsCharacter comment_del(_config._delimiters.value()[0].toLatin1()); 0302 return readColumns(v, buffer, bufstart, bufread, col, s, n, lineending, column_del, comment_del); 0303 } else if (_config._delimiters.value().size() > 1) { 0304 const IsInString comment_del(_config._delimiters.value()); 0305 return readColumns(v, buffer, bufstart, bufread, col, s, n, lineending, column_del, comment_del); 0306 } 0307 return 0; 0308 } 0309 0310 //------------------------------------------------------------------------------------------- 0311 template<class Buffer, typename ColumnDelimiter, typename CommentDelimiter> 0312 int AsciiDataReader::readColumns(double* v, const Buffer& buffer, qint64 bufstart, qint64 bufread, int col, int s, int n, 0313 const LineEndingType& lineending, const ColumnDelimiter& column_del, const CommentDelimiter& comment_del) const 0314 { 0315 if (_config._columnWidthIsConst) { 0316 const AlwaysTrue column_withs_const; 0317 if (lineending.isLF()) { 0318 return readColumns(v, buffer, bufstart, bufread, col, s, n, IsLineBreakLF(lineending), column_del, comment_del, column_withs_const); 0319 } else { 0320 return readColumns(v, buffer, bufstart, bufread, col, s, n, IsLineBreakCR(lineending), column_del, comment_del, column_withs_const); 0321 } 0322 } else { 0323 const AlwaysFalse column_withs_const; 0324 if (lineending.isLF()) { 0325 return readColumns(v, buffer, bufstart, bufread, col, s, n, IsLineBreakLF(lineending), column_del, comment_del, column_withs_const); 0326 } else { 0327 return readColumns(v, buffer, bufstart, bufread, col, s, n, IsLineBreakCR(lineending), column_del, comment_del, column_withs_const); 0328 } 0329 } 0330 } 0331 0332 //------------------------------------------------------------------------------------------- 0333 template<class Buffer, typename IsLineBreak, typename ColumnDelimiter, typename CommentDelimiter, typename ColumnWidthsAreConst> 0334 int AsciiDataReader::readColumns(double* v, const Buffer& buffer, qint64 bufstart, qint64 bufread, int col, int s, int n, 0335 const IsLineBreak& isLineBreak, 0336 const ColumnDelimiter& column_del, const CommentDelimiter& comment_del, 0337 const ColumnWidthsAreConst& are_column_widths_const) const 0338 { 0339 const LexicalCast& lexc = LexicalCast::instance(); 0340 0341 const QString delimiters = _config._delimiters.value(); 0342 0343 bool is_custom = (_config._columnType.value() == AsciiSourceConfig::Custom); 0344 0345 qint64 col_start = -1; 0346 for (int i = 0; i < n; i++, ++s) { 0347 bool incol = false; 0348 int i_col = 0; 0349 0350 const qint64 chstart = _rowIndex[s] - bufstart; 0351 if (is_custom && column_del(buffer[chstart])) { 0352 // row could start with delemiter 0353 incol = true; 0354 } 0355 0356 if (are_column_widths_const()) { 0357 if (col_start != -1) { 0358 v[i] = lexc.toDouble(&buffer[0] + _rowIndex[s] + col_start); 0359 continue; 0360 } 0361 } 0362 0363 v[i] = lexc.nanValue(); 0364 for (qint64 ch = chstart; ch < bufread; ++ch) { 0365 if (isLineBreak(buffer[ch])) { 0366 break; 0367 } else if (column_del(buffer[ch])) { //<- check for column start 0368 if ((!incol) && is_custom) { 0369 ++i_col; 0370 if (i_col == col) { 0371 v[i] = lexc.nanValue(); //NAN; 0372 } 0373 } 0374 incol = false; 0375 } else if (comment_del(buffer[ch])) { 0376 break; 0377 } else { 0378 if (!incol) { 0379 incol = true; 0380 ++i_col; 0381 if (i_col == col) { 0382 toDouble(lexc, &buffer[0], bufread, ch, &v[i], i); 0383 if (are_column_widths_const()) { 0384 if (col_start == -1) { 0385 col_start = ch - _rowIndex[s]; 0386 } 0387 } 0388 break; 0389 } 0390 } 0391 } 0392 } 0393 } 0394 0395 return n; 0396 } 0397 0398 //------------------------------------------------------------------------------------------- 0399 template<> 0400 int AsciiDataReader::splitColumns<IsWhiteSpace>(const QByteArray& line, const IsWhiteSpace& isWhitespace, QStringList* cols) 0401 { 0402 int colstart = 0; 0403 const int size = line.size(); 0404 //ignore whitespace at the beginning 0405 for (; colstart < size && isWhitespace(line[colstart]); colstart++) {} 0406 int count = 0; 0407 int incol = true; 0408 for (int i = colstart; i < size; i++) { 0409 // entering column 0410 if (!incol && !isWhitespace(line[i])) { 0411 incol = true; 0412 colstart = i; 0413 continue; 0414 } 0415 // leaving column 0416 if (incol && isWhitespace(line[i])) { 0417 count++; 0418 if (cols) { 0419 const QByteArray col(line.constData() + colstart, i - colstart); 0420 cols->push_back(QString(col)); 0421 } 0422 incol = false; 0423 } 0424 } 0425 if (incol) { 0426 const QByteArray col(line.begin() + colstart, size - 1 - colstart); 0427 QString lastCol = QString(col).simplified(); 0428 if (!lastCol.isEmpty()) { 0429 count++; 0430 if (cols) 0431 cols->push_back(lastCol); 0432 } 0433 } 0434 return count; 0435 } 0436 0437 0438 // vim: ts=2 sw=2 et