Alexandria  2.27.0
SDC-CH common library for the Euclid project
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
AsciiReaderHelper.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2012-2022 Euclid Science Ground Segment
3  *
4  * This library is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU Lesser General Public License as published by the Free
6  * Software Foundation; either version 3.0 of the License, or (at your option)
7  * any later version.
8  *
9  * This library is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU Lesser General Public License
15  * along with this library; if not, write to the Free Software Foundation, Inc.,
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
25 #include "AsciiReaderHelper.h"
27 #include "ElementsKernel/Logging.h"
28 #include "NdArray/NdArray.h"
29 #include <boost/algorithm/string.hpp>
30 #include <boost/lexical_cast.hpp>
31 #include <boost/spirit/include/qi.hpp>
32 #include <boost/tokenizer.hpp>
33 #include <set>
34 #include <sstream>
35 
36 namespace Euclid {
37 namespace Table {
38 
39 using NdArray::NdArray;
40 
42 
43 size_t countColumns(std::istream& in, const std::string& comment) {
44  StreamRewinder rewinder{in};
45  size_t count = 0;
46 
47  while (in) {
48  std::string line;
49  getline(in, line);
50  // Remove any comments
51  size_t comment_pos = line.find(comment);
52  if (comment_pos != std::string::npos) {
53  line = line.substr(0, comment_pos);
54  }
55  boost::trim(line);
56  if (!line.empty()) {
57  std::string token;
58  std::stringstream line_stream(line);
59  line_stream >> boost::io::quoted(token);
60  while (line_stream) {
61  line_stream >> boost::io::quoted(token);
62  ++count;
63  }
64  break;
65  }
66  }
67  if (count == 0) {
68  throw Elements::Exception() << "No data lines found";
69  }
70  return count;
71 }
72 
76  // Boolean
77  {"bool", typeid(bool)},
78  {"boolean", typeid(bool)},
79  // Integers
80  {"int", typeid(int32_t)},
81  {"long", typeid(int64_t)},
82  {"int32", typeid(int32_t)},
83  {"int64", typeid(int64_t)},
84  // Floating point
85  {"float", typeid(float)},
86  {"double", typeid(double)},
87  // Strings
88  {"string", typeid(std::string)},
89  // Arrays
90  {"[bool]", typeid(std::vector<bool>)},
91  {"[boolean]", typeid(std::vector<bool>)},
92  {"[int]", typeid(std::vector<int32_t>)},
93  {"[long]", typeid(std::vector<int64_t>)},
94  {"[int32]", typeid(std::vector<int32_t>)},
95  {"[int64]", typeid(std::vector<int64_t>)},
96  {"[float]", typeid(std::vector<float>)},
97  {"[double]", typeid(std::vector<double>)},
98  // NdArrays
99  {"[int+]", typeid(NdArray<int32_t>)},
100  {"[long+]", typeid(NdArray<int64_t>)},
101  {"[int32+]", typeid(NdArray<int32_t>)},
102  {"[int64+]", typeid(NdArray<int64_t>)},
103  {"[float+]", typeid(NdArray<float>)},
104  {"[double+]", typeid(NdArray<double>)},
105 };
106 
108  auto i = std::find_if(KeywordTypeMap.begin(), KeywordTypeMap.end(),
109  [keyword](const std::pair<std::string, std::type_index>& p) { return p.first == keyword; });
110  if (i != KeywordTypeMap.end()) {
111  return i->second;
112  }
113  throw Elements::Exception() << "Unknown column type keyword " << keyword;
114 }
115 
117  StreamRewinder rewinder{in};
119  while (in) {
120  std::string line;
121  getline(in, line);
122  boost::trim(line);
123  if (line.empty()) {
124  continue; // We skip empty lines
125  }
126  if (boost::starts_with(line, comment)) {
127  // If we have a comment we remove all comment characters and check if we have
128  // a column description
129  boost::replace_all(line, comment, "");
130  boost::trim(line);
131  if (boost::starts_with(line, "Column:")) {
132  line.erase(0, 7);
133  boost::trim(line);
134  if (!line.empty()) {
135  std::string token;
136  std::stringstream line_stream(line);
137  std::string name;
138  line_stream >> boost::io::quoted(name);
139  if (descriptions.count(name) != 0) {
140  throw Elements::Exception() << "Duplicate column name " << name;
141  }
142  line_stream >> boost::io::quoted(token);
143  std::type_index type = typeid(std::string);
144  if (line_stream && !boost::starts_with(token, "(") && token != "-") {
145  type = keywordToType(token);
146  line_stream >> boost::io::quoted(token);
147  }
148  std::string unit = "";
149  if (line_stream && boost::starts_with(token, "(")) {
150  unit = token;
151  unit.erase(unit.begin());
152  unit.erase(unit.end() - 1);
153  line_stream >> boost::io::quoted(token);
154  }
155  if (line_stream && token == "-") {
156  line_stream >> boost::io::quoted(token);
157  }
158  std::stringstream desc;
159  while (line_stream) {
160  desc << token << ' ';
161  line_stream >> boost::io::quoted(token);
162  }
163  std::string desc_str = desc.str();
164  boost::trim(desc_str);
165  descriptions.emplace(std::piecewise_construct, std::forward_as_tuple(name),
166  std::forward_as_tuple(name, type, unit, desc_str));
167  }
168  }
169  } else {
170  break; // here we reached the first data line
171  }
172  }
173  return descriptions;
174 }
175 
176 std::vector<std::string> autoDetectColumnNames(std::istream& in, const std::string& comment, size_t columns_number) {
177  StreamRewinder rewinder{in};
178  std::vector<std::string> names{};
179 
180  // Find the last comment line and at the same time read the names of the
181  // column info description comments
182  std::string last_comment{};
183  std::vector<std::string> desc_names{};
184  while (in) {
185  std::string line;
186  getline(in, line);
187  boost::trim(line);
188  if (line.empty()) {
189  continue; // We skip empty lines
190  }
191  if (boost::starts_with(line, comment)) {
192  // If we have a comment we remove all comment characters and check if we have
193  // the correct number of tokens
194  boost::replace_all(line, comment, "");
195  boost::trim(line);
196  if (!line.empty()) {
197  last_comment = line;
198  }
199  if (boost::starts_with(line, "Column:")) {
200  std::string temp = line;
201  temp.erase(0, 7);
202  boost::trim(temp);
203  auto space_i = temp.find(' ');
204  if (space_i > 0) {
205  temp = temp.substr(0, space_i);
206  }
207  desc_names.emplace_back(std::move(temp));
208  }
209  } else {
210  break; // here we reached the first data line
211  }
212  }
213 
214  // Check if the last comment line contains the names of the columns
215  if (!last_comment.empty()) {
216  std::stringstream line_stream(last_comment);
217  std::string token;
218  line_stream >> boost::io::quoted(token);
219  while (line_stream) {
220  names.push_back(token);
221  line_stream >> boost::io::quoted(token);
222  }
223  if (names.size() != columns_number) {
224  names.clear();
225  }
226  }
227 
228  // If the names are empty we fill them with the column descriprion ones
229  if (names.empty()) {
230  if (!desc_names.empty() && desc_names.size() != columns_number) {
231  logger.warn() << "Number of column descriptions does not matches the number"
232  << " of the columns";
233  }
234  names = desc_names;
235  }
236 
237  if (names.size() < columns_number) {
238  for (size_t i = names.size() + 1; i <= columns_number; ++i) {
239  names.push_back("col" + std::to_string(i));
240  }
241  }
242  // Check for duplicate names
244  for (const auto& name : names) {
245  if (!set.insert(name).second) {
246  throw Elements::Exception() << "Duplicate column name " << name;
247  }
248  }
249  return names;
250 }
251 
252 namespace {
253 
254 template <typename T>
255 std::vector<T> convertStringToVector(const std::string& str) {
256  std::vector<T> result{};
257  boost::char_separator<char> sep{","};
258  boost::tokenizer<boost::char_separator<char>> tok{str, sep};
259  std::transform(tok.begin(), tok.end(), std::back_inserter(result),
260  [](const std::string& s) { return boost::get<T>(convertToCellType(s, typeid(T))); });
261  return result;
262 }
263 
264 template <typename T>
265 NdArray<T> convertStringToNdArray(const std::string& str) {
266  if (str.empty()) {
267  throw Elements::Exception() << "Cannot convert an empty string to a NdArray";
268  } else if (str[0] != '<') {
269  throw Elements::Exception() << "Unexpected initial character for a NdArray: " << str[0];
270  }
271 
272  auto closing_char = str.find('>');
273  if (closing_char == std::string::npos) {
274  throw Elements::Exception() << "Could not find '>'";
275  }
276 
277  auto shape_str = str.substr(1, closing_char - 1);
278  auto shape_i = convertStringToVector<int32_t>(shape_str);
279  auto data = convertStringToVector<T>(str.substr(closing_char + 1));
280 
281  std::vector<size_t> shape_u;
282  std::copy(shape_i.begin(), shape_i.end(), std::back_inserter(shape_u));
283  return NdArray<T>(shape_u, data);
284 }
285 
286 } // namespace
287 
289  // Boolean
290  {typeid(bool),
291  [](const std::string& value) {
292  if (value == "true" || value == "t" || value == "yes" || value == "y" || value == "1") {
293  return true;
294  } else if (value == "false" || value == "f" || value == "no" || value == "n" || value == "0") {
295  return false;
296  }
297  throw Elements::Exception() << "Invalid boolean value " << value;
298  }},
299  // Integers
300  {typeid(int32_t), boost::lexical_cast<int32_t, const std::string&>},
301  {typeid(int64_t), boost::lexical_cast<int64_t, const std::string&>},
302  // Floating point
303  {typeid(float), boost::lexical_cast<float, const std::string&>},
304  {typeid(double), boost::lexical_cast<double, const std::string&>},
305  // String
306  {typeid(std::string), boost::lexical_cast<std::string, const std::string&>},
307  // Arrays
308  {typeid(std::vector<bool>), convertStringToVector<bool>},
309  {typeid(std::vector<int32_t>), convertStringToVector<int32_t>},
310  {typeid(std::vector<int64_t>), convertStringToVector<int64_t>},
311  {typeid(std::vector<float>), convertStringToVector<float>},
312  {typeid(std::vector<double>), convertStringToVector<double>},
313  // NdArray
314  {typeid(NdArray<int32_t>), convertStringToNdArray<int32_t>},
315  {typeid(NdArray<int64_t>), convertStringToNdArray<int64_t>},
316  {typeid(NdArray<float>), convertStringToNdArray<float>},
317  {typeid(NdArray<double>), convertStringToNdArray<double>},
318 };
319 
321  try {
322  auto i = sCellConverter.find(type);
323  if (i == sCellConverter.end()) {
324  throw Elements::Exception() << "Unknown type name " << type.name();
325  }
326  return i->second(value);
327  } catch (boost::bad_lexical_cast const&) {
328  throw Elements::Exception() << "Cannot convert " << value << " to " << type.name();
329  }
330 }
331 
332 bool hasNextRow(std::istream& in, const std::string& comment) {
333  StreamRewinder rewinder{in};
334  while (in) {
335  std::string line;
336  getline(in, line);
337  size_t comment_pos = line.find(comment);
338  if (comment_pos != std::string::npos) {
339  line = line.substr(0, comment_pos);
340  }
341  boost::trim(line);
342  if (!line.empty()) {
343  return true;
344  }
345  }
346  return false;
347 }
348 
350  StreamRewinder rewinder{in};
351  std::size_t count = 0;
352  while (in) {
353  std::string line;
354  getline(in, line);
355  size_t comment_pos = line.find(comment);
356  if (comment_pos != std::string::npos) {
357  line = line.substr(0, comment_pos);
358  }
359  boost::trim(line);
360  if (!line.empty()) {
361  ++count;
362  }
363  }
364  return count;
365 }
366 
369  size_t comment_pos = line.find(comment);
370 
371  if (comment_pos != std::string::npos) {
372  line = line.substr(0, comment_pos);
373  }
374  boost::trim(line);
375  if (!line.empty()) {
376  std::stringstream line_stream(line);
377  size_t count = 0;
378  std::string token;
379  line_stream >> boost::io::quoted(token);
380  while (line_stream) {
381  cells.emplace_back(token);
382  line_stream >> boost::io::quoted(token);
383  ++count;
384  }
385  }
386  return cells;
387 }
388 
390  StreamRewinder rewinder{in};
391  std::string line(comment);
392  while (in && boost::starts_with(line, comment)) {
393  getline(in, line);
394  }
395  return splitLine(line, comment);
396 }
397 
399  namespace qi = boost::spirit::qi;
400  double d;
401  long l;
402 
403  auto it1 = token.begin();
404  auto it2 = it1;
405  if (qi::parse(it1, token.end(), qi::long_, l) && it1 == token.end()) {
406  return {typeid(int64_t), 0};
407  }
408  if (qi::parse(it2, token.end(), qi::double_, d) && it2 == token.end()) {
409  return {typeid(double), 0};
410  }
411  return {typeid(std::string), std::size_t(0)};
412 }
413 
414 } // namespace Table
415 } // end of namespace Euclid
std::size_t countRemainingRows(std::istream &in, const std::string &comment)
T empty(T...args)
T copy(T...args)
T forward_as_tuple(T...args)
const std::vector< std::pair< std::string, std::type_index > > KeywordTypeMap
T to_string(T...args)
static Elements::Logging logger
std::vector< std::string > splitLine(std::string line, const std::string &comment)
T end(T...args)
Row::cell_type convertToCellType(const std::string &value, std::type_index type)
Converts the given value to a Row::cell_type of the given type.
STL class.
STL class.
const std::map< std::type_index, std::function< Row::cell_type(const std::string &)> > sCellConverter
std::type_index keywordToType(const std::string &keyword)
STL class.
constexpr double s
void warn(const std::string &logMessage)
T erase(T...args)
T str(T...args)
std::pair< std::type_index, std::size_t > guessColumnType(const std::string &token)
T move(T...args)
std::map< std::string, ColumnDescription > autoDetectColumnDescriptions(std::istream &in, const std::string &comment)
Reads the column descriptions of the given stream.
This class gets a stream as argument during construction and when it is deleted it sets the position ...
T count(T...args)
bool hasNextRow(std::istream &in, const std::string &comment)
T find(T...args)
std::string quoted(const std::string &str)
STL class.
boost::variant< bool, int32_t, int64_t, float, double, std::string, std::vector< bool >, std::vector< int32_t >, std::vector< int64_t >, std::vector< float >, std::vector< double >, NdArray::NdArray< int32_t >, NdArray::NdArray< int64_t >, NdArray::NdArray< float >, NdArray::NdArray< double >> cell_type
The possible cell types.
Definition: Row.h:64
STL class.
T name(T...args)
T begin(T...args)
T back_inserter(T...args)
T emplace(T...args)
T substr(T...args)
T transform(T...args)
static Logging getLogger(const std::string &name="")
size_t countColumns(std::istream &in, const std::string &comment)
Returns the number of whitespace separated tokens of the first non commented line.
std::vector< std::string > firstDataLine(std::istream &in, const std::string &comment)
std::vector< std::string > autoDetectColumnNames(std::istream &in, const std::string &comment, size_t columns_number)
Reads the column names of the given stream.
T emplace_back(T...args)