Elementa v8.0.0
Minimalistic library for any C++ application (C++11 and up)
Loading...
Searching...
No Matches
lexer.h
Go to the documentation of this file.
1
3#include "elementa/license.inc"
4#include "elementa/checks.inc"
5
6#ifndef ELEMENTA_PARSING_LEXER_H
7#define ELEMENTA_PARSING_LEXER_H
8
9#include <string>
10#include <memory>
11#include <utility>
12#include <vector>
13#include <list>
14#include <functional>
17#include "elementa/base/serial_channels.h"
18#include "elementa/adts/fsms.h"
19
20namespace elementa
21{
22
23namespace parsing
24{
25
47/* **********************************************************************
48
49 Errors for lexers
50
51*************************************************************************/
52
55{
56 public:
57
58 LexerError(const std::string & expl):
59 elementa::base::Exc{std::string{"LEXICAL ERROR. "} + expl}{}
60
62};
63
64
66{
67 public:
68
69 UnexpSymbol(const std::string & details):
72 "Unexpected lexical element",details)}
73 {}
74
76};
77
78
// Lexer error whose fixed message text is "Expected some lexical element".
// NOTE(review): listing lines 84-85 (the start of the constructor's
// initializer) and 90 are not shown in this view; see lexer.h for the
// complete definition.
79class ExpSymbol: public LexerError
80{
81 public:
82
    // Build the error from caller-supplied details; `details` is passed next to
    // the fixed message in the (partly elided) initializer expression below.
83 ExpSymbol(const std::string & details):
86 "Expected some lexical element",
87 details)}
88 {}
89
91};
92
// Lexer error whose fixed message text is "Unexpected end of channel".
// NOTE(review): listing lines 98-99 (the start of the constructor's
// initializer) and 103 are not shown in this view; see lexer.h for the
// complete definition.
93class UnexpEnd: public LexerError
94{
95 public:
96
    // Build the error from caller-supplied details; `details` is passed next to
    // the fixed message in the (partly elided) initializer expression below.
97 UnexpEnd(const std::string & details):
100 "Unexpected end of channel",details)}
101 {}
102
104};
105
106
108{
109 public:
110
111 InvLexElem(const std::string & details):
114 "Invalid lexical element",details)}
115 {}
116
118};
119
120
121/* **********************************************************************
122
123 Alias: TokenId
124 Abstract Base class: Token
125 Class: TextToken
126
127*************************************************************************/
128
130
// A unique identifier for a token. A static_assert near the end of this file
// requires the aliased type to be integral and signed.
135using TokenId = int;
136
138
// Abstract base class for any token (i.e., lexical element) produced by a
// lexer. Comparison and textual-value access are pure virtual, so concrete
// token types must supply them.
145class Token
146{
147 public:
148
    // Safe (std::shared_ptr) pointer to a token.
149 using Ptr = std::shared_ptr<Token>;
150
    // A vector of token pointers.
151 using PtrVector = std::vector<Ptr>;
152
    // NOTE: listing line 153 is not shown here; per the documentation index it
    // declares `TokenId c`, the id of the element (e.g., in a grammar).
154
    // NOTE(review): whether `c` has an in-class initializer is not visible; if
    // it has none, a default-constructed token leaves `c` indeterminate -- confirm.
155 Token(void) = default;
    // Construct a token carrying the given id.
156 Token(TokenId tid):c{tid} {}
157 Token(const Token &) = default;
158 Token(Token &&) = default;
159 Token & operator=(const Token &) = default;
160 Token & operator=(Token &&) = default;
    // Virtual destructor, so tokens can be destroyed through a base pointer.
161 virtual ~Token(void) = default;
162
    // Equality must be defined by each derived class.
163 virtual bool operator==(const Token &) const = 0;
    // Inequality is the negation of the derived class's operator==.
164 virtual bool operator!=(const Token &o) const
165 { return(!operator==(o)); }
166
    // Must return a reference to the textual value for the token.
168 virtual const std::string & value(void) const = 0;
169
    // Must replace the textual value of the token with `v`.
171 virtual void setValue(const std::string & v) = 0;
172
    // Return a string representation of the token (can be overridden).
    // Declared only; the definition is not part of this listing.
174 virtual std::string to_string(void) const;
175};
176
177
// A token that consists only of its textual value (plus the id held by the
// Token base).
179class TextToken: public Token
180{
181 public:
182
    // A list of text tokens; inherits all std::list<TextToken> constructors.
184 class List: public std::list<TextToken>
185 {
186 public:
187
188 using Base = std::list<TextToken>;
189 using Base::Base;
190
192
    // Return a text describing the list. Declared only; defined elsewhere.
193 std::string to_string(void) const;
194 };
195
196
197 TextToken(void) = default;
    // Construct from a token id and the textual value to store.
198 TextToken(TokenId tid, const std::string & val):Token{tid},v_{val} {}
    // Copy/move members: each forwards to the Token base and then copies or
    // moves v_. In the move versions only the Token subobject is moved by the
    // base call, so reading oth.v_ afterwards is still valid.
    // NOTE(review): these appear to do exactly what the compiler-generated
    // members would; `= default` looks like a drop-in simplification -- confirm.
199 TextToken(const TextToken & oth):Token{oth}
200 { v_ = oth.v_; }
201 TextToken(TextToken && oth):Token{std::move(oth)}
202 { v_ = std::move(oth.v_); }
203 TextToken & operator=(const TextToken & oth)
204 { if (this != &oth)
205 { Token::operator=(oth); v_ = oth.v_; }
206 return(*this); }
207 TextToken & operator=(TextToken && oth)
208 { if (this != &oth)
209 { Token::operator=(std::move(oth)); v_ = std::move(oth.v_); }
210 return(*this); }
211
    // Replace the textual value of the token with `v`.
212 void setValue(const std::string & v)
213 { v_ = v; }
214
    // Equal when the textual values match; the token id is not compared here.
    // NOTE(review): dynamic_cast to a reference throws std::bad_cast when `o`
    // is not a TextToken, so comparing against another Token subtype throws
    // instead of returning false -- confirm this is intended.
215 bool operator==(const Token & o) const
216 { return(v_ == dynamic_cast<const TextToken &>(o).v_); }
217
    // Return a reference to the stored textual value.
218 const std::string & value(void) const { return(v_); }
219
220 private:
221
    // The textual value of the token.
222 std::string v_;
223};
224
225
226/* **********************************************************************
227
228 Pure abstract base class: Lexer
229
230*************************************************************************/
231
233
// A lexer that produces terminals. Abstract: derived classes implement
// getToken(). The class is neither copyable nor movable.
// NOTE(review): several lines of this class (e.g. 242, 258-259, 274) are not
// shown in this listing; see lexer.h for the complete definition.
235class Lexer
236{
237 public:
238
240
    // Constructor. Its signature line (242) is elided here; per the
    // documentation index it is
    // Lexer(elementa::base::SerChIFilter<elementa::base::SerChFilt_Loc> & inputch).
243 : inputch_{inputch}
244 {}
245
    // Virtual destructor for polymorphic deletions.
247 virtual ~Lexer(void) = default;
248
249 Lexer(const Lexer &) = delete;
250 Lexer(Lexer &&) = delete;
251 Lexer & operator=(const Lexer &) = delete;
252 Lexer & operator=(Lexer &&) = delete;
253
    // Return a reference to the associated, non-location input channel.
255 elementa::base::InSerCh & inputChannel(void) const noexcept
256 { return(inputch_.channel()); }
257
    // Return a reference to the associated location input channel (the return
    // type is on elided lines; the index gives it as
    // elementa::base::SerChIFilter<elementa::base::SerChFilt_Loc> &).
260 locInputChannel(void) const noexcept
261 { return(inputch_); }
262
    // Return a reference to the location associated to the channel used by the
    // lexer; dereferences the `ploc` pointer held by the channel's filter.
264 elementa::base::SerChLoc & location(void) const noexcept
265 { return(*(inputch_.filter().ploc)); }
266
268
    // Must create and return a pointer to the token built from the input.
270 virtual Token::Ptr getToken(void) = 0;
271
272 protected:
273
    // NOTE(review): the declaration of `inputch_` is presumably on elided line
    // 274 -- confirm in lexer.h.
275};
276
277
278/* **********************************************************************
279
280 Class: FSMLexer
281
282*************************************************************************/
283
285
// A lexical analyzer that uses a number of FSMs to recognize tokens.
// NOTE(review): several declaration lines of this class are not shown in this
// listing (e.g. 296-300, 326, 361-363, 394-398); comments on those members
// follow the documentation index of the same page.
292class FSMLexer: public Lexer
293{
294 public:
295
301
    // An FSM associated with the id of a terminal that it recognizes.
303 using TermRecog = std::pair< TokenId , FSMClass::Ptr >;
304
    // The set of recognizers used by the lexer: a vector of TermRecog plus the
    // bookkeeping of which recognizers are currently active. Not copyable or
    // movable.
306 class Recognizers: public std::vector<TermRecog>
307 {
308 public:
309
    // Shortcut for the base vector type.
311 using BaseVector = std::vector<TermRecog>;
312
    // Inherit all constructors of the base vector.
314 using BaseVector::BaseVector;
315
    // A list with indexes into some recognizers of the vector.
317 using ListOfInds = std::list<size_type>;
318
319 Recognizers(const Recognizers &) = delete;
320 Recognizers(Recognizers &&) = delete;
321 Recognizers & operator=(const Recognizers &) = delete;
322 Recognizers & operator=(Recognizers &&) = delete;
323
325
    // find(): signature line 326 is elided; per the index it is
    // `FSMClass::Ptr find(TokenId tid) const`. Linear scan that returns the FSM
    // of the first entry whose id equals `tid`, or a null pointer if none does.
327 {
328 for (const auto & rp: *this) if (rp.first == tid) return(rp.second);
329 return(FSMClass::Ptr{});
330 }
331
    // Return the indexes of FSMs that are active right now.
333 const ListOfInds & active(void) const noexcept { return(active_); }
334
336
    // Initiate the recognition through all the FSMs, all becoming active.
339 void reset(void);
340
342
    // Feed FSMs with the given observation `o`.
344 void feed(char o);
345
347
    // Return the recognizer that has recognized all characters fed so far.
348 size_type whoWins(void) const;
349
    // Print in console the state of recognizers.
351 void debug(void) const;
352
353 private:
354
355 Recognizers::size_type firstrecogfinished_;
356 ListOfInds active_; // list, for efficient dels
357 };
358
360
    // LexErr_Unrecog: unrecognizable text error. Its class header (lines
    // 361-363) is elided from this listing.
364 {
365 public:
366
    // Build the error message as kUnrecogPreff followed by the offending text,
    // and keep a copy of the offending text on its own.
367 LexErr_Unrecog(const std::string & unrecogtxt):
368 LexerError{kUnrecogPreff + unrecogtxt},
369 offending_{unrecogtxt}
370 {}
371
373
374
    // Get the text after the prefix placed in this error.
376 const std::string & getOffendingText(void) const noexcept
377 { return(offending_); }
378
379 private:
380
    // Prefix prepended to the offending text; defined outside this listing.
381 static const std::string kUnrecogPreff;
382
383 std::string offending_;
384 };
385
393
    // Constructor (its first lines, 394-398, are elided; per the index it takes
    // the location-filtered input channel and the recognizers definition).
    // `definition` is stored by reference, so it must outlive this lexer.
399 & inputch,
400 Recognizers & definition): Lexer{inputch},
401 recognizers_{definition}
402 {}
403
404 FSMLexer(const FSMLexer &) = delete;
405 FSMLexer(FSMLexer &&) = delete;
406 FSMLexer & operator=(const FSMLexer &) = delete;
407 FSMLexer & operator=(FSMLexer &&) = delete;
408
416
    // Produce the next token; declared final, so derived classes cannot
    // override it. Defined outside this listing.
425 Token::Ptr getToken(void) final;
426
429 private:
430
    // Non-owning reference to the recognizers supplied at construction.
431 Recognizers & recognizers_;
432};
433
434
435/* **********************************************************************
436
437 Class: LexerCollector
438
439*************************************************************************/
440
443{
444 public:
445
450
461 using Observer = std::function<bool(Lexer &,
462 const LexerError *, Token::Ptr &)>;
463
471
473 LexerCollector(Lexer & lexer, Observer & obs):lexer_{lexer},
474 obs_{obs} {}
475
476 LexerCollector(const LexerCollector &) = delete;
477 LexerCollector(LexerCollector &&) = delete;
478 LexerCollector & operator=(const LexerCollector &) = delete;
479 LexerCollector & operator=(LexerCollector &&) = delete;
480
481 virtual ~LexerCollector(void) = default;
482
490
493 void collect(bool log = false);
494
496 const Token::PtrVector & log(void) const noexcept { return(toks_); }
497
500 private:
501
502 Lexer & lexer_;
503 Observer & obs_;
504 Token::PtrVector toks_;
505};
506
507
508// Static asserts are placed at the end because of a bug in doxygen that
509// stops it from generating code once one is found.
510
// Compile-time guard: TokenId must remain an integral, signed type.
// NOTE(review): std::is_integral / std::is_signed are declared in
// <type_traits>, which is not among the includes visible in this listing --
// presumably it is pulled in transitively; confirm or include it directly.
511static_assert(std::is_integral<TokenId>::value &&
512 std::is_signed<TokenId>::value,
513 "TokenId must be an integral, signed type");
514 // Lexer
516
517} // End parsing namespace
518
519} // End elementa namespace
520
521
522#endif
Unrecognizable text error.
Definition: lexer.h:364
const std::string & getOffendingText(void) const noexcept
Get the text after the prefix placed in this error.
Definition: lexer.h:376
size_type whoWins(void) const
Return the recognizer that has recognized all characters fed so far.
void reset(void)
Initiate the recognition through all the FSMs, all becoming active.
const ListOfInds & active(void) const noexcept
Return the indexes of FSMs that are active right now.
Definition: lexer.h:333
void debug(void) const
Print in console the state of recognizers.
void feed(char o)
Feed FSMs with the given observation O.
std::list< size_type > ListOfInds
A list with indexes into some recognizers of the vector.
Definition: lexer.h:317
std::vector< TermRecog > BaseVector
Shortcut.
Definition: lexer.h:311
FSMClass::Ptr find(TokenId tid) const
Return one of the recognizers, or a false (null) one if not found.
Definition: lexer.h:326
A list of text tokens.
Definition: lexer.h:185
std::string to_string(void) const
Return a text describing the list.
Base class for all errors / exceptions in Elementa. Just derive from it.
Definition: exceptions.h:113
#define ELE_CLASS_EXCOVERRIDE(C)
Shortening macro that must be used inside classes derived from Exc.
Definition: exceptions.h:64
std::shared_ptr< FSM > Ptr
Pointer to an FSM that consumes observations of type Obs.
Definition: fsms.h:142
Shortcut for FSMs that work with chars.
Definition: fsms.h:160
virtual std::string to_string(void) const
Return a string representation of the token (can be overridden).
const Token::PtrVector & log(void) const noexcept
Get a reference to the internal log of tokens.
Definition: lexer.h:496
std::shared_ptr< Token > Ptr
Safe pointer to a token.
Definition: lexer.h:149
virtual void setValue(const std::string &v)=0
Must replace the textual value of the token with V.
void setValue(const std::string &v)
Must replace the textual value of the token with V.
Definition: lexer.h:212
TokenId c
Id of the element, e.g., in a grammar.
Definition: lexer.h:153
FSMLexer(elementa::base::SerChIFilter< elementa::base::SerChFilt_Loc > &inputch, Recognizers &definition)
Constructor.
Definition: lexer.h:398
std::pair< TokenId, FSMClass::Ptr > TermRecog
An FSM associated with the id of a terminal that it recognizes.
Definition: lexer.h:303
std::function< bool(Lexer &, const LexerError *, Token::Ptr &)> Observer
An observer for being called after each token is collected.
Definition: lexer.h:462
std::vector< Ptr > PtrVector
A vector of token pointers.
Definition: lexer.h:151
Lexer(elementa::base::SerChIFilter< elementa::base::SerChFilt_Loc > &inputch)
Constructor.
Definition: lexer.h:242
elementa::base::InSerCh & inputChannel(void) const noexcept
Return a reference to the associated, non-location input channel.
Definition: lexer.h:255
elementa::base::SerChLoc & location(void) const noexcept
Return a ref. to the location associated to the channel used by the lexer.
Definition: lexer.h:264
virtual const std::string & value(void) const =0
Must return a reference to the textual value for the token.
virtual Token::Ptr getToken(void)=0
Must create and return a pointer to the token built from the input.
void collect(bool log=false)
Collect all tokens from the channel of the lexer, calling the observer.
elementa::base::SerChIFilter< elementa::base::SerChFilt_Loc > & locInputChannel(void) const noexcept
Return a reference to the associated location input channel.
Definition: lexer.h:260
const std::string & value(void) const
Must return a reference to the textual value for the token.
Definition: lexer.h:218
Token::Ptr getToken(void) final
Derived classes cannot override this (it is declared final).
LexerCollector(Lexer &lexer, Observer &obs)
Constructor.
Definition: lexer.h:473
virtual ~Lexer(void)=default
Virtual destructor for polymorphic deletions.
A lexical analyzer that uses a number of FSMs to recognize tokens.
Definition: lexer.h:293
A lexer that produces terminals.
Definition: lexer.h:236
A class to use a lexer for collecting all tokens sequentially.
Definition: lexer.h:443
Base error class. Lexer errors should derive from this.
Definition: lexer.h:55
A token that consists only of its textual value.
Definition: lexer.h:180
Base class for any token (i.e., lexical element) produced by a lexer.
Definition: lexer.h:146
int TokenId
A unique identifier for a token.
Definition: lexer.h:135
Location in a channel, at least in linear form, either at reading or writing.
Definition: basics.h:769
std::istream InSerCh
"Base class" that represents any input serial channel in Elementa.
Definition: basics.h:334
A filter for InSerCh. It filters their inputs after getting them or putting them back.
Definition: filters.h:125
std::string concatWithMiddle(const std::string &s1, const std::string &s2, const std::string &m=". ")
Concatenate two strings putting a middle one only if the second is not empty.