Orcus
Loading...
Searching...
No Matches
sax_parser.hpp
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 */
7
8#ifndef INCLUDED_ORCUS_SAX_PARSER_HPP
9#define INCLUDED_ORCUS_SAX_PARSER_HPP
10
11#include "sax_parser_base.hpp"
12
13#include <string_view>
14
15namespace orcus {
16
18{
24 static const uint8_t baseline_version = 10;
25};
26
28{
29public:
36 {
37 (void)param;
38 }
39
47 void start_declaration(std::string_view decl)
48 {
49 (void)decl;
50 }
51
57 void end_declaration(std::string_view decl)
58 {
59 (void)decl;
60 }
61
68 {
69 (void)elem;
70 }
71
78 {
79 (void)elem;
80 }
81
96 void characters(std::string_view val, bool transient)
97 {
98 (void)val; (void)transient;
99 }
100
110 {
111 (void)attr;
112 }
113};
114
119template<typename _Handler, typename _Config = sax_parser_default_config>
121{
122public:
123 typedef _Handler handler_type;
124 typedef _Config config_type;
125
126 sax_parser(const char* content, const size_t size, handler_type& handler);
127 sax_parser(const char* content, const size_t size, bool transient_stream, handler_type& handler);
128 ~sax_parser();
129
130 void parse();
131
132private:
133
138 void header();
139 void body();
140 void element();
141 void element_open(std::ptrdiff_t begin_pos);
142 void element_close(std::ptrdiff_t begin_pos);
143 void special_tag();
144 void declaration(const char* name_check);
145 void cdata();
146 void doctype();
147 void characters();
148 void attribute();
149
150private:
151 handler_type& m_handler;
152};
153
154template<typename _Handler, typename _Config>
156 const char* content, const size_t size, handler_type& handler) :
157 sax::parser_base(content, size, false),
158 m_handler(handler)
159{
160}
161
162template<typename _Handler, typename _Config>
163sax_parser<_Handler,_Config>::sax_parser(
164 const char* content, const size_t size, bool transient_stream, handler_type& handler) :
165 sax::parser_base(content, size, transient_stream),
166 m_handler(handler)
167{
168}
169
170template<typename _Handler, typename _Config>
171sax_parser<_Handler,_Config>::~sax_parser()
172{
173}
174
175template<typename _Handler, typename _Config>
176void sax_parser<_Handler,_Config>::parse()
177{
178 m_nest_level = 0;
179 mp_char = mp_begin;
180 header();
181 skip_space_and_control();
182 body();
183
184 assert(m_buffer_pos == 0);
185}
186
187template<typename _Handler, typename _Config>
188void sax_parser<_Handler,_Config>::header()
189{
190 // we don't handle multi byte encodings so we can just skip bom entry if exists.
191 skip_bom();
192 skip_space_and_control();
193 if (!has_char() || cur_char() != '<')
194 throw sax::malformed_xml_error("xml file must begin with '<'.", offset());
195
196 if (config_type::baseline_version >= 11)
197 {
198 // XML version 1.1 requires a header declaration whereas in 1.0 it's
199 // optional.
200 if (next_char_checked() != '?')
201 throw sax::malformed_xml_error("xml file must begin with '<?'.", offset());
202
203 declaration("xml");
204 }
205}
206
207template<typename _Handler, typename _Config>
208void sax_parser<_Handler,_Config>::body()
209{
210 while (has_char())
211 {
212 if (cur_char() == '<')
213 {
214 element();
215 if (!m_root_elem_open)
216 // Root element closed. Stop parsing.
217 return;
218 }
219 else if (m_nest_level)
220 // Call characters only when in xml hierarchy.
221 characters();
222 else
223 next();
224 }
225}
226
227template<typename _Handler, typename _Config>
228void sax_parser<_Handler,_Config>::element()
229{
230 assert(cur_char() == '<');
231 std::ptrdiff_t pos = offset();
232 char c = next_char_checked();
233 switch (c)
234 {
235 case '/':
236 element_close(pos);
237 return;
238 case '!':
239 special_tag();
240 return;
241 case '?':
242 declaration(nullptr);
243 return;
244 }
245
246 element_open(pos);
247}
248
249template<typename _Handler, typename _Config>
250void sax_parser<_Handler,_Config>::element_open(std::ptrdiff_t begin_pos)
251{
252 sax::parser_element elem;
253 element_name(elem, begin_pos);
254
255 while (true)
256 {
257 skip_space_and_control();
258 char c = cur_char();
259 if (c == '/')
260 {
261 // Self-closing element: <element/>
262 if (next_and_char() != '>')
263 throw sax::malformed_xml_error("expected '/>' to self-close the element.", offset());
264 next();
265 elem.end_pos = offset();
266 m_handler.start_element(elem);
267 reset_buffer_pos();
268 m_handler.end_element(elem);
269 if (!m_nest_level)
270 m_root_elem_open = false;
271#if ORCUS_DEBUG_SAX_PARSER
272 cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
273#endif
274 return;
275 }
276 else if (c == '>')
277 {
278 // End of opening element: <element>
279 next();
280 elem.end_pos = offset();
281 nest_up();
282 m_handler.start_element(elem);
283 reset_buffer_pos();
284#if ORCUS_DEBUG_SAX_PARSER
285 cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
286#endif
287 return;
288 }
289 else
290 attribute();
291 }
292}
293
294template<typename _Handler, typename _Config>
295void sax_parser<_Handler,_Config>::element_close(std::ptrdiff_t begin_pos)
296{
297 assert(cur_char() == '/');
298 nest_down();
299 next_check();
300 sax::parser_element elem;
301 element_name(elem, begin_pos);
302
303 if (cur_char() != '>')
304 throw sax::malformed_xml_error("expected '>' to close the element.", offset());
305 next();
306 elem.end_pos = offset();
307
308 m_handler.end_element(elem);
309#if ORCUS_DEBUG_SAX_PARSER
310 cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
311#endif
312 if (!m_nest_level)
313 m_root_elem_open = false;
314}
315
316template<typename _Handler, typename _Config>
317void sax_parser<_Handler,_Config>::special_tag()
318{
319 assert(cur_char() == '!');
320 // This can be either <![CDATA, <!--, or <!DOCTYPE.
321 size_t len = remains();
322 if (len < 2)
323 throw sax::malformed_xml_error("special tag too short.", offset());
324
325 switch (next_and_char())
326 {
327 case '-':
328 {
329 // Possibly comment.
330 if (next_and_char() != '-')
331 throw sax::malformed_xml_error("comment expected.", offset());
332
333 len -= 2;
334 if (len < 3)
335 throw sax::malformed_xml_error("malformed comment.", offset());
336
337 next();
338 comment();
339 }
340 break;
341 case '[':
342 {
343 // Possibly a CDATA.
344 expects_next("CDATA[", 6);
345 if (has_char())
346 cdata();
347 }
348 break;
349 case 'D':
350 {
351 // check if this is a DOCTYPE.
352 expects_next("OCTYPE", 6);
353 skip_space_and_control();
354 if (has_char())
355 doctype();
356 }
357 break;
358 default:
359 throw sax::malformed_xml_error("failed to parse special tag.", offset());
360 }
361}
362
363template<typename _Handler, typename _Config>
364void sax_parser<_Handler,_Config>::declaration(const char* name_check)
365{
366 assert(cur_char() == '?');
367 next_check();
368
369 // Get the declaration name first.
370 std::string_view decl_name;
371 name(decl_name);
372#if ORCUS_DEBUG_SAX_PARSER
373 cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
374#endif
375
376 if (name_check && decl_name != name_check)
377 {
378 std::ostringstream os;
379 os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
380 throw sax::malformed_xml_error(os.str(), offset());
381 }
382
383 m_handler.start_declaration(decl_name);
384 skip_space_and_control();
385
386 // Parse the attributes.
387 while (cur_char_checked() != '?')
388 {
389 attribute();
390 skip_space_and_control();
391 }
392 if (next_char_checked() != '>')
393 throw sax::malformed_xml_error("declaration must end with '?>'.", offset());
394
395 m_handler.end_declaration(decl_name);
396 reset_buffer_pos();
397 next();
398#if ORCUS_DEBUG_SAX_PARSER
399 cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
400#endif
401}
402
403template<typename _Handler, typename _Config>
404void sax_parser<_Handler,_Config>::cdata()
405{
406 size_t len = remains();
407 assert(len > 3);
408
409 // Parse until we reach ']]>'.
410 const char* p0 = mp_char;
411 size_t i = 0, match = 0;
412 for (char c = cur_char(); i < len; ++i, c = next_and_char())
413 {
414 if (c == ']')
415 {
416 // Be aware that we may encounter a series of more than two ']'
417 // characters, in which case we'll only count the last two.
418
419 if (match == 0)
420 // First ']'
421 ++match;
422 else if (match == 1)
423 // Second ']'
424 ++match;
425 }
426 else if (c == '>' && match == 2)
427 {
428 // Found ']]>'.
429 size_t cdata_len = i - 2;
430 m_handler.characters(std::string_view(p0, cdata_len), transient_stream());
431 next();
432 return;
433 }
434 else
435 match = 0;
436 }
437 throw sax::malformed_xml_error("malformed CDATA section.", offset());
438}
439
440template<typename _Handler, typename _Config>
441void sax_parser<_Handler,_Config>::doctype()
442{
443 // Parse the root element first.
444 sax::doctype_declaration param;
445 name(param.root_element);
446 skip_space_and_control();
447
448 // Either PUBLIC or SYSTEM.
449 size_t len = remains();
450 if (len < 6)
451 throw sax::malformed_xml_error("DOCTYPE section too short.", offset());
452
453 param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
454 char c = cur_char();
455 if (c == 'P')
456 {
457 if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
458 throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
459
460 param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
461 }
462 else if (c == 'S')
463 {
464 if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
465 throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
466 }
467
468 next_check();
469 skip_space_and_control();
470 has_char_throw("DOCTYPE section too short.");
471
472 // Parse FPI.
473 value(param.fpi, false);
474
475 has_char_throw("DOCTYPE section too short.");
476 skip_space_and_control();
477 has_char_throw("DOCTYPE section too short.");
478
479 if (cur_char() == '>')
480 {
481 // Optional URI not given. Exit.
482#if ORCUS_DEBUG_SAX_PARSER
483 cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
484#endif
485 m_handler.doctype(param);
486 next();
487 return;
488 }
489
490 // Parse optional URI.
491 value(param.uri, false);
492
493 has_char_throw("DOCTYPE section too short.");
494 skip_space_and_control();
495 has_char_throw("DOCTYPE section too short.");
496
497 if (cur_char() != '>')
498 throw sax::malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
499
500#if ORCUS_DEBUG_SAX_PARSER
501 cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
502#endif
503 m_handler.doctype(param);
504 next();
505}
506
507template<typename _Handler, typename _Config>
508void sax_parser<_Handler,_Config>::characters()
509{
510 const char* p0 = mp_char;
511 for (; has_char(); next())
512 {
513 if (cur_char() == '<')
514 break;
515
516 if (cur_char() == '&')
517 {
518 // Text span with one or more encoded characters. Parse using cell buffer.
519 cell_buffer& buf = get_cell_buffer();
520 buf.reset();
521 buf.append(p0, mp_char-p0);
522 characters_with_encoded_char(buf);
523 if (buf.empty())
524 m_handler.characters(std::string_view{}, transient_stream());
525 else
526 m_handler.characters(std::string_view(buf.get(), buf.size()), true);
527 return;
528 }
529 }
530
531 if (mp_char > p0)
532 {
533 std::string_view val(p0, mp_char-p0);
534 m_handler.characters(val, transient_stream());
535 }
536}
537
538template<typename _Handler, typename _Config>
539void sax_parser<_Handler,_Config>::attribute()
540{
541 sax::parser_attribute attr;
542 attribute_name(attr.ns, attr.name);
543
544#if ORCUS_DEBUG_SAX_PARSER
545 cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl;
546#endif
547
548 skip_space_and_control();
549
550 char c = cur_char();
551 if (c != '=')
552 {
553 std::ostringstream os;
554 os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
555 throw sax::malformed_xml_error(os.str(), offset());
556 }
557
558 next_check(); // skip the '='.
559 skip_space_and_control();
560
561 attr.transient = value(attr.value, true);
562 if (attr.transient)
563 // Value is stored in a temporary buffer. Push a new buffer.
564 inc_buffer_pos();
565
566#if ORCUS_DEBUG_SAX_PARSER
567 cout << "sax_parser::attribute: value='" << attr.value << "'" << endl;
568#endif
569
570 m_handler.attribute(attr);
571}
572
573}
574
575#endif
576/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition parser_base.hpp:41
Definition sax_parser_base.hpp:108
Definition sax_parser.hpp:28
void end_declaration(std::string_view decl)
Definition sax_parser.hpp:57
void doctype(const orcus::sax::doctype_declaration &param)
Definition sax_parser.hpp:35
void attribute(const orcus::sax::parser_attribute &attr)
Definition sax_parser.hpp:109
void characters(std::string_view val, bool transient)
Definition sax_parser.hpp:96
void start_declaration(std::string_view decl)
Definition sax_parser.hpp:47
void end_element(const orcus::sax::parser_element &elem)
Definition sax_parser.hpp:77
void start_element(const orcus::sax::parser_element &elem)
Definition sax_parser.hpp:67
Definition sax_parser.hpp:121
Definition sax_parser_base.hpp:45
Definition sax_parser_base.hpp:100
Definition sax_parser_base.hpp:85
Definition sax_parser.hpp:18
static const uint8_t baseline_version
Definition sax_parser.hpp:24