Libparserutils
inputstream.c
Go to the documentation of this file.
1/*
2 * This file is part of LibParserUtils.
3 * Licensed under the MIT License,
4 * http://www.opensource.org/licenses/mit-license.php
5 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
6 */
7
8#include <assert.h>
9#include <stdlib.h>
10#include <string.h>
11
15
16#include "input/filter.h"
17#include "utils/utils.h"
18
/* MIB enum for the stream's charset, or 0 when none has been set */
30 uint16_t mibenum;
/* Charset source value; 0 is what read_charset reports as "UTF-8" */
31 uint32_t encsrc;
37
/* Tail of the forward declaration of the BOM-stripping helper: it takes the
 * charset MIB enum by pointer and the buffer to examine.
 * NOTE(review): the first line of this declaration is absent from the
 * listing. */
41 uint16_t *mibenum, parserutils_buffer *buffer);
42
/**
 * Create an input stream.
 *
 * \param enc       Charset name for the input, or NULL if none is known
 * \param encsrc    Charset source value, stored only when enc is non-NULL
 * \param csdetect  Charset detection function, stored on the stream as given
 * \param stream    Location that receives the newly created stream
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if the stream object cannot be allocated,
 *         otherwise the error reported by the buffer/filter call that failed.
 *
 * NOTE(review): the line numbering of this listing has gaps (e.g. 59, 61,
 * 63-64, 67, 79, 81, 92-93, 99, 102, 105-107, 109, 114-115, 118-120), so
 * the function name, some declarations, the statements returned for a NULL
 * stream / unknown charset, and any cleanup performed next to each free(s)
 * are not visible here -- confirm against the real source.
 */
60 uint32_t encsrc, parserutils_charset_detect_func csdetect,
62{
65
66 if (stream == NULL)
68
/* Allocate the private stream object */
69 s = malloc(sizeof(parserutils_inputstream_private));
70 if (s == NULL)
71 return PARSERUTILS_NOMEM;
72
/* Create the buffer that holds raw (not yet converted) data */
73 error = parserutils_buffer_create(&s->raw);
74 if (error != PARSERUTILS_OK) {
75 free(s);
76 return error;
77 }
78
80 if (error != PARSERUTILS_OK) {
82 free(s);
83 return error;
84 }
85
/* Fresh stream: cursor at zero, no EOF seen, first chunk not yet processed */
86 s->public.cursor = 0;
87 s->public.had_eof = false;
88 s->done_first_chunk = false;
89
/* Create the charset conversion filter; "UTF-8" is passed as the filter's
 * internal encoding argument */
90 error = parserutils__filter_create("UTF-8", &s->input);
91 if (error != PARSERUTILS_OK) {
94 free(s);
95 return error;
96 }
97
/* A charset name was supplied: record it and point the filter at it */
98 if (enc != NULL) {
100
101 s->mibenum =
103
/* A MIB enum of 0 is treated as failure: the stream is released */
104 if (s->mibenum == 0) {
108 free(s);
110 }
111
112 params.encoding.name = enc;
113
116 &params);
117 if (error != PARSERUTILS_OK) {
121 free(s);
122 return error;
123 }
124
125 s->encsrc = encsrc;
126 } else {
/* No charset supplied: leave both charset fields zeroed */
127 s->mibenum = 0;
128 s->encsrc = 0;
129 }
130
131 s->csdetect = csdetect;
132
/* Hand the object back to the caller through the public stream type */
133 *stream = (parserutils_inputstream *) s;
134
135 return PARSERUTILS_OK;
136}
137
/**
 * Destroy an input stream.
 *
 * Returns PARSERUTILS_BADPARM when stream is NULL; otherwise the stream
 * object is freed and PARSERUTILS_OK is returned.
 *
 * NOTE(review): source lines 145, 147-148 and 153-155 are absent from this
 * listing; the function name, the declaration of s and any teardown of the
 * stream's filter and buffers would be on those lines -- confirm against the
 * real source.
 */
146{
149
150 if (stream == NULL)
151 return PARSERUTILS_BADPARM;
152
156 free(s);
157
158 return PARSERUTILS_OK;
159}
160
/**
 * Append data to an input stream.
 *
 * \param data  Bytes to append, or NULL to mark the stream as having hit EOF
 * \param len   Number of bytes in data
 * \return PARSERUTILS_BADPARM if stream is NULL,
 *         PARSERUTILS_OK when EOF has been recorded,
 *         otherwise the result of appending to the raw buffer.
 *
 * NOTE(review): the line carrying the function name and the declaration of
 * s (source lines 170, 173-174) are absent from this listing.
 */
171 const uint8_t *data, size_t len)
172{
175
176 if (stream == NULL)
177 return PARSERUTILS_BADPARM;
178
/* A NULL data pointer is the end-of-input signal; nothing is appended */
179 if (data == NULL) {
180 s->public.had_eof = true;
181 return PARSERUTILS_OK;
182 }
183
184 return parserutils_buffer_append(s->raw, data, len);
185}
186
/**
 * Insert data into stream at current location.
 *
 * \param data  Bytes to insert; must not be NULL
 * \param len   Number of bytes in data
 * \return PARSERUTILS_BADPARM if stream or data is NULL, otherwise the
 *         result of the call whose argument list ends on source line 206.
 *
 * NOTE(review): source lines 196, 199-200 and 205 are absent from this
 * listing, so the callee and its leading arguments are not visible here
 * (presumably parserutils_buffer_insert at the cursor -- confirm).
 */
197 const uint8_t *data, size_t len)
198{
201
202 if (stream == NULL || data == NULL)
203 return PARSERUTILS_BADPARM;
204
206 data, len);
207}
208
/* True when the top bit of x is clear; such a byte is handled below as a
 * character of length 1 */
209#define IS_ASCII(x) (((x) & 0x80) == 0)
210
/**
 * Look at the character in the stream that starts at offset bytes from the
 * cursor (slow version).
 *
 * \param offset  Byte offset from the cursor of the character to look at
 * \param ptr     Receives a pointer to the character inside the stream's
 *                UTF-8 buffer
 * \param length  Receives the character's length in bytes
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if stream, ptr or length is NULL,
 *         PARSERUTILS_EOF when more data is needed but EOF has been flagged,
 *         otherwise an error from the refill or from the UTF-8 length
 *         calculation.
 *
 * NOTE(review): source lines 233, 236-238, 248, 252, 258, 264 and 273 are
 * absent from this listing; they hold the function name, declarations, the
 * non-EOF arm of both conditional returns, the refill call and the return
 * taken when the refill produced no data -- confirm against the real source.
 */
234 size_t offset, const uint8_t **ptr, size_t *length)
235{
239 size_t len;
240
241 if (stream == NULL || ptr == NULL || length == NULL)
242 return PARSERUTILS_BADPARM;
243
244 /* There's insufficient data in the buffer, so read some more */
245 if (s->raw->length == 0) {
246 /* No more data to be had */
247 return s->public.had_eof ? PARSERUTILS_EOF
249 }
250
251 /* Refill utf8 buffer from raw buffer */
253 if (error != PARSERUTILS_OK)
254 return error;
255
256 /* Refill may have succeeded, but not actually produced any new data */
257 if (s->public.cursor + offset == s->public.utf8->length)
259
260 /* Now try the read */
261 if (IS_ASCII(s->public.utf8->data[s->public.cursor + offset])) {
262 len = 1;
263 } else {
/* Multi-byte case: the byte length is computed from the lead byte */
265 s->public.utf8->data + s->public.cursor + offset,
266 &len);
267
268 if (error != PARSERUTILS_OK && error != PARSERUTILS_NEEDDATA)
269 return error;
270
271 if (error == PARSERUTILS_NEEDDATA) {
272 return s->public.had_eof ? PARSERUTILS_EOF
274 }
275 }
276
/* Report the length and a pointer into the UTF-8 buffer; no copy is made */
277 (*length) = len;
278 (*ptr) = (s->public.utf8->data + s->public.cursor + offset);
279
280 return PARSERUTILS_OK;
281}
282
283#undef IS_ASCII
284
/**
 * Read the source charset of the input stream.
 *
 * \param stream  Input stream to query
 * \param source  Receives the stream's charset source value
 * \return NULL if stream or source is NULL,
 *         "UTF-8" when the charset source is 0,
 *         otherwise the value returned on source line 306.
 *
 * NOTE(review): source lines 292, 295-296 and 306 are absent from this
 * listing; the final return is presumably the canonical name for the
 * stream's MIB enum -- confirm against the real source.
 */
293 parserutils_inputstream *stream, uint32_t *source)
294{
297
298 if (stream == NULL || source == NULL)
299 return NULL;
300
301 *source = s->encsrc;
302
/* A source of 0 is the value stored when no charset was supplied */
303 if (s->encsrc == 0)
304 return "UTF-8";
305
307}
308
/**
 * Change the source charset of the input stream.
 *
 * \param enc     Name of the charset to switch to; must not be NULL
 * \param source  Charset source value to record alongside it
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if stream or enc is NULL,
 *         PARSERUTILS_INVALID once the first chunk has been processed,
 *         otherwise the error from reconfiguring the filter.
 *
 * NOTE(review): source lines 322, 325-327, 339 and 343-344 are absent from
 * this listing; they hold the function name, declarations of s and params,
 * the return taken when the charset name is not recognised, and the head of
 * the filter call -- confirm against the real source.
 */
323 const char *enc, uint32_t source)
324{
328 uint16_t temp;
329 parserutils_error error;
330
331 if (stream == NULL || enc == NULL)
332 return PARSERUTILS_BADPARM;
333
/* The charset may only be changed before the first chunk is processed */
334 if (s->done_first_chunk)
335 return PARSERUTILS_INVALID;
336
/* Look the name up first; the stream is left untouched until all checks pass */
337 temp = parserutils_charset_mibenum_from_name(enc, strlen(enc));
338 if (temp == 0)
340
341 /* Ensure filter is using the correct encoding */
342 params.encoding.name = enc;
345 &params);
346 if (error != PARSERUTILS_OK)
347 return error;
348
349 /* Finally, replace the current settings */
350 s->mibenum = temp;
351 s->encsrc = source;
352
353 return PARSERUTILS_OK;
354}
355
356/******************************************************************************
357 ******************************************************************************/
358
/**
 * Refill the UTF-8 buffer from the raw buffer.
 *
 * On the first call for a stream this also settles the charset (optional
 * detection callback, UTF-8 default, BOM stripping) and configures the
 * filter accordingly. Every call then makes room in the UTF-8 buffer,
 * runs raw data through the filter, drops the consumed raw bytes and
 * resets the cursor to 0.
 *
 * \return PARSERUTILS_OK on success, otherwise the first error reported by
 *         charset detection, BOM stripping, the filter or the buffer calls
 *         (a PARSERUTILS_NOMEM from the filter step is deliberately ignored).
 *
 * NOTE(review): source lines 359-366, 376, 410, 418, 425, 428 and 467 are
 * absent from this listing; they hold the function header, the declaration
 * of params and the heads of several calls -- confirm against the real
 * source.
 */
367{
368 const uint8_t *raw;
369 uint8_t *utf8;
370 size_t raw_length, utf8_space;
371 parserutils_error error;
372
373 /* If this is the first chunk of data, we must detect the charset and
374 * strip the BOM, if one exists */
375 if (stream->done_first_chunk == false) {
377
378 /* If there is a charset detection routine, give it an
379 * opportunity to override any charset specified when the
380 * inputstream was created */
381 if (stream->csdetect != NULL) {
382 error = stream->csdetect(stream->raw->data,
383 stream->raw->length,
384 &stream->mibenum, &stream->encsrc);
385 if (error != PARSERUTILS_OK) {
386 if (error != PARSERUTILS_NEEDDATA ||
387 stream->public.had_eof == false)
388 return error;
389
390 /* We don't have enough data to detect the
391 * input encoding, but we're not going to get
392 * any more as we've been notified of EOF.
393 * Therefore, leave the encoding alone
394 * so that any charset specified when the
395 * inputstream was created will be preserved.
396 * If there was no charset specified, then
397 * we'll default to UTF-8, below */
398 }
399 }
400
401 /* Default to UTF-8 if there is still no encoding information
402 * We'll do this if there was no encoding specified up-front
403 * and:
404 * 1) there was no charset detection routine
405 * or 2) there was insufficient data for the charset
406 * detection routine to detect an encoding
407 */
408 if (stream->mibenum == 0) {
409 stream->mibenum =
411 SLEN("UTF-8"));
412 stream->encsrc = 0;
413 }
414
415 assert(stream->mibenum != 0);
416
417 /* Strip any BOM, and update encoding as appropriate */
419 stream->raw);
420 if (error != PARSERUTILS_OK)
421 return error;
422
423 /* Ensure filter is using the correct encoding */
424 params.encoding.name =
426
427 error = parserutils__filter_setopt(stream->input,
429 &params);
430 if (error != PARSERUTILS_OK)
431 return error;
432
/* From here on the charset can no longer be changed (see change_charset) */
433 stream->done_first_chunk = true;
434 }
435
436 /* Work out how to perform the buffer fill */
437 if (stream->public.cursor == stream->public.utf8->length) {
438 /* Cursor's at the end, so simply reuse the entire buffer */
439 utf8 = stream->public.utf8->data;
440 utf8_space = stream->public.utf8->allocated;
441 } else {
442 /* Cursor's not at the end, so shift data after cursor to the
443 * bottom of the buffer. If the buffer's still over half full,
444 * extend it. */
445 memmove(stream->public.utf8->data,
446 stream->public.utf8->data + stream->public.cursor,
447 stream->public.utf8->length - stream->public.cursor);
448
449 stream->public.utf8->length -= stream->public.cursor;
450
451 if (stream->public.utf8->length >
452 stream->public.utf8->allocated / 2) {
453 error = parserutils_buffer_grow(stream->public.utf8);
454 if (error != PARSERUTILS_OK)
455 return error;
456 }
457
/* Computed after any grow, so both values reflect the buffer's current
 * data pointer and allocation */
458 utf8 = stream->public.utf8->data + stream->public.utf8->length;
459 utf8_space = stream->public.utf8->allocated -
460 stream->public.utf8->length;
461 }
462
463 raw = stream->raw->data;
464 raw_length = stream->raw->length;
465
466 /* Try to fill utf8 buffer from the raw data */
468 &raw, &raw_length, &utf8, &utf8_space);
469 /* _NOMEM implies that there's more input to read than available space
470 * in the utf8 buffer. That's fine, so we'll ignore that error. */
471 if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM)
472 return error;
473
474 /* Remove the raw data we've processed from the raw buffer */
475 error = parserutils_buffer_discard(stream->raw, 0,
476 stream->raw->length - raw_length);
477 if (error != PARSERUTILS_OK)
478 return error;
479
480 /* Fix up the utf8 buffer information */
481 stream->public.utf8->length =
482 stream->public.utf8->allocated - utf8_space;
483
484 /* Finally, fix up the cursor */
485 stream->public.cursor = 0;
486
487 return PARSERUTILS_OK;
488}
489
/**
 * Strip a BOM from a buffer in the given encoding.
 *
 * \param mibenum  In: MIB enum of the encoding to check for. Out: for the
 *                 endian-neutral UTF-16 / UTF-32 values this is rewritten to
 *                 the big-endian variant, or to the little-endian variant
 *                 when a little-endian BOM is found.
 * \param buffer   Buffer whose leading bytes are examined
 * \return PARSERUTILS_OK when control reaches the end of the function.
 *
 * NOTE(review): source lines 496, 508, 510, 516 and the line preceding each
 * "buffer, 0, ..._BOM_LEN);" fragment are absent from this listing. The
 * latter are the heads of the calls that remove the BOM (presumably
 * "return parserutils_buffer_discard(" -- confirm against the real source).
 */
497 parserutils_buffer *buffer)
498{
/* MIB enums for the Unicode encodings, looked up on first use and kept in
 * function-local statics for later calls */
499 static uint16_t utf8;
500 static uint16_t utf16;
501 static uint16_t utf16be;
502 static uint16_t utf16le;
503 static uint16_t utf32;
504 static uint16_t utf32be;
505 static uint16_t utf32le;
506
/* utf8 still being 0 is used as the "not yet initialised" marker */
507 if (utf8 == 0) {
509 SLEN("UTF-8"));
511 SLEN("UTF-16"));
512 utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
513 SLEN("UTF-16BE"));
514 utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
515 SLEN("UTF-16LE"));
517 SLEN("UTF-32"));
518 utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
519 SLEN("UTF-32BE"));
520 utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
521 SLEN("UTF-32LE"));
522 }
523
/* BOM lengths in bytes for each encoding family */
524#define UTF32_BOM_LEN (4)
525#define UTF16_BOM_LEN (2)
526#define UTF8_BOM_LEN (3)
527
/* UTF-8: BOM is EF BB BF */
528 if (*mibenum == utf8) {
529 if (buffer->length >= UTF8_BOM_LEN &&
530 buffer->data[0] == 0xEF &&
531 buffer->data[1] == 0xBB &&
532 buffer->data[2] == 0xBF) {
534 buffer, 0, UTF8_BOM_LEN);
535 }
/* UTF-16BE: BOM is FE FF */
536 } else if (*mibenum == utf16be) {
537 if (buffer->length >= UTF16_BOM_LEN &&
538 buffer->data[0] == 0xFE &&
539 buffer->data[1] == 0xFF) {
541 buffer, 0, UTF16_BOM_LEN);
542 }
/* UTF-16LE: BOM is FF FE */
543 } else if (*mibenum == utf16le) {
544 if (buffer->length >= UTF16_BOM_LEN &&
545 buffer->data[0] == 0xFF &&
546 buffer->data[1] == 0xFE) {
548 buffer, 0, UTF16_BOM_LEN);
549 }
/* Endian-neutral UTF-16: assume big-endian unless an LE BOM says otherwise */
550 } else if (*mibenum == utf16) {
551 *mibenum = utf16be;
552
553 if (buffer->length >= UTF16_BOM_LEN) {
554 if (buffer->data[0] == 0xFE &&
555 buffer->data[1] == 0xFF) {
557 buffer, 0, UTF16_BOM_LEN);
558 } else if (buffer->data[0] == 0xFF &&
559 buffer->data[1] == 0xFE) {
560 *mibenum = utf16le;
562 buffer, 0, UTF16_BOM_LEN);
563 }
564 }
/* UTF-32BE: BOM is 00 00 FE FF */
565 } else if (*mibenum == utf32be) {
566 if (buffer->length >= UTF32_BOM_LEN &&
567 buffer->data[0] == 0x00 &&
568 buffer->data[1] == 0x00 &&
569 buffer->data[2] == 0xFE &&
570 buffer->data[3] == 0xFF) {
572 buffer, 0, UTF32_BOM_LEN);
573 }
/* UTF-32LE: BOM is FF FE 00 00 */
574 } else if (*mibenum == utf32le) {
575 if (buffer->length >= UTF32_BOM_LEN &&
576 buffer->data[0] == 0xFF &&
577 buffer->data[1] == 0xFE &&
578 buffer->data[2] == 0x00 &&
579 buffer->data[3] == 0x00) {
581 buffer, 0, UTF32_BOM_LEN);
582 }
/* Endian-neutral UTF-32: assume big-endian unless an LE BOM says otherwise */
583 } else if (*mibenum == utf32) {
584 *mibenum = utf32be;
585
586 if (buffer->length >= UTF32_BOM_LEN) {
587 if (buffer->data[0] == 0x00 &&
588 buffer->data[1] == 0x00 &&
589 buffer->data[2] == 0xFE &&
590 buffer->data[3] == 0xFF) {
592 buffer, 0, UTF32_BOM_LEN);
593 } else if (buffer->data[0] == 0xFF &&
594 buffer->data[1] == 0xFE &&
595 buffer->data[2] == 0x00 &&
596 buffer->data[3] == 0x00) {
597 *mibenum = utf32le;
599 buffer, 0, UTF32_BOM_LEN);
600 }
601 }
602 }
603
604#undef UTF8_BOM_LEN
605#undef UTF16_BOM_LEN
606#undef UTF32_BOM_LEN
607
/* Any other encoding, or no BOM present: the buffer is left as it was */
608 return PARSERUTILS_OK;
609}
610
parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer, size_t offset, const uint8_t *data, size_t len)
Insert data into a memory buffer.
Definition buffer.c:97
parserutils_error parserutils_buffer_create(parserutils_buffer **buffer)
Create a memory buffer.
Definition buffer.c:22
parserutils_error parserutils_buffer_destroy(parserutils_buffer *buffer)
Destroy a memory buffer.
Definition buffer.c:53
parserutils_error parserutils_buffer_append(parserutils_buffer *buffer, const uint8_t *data, size_t len)
Append data to a memory buffer.
Definition buffer.c:72
parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer)
Extend the amount of space allocated for a memory buffer.
Definition buffer.c:150
parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer, size_t offset, size_t len)
Discard a section of a memory buffer.
Definition buffer.c:130
size_t len
Definition codec_8859.c:23
parserutils_error
Definition errors.h:18
@ PARSERUTILS_OK
Definition errors.h:19
@ PARSERUTILS_BADENCODING
Definition errors.h:26
@ PARSERUTILS_EOF
Definition errors.h:27
@ PARSERUTILS_NEEDDATA
Definition errors.h:25
@ PARSERUTILS_INVALID
Definition errors.h:23
@ PARSERUTILS_NOMEM
Definition errors.h:21
@ PARSERUTILS_BADPARM
Definition errors.h:22
parserutils_error parserutils__filter_process_chunk(parserutils_filter *input, const uint8_t **data, size_t *len, uint8_t **output, size_t *outlen)
Process a chunk of data.
Definition filter.c:179
parserutils_error parserutils__filter_create(const char *int_enc, parserutils_filter **filter)
Create an input filter.
Definition filter.c:58
parserutils_error parserutils__filter_destroy(parserutils_filter *input)
Destroy an input filter.
Definition filter.c:114
parserutils_error parserutils__filter_setopt(parserutils_filter *input, parserutils_filter_opttype type, parserutils_filter_optparams *params)
Configure an input filter.
Definition filter.c:149
@ PARSERUTILS_FILTER_SET_ENCODING
Definition filter.h:22
#define UTF16_BOM_LEN
static parserutils_error parserutils_inputstream_strip_bom(uint16_t *mibenum, parserutils_buffer *buffer)
Strip a BOM from a buffer in the given encoding.
parserutils_error parserutils_inputstream_insert(parserutils_inputstream *stream, const uint8_t *data, size_t len)
Insert data into stream at current location.
static parserutils_error parserutils_inputstream_refill_buffer(parserutils_inputstream_private *stream)
Refill the UTF-8 buffer from the raw buffer.
const char * parserutils_inputstream_read_charset(parserutils_inputstream *stream, uint32_t *source)
Read the source charset of the input stream.
parserutils_error parserutils_inputstream_create(const char *enc, uint32_t encsrc, parserutils_charset_detect_func csdetect, parserutils_inputstream **stream)
Create an input stream.
Definition inputstream.c:59
#define UTF32_BOM_LEN
parserutils_error parserutils_inputstream_change_charset(parserutils_inputstream *stream, const char *enc, uint32_t source)
Change the source charset of the input stream.
#define IS_ASCII(x)
parserutils_error parserutils_inputstream_peek_slow(parserutils_inputstream *stream, size_t offset, const uint8_t **ptr, size_t *length)
Look at the character in the stream that starts at offset bytes from the cursor (slow version)
parserutils_error parserutils_inputstream_destroy(parserutils_inputstream *stream)
Destroy an input stream.
parserutils_error parserutils_inputstream_append(parserutils_inputstream *stream, const uint8_t *data, size_t len)
Append data to an input stream.
#define UTF8_BOM_LEN
parserutils_error(* parserutils_charset_detect_func)(const uint8_t *data, size_t len, uint16_t *mibenum, uint32_t *source)
Type of charset detection function.
Definition inputstream.h:32
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
Retrieve the MIB enum value assigned to an encoding name.
Definition aliases.c:107
const char * parserutils_charset_mibenum_to_name(uint16_t mibenum)
Retrieve the canonical name of an encoding from the MIB enum.
Definition aliases.c:127
uint8_t * data
Definition buffer.h:21
size_t allocated
Definition buffer.h:23
Input filter.
Definition filter.c:24
Private input stream definition.
Definition inputstream.c:22
bool done_first_chunk
Whether the first chunk has been processed.
Definition inputstream.c:27
uint32_t encsrc
Charset source.
Definition inputstream.c:31
parserutils_inputstream public
Public part.
Definition inputstream.c:23
parserutils_filter * input
Charset conversion filter.
Definition inputstream.c:33
parserutils_buffer * raw
Buffer containing raw data.
Definition inputstream.c:25
uint16_t mibenum
MIB enum for charset, or 0.
Definition inputstream.c:30
parserutils_charset_detect_func csdetect
Charset detection func.
Definition inputstream.c:35
Input stream object.
Definition inputstream.h:40
parserutils_buffer * utf8
Buffer containing UTF-8 data.
Definition inputstream.h:41
uint32_t cursor
Byte offset of current position.
Definition inputstream.h:43
bool had_eof
Whether EOF has been reached.
Definition inputstream.h:45
Input filter option parameters.
Definition filter.h:28
const char * name
Encoding name.
Definition filter.h:32
struct parserutils_filter_optparams::@5 encoding
Parameters for encoding setting.
UTF-8 manipulation functions (interface).
parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s, size_t *len)
Calculate the length (in bytes) of a UTF-8 character.
Definition utf8.c:107
#define SLEN(s)
Definition utils.h:21