Geant4 9.6.0
Toolkit for the simulation of the passage of particles through matter
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
xmltok.cc
Go to the documentation of this file.
1/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2 See the file COPYING for copying permission.
3*/
4
5#include <stddef.h>
6
7#ifdef COMPILED_FROM_DSP
8#include "winconfig.h"
9#elif defined(MACOS_CLASSIC)
10#include "macconfig.h"
11#elif defined(__amigaos4__)
12#include "amigaconfig.h"
13#elif defined(__WATCOMC__)
14#include "watcomconfig.h"
15#else
16#ifdef HAVE_EXPAT_CONFIG_H
17#include <expat_config.h>
18#endif
19#endif /* ndef COMPILED_FROM_DSP */
20
21#include "expat_external.h"
22#include "internal.h"
23#include "xmltok.h"
24#include "nametab.h"
25
26#ifdef XML_DTD
27#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
28#else
29#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
30#endif
31
32#define VTABLE1 \
33 { PREFIX(prologTok), PREFIX(contentTok), \
34 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
35 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
36 PREFIX(sameName), \
37 PREFIX(nameMatchesAscii), \
38 PREFIX(nameLength), \
39 PREFIX(skipS), \
40 PREFIX(getAtts), \
41 PREFIX(charRefNumber), \
42 PREFIX(predefinedEntityName), \
43 PREFIX(updatePosition), \
44 PREFIX(isPublicId)
45
46#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
47
48#define UCS2_GET_NAMING(pages, hi, lo) \
49 (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
50
51/* A 2-byte UTF-8 representation splits the character's 11 bits between
52 the bottom 5 and 6 bits of the two bytes. We need 8 bits to index into
53 pages, 3 bits to add to that index and 5 bits to generate the mask.
54*/
55#define UTF8_GET_NAMING2(pages, byte) \
56 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
57 + ((((byte)[0]) & 3) << 1) \
58 + ((((byte)[1]) >> 5) & 1)] \
59 & (1 << (((byte)[1]) & 0x1F)))
60
61/* A 3-byte UTF-8 representation splits the character's 16 bits between
62 the bottom 4, 6 and 6 bits of the three bytes. We need 8 bits to index
63 into pages, 3 bits to add to that index and 5 bits to generate the
64 mask.
65*/
66#define UTF8_GET_NAMING3(pages, byte) \
67 (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
68 + ((((byte)[1]) >> 2) & 0xF)] \
69 << 3) \
70 + ((((byte)[1]) & 3) << 1) \
71 + ((((byte)[2]) >> 5) & 1)] \
72 & (1 << (((byte)[2]) & 0x1F)))
73
74#define UTF8_GET_NAMING(pages, p, n) \
75 ((n) == 2 \
76 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
77 : ((n) == 3 \
78 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
79 : 0))
80
81/* Detection of invalid UTF-8 sequences is based on Table 3.1B
82 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
83 with the additional restriction of not allowing the Unicode
84 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
85 Implementation details:
86 (A & 0x80) == 0 means A < 0x80
87 and
88 (A & 0xC0) == 0xC0 means A > 0xBF
89*/
90
91#define UTF8_INVALID2(p) \
92 ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
93
94#define UTF8_INVALID3(p) \
95 (((p)[2] & 0x80) == 0 \
96 || \
97 ((*p) == 0xEF && (p)[1] == 0xBF \
98 ? \
99 (p)[2] > 0xBD \
100 : \
101 ((p)[2] & 0xC0) == 0xC0) \
102 || \
103 ((*p) == 0xE0 \
104 ? \
105 (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
106 : \
107 ((p)[1] & 0x80) == 0 \
108 || \
109 ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
110
111#define UTF8_INVALID4(p) \
112 (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
113 || \
114 ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
115 || \
116 ((*p) == 0xF0 \
117 ? \
118 (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
119 : \
120 ((p)[1] & 0x80) == 0 \
121 || \
122 ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
123
124static int PTRFASTCALL
125isNever(const ENCODING *enc, const char *p)
126{
127 return 0;
128}
129
130static int PTRFASTCALL
131utf8_isName2(const ENCODING *enc, const char *p)
132{
133 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
134}
135
136static int PTRFASTCALL
137utf8_isName3(const ENCODING *enc, const char *p)
138{
139 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
140}
141
142#define utf8_isName4 isNever
143
144static int PTRFASTCALL
145utf8_isNmstrt2(const ENCODING *enc, const char *p)
146{
147 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
148}
149
150static int PTRFASTCALL
151utf8_isNmstrt3(const ENCODING *enc, const char *p)
152{
153 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
154}
155
156#define utf8_isNmstrt4 isNever
157
158static int PTRFASTCALL
159utf8_isInvalid2(const ENCODING *enc, const char *p)
160{
161 return UTF8_INVALID2((const unsigned char *)p);
162}
163
164static int PTRFASTCALL
165utf8_isInvalid3(const ENCODING *enc, const char *p)
166{
167 return UTF8_INVALID3((const unsigned char *)p);
168}
169
170static int PTRFASTCALL
171utf8_isInvalid4(const ENCODING *enc, const char *p)
172{
173 return UTF8_INVALID4((const unsigned char *)p);
174}
175
178 unsigned char type[256];
179#ifdef XML_MIN_SIZE
180 int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
181 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
182 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
183 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
184 int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
185#endif /* XML_MIN_SIZE */
186 int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
187 int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
188 int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
189 int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
190 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
191 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
192 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
193 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
194 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
195};
196
197#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
198
199#ifdef XML_MIN_SIZE
200
201#define STANDARD_VTABLE(E) \
202 E ## byteType, \
203 E ## isNameMin, \
204 E ## isNmstrtMin, \
205 E ## byteToAscii, \
206 E ## charMatches,
207
208#else
209
210#define STANDARD_VTABLE(E) /* as nothing */
211
212#endif
213
214#define NORMAL_VTABLE(E) \
215 E ## isName2, \
216 E ## isName3, \
217 E ## isName4, \
218 E ## isNmstrt2, \
219 E ## isNmstrt3, \
220 E ## isNmstrt4, \
221 E ## isInvalid2, \
222 E ## isInvalid3, \
223 E ## isInvalid4
224
225static int FASTCALL checkCharRefNumber(int);
226
227#include "xmltok_impl.h"
228#include "ascii.h"
229
230#ifdef XML_MIN_SIZE
231#define sb_isNameMin isNever
232#define sb_isNmstrtMin isNever
233#endif
234
235#ifdef XML_MIN_SIZE
236#define MINBPC(enc) ((enc)->minBytesPerChar)
237#else
238/* minimum bytes per character */
239#define MINBPC(enc) 1
240#endif
241
242#define SB_BYTE_TYPE(enc, p) \
243 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
244
245#ifdef XML_MIN_SIZE
246static int PTRFASTCALL
247sb_byteType(const ENCODING *enc, const char *p)
248{
249 return SB_BYTE_TYPE(enc, p);
250}
251#define BYTE_TYPE(enc, p) \
252 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
253#else
254#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
255#endif
256
257#ifdef XML_MIN_SIZE
258#define BYTE_TO_ASCII(enc, p) \
259 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
260static int PTRFASTCALL
261sb_byteToAscii(const ENCODING *enc, const char *p)
262{
263 return *p;
264}
265#else
266#define BYTE_TO_ASCII(enc, p) (*(p))
267#endif
268
269#define IS_NAME_CHAR(enc, p, n) \
270 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
271#define IS_NMSTRT_CHAR(enc, p, n) \
272 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
273#define IS_INVALID_CHAR(enc, p, n) \
274 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
275
276#ifdef XML_MIN_SIZE
277#define IS_NAME_CHAR_MINBPC(enc, p) \
278 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
279#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
280 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
281#else
282#define IS_NAME_CHAR_MINBPC(enc, p) (0)
283#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
284#endif
285
286#ifdef XML_MIN_SIZE
287#define CHAR_MATCHES(enc, p, c) \
288 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
289static int PTRCALL
290sb_charMatches(const ENCODING *enc, const char *p, int c)
291{
292 return *p == c;
293}
294#else
295/* c is an ASCII character */
296#define CHAR_MATCHES(enc, p, c) (*(p) == c)
297#endif
298
299#define PREFIX(ident) normal_ ## ident
300#define XML_TOK_IMPL_C
301#include "xmltok_impl.c"
302#undef XML_TOK_IMPL_C
303
304#undef MINBPC
305#undef BYTE_TYPE
306#undef BYTE_TO_ASCII
307#undef CHAR_MATCHES
308#undef IS_NAME_CHAR
309#undef IS_NAME_CHAR_MINBPC
310#undef IS_NMSTRT_CHAR
311#undef IS_NMSTRT_CHAR_MINBPC
312#undef IS_INVALID_CHAR
313
314enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
318 UTF8_cval4 = 0xf0
320
321static void PTRCALL
322utf8_toUtf8(const ENCODING *enc,
323 const char **fromP, const char *fromLim,
324 char **toP, const char *toLim)
325{
326 char *to;
327 const char *from;
328 if (fromLim - *fromP > toLim - *toP) {
329 /* Avoid copying partial characters. */
330 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
331 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
332 break;
333 }
334 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
335 *to = *from;
336 *fromP = from;
337 *toP = to;
338}
339
340static void PTRCALL
341utf8_toUtf16(const ENCODING *enc,
342 const char **fromP, const char *fromLim,
343 unsigned short **toP, const unsigned short *toLim)
344{
345 unsigned short *to = *toP;
346 const char *from = *fromP;
347 while (from != fromLim && to != toLim) {
348 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
349 case BT_LEAD2:
350 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
351 from += 2;
352 break;
353 case BT_LEAD3:
354 *to++ = (unsigned short)(((from[0] & 0xf) << 12)
355 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
356 from += 3;
357 break;
358 case BT_LEAD4:
359 {
360 unsigned long n;
361 if (to + 1 == toLim)
362 goto after;
363 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
364 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
365 n -= 0x10000;
366 to[0] = (unsigned short)((n >> 10) | 0xD800);
367 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
368 to += 2;
369 from += 4;
370 }
371 break;
372 default:
373 *to++ = *from++;
374 break;
375 }
376 }
377after:
378 *fromP = from;
379 *toP = to;
380}
381
382#ifdef XML_NS
383static const struct normal_encoding utf8_encoding_ns = {
384 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
385 {
386#include "asciitab.h"
387#include "utf8tab.h"
388 },
390};
391#endif
392
393static const struct normal_encoding utf8_encoding = {
394 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
395 {
396#define BT_COLON BT_NMSTRT
397#include "asciitab.h"
398#undef BT_COLON
399#include "utf8tab.h"
400 },
402};
403
404#ifdef XML_NS
405
406static const struct normal_encoding internal_utf8_encoding_ns = {
407 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
408 {
409#include "iasciitab.h"
410#include "utf8tab.h"
411 },
413};
414
415#endif
416
417static const struct normal_encoding internal_utf8_encoding = {
418 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
419 {
420#define BT_COLON BT_NMSTRT
421#include "iasciitab.h"
422#undef BT_COLON
423#include "utf8tab.h"
424 },
426};
427
428static void PTRCALL
429latin1_toUtf8(const ENCODING *enc,
430 const char **fromP, const char *fromLim,
431 char **toP, const char *toLim)
432{
433 for (;;) {
434 unsigned char c;
435 if (*fromP == fromLim)
436 break;
437 c = (unsigned char)**fromP;
438 if (c & 0x80) {
439 if (toLim - *toP < 2)
440 break;
441 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
442 *(*toP)++ = (char)((c & 0x3f) | 0x80);
443 (*fromP)++;
444 }
445 else {
446 if (*toP == toLim)
447 break;
448 *(*toP)++ = *(*fromP)++;
449 }
450 }
451}
452
453static void PTRCALL
454latin1_toUtf16(const ENCODING *enc,
455 const char **fromP, const char *fromLim,
456 unsigned short **toP, const unsigned short *toLim)
457{
458 while (*fromP != fromLim && *toP != toLim)
459 *(*toP)++ = (unsigned char)*(*fromP)++;
460}
461
462#ifdef XML_NS
463
464static const struct normal_encoding latin1_encoding_ns = {
465 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
466 {
467#include "asciitab.h"
468#include "latin1tab.h"
469 },
470 STANDARD_VTABLE(sb_)
471};
472
473#endif
474
475static const struct normal_encoding latin1_encoding = {
476 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
477 {
478#define BT_COLON BT_NMSTRT
479#include "asciitab.h"
480#undef BT_COLON
481#include "latin1tab.h"
482 },
483 STANDARD_VTABLE(sb_)
484};
485
486static void PTRCALL
487ascii_toUtf8(const ENCODING *enc,
488 const char **fromP, const char *fromLim,
489 char **toP, const char *toLim)
490{
491 while (*fromP != fromLim && *toP != toLim)
492 *(*toP)++ = *(*fromP)++;
493}
494
495#ifdef XML_NS
496
497static const struct normal_encoding ascii_encoding_ns = {
498 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
499 {
500#include "asciitab.h"
501/* BT_NONXML == 0 */
502 },
503 STANDARD_VTABLE(sb_)
504};
505
506#endif
507
508static const struct normal_encoding ascii_encoding = {
509 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
510 {
511#define BT_COLON BT_NMSTRT
512#include "asciitab.h"
513#undef BT_COLON
514/* BT_NONXML == 0 */
515 },
516 STANDARD_VTABLE(sb_)
517};
518
519static int PTRFASTCALL
520unicode_byte_type(char hi, char lo)
521{
522 switch ((unsigned char)hi) {
523 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
524 return BT_LEAD4;
525 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
526 return BT_TRAIL;
527 case 0xFF:
528 switch ((unsigned char)lo) {
529 case 0xFF:
530 case 0xFE:
531 return BT_NONXML;
532 }
533 break;
534 }
535 return BT_NONASCII;
536}
537
538#define DEFINE_UTF16_TO_UTF8(E) \
539static void PTRCALL \
540E ## toUtf8(const ENCODING *enc, \
541 const char **fromP, const char *fromLim, \
542 char **toP, const char *toLim) \
543{ \
544 const char *from; \
545 for (from = *fromP; from != fromLim; from += 2) { \
546 int plane; \
547 unsigned char lo2; \
548 unsigned char lo = GET_LO(from); \
549 unsigned char hi = GET_HI(from); \
550 switch (hi) { \
551 case 0: \
552 if (lo < 0x80) { \
553 if (*toP == toLim) { \
554 *fromP = from; \
555 return; \
556 } \
557 *(*toP)++ = lo; \
558 break; \
559 } \
560 /* fall through */ \
561 case 0x1: case 0x2: case 0x3: \
562 case 0x4: case 0x5: case 0x6: case 0x7: \
563 if (toLim - *toP < 2) { \
564 *fromP = from; \
565 return; \
566 } \
567 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
568 *(*toP)++ = ((lo & 0x3f) | 0x80); \
569 break; \
570 default: \
571 if (toLim - *toP < 3) { \
572 *fromP = from; \
573 return; \
574 } \
575 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
576 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
577 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
578 *(*toP)++ = ((lo & 0x3f) | 0x80); \
579 break; \
580 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
581 if (toLim - *toP < 4) { \
582 *fromP = from; \
583 return; \
584 } \
585 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
586 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
587 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
588 from += 2; \
589 lo2 = GET_LO(from); \
590 *(*toP)++ = (((lo & 0x3) << 4) \
591 | ((GET_HI(from) & 0x3) << 2) \
592 | (lo2 >> 6) \
593 | 0x80); \
594 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
595 break; \
596 } \
597 } \
598 *fromP = from; \
599}
600
601#define DEFINE_UTF16_TO_UTF16(E) \
602static void PTRCALL \
603E ## toUtf16(const ENCODING *enc, \
604 const char **fromP, const char *fromLim, \
605 unsigned short **toP, const unsigned short *toLim) \
606{ \
607 /* Avoid copying first half only of surrogate */ \
608 if (fromLim - *fromP > ((toLim - *toP) << 1) \
609 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
610 fromLim -= 2; \
611 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
612 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
613}
614
615#define SET2(ptr, ch) \
616 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
617#define GET_LO(ptr) ((unsigned char)(ptr)[0])
618#define GET_HI(ptr) ((unsigned char)(ptr)[1])
619
620DEFINE_UTF16_TO_UTF8(little2_)
621DEFINE_UTF16_TO_UTF16(little2_)
622
623#undef SET2
624#undef GET_LO
625#undef GET_HI
626
627#define SET2(ptr, ch) \
628 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
629#define GET_LO(ptr) ((unsigned char)(ptr)[1])
630#define GET_HI(ptr) ((unsigned char)(ptr)[0])
631
634
635#undef SET2
636#undef GET_LO
637#undef GET_HI
638
639#define LITTLE2_BYTE_TYPE(enc, p) \
640 ((p)[1] == 0 \
641 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
642 : unicode_byte_type((p)[1], (p)[0]))
643#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
644#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
645#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
646 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
647#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
648 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
649
650#ifdef XML_MIN_SIZE
651
652static int PTRFASTCALL
653little2_byteType(const ENCODING *enc, const char *p)
654{
655 return LITTLE2_BYTE_TYPE(enc, p);
656}
657
658static int PTRFASTCALL
659little2_byteToAscii(const ENCODING *enc, const char *p)
660{
661 return LITTLE2_BYTE_TO_ASCII(enc, p);
662}
663
664static int PTRCALL
665little2_charMatches(const ENCODING *enc, const char *p, int c)
666{
667 return LITTLE2_CHAR_MATCHES(enc, p, c);
668}
669
670static int PTRFASTCALL
671little2_isNameMin(const ENCODING *enc, const char *p)
672{
674}
675
676static int PTRFASTCALL
677little2_isNmstrtMin(const ENCODING *enc, const char *p)
678{
680}
681
682#undef VTABLE
683#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
684
685#else /* not XML_MIN_SIZE */
686
687#undef PREFIX
688#define PREFIX(ident) little2_ ## ident
689#define MINBPC(enc) 2
690/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
691#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
692#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
693#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
694#define IS_NAME_CHAR(enc, p, n) 0
695#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
696#define IS_NMSTRT_CHAR(enc, p, n) (0)
697#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
698
699#define XML_TOK_IMPL_C
700#include "xmltok_impl.c"
701#undef XML_TOK_IMPL_C
702
703#undef MINBPC
704#undef BYTE_TYPE
705#undef BYTE_TO_ASCII
706#undef CHAR_MATCHES
707#undef IS_NAME_CHAR
708#undef IS_NAME_CHAR_MINBPC
709#undef IS_NMSTRT_CHAR
710#undef IS_NMSTRT_CHAR_MINBPC
711#undef IS_INVALID_CHAR
712
713#endif /* not XML_MIN_SIZE */
714
715#ifdef XML_NS
716
717static const struct normal_encoding little2_encoding_ns = {
718 { VTABLE, 2, 0,
719#if BYTEORDER == 1234
720 1
721#else
722 0
723#endif
724 },
725 {
726#include "asciitab.h"
727#include "latin1tab.h"
728 },
729 STANDARD_VTABLE(little2_)
730};
731
732#endif
733
734static const struct normal_encoding little2_encoding = {
735 { VTABLE, 2, 0,
736#if BYTEORDER == 1234
737 1
738#else
739 0
740#endif
741 },
742 {
743#define BT_COLON BT_NMSTRT
744#include "asciitab.h"
745#undef BT_COLON
746#include "latin1tab.h"
747 },
748 STANDARD_VTABLE(little2_)
749};
750
751#if BYTEORDER != 4321
752
753#ifdef XML_NS
754
755static const struct normal_encoding internal_little2_encoding_ns = {
756 { VTABLE, 2, 0, 1 },
757 {
758#include "iasciitab.h"
759#include "latin1tab.h"
760 },
761 STANDARD_VTABLE(little2_)
762};
763
764#endif
765
766static const struct normal_encoding internal_little2_encoding = {
767 { VTABLE, 2, 0, 1 },
768 {
769#define BT_COLON BT_NMSTRT
770#include "iasciitab.h"
771#undef BT_COLON
772#include "latin1tab.h"
773 },
774 STANDARD_VTABLE(little2_)
775};
776
777#endif
778
779
780#define BIG2_BYTE_TYPE(enc, p) \
781 ((p)[0] == 0 \
782 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
783 : unicode_byte_type((p)[0], (p)[1]))
784#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
785#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
786#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
787 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
788#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
789 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
790
791#ifdef XML_MIN_SIZE
792
793static int PTRFASTCALL
794big2_byteType(const ENCODING *enc, const char *p)
795{
796 return BIG2_BYTE_TYPE(enc, p);
797}
798
799static int PTRFASTCALL
800big2_byteToAscii(const ENCODING *enc, const char *p)
801{
802 return BIG2_BYTE_TO_ASCII(enc, p);
803}
804
805static int PTRCALL
806big2_charMatches(const ENCODING *enc, const char *p, int c)
807{
808 return BIG2_CHAR_MATCHES(enc, p, c);
809}
810
811static int PTRFASTCALL
812big2_isNameMin(const ENCODING *enc, const char *p)
813{
814 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
815}
816
817static int PTRFASTCALL
818big2_isNmstrtMin(const ENCODING *enc, const char *p)
819{
821}
822
823#undef VTABLE
824#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
825
826#else /* not XML_MIN_SIZE */
827
828#undef PREFIX
829#define PREFIX(ident) big2_ ## ident
830#define MINBPC(enc) 2
831/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
832#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
833#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
834#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
835#define IS_NAME_CHAR(enc, p, n) 0
836#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
837#define IS_NMSTRT_CHAR(enc, p, n) (0)
838#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
839
840#define XML_TOK_IMPL_C
841#include "xmltok_impl.c"
842#undef XML_TOK_IMPL_C
843
844#undef MINBPC
845#undef BYTE_TYPE
846#undef BYTE_TO_ASCII
847#undef CHAR_MATCHES
848#undef IS_NAME_CHAR
849#undef IS_NAME_CHAR_MINBPC
850#undef IS_NMSTRT_CHAR
851#undef IS_NMSTRT_CHAR_MINBPC
852#undef IS_INVALID_CHAR
853
854#endif /* not XML_MIN_SIZE */
855
856#ifdef XML_NS
857
858static const struct normal_encoding big2_encoding_ns = {
859 { VTABLE, 2, 0,
860#if BYTEORDER == 4321
861 1
862#else
863 0
864#endif
865 },
866 {
867#include "asciitab.h"
868#include "latin1tab.h"
869 },
870 STANDARD_VTABLE(big2_)
871};
872
873#endif
874
875static const struct normal_encoding big2_encoding = {
876 { VTABLE, 2, 0,
877#if BYTEORDER == 4321
878 1
879#else
880 0
881#endif
882 },
883 {
884#define BT_COLON BT_NMSTRT
885#include "asciitab.h"
886#undef BT_COLON
887#include "latin1tab.h"
888 },
889 STANDARD_VTABLE(big2_)
890};
891
892#if BYTEORDER != 1234
893
894#ifdef XML_NS
895
896static const struct normal_encoding internal_big2_encoding_ns = {
897 { VTABLE, 2, 0, 1 },
898 {
899#include "iasciitab.h"
900#include "latin1tab.h"
901 },
902 STANDARD_VTABLE(big2_)
903};
904
905#endif
906
907static const struct normal_encoding internal_big2_encoding = {
908 { VTABLE, 2, 0, 1 },
909 {
910#define BT_COLON BT_NMSTRT
911#include "iasciitab.h"
912#undef BT_COLON
913#include "latin1tab.h"
914 },
915 STANDARD_VTABLE(big2_)
916};
917
918#endif
919
920#undef PREFIX
921
922static int FASTCALL
923streqci(const char *s1, const char *s2)
924{
925 for (;;) {
926 char c1 = *s1++;
927 char c2 = *s2++;
928 if (ASCII_a <= c1 && c1 <= ASCII_z)
929 c1 += ASCII_A - ASCII_a;
930 if (ASCII_a <= c2 && c2 <= ASCII_z)
931 c2 += ASCII_A - ASCII_a;
932 if (c1 != c2)
933 return 0;
934 if (!c1)
935 break;
936 }
937 return 1;
938}
939
940static void PTRCALL
941initUpdatePosition(const ENCODING *enc, const char *ptr,
942 const char *end, POSITION *pos)
943{
944 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
945}
946
947static int
948toAscii(const ENCODING *enc, const char *ptr, const char *end)
949{
950 char buf[1];
951 char *p = buf;
952 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
953 if (p == buf)
954 return -1;
955 else
956 return buf[0];
957}
958
959static int FASTCALL
960isSpace(int c)
961{
962 switch (c) {
963 case 0x20:
964 case 0xD:
965 case 0xA:
966 case 0x9:
967 return 1;
968 }
969 return 0;
970}
971
972/* Return 1 if there's just optional white space or there's an S
973 followed by name=val.
974*/
/* Parse one pseudo-attribute of the form  S name S? '=' S? quote value quote
   from [ptr, end), stepping enc->minBytesPerChar bytes per character.

   Outputs:
     *namePtr     start of the name, or NULL when only white space (or
                  nothing) remained before end
     *nameEndPtr  first position after the name
     *valPtr      first position after the opening quote
     *nextTokPtr  on success with a name: just past the closing quote;
                  on failure: the position where parsing gave up.
                  It is not written when the function returns 1 with
                  *namePtr == NULL.

   Returns 1 on success (including the "nothing left" case), 0 on error.
   The value may contain only ASCII letters, digits, '.', '-' and '_'.
*/
975static int
976parsePseudoAttribute(const ENCODING *enc,
977 const char *ptr,
978 const char *end,
979 const char **namePtr,
980 const char **nameEndPtr,
981 const char **valPtr,
982 const char **nextTokPtr)
983{
984 int c;
985 char open;
 /* Nothing left at all: report "no attribute" as success. */
986 if (ptr == end) {
987 *namePtr = NULL;
988 return 1;
989 }
 /* An attribute must be preceded by at least one white-space character. */
990 if (!isSpace(toAscii(enc, ptr, end))) {
991 *nextTokPtr = ptr;
992 return 0;
993 }
 /* Skip the white space; toAscii() yields -1 at end, which ends the loop. */
994 do {
995 ptr += enc->minBytesPerChar;
996 } while (isSpace(toAscii(enc, ptr, end)));
997 if (ptr == end) {
998 *namePtr = NULL;
999 return 1;
1000 }
1001 *namePtr = ptr;
 /* Scan the name up to '=' (optionally preceded by white space). */
1002 for (;;) {
1003 c = toAscii(enc, ptr, end);
1004 if (c == -1) {
1005 *nextTokPtr = ptr;
1006 return 0;
1007 }
1008 if (c == ASCII_EQUALS) {
1009 *nameEndPtr = ptr;
1010 break;
1011 }
1012 if (isSpace(c)) {
1013 *nameEndPtr = ptr;
1014 do {
1015 ptr += enc->minBytesPerChar;
1016 } while (isSpace(c = toAscii(enc, ptr, end)));
1017 if (c != ASCII_EQUALS) {
1018 *nextTokPtr = ptr;
1019 return 0;
1020 }
1021 break;
1022 }
1023 ptr += enc->minBytesPerChar;
1024 }
 /* '=' found immediately at the name start: the name is empty. */
1025 if (ptr == *namePtr) {
1026 *nextTokPtr = ptr;
1027 return 0;
1028 }
 /* Step over '=' and any white space before the opening quote. */
1029 ptr += enc->minBytesPerChar;
1030 c = toAscii(enc, ptr, end);
1031 while (isSpace(c)) {
1032 ptr += enc->minBytesPerChar;
1033 c = toAscii(enc, ptr, end);
1034 }
1035 if (c != ASCII_QUOT && c != ASCII_APOS) {
1036 *nextTokPtr = ptr;
1037 return 0;
1038 }
 /* Remember which quote opened the value; the same one must close it. */
1039 open = (char)c;
1040 ptr += enc->minBytesPerChar;
1041 *valPtr = ptr;
 /* Walk the value until the matching quote, rejecting any character
    outside [A-Za-z0-9._-] (this also rejects -1, i.e. running off end). */
1042 for (;; ptr += enc->minBytesPerChar) {
1043 c = toAscii(enc, ptr, end);
1044 if (c == open)
1045 break;
1046 if (!(ASCII_a <= c && c <= ASCII_z)
1047 && !(ASCII_A <= c && c <= ASCII_Z)
1048 && !(ASCII_0 <= c && c <= ASCII_9)
1049 && c != ASCII_PERIOD
1050 && c != ASCII_MINUS
1051 && c != ASCII_UNDERSCORE) {
1052 *nextTokPtr = ptr;
1053 return 0;
1054 }
1055 }
1056 *nextTokPtr = ptr + enc->minBytesPerChar;
1057 return 1;
1058}
1059
1060static const char KW_version[] = {
1062};
1063
1064static const char KW_encoding[] = {
1066};
1067
1068static const char KW_standalone[] = {
1070 ASCII_n, ASCII_e, '\0'
1071};
1072
1073static const char KW_yes[] = {
1074 ASCII_y, ASCII_e, ASCII_s, '\0'
1075};
1076
1077static const char KW_no[] = {
1078 ASCII_n, ASCII_o, '\0'
1079};
1080
/* Parse the pseudo-attributes of an XML or text declaration lying in
   [ptr, end).

   Five characters are skipped at the start and two are trimmed from the
   end before parsing (presumably the "<?xml" and "?>" delimiters --
   confirm against the caller).  The accepted order is
   version, encoding, standalone:
     - version is required unless isGeneralTextEntity is nonzero;
     - when isGeneralTextEntity is nonzero, an encoding declaration is
       required after version and standalone is not allowed;
     - standalone must be "yes" or "no".

   Outputs (each optional pointer may be NULL and is then skipped):
     *versionPtr / *versionEndPtr  start of the version value / position
                                   just past its closing quote
     *encodingName                 start of the encoding value
     *encoding                     result of encodingFinder() on that value
     *standalone                   1 for "yes", 0 for "no"
     *badPtr                       position of the error when 0 is returned

   Returns 1 on success, 0 on a malformed declaration.
*/
1081static int
1082doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1083 const char *,
1084 const char *),
1085 int isGeneralTextEntity,
1086 const ENCODING *enc,
1087 const char *ptr,
1088 const char *end,
1089 const char **badPtr,
1090 const char **versionPtr,
1091 const char **versionEndPtr,
1092 const char **encodingName,
1093 const ENCODING **encoding,
1094 int *standalone)
1095{
1096 const char *val = NULL;
1097 const char *name = NULL;
1098 const char *nameEnd = NULL;
 /* Narrow [ptr, end) to the region between the declaration delimiters. */
1099 ptr += 5 * enc->minBytesPerChar;
1100 end -= 2 * enc->minBytesPerChar;
 /* At least one pseudo-attribute must be present. */
1101 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1102 || !name) {
1103 *badPtr = ptr;
1104 return 0;
1105 }
1106 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
 /* First attribute is not "version": only tolerated for a text
    declaration; it is then examined as "encoding" below. */
1107 if (!isGeneralTextEntity) {
1108 *badPtr = name;
1109 return 0;
1110 }
1111 }
1112 else {
1113 if (versionPtr)
1114 *versionPtr = val;
1115 if (versionEndPtr)
1116 *versionEndPtr = ptr;
1117 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1118 *badPtr = ptr;
1119 return 0;
1120 }
1121 if (!name) {
1122 if (isGeneralTextEntity) {
1123 /* a TextDecl must have an EncodingDecl */
1124 *badPtr = ptr;
1125 return 0;
1126 }
1127 return 1;
1128 }
1129 }
1130 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
 /* The encoding name must start with an ASCII letter. */
1131 int c = toAscii(enc, val, end);
1132 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1133 *badPtr = val;
1134 return 0;
1135 }
1136 if (encodingName)
1137 *encodingName = val;
 /* ptr is just past the closing quote, so stepping back one character
    gives the end of the value. */
1138 if (encoding)
1139 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1140 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1141 *badPtr = ptr;
1142 return 0;
1143 }
1144 if (!name)
1145 return 1;
1146 }
 /* Whatever remains must be "standalone", and never in a text declaration. */
1147 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1148 || isGeneralTextEntity) {
1149 *badPtr = name;
1150 return 0;
1151 }
1152 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1153 if (standalone)
1154 *standalone = 1;
1155 }
1156 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1157 if (standalone)
1158 *standalone = 0;
1159 }
1160 else {
1161 *badPtr = val;
1162 return 0;
1163 }
 /* Only trailing white space may follow the standalone attribute. */
1164 while (isSpace(toAscii(enc, ptr, end)))
1165 ptr += enc->minBytesPerChar;
1166 if (ptr != end) {
1167 *badPtr = ptr;
1168 return 0;
1169 }
1170 return 1;
1171}
1172
1173static int FASTCALL
1174checkCharRefNumber(int result)
1175{
1176 switch (result >> 8) {
1177 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1178 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1179 return -1;
1180 case 0:
1181 if (latin1_encoding.type[result] == BT_NONXML)
1182 return -1;
1183 break;
1184 case 0xFF:
1185 if (result == 0xFFFE || result == 0xFFFF)
1186 return -1;
1187 break;
1188 }
1189 return result;
1190}
1191
1192int FASTCALL
1193XmlUtf8Encode(int c, char *buf)
1194{
1195 enum {
1196 /* minN is minimum legal resulting value for N byte sequence */
1197 min2 = 0x80,
1198 min3 = 0x800,
1199 min4 = 0x10000
1200 };
1201
1202 if (c < 0)
1203 return 0;
1204 if (c < min2) {
1205 buf[0] = (char)(c | UTF8_cval1);
1206 return 1;
1207 }
1208 if (c < min3) {
1209 buf[0] = (char)((c >> 6) | UTF8_cval2);
1210 buf[1] = (char)((c & 0x3f) | 0x80);
1211 return 2;
1212 }
1213 if (c < min4) {
1214 buf[0] = (char)((c >> 12) | UTF8_cval3);
1215 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1216 buf[2] = (char)((c & 0x3f) | 0x80);
1217 return 3;
1218 }
1219 if (c < 0x110000) {
1220 buf[0] = (char)((c >> 18) | UTF8_cval4);
1221 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1222 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1223 buf[3] = (char)((c & 0x3f) | 0x80);
1224 return 4;
1225 }
1226 return 0;
1227}
1228
1229int FASTCALL
1230XmlUtf16Encode(int charNum, unsigned short *buf)
1231{
1232 if (charNum < 0)
1233 return 0;
1234 if (charNum < 0x10000) {
1235 buf[0] = (unsigned short)charNum;
1236 return 1;
1237 }
1238 if (charNum < 0x110000) {
1239 charNum -= 0x10000;
1240 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1241 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1242 return 2;
1243 }
1244 return 0;
1245}
1246
1251 unsigned short utf16[256];
1252 char utf8[256][4];
1253};
1254
1255#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
1256
1257int
1259{
1260 return sizeof(struct unknown_encoding);
1261}
1262
1263static int PTRFASTCALL
1264unknown_isName(const ENCODING *enc, const char *p)
1265{
1266 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1267 int c = uenc->convert(uenc->userData, p);
1268 if (c & ~0xFFFF)
1269 return 0;
1270 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1271}
1272
1273static int PTRFASTCALL
1274unknown_isNmstrt(const ENCODING *enc, const char *p)
1275{
1276 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1277 int c = uenc->convert(uenc->userData, p);
1278 if (c & ~0xFFFF)
1279 return 0;
1280 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1281}
1282
1283static int PTRFASTCALL
1284unknown_isInvalid(const ENCODING *enc, const char *p)
1285{
1286 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1287 int c = uenc->convert(uenc->userData, p);
1288 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1289}
1290
1291static void PTRCALL
1292unknown_toUtf8(const ENCODING *enc,
1293 const char **fromP, const char *fromLim,
1294 char **toP, const char *toLim)
1295{
1296 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1297 char buf[XML_UTF8_ENCODE_MAX];
1298 for (;;) {
1299 const char *utf8;
1300 int n;
1301 if (*fromP == fromLim)
1302 break;
1303 utf8 = uenc->utf8[(unsigned char)**fromP];
1304 n = *utf8++;
1305 if (n == 0) {
1306 int c = uenc->convert(uenc->userData, *fromP);
1307 n = XmlUtf8Encode(c, buf);
1308 if (n > toLim - *toP)
1309 break;
1310 utf8 = buf;
1311 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1312 - (BT_LEAD2 - 2));
1313 }
1314 else {
1315 if (n > toLim - *toP)
1316 break;
1317 (*fromP)++;
1318 }
1319 do {
1320 *(*toP)++ = *utf8++;
1321 } while (--n != 0);
1322 }
1323}
1324
1325static void PTRCALL
1326unknown_toUtf16(const ENCODING *enc,
1327 const char **fromP, const char *fromLim,
1328 unsigned short **toP, const unsigned short *toLim)
1329{
1330 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1331 while (*fromP != fromLim && *toP != toLim) {
1332 unsigned short c = uenc->utf16[(unsigned char)**fromP];
1333 if (c == 0) {
1334 c = (unsigned short)
1335 uenc->convert(uenc->userData, *fromP);
1336 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1337 - (BT_LEAD2 - 2));
1338 }
1339 else
1340 (*fromP)++;
1341 *(*toP)++ = c;
1342 }
1343}
1344
1345ENCODING *
1347 int *table,
1349 void *userData)
1350{
1351 int i;
1352 struct unknown_encoding *e = (struct unknown_encoding *)mem;
1353 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1354 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1355 for (i = 0; i < 128; i++)
1356 if (latin1_encoding.type[i] != BT_OTHER
1357 && latin1_encoding.type[i] != BT_NONXML
1358 && table[i] != i)
1359 return 0;
1360 for (i = 0; i < 256; i++) {
1361 int c = table[i];
1362 if (c == -1) {
1363 e->normal.type[i] = BT_MALFORM;
1364 /* This shouldn't really get used. */
1365 e->utf16[i] = 0xFFFF;
1366 e->utf8[i][0] = 1;
1367 e->utf8[i][1] = 0;
1368 }
1369 else if (c < 0) {
1370 if (c < -4)
1371 return 0;
1372 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1373 e->utf8[i][0] = 0;
1374 e->utf16[i] = 0;
1375 }
1376 else if (c < 0x80) {
1377 if (latin1_encoding.type[c] != BT_OTHER
1378 && latin1_encoding.type[c] != BT_NONXML
1379 && c != i)
1380 return 0;
1381 e->normal.type[i] = latin1_encoding.type[c];
1382 e->utf8[i][0] = 1;
1383 e->utf8[i][1] = (char)c;
1384 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1385 }
1386 else if (checkCharRefNumber(c) < 0) {
1387 e->normal.type[i] = BT_NONXML;
1388 /* This shouldn't really get used. */
1389 e->utf16[i] = 0xFFFF;
1390 e->utf8[i][0] = 1;
1391 e->utf8[i][1] = 0;
1392 }
1393 else {
1394 if (c > 0xFFFF)
1395 return 0;
1396 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1397 e->normal.type[i] = BT_NMSTRT;
1398 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1399 e->normal.type[i] = BT_NAME;
1400 else
1401 e->normal.type[i] = BT_OTHER;
1402 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1403 e->utf16[i] = (unsigned short)c;
1404 }
1405 }
1406 e->userData = userData;
1407 e->convert = convert;
1408 if (convert) {
1409 e->normal.isName2 = unknown_isName;
1410 e->normal.isName3 = unknown_isName;
1411 e->normal.isName4 = unknown_isName;
1412 e->normal.isNmstrt2 = unknown_isNmstrt;
1413 e->normal.isNmstrt3 = unknown_isNmstrt;
1414 e->normal.isNmstrt4 = unknown_isNmstrt;
1415 e->normal.isInvalid2 = unknown_isInvalid;
1416 e->normal.isInvalid3 = unknown_isInvalid;
1417 e->normal.isInvalid4 = unknown_isInvalid;
1418 }
1419 e->normal.enc.utf8Convert = unknown_toUtf8;
1420 e->normal.enc.utf16Convert = unknown_toUtf16;
1421 return &(e->normal.enc);
1422}
1423
1424/* If this enumeration is changed, getEncodingIndex and encodings
1425must also be changed. */
1426enum {
1434 /* must match encodingNames up to here */
1435 NO_ENC
1437
1438static const char KW_ISO_8859_1[] = {
1440 ASCII_MINUS, ASCII_1, '\0'
1441};
1442static const char KW_US_ASCII[] = {
1444 '\0'
1445};
1446static const char KW_UTF_8[] = {
1448};
1449static const char KW_UTF_16[] = {
1451};
1452static const char KW_UTF_16BE[] = {
1454 '\0'
1455};
1456static const char KW_UTF_16LE[] = {
1458 '\0'
1459};
1460
1461static int FASTCALL
1462getEncodingIndex(const char *name)
1463{
1464 static const char * const encodingNames[] = {
1465 KW_ISO_8859_1,
1466 KW_US_ASCII,
1467 KW_UTF_8,
1468 KW_UTF_16,
1469 KW_UTF_16BE,
1470 KW_UTF_16LE,
1471 };
1472 int i;
1473 if (name == NULL)
1474 return NO_ENC;
1475 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1476 if (streqci(name, encodingNames[i]))
1477 return i;
1478 return UNKNOWN_ENC;
1479}
1480
1481/* For binary compatibility, we store the index of the encoding
1482 specified at initialization in the isUtf16 member.
1483*/
1484
/* Read back, as an int, the encoding index kept in the isUtf16 member
   of an INIT_ENCODING. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* Store encoding index i, narrowed to char, in the isUtf16 member.
   The argument is parenthesized so that an expression argument is
   converted as a whole rather than having the cast bind to its first
   operand. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1487
1488/* This is what detects the encoding. encodingTable maps from
1489 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1490 the external (protocol) specified encoding; state is
1491 XML_CONTENT_STATE if we're parsing an external text entity, and
1492 XML_PROLOG_STATE otherwise.
1493*/
1494
1495
1496static int
1497initScan(const ENCODING * const *encodingTable,
1498 const INIT_ENCODING *enc,
1499 int state,
1500 const char *ptr,
1501 const char *end,
1502 const char **nextTokPtr)
1503{
1504 const ENCODING **encPtr;
1505
1506 if (ptr == end)
1507 return XML_TOK_NONE;
1508 encPtr = enc->encPtr;
1509 if (ptr + 1 == end) {
1510 /* only a single byte available for auto-detection */
1511#ifndef XML_DTD /* FIXME */
1512 /* a well-formed document entity must have more than one byte */
1513 if (state != XML_CONTENT_STATE)
1514 return XML_TOK_PARTIAL;
1515#endif
1516 /* so we're parsing an external text entity... */
1517 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1518 switch (INIT_ENC_INDEX(enc)) {
1519 case UTF_16_ENC:
1520 case UTF_16LE_ENC:
1521 case UTF_16BE_ENC:
1522 return XML_TOK_PARTIAL;
1523 }
1524 switch ((unsigned char)*ptr) {
1525 case 0xFE:
1526 case 0xFF:
1527 case 0xEF: /* possibly first byte of UTF-8 BOM */
1528 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1529 && state == XML_CONTENT_STATE)
1530 break;
1531 /* fall through */
1532 case 0x00:
1533 case 0x3C:
1534 return XML_TOK_PARTIAL;
1535 }
1536 }
1537 else {
1538 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1539 case 0xFEFF:
1540 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1541 && state == XML_CONTENT_STATE)
1542 break;
1543 *nextTokPtr = ptr + 2;
1544 *encPtr = encodingTable[UTF_16BE_ENC];
1545 return XML_TOK_BOM;
1546 /* 00 3C is handled in the default case */
1547 case 0x3C00:
1548 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1549 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1550 && state == XML_CONTENT_STATE)
1551 break;
1552 *encPtr = encodingTable[UTF_16LE_ENC];
1553 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1554 case 0xFFFE:
1555 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1556 && state == XML_CONTENT_STATE)
1557 break;
1558 *nextTokPtr = ptr + 2;
1559 *encPtr = encodingTable[UTF_16LE_ENC];
1560 return XML_TOK_BOM;
1561 case 0xEFBB:
1562 /* Maybe a UTF-8 BOM (EF BB BF) */
1563 /* If there's an explicitly specified (external) encoding
1564 of ISO-8859-1 or some flavour of UTF-16
1565 and this is an external text entity,
1566 don't look for the BOM,
1567 because it might be a legal data.
1568 */
1569 if (state == XML_CONTENT_STATE) {
1570 int e = INIT_ENC_INDEX(enc);
1571 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1572 || e == UTF_16LE_ENC || e == UTF_16_ENC)
1573 break;
1574 }
1575 if (ptr + 2 == end)
1576 return XML_TOK_PARTIAL;
1577 if ((unsigned char)ptr[2] == 0xBF) {
1578 *nextTokPtr = ptr + 3;
1579 *encPtr = encodingTable[UTF_8_ENC];
1580 return XML_TOK_BOM;
1581 }
1582 break;
1583 default:
1584 if (ptr[0] == '\0') {
1585 /* 0 isn't a legal data character. Furthermore a document
1586 entity can only start with ASCII characters. So the only
1587 way this can fail to be big-endian UTF-16 if it it's an
1588 external parsed general entity that's labelled as
1589 UTF-16LE.
1590 */
1591 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1592 break;
1593 *encPtr = encodingTable[UTF_16BE_ENC];
1594 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1595 }
1596 else if (ptr[1] == '\0') {
1597 /* We could recover here in the case:
1598 - parsing an external entity
1599 - second byte is 0
1600 - no externally specified encoding
1601 - no encoding declaration
1602 by assuming UTF-16LE. But we don't, because this would mean when
1603 presented just with a single byte, we couldn't reliably determine
1604 whether we needed further bytes.
1605 */
1606 if (state == XML_CONTENT_STATE)
1607 break;
1608 *encPtr = encodingTable[UTF_16LE_ENC];
1609 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1610 }
1611 break;
1612 }
1613 }
1614 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1615 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1616}
1617
1618
1619#define NS(x) x
1620#define ns(x) x
1621#define XML_TOK_NS_C
1622#include "xmltok_ns.c"
1623#undef XML_TOK_NS_C
1624#undef NS
1625#undef ns
1626
1627#ifdef XML_NS
1628
1629#define NS(x) x ## NS
1630#define ns(x) x ## _ns
1631
1632#define XML_TOK_NS_C
1633#include "xmltok_ns.c"
1634#undef XML_TOK_NS_C
1635
1636#undef NS
1637#undef ns
1638
1639ENCODING *
1640XmlInitUnknownEncodingNS(void *mem,
1641 int *table,
1643 void *userData)
1644{
1645 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1646 if (enc)
1647 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1648 return enc;
1649}
1650
1651#endif /* XML_NS */
#define ASCII_l
Definition: ascii.h:43
#define ASCII_i
Definition: ascii.h:40
#define ASCII_F
Definition: ascii.h:10
#define ASCII_o
Definition: ascii.h:46
#define ASCII_E
Definition: ascii.h:9
#define ASCII_C
Definition: ascii.h:7
#define ASCII_O
Definition: ascii.h:19
#define ASCII_Z
Definition: ascii.h:30
#define ASCII_n
Definition: ascii.h:45
#define ASCII_s
Definition: ascii.h:50
#define ASCII_UNDERSCORE
Definition: ascii.h:85
#define ASCII_t
Definition: ascii.h:51
#define ASCII_APOS
Definition: ascii.h:75
#define ASCII_c
Definition: ascii.h:34
#define ASCII_PERIOD
Definition: ascii.h:77
#define ASCII_5
Definition: ascii.h:64
#define ASCII_I
Definition: ascii.h:13
#define ASCII_A
Definition: ascii.h:5
#define ASCII_z
Definition: ascii.h:57
#define ASCII_U
Definition: ascii.h:25
#define ASCII_9
Definition: ascii.h:68
#define ASCII_e
Definition: ascii.h:36
#define ASCII_d
Definition: ascii.h:35
#define ASCII_8
Definition: ascii.h:67
#define ASCII_r
Definition: ascii.h:49
#define ASCII_y
Definition: ascii.h:56
#define ASCII_COLON
Definition: ascii.h:78
#define ASCII_0
Definition: ascii.h:59
#define ASCII_QUOT
Definition: ascii.h:73
#define ASCII_L
Definition: ascii.h:16
#define ASCII_1
Definition: ascii.h:60
#define ASCII_a
Definition: ascii.h:32
#define ASCII_6
Definition: ascii.h:65
#define ASCII_B
Definition: ascii.h:6
#define ASCII_S
Definition: ascii.h:23
#define ASCII_g
Definition: ascii.h:38
#define ASCII_EQUALS
Definition: ascii.h:81
#define ASCII_T
Definition: ascii.h:24
#define ASCII_MINUS
Definition: ascii.h:76
#define ASCII_v
Definition: ascii.h:53
@ BT_NONASCII
@ BT_MALFORM
@ BT_LEAD4
@ BT_LEAD3
@ BT_OTHER
@ BT_TRAIL
@ BT_NAME
@ BT_LEAD2
@ BT_NMSTRT
@ BT_NONXML
#define PTRFASTCALL
Definition: internal.h:56
#define FASTCALL
Definition: internal.h:48
#define PTRCALL
Definition: internal.h:52
const ENCODING ** encPtr
Definition: xmltok.h:261
int minBytesPerChar
Definition: xmltok.h:169
const char const char const char * int(PTRFASTCALL *isNmstrt2)(const ENCODING *
const char const char const char const char * int(PTRFASTCALL *isNmstrt3)(const ENCODING *
int(PTRFASTCALL *isName2)(const ENCODING *
const char * int(PTRFASTCALL *isName3)(const ENCODING *
unsigned char type[256]
Definition: xmltok.cc:178
const char const char const char const char const char const char const char * int(PTRFASTCALL *isInvalid3)(const ENCODING *
const char const char const char const char const char * int(PTRFASTCALL *isNmstrt4)(const ENCODING *
const char const char const char const char const char const char const char const char * int(PTRFASTCALL *isInvalid4)(const ENCODING *
const char const char const char const char const char const char * int(PTRFASTCALL *isInvalid2)(const ENCODING *
const char const char * int(PTRFASTCALL *isName4)(const ENCODING *
ENCODING enc
Definition: xmltok.cc:177
unsigned short utf16[256]
Definition: xmltok.cc:1251
CONVERTER convert
Definition: xmltok.cc:1249
void * userData
Definition: xmltok.cc:1250
char utf8[256][4]
Definition: xmltok.cc:1252
struct normal_encoding normal
Definition: xmltok.cc:1248
#define userData
Definition: xmlparse.cc:555
#define XmlInitUnknownEncodingNS
Definition: xmlparse.cc:49
#define BIG2_BYTE_TO_ASCII(enc, p)
Definition: xmltok.cc:784
#define STANDARD_VTABLE(E)
Definition: xmltok.cc:210
#define VTABLE1
Definition: xmltok.cc:32
#define UTF8_GET_NAMING3(pages, byte)
Definition: xmltok.cc:66
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p)
Definition: xmltok.cc:786
#define DEFINE_UTF16_TO_UTF8(E)
Definition: xmltok.cc:538
#define BIG2_BYTE_TYPE(enc, p)
Definition: xmltok.cc:780
#define NORMAL_VTABLE(E)
Definition: xmltok.cc:214
#define INIT_ENC_INDEX(enc)
Definition: xmltok.cc:1485
ENCODING * XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert, void *userData)
Definition: xmltok.cc:1346
int XmlSizeOfUnknownEncoding(void)
Definition: xmltok.cc:1258
#define UTF8_INVALID2(p)
Definition: xmltok.cc:91
#define SB_BYTE_TYPE(enc, p)
Definition: xmltok.cc:242
#define VTABLE
Definition: xmltok.cc:46
#define LITTLE2_CHAR_MATCHES(enc, p, c)
Definition: xmltok.cc:644
@ NO_ENC
Definition: xmltok.cc:1435
@ US_ASCII_ENC
Definition: xmltok.cc:1429
@ ISO_8859_1_ENC
Definition: xmltok.cc:1428
@ UTF_8_ENC
Definition: xmltok.cc:1430
@ UTF_16_ENC
Definition: xmltok.cc:1431
@ UNKNOWN_ENC
Definition: xmltok.cc:1427
@ UTF_16BE_ENC
Definition: xmltok.cc:1432
@ UTF_16LE_ENC
Definition: xmltok.cc:1433
@ UTF8_cval4
Definition: xmltok.cc:318
@ UTF8_cval1
Definition: xmltok.cc:315
@ UTF8_cval2
Definition: xmltok.cc:316
@ UTF8_cval3
Definition: xmltok.cc:317
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
Definition: xmltok.cc:788
#define LITTLE2_BYTE_TYPE(enc, p)
Definition: xmltok.cc:639
#define BT_COLON
#define UTF8_INVALID4(p)
Definition: xmltok.cc:111
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
Definition: xmltok.cc:645
#define AS_UNKNOWN_ENCODING(enc)
Definition: xmltok.cc:1255
#define UCS2_GET_NAMING(pages, hi, lo)
Definition: xmltok.cc:48
#define UTF8_GET_NAMING2(pages, byte)
Definition: xmltok.cc:55
int FASTCALL XmlUtf16Encode(int charNum, unsigned short *buf)
Definition: xmltok.cc:1230
#define LITTLE2_BYTE_TO_ASCII(enc, p)
Definition: xmltok.cc:643
#define UTF8_INVALID3(p)
Definition: xmltok.cc:94
#define DEFINE_UTF16_TO_UTF16(E)
Definition: xmltok.cc:601
#define BIG2_CHAR_MATCHES(enc, p, c)
Definition: xmltok.cc:785
#define AS_NORMAL_ENCODING(enc)
Definition: xmltok.cc:197
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
Definition: xmltok.cc:647
int FASTCALL XmlUtf8Encode(int c, char *buf)
Definition: xmltok.cc:1193
#define XML_CONTENT_STATE
Definition: xmltok.h:95
#define XmlUtf8Convert(enc, fromP, fromLim, toP, toLim)
Definition: xmltok.h:253
#define XML_UTF8_ENCODE_MAX
Definition: xmltok.h:106
#define XML_TOK_PARTIAL
Definition: xmltok.h:20
#define XmlNameMatchesAscii(enc, ptr1, end1, ptr2)
Definition: xmltok.h:229
int(XMLCALL * CONVERTER)(void *userData, const char *p)
Definition: xmltok.h:283
#define XML_TOK_NONE
Definition: xmltok.h:17
#define XmlTok(enc, state, ptr, end, nextTokPtr)
Definition: xmltok.h:196
#define XML_TOK_BOM
Definition: xmltok.h:43