BOSS 7.0.7
BESIII Offline Software System
Loading...
Searching...
No Matches
xmltok.c
Go to the documentation of this file.
1/*
2The contents of this file are subject to the Mozilla Public License
3Version 1.1 (the "License"); you may not use this file except in
4compliance with the License. You may obtain a copy of the License at
5http://www.mozilla.org/MPL/
6
7Software distributed under the License is distributed on an "AS IS"
8basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
9License for the specific language governing rights and limitations
10under the License.
11
12The Original Code is expat.
13
14The Initial Developer of the Original Code is James Clark.
15Portions created by James Clark are Copyright (C) 1998, 1999
16James Clark. All Rights Reserved.
17
18Contributor(s):
19
20Alternatively, the contents of this file may be used under the terms
21of the GNU General Public License (the "GPL"), in which case the
22provisions of the GPL are applicable instead of those above. If you
23wish to allow use of your version of this file only under the terms of
24the GPL and not to allow others to use your version of this file under
25the MPL, indicate your decision by deleting the provisions above and
26replace them with the notice and other provisions required by the
27GPL. If you do not delete the provisions above, a recipient may use
28your version of this file under either the MPL or the GPL.
29*/
30
31#ifdef HAVE_CONFIG_H
32#include "config.h"
33#endif
34
35#include "xmldef.h"
36#include "xmltok.h"
37#include "nametab.h"
38
/* Initializer list for the tokenizer entry points of one encoding.  Every
   entry is named through PREFIX(), so the list resolves to whichever
   implementation prefix is defined at the point of use.
   NOTE(review): the order presumably has to mirror the member order of
   ENCODING in xmltok.h - confirm before reordering. */
39#define VTABLE1 \
40 { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
41 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
42 PREFIX(sameName), \
43 PREFIX(nameMatchesAscii), \
44 PREFIX(nameLength), \
45 PREFIX(skipS), \
46 PREFIX(getAtts), \
47 PREFIX(charRefNumber), \
48 PREFIX(predefinedEntityName), \
49 PREFIX(updatePosition), \
50 PREFIX(isPublicId)
51
/* VTABLE1 followed by the PREFIX()-named UTF-8 and UTF-16 converters. */
52#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
53
/* Test the naming bit of the UCS-2 character with high byte `hi` and low
   byte `lo`.  pages[hi] selects a group of eight namingBitmap entries, the
   top three bits of `lo` select the entry and the bottom five bits select
   the bit inside it.  Evaluates to non-zero when the bit is set.
   The mask is built from an unsigned 1 because bit 31 can be selected and
   shifting a signed 1 into the sign bit is undefined behaviour. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
56
/* A 2 byte UTF-8 representation splits the character's 11 bits
   between the bottom 5 and 6 bits of the bytes.
   We need 8 bits to index into pages, 3 bits to add to that index and
   5 bits to generate the mask.
   `byte` must point at unsigned char data.  The mask uses an unsigned 1 so
   that selecting bit 31 never shifts into the sign bit of an int. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))
66
/* A 3 byte UTF-8 representation splits the character's 16 bits
   between the bottom 4, 6 and 6 bits of the bytes.
   We need 8 bits to index into pages, 3 bits to add to that index and
   5 bits to generate the mask.
   `byte` must point at unsigned char data.  The mask uses an unsigned 1 so
   that selecting bit 31 never shifts into the sign bit of an int. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))
78
/* Naming-bit lookup for an n-byte UTF-8 sequence at p.  Only 2- and 3-byte
   sequences are looked up; every other length evaluates to 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
   : (n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
   : 0)
85
/* Non-zero when the 3-byte UTF-8 sequence at p (unsigned char data) encodes
   a character that is not allowed: lead byte 0xED with bit 0x20 set in the
   second byte is a UTF-16 surrogate (U+D800..U+DFFF), and EF BF BE / EF BF BF
   are U+FFFE / U+FFFF.  The argument is parenthesised everywhere so that
   expressions such as `buf + 1` are dereferenced as a whole. */
#define UTF8_INVALID3(p) \
  (*(p) == 0xED \
   ? (((p)[1] & 0x20) != 0) \
   : (*(p) == 0xEF \
      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
      : 0))
92
/* Non-zero when the 4-byte UTF-8 sequence at p (unsigned char data) starts
   with 0xF4 and has bits 0x30 set in its second byte, i.e. encodes a value
   above U+10FFFF.  The argument is parenthesised so that expressions such as
   `buf + 1` are dereferenced as a whole. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
94
95static
96int isNever(const ENCODING *enc, const char *p)
97{
98 return 0;
99}
100
101static
102int utf8_isName2(const ENCODING *enc, const char *p)
103{
104 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
105}
106
107static
108int utf8_isName3(const ENCODING *enc, const char *p)
109{
110 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
111}
112
113#define utf8_isName4 isNever
114
115static
116int utf8_isNmstrt2(const ENCODING *enc, const char *p)
117{
118 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
119}
120
121static
122int utf8_isNmstrt3(const ENCODING *enc, const char *p)
123{
124 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
125}
126
127#define utf8_isNmstrt4 isNever
128
129#define utf8_isInvalid2 isNever
130
131static
132int utf8_isInvalid3(const ENCODING *enc, const char *p)
133{
134 return UTF8_INVALID3((const unsigned char *)p);
135}
136
137static
138int utf8_isInvalid4(const ENCODING *enc, const char *p)
139{
140 return UTF8_INVALID4((const unsigned char *)p);
141}
142
144 ENCODING enc;
145 unsigned char type[256];
146#ifdef XML_MIN_SIZE
147 int (*byteType)(const ENCODING *, const char *);
148 int (*isNameMin)(const ENCODING *, const char *);
149 int (*isNmstrtMin)(const ENCODING *, const char *);
150 int (*byteToAscii)(const ENCODING *, const char *);
151 int (*charMatches)(const ENCODING *, const char *, int);
152#endif /* XML_MIN_SIZE */
153 int (*isName2)(const ENCODING *, const char *);
154 int (*isName3)(const ENCODING *, const char *);
155 int (*isName4)(const ENCODING *, const char *);
156 int (*isNmstrt2)(const ENCODING *, const char *);
157 int (*isNmstrt3)(const ENCODING *, const char *);
158 int (*isNmstrt4)(const ENCODING *, const char *);
159 int (*isInvalid2)(const ENCODING *, const char *);
160 int (*isInvalid3)(const ENCODING *, const char *);
161 int (*isInvalid4)(const ENCODING *, const char *);
162};
163
/* Initializers for the XML_MIN_SIZE-only function pointers of
   struct normal_encoding, named by pasting the prefix E onto each member
   name.  The expansion ends with a comma so that NORMAL_VTABLE can follow
   it directly; without XML_MIN_SIZE it expands to nothing. */
164#ifdef XML_MIN_SIZE
165
166#define STANDARD_VTABLE(E) \
167 E ## byteType, \
168 E ## isNameMin, \
169 E ## isNmstrtMin, \
170 E ## byteToAscii, \
171 E ## charMatches,
172
173#else
174
175#define STANDARD_VTABLE(E) /* as nothing */
176
177#endif
178
/* Initializers for the nine multi-byte predicates of
   struct normal_encoding (isName2..isInvalid4), named by pasting the
   prefix E onto each member name. */
179#define NORMAL_VTABLE(E) \
180 E ## isName2, \
181 E ## isName3, \
182 E ## isName4, \
183 E ## isNmstrt2, \
184 E ## isNmstrt3, \
185 E ## isNmstrt4, \
186 E ## isInvalid2, \
187 E ## isInvalid3, \
188 E ## isInvalid4
189
190static int checkCharRefNumber(int);
191
192#include "xmltok_impl.h"
193
/* Under XML_MIN_SIZE the single-byte ("sb_") encodings use isNever for the
   minimum-width name and name-start predicates. */
194#ifdef XML_MIN_SIZE
195#define sb_isNameMin isNever
196#define sb_isNmstrtMin isNever
197#endif
198
/* MINBPC(enc): with XML_MIN_SIZE the value is read from the encoding at run
   time; otherwise it is the constant 1 for the code included below. */
199#ifdef XML_MIN_SIZE
200#define MINBPC(enc) ((enc)->minBytesPerChar)
201#else
202/* minimum bytes per character */
203#define MINBPC(enc) 1
204#endif
205
/* Byte type of the single byte at p, read from the `type` table of the
   struct normal_encoding that `enc` points into.  The byte is converted to
   unsigned char before indexing so that values >= 0x80 do not produce a
   negative index.  The cast keeps the const qualifier, matching the other
   accessor macros in this file. */
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
208
/* BYTE_TYPE(enc, p): with XML_MIN_SIZE the lookup goes through the
   encoding's byteType function pointer, and sb_byteType is the
   implementation for single-byte encodings; otherwise it is the direct
   table lookup SB_BYTE_TYPE. */
209#ifdef XML_MIN_SIZE
210static
211int sb_byteType(const ENCODING *enc, const char *p)
212{
213 return SB_BYTE_TYPE(enc, p);
214}
215#define BYTE_TYPE(enc, p) \
216 (((const struct normal_encoding *)(enc))->byteType(enc, p))
217#else
218#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
219#endif
220
/* BYTE_TO_ASCII(enc, p): value of the character at p.  With XML_MIN_SIZE it
   is dispatched through the encoding's byteToAscii pointer (sb_byteToAscii
   for single-byte encodings); otherwise it is the byte itself.  The pointer
   argument is parenthesised so that `ptr + n` is dereferenced as a whole. */
#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
  (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
static
int sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
232
/* Multi-byte character predicates: n is pasted onto the member name, so
   IS_NAME_CHAR(enc, p, 2) calls the encoding's isName2 pointer, and so on
   for isNmstrt<n> and isInvalid<n>. */
233#define IS_NAME_CHAR(enc, p, n) \
234 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
235#define IS_NMSTRT_CHAR(enc, p, n) \
236 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
237#define IS_INVALID_CHAR(enc, p, n) \
238 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
239
/* Predicates for a minimum-width character: dispatched through the
   encoding under XML_MIN_SIZE, constant 0 otherwise. */
240#ifdef XML_MIN_SIZE
241#define IS_NAME_CHAR_MINBPC(enc, p) \
242 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
243#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
244 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
245#else
246#define IS_NAME_CHAR_MINBPC(enc, p) (0)
247#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
248#endif
249
/* CHAR_MATCHES(enc, p, c): non-zero when the character at p equals the ASCII
   character c.  With XML_MIN_SIZE it is dispatched through the encoding's
   charMatches pointer (sb_charMatches for single-byte encodings); otherwise
   it is a direct byte comparison.  Both p and c are parenthesised so that
   compound expressions are compared as a whole. */
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
  (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
static
int sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
262
/* Include xmltok_impl.c with PREFIX set to normal_, so the names it defines
   through PREFIX() become normal_*, using the macros defined above.  The
   per-encoding macros are then undefined so they can be redefined for the
   next inclusion.  PREFIX itself is left defined. */
263#define PREFIX(ident) normal_ ## ident
264#include "xmltok_impl.c"
265
266#undef MINBPC
267#undef BYTE_TYPE
268#undef BYTE_TO_ASCII
269#undef CHAR_MATCHES
270#undef IS_NAME_CHAR
271#undef IS_NAME_CHAR_MINBPC
272#undef IS_NMSTRT_CHAR
273#undef IS_NMSTRT_CHAR_MINBPC
274#undef IS_INVALID_CHAR
275
/* Lead-byte marker bits of a UTF-8 sequence, by sequence length; they are
   OR-ed into the payload bits when encoding. */
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
282
283static
284void utf8_toUtf8(const ENCODING *enc,
285 const char **fromP, const char *fromLim,
286 char **toP, const char *toLim)
287{
288 char *to;
289 const char *from;
290 if (fromLim - *fromP > toLim - *toP) {
291 /* Avoid copying partial characters. */
292 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
293 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
294 break;
295 }
296 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
297 *to = *from;
298 *fromP = from;
299 *toP = to;
300}
301
302static
303void utf8_toUtf16(const ENCODING *enc,
304 const char **fromP, const char *fromLim,
305 unsigned short **toP, const unsigned short *toLim)
306{
307 unsigned short *to = *toP;
308 const char *from = *fromP;
309 while (from != fromLim && to != toLim) {
310 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
311 case BT_LEAD2:
312 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
313 from += 2;
314 break;
315 case BT_LEAD3:
316 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
317 from += 3;
318 break;
319 case BT_LEAD4:
320 {
321 unsigned long n;
322 if (to + 1 == toLim)
323 break;
324 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
325 n -= 0x10000;
326 to[0] = (unsigned short)((n >> 10) | 0xD800);
327 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
328 to += 2;
329 from += 4;
330 }
331 break;
332 default:
333 *to++ = *from++;
334 break;
335 }
336 }
337 *fromP = from;
338 *toP = to;
339}
340
#ifdef XML_NS
/* UTF-8 encoding, namespace-aware variant: the byte-type table is
   asciitab.h followed by utf8tab.h.  The trailing initializers fill the
   function-pointer members of struct normal_encoding; without them the
   multi-byte predicates called through IS_NAME_CHAR and friends would be
   null pointers. */
static const struct normal_encoding utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif
351
352static const struct normal_encoding utf8_encoding = {
353 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
354 {
355#define BT_COLON BT_NMSTRT
356#include "asciitab.h"
357#undef BT_COLON
358#include "utf8tab.h"
359 },
361};
362
#ifdef XML_NS

/* UTF-8 encoding built on iasciitab.h instead of asciitab.h,
   namespace-aware variant.  The trailing initializers fill the
   function-pointer members of struct normal_encoding. */
static const struct normal_encoding internal_utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif
375
376static const struct normal_encoding internal_utf8_encoding = {
377 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
378 {
379#define BT_COLON BT_NMSTRT
380#include "iasciitab.h"
381#undef BT_COLON
382#include "utf8tab.h"
383 },
385};
386
387static
388void latin1_toUtf8(const ENCODING *enc,
389 const char **fromP, const char *fromLim,
390 char **toP, const char *toLim)
391{
392 for (;;) {
393 unsigned char c;
394 if (*fromP == fromLim)
395 break;
396 c = (unsigned char)**fromP;
397 if (c & 0x80) {
398 if (toLim - *toP < 2)
399 break;
400 *(*toP)++ = ((c >> 6) | UTF8_cval2);
401 *(*toP)++ = ((c & 0x3f) | 0x80);
402 (*fromP)++;
403 }
404 else {
405 if (*toP == toLim)
406 break;
407 *(*toP)++ = *(*fromP)++;
408 }
409 }
410}
411
412static
413void latin1_toUtf16(const ENCODING *enc,
414 const char **fromP, const char *fromLim,
415 unsigned short **toP, const unsigned short *toLim)
416{
417 while (*fromP != fromLim && *toP != toLim)
418 *(*toP)++ = (unsigned char)*(*fromP)++;
419}
420
/* Latin-1 encoding, XML_NS variant: normal_ tokenizer functions with the
   latin1 converters; the byte-type table is asciitab.h followed by
   latin1tab.h.
   NOTE(review): the trailing 1, 0, 0 are presumably minBytesPerChar and two
   UTF-8/UTF-16 flags of ENCODING - confirm against xmltok.h. */
421#ifdef XML_NS
422
423static const struct normal_encoding latin1_encoding_ns = {
424 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
425 {
426#include "asciitab.h"
427#include "latin1tab.h"
428 },
429 STANDARD_VTABLE(sb_)
430};
431
432#endif
433
/* Latin-1 encoding: same as the _ns variant except that asciitab.h is
   included with BT_COLON redefined as BT_NMSTRT.  Its type table is also
   read by checkCharRefNumber and copied by the unknown-encoding setup code
   in this file. */
434static const struct normal_encoding latin1_encoding = {
435 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
436 {
437#define BT_COLON BT_NMSTRT
438#include "asciitab.h"
439#undef BT_COLON
440#include "latin1tab.h"
441 },
442 STANDARD_VTABLE(sb_)
443};
444
445static
446void ascii_toUtf8(const ENCODING *enc,
447 const char **fromP, const char *fromLim,
448 char **toP, const char *toLim)
449{
450 while (*fromP != fromLim && *toP != toLim)
451 *(*toP)++ = *(*fromP)++;
452}
453
/* US-ASCII encoding, XML_NS variant.  Only asciitab.h is included, so the
   remaining entries of the 256-entry type table are zero-initialized,
   which the comment below equates with BT_NONXML.  UTF-16 conversion
   reuses latin1_toUtf16. */
454#ifdef XML_NS
455
456static const struct normal_encoding ascii_encoding_ns = {
457 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
458 {
459#include "asciitab.h"
460/* BT_NONXML == 0 */
461 },
462 STANDARD_VTABLE(sb_)
463};
464
465#endif
466
/* US-ASCII encoding: as the _ns variant, but asciitab.h is included with
   BT_COLON redefined as BT_NMSTRT.  Table entries not supplied by
   asciitab.h are zero-initialized (BT_NONXML per the comment below). */
467static const struct normal_encoding ascii_encoding = {
468 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
469 {
470#define BT_COLON BT_NMSTRT
471#include "asciitab.h"
472#undef BT_COLON
473/* BT_NONXML == 0 */
474 },
475 STANDARD_VTABLE(sb_)
476};
477
478static int unicode_byte_type(char hi, char lo)
479{
480 switch ((unsigned char)hi) {
481 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
482 return BT_LEAD4;
483 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
484 return BT_TRAIL;
485 case 0xFF:
486 switch ((unsigned char)lo) {
487 case 0xFF:
488 case 0xFE:
489 return BT_NONXML;
490 }
491 break;
492 }
493 return BT_NONASCII;
494}
495
/* Defines E##toUtf8: converts 2-byte units read with GET_HI/GET_LO from
   [*fromP, fromLim) to UTF-8 in [*toP, toLim).
   - high byte 0 and low byte < 0x80: one output byte;
   - high byte 0..7 otherwise: two output bytes;
   - high byte 0xD8..0xDB: combined with the following unit into four bytes;
   - anything else: three output bytes.
   When the next character does not fit, *fromP is set to the unit that
   could not be converted and the function returns.
   GET_HI and GET_LO must be defined where the macro is instantiated.
   NOTE(review): the surrogate case reads the following unit without
   comparing against fromLim, and the loop steps by 2 using `!=`, so an odd
   length or a trailing lead surrogate appears to read past fromLim -
   confirm that callers guarantee well-formed input. */
496#define DEFINE_UTF16_TO_UTF8(E) \
497static \
498void E ## toUtf8(const ENCODING *enc, \
499 const char **fromP, const char *fromLim, \
500 char **toP, const char *toLim) \
501{ \
502 const char *from; \
503 for (from = *fromP; from != fromLim; from += 2) { \
504 int plane; \
505 unsigned char lo2; \
506 unsigned char lo = GET_LO(from); \
507 unsigned char hi = GET_HI(from); \
508 switch (hi) { \
509 case 0: \
510 if (lo < 0x80) { \
511 if (*toP == toLim) { \
512 *fromP = from; \
513 return; \
514 } \
515 *(*toP)++ = lo; \
516 break; \
517 } \
518 /* fall through */ \
519 case 0x1: case 0x2: case 0x3: \
520 case 0x4: case 0x5: case 0x6: case 0x7: \
521 if (toLim - *toP < 2) { \
522 *fromP = from; \
523 return; \
524 } \
525 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
526 *(*toP)++ = ((lo & 0x3f) | 0x80); \
527 break; \
528 default: \
529 if (toLim - *toP < 3) { \
530 *fromP = from; \
531 return; \
532 } \
533 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
534 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
535 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
536 *(*toP)++ = ((lo & 0x3f) | 0x80); \
537 break; \
538 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
539 if (toLim - *toP < 4) { \
540 *fromP = from; \
541 return; \
542 } \
543 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
544 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
545 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
546 from += 2; \
547 lo2 = GET_LO(from); \
548 *(*toP)++ = (((lo & 0x3) << 4) \
549 | ((GET_HI(from) & 0x3) << 2) \
550 | (lo2 >> 6) \
551 | 0x80); \
552 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
553 break; \
554 } \
555 } \
556 *fromP = from; \
557}
558
/* Defines E##toUtf16: copies 2-byte units from [*fromP, fromLim) into
   unsigned short code units at [*toP, toLim), assembling each unit from
   GET_HI and GET_LO.  When the input is larger than the output and the
   last input unit has a high byte in 0xD8..0xDF, that unit is dropped from
   this call so that a surrogate pair is not split.
   GET_HI and GET_LO must be defined where the macro is instantiated. */
559#define DEFINE_UTF16_TO_UTF16(E) \
560static \
561void E ## toUtf16(const ENCODING *enc, \
562 const char **fromP, const char *fromLim, \
563 unsigned short **toP, const unsigned short *toLim) \
564{ \
565 /* Avoid copying first half only of surrogate */ \
566 if (fromLim - *fromP > ((toLim - *toP) << 1) \
567 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
568 fromLim -= 2; \
569 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
570 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
571}
572
/* Little-endian byte accessors (low byte first), used to instantiate
   little2_toUtf8 and little2_toUtf16, then undefined again. */
573#define SET2(ptr, ch) \
574 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
575#define GET_LO(ptr) ((unsigned char)(ptr)[0])
576#define GET_HI(ptr) ((unsigned char)(ptr)[1])
577
578DEFINE_UTF16_TO_UTF8(little2_)
579DEFINE_UTF16_TO_UTF16(little2_)
580
581#undef SET2
582#undef GET_LO
583#undef GET_HI
584
585#define SET2(ptr, ch) \
586 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
587#define GET_LO(ptr) ((unsigned char)(ptr)[1])
588#define GET_HI(ptr) ((unsigned char)(ptr)[0])
589
592
593#undef SET2
594#undef GET_LO
595#undef GET_HI
596
/* Primitives for little-endian 2-byte units: p[0] is the low byte and p[1]
   the high byte.  Units whose high byte is zero use the encoding's 8-bit
   type table; all others are classified by unicode_byte_type.  Every macro
   argument is parenthesised and the encoding cast keeps const. */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The unit's value when its high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the unit at p is exactly the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
607
608#ifdef XML_MIN_SIZE
609
610static
611int little2_byteType(const ENCODING *enc, const char *p)
612{
613 return LITTLE2_BYTE_TYPE(enc, p);
614}
615
616static
617int little2_byteToAscii(const ENCODING *enc, const char *p)
618{
619 return LITTLE2_BYTE_TO_ASCII(enc, p);
620}
621
622static
623int little2_charMatches(const ENCODING *enc, const char *p, int c)
624{
625 return LITTLE2_CHAR_MATCHES(enc, p, c);
626}
627
628static
629int little2_isNameMin(const ENCODING *enc, const char *p)
630{
632}
633
634static
635int little2_isNmstrtMin(const ENCODING *enc, const char *p)
636{
638}
639
640#undef VTABLE
641#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
642
643#else /* not XML_MIN_SIZE */
644
645#undef PREFIX
646#define PREFIX(ident) little2_ ## ident
647#define MINBPC(enc) 2
648/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
649#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
650#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
651#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
652#define IS_NAME_CHAR(enc, p, n) 0
653#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
654#define IS_NMSTRT_CHAR(enc, p, n) (0)
655#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
656
657#include "xmltok_impl.c"
658
659#undef MINBPC
660#undef BYTE_TYPE
661#undef BYTE_TO_ASCII
662#undef CHAR_MATCHES
663#undef IS_NAME_CHAR
664#undef IS_NAME_CHAR_MINBPC
665#undef IS_NMSTRT_CHAR
666#undef IS_NMSTRT_CHAR_MINBPC
667#undef IS_INVALID_CHAR
668
669#endif /* not XML_MIN_SIZE */
670
/* Little-endian 2-byte encoding, XML_NS variant.  The last field of the
   ENCODING initializer is 1 only when XML_BYTE_ORDER == 12.
   NOTE(review): that field is presumably the "is native UTF-16" flag and
   12 presumably denotes a little-endian host - confirm against xmltok.h
   and xmldef.h.
   The 8-bit type table (asciitab.h + latin1tab.h) is the one
   LITTLE2_BYTE_TYPE consults for units whose high byte is zero. */
671#ifdef XML_NS
672
673static const struct normal_encoding little2_encoding_ns = {
674 { VTABLE, 2, 0,
675#if XML_BYTE_ORDER == 12
676 1
677#else
678 0
679#endif
680 },
681 {
682#include "asciitab.h"
683#include "latin1tab.h"
684 },
685 STANDARD_VTABLE(little2_)
686};
687
688#endif
689
/* Little-endian 2-byte encoding: as the _ns variant, but asciitab.h is
   included with BT_COLON redefined as BT_NMSTRT.  The last ENCODING field
   is 1 only when XML_BYTE_ORDER == 12. */
690static const struct normal_encoding little2_encoding = {
691 { VTABLE, 2, 0,
692#if XML_BYTE_ORDER == 12
693 1
694#else
695 0
696#endif
697 },
698 {
699#define BT_COLON BT_NMSTRT
700#include "asciitab.h"
701#undef BT_COLON
702#include "latin1tab.h"
703 },
704 STANDARD_VTABLE(little2_)
705};
706
707#if XML_BYTE_ORDER != 21
708
/* Little-endian 2-byte encoding built on iasciitab.h, XML_NS variant.
   Only compiled when XML_BYTE_ORDER != 21 (see the enclosing #if); the last
   ENCODING field is unconditionally 1. */
709#ifdef XML_NS
710
711static const struct normal_encoding internal_little2_encoding_ns = {
712 { VTABLE, 2, 0, 1 },
713 {
714#include "iasciitab.h"
715#include "latin1tab.h"
716 },
717 STANDARD_VTABLE(little2_)
718};
719
720#endif
721
/* Little-endian 2-byte encoding built on iasciitab.h, included with
   BT_COLON redefined as BT_NMSTRT.  Only compiled when
   XML_BYTE_ORDER != 21 (see the enclosing #if). */
722static const struct normal_encoding internal_little2_encoding = {
723 { VTABLE, 2, 0, 1 },
724 {
725#define BT_COLON BT_NMSTRT
726#include "iasciitab.h"
727#undef BT_COLON
728#include "latin1tab.h"
729 },
730 STANDARD_VTABLE(little2_)
731};
732
733#endif
734
735
/* Primitives for big-endian 2-byte units: p[0] is the high byte and p[1]
   the low byte.  Units whose high byte is zero use the encoding's 8-bit
   type table; all others are classified by unicode_byte_type.  Every macro
   argument is parenthesised and the encoding cast keeps const. */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The unit's value when its high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit at p is exactly the ASCII character c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
746
747#ifdef XML_MIN_SIZE
748
749static
750int big2_byteType(const ENCODING *enc, const char *p)
751{
752 return BIG2_BYTE_TYPE(enc, p);
753}
754
755static
756int big2_byteToAscii(const ENCODING *enc, const char *p)
757{
758 return BIG2_BYTE_TO_ASCII(enc, p);
759}
760
761static
762int big2_charMatches(const ENCODING *enc, const char *p, int c)
763{
764 return BIG2_CHAR_MATCHES(enc, p, c);
765}
766
767static
768int big2_isNameMin(const ENCODING *enc, const char *p)
769{
770 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
771}
772
773static
774int big2_isNmstrtMin(const ENCODING *enc, const char *p)
775{
777}
778
779#undef VTABLE
780#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
781
782#else /* not XML_MIN_SIZE */
783
784#undef PREFIX
785#define PREFIX(ident) big2_ ## ident
786#define MINBPC(enc) 2
787/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
788#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
789#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
790#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
791#define IS_NAME_CHAR(enc, p, n) 0
792#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
793#define IS_NMSTRT_CHAR(enc, p, n) (0)
794#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
795
796#include "xmltok_impl.c"
797
798#undef MINBPC
799#undef BYTE_TYPE
800#undef BYTE_TO_ASCII
801#undef CHAR_MATCHES
802#undef IS_NAME_CHAR
803#undef IS_NAME_CHAR_MINBPC
804#undef IS_NMSTRT_CHAR
805#undef IS_NMSTRT_CHAR_MINBPC
806#undef IS_INVALID_CHAR
807
808#endif /* not XML_MIN_SIZE */
809
/* Big-endian 2-byte encoding, XML_NS variant.  The last field of the
   ENCODING initializer is 1 only when XML_BYTE_ORDER == 21.
   NOTE(review): that field is presumably the "is native UTF-16" flag and
   21 presumably denotes a big-endian host - confirm against xmltok.h and
   xmldef.h. */
810#ifdef XML_NS
811
812static const struct normal_encoding big2_encoding_ns = {
813 { VTABLE, 2, 0,
814#if XML_BYTE_ORDER == 21
815 1
816#else
817 0
818#endif
819 },
820 {
821#include "asciitab.h"
822#include "latin1tab.h"
823 },
824 STANDARD_VTABLE(big2_)
825};
826
827#endif
828
/* Big-endian 2-byte encoding: as the _ns variant, but asciitab.h is
   included with BT_COLON redefined as BT_NMSTRT.  The last ENCODING field
   is 1 only when XML_BYTE_ORDER == 21. */
829static const struct normal_encoding big2_encoding = {
830 { VTABLE, 2, 0,
831#if XML_BYTE_ORDER == 21
832 1
833#else
834 0
835#endif
836 },
837 {
838#define BT_COLON BT_NMSTRT
839#include "asciitab.h"
840#undef BT_COLON
841#include "latin1tab.h"
842 },
843 STANDARD_VTABLE(big2_)
844};
845
846#if XML_BYTE_ORDER != 12
847
/* Big-endian 2-byte encoding built on iasciitab.h, XML_NS variant.  Only
   compiled when XML_BYTE_ORDER != 12 (see the enclosing #if); the last
   ENCODING field is unconditionally 1. */
848#ifdef XML_NS
849
850static const struct normal_encoding internal_big2_encoding_ns = {
851 { VTABLE, 2, 0, 1 },
852 {
853#include "iasciitab.h"
854#include "latin1tab.h"
855 },
856 STANDARD_VTABLE(big2_)
857};
858
859#endif
860
/* Big-endian 2-byte encoding built on iasciitab.h, included with BT_COLON
   redefined as BT_NMSTRT.  Only compiled when XML_BYTE_ORDER != 12 (see
   the enclosing #if). */
861static const struct normal_encoding internal_big2_encoding = {
862 { VTABLE, 2, 0, 1 },
863 {
864#define BT_COLON BT_NMSTRT
865#include "iasciitab.h"
866#undef BT_COLON
867#include "latin1tab.h"
868 },
869 STANDARD_VTABLE(big2_)
870};
871
872#endif
873
874#undef PREFIX
875
/* Compare two NUL-terminated strings, treating ASCII 'a'..'z' as equal to
   'A'..'Z'.  Returns 1 when the strings are equal, 0 otherwise. */
static
int streqci(const char *s1, const char *s2)
{
  char a;
  char b;
  do {
    a = *s1++;
    b = *s2++;
    if ('a' <= a && a <= 'z')
      a += 'A' - 'a';
    if ('a' <= b && b <= 'z')
      b += 'A' - 'a';
    if (a != b)
      return 0;
  } while (a);
  return 1;
}
893
894static
895void initUpdatePosition(const ENCODING *enc, const char *ptr,
896 const char *end, POSITION *pos)
897{
898 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
899}
900
901static
902int toAscii(const ENCODING *enc, const char *ptr, const char *end)
903{
904 char buf[1];
905 char *p = buf;
906 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
907 if (p == buf)
908 return -1;
909 else
910 return buf[0];
911}
912
/* Returns 1 when c is one of space (0x20), carriage return (0xD), line feed
   (0xA) or tab (0x9); 0 otherwise. */
static
int isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
925
/* Parse one `name = "value"` pseudo-attribute from [ptr, end).
   On success returns 1 and sets:
     *namePtr    - start of the name, or 0 when only white space (or
                   nothing) remained;
     *valPtr     - first character after the opening quote;
     *nextTokPtr - first character after the closing quote.
   When no attribute is found (*namePtr == 0) the other outputs are left
   untouched.  On failure returns 0 and *nextTokPtr points at the offending
   character.  Characters are examined through toAscii, so any character it
   cannot represent as a single byte is seen as -1. */
926/* Return 1 if there's just optional white space
927or there's an S followed by name=val. */
928static
929int parsePseudoAttribute(const ENCODING *enc,
930 const char *ptr,
931 const char *end,
932 const char **namePtr,
933 const char **valPtr,
934 const char **nextTokPtr)
935{
936 int c;
937 char open;
938 if (ptr == end) {
939 *namePtr = 0;
940 return 1;
941 }
/* An attribute must be preceded by at least one white-space character. */
942 if (!isSpace(toAscii(enc, ptr, end))) {
943 *nextTokPtr = ptr;
944 return 0;
945 }
946 do {
947 ptr += enc->minBytesPerChar;
948 } while (isSpace(toAscii(enc, ptr, end)));
949 if (ptr == end) {
950 *namePtr = 0;
951 return 1;
952 }
953 *namePtr = ptr;
/* Scan the name up to '='; white space is allowed between name and '='. */
954 for (;;) {
955 c = toAscii(enc, ptr, end);
956 if (c == -1) {
957 *nextTokPtr = ptr;
958 return 0;
959 }
960 if (c == '=')
961 break;
962 if (isSpace(c)) {
963 do {
964 ptr += enc->minBytesPerChar;
965 } while (isSpace(c = toAscii(enc, ptr, end)));
966 if (c != '=') {
967 *nextTokPtr = ptr;
968 return 0;
969 }
970 break;
971 }
972 ptr += enc->minBytesPerChar;
973 }
/* An empty name ('=' immediately at the name start) is an error. */
974 if (ptr == *namePtr) {
975 *nextTokPtr = ptr;
976 return 0;
977 }
978 ptr += enc->minBytesPerChar;
979 c = toAscii(enc, ptr, end);
980 while (isSpace(c)) {
981 ptr += enc->minBytesPerChar;
982 c = toAscii(enc, ptr, end);
983 }
/* The value must be quoted with " or '. */
984 if (c != '"' && c != '\'') {
985 *nextTokPtr = ptr;
986 return 0;
987 }
988 open = c;
989 ptr += enc->minBytesPerChar;
990 *valPtr = ptr;
/* Only ASCII letters, digits, '.', '-' and '_' are accepted inside the
   value; the loop ends at the matching closing quote. */
991 for (;; ptr += enc->minBytesPerChar) {
992 c = toAscii(enc, ptr, end);
993 if (c == open)
994 break;
995 if (!('a' <= c && c <= 'z')
996 && !('A' <= c && c <= 'Z')
997 && !('0' <= c && c <= '9')
998 && c != '.'
999 && c != '-'
1000 && c != '_') {
1001 *nextTokPtr = ptr;
1002 return 0;
1003 }
1004 }
1005 *nextTokPtr = ptr + enc->minBytesPerChar;
1006 return 1;
1007}
1008
/* Parse the pseudo-attributes of an XML or text declaration in [ptr, end).
   The first 5 and the last 2 characters of the range are skipped
   (presumably the "<?xml" and "?>" delimiters - confirm against callers).
   Accepted order: version, then encoding, then standalone.
     - version is mandatory unless isGeneralTextEntity is set;
     - when isGeneralTextEntity is set and a version is present, an
       attribute must follow it, and standalone is rejected;
     - the encoding value must start with an ASCII letter.
   Outputs (each optional, written only when the pointer is non-null):
     *versionPtr   - start of the version value;
     *encodingName - start of the encoding value;
     *encoding     - result of encodingFinder on the encoding value;
     *standalone   - 1 for "yes", 0 for "no".
   Returns 1 on success; on failure returns 0 with *badPtr pointing at the
   offending position. */
1009static
1010int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1011 const char *,
1012 const char *),
1013 int isGeneralTextEntity,
1014 const ENCODING *enc,
1015 const char *ptr,
1016 const char *end,
1017 const char **badPtr,
1018 const char **versionPtr,
1019 const char **encodingName,
1020 const ENCODING **encoding,
1021 int *standalone)
1022{
1023 const char *val = 0;
1024 const char *name = 0;
1025 ptr += 5 * enc->minBytesPerChar;
1026 end -= 2 * enc->minBytesPerChar;
1027 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
1028 *badPtr = ptr;
1029 return 0;
1030 }
1031 if (!XmlNameMatchesAscii(enc, name, "version")) {
1032 if (!isGeneralTextEntity) {
1033 *badPtr = name;
1034 return 0;
1035 }
1036 }
1037 else {
1038 if (versionPtr)
1039 *versionPtr = val;
1040 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
1041 *badPtr = ptr;
1042 return 0;
1043 }
1044 if (!name) {
1045 if (isGeneralTextEntity) {
1046 /* a TextDecl must have an EncodingDecl */
1047 *badPtr = ptr;
1048 return 0;
1049 }
1050 return 1;
1051 }
1052 }
1053 if (XmlNameMatchesAscii(enc, name, "encoding")) {
1054 int c = toAscii(enc, val, end);
1055 if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
1056 *badPtr = val;
1057 return 0;
1058 }
1059 if (encodingName)
1060 *encodingName = val;
/* ptr is just past the closing quote, so the value ends one character
   before it. */
1061 if (encoding)
1062 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1063 if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
1064 *badPtr = ptr;
1065 return 0;
1066 }
1067 if (!name)
1068 return 1;
1069 }
1070 if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
1071 *badPtr = name;
1072 return 0;
1073 }
1074 if (XmlNameMatchesAscii(enc, val, "yes")) {
1075 if (standalone)
1076 *standalone = 1;
1077 }
1078 else if (XmlNameMatchesAscii(enc, val, "no")) {
1079 if (standalone)
1080 *standalone = 0;
1081 }
1082 else {
1083 *badPtr = val;
1084 return 0;
1085 }
/* Only white space may follow the standalone attribute. */
1086 while (isSpace(toAscii(enc, ptr, end)))
1087 ptr += enc->minBytesPerChar;
1088 if (ptr != end) {
1089 *badPtr = ptr;
1090 return 0;
1091 }
1092 return 1;
1093}
1094
1095static
1096int checkCharRefNumber(int result)
1097{
1098 switch (result >> 8) {
1099 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1100 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1101 return -1;
1102 case 0:
1103 if (latin1_encoding.type[result] == BT_NONXML)
1104 return -1;
1105 break;
1106 case 0xFF:
1107 if (result == 0xFFFE || result == 0xFFFF)
1108 return -1;
1109 break;
1110 }
1111 return result;
1112}
1113
1114int XmlUtf8Encode(int c, char *buf)
1115{
1116 enum {
1117 /* minN is minimum legal resulting value for N byte sequence */
1118 min2 = 0x80,
1119 min3 = 0x800,
1120 min4 = 0x10000
1121 };
1122
1123 if (c < 0)
1124 return 0;
1125 if (c < min2) {
1126 buf[0] = (c | UTF8_cval1);
1127 return 1;
1128 }
1129 if (c < min3) {
1130 buf[0] = ((c >> 6) | UTF8_cval2);
1131 buf[1] = ((c & 0x3f) | 0x80);
1132 return 2;
1133 }
1134 if (c < min4) {
1135 buf[0] = ((c >> 12) | UTF8_cval3);
1136 buf[1] = (((c >> 6) & 0x3f) | 0x80);
1137 buf[2] = ((c & 0x3f) | 0x80);
1138 return 3;
1139 }
1140 if (c < 0x110000) {
1141 buf[0] = ((c >> 18) | UTF8_cval4);
1142 buf[1] = (((c >> 12) & 0x3f) | 0x80);
1143 buf[2] = (((c >> 6) & 0x3f) | 0x80);
1144 buf[3] = ((c & 0x3f) | 0x80);
1145 return 4;
1146 }
1147 return 0;
1148}
1149
/* Encode character number charNum as UTF-16 into buf and return the number
   of code units written: 1 below 0x10000, 2 (a surrogate pair) below
   0x110000.  Returns 0, writing nothing, for negative or larger values. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  int offset;
  if (charNum < 0 || charNum >= 0x110000)
    return 0;
  if (charNum < 0x10000) {
    buf[0] = charNum;
    return 1;
  }
  offset = charNum - 0x10000;
  buf[0] = (offset >> 10) + 0xD800;
  buf[1] = (offset & 0x3FF) + 0xDC00;
  return 2;
}
1166
1169 int (*convert)(void *userData, const char *p);
1171 unsigned short utf16[256];
1172 char utf8[256][4];
1173};
1174
1176{
1177 return sizeof(struct unknown_encoding);
1178}
1179
1180static
1181int unknown_isName(const ENCODING *enc, const char *p)
1182{
1183 int c = ((const struct unknown_encoding *)enc)
1184 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1185 if (c & ~0xFFFF)
1186 return 0;
1187 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1188}
1189
1190static
1191int unknown_isNmstrt(const ENCODING *enc, const char *p)
1192{
1193 int c = ((const struct unknown_encoding *)enc)
1194 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1195 if (c & ~0xFFFF)
1196 return 0;
1197 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1198}
1199
1200static
1201int unknown_isInvalid(const ENCODING *enc, const char *p)
1202{
1203 int c = ((const struct unknown_encoding *)enc)
1204 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1205 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1206}
1207
1208static
1209void unknown_toUtf8(const ENCODING *enc,
1210 const char **fromP, const char *fromLim,
1211 char **toP, const char *toLim)
1212{
1213 char buf[XML_UTF8_ENCODE_MAX];
1214 for (;;) {
1215 const char *utf8;
1216 int n;
1217 if (*fromP == fromLim)
1218 break;
1219 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
1220 n = *utf8++;
1221 if (n == 0) {
1222 int c = ((const struct unknown_encoding *)enc)
1223 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1224 n = XmlUtf8Encode(c, buf);
1225 if (n > toLim - *toP)
1226 break;
1227 utf8 = buf;
1228 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1229 - (BT_LEAD2 - 2);
1230 }
1231 else {
1232 if (n > toLim - *toP)
1233 break;
1234 (*fromP)++;
1235 }
1236 do {
1237 *(*toP)++ = *utf8++;
1238 } while (--n != 0);
1239 }
1240}
1241
1242static
1243void unknown_toUtf16(const ENCODING *enc,
1244 const char **fromP, const char *fromLim,
1245 unsigned short **toP, const unsigned short *toLim)
1246{
1247 while (*fromP != fromLim && *toP != toLim) {
1248 unsigned short c
1249 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
1250 if (c == 0) {
1251 c = (unsigned short)((const struct unknown_encoding *)enc)
1252 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1253 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1254 - (BT_LEAD2 - 2);
1255 }
1256 else
1257 (*fromP)++;
1258 *(*toP)++ = c;
1259 }
1260}
1261
1262ENCODING *
1264 int *table,
1265 int (*convert)(void *userData, const char *p),
1266 void *userData)
1267{
1268 int i;
1269 struct unknown_encoding *e = mem;
1270 for (i = 0; i < sizeof(struct normal_encoding); i++)
1271 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1272 for (i = 0; i < 128; i++)
1273 if (latin1_encoding.type[i] != BT_OTHER
1274 && latin1_encoding.type[i] != BT_NONXML
1275 && table[i] != i)
1276 return 0;
1277 for (i = 0; i < 256; i++) {
1278 int c = table[i];
1279 if (c == -1) {
1280 e->normal.type[i] = BT_MALFORM;
1281 /* This shouldn't really get used. */
1282 e->utf16[i] = 0xFFFF;
1283 e->utf8[i][0] = 1;
1284 e->utf8[i][1] = 0;
1285 }
1286 else if (c < 0) {
1287 if (c < -4)
1288 return 0;
1289 e->normal.type[i] = BT_LEAD2 - (c + 2);
1290 e->utf8[i][0] = 0;
1291 e->utf16[i] = 0;
1292 }
1293 else if (c < 0x80) {
1294 if (latin1_encoding.type[c] != BT_OTHER
1295 && latin1_encoding.type[c] != BT_NONXML
1296 && c != i)
1297 return 0;
1298 e->normal.type[i] = latin1_encoding.type[c];
1299 e->utf8[i][0] = 1;
1300 e->utf8[i][1] = (char)c;
1301 e->utf16[i] = c == 0 ? 0xFFFF : c;
1302 }
1303 else if (checkCharRefNumber(c) < 0) {
1304 e->normal.type[i] = BT_NONXML;
1305 /* This shouldn't really get used. */
1306 e->utf16[i] = 0xFFFF;
1307 e->utf8[i][0] = 1;
1308 e->utf8[i][1] = 0;
1309 }
1310 else {
1311 if (c > 0xFFFF)
1312 return 0;
1313 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1314 e->normal.type[i] = BT_NMSTRT;
1315 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1316 e->normal.type[i] = BT_NAME;
1317 else
1318 e->normal.type[i] = BT_OTHER;
1319 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1320 e->utf16[i] = c;
1321 }
1322 }
1323 e->userData = userData;
1324 e->convert = convert;
1325 if (convert) {
1326 e->normal.isName2 = unknown_isName;
1327 e->normal.isName3 = unknown_isName;
1328 e->normal.isName4 = unknown_isName;
1329 e->normal.isNmstrt2 = unknown_isNmstrt;
1330 e->normal.isNmstrt3 = unknown_isNmstrt;
1331 e->normal.isNmstrt4 = unknown_isNmstrt;
1332 e->normal.isInvalid2 = unknown_isInvalid;
1333 e->normal.isInvalid3 = unknown_isInvalid;
1334 e->normal.isInvalid4 = unknown_isInvalid;
1335 }
1336 e->normal.enc.utf8Convert = unknown_toUtf8;
1337 e->normal.enc.utf16Convert = unknown_toUtf16;
1338 return &(e->normal.enc);
1339}
1340
/* Indices of the built-in encodings.
   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};

1355static
1356int getEncodingIndex(const char *name)
1357{
1358 static const char *encodingNames[] = {
1359 "ISO-8859-1",
1360 "US-ASCII",
1361 "UTF-8",
1362 "UTF-16",
1363 "UTF-16BE"
1364 "UTF-16LE",
1365 };
1366 int i;
1367 if (name == 0)
1368 return NO_ENC;
1369 for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
1370 if (streqci(name, encodingNames[i]))
1371 return i;
1372 return UNKNOWN_ENC;
1373}
1374
1375/* For binary compatibility, we store the index of the encoding specified
1376at initialization in the isUtf16 member. */
1377
1378#define INIT_ENC_INDEX(enc) ((enc)->initEnc.isUtf16)
1379
1380/* This is what detects the encoding.
1381encodingTable maps from encoding indices to encodings;
1382INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1383state is XML_CONTENT_STATE if we're parsing an external text entity,
1384and XML_PROLOG_STATE otherwise.
1385*/
1386
1387
1388static
1389int initScan(const ENCODING **encodingTable,
1390 const INIT_ENCODING *enc,
1391 int state,
1392 const char *ptr,
1393 const char *end,
1394 const char **nextTokPtr)
1395{
1396 const ENCODING **encPtr;
1397
1398 if (ptr == end)
1399 return XML_TOK_NONE;
1400 encPtr = enc->encPtr;
1401 if (ptr + 1 == end) {
1402 /* only a single byte available for auto-detection */
1403 /* a well-formed document entity must have more than one byte */
1404 if (state != XML_CONTENT_STATE)
1405 return XML_TOK_PARTIAL;
1406 /* so we're parsing an external text entity... */
1407 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1408 switch (INIT_ENC_INDEX(enc)) {
1409 case UTF_16_ENC:
1410 case UTF_16LE_ENC:
1411 case UTF_16BE_ENC:
1412 return XML_TOK_PARTIAL;
1413 }
1414 switch ((unsigned char)*ptr) {
1415 case 0xFE:
1416 case 0xFF:
1417 case 0xEF: /* possibly first byte of UTF-8 BOM */
1419 && state == XML_CONTENT_STATE)
1420 break;
1421 /* fall through */
1422 case 0x00:
1423 case 0x3C:
1424 return XML_TOK_PARTIAL;
1425 }
1426 }
1427 else {
1428 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1429 case 0xFEFF:
1431 && state == XML_CONTENT_STATE)
1432 break;
1433 *nextTokPtr = ptr + 2;
1434 *encPtr = encodingTable[UTF_16BE_ENC];
1435 return XML_TOK_BOM;
1436 /* 00 3C is handled in the default case */
1437 case 0x3C00:
1440 && state == XML_CONTENT_STATE)
1441 break;
1442 *encPtr = encodingTable[UTF_16LE_ENC];
1443 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1444 case 0xFFFE:
1446 && state == XML_CONTENT_STATE)
1447 break;
1448 *nextTokPtr = ptr + 2;
1449 *encPtr = encodingTable[UTF_16LE_ENC];
1450 return XML_TOK_BOM;
1451 case 0xEFBB:
1452 /* Maybe a UTF-8 BOM (EF BB BF) */
1453 /* If there's an explicitly specified (external) encoding
1454 of ISO-8859-1 or some flavour of UTF-16
1455 and this is an external text entity,
1456 don't look for the BOM,
1457 because it might be a legal data. */
1458 if (state == XML_CONTENT_STATE) {
1459 int e = INIT_ENC_INDEX(enc);
1460 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
1461 break;
1462 }
1463 if (ptr + 2 == end)
1464 return XML_TOK_PARTIAL;
1465 if ((unsigned char)ptr[2] == 0xBF) {
1466 *encPtr = encodingTable[UTF_8_ENC];
1467 return XML_TOK_BOM;
1468 }
1469 break;
1470 default:
1471 if (ptr[0] == '\0') {
1472 /* 0 isn't a legal data character. Furthermore a document entity can only
1473 start with ASCII characters. So the only way this can fail to be big-endian
1474 UTF-16 if it it's an external parsed general entity that's labelled as
1475 UTF-16LE. */
1476 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1477 break;
1478 *encPtr = encodingTable[UTF_16BE_ENC];
1479 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1480 }
1481 else if (ptr[1] == '\0') {
1482 /* We could recover here in the case:
1483 - parsing an external entity
1484 - second byte is 0
1485 - no externally specified encoding
1486 - no encoding declaration
1487 by assuming UTF-16LE. But we don't, because this would mean when
1488 presented just with a single byte, we couldn't reliably determine
1489 whether we needed further bytes. */
1490 if (state == XML_CONTENT_STATE)
1491 break;
1492 *encPtr = encodingTable[UTF_16LE_ENC];
1493 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1494 }
1495 break;
1496 }
1497 }
1498 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1499 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1500}
1501
1502
1503#define NS(x) x
1504#define ns(x) x
1505#include "xmltok_ns.c"
1506#undef NS
1507#undef ns
1508
1509#ifdef XML_NS
1510
1511#define NS(x) x ## NS
1512#define ns(x) x ## _ns
1513
1514#include "xmltok_ns.c"
1515
1516#undef NS
1517#undef ns
1518
1519ENCODING *
1520XmlInitUnknownEncodingNS(void *mem,
1521 int *table,
1522 int (*convert)(void *userData, const char *p),
1523 void *userData)
1524{
1525 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1526 if (enc)
1527 ((struct normal_encoding *)enc)->type[':'] = BT_COLON;
1528 return enc;
1529}
1530
1531#endif /* XML_NS */
int(* isInvalid4)(const ENCODING *, const char *)
Definition: xmltok.c:161
int(* isInvalid2)(const ENCODING *, const char *)
Definition: xmltok.c:159
unsigned char type[256]
Definition: xmltok.c:145
int(* isNmstrt4)(const ENCODING *, const char *)
Definition: xmltok.c:158
int(* isName3)(const ENCODING *, const char *)
Definition: xmltok.c:154
int(* isName2)(const ENCODING *, const char *)
Definition: xmltok.c:153
int(* isNmstrt2)(const ENCODING *, const char *)
Definition: xmltok.c:156
int(* isName4)(const ENCODING *, const char *)
Definition: xmltok.c:155
int(* isNmstrt3)(const ENCODING *, const char *)
Definition: xmltok.c:157
int(* isInvalid3)(const ENCODING *, const char *)
Definition: xmltok.c:160
ENCODING enc
Definition: xmltok.c:144
unsigned short utf16[256]
Definition: xmltok.c:1171
int(* convert)(void *userData, const char *p)
Definition: xmltok.c:1169
void * userData
Definition: xmltok.c:1170
char utf8[256][4]
Definition: xmltok.c:1172
struct normal_encoding normal
Definition: xmltok.c:1168
#define BIG2_BYTE_TO_ASCII(enc, p)
Definition: xmltok.c:740
@ UTF8_cval4
Definition: xmltok.c:280
@ UTF8_cval1
Definition: xmltok.c:277
@ UTF8_cval2
Definition: xmltok.c:278
@ UTF8_cval3
Definition: xmltok.c:279
#define STANDARD_VTABLE(E)
Definition: xmltok.c:175
#define VTABLE1
Definition: xmltok.c:39
int XmlUtf8Encode(int c, char *buf)
Definition: xmltok.c:1114
#define UTF8_GET_NAMING3(pages, byte)
Definition: xmltok.c:71
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p)
Definition: xmltok.c:742
#define DEFINE_UTF16_TO_UTF8(E)
Definition: xmltok.c:496
#define BIG2_BYTE_TYPE(enc, p)
Definition: xmltok.c:736
int XmlSizeOfUnknownEncoding()
Definition: xmltok.c:1175
ENCODING * XmlInitUnknownEncoding(void *mem, int *table, int(*convert)(void *userData, const char *p), void *userData)
Definition: xmltok.c:1263
#define NORMAL_VTABLE(E)
Definition: xmltok.c:179
#define INIT_ENC_INDEX(enc)
Definition: xmltok.c:1378
int XmlUtf16Encode(int charNum, unsigned short *buf)
Definition: xmltok.c:1150
#define SB_BYTE_TYPE(enc, p)
Definition: xmltok.c:206
#define VTABLE
Definition: xmltok.c:52
#define LITTLE2_CHAR_MATCHES(enc, p, c)
Definition: xmltok.c:602
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
Definition: xmltok.c:744
#define LITTLE2_BYTE_TYPE(enc, p)
Definition: xmltok.c:597
#define BT_COLON
#define UTF8_INVALID4(p)
Definition: xmltok.c:93
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
Definition: xmltok.c:603
#define UCS2_GET_NAMING(pages, hi, lo)
Definition: xmltok.c:54
#define UTF8_GET_NAMING2(pages, byte)
Definition: xmltok.c:61
@ NO_ENC
Definition: xmltok.c:1352
@ US_ASCII_ENC
Definition: xmltok.c:1346
@ ISO_8859_1_ENC
Definition: xmltok.c:1345
@ UTF_8_ENC
Definition: xmltok.c:1347
@ UTF_16_ENC
Definition: xmltok.c:1348
@ UNKNOWN_ENC
Definition: xmltok.c:1344
@ UTF_16BE_ENC
Definition: xmltok.c:1349
@ UTF_16LE_ENC
Definition: xmltok.c:1350
#define LITTLE2_BYTE_TO_ASCII(enc, p)
Definition: xmltok.c:601
#define UTF8_INVALID3(p)
Definition: xmltok.c:86
#define DEFINE_UTF16_TO_UTF16(E)
Definition: xmltok.c:559
#define BIG2_CHAR_MATCHES(enc, p, c)
Definition: xmltok.c:741
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
Definition: xmltok.c:605