Geant4  10.02.p02
xmltok.cc
Go to the documentation of this file.
1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2  See the file COPYING for copying permission.
3 */
4 
5 #if defined(__clang__) || defined(__GNUC__)
6 #pragma GCC diagnostic ignored "-Wunused-parameter"
7 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
8 #endif
9 
10 #include <stddef.h>
11 
12 #ifdef COMPILED_FROM_DSP
13 #include "winconfig.h"
14 #elif defined(MACOS_CLASSIC)
15 #include "macconfig.h"
16 #elif defined(__amigaos__)
17 #include "amigaconfig.h"
18 #elif defined(__WATCOMC__)
19 #include "watcomconfig.h"
20 #else
21 #ifdef HAVE_EXPAT_CONFIG_H
22 #include <expat_config.h>
23 #endif
24 #endif /* ndef COMPILED_FROM_DSP */
25 
26 #include "expat_external.h"
27 #include "internal.h"
28 #include "xmltok.h"
29 #include "nametab.h"
30 
31 #ifdef XML_DTD
32 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
33 #else
34 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
35 #endif
36 
37 #define VTABLE1 \
38  { PREFIX(prologTok), PREFIX(contentTok), \
39  PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
40  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
41  PREFIX(sameName), \
42  PREFIX(nameMatchesAscii), \
43  PREFIX(nameLength), \
44  PREFIX(skipS), \
45  PREFIX(getAtts), \
46  PREFIX(charRefNumber), \
47  PREFIX(predefinedEntityName), \
48  PREFIX(updatePosition), \
49  PREFIX(isPublicId)
50 
51 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
52 
/* Test the naming bit for the 16-bit character whose high byte is `hi` and
   low byte is `lo`.  pages[hi] selects a group of eight bitmap entries, the
   top 3 bits of `lo` pick the entry within the group and the low 5 bits
   pick the bit.  The mask is built from an unsigned 1 so that selecting
   bit 31 never shifts into the sign bit of an int.  Arguments are
   evaluated more than once. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
55 
/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
   The mask uses an unsigned 1 so that bit 31 can be selected without
   shifting into the sign bit of an int.
*/
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) \
                + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))
65 
/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.  The mask uses an unsigned 1 so that bit 31 can be selected
   without shifting into the sign bit of an int.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))
78 
/* Select the naming lookup by encoded length n: 2-byte and 3-byte
   sequences are looked up in the bitmap, any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
   : (n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
   : 0)
85 
86 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
87  of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
88  with the additional restriction of not allowing the Unicode
89  code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
90  Implementation details:
91  (A & 0x80) == 0 means A < 0x80
92  and
93  (A & 0xC0) == 0xC0 means A > 0xBF
94 */
95 
/* Nonzero when the 2-byte sequence at p is malformed: the lead byte is
   below 0xC2, or the second byte is outside 0x80..0xBF.  p is indexed as
   (p)[i] throughout so that pointer expressions such as `buf + 1` are
   handled correctly; it is evaluated more than once. */
#define UTF8_INVALID2(p) \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
98 
/* Nonzero when the 3-byte sequence at p is malformed.  In order:
   - byte 2 must have its top bit set;
   - byte 2 must be below 0xC0, except that after the prefix EF BF
     anything above 0xBD is rejected (EF BF BE and EF BF BF);
   - byte 1 must be in 0x80..0xBF, narrowed to 0xA0..0xBF after a lead
     byte of 0xE0 and to 0x80..0x9F after a lead byte of 0xED.
   p is indexed as (p)[i] throughout so that pointer expressions such as
   `buf + 1` are handled correctly; it is evaluated more than once. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
   || \
   ((p)[0] == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
   || \
   ((p)[0] == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
115 
/* Nonzero when the 4-byte sequence at p is malformed.  Bytes 3 and 2 must
   each be in 0x80..0xBF.  Byte 1 must be in 0x80..0xBF, narrowed to
   0x90..0xBF after a lead byte of 0xF0 and to 0x80..0x8F after a lead byte
   of 0xF4.  p is indexed as (p)[i] throughout so that pointer expressions
   such as `buf + 1` are handled correctly; it is evaluated more than
   once. */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
   || \
   ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
   || \
   ((p)[0] == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
128 
129 static int PTRFASTCALL
130 isNever(const ENCODING *enc, const char *p)
131 {
132  return 0;
133 }
134 
135 static int PTRFASTCALL
136 utf8_isName2(const ENCODING *enc, const char *p)
137 {
138  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
139 }
140 
141 static int PTRFASTCALL
142 utf8_isName3(const ENCODING *enc, const char *p)
143 {
144  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
145 }
146 
147 #define utf8_isName4 isNever
148 
149 static int PTRFASTCALL
150 utf8_isNmstrt2(const ENCODING *enc, const char *p)
151 {
152  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
153 }
154 
155 static int PTRFASTCALL
156 utf8_isNmstrt3(const ENCODING *enc, const char *p)
157 {
158  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
159 }
160 
161 #define utf8_isNmstrt4 isNever
162 
163 static int PTRFASTCALL
164 utf8_isInvalid2(const ENCODING *enc, const char *p)
165 {
166  return UTF8_INVALID2((const unsigned char *)p);
167 }
168 
169 static int PTRFASTCALL
170 utf8_isInvalid3(const ENCODING *enc, const char *p)
171 {
172  return UTF8_INVALID3((const unsigned char *)p);
173 }
174 
175 static int PTRFASTCALL
176 utf8_isInvalid4(const ENCODING *enc, const char *p)
177 {
178  return UTF8_INVALID4((const unsigned char *)p);
179 }
180 
182  ENCODING enc;
183  unsigned char type[256];
184 #ifdef XML_MIN_SIZE
185  int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
186  int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
187  int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
188  int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
189  int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
190 #endif /* XML_MIN_SIZE */
191  int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
192  int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
193  int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
194  int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
195  int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
196  int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
197  int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
198  int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
199  int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
200 };
201 
202 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
203 
204 #ifdef XML_MIN_SIZE
205 
206 #define STANDARD_VTABLE(E) \
207  E ## byteType, \
208  E ## isNameMin, \
209  E ## isNmstrtMin, \
210  E ## byteToAscii, \
211  E ## charMatches,
212 
213 #else
214 
215 #define STANDARD_VTABLE(E) /* as nothing */
216 
217 #endif
218 
219 #define NORMAL_VTABLE(E) \
220  E ## isName2, \
221  E ## isName3, \
222  E ## isName4, \
223  E ## isNmstrt2, \
224  E ## isNmstrt3, \
225  E ## isNmstrt4, \
226  E ## isInvalid2, \
227  E ## isInvalid3, \
228  E ## isInvalid4
229 
230 static int FASTCALL checkCharRefNumber(int);
231 
232 #include "xmltok_impl.h"
233 #include "ascii.h"
234 
235 #ifdef XML_MIN_SIZE
236 #define sb_isNameMin isNever
237 #define sb_isNmstrtMin isNever
238 #endif
239 
240 #ifdef XML_MIN_SIZE
241 #define MINBPC(enc) ((enc)->minBytesPerChar)
242 #else
243 /* minimum bytes per character */
244 #define MINBPC(enc) 1
245 #endif
246 
247 #define SB_BYTE_TYPE(enc, p) \
248  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
249 
250 #ifdef XML_MIN_SIZE
251 static int PTRFASTCALL
252 sb_byteType(const ENCODING *enc, const char *p)
253 {
254  return SB_BYTE_TYPE(enc, p);
255 }
256 #define BYTE_TYPE(enc, p) \
257  (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
258 #else
259 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
260 #endif
261 
262 #ifdef XML_MIN_SIZE
263 #define BYTE_TO_ASCII(enc, p) \
264  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
265 static int PTRFASTCALL
266 sb_byteToAscii(const ENCODING *enc, const char *p)
267 {
268  return *p;
269 }
270 #else
271 #define BYTE_TO_ASCII(enc, p) (*(p))
272 #endif
273 
274 #define IS_NAME_CHAR(enc, p, n) \
275  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
276 #define IS_NMSTRT_CHAR(enc, p, n) \
277  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
278 #define IS_INVALID_CHAR(enc, p, n) \
279  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
280 
281 #ifdef XML_MIN_SIZE
282 #define IS_NAME_CHAR_MINBPC(enc, p) \
283  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
284 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
285  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
286 #else
287 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
288 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
289 #endif
290 
291 #ifdef XML_MIN_SIZE
292 #define CHAR_MATCHES(enc, p, c) \
293  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
294 static int PTRCALL
295 sb_charMatches(const ENCODING *enc, const char *p, int c)
296 {
297  return *p == c;
298 }
299 #else
300 /* c is an ASCII character */
301 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
302 #endif
303 
304 #define PREFIX(ident) normal_ ## ident
305 #define XML_TOK_IMPL_C
306 #include "xmltok_impl.cc"
307 #undef XML_TOK_IMPL_C
308 
309 #undef MINBPC
310 #undef BYTE_TYPE
311 #undef BYTE_TO_ASCII
312 #undef CHAR_MATCHES
313 #undef IS_NAME_CHAR
314 #undef IS_NAME_CHAR_MINBPC
315 #undef IS_NMSTRT_CHAR
316 #undef IS_NMSTRT_CHAR_MINBPC
317 #undef IS_INVALID_CHAR
318 
319 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
320  UTF8_cval1 = 0x00,
321  UTF8_cval2 = 0xc0,
322  UTF8_cval3 = 0xe0,
323  UTF8_cval4 = 0xf0
324 };
325 
326 static void PTRCALL
327 utf8_toUtf8(const ENCODING *enc,
328  const char **fromP, const char *fromLim,
329  char **toP, const char *toLim)
330 {
331  char *to;
332  const char *from;
333  if (fromLim - *fromP > toLim - *toP) {
334  /* Avoid copying partial characters. */
335  for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
336  if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
337  break;
338  }
339  for (to = *toP, from = *fromP; from != fromLim; from++, to++)
340  *to = *from;
341  *fromP = from;
342  *toP = to;
343 }
344 
345 static void PTRCALL
346 utf8_toUtf16(const ENCODING *enc,
347  const char **fromP, const char *fromLim,
348  unsigned short **toP, const unsigned short *toLim)
349 {
350  unsigned short *to = *toP;
351  const char *from = *fromP;
352  while (from != fromLim && to != toLim) {
353  switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
354  case BT_LEAD2:
355  *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
356  from += 2;
357  break;
358  case BT_LEAD3:
359  *to++ = (unsigned short)(((from[0] & 0xf) << 12)
360  | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
361  from += 3;
362  break;
363  case BT_LEAD4:
364  {
365  unsigned long n;
366  if (to + 1 == toLim)
367  goto after;
368  n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
369  | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
370  n -= 0x10000;
371  to[0] = (unsigned short)((n >> 10) | 0xD800);
372  to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
373  to += 2;
374  from += 4;
375  }
376  break;
377  default:
378  *to++ = *from++;
379  break;
380  }
381  }
382 after:
383  *fromP = from;
384  *toP = to;
385 }
386 
387 #ifdef XML_NS
/* UTF-8 encoding descriptor, built only when XML_NS is defined.  The
   byte-type table is assembled from asciitab.h followed by utf8tab.h,
   with BT_COLON left as those headers define it.  The header uses the
   normal_ tokenizer entry points (VTABLE1) with the utf8_ converters, and
   the multi-byte name/validity hooks are the utf8_ helpers. */
388 static const struct normal_encoding utf8_encoding_ns = {
389  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
390  {
391 #include "asciitab.h"
392 #include "utf8tab.h"
393  },
394  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
395 };
396 #endif
397 
/* UTF-8 encoding descriptor.  BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included, so whatever that header marks as BT_COLON is
   classified as a name-start character in this table.  The rest of the
   table comes from utf8tab.h; the multi-byte hooks are the utf8_
   helpers. */
398 static const struct normal_encoding utf8_encoding = {
399  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
400  {
401 #define BT_COLON BT_NMSTRT
402 #include "asciitab.h"
403 #undef BT_COLON
404 #include "utf8tab.h"
405  },
406  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
407 };
408 
409 #ifdef XML_NS
410 
/* UTF-8 descriptor built only when XML_NS is defined.  It differs from
   utf8_encoding_ns only in taking the first part of its byte-type table
   from iasciitab.h instead of asciitab.h.
   NOTE(review): presumably the variant used for internally generated
   text - confirm against the users of this table. */
411 static const struct normal_encoding internal_utf8_encoding_ns = {
412  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
413  {
414 #include "iasciitab.h"
415 #include "utf8tab.h"
416  },
417  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
418 };
419 
420 #endif
421 
423  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
424  {
425 #define BT_COLON BT_NMSTRT
426 #include "iasciitab.h"
427 #undef BT_COLON
428 #include "utf8tab.h"
429  },
430  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
431 };
432 
433 static void PTRCALL
434 latin1_toUtf8(const ENCODING *enc,
435  const char **fromP, const char *fromLim,
436  char **toP, const char *toLim)
437 {
438  for (;;) {
439  unsigned char c;
440  if (*fromP == fromLim)
441  break;
442  c = (unsigned char)**fromP;
443  if (c & 0x80) {
444  if (toLim - *toP < 2)
445  break;
446  *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
447  *(*toP)++ = (char)((c & 0x3f) | 0x80);
448  (*fromP)++;
449  }
450  else {
451  if (*toP == toLim)
452  break;
453  *(*toP)++ = *(*fromP)++;
454  }
455  }
456 }
457 
458 static void PTRCALL
459 latin1_toUtf16(const ENCODING *enc,
460  const char **fromP, const char *fromLim,
461  unsigned short **toP, const unsigned short *toLim)
462 {
463  while (*fromP != fromLim && *toP != toLim)
464  *(*toP)++ = (unsigned char)*(*fromP)++;
465 }
466 
467 #ifdef XML_NS
468 
/* Latin-1 encoding descriptor, built only when XML_NS is defined.  The
   byte-type table is asciitab.h followed by latin1tab.h.  No
   NORMAL_VTABLE entries are supplied, so the multi-byte isName/isNmstrt/
   isInvalid members are left zero-initialized. */
469 static const struct normal_encoding latin1_encoding_ns = {
470  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
471  {
472 #include "asciitab.h"
473 #include "latin1tab.h"
474  },
475  STANDARD_VTABLE(sb_)
476 };
477 
478 #endif
479 
/* Latin-1 encoding descriptor.  BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included, so whatever that header marks as BT_COLON is
   classified as a name-start character.  The rest of the table comes from
   latin1tab.h.  No NORMAL_VTABLE entries are supplied, so the multi-byte
   hook members are left zero-initialized. */
480 static const struct normal_encoding latin1_encoding = {
481  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
482  {
483 #define BT_COLON BT_NMSTRT
484 #include "asciitab.h"
485 #undef BT_COLON
486 #include "latin1tab.h"
487  },
488  STANDARD_VTABLE(sb_)
489 };
490 
491 static void PTRCALL
492 ascii_toUtf8(const ENCODING *enc,
493  const char **fromP, const char *fromLim,
494  char **toP, const char *toLim)
495 {
496  while (*fromP != fromLim && *toP != toLim)
497  *(*toP)++ = *(*fromP)++;
498 }
499 
500 #ifdef XML_NS
501 
/* US-ASCII encoding descriptor, built only when XML_NS is defined.  Only
   asciitab.h contributes to the byte-type table; the entries it does not
   cover stay zero, which the inline comment identifies as BT_NONXML.
   UTF-16 conversion reuses latin1_toUtf16. */
502 static const struct normal_encoding ascii_encoding_ns = {
503  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
504  {
505 #include "asciitab.h"
506 /* BT_NONXML == 0 */
507  },
508  STANDARD_VTABLE(sb_)
509 };
510 
511 #endif
512 
/* US-ASCII encoding descriptor.  BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included, so whatever that header marks as BT_COLON is
   classified as a name-start character.  Entries not covered by
   asciitab.h stay zero, which the inline comment identifies as BT_NONXML.
   UTF-16 conversion reuses latin1_toUtf16. */
513 static const struct normal_encoding ascii_encoding = {
514  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
515  {
516 #define BT_COLON BT_NMSTRT
517 #include "asciitab.h"
518 #undef BT_COLON
519 /* BT_NONXML == 0 */
520  },
521  STANDARD_VTABLE(sb_)
522 };
523 
524 static int PTRFASTCALL
525 unicode_byte_type(char hi, char lo)
526 {
527  switch ((unsigned char)hi) {
528  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
529  return BT_LEAD4;
530  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
531  return BT_TRAIL;
532  case 0xFF:
533  switch ((unsigned char)lo) {
534  case 0xFF:
535  case 0xFE:
536  return BT_NONXML;
537  }
538  break;
539  }
540  return BT_NONASCII;
541 }
542 
/* Defines E##toUtf8, which converts 2-byte units at *fromP (up to fromLim)
   to UTF-8 at *toP (up to toLim).  GET_LO and GET_HI must be defined where
   the macro is expanded; they pick the low and high byte of a unit.
     hi == 0 and lo < 0x80   -> 1 output byte
     hi in 0x00..0x07        -> 2 output bytes
     hi in 0xD8..0xDB        -> this unit and the next one are consumed
                                together and written as 4 output bytes
     any other hi            -> 3 output bytes
   When the output lacks room for the next character the function returns
   with *fromP left at that unit; otherwise *fromP ends at the loop's final
   position.
   NOTE(review): the loop advances two bytes at a time and stops only when
   from == fromLim, so it relies on the input length being even.  The
   0xD8..0xDB branch reads the following unit without comparing against
   fromLim. */
543 #define DEFINE_UTF16_TO_UTF8(E) \
544 static void PTRCALL \
545 E ## toUtf8(const ENCODING *enc, \
546  const char **fromP, const char *fromLim, \
547  char **toP, const char *toLim) \
548 { \
549  const char *from; \
550  for (from = *fromP; from != fromLim; from += 2) { \
551  int plane; \
552  unsigned char lo2; \
553  unsigned char lo = GET_LO(from); \
554  unsigned char hi = GET_HI(from); \
555  switch (hi) { \
556  case 0: \
557  if (lo < 0x80) { \
558  if (*toP == toLim) { \
559  *fromP = from; \
560  return; \
561  } \
562  *(*toP)++ = lo; \
563  break; \
564  } \
565  /* fall through */ \
566  case 0x1: case 0x2: case 0x3: \
567  case 0x4: case 0x5: case 0x6: case 0x7: \
568  if (toLim - *toP < 2) { \
569  *fromP = from; \
570  return; \
571  } \
572  *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
573  *(*toP)++ = ((lo & 0x3f) | 0x80); \
574  break; \
575  default: \
576  if (toLim - *toP < 3) { \
577  *fromP = from; \
578  return; \
579  } \
580  /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
581  *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
582  *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
583  *(*toP)++ = ((lo & 0x3f) | 0x80); \
584  break; \
585  case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
586  if (toLim - *toP < 4) { \
587  *fromP = from; \
588  return; \
589  } \
590  plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
591  *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
592  *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
593  from += 2; \
594  lo2 = GET_LO(from); \
595  *(*toP)++ = (((lo & 0x3) << 4) \
596  | ((GET_HI(from) & 0x3) << 2) \
597  | (lo2 >> 6) \
598  | 0x80); \
599  *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
600  break; \
601  } \
602  } \
603  *fromP = from; \
604 }
605 
/* Defines E##toUtf16, which copies 2-byte units at *fromP (up to fromLim)
   into unsigned shorts at *toP (up to toLim), composing each value as
   (GET_HI << 8) | GET_LO.  GET_LO and GET_HI must be defined where the
   macro is expanded.  Before copying, if the input holds more units than
   the output has room for and the high byte of the last input unit is in
   0xD8..0xDF, fromLim is moved back by one unit.
   NOTE(review): that test looks at the final input unit (fromLim - 2), not
   at the unit where the output runs out - confirm this is the intended
   surrogate guard. */
606 #define DEFINE_UTF16_TO_UTF16(E) \
607 static void PTRCALL \
608 E ## toUtf16(const ENCODING *enc, \
609  const char **fromP, const char *fromLim, \
610  unsigned short **toP, const unsigned short *toLim) \
611 { \
612  /* Avoid copying first half only of surrogate */ \
613  if (fromLim - *fromP > ((toLim - *toP) << 1) \
614  && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
615  fromLim -= 2; \
616  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
617  *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
618 }
619 
620 #define SET2(ptr, ch) \
621  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
622 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
623 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
624 
625 DEFINE_UTF16_TO_UTF8(little2_)
626 DEFINE_UTF16_TO_UTF16(little2_)
627 
628 #undef SET2
629 #undef GET_LO
630 #undef GET_HI
631 
632 #define SET2(ptr, ch) \
633  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
634 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
635 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
636 
639 
640 #undef SET2
641 #undef GET_LO
642 #undef GET_HI
643 
/* Accessors for 2-byte little-endian input: (p)[0] is the low byte of a
   unit and (p)[1] the high byte.
   - LITTLE2_BYTE_TYPE: when the high byte is 0 the byte-type table of enc
     is consulted for the low byte, otherwise unicode_byte_type decides.
   - LITTLE2_BYTE_TO_ASCII: the low byte when the high byte is 0, else -1.
   - LITTLE2_CHAR_MATCHES: true when the unit equals the character c.
   - LITTLE2_IS_*_MINBPC: bitmap lookups via UCS2_GET_NAMING.
   Every use of p and c is parenthesized so that argument expressions such
   as `buf + 2` or `x ? a : b` expand correctly, and the encoding is read
   through a const-qualified pointer.  Arguments are evaluated more than
   once. */
#define LITTLE2_BYTE_TYPE(enc, p) \
  ((p)[1] == 0 \
   ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[0]] \
   : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
654 
655 #ifdef XML_MIN_SIZE
656 
657 static int PTRFASTCALL
658 little2_byteType(const ENCODING *enc, const char *p)
659 {
660  return LITTLE2_BYTE_TYPE(enc, p);
661 }
662 
663 static int PTRFASTCALL
664 little2_byteToAscii(const ENCODING *enc, const char *p)
665 {
666  return LITTLE2_BYTE_TO_ASCII(enc, p);
667 }
668 
669 static int PTRCALL
670 little2_charMatches(const ENCODING *enc, const char *p, int c)
671 {
672  return LITTLE2_CHAR_MATCHES(enc, p, c);
673 }
674 
675 static int PTRFASTCALL
676 little2_isNameMin(const ENCODING *enc, const char *p)
677 {
678  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
679 }
680 
681 static int PTRFASTCALL
682 little2_isNmstrtMin(const ENCODING *enc, const char *p)
683 {
684  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
685 }
686 
687 #undef VTABLE
688 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
689 
690 #else /* not XML_MIN_SIZE */
691 
692 #undef PREFIX
693 #define PREFIX(ident) little2_ ## ident
694 #define MINBPC(enc) 2
695 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
696 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
697 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
698 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
699 #define IS_NAME_CHAR(enc, p, n) 0
700 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
701 #define IS_NMSTRT_CHAR(enc, p, n) (0)
702 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
703 
704 #define XML_TOK_IMPL_C
705 #include "xmltok_impl.cc"
706 #undef XML_TOK_IMPL_C
707 
708 #undef MINBPC
709 #undef BYTE_TYPE
710 #undef BYTE_TO_ASCII
711 #undef CHAR_MATCHES
712 #undef IS_NAME_CHAR
713 #undef IS_NAME_CHAR_MINBPC
714 #undef IS_NMSTRT_CHAR
715 #undef IS_NMSTRT_CHAR_MINBPC
716 #undef IS_INVALID_CHAR
717 
718 #endif /* not XML_MIN_SIZE */
719 
720 #ifdef XML_NS
721 
/* Little-endian 2-byte encoding descriptor, built only when XML_NS is
   defined.  The last header value is 1 only when BYTEORDER == 1234 and 0
   otherwise.  The byte-type table (asciitab.h followed by latin1tab.h) is
   the one LITTLE2_BYTE_TYPE consults when a unit's high byte is 0.
   NOTE(review): the meaning of the three trailing header values is defined
   by ENCODING in xmltok.h - confirm there. */
722 static const struct normal_encoding little2_encoding_ns = {
723  { VTABLE, 2, 0,
724 #if BYTEORDER == 1234
725  1
726 #else
727  0
728 #endif
729  },
730  {
731 #include "asciitab.h"
732 #include "latin1tab.h"
733  },
734  STANDARD_VTABLE(little2_)
735 };
736 
737 #endif
738 
/* Little-endian 2-byte encoding descriptor.  The last header value is 1
   only when BYTEORDER == 1234 and 0 otherwise.  BT_COLON is defined as
   BT_NMSTRT while asciitab.h is included, so whatever that header marks
   as BT_COLON is classified as a name-start character; the rest of the
   table comes from latin1tab.h. */
739 static const struct normal_encoding little2_encoding = {
740  { VTABLE, 2, 0,
741 #if BYTEORDER == 1234
742  1
743 #else
744  0
745 #endif
746  },
747  {
748 #define BT_COLON BT_NMSTRT
749 #include "asciitab.h"
750 #undef BT_COLON
751 #include "latin1tab.h"
752  },
753  STANDARD_VTABLE(little2_)
754 };
755 
756 #if BYTEORDER != 4321
757 
758 #ifdef XML_NS
759 
/* Little-endian 2-byte descriptor compiled only when BYTEORDER != 4321
   and XML_NS is defined.  The last header value is always 1 and the first
   part of the byte-type table comes from iasciitab.h instead of
   asciitab.h.
   NOTE(review): presumably the variant used for internally generated
   text - confirm against the users of this table. */
760 static const struct normal_encoding internal_little2_encoding_ns = {
761  { VTABLE, 2, 0, 1 },
762  {
763 #include "iasciitab.h"
764 #include "latin1tab.h"
765  },
766  STANDARD_VTABLE(little2_)
767 };
768 
769 #endif
770 
772  { VTABLE, 2, 0, 1 },
773  {
774 #define BT_COLON BT_NMSTRT
775 #include "iasciitab.h"
776 #undef BT_COLON
777 #include "latin1tab.h"
778  },
779  STANDARD_VTABLE(little2_)
780 };
781 
782 #endif
783 
784 
/* Accessors for 2-byte big-endian input: (p)[0] is the high byte of a
   unit and (p)[1] the low byte.
   - BIG2_BYTE_TYPE: when the high byte is 0 the byte-type table of enc is
     consulted for the low byte, otherwise unicode_byte_type decides.
   - BIG2_BYTE_TO_ASCII: the low byte when the high byte is 0, else -1.
   - BIG2_CHAR_MATCHES: true when the unit equals the character c.
   - BIG2_IS_*_MINBPC: bitmap lookups via UCS2_GET_NAMING.
   Every use of p and c is parenthesized so that argument expressions such
   as `buf + 2` or `x ? a : b` expand correctly, and the encoding is read
   through a const-qualified pointer.  Arguments are evaluated more than
   once. */
#define BIG2_BYTE_TYPE(enc, p) \
  ((p)[0] == 0 \
   ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
   : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
795 
796 #ifdef XML_MIN_SIZE
797 
798 static int PTRFASTCALL
799 big2_byteType(const ENCODING *enc, const char *p)
800 {
801  return BIG2_BYTE_TYPE(enc, p);
802 }
803 
804 static int PTRFASTCALL
805 big2_byteToAscii(const ENCODING *enc, const char *p)
806 {
807  return BIG2_BYTE_TO_ASCII(enc, p);
808 }
809 
810 static int PTRCALL
811 big2_charMatches(const ENCODING *enc, const char *p, int c)
812 {
813  return BIG2_CHAR_MATCHES(enc, p, c);
814 }
815 
816 static int PTRFASTCALL
817 big2_isNameMin(const ENCODING *enc, const char *p)
818 {
819  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
820 }
821 
822 static int PTRFASTCALL
823 big2_isNmstrtMin(const ENCODING *enc, const char *p)
824 {
825  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
826 }
827 
828 #undef VTABLE
829 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
830 
831 #else /* not XML_MIN_SIZE */
832 
833 #undef PREFIX
834 #define PREFIX(ident) big2_ ## ident
835 #define MINBPC(enc) 2
836 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
837 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
838 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
839 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
840 #define IS_NAME_CHAR(enc, p, n) 0
841 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
842 #define IS_NMSTRT_CHAR(enc, p, n) (0)
843 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
844 
845 #define XML_TOK_IMPL_C
846 #include "xmltok_impl.cc"
847 #undef XML_TOK_IMPL_C
848 
849 #undef MINBPC
850 #undef BYTE_TYPE
851 #undef BYTE_TO_ASCII
852 #undef CHAR_MATCHES
853 #undef IS_NAME_CHAR
854 #undef IS_NAME_CHAR_MINBPC
855 #undef IS_NMSTRT_CHAR
856 #undef IS_NMSTRT_CHAR_MINBPC
857 #undef IS_INVALID_CHAR
858 
859 #endif /* not XML_MIN_SIZE */
860 
861 #ifdef XML_NS
862 
/* Big-endian 2-byte encoding descriptor, built only when XML_NS is
   defined.  The last header value is 1 only when BYTEORDER == 4321 and 0
   otherwise.  The byte-type table (asciitab.h followed by latin1tab.h) is
   the one BIG2_BYTE_TYPE consults when a unit's high byte is 0.
   NOTE(review): the meaning of the three trailing header values is defined
   by ENCODING in xmltok.h - confirm there. */
863 static const struct normal_encoding big2_encoding_ns = {
864  { VTABLE, 2, 0,
865 #if BYTEORDER == 4321
866  1
867 #else
868  0
869 #endif
870  },
871  {
872 #include "asciitab.h"
873 #include "latin1tab.h"
874  },
875  STANDARD_VTABLE(big2_)
876 };
877 
878 #endif
879 
/* Big-endian 2-byte encoding descriptor.  The last header value is 1 only
   when BYTEORDER == 4321 and 0 otherwise.  BT_COLON is defined as
   BT_NMSTRT while asciitab.h is included, so whatever that header marks
   as BT_COLON is classified as a name-start character; the rest of the
   table comes from latin1tab.h. */
880 static const struct normal_encoding big2_encoding = {
881  { VTABLE, 2, 0,
882 #if BYTEORDER == 4321
883  1
884 #else
885  0
886 #endif
887  },
888  {
889 #define BT_COLON BT_NMSTRT
890 #include "asciitab.h"
891 #undef BT_COLON
892 #include "latin1tab.h"
893  },
894  STANDARD_VTABLE(big2_)
895 };
896 
897 #if BYTEORDER != 1234
898 
899 #ifdef XML_NS
900 
/* Big-endian 2-byte descriptor compiled only when BYTEORDER != 1234 and
   XML_NS is defined.  The last header value is always 1 and the first
   part of the byte-type table comes from iasciitab.h instead of
   asciitab.h.
   NOTE(review): presumably the variant used for internally generated
   text - confirm against the users of this table. */
901 static const struct normal_encoding internal_big2_encoding_ns = {
902  { VTABLE, 2, 0, 1 },
903  {
904 #include "iasciitab.h"
905 #include "latin1tab.h"
906  },
907  STANDARD_VTABLE(big2_)
908 };
909 
910 #endif
911 
913  { VTABLE, 2, 0, 1 },
914  {
915 #define BT_COLON BT_NMSTRT
916 #include "iasciitab.h"
917 #undef BT_COLON
918 #include "latin1tab.h"
919  },
920  STANDARD_VTABLE(big2_)
921 };
922 
923 #endif
924 
925 #undef PREFIX
926 
927 static int FASTCALL
928 streqci(const char *s1, const char *s2)
929 {
930  for (;;) {
931  char c1 = *s1++;
932  char c2 = *s2++;
933  if (ASCII_a <= c1 && c1 <= ASCII_z)
934  c1 += ASCII_A - ASCII_a;
935  if (ASCII_a <= c2 && c2 <= ASCII_z)
936  c2 += ASCII_A - ASCII_a;
937  if (c1 != c2)
938  return 0;
939  if (!c1)
940  break;
941  }
942  return 1;
943 }
944 
945 static void PTRCALL
946 initUpdatePosition(const ENCODING *enc, const char *ptr,
947  const char *end, POSITION *pos)
948 {
949  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
950 }
951 
952 static int
953 toAscii(const ENCODING *enc, const char *ptr, const char *end)
954 {
955  char buf[1];
956  char *p = buf;
957  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
958  if (p == buf)
959  return -1;
960  else
961  return buf[0];
962 }
963 
964 static int FASTCALL
965 isSpace(int c)
966 {
967  switch (c) {
968  case 0x20:
969  case 0xD:
970  case 0xA:
971  case 0x9:
972  return 1;
973  }
974  return 0;
975 }
976 
977 /* Return 1 if there's just optional white space or there's an S
978  followed by name=val.
979 */
980 static int
981 parsePseudoAttribute(const ENCODING *enc,
982  const char *ptr,
983  const char *end,
984  const char **namePtr,
985  const char **nameEndPtr,
986  const char **valPtr,
987  const char **nextTokPtr)
988 {
989  int c;
990  char open;
991  if (ptr == end) {
992  *namePtr = NULL;
993  return 1;
994  }
995  if (!isSpace(toAscii(enc, ptr, end))) {
996  *nextTokPtr = ptr;
997  return 0;
998  }
999  do {
1000  ptr += enc->minBytesPerChar;
1001  } while (isSpace(toAscii(enc, ptr, end)));
1002  if (ptr == end) {
1003  *namePtr = NULL;
1004  return 1;
1005  }
1006  *namePtr = ptr;
1007  for (;;) {
1008  c = toAscii(enc, ptr, end);
1009  if (c == -1) {
1010  *nextTokPtr = ptr;
1011  return 0;
1012  }
1013  if (c == ASCII_EQUALS) {
1014  *nameEndPtr = ptr;
1015  break;
1016  }
1017  if (isSpace(c)) {
1018  *nameEndPtr = ptr;
1019  do {
1020  ptr += enc->minBytesPerChar;
1021  } while (isSpace(c = toAscii(enc, ptr, end)));
1022  if (c != ASCII_EQUALS) {
1023  *nextTokPtr = ptr;
1024  return 0;
1025  }
1026  break;
1027  }
1028  ptr += enc->minBytesPerChar;
1029  }
1030  if (ptr == *namePtr) {
1031  *nextTokPtr = ptr;
1032  return 0;
1033  }
1034  ptr += enc->minBytesPerChar;
1035  c = toAscii(enc, ptr, end);
1036  while (isSpace(c)) {
1037  ptr += enc->minBytesPerChar;
1038  c = toAscii(enc, ptr, end);
1039  }
1040  if (c != ASCII_QUOT && c != ASCII_APOS) {
1041  *nextTokPtr = ptr;
1042  return 0;
1043  }
1044  open = (char)c;
1045  ptr += enc->minBytesPerChar;
1046  *valPtr = ptr;
1047  for (;; ptr += enc->minBytesPerChar) {
1048  c = toAscii(enc, ptr, end);
1049  if (c == open)
1050  break;
1051  if (!(ASCII_a <= c && c <= ASCII_z)
1052  && !(ASCII_A <= c && c <= ASCII_Z)
1053  && !(ASCII_0 <= c && c <= ASCII_9)
1054  && c != ASCII_PERIOD
1055  && c != ASCII_MINUS
1056  && c != ASCII_UNDERSCORE) {
1057  *nextTokPtr = ptr;
1058  return 0;
1059  }
1060  }
1061  *nextTokPtr = ptr + enc->minBytesPerChar;
1062  return 1;
1063 }
1064 
/* Keyword strings compared against pseudo-attribute names and values in
   doParseXmlDecl.  They are spelled with the ASCII_* constants and an
   explicit terminator rather than as string literals.
   NOTE(review): presumably so that the bytes are ASCII regardless of the
   compiler's execution character set - confirm against ascii.h. */
1065 static const char KW_version[] = {
1066  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1067 };
1068 
1069 static const char KW_encoding[] = {
1070  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1071 };
1072 
1073 static const char KW_standalone[] = {
1074  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
1075  ASCII_n, ASCII_e, '\0'
1076 };
1077 
1078 static const char KW_yes[] = {
1079  ASCII_y, ASCII_e, ASCII_s, '\0'
1080 };
1081 
1082 static const char KW_no[] = {
1083  ASCII_n, ASCII_o, '\0'
1084 };
1085 
1086 static int
1087 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1088  const char *,
1089  const char *),
1090  int isGeneralTextEntity,
1091  const ENCODING *enc,
1092  const char *ptr,
1093  const char *end,
1094  const char **badPtr,
1095  const char **versionPtr,
1096  const char **versionEndPtr,
1097  const char **encodingName,
1098  const ENCODING **encoding,
1099  int *standalone)
1100 {
1101  const char *val = NULL;
1102  const char *name = NULL;
1103  const char *nameEnd = NULL;
1104  ptr += 5 * enc->minBytesPerChar;
1105  end -= 2 * enc->minBytesPerChar;
1106  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1107  || !name) {
1108  *badPtr = ptr;
1109  return 0;
1110  }
1111  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1112  if (!isGeneralTextEntity) {
1113  *badPtr = name;
1114  return 0;
1115  }
1116  }
1117  else {
1118  if (versionPtr)
1119  *versionPtr = val;
1120  if (versionEndPtr)
1121  *versionEndPtr = ptr;
1122  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1123  *badPtr = ptr;
1124  return 0;
1125  }
1126  if (!name) {
1127  if (isGeneralTextEntity) {
1128  /* a TextDecl must have an EncodingDecl */
1129  *badPtr = ptr;
1130  return 0;
1131  }
1132  return 1;
1133  }
1134  }
1135  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1136  int c = toAscii(enc, val, end);
1137  if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1138  *badPtr = val;
1139  return 0;
1140  }
1141  if (encodingName)
1142  *encodingName = val;
1143  if (encoding)
1144  *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1145  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1146  *badPtr = ptr;
1147  return 0;
1148  }
1149  if (!name)
1150  return 1;
1151  }
1152  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1153  || isGeneralTextEntity) {
1154  *badPtr = name;
1155  return 0;
1156  }
1157  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1158  if (standalone)
1159  *standalone = 1;
1160  }
1161  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1162  if (standalone)
1163  *standalone = 0;
1164  }
1165  else {
1166  *badPtr = val;
1167  return 0;
1168  }
1169  while (isSpace(toAscii(enc, ptr, end)))
1170  ptr += enc->minBytesPerChar;
1171  if (ptr != end) {
1172  *badPtr = ptr;
1173  return 0;
1174  }
1175  return 1;
1176 }
1177 
1178 static int FASTCALL
1180 {
1181  switch (result >> 8) {
1182  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1183  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1184  return -1;
1185  case 0:
1186  if (latin1_encoding.type[result] == BT_NONXML)
1187  return -1;
1188  break;
1189  case 0xFF:
1190  if (result == 0xFFFE || result == 0xFFFF)
1191  return -1;
1192  break;
1193  }
1194  return result;
1195 }
1196 
1197 int FASTCALL
1198 XmlUtf8Encode(int c, char *buf)
1199 {
1200  enum {
1201  /* minN is minimum legal resulting value for N byte sequence */
1202  min2 = 0x80,
1203  min3 = 0x800,
1204  min4 = 0x10000
1205  };
1206 
1207  if (c < 0)
1208  return 0;
1209  if (c < min2) {
1210  buf[0] = (char)(c | UTF8_cval1);
1211  return 1;
1212  }
1213  if (c < min3) {
1214  buf[0] = (char)((c >> 6) | UTF8_cval2);
1215  buf[1] = (char)((c & 0x3f) | 0x80);
1216  return 2;
1217  }
1218  if (c < min4) {
1219  buf[0] = (char)((c >> 12) | UTF8_cval3);
1220  buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1221  buf[2] = (char)((c & 0x3f) | 0x80);
1222  return 3;
1223  }
1224  if (c < 0x110000) {
1225  buf[0] = (char)((c >> 18) | UTF8_cval4);
1226  buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1227  buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1228  buf[3] = (char)((c & 0x3f) | 0x80);
1229  return 4;
1230  }
1231  return 0;
1232 }
1233 
1234 int FASTCALL
1235 XmlUtf16Encode(int charNum, unsigned short *buf)
1236 {
1237  if (charNum < 0)
1238  return 0;
1239  if (charNum < 0x10000) {
1240  buf[0] = (unsigned short)charNum;
1241  return 1;
1242  }
1243  if (charNum < 0x110000) {
1244  charNum -= 0x10000;
1245  buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1246  buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1247  return 2;
1248  }
1249  return 0;
1250 }
1251 
1254  CONVERTER convert;
1255  void *userData;
1256  unsigned short utf16[256];
1257  char utf8[256][4];
1258 };
1259 
1260 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
1261 
1262 int
1264 {
1265  return sizeof(struct unknown_encoding);
1266 }
1267 
1268 static int PTRFASTCALL
1269 unknown_isName(const ENCODING *enc, const char *p)
1270 {
1271  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1272  int c = uenc->convert(uenc->userData, p);
1273  if (c & ~0xFFFF)
1274  return 0;
1275  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1276 }
1277 
1278 static int PTRFASTCALL
1279 unknown_isNmstrt(const ENCODING *enc, const char *p)
1280 {
1281  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1282  int c = uenc->convert(uenc->userData, p);
1283  if (c & ~0xFFFF)
1284  return 0;
1285  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1286 }
1287 
1288 static int PTRFASTCALL
1289 unknown_isInvalid(const ENCODING *enc, const char *p)
1290 {
1291  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1292  int c = uenc->convert(uenc->userData, p);
1293  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1294 }
1295 
1296 static void PTRCALL
1297 unknown_toUtf8(const ENCODING *enc,
1298  const char **fromP, const char *fromLim,
1299  char **toP, const char *toLim)
1300 {
1301  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1302  char buf[XML_UTF8_ENCODE_MAX];
1303  for (;;) {
1304  const char *utf8;
1305  int n;
1306  if (*fromP == fromLim)
1307  break;
1308  utf8 = uenc->utf8[(unsigned char)**fromP];
1309  n = *utf8++;
1310  if (n == 0) {
1311  int c = uenc->convert(uenc->userData, *fromP);
1312  n = XmlUtf8Encode(c, buf);
1313  if (n > toLim - *toP)
1314  break;
1315  utf8 = buf;
1316  *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1317  - (BT_LEAD2 - 2));
1318  }
1319  else {
1320  if (n > toLim - *toP)
1321  break;
1322  (*fromP)++;
1323  }
1324  do {
1325  *(*toP)++ = *utf8++;
1326  } while (--n != 0);
1327  }
1328 }
1329 
1330 static void PTRCALL
1331 unknown_toUtf16(const ENCODING *enc,
1332  const char **fromP, const char *fromLim,
1333  unsigned short **toP, const unsigned short *toLim)
1334 {
1335  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336  while (*fromP != fromLim && *toP != toLim) {
1337  unsigned short c = uenc->utf16[(unsigned char)**fromP];
1338  if (c == 0) {
1339  c = (unsigned short)
1340  uenc->convert(uenc->userData, *fromP);
1341  *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1342  - (BT_LEAD2 - 2));
1343  }
1344  else
1345  (*fromP)++;
1346  *(*toP)++ = c;
1347  }
1348 }
1349 
1350 ENCODING *
1352  int *table,
1353  CONVERTER convert,
1354  void *userData)
1355 {
1356  int i;
1357  struct unknown_encoding *e = (struct unknown_encoding *)mem;
1358  for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1359  ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1360  for (i = 0; i < 128; i++)
1361  if (latin1_encoding.type[i] != BT_OTHER
1362  && latin1_encoding.type[i] != BT_NONXML
1363  && table[i] != i)
1364  return 0;
1365  for (i = 0; i < 256; i++) {
1366  int c = table[i];
1367  if (c == -1) {
1368  e->normal.type[i] = BT_MALFORM;
1369  /* This shouldn't really get used. */
1370  e->utf16[i] = 0xFFFF;
1371  e->utf8[i][0] = 1;
1372  e->utf8[i][1] = 0;
1373  }
1374  else if (c < 0) {
1375  if (c < -4)
1376  return 0;
1377  e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1378  e->utf8[i][0] = 0;
1379  e->utf16[i] = 0;
1380  }
1381  else if (c < 0x80) {
1382  if (latin1_encoding.type[c] != BT_OTHER
1383  && latin1_encoding.type[c] != BT_NONXML
1384  && c != i)
1385  return 0;
1386  e->normal.type[i] = latin1_encoding.type[c];
1387  e->utf8[i][0] = 1;
1388  e->utf8[i][1] = (char)c;
1389  e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1390  }
1391  else if (checkCharRefNumber(c) < 0) {
1392  e->normal.type[i] = BT_NONXML;
1393  /* This shouldn't really get used. */
1394  e->utf16[i] = 0xFFFF;
1395  e->utf8[i][0] = 1;
1396  e->utf8[i][1] = 0;
1397  }
1398  else {
1399  if (c > 0xFFFF)
1400  return 0;
1401  if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1402  e->normal.type[i] = BT_NMSTRT;
1403  else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1404  e->normal.type[i] = BT_NAME;
1405  else
1406  e->normal.type[i] = BT_OTHER;
1407  e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1408  e->utf16[i] = (unsigned short)c;
1409  }
1410  }
1411  e->userData = userData;
1412  e->convert = convert;
1413  if (convert) {
1414  e->normal.isName2 = unknown_isName;
1415  e->normal.isName3 = unknown_isName;
1416  e->normal.isName4 = unknown_isName;
1417  e->normal.isNmstrt2 = unknown_isNmstrt;
1418  e->normal.isNmstrt3 = unknown_isNmstrt;
1419  e->normal.isNmstrt4 = unknown_isNmstrt;
1420  e->normal.isInvalid2 = unknown_isInvalid;
1421  e->normal.isInvalid3 = unknown_isInvalid;
1422  e->normal.isInvalid4 = unknown_isInvalid;
1423  }
1424  e->normal.enc.utf8Convert = unknown_toUtf8;
1425  e->normal.enc.utf16Convert = unknown_toUtf16;
1426  return &(e->normal.enc);
1427 }
1428 
/* If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};
1442 
1443 static const char KW_ISO_8859_1[] = {
1444  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
1445  ASCII_MINUS, ASCII_1, '\0'
1446 };
1447 static const char KW_US_ASCII[] = {
1448  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
1449  '\0'
1450 };
1451 static const char KW_UTF_8[] = {
1452  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1453 };
1454 static const char KW_UTF_16[] = {
1455  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1456 };
1457 static const char KW_UTF_16BE[] = {
1458  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
1459  '\0'
1460 };
1461 static const char KW_UTF_16LE[] = {
1462  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
1463  '\0'
1464 };
1465 
1466 static int FASTCALL
1468 {
1469  static const char * const encodingNames[] = {
1470  KW_ISO_8859_1,
1471  KW_US_ASCII,
1472  KW_UTF_8,
1473  KW_UTF_16,
1474  KW_UTF_16BE,
1475  KW_UTF_16LE,
1476  };
1477  int i;
1478  if (name == NULL)
1479  return NO_ENC;
1480  for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1481  if (streqci(name, encodingNames[i]))
1482  return i;
1483  return UNKNOWN_ENC;
1484 }
1485 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* The index argument is parenthesised so that the cast applies to the
   whole expression passed in, not only to its first operand. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1492 
1493 /* This is what detects the encoding. encodingTable maps from
1494  encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1495  the external (protocol) specified encoding; state is
1496  XML_CONTENT_STATE if we're parsing an external text entity, and
1497  XML_PROLOG_STATE otherwise.
1498 */
1499 
1500 
1501 static int
1502 initScan(const ENCODING * const *encodingTable,
1503  const INIT_ENCODING *enc,
1504  int state,
1505  const char *ptr,
1506  const char *end,
1507  const char **nextTokPtr)
1508 {
1509  const ENCODING **encPtr;
1510 
1511  if (ptr == end)
1512  return XML_TOK_NONE;
1513  encPtr = enc->encPtr;
1514  if (ptr + 1 == end) {
1515  /* only a single byte available for auto-detection */
1516 #ifndef XML_DTD /* FIXME */
1517  /* a well-formed document entity must have more than one byte */
1518  if (state != XML_CONTENT_STATE)
1519  return XML_TOK_PARTIAL;
1520 #endif
1521  /* so we're parsing an external text entity... */
1522  /* if UTF-16 was externally specified, then we need at least 2 bytes */
1523  switch (INIT_ENC_INDEX(enc)) {
1524  case UTF_16_ENC:
1525  case UTF_16LE_ENC:
1526  case UTF_16BE_ENC:
1527  return XML_TOK_PARTIAL;
1528  }
1529  switch ((unsigned char)*ptr) {
1530  case 0xFE:
1531  case 0xFF:
1532  case 0xEF: /* possibly first byte of UTF-8 BOM */
1533  if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1534  && state == XML_CONTENT_STATE)
1535  break;
1536  /* fall through */
1537  case 0x00:
1538  case 0x3C:
1539  return XML_TOK_PARTIAL;
1540  }
1541  }
1542  else {
1543  switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1544  case 0xFEFF:
1545  if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1546  && state == XML_CONTENT_STATE)
1547  break;
1548  *nextTokPtr = ptr + 2;
1549  *encPtr = encodingTable[UTF_16BE_ENC];
1550  return XML_TOK_BOM;
1551  /* 00 3C is handled in the default case */
1552  case 0x3C00:
1553  if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1554  || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1555  && state == XML_CONTENT_STATE)
1556  break;
1557  *encPtr = encodingTable[UTF_16LE_ENC];
1558  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1559  case 0xFFFE:
1560  if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1561  && state == XML_CONTENT_STATE)
1562  break;
1563  *nextTokPtr = ptr + 2;
1564  *encPtr = encodingTable[UTF_16LE_ENC];
1565  return XML_TOK_BOM;
1566  case 0xEFBB:
1567  /* Maybe a UTF-8 BOM (EF BB BF) */
1568  /* If there's an explicitly specified (external) encoding
1569  of ISO-8859-1 or some flavour of UTF-16
1570  and this is an external text entity,
1571  don't look for the BOM,
1572  because it might be a legal data.
1573  */
1574  if (state == XML_CONTENT_STATE) {
1575  int e = INIT_ENC_INDEX(enc);
1576  if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1577  || e == UTF_16LE_ENC || e == UTF_16_ENC)
1578  break;
1579  }
1580  if (ptr + 2 == end)
1581  return XML_TOK_PARTIAL;
1582  if ((unsigned char)ptr[2] == 0xBF) {
1583  *nextTokPtr = ptr + 3;
1584  *encPtr = encodingTable[UTF_8_ENC];
1585  return XML_TOK_BOM;
1586  }
1587  break;
1588  default:
1589  if (ptr[0] == '\0') {
1590  /* 0 isn't a legal data character. Furthermore a document
1591  entity can only start with ASCII characters. So the only
1592  way this can fail to be big-endian UTF-16 if it it's an
1593  external parsed general entity that's labelled as
1594  UTF-16LE.
1595  */
1596  if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1597  break;
1598  *encPtr = encodingTable[UTF_16BE_ENC];
1599  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1600  }
1601  else if (ptr[1] == '\0') {
1602  /* We could recover here in the case:
1603  - parsing an external entity
1604  - second byte is 0
1605  - no externally specified encoding
1606  - no encoding declaration
1607  by assuming UTF-16LE. But we don't, because this would mean when
1608  presented just with a single byte, we couldn't reliably determine
1609  whether we needed further bytes.
1610  */
1611  if (state == XML_CONTENT_STATE)
1612  break;
1613  *encPtr = encodingTable[UTF_16LE_ENC];
1614  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1615  }
1616  break;
1617  }
1618  }
1619  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1620  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1621 }
1622 
1623 
1624 #define NS(x) x
1625 #define ns(x) x
1626 #define XML_TOK_NS_C
1627 #include "xmltok_ns.cc"
1628 #undef XML_TOK_NS_C
1629 #undef NS
1630 #undef ns
1631 
1632 #ifdef XML_NS
1633 
1634 #define NS(x) x ## NS
1635 #define ns(x) x ## _ns
1636 
1637 #define XML_TOK_NS_C
1638 #include "xmltok_ns.cc"
1639 #undef XML_TOK_NS_C
1640 
1641 #undef NS
1642 #undef ns
1643 
1644 ENCODING *
1645 XmlInitUnknownEncodingNS(void *mem,
1646  int *table,
1647  CONVERTER convert,
1648  void *userData)
1649 {
1650  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1651  if (enc)
1652  ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1653  return enc;
1654 }
1655 
1656 #endif /* XML_NS */
static const char KW_ISO_8859_1[]
Definition: xmltok.cc:1443
static int PTRFASTCALL unicode_byte_type(char hi, char lo)
Definition: xmltok.cc:525
#define AS_NORMAL_ENCODING(enc)
Definition: xmltok.cc:202
static c2_factory< G4double > c2
CONVERTER convert
Definition: xmltok.cc:1254
#define INIT_ENC_INDEX(enc)
Definition: xmltok.cc:1490
static void PTRCALL latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim)
Definition: xmltok.cc:434
static int PTRFASTCALL utf8_isNmstrt3(const ENCODING *enc, const char *p)
Definition: xmltok.cc:156
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
Definition: xmltok.cc:652
static int toAscii(const ENCODING *enc, const char *ptr, const char *end)
Definition: xmltok.cc:953
static void PTRCALL initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end, POSITION *pos)
Definition: xmltok.cc:946
#define LITTLE2_BYTE_TYPE(enc, p)
Definition: xmltok.cc:644
#define UTF8_INVALID2(p)
Definition: xmltok.cc:96
static const struct normal_encoding utf8_encoding
Definition: xmltok.cc:398
G4String name
Definition: TRTMaterials.hh:40
#define SB_BYTE_TYPE(enc, p)
Definition: xmltok.cc:247
#define NORMAL_VTABLE(E)
Definition: xmltok.cc:219
static int PTRFASTCALL unknown_isName(const ENCODING *enc, const char *p)
Definition: xmltok.cc:1269
static int initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc, int state, const char *ptr, const char *end, const char **nextTokPtr)
Definition: xmltok.cc:1502
#define UCS2_GET_NAMING(pages, hi, lo)
Definition: xmltok.cc:53
static const char KW_UTF_8[]
Definition: xmltok.cc:1451
#define UTF8_GET_NAMING2(pages, byte)
Definition: xmltok.cc:60
#define AS_UNKNOWN_ENCODING(enc)
Definition: xmltok.cc:1260
static int PTRFASTCALL unknown_isInvalid(const ENCODING *enc, const char *p)
Definition: xmltok.cc:1289
static int FASTCALL checkCharRefNumber(int)
Definition: xmltok.cc:1179
static const char KW_version[]
Definition: xmltok.cc:1065
int FASTCALL XmlUtf16Encode(int charNum, unsigned short *buf)
Definition: xmltok.cc:1235
#define userData
Definition: xmlparse.cc:572
int XmlSizeOfUnknownEncoding(void)
Definition: xmltok.cc:1263
#define LITTLE2_BYTE_TO_ASCII(enc, p)
Definition: xmltok.cc:648
static int PTRFASTCALL utf8_isInvalid3(const ENCODING *enc, const char *p)
Definition: xmltok.cc:170
#define LITTLE2_CHAR_MATCHES(enc, p, c)
Definition: xmltok.cc:649
static const char KW_encoding[]
Definition: xmltok.cc:1069
static const struct normal_encoding big2_encoding
Definition: xmltok.cc:880
int FASTCALL XmlUtf8Encode(int c, char *buf)
Definition: xmltok.cc:1198
int(PTRFASTCALL *isName2)(const ENCODING *
static void PTRCALL unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim)
Definition: xmltok.cc:1297
#define UTF8_INVALID3(p)
Definition: xmltok.cc:99
static const struct normal_encoding internal_utf8_encoding
Definition: xmltok.cc:422
static const struct normal_encoding ascii_encoding
Definition: xmltok.cc:513
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p)
Definition: xmltok.cc:791
static int min3(int a, int b, int c)
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
Definition: xmltok.cc:793
static int PTRFASTCALL unknown_isNmstrt(const ENCODING *enc, const char *p)
Definition: xmltok.cc:1279
static int FASTCALL getEncodingIndex(const char *name)
Definition: xmltok.cc:1467
static int FASTCALL isSpace(int c)
Definition: xmltok.cc:965
static const struct normal_encoding latin1_encoding
Definition: xmltok.cc:480
static const struct normal_encoding internal_little2_encoding
Definition: xmltok.cc:771
static const char KW_UTF_16[]
Definition: xmltok.cc:1454
static int PTRFASTCALL utf8_isInvalid2(const ENCODING *enc, const char *p)
Definition: xmltok.cc:164
void * userData
Definition: xmltok.cc:1255
static int FASTCALL streqci(const char *s1, const char *s2)
Definition: xmltok.cc:928
static const struct normal_encoding internal_big2_encoding
Definition: xmltok.cc:912
#define BIG2_CHAR_MATCHES(enc, p, c)
Definition: xmltok.cc:790
static const char KW_no[]
Definition: xmltok.cc:1082
const G4int n
static const G4double c1
static int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *, const char *), int isGeneralTextEntity, const ENCODING *enc, const char *ptr, const char *end, const char **badPtr, const char **versionPtr, const char **versionEndPtr, const char **encodingName, const ENCODING **encoding, int *standalone)
Definition: xmltok.cc:1087
#define encoding
Definition: xmlparse.cc:605
#define BIG2_BYTE_TO_ASCII(enc, p)
Definition: xmltok.cc:789
char utf8[256][4]
Definition: xmltok.cc:1257
static int PTRFASTCALL utf8_isInvalid4(const ENCODING *enc, const char *p)
Definition: xmltok.cc:176
static int PTRFASTCALL isNever(const ENCODING *enc, const char *p)
Definition: xmltok.cc:130
#define DEFINE_UTF16_TO_UTF8(E)
Definition: xmltok.cc:543
static int PTRFASTCALL utf8_isName3(const ENCODING *enc, const char *p)
Definition: xmltok.cc:142
static const char KW_yes[]
Definition: xmltok.cc:1078
#define UTF8_GET_NAMING3(pages, byte)
Definition: xmltok.cc:71
static void PTRCALL utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim)
Definition: xmltok.cc:327
static int PTRFASTCALL utf8_isNmstrt2(const ENCODING *enc, const char *p)
Definition: xmltok.cc:150
static void PTRCALL latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim)
Definition: xmltok.cc:459
ENCODING enc
Definition: xmltok.cc:182
#define BIG2_BYTE_TYPE(enc, p)
Definition: xmltok.cc:785
#define XmlInitUnknownEncodingNS
Definition: xmlparse.cc:57
struct normal_encoding normal
Definition: xmltok.cc:1253
ENCODING * XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert, void *userData)
Definition: xmltok.cc:1351
static const char KW_UTF_16BE[]
Definition: xmltok.cc:1457
static int PTRFASTCALL utf8_isName2(const ENCODING *enc, const char *p)
Definition: xmltok.cc:136
unsigned short utf16[256]
Definition: xmltok.cc:1256
static const char KW_US_ASCII[]
Definition: xmltok.cc:1447
static const char KW_UTF_16LE[]
Definition: xmltok.cc:1461
#define VTABLE
Definition: xmltok.cc:51
static void PTRCALL utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim)
Definition: xmltok.cc:346
unsigned char type[256]
Definition: xmltok.cc:183
static int parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end, const char **namePtr, const char **nameEndPtr, const char **valPtr, const char **nextTokPtr)
Definition: xmltok.cc:981
static const struct normal_encoding little2_encoding
Definition: xmltok.cc:739
static void PTRCALL unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim)
Definition: xmltok.cc:1331
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
Definition: xmltok.cc:650
#define UTF8_INVALID4(p)
Definition: xmltok.cc:116
static const char KW_standalone[]
Definition: xmltok.cc:1073
static void PTRCALL ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim)
Definition: xmltok.cc:492
#define VTABLE1
Definition: xmltok.cc:37
#define DEFINE_UTF16_TO_UTF16(E)
Definition: xmltok.cc:606
static const G4double pos
#define STANDARD_VTABLE(E)
Definition: xmltok.cc:215
#define BT_COLON