| 1 |
(* Character-class numbers.  They are numbered consecutively from 0 to 39. *)

(* Generic classes. *)
let eof = 0
let encoding_error = 1
let xml_char = 2
let blank = 3
let lowercase = 4
let uppercase = 5
let ascii_digit = 6

(* One class per punctuation character.  The suffix of each name is the
   hexadecimal code of the character; one_char_classes pairs every code
   with the same class number. *)
let char_23 = 7   (* # *)
let char_5f = 8   (* _ *)
let char_3c = 9   (* < *)
let char_3e = 10  (* > *)
let char_3d = 11  (* = *)
let char_2e = 12  (* . *)
let char_2c = 13  (* , *)
let char_3a = 14  (* : *)
let char_3b = 15  (* ; *)
let char_2b = 16  (* + *)
let char_2d = 17  (* - *)
let char_2a = 18  (* asterisk *)
let char_2f = 19  (* / *)
let char_40 = 20  (* @ *)
let char_26 = 21  (* & *)
let char_7b = 22  (* { *)
let char_7d = 23  (* } *)
let char_5b = 24  (* [ *)
let char_5d = 25  (* ] *)
let char_28 = 26  (* left parenthesis *)
let char_29 = 27  (* right parenthesis *)
let char_7c = 28  (* | *)
let char_3f = 29  (* ? *)
let char_60 = 30  (* backquote *)
let char_22 = 31  (* double quote *)
let char_5c = 32  (* backslash *)
let char_27 = 33  (* single quote *)
let char_21 = 34  (* ! *)

(* Classes whose members are listed as code-point ranges in classes. *)
let unicode_base_char = 35
let unicode_ideographic = 36
let unicode_combining_char = 37
let unicode_digit = 38
let unicode_extender = 39
|
| 41 |
|
| 42 |
(* Characters that each have a class of their own, given as
   (code point, class number) pairs.  The class numbers run from 7 to 34. *)
let one_char_classes =
  let entry ch cls = (Char.code ch, cls) in
  [ entry '#' 7;
    entry '_' 8;
    entry '<' 9;
    entry '>' 10;
    entry '=' 11;
    entry '.' 12;
    entry ',' 13;
    entry ':' 14;
    entry ';' 15;
    entry '+' 16;
    entry '-' 17;
    entry '*' 18;
    entry '/' 19;
    entry '@' 20;
    entry '&' 21;
    entry '{' 22;
    entry '}' 23;
    entry '[' 24;
    entry ']' 25;
    entry '(' 26;
    entry ')' 27;
    entry '|' 28;
    entry '?' 29;
    entry '`' 30;
    entry '"' 31;
    entry '\\' 32;
    entry '\'' 33;
    entry '!' 34;
  ]
|
| 72 |
|
| 73 |
(* Number of character classes: the class constants defined in this file
   run from 0 (eof) to 39 (unicode_extender).  table asserts that this
   count is at most 256 before storing class numbers as characters. *)
let nb_classes = 40
|
| 74 |
|
| 75 |
# 17 "parser/wlexer.mll"
|
| 76 |
|
| 77 |
(* Registered keywords.  register_kw adds entries and the identifier rule
   of token tests membership; the bound values are always unit. *)
let keywords = Hashtbl.create 17
|
| 78 |
|
| 79 |
(* Set to true by token just before it calls comment, and back to false
   once comment returns. *)
let in_comment = ref false
|
| 80 |
|
| 81 |
(* Shorthand used by the lexer rules to report an error; the rules call it
   with a start offset, an end offset and an exception. *)
let error = Location.raise_loc
|
| 82 |
(* Raised through error by the fallback rule of token, which passes the
   first character of the lexeme.  The escape-related rules raise it with
   a backslash as payload. *)
exception Illegal_character of char
|
| 83 |
(* Reported by comment; located at the start of the innermost comment
   still open. *)
exception Unterminated_comment
|
| 84 |
(* Reported by string; located at string_start_pos. *)
exception Unterminated_string
|
| 85 |
(* Reported by comment when a string opened inside a comment ends with
   Unterminated_string. *)
exception Unterminated_string_in_comment
|
| 86 |
|
| 87 |
|
| 88 |
(* Buffer for string literals (always encoded in UTF8). *)
|
| 89 |
|
| 90 |
(* Accumulator shared by the string-related rules. *)
let string_buff = Buffer.create 1024

(* Appends a single character to the accumulator. *)
let store_ascii ch = Buffer.add_char string_buff ch

(* Appends a whole string to the accumulator. *)
let store_char text = Buffer.add_string string_buff text

(* Appends a character given by its integer code, through
   Encodings.Utf8.store. *)
let store_code = Encodings.Utf8.store string_buff

(* Returns everything accumulated so far and empties the accumulator. *)
let get_stored_string () =
  let contents = Buffer.contents string_buff in
  Buffer.clear string_buff;
  contents
|
| 99 |
(* Appends to the string buffer the control character named by a one-letter
   escape: n, r and t give newline, carriage return and tab.  Any other
   letter raises Illegal_character with a backslash as payload. *)
let store_special escape =
  match escape with
  | 'n' -> store_ascii '\n'
  | 'r' -> store_ascii '\r'
  | 't' -> store_ascii '\t'
  | _ -> raise (Illegal_character '\\')
|
| 104 |
|
| 105 |
(* Offset at which the string literal currently being lexed started. *)
let string_start_pos : int ref = ref 0

(* Start offsets of the comments currently open, innermost first. *)
let comment_start_pos = (ref [] : int list ref)
|
| 107 |
|
| 108 |
(* Converts the text of a decimal escape to its integer value.  The first
   and last characters of [s] are dropped and the remainder is handed to
   int_of_string, which raises Failure if it is not a valid integer. *)
let decimal_char s =
  let digit_count = String.length s - 2 in
  let digits = String.sub s 1 digit_count in
  int_of_string digits
|
| 110 |
|
| 111 |
|
| 112 |
(* Value (0..15) of a hexadecimal digit.  Both lower-case and upper-case
   letters are accepted.
   Raises Failure "Invalid hexadecimal digit" on any other character. *)
let hexa_digit = function
  | '0'..'9' as c -> Char.code c - Char.code '0'
  | 'a'..'f' as c -> Char.code c - Char.code 'a' + 10
  | 'A'..'F' as c -> Char.code c - Char.code 'A' + 10
  | _ -> failwith "Invalid hexadecimal digit" (* TODO: error loc *)
|
| 116 |
|
| 117 |
|
| 118 |
(* Reads [s] as a hexadecimal number, ignoring its last character (the
   caller passes the lexeme matched by parse_hexa_char).  Each digit is
   decoded with hexa_digit, whose Failure propagates on a bad digit. *)
let hexa_char s =
  let stop = String.length s - 1 in
  let rec go pos value =
    if pos = stop then value
    else go (pos + 1) (value * 16 + hexa_digit s.[pos])
  in
  go 0 0
|
| 124 |
|
| 125 |
(* Transition tables of the lexing automaton.  The line directives in this
   file point at parser/wlexer.mll, so these tables are presumably generated
   from that source and should not be edited by hand.  The integer passed to
   the engine next to lex_tables selects the entry point: 0 is used by
   token, 1 by comment, 2 by string and 3 by parse_hexa_char. *)
let lex_tables = {
|
| 126 |
Lexing.lex_base =
|
| 127 |
"\000\000\023\000\011\000\015\000\254\255\042\000\046\000\255\255\
|
| 128 |
\250\255\249\255\255\255\041\000\253\255\019\000\252\255\252\255\
|
| 129 |
\251\255\000\000\002\000\253\255\246\255\245\255\010\000\054\000\
|
| 130 |
\018\000\059\000\021\000\059\000\250\255\026\000\064\000\036\000\
|
| 131 |
\067\000\070\000\024\000\027\000\047\000\054\000\248\255\250\255\
|
| 132 |
\247\255\073\000\063\000\093\000\103\000\098\000\139\000\079\000\
|
| 133 |
";
|
| 134 |
Lexing.lex_backtrk =
|
| 135 |
"\255\255\255\255\255\255\255\255\255\255\001\000\255\255\255\255\
|
| 136 |
\255\255\255\255\255\255\004\000\255\255\255\255\255\255\255\255\
|
| 137 |
\255\255\004\000\004\000\255\255\255\255\255\255\000\000\001\000\
|
| 138 |
\004\000\005\000\005\000\005\000\255\255\005\000\005\000\005\000\
|
| 139 |
\005\000\005\000\005\000\005\000\005\000\005\000\255\255\255\255\
|
| 140 |
\255\255\004\000\255\255\006\000\001\000\255\255\001\000\000\000\
|
| 141 |
";
|
| 142 |
Lexing.lex_default =
|
| 143 |
"\028\000\016\000\009\000\004\000\000\000\255\255\255\255\000\000\
|
| 144 |
\000\000\000\000\000\000\255\255\000\000\255\255\000\000\000\000\
|
| 145 |
\000\000\255\255\255\255\000\000\000\000\000\000\255\255\255\255\
|
| 146 |
\255\255\255\255\255\255\255\255\000\000\255\255\255\255\255\255\
|
| 147 |
\255\255\255\255\255\255\255\255\255\255\255\255\000\000\000\000\
|
| 148 |
\000\000\255\255\255\255\255\255\255\255\255\255\255\255\255\255\
|
| 149 |
";
|
| 150 |
Lexing.lex_trans =
|
| 151 |
"\020\000\021\000\021\000\022\000\023\000\023\000\024\000\025\000\
|
| 152 |
\023\000\026\000\027\000\008\000\029\000\047\000\030\000\031\000\
|
| 153 |
\032\000\033\000\034\000\005\000\007\000\005\000\035\000\015\000\
|
| 154 |
\041\000\013\000\036\000\004\000\037\000\034\000\039\000\038\000\
|
| 155 |
\039\000\038\000\014\000\023\000\023\000\021\000\021\000\021\000\
|
| 156 |
\042\000\017\000\010\000\011\000\010\000\012\000\006\000\013\000\
|
| 157 |
\006\000\018\000\006\000\039\000\006\000\039\000\019\000\039\000\
|
| 158 |
\019\000\007\000\044\000\044\000\044\000\007\000\044\000\043\000\
|
| 159 |
\043\000\040\000\044\000\043\000\045\000\039\000\039\000\044\000\
|
| 160 |
\004\000\004\000\004\000\039\000\041\000\039\000\039\000\041\000\
|
| 161 |
\039\000\014\000\047\000\039\000\000\000\000\000\000\000\039\000\
|
| 162 |
\000\000\044\000\044\000\044\000\044\000\044\000\043\000\043\000\
|
| 163 |
\039\000\043\000\043\000\043\000\000\000\043\000\046\000\046\000\
|
| 164 |
\000\000\043\000\046\000\044\000\044\000\044\000\043\000\044\000\
|
| 165 |
\000\000\000\000\000\000\044\000\012\000\045\000\000\000\000\000\
|
| 166 |
\044\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
|
| 167 |
\043\000\043\000\043\000\043\000\043\000\046\000\046\000\000\000\
|
| 168 |
\000\000\000\000\044\000\044\000\044\000\044\000\044\000\046\000\
|
| 169 |
\046\000\046\000\000\000\046\000\000\000\000\000\000\000\046\000\
|
| 170 |
\000\000\000\000\000\000\000\000\046\000\000\000\000\000\000\000\
|
| 171 |
\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
|
| 172 |
\000\000\000\000\000\000\000\000\000\000\000\000\046\000\046\000\
|
| 173 |
\046\000\046\000\046\000";
|
| 174 |
Lexing.lex_check =
|
| 175 |
"\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
|
| 176 |
\000\000\000\000\000\000\002\000\000\000\022\000\000\000\000\000\
|
| 177 |
\000\000\000\000\000\000\003\000\018\000\003\000\000\000\001\000\
|
| 178 |
\024\000\013\000\000\000\017\000\000\000\000\000\026\000\000\000\
|
| 179 |
\026\000\000\000\013\000\000\000\000\000\000\000\000\000\000\000\
|
| 180 |
\029\000\001\000\002\000\002\000\002\000\011\000\005\000\011\000\
|
| 181 |
\005\000\001\000\006\000\031\000\006\000\034\000\001\000\035\000\
|
| 182 |
\001\000\005\000\023\000\023\000\023\000\006\000\023\000\025\000\
|
| 183 |
\025\000\036\000\023\000\025\000\023\000\027\000\027\000\023\000\
|
| 184 |
\011\000\011\000\011\000\030\000\033\000\037\000\030\000\041\000\
|
| 185 |
\033\000\042\000\047\000\032\000\255\255\255\255\255\255\033\000\
|
| 186 |
\255\255\023\000\023\000\023\000\023\000\023\000\025\000\025\000\
|
| 187 |
\032\000\043\000\043\000\043\000\255\255\043\000\045\000\045\000\
|
| 188 |
\255\255\043\000\045\000\044\000\044\000\044\000\043\000\044\000\
|
| 189 |
\255\255\255\255\255\255\044\000\045\000\044\000\255\255\255\255\
|
| 190 |
\044\000\255\255\255\255\255\255\255\255\255\255\255\255\255\255\
|
| 191 |
\043\000\043\000\043\000\043\000\043\000\045\000\045\000\255\255\
|
| 192 |
\255\255\255\255\044\000\044\000\044\000\044\000\044\000\046\000\
|
| 193 |
\046\000\046\000\255\255\046\000\255\255\255\255\255\255\046\000\
|
| 194 |
\255\255\255\255\255\255\255\255\046\000\255\255\255\255\255\255\
|
| 195 |
\255\255\255\255\255\255\255\255\255\255\255\255\255\255\255\255\
|
| 196 |
\255\255\255\255\255\255\255\255\255\255\255\255\046\000\046\000\
|
| 197 |
\046\000\046\000\046\000"
|
| 198 |
}
|
| 199 |
|
| 200 |
(* Main lexer entry (automaton entry 0).  [engine] is applied to the tables,
   the entry number and [lexbuf], and returns the number of the rule that
   matched.  The result is a (constructor name, text) pair.  Rules 0 and 8
   yield no token of their own: lexing resumes right after them. *)
let rec token engine lexbuf =
  match engine lex_tables 0 lexbuf with
  | 0 -> (
# 78 "parser/wlexer.mll"
      token engine lexbuf )
  | 1 -> (
# 80 "parser/wlexer.mll"
      (* Identifier: upper-case initial, registered keyword, or plain
         lower-case identifier. *)
      let ident = Lexing.lexeme lexbuf in
      let kind =
        match ident.[0] with
        | 'A' .. 'Z' -> "UIDENT"
        | _ -> if Hashtbl.mem keywords ident then "" else "LIDENT"
      in
      (kind, ident) )
  | 2 -> (
# 87 "parser/wlexer.mll"
      (* The last two characters of the lexeme are not part of the name. *)
      let matched = Lexing.lexeme lexbuf in
      let kept = String.length matched - 2 in
      ("ANY_IN_NS", String.sub matched 0 kept) )
  | 3 -> (
# 92 "parser/wlexer.mll"
      ("ANY_IN_NS", "") )
  | 4 -> (
# 94 "parser/wlexer.mll"
      ("INT", Lexing.lexeme lexbuf) )
  | 5 -> (
# 99 "parser/wlexer.mll"
      ("", Lexing.lexeme lexbuf) )
  | 6 -> (
# 100 "parser/wlexer.mll"
      ("DIRECTIVE", Lexing.lexeme lexbuf) )
  | 7 -> (
# 102 "parser/wlexer.mll"
      (* Opening quote of a string literal: the quote itself is the
         terminator handed to string. *)
      let opening_pos = Lexing.lexeme_start lexbuf in
      string_start_pos := opening_pos;
      let tag =
        if Lexing.lexeme_char lexbuf 0 = '"' then "STRING2" else "STRING1"
      in
      string (Lexing.lexeme lexbuf) engine lexbuf;
      (* Move the lexeme start back to the opening quote, presumably so the
         reported location covers the whole literal. *)
      lexbuf.Lexing.lex_start_pos <- opening_pos - lexbuf.Lexing.lex_abs_pos;
      let contents = get_stored_string () in
      (tag, contents) )
  | 8 -> (
# 112 "parser/wlexer.mll"
      (* Comment opener: skip the comment, then lex the next token. *)
      comment_start_pos := [Lexing.lexeme_start lexbuf];
      in_comment := true;
      comment engine lexbuf;
      in_comment := false;
      token engine lexbuf )
  | 9 -> (
# 119 "parser/wlexer.mll"
      ("EOI", "") )
  | 10 -> (
# 121 "parser/wlexer.mll"
      let first_char = (Lexing.lexeme lexbuf).[0] in
      error
        (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
        (Illegal_character first_char) )
  | _ -> failwith "lexing: empty token [token]"
|
| 258 |
|
| 259 |
(* Skips the body of a comment (automaton entry 1).  comment_start_pos holds
   the start offsets of the comments still open, innermost first; the
   function returns once that list becomes empty. *)
and comment engine lexbuf =
  match engine lex_tables 1 lexbuf with
  | 0 -> (
# 127 "parser/wlexer.mll"
      (* Nested opener: remember where it starts. *)
      let opened_at = Lexing.lexeme_start lexbuf in
      comment_start_pos := opened_at :: !comment_start_pos;
      comment engine lexbuf )
  | 1 -> (
# 131 "parser/wlexer.mll"
      (* Closer: drop the innermost comment and continue if one remains. *)
      let still_open = List.tl !comment_start_pos in
      comment_start_pos := still_open;
      (match still_open with
       | [] -> ()
       | _ :: _ -> comment engine lexbuf) )
  | 2 -> (
# 135 "parser/wlexer.mll"
      (* String inside a comment: lex it so its contents cannot close the
         comment, then discard what was stored. *)
      string_start_pos := Lexing.lexeme_start lexbuf;
      Buffer.clear string_buff;
      let quote = Lexing.lexeme lexbuf in
      begin
        try string quote engine lexbuf
        with Location.Location (_, _, Unterminated_string) ->
          let opened_at = List.hd !comment_start_pos in
          error opened_at (opened_at + 2) Unterminated_string_in_comment
      end;
      Buffer.clear string_buff;
      comment engine lexbuf )
  | 3 -> (
# 145 "parser/wlexer.mll"
      let opened_at = List.hd !comment_start_pos in
      error opened_at (opened_at + 2) Unterminated_comment )
  | 4 -> (
# 149 "parser/wlexer.mll"
      comment engine lexbuf )
  | _ -> failwith "lexing: empty token [comment]"
|
| 291 |
|
| 292 |
(* Lexes the body of a string literal (automaton entry 2) and accumulates
   its contents in the string buffer.  [ender] is the lexeme that opened the
   literal; lexing stops when rule 0 matches that same lexeme again. *)
and string ender engine lexbuf =
  match engine lex_tables 2 lexbuf with
  | 0 -> (
# 153 "parser/wlexer.mll"
      (* A lexeme different from [ender] is ordinary content. *)
      let quote = Lexing.lexeme lexbuf in
      if quote <> ender then begin
        store_char quote;
        string ender engine lexbuf
      end )
  | 1 -> (
# 158 "parser/wlexer.mll"
      (* Two-character escape: keep the second character as is. *)
      store_ascii (Lexing.lexeme_char lexbuf 1);
      string ender engine lexbuf )
  | 2 -> (
# 161 "parser/wlexer.mll"
      (* Letter escape: x introduces a hexadecimal code, any other letter
         goes through store_special. *)
      (match Lexing.lexeme_char lexbuf 1 with
       | 'x' -> parse_hexa_char engine lexbuf
       | letter -> store_special letter);
      string ender engine lexbuf )
  | 3 -> (
# 167 "parser/wlexer.mll"
      let code = decimal_char (Lexing.lexeme lexbuf) in
      store_code code;
      string ender engine lexbuf )
  | 4 -> (
# 170 "parser/wlexer.mll"
      error
        (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
        (Illegal_character '\\') )
  | 5 -> (
# 174 "parser/wlexer.mll"
      let opened_at = !string_start_pos in
      error opened_at (opened_at + 1) Unterminated_string )
  | 6 -> (
# 176 "parser/wlexer.mll"
      let code = Char.code (Lexing.lexeme_char lexbuf 0) in
      store_code code;
      (* Adapt when source is UTF8 *)
      string ender engine lexbuf )
  | _ -> failwith "lexing: empty token [string ender]"
|
| 329 |
|
| 330 |
(* Lexes the remainder of a hexadecimal escape (automaton entry 3).  Rule 0
   stores the character whose code is given by the lexeme; rule 1 reports an
   illegal escape at the current lexeme. *)
and parse_hexa_char engine lexbuf =
  match engine lex_tables 3 lexbuf with
  | 0 -> (
# 182 "parser/wlexer.mll"
      let code = hexa_char (Lexing.lexeme lexbuf) in
      store_code code )
  | 1 -> (
# 184 "parser/wlexer.mll"
      error
        (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
        (Illegal_character '\\') )
  | _ -> failwith "lexing: empty token [parse_hexa_char]"
|
| 341 |
|
| 342 |
;;
|
| 343 |
|
| 344 |
# 190 "parser/wlexer.mll"
|
| 345 |
|
| 346 |
|
| 347 |
(* Offset that lexer_func_of_wlex adds to the locations of the tokens it
   produces.  It is read, then reset to 0, each time a new lexer function
   is built. *)
let delta_loc = ref 0

(* Sets the offset used by the next lexer function to be built. *)
let set_delta_loc offset = delta_loc := offset
|
| 349 |
|
| 350 |
(* For synchronization on errors in the toplevel ... *)
|
| 351 |
(* Issue: file inclusion *)
|
| 352 |
(* Lexing buffer created by the most recent call to lexer_func_of_wlex;
   None until that function has been called. *)
let lexbuf = ref None
|
| 353 |
(* Last token returned by the lexer function; initially a pair of empty
   strings. *)
let last_tok = ref ("","")
|
| 354 |
|
| 355 |
(* Builds a token stream (with locations) from the character stream [cs].
   [lexfun] is the lexer entry to run (it receives [lexengine] and the
   lexing buffer).  The current value of delta_loc is captured and added to
   both ends of every token location, and delta_loc is then reset to 0.
   The buffer is also recorded in lexbuf and every token in last_tok. *)
let lexer_func_of_wlex lexfun lexengine cs =
  let shift = !delta_loc in
  delta_loc := 0;
  (* Refill one character at a time; returning 0 signals end of input. *)
  let refill buf _len =
    try buf.[0] <- Stream.next cs; 1
    with Stream.Failure -> 0
  in
  let lb = Lexing.from_function refill in
  lexbuf := Some lb;
  let next () =
    let tok = lexfun lexengine lb in
    let first = Lexing.lexeme_start lb + shift in
    let last = Lexing.lexeme_end lb + shift in
    last_tok := tok;
    (tok, (first, last))
  in
  Token.make_stream_and_location next
|
| 373 |
|
| 374 |
(* Records a token pattern used by the grammar as a keyword.  Only patterns
   with an empty constructor name and a text starting with a lower-case
   ASCII letter are recorded; everything else is ignored.  An empty text is
   ignored as well instead of raising Invalid_argument on [s2.[0]]. *)
let register_kw (s1,s2) =
  if s1 = "" && s2 <> "" then
    match s2.[0] with
    | 'a' .. 'z' ->
        (* Keep a single binding per keyword. *)
        if not (Hashtbl.mem keywords s2) then Hashtbl.add keywords s2 ()
    | _ -> ()
|
| 380 |
|
| 381 |
|
| 382 |
(* Packages a lexer entry point and an engine as a Token lexer record.
   Keywords are registered through register_kw, removal requests are
   ignored, and matching and printing use the defaults from Token. *)
let lexer lexfun lexengine =
  let tok_func = lexer_func_of_wlex lexfun lexengine in
  let ignore_removal _ = () in
  { Token.tok_func = tok_func;
    Token.tok_using = register_kw;
    Token.tok_removing = ignore_removal;
    Token.tok_match = Token.default_match;
    Token.tok_text = Token.lexer_text }
|
| 390 |
|
| 391 |
(* Members of the multi-character classes, as a list of
   (class number, list of (first, last) code-point ranges).  table fills
   each range with both bounds included.  The two ranges left in comments
   below (0xAC00-0xD7A3 and 0x4E00-0x9FA5) are not stored here; the
   function passed to the engine in utf8_engine classifies them instead. *)
let classes =
|
| 392 |
let c i = (i,i) in
|
| 393 |
let i ch1 ch2 = (Char.code ch1, Char.code ch2) in
|
| 394 |
[ unicode_base_char,
|
| 395 |
[ 0x00C0,0x00D6; 0x00D8,0x00F6;
|
| 396 |
0x00F8,0x00FF; 0x0100,0x0131; 0x0134,0x013E; 0x0141,0x0148;
|
| 397 |
0x014A,0x017E; 0x0180,0x01C3; 0x01CD,0x01F0; 0x01F4,0x01F5;
|
| 398 |
0x01FA,0x0217; 0x0250,0x02A8; 0x02BB,0x02C1; 0x0386,0x0386;
|
| 399 |
0x0388,0x038A; 0x038C,0x038C; 0x038E,0x03A1; 0x03A3,0x03CE;
|
| 400 |
0x03D0,0x03D6; 0x03DA,0x03DA; 0x03DC,0x03DC; 0x03DE,0x03DE;
|
| 401 |
0x03E0,0x03E0; 0x03E2,0x03F3;
|
| 402 |
0x0401,0x040C; 0x040E,0x044F; 0x0451,0x045C; 0x045E,0x0481;
|
| 403 |
0x0490,0x04C4; 0x04C7,0x04C8; 0x04CB,0x04CC; 0x04D0,0x04EB;
|
| 404 |
0x04EE,0x04F5; 0x04F8,0x04F9; 0x0531,0x0556; 0x0559,0x0559;
|
| 405 |
0x0561,0x0586; 0x05D0,0x05EA; 0x05F0,0x05F2; 0x0621,0x063A;
|
| 406 |
0x0641,0x064A; 0x0671,0x06B7; 0x06BA,0x06BE; 0x06C0,0x06CE;
|
| 407 |
0x06D0,0x06D3; 0x06D5,0x06D5; 0x06E5,0x06E6; 0x0905,0x0939;
|
| 408 |
0x093D,0x093D;
|
| 409 |
0x0958,0x0961; 0x0985,0x098C; 0x098F,0x0990; 0x0993,0x09A8;
|
| 410 |
0x09AA,0x09B0; 0x09B2,0x09B2; 0x09B6,0x09B9; 0x09DC,0x09DD;
|
| 411 |
0x09DF,0x09E1; 0x09F0,0x09F1; 0x0A05,0x0A0A; 0x0A0F,0x0A10;
|
| 412 |
0x0A13,0x0A28; 0x0A2A,0x0A30; 0x0A32,0x0A33; 0x0A35,0x0A36;
|
| 413 |
0x0A38,0x0A39; 0x0A59,0x0A5C; 0x0A5E,0x0A5E; 0x0A72,0x0A74;
|
| 414 |
0x0A85,0x0A8B; 0x0A8D,0x0A8D; 0x0A8F,0x0A91; 0x0A93,0x0AA8;
|
| 415 |
0x0AAA,0x0AB0; 0x0AB2,0x0AB3; 0x0AB5,0x0AB9; 0x0ABD,0x0ABD;
|
| 416 |
0x0AE0,0x0AE0;
|
| 417 |
0x0B05,0x0B0C; 0x0B0F,0x0B10; 0x0B13,0x0B28; 0x0B2A,0x0B30;
|
| 418 |
0x0B32,0x0B33; 0x0B36,0x0B39; 0x0B3D,0x0B3D; 0x0B5C,0x0B5D;
|
| 419 |
0x0B5F,0x0B61; 0x0B85,0x0B8A; 0x0B8E,0x0B90; 0x0B92,0x0B95;
|
| 420 |
0x0B99,0x0B9A; 0x0B9C,0x0B9C; 0x0B9E,0x0B9F; 0x0BA3,0x0BA4;
|
| 421 |
0x0BA8,0x0BAA; 0x0BAE,0x0BB5; 0x0BB7,0x0BB9; 0x0C05,0x0C0C;
|
| 422 |
0x0C0E,0x0C10; 0x0C12,0x0C28; 0x0C2A,0x0C33; 0x0C35,0x0C39;
|
| 423 |
0x0C60,0x0C61; 0x0C85,0x0C8C; 0x0C8E,0x0C90; 0x0C92,0x0CA8;
|
| 424 |
0x0CAA,0x0CB3; 0x0CB5,0x0CB9; 0x0CDE,0x0CDE; 0x0CE0,0x0CE1;
|
| 425 |
0x0D05,0x0D0C; 0x0D0E,0x0D10; 0x0D12,0x0D28; 0x0D2A,0x0D39;
|
| 426 |
0x0D60,0x0D61; 0x0E01,0x0E2E; 0x0E30,0x0E30; 0x0E32,0x0E33;
|
| 427 |
0x0E40,0x0E45; 0x0E81,0x0E82; 0x0E84,0x0E84; 0x0E87,0x0E88;
|
| 428 |
0x0E8A,0x0E8A;
|
| 429 |
0x0E8D,0x0E8D; 0x0E94,0x0E97; 0x0E99,0x0E9F; 0x0EA1,0x0EA3;
|
| 430 |
0x0EA5,0x0EA5;
|
| 431 |
0x0EA7,0x0EA7; 0x0EAA,0x0EAB; 0x0EAD,0x0EAE; 0x0EB0,0x0EB0;
|
| 432 |
0x0EB2,0x0EB3;
|
| 433 |
0x0EBD,0x0EBD; 0x0EC0,0x0EC4; 0x0F40,0x0F47; 0x0F49,0x0F69;
|
| 434 |
0x10A0,0x10C5; 0x10D0,0x10F6; 0x1100,0x1100; 0x1102,0x1103;
|
| 435 |
0x1105,0x1107; 0x1109,0x1109; 0x110B,0x110C; 0x110E,0x1112;
|
| 436 |
0x113C,0x113C;
|
| 437 |
0x113E,0x113E; 0x1140,0x1140; 0x114C,0x114C; 0x114E,0x114E;
|
| 438 |
0x1150,0x1150; 0x1154,0x1155; 0x1159,0x1159;
|
| 439 |
0x115F,0x1161; 0x1163,0x1163; 0x1165,0x1165; 0x1167,0x1167;
|
| 440 |
0x1169,0x1169; 0x116D,0x116E;
|
| 441 |
0x1172,0x1173; 0x1175,0x1175; 0x119E,0x119E; 0x11A8,0x11A8;
|
| 442 |
0x11AB,0x11AB; 0x11AE,0x11AF;
|
| 443 |
0x11B7,0x11B8; 0x11BA,0x11BA; 0x11BC,0x11C2; 0x11EB,0x11EB;
|
| 444 |
0x11F0,0x11F0; 0x11F9,0x11F9;
|
| 445 |
0x1E00,0x1E9B; 0x1EA0,0x1EF9; 0x1F00,0x1F15; 0x1F18,0x1F1D;
|
| 446 |
0x1F20,0x1F45; 0x1F48,0x1F4D; 0x1F50,0x1F57; 0x1F59,0x1F59;
|
| 447 |
0x1F5B,0x1F5B;
|
| 448 |
0x1F5D,0x1F5D; 0x1F5F,0x1F7D; 0x1F80,0x1FB4; 0x1FB6,0x1FBC;
|
| 449 |
0x1FBE,0x1FBE;
|
| 450 |
0x1FC2,0x1FC4; 0x1FC6,0x1FCC; 0x1FD0,0x1FD3; 0x1FD6,0x1FDB;
|
| 451 |
0x1FE0,0x1FEC; 0x1FF2,0x1FF4; 0x1FF6,0x1FFC; 0x2126,0x2126;
|
| 452 |
0x212A,0x212B; 0x212E,0x212E; 0x2180,0x2182; 0x3041,0x3094;
|
| 453 |
0x30A1,0x30FA; 0x3105,0x312C; (* 0xAC00,0xD7A3 *) ];
|
| 454 |
|
| 455 |
unicode_ideographic,
|
| 456 |
[ 0x3007,0x3007; 0x3021,0x3029 (* 0x4E00-0x9FA5 *) ];
|
| 457 |
|
| 458 |
unicode_combining_char,
|
| 459 |
[ 0x0300,0x0345; 0x0360,0x0361; 0x0483,0x0486; 0x0591,0x05A1;
|
| 460 |
0x05A3,0x05B9; 0x05BB,0x05BD; 0x05BF,0x05BF; 0x05C1,0x05C2;
|
| 461 |
0x05C4,0x05C4; 0x064B,0x0652; 0x0670,0x0670; 0x06D6,0x06DC;
|
| 462 |
0x06DD,0x06DF; 0x06E0,0x06E4; 0x06E7,0x06E8; 0x06EA,0x06ED;
|
| 463 |
0x0901,0x0903; 0x093C,0x093C; 0x093E,0x094C; 0x094D,0x094D;
|
| 464 |
0x0951,0x0954; 0x0962,0x0963; 0x0981,0x0983; 0x09BC,0x09BC;
|
| 465 |
0x09BE,0x09BE; 0x09BF,0x09BF; 0x09C0,0x09C4; 0x09C7,0x09C8;
|
| 466 |
0x09CB,0x09CD; 0x09D7,0x09D7; 0x09E2,0x09E3; 0x0A02,0x0A02;
|
| 467 |
0x0A3C,0x0A3C; 0x0A3E,0x0A3E; 0x0A3F,0x0A3F; 0x0A40,0x0A42;
|
| 468 |
0x0A47,0x0A48; 0x0A4B,0x0A4D; 0x0A70,0x0A71; 0x0A81,0x0A83;
|
| 469 |
0x0ABC,0x0ABC; 0x0ABE,0x0AC5; 0x0AC7,0x0AC9; 0x0ACB,0x0ACD;
|
| 470 |
0x0B01,0x0B03; 0x0B3C,0x0B3C; 0x0B3E,0x0B43; 0x0B47,0x0B48;
|
| 471 |
0x0B4B,0x0B4D; 0x0B56,0x0B57; 0x0B82,0x0B83; 0x0BBE,0x0BC2;
|
| 472 |
0x0BC6,0x0BC8; 0x0BCA,0x0BCD; 0x0BD7,0x0BD7; 0x0C01,0x0C03;
|
| 473 |
0x0C3E,0x0C44; 0x0C46,0x0C48; 0x0C4A,0x0C4D; 0x0C55,0x0C56;
|
| 474 |
0x0C82,0x0C83; 0x0CBE,0x0CC4; 0x0CC6,0x0CC8; 0x0CCA,0x0CCD;
|
| 475 |
0x0CD5,0x0CD6; 0x0D02,0x0D03; 0x0D3E,0x0D43; 0x0D46,0x0D48;
|
| 476 |
0x0D4A,0x0D4D; 0x0D57,0x0D57; 0x0E31,0x0E31; 0x0E34,0x0E3A;
|
| 477 |
0x0E47,0x0E4E; 0x0EB1,0x0EB1; 0x0EB4,0x0EB9; 0x0EBB,0x0EBC;
|
| 478 |
0x0EC8,0x0ECD; 0x0F18,0x0F19; 0x0F35,0x0F35; 0x0F37,0x0F37;
|
| 479 |
0x0F39,0x0F39; 0x0F3E,0x0F3E; 0x0F3F,0x0F3F; 0x0F71,0x0F84;
|
| 480 |
0x0F86,0x0F8B; 0x0F90,0x0F95; 0x0F97,0x0F97; 0x0F99,0x0FAD;
|
| 481 |
0x0FB1,0x0FB7; 0x0FB9,0x0FB9; 0x20D0,0x20DC; 0x20E1,0x20E1;
|
| 482 |
0x302A,0x302F; 0x3099,0x3099; 0x309A,0x309A ];
|
| 483 |
|
| 484 |
unicode_digit,
|
| 485 |
[ 0x0660,0x0669; 0x06F0,0x06F9; 0x0966,0x096F; 0x09E6,0x09EF;
|
| 486 |
0x0A66,0x0A6F; 0x0AE6,0x0AEF; 0x0B66,0x0B6F; 0x0BE7,0x0BEF;
|
| 487 |
0x0C66,0x0C6F; 0x0CE6,0x0CEF; 0x0D66,0x0D6F; 0x0E50,0x0E59;
|
| 488 |
0x0ED0,0x0ED9; 0x0F20,0x0F29 ];
|
| 489 |
|
| 490 |
|
| 491 |
unicode_extender,
|
| 492 |
[ 0x00B7,0x00B7; 0x02D0,0x02D1; 0x0387,0x0387; 0x0640,0x0640;
|
| 493 |
0x0E46,0x0E46; 0x0EC6,0x0EC6; 0x3005,0x3005; 0x3031,0x3035;
|
| 494 |
0x309D,0x309E; 0x30FC,0x30FE ];
|
| 495 |
|
| 496 |
ascii_digit,
|
| 497 |
[ i '0' '9'];
|
| 498 |
|
| 499 |
lowercase,
|
| 500 |
[i 'a' 'z'];
|
| 501 |
|
| 502 |
uppercase,
|
| 503 |
[i 'A' 'Z'];
|
| 504 |
|
| 505 |
blank,
|
| 506 |
[c 8; c 9; c 10; c 13; c 32]
|
| 507 |
]
|
| 508 |
|
| 509 |
(* Class lookup table indexed by code point, covering 0 to 0x312c.  Every
   entry starts as encoding_error; the ranges from classes are written
   first, then the single characters from one_char_classes.  Class numbers
   are stored as characters, hence the check that they fit in a byte. *)
let table =
  assert(nb_classes <= 256);
  let v = String.make 0x312d (Char.chr encoding_error) in
  List.iter
    (fun (cls, ranges) ->
       let mark = Char.chr cls in
       (* Both ends of a range are included. *)
       List.iter (fun (lo, hi) -> String.fill v lo (hi - lo + 1) mark) ranges)
    classes;
  List.iter (fun (code, cls) -> v.[code] <- Char.chr cls) one_char_classes;
  v
|
| 518 |
|
| 519 |
(* Engine for UTF-8 input: built from table plus a function that maps a code
   point to a class (presumably consulted for code points the table does not
   cover -- confirm against Lex_engines.engine_tiny_utf8).  That function
   returns unicode_ideographic for 0x4E00-0x9FA5, unicode_base_char for
   0xAC00-0xD7A3, xml_char for the remaining accepted ranges and
   encoding_error otherwise. *)
let utf8_engine =
  let between lo hi (code : int) = lo <= code && code <= hi in
  let class_of_code code =
    if between 0x4E00 0x9FA5 code then unicode_ideographic
    else if between 0xAC00 0xD7A3 code then unicode_base_char
    else if code <= 0xD7FF
         || between 0xE000 0xFFFD code
         || between 0x10000 0x10FFFF code
    then xml_char
    else encoding_error
  in
  Lex_engines.engine_tiny_utf8 table class_of_code
|
| 530 |
|
| 531 |
(* Engine built by Lex_engines.engine_tiny_8bit from the same class table;
   judging by its name it is meant for Latin-1 (8-bit) input. *)
let latin1_engine = Lex_engines.engine_tiny_8bit table
|