/[svn]/parser/wlexer.mll
ViewVC logotype

Contents of /parser/wlexer.mll

Parent Directory Parent Directory | Revision Log Revision Log


Revision 469 - (hide annotations)
Tue Jul 10 17:37:04 2007 UTC (5 years, 11 months ago) by abate
File size: 13040 byte(s)
[r2003-05-31 20:59:39 by cvscast] Protect against Stream.Failure in Parser.sync -- Alain

Original author: cvscast
Date: 2003-05-31 20:59:39+00:00
1 abate 81 (* File to be processed by wlex, not ocamllex ! *)
2    
3     classes
4     encoding_error
5     xml_char
6 abate 431 blank
7 abate 81 lowercase uppercase ascii_digit
8 abate 431 "#_<>=.,:;+-*/@&{}[]()|?`\"\\\'!"
9 abate 81
10 abate 332 unicode_base_char
11     unicode_ideographic
12     unicode_combining_char
13     unicode_digit
14     unicode_extender
15    
16    
17 abate 81 {
18     let keywords = Hashtbl.create 17
19    
20 abate 447 let in_comment = ref false
21    
22 abate 249 let error = Location.raise_loc
23 abate 81 exception Illegal_character of char
24     exception Unterminated_comment
25     exception Unterminated_string
26     exception Unterminated_string_in_comment
27    
28 abate 237
29 abate 325 (* Buffer for string literals (always encoded in UTF8).
30     Each character is encoded in two consecutives code point;
31     the first one gives the number of bytes in the input document;
32     the second one gives the Unicode representation *)
33 abate 81
34     let string_buff = Buffer.create 1024
35 abate 325
36     let store_len ?(add=0) lexbuf =
37     let l = add + (Lexing.lexeme_end lexbuf) - (Lexing.lexeme_start lexbuf) in
38     Encodings.Utf8.store string_buff l
39    
40 abate 310 let store_ascii = Buffer.add_char string_buff
41     let store_char = Buffer.add_string string_buff
42     let store_code = Encodings.Utf8.store string_buff
43 abate 81 let get_stored_string () =
44     let s = Buffer.contents string_buff in
45     Buffer.clear string_buff;
46     s
47 abate 237 let store_special = function
48 abate 310 | 'n' -> store_ascii '\n'
49     | 'r' -> store_ascii '\r'
50     | 't' -> store_ascii '\t'
51 abate 237 | c -> raise (Illegal_character '\\')
52 abate 81
53     let string_start_pos = ref 0;;
54     let comment_start_pos : int list ref = ref [];;
55    
56 abate 310 let numeric_char s =
57     int_of_string (String.sub s 1 (String.length s - 2))
58 abate 154
59 abate 332
60     let hexa_digit = function
61     | '0'..'9' as c -> (Char.code c) - (Char.code '0')
62     | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
63     | _ -> failwith "Invalid hexadecimal digit" (* TODO: error loc *)
64    
65    
66 abate 310 let hexa_char s =
67     let rec aux i accu =
68     if i = String.length s - 1 then accu
69 abate 332 else aux (succ i) (accu * 16 + hexa_digit s.[i])
70 abate 310 in
71     aux 0 0
72    
73 abate 154 let rec tag_of_tag s i =
74     match s.[i] with
75     | '\008' | '\009' | '\010' | '\013' | '\032' -> tag_of_tag s (i+1)
76     | _ -> String.sub s i (String.length s - i)
77    
78 abate 81 }
79    
80 abate 332 let letter = lowercase | uppercase | unicode_base_char | unicode_ideographic
81     let digit = ascii_digit | unicode_digit
82     let character = [ ^ encoding_error ]
83 abate 81
84 abate 332 let ncname_char =
85     letter | digit | [ ".-_" ] | unicode_combining_char | unicode_extender
86     let ncname = (letter | '_' ) ncname_char*
87     let qname = (ncname ':')? ncname
88    
89 abate 81 rule token = parse
90     blank+ { token engine lexbuf }
91 abate 332 | qname
92     {
93     let s = Lexing.lexeme lexbuf in
94     if (s.[0] >= 'A') && (s.[0] <= 'Z')
95     then "UIDENT",s
96     else if Hashtbl.mem keywords s then "",s else "LIDENT",s
97     }
98     | '-'? ascii_digit+
99     { "INT",Lexing.lexeme lexbuf }
100 abate 81 | [ "<>=.,:;+-*/@&{}[]()|?`!" ]
101 abate 229 | "->" | "::" | ";;" | "--" | ":=" | "\\" | "++"
102 abate 237 | "{|" | "|}" | "<=" | ">=" | "<<" | ">>"
103 abate 81 | ["?+*"] "?"
104     { "",Lexing.lexeme lexbuf }
105 abate 446 | "#" lowercase+ { "DIRECTIVE",Lexing.lexeme lexbuf }
106 abate 81 | '"' | "'"
107     { let string_start = Lexing.lexeme_start lexbuf in
108     string_start_pos := string_start;
109     let double_quote = Lexing.lexeme_char lexbuf 0 = '"' in
110 abate 310 string (Lexing.lexeme lexbuf) engine lexbuf;
111 abate 81 lexbuf.Lexing.lex_start_pos <-
112     string_start - lexbuf.Lexing.lex_abs_pos;
113     (if double_quote then "STRING2" else "STRING1"),
114     (get_stored_string()) }
115    
116     | "(*"
117     { comment_start_pos := [Lexing.lexeme_start lexbuf];
118 abate 447 in_comment := true;
119 abate 81 comment engine lexbuf;
120 abate 447 in_comment := false;
121 abate 81 token engine lexbuf }
122    
123     | eof
124     { "EOI","" }
125     | _
126 abate 469 { Printf.eprintf "XXX\n"; error
127 abate 81 (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
128     (Illegal_character ((Lexing.lexeme lexbuf).[0])) }
129    
130     and comment = parse
131     "(*"
132     { comment_start_pos := Lexing.lexeme_start lexbuf :: !comment_start_pos;
133     comment engine lexbuf;
134     }
135     | "*)"
136     { comment_start_pos := List.tl !comment_start_pos;
137     if !comment_start_pos <> [] then comment engine lexbuf;
138     }
139     | '"' | "'"
140     { string_start_pos := Lexing.lexeme_start lexbuf;
141 abate 325 Buffer.clear string_buff;
142 abate 310 let ender = Lexing.lexeme lexbuf in
143     (try string ender engine lexbuf
144 abate 81 with Location.Location (_,Unterminated_string) ->
145     let st = List.hd !comment_start_pos in
146     error st (st+2) Unterminated_string_in_comment);
147     Buffer.clear string_buff;
148     comment engine lexbuf }
149     | eof
150     { let st = List.hd !comment_start_pos in
151     error st (st+2) Unterminated_comment
152     }
153     | _
154     { comment engine lexbuf }
155    
156 abate 310 and string ender = parse
157     | '"' | "'"
158 abate 325 { let c = Lexing.lexeme lexbuf in
159 abate 310 if c = ender then ()
160 abate 325 else (store_len lexbuf;
161     store_char (Lexing.lexeme lexbuf);
162     string ender engine lexbuf) }
163 abate 322 | '\\' ['\\' '"' '\'']
164 abate 325 { store_len lexbuf;
165     store_ascii (Lexing.lexeme_char lexbuf 1);
166 abate 310 string ender engine lexbuf }
167 abate 325 | '\\' lowercase
168     { let c = Lexing.lexeme_char lexbuf 1 in
169     if c = 'x'
170     then parse_hexa_char engine lexbuf
171     else (store_len lexbuf; store_special c);
172     string ender engine lexbuf }
173     | '\\' ascii_digit+ ';'
174     { store_len lexbuf;
175     store_code (numeric_char (Lexing.lexeme lexbuf));
176 abate 310 string ender engine lexbuf }
177 abate 325 | '\\'
178     { error
179     (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
180     (Illegal_character '\\') }
181 abate 81 | eof
182     { error !string_start_pos (!string_start_pos+1) Unterminated_string }
183     | _
184 abate 325 { store_len lexbuf;
185     store_code (Char.code (Lexing.lexeme_char lexbuf 0));
186     (* Adapt when source is UTF8 *)
187 abate 310 string ender engine lexbuf }
188 abate 81
189 abate 310 and parse_hexa_char = parse
190 abate 332 | (ascii_digit|lowercase)+ ';'
191 abate 325 { store_len ~add:2 lexbuf;
192     store_code (hexa_char (Lexing.lexeme lexbuf)) }
193 abate 81 | _
194 abate 325 { error
195     (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
196     (Illegal_character '\\') }
197 abate 81
198 abate 325
199 abate 431
200 abate 81 {
201    
202 abate 302 let delta_loc = ref 0
203     let set_delta_loc dl = delta_loc := dl
204    
205 abate 332 let lexer_func_of_wlex lexfun lexengine cs =
206     let dl = !delta_loc in
207     delta_loc := 0;
208     let lb =
209     Lexing.from_function
210     (fun s n ->
211     try s.[0] <- Stream.next cs; 1 with Stream.Failure -> 0)
212     in
213     let next () =
214     let tok = lexfun lexengine lb in
215     let loc = (Lexing.lexeme_start lb + dl,
216     Lexing.lexeme_end lb + dl) in
217     (tok, loc)
218     in
219     Token.make_stream_and_location next
220    
221     let register_kw (s1,s2) =
222     if s1 = "" then
223     match s2.[0] with
224     | 'a' .. 'z' when not (Hashtbl.mem keywords s2) ->
225     Hashtbl.add keywords s2 ()
226     | _ -> ()
227 abate 81
228    
229 abate 332 let lexer lexfun lexengine =
230     {
231     Token.tok_func = lexer_func_of_wlex lexfun lexengine;
232     Token.tok_using = register_kw;
233     Token.tok_removing = (fun _ -> ());
234     Token.tok_match = Token.default_match;
235     Token.tok_text = Token.lexer_text
236     }
237 abate 81
238     let classes =
239     let c i = (i,i) in
240     let i ch1 ch2 = (Char.code ch1, Char.code ch2) in
241 abate 332 [ unicode_base_char,
242     [ 0x00C0,0x00D6; 0x00D8,0x00F6;
243     0x00F8,0x00FF; 0x0100,0x0131; 0x0134,0x013E; 0x0141,0x0148;
244     0x014A,0x017E; 0x0180,0x01C3; 0x01CD,0x01F0; 0x01F4,0x01F5;
245     0x01FA,0x0217; 0x0250,0x02A8; 0x02BB,0x02C1; 0x0386,0x0386;
246     0x0388,0x038A; 0x038C,0x038C; 0x038E,0x03A1; 0x03A3,0x03CE;
247     0x03D0,0x03D6; 0x03DA,0x03DA; 0x03DC,0x03DC; 0x03DE,0x03DE;
248     0x03E0,0x03E0; 0x03E2,0x03F3;
249     0x0401,0x040C; 0x040E,0x044F; 0x0451,0x045C; 0x045E,0x0481;
250     0x0490,0x04C4; 0x04C7,0x04C8; 0x04CB,0x04CC; 0x04D0,0x04EB;
251     0x04EE,0x04F5; 0x04F8,0x04F9; 0x0531,0x0556; 0x0559,0x0559;
252     0x0561,0x0586; 0x05D0,0x05EA; 0x05F0,0x05F2; 0x0621,0x063A;
253     0x0641,0x064A; 0x0671,0x06B7; 0x06BA,0x06BE; 0x06C0,0x06CE;
254     0x06D0,0x06D3; 0x06D5,0x06D5; 0x06E5,0x06E6; 0x0905,0x0939;
255     0x093D,0x093D;
256     0x0958,0x0961; 0x0985,0x098C; 0x098F,0x0990; 0x0993,0x09A8;
257     0x09AA,0x09B0; 0x09B2,0x09B2; 0x09B6,0x09B9; 0x09DC,0x09DD;
258     0x09DF,0x09E1; 0x09F0,0x09F1; 0x0A05,0x0A0A; 0x0A0F,0x0A10;
259     0x0A13,0x0A28; 0x0A2A,0x0A30; 0x0A32,0x0A33; 0x0A35,0x0A36;
260     0x0A38,0x0A39; 0x0A59,0x0A5C; 0x0A5E,0x0A5E; 0x0A72,0x0A74;
261     0x0A85,0x0A8B; 0x0A8D,0x0A8D; 0x0A8F,0x0A91; 0x0A93,0x0AA8;
262     0x0AAA,0x0AB0; 0x0AB2,0x0AB3; 0x0AB5,0x0AB9; 0x0ABD,0x0ABD;
263     0x0AE0,0x0AE0;
264     0x0B05,0x0B0C; 0x0B0F,0x0B10; 0x0B13,0x0B28; 0x0B2A,0x0B30;
265     0x0B32,0x0B33; 0x0B36,0x0B39; 0x0B3D,0x0B3D; 0x0B5C,0x0B5D;
266     0x0B5F,0x0B61; 0x0B85,0x0B8A; 0x0B8E,0x0B90; 0x0B92,0x0B95;
267     0x0B99,0x0B9A; 0x0B9C,0x0B9C; 0x0B9E,0x0B9F; 0x0BA3,0x0BA4;
268     0x0BA8,0x0BAA; 0x0BAE,0x0BB5; 0x0BB7,0x0BB9; 0x0C05,0x0C0C;
269     0x0C0E,0x0C10; 0x0C12,0x0C28; 0x0C2A,0x0C33; 0x0C35,0x0C39;
270     0x0C60,0x0C61; 0x0C85,0x0C8C; 0x0C8E,0x0C90; 0x0C92,0x0CA8;
271     0x0CAA,0x0CB3; 0x0CB5,0x0CB9; 0x0CDE,0x0CDE; 0x0CE0,0x0CE1;
272     0x0D05,0x0D0C; 0x0D0E,0x0D10; 0x0D12,0x0D28; 0x0D2A,0x0D39;
273     0x0D60,0x0D61; 0x0E01,0x0E2E; 0x0E30,0x0E30; 0x0E32,0x0E33;
274     0x0E40,0x0E45; 0x0E81,0x0E82; 0x0E84,0x0E84; 0x0E87,0x0E88;
275     0x0E8A,0x0E8A;
276     0x0E8D,0x0E8D; 0x0E94,0x0E97; 0x0E99,0x0E9F; 0x0EA1,0x0EA3;
277     0x0EA5,0x0EA5;
278     0x0EA7,0x0EA7; 0x0EAA,0x0EAB; 0x0EAD,0x0EAE; 0x0EB0,0x0EB0;
279     0x0EB2,0x0EB3;
280     0x0EBD,0x0EBD; 0x0EC0,0x0EC4; 0x0F40,0x0F47; 0x0F49,0x0F69;
281     0x10A0,0x10C5; 0x10D0,0x10F6; 0x1100,0x1100; 0x1102,0x1103;
282     0x1105,0x1107; 0x1109,0x1109; 0x110B,0x110C; 0x110E,0x1112;
283     0x113C,0x113C;
284     0x113E,0x113E; 0x1140,0x1140; 0x114C,0x114C; 0x114E,0x114E;
285     0x1150,0x1150; 0x1154,0x1155; 0x1159,0x1159;
286     0x115F,0x1161; 0x1163,0x1163; 0x1165,0x1165; 0x1167,0x1167;
287     0x1169,0x1169; 0x116D,0x116E;
288     0x1172,0x1173; 0x1175,0x1175; 0x119E,0x119E; 0x11A8,0x11A8;
289     0x11AB,0x11AB; 0x11AE,0x11AF;
290     0x11B7,0x11B8; 0x11BA,0x11BA; 0x11BC,0x11C2; 0x11EB,0x11EB;
291     0x11F0,0x11F0; 0x11F9,0x11F9;
292     0x1E00,0x1E9B; 0x1EA0,0x1EF9; 0x1F00,0x1F15; 0x1F18,0x1F1D;
293     0x1F20,0x1F45; 0x1F48,0x1F4D; 0x1F50,0x1F57; 0x1F59,0x1F59;
294     0x1F5B,0x1F5B;
295     0x1F5D,0x1F5D; 0x1F5F,0x1F7D; 0x1F80,0x1FB4; 0x1FB6,0x1FBC;
296     0x1FBE,0x1FBE;
297     0x1FC2,0x1FC4; 0x1FC6,0x1FCC; 0x1FD0,0x1FD3; 0x1FD6,0x1FDB;
298     0x1FE0,0x1FEC; 0x1FF2,0x1FF4; 0x1FF6,0x1FFC; 0x2126,0x2126;
299     0x212A,0x212B; 0x212E,0x212E; 0x2180,0x2182; 0x3041,0x3094;
300     0x30A1,0x30FA; 0x3105,0x312C; (* 0xAC00,0xD7A3 *) ];
301    
302     unicode_ideographic,
303     [ 0x3007,0x3007; 0x3021,0x3029 (* 0x4E00-0x9FA5 *) ];
304    
305     unicode_combining_char,
306     [ 0x0300,0x0345; 0x0360,0x0361; 0x0483,0x0486; 0x0591,0x05A1;
307     0x05A3,0x05B9; 0x05BB,0x05BD; 0x05BF,0x05BF; 0x05C1,0x05C2;
308     0x05C4,0x05C4; 0x064B,0x0652; 0x0670,0x0670; 0x06D6,0x06DC;
309     0x06DD,0x06DF; 0x06E0,0x06E4; 0x06E7,0x06E8; 0x06EA,0x06ED;
310     0x0901,0x0903; 0x093C,0x093C; 0x093E,0x094C; 0x094D,0x094D;
311     0x0951,0x0954; 0x0962,0x0963; 0x0981,0x0983; 0x09BC,0x09BC;
312     0x09BE,0x09BE; 0x09BF,0x09BF; 0x09C0,0x09C4; 0x09C7,0x09C8;
313     0x09CB,0x09CD; 0x09D7,0x09D7; 0x09E2,0x09E3; 0x0A02,0x0A02;
314     0x0A3C,0x0A3C; 0x0A3E,0x0A3E; 0x0A3F,0x0A3F; 0x0A40,0x0A42;
315     0x0A47,0x0A48; 0x0A4B,0x0A4D; 0x0A70,0x0A71; 0x0A81,0x0A83;
316     0x0ABC,0x0ABC; 0x0ABE,0x0AC5; 0x0AC7,0x0AC9; 0x0ACB,0x0ACD;
317     0x0B01,0x0B03; 0x0B3C,0x0B3C; 0x0B3E,0x0B43; 0x0B47,0x0B48;
318     0x0B4B,0x0B4D; 0x0B56,0x0B57; 0x0B82,0x0B83; 0x0BBE,0x0BC2;
319     0x0BC6,0x0BC8; 0x0BCA,0x0BCD; 0x0BD7,0x0BD7; 0x0C01,0x0C03;
320     0x0C3E,0x0C44; 0x0C46,0x0C48; 0x0C4A,0x0C4D; 0x0C55,0x0C56;
321     0x0C82,0x0C83; 0x0CBE,0x0CC4; 0x0CC6,0x0CC8; 0x0CCA,0x0CCD;
322     0x0CD5,0x0CD6; 0x0D02,0x0D03; 0x0D3E,0x0D43; 0x0D46,0x0D48;
323     0x0D4A,0x0D4D; 0x0D57,0x0D57; 0x0E31,0x0E31; 0x0E34,0x0E3A;
324     0x0E47,0x0E4E; 0x0EB1,0x0EB1; 0x0EB4,0x0EB9; 0x0EBB,0x0EBC;
325     0x0EC8,0x0ECD; 0x0F18,0x0F19; 0x0F35,0x0F35; 0x0F37,0x0F37;
326     0x0F39,0x0F39; 0x0F3E,0x0F3E; 0x0F3F,0x0F3F; 0x0F71,0x0F84;
327     0x0F86,0x0F8B; 0x0F90,0x0F95; 0x0F97,0x0F97; 0x0F99,0x0FAD;
328     0x0FB1,0x0FB7; 0x0FB9,0x0FB9; 0x20D0,0x20DC; 0x20E1,0x20E1;
329     0x302A,0x302F; 0x3099,0x3099; 0x309A,0x309A ];
330    
331     unicode_digit,
332     [ 0x0660,0x0669; 0x06F0,0x06F9; 0x0966,0x096F; 0x09E6,0x09EF;
333     0x0A66,0x0A6F; 0x0AE6,0x0AEF; 0x0B66,0x0B6F; 0x0BE7,0x0BEF;
334     0x0C66,0x0C6F; 0x0CE6,0x0CEF; 0x0D66,0x0D6F; 0x0E50,0x0E59;
335     0x0ED0,0x0ED9; 0x0F20,0x0F29 ];
336    
337    
338     unicode_extender,
339     [ 0x00B7,0x00B7; 0x02D0,0x02D1; 0x0387,0x0387; 0x0640,0x0640;
340     0x0E46,0x0E46; 0x0EC6,0x0EC6; 0x3005,0x3005; 0x3031,0x3035;
341     0x309D,0x309E; 0x30FC,0x30FE ];
342    
343     ascii_digit,
344     [ i '0' '9'];
345    
346     lowercase,
347     [i 'a' 'z'];
348    
349     uppercase,
350     [i 'A' 'Z'];
351    
352     blank,
353     [c 8; c 9; c 10; c 13; c 32]
354 abate 81 ]
355    
356     let table =
357     assert(nb_classes <= 256);
358 abate 332 let v = String.make 0x312d (Char.chr encoding_error) in
359 abate 81 let fill_int c (i, j) = String.fill v i (j-i+1) c in
360     let fill_class (c, l) = List.iter (fill_int (Char.chr c)) l in
361     let fill_char (ch, cl) = v.[ch] <- Char.chr cl in
362     List.iter fill_class classes;
363     List.iter fill_char one_char_classes;
364     v
365    
366 abate 332 let utf8_engine =
367     Lex_engines.engine_tiny_utf8 table
368     (fun code ->
369     if code >= 0x4E00 && code <= 0x9FA5 then
370     unicode_ideographic
371     else if code >= 0xAC00 && code <= 0xD7A3 then
372     unicode_base_char
373     else if code <= 0xD7FF || (code >= 0xE000 && code <= 0xFFFD) ||
374     (code >= 0x10000 && code <= 0x10FFFF) then
375     xml_char
376     else encoding_error)
377 abate 81
378     let latin1_engine = Lex_engines.engine_tiny_8bit table
379     }

CVS Admin">CVS Admin
ViewVC Help
Powered by ViewVC 1.1.5