/[svn]/parser/lexer.ml
ViewVC logotype

Contents of /parser/lexer.ml

Parent Directory Parent Directory | Revision Log Revision Log


Revision 38 - (show annotations)
Tue Jul 10 16:59:59 2007 UTC (5 years, 10 months ago) by abate
File size: 18627 byte(s)
[r2002-10-22 16:34:57 by cvscast] Empty log message

Original author: cvscast
Date: 2002-10-22 16:34:57+00:00
1 (* Modified from Camlp4 Plexer module *)
2
3 open Stdpp
4 open Token
5
6 let no_quotations = ref false
7
8 (* The string buffering machinery *)
9
10 let buff = ref (String.create 80)
11 let store len x =
12 if len >= String.length !buff then
13 buff := !buff ^ String.create (String.length !buff);
14 !buff.[len] <- x;
15 succ len
16 let mstore len s =
17 let rec add_rec len i =
18 if i == String.length s then len else add_rec (store len s.[i]) (succ i)
19 in
20 add_rec len 0
21 let get_buff len = String.sub !buff 0 len
22
23 (* The lexer *)
24
25 let rec ident len =
26 parser
27 [< ''A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246' |
28 '\248'..'\255' | '0'..'9' | '_' | '\'' as c;
29 s >] ->
30 ident (store len c) s
31 | [< >] -> len
32 and ident2 len =
33 parser
34 [< ''!' | '?' | '~' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' |
35 '%' | '.' | ':' | '<' | '>' | '|' | '$' as c;
36 s >] ->
37 ident2 (store len c) s
38 | [< >] -> len
39 and ident3 len =
40 parser
41 [< ''0'..'9' | 'A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246' |
42 '\248'..'\255' | '_' | '!' | '%' | '&' | '*' | '+' | '-' | '.' | '/' |
43 ':' | '<' | '=' | '>' | '?' | '@' | '^' | '|' | '~' | '\'' | '$' as c;
44 s >] ->
45 ident3 (store len c) s
46 | [< >] -> len
47 and base_number len =
48 parser
49 [< ''o' | 'O'; s >] -> octal_digits (store len 'o') s
50 | [< ''x' | 'X'; s >] -> hexa_digits (store len 'x') s
51 | [< ''b' | 'B'; s >] -> binary_digits (store len 'b') s
52 | [< a = number len >] -> a
53 and octal_digits len =
54 parser
55 [< ''0'..'7' as d; s >] -> octal_digits (store len d) s
56 | [< >] -> "INT", get_buff len
57 and hexa_digits len =
58 parser
59 [< ''0'..'9' | 'a'..'f' | 'A'..'F' as d; s >] ->
60 hexa_digits (store len d) s
61 | [< >] -> "INT", get_buff len
62 and binary_digits len =
63 parser
64 [< ''0'..'1' as d; s >] -> binary_digits (store len d) s
65 | [< >] -> "INT", get_buff len
66 and number len =
67 parser
68 [< ''0'..'9' as c; s >] -> number (store len c) s
69 | [< ''.'; s >] -> decimal_part (store len '.') s
70 | [< ''e' | 'E'; s >] -> exponent_part (store len 'E') s
71 | [< >] -> "INT", get_buff len
72 and decimal_part len =
73 parser
74 [< ''0'..'9' as c; s >] -> decimal_part (store len c) s
75 | [< ''e' | 'E'; s >] -> exponent_part (store len 'E') s
76 | [< >] -> "FLOAT", get_buff len
77 and exponent_part len =
78 parser
79 [< ''+' | '-' as c; s >] -> end_exponent_part (store len c) s
80 | [< a = end_exponent_part len >] -> a
81 and end_exponent_part len =
82 parser
83 [< ''0'..'9' as c; s >] -> end_exponent_part (store len c) s
84 | [< >] -> "FLOAT", get_buff len
85
86 let rec skip_spaces =
87 parser
88 [< '' ' | '\010' | '\013' | '\t' | '\026' | '\012'; s >] -> skip_spaces s
89 | [< >] -> ()
90
91 let error_on_unknown_keywords = ref false
92 let err loc msg = raise_with_loc loc (Token.Error msg)
93
94 let next_token_fun dfa find_kwd =
95 let keyword_or_error loc s =
96 try ("", find_kwd s), loc with
97 Not_found ->
98 if !error_on_unknown_keywords then err loc ("illegal token: " ^ s)
99 else ("", s), loc
100 in
101 let rec next_token =
102 parser bp
103 [< '' ' | '\010' | '\013' | '\t' | '\026' | '\012'; s >] -> next_token s
104 | [< ''('; s >] -> left_paren bp s
105 | [< ''#'; s >] -> spaces_tabs s; linenum bp s
106 | [< ''A'..'Z' | '\192'..'\214' | '\216'..'\222' as c; s >] ->
107 let id = get_buff (ident (store 0 c) s) in
108 let loc = bp, Stream.count s in
109 (try "", find_kwd id with
110 Not_found -> "UIDENT", id),
111 loc
112 | [< ''a'..'z' | '\223'..'\246' | '\248'..'\255' | '_' as c; s >] ->
113 let id = get_buff (ident (store 0 c) s) in
114 let loc = bp, Stream.count s in
115 (try "", find_kwd id with
116 Not_found -> "LIDENT", id),
117 loc
118 | [< ''1'..'9' as c; s >] ->
119 let tok = number (store 0 c) s in
120 let loc = bp, Stream.count s in tok, loc
121 | [< ''0'; s >] ->
122 let tok = base_number (store 0 '0') s in
123 let loc = bp, Stream.count s in tok, loc
124 | [< ''\''; s >] ->
125 (* begin match Stream.npeek 2 s with
126 [_; '\''] | ['\\'; _] -> *)
127 let tok = "CHAR", get_buff (char bp 0 s) in
128 let loc = bp, Stream.count s in tok, loc
129 (* | _ -> keyword_or_error (bp, Stream.count s) "'"
130 end *)
131 | [< ''\"'; s >] ->
132 let tok = "STRING", get_buff (string bp 0 s) in
133 let loc = bp, Stream.count s in tok, loc
134 | [< ''$'; s >] ->
135 let tok = dollar bp 0 s in let loc = bp, Stream.count s in tok, loc
136 | [< ''!' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' | '%' as c;
137 s >] ->
138 let id = get_buff (ident2 (store 0 c) s) in
139 keyword_or_error (bp, Stream.count s) id
140 | [< ''~' as c;
141 a =
142 parser
143 [< ''a'..'z' as c; len = ident (store 0 c) >] ep ->
144 ("TILDEIDENT", get_buff len), (bp, ep)
145 | [< s >] ->
146 let id = get_buff (ident2 (store 0 c) s) in
147 keyword_or_error (bp, Stream.count s) id >] ->
148 a
149 | [< ''?' as c;
150 a =
151 parser
152 [< ''a'..'z' as c; len = ident (store 0 c) >] ep ->
153 ("QUESTIONIDENT", get_buff len), (bp, ep)
154 | [< s >] ->
155 let id = get_buff (ident2 (store 0 c) s) in
156 keyword_or_error (bp, Stream.count s) id >] ->
157 a
158 | [< ''<'; s >] -> less bp s
159 | [< '':' as c1;
160 len =
161 parser
162 [< '']' | ':' | '=' | '>' as c2 >] -> store (store 0 c1) c2
163 | [< >] -> store 0 c1 >] ep ->
164 let id = get_buff len in keyword_or_error (bp, ep) id
165 | [< ''>' | '|' as c1;
166 len =
167 parser
168 [< '']' | '}' as c2 >] -> store (store 0 c1) c2
169 | [< a = ident2 (store 0 c1) >] -> a >] ep ->
170 let id = get_buff len in keyword_or_error (bp, ep) id
171 | [< ''[' | '{' as c1; s >] ->
172 let len =
173 match Stream.npeek 2 s with
174 ['<'; '<' | ':'] -> store 0 c1
175 | _ ->
176 match s with parser
177 [< ''|' | '<' | ':' as c2 >] -> store (store 0 c1) c2
178 | [< >] -> store 0 c1
179 in
180 let ep = Stream.count s in
181 let id = get_buff len in keyword_or_error (bp, ep) id
182 | [< ''.';
183 id =
184 parser
185 [< ''.' >] -> ".."
186 | [< >] -> "." >] ep ->
187 keyword_or_error (bp, ep) id
188 | [< '';';
189 id =
190 parser
191 [< '';' >] -> ";;"
192 | [< >] -> ";" >] ep ->
193 keyword_or_error (bp, ep) id
194 | [< ''\\'; s >] ep -> ("LIDENT", get_buff (ident3 0 s)), (bp, ep)
195 | [< 'c >] ep -> keyword_or_error (bp, ep) (String.make 1 c)
196 | [< _ = Stream.empty >] -> ("EOI", ""), (bp, succ bp)
197 and less bp strm =
198 if !no_quotations then
199 match strm with parser
200 [< len = ident2 (store 0 '<') >] ep ->
201 let id = get_buff len in keyword_or_error (bp, ep) id
202 else
203 match strm with parser
204 [< ''<'; len = quotation bp 0 >] ep ->
205 ("QUOTATION", ":" ^ get_buff len), (bp, ep)
206 | [< '':';
207 i =
208 (parser
209 [< len = ident 0 >] -> get_buff len);
210 ''<' ?? "character '<' expected"; len = quotation bp 0 >] ep ->
211 ("QUOTATION", i ^ ":" ^ get_buff len), (bp, ep)
212 | [< len = ident2 (store 0 '<') >] ep ->
213 let id = get_buff len in keyword_or_error (bp, ep) id
214 and string bp len =
215 parser
216 [< ''\"' >] -> len
217 | [< ''\\'; 'c; s >] -> string bp (store (store len '\\') c) s
218 | [< 'c; s >] -> string bp (store len c) s
219 | [< >] ep -> err (bp, ep) "string not terminated"
220 and char bp len =
221 parser
222 [< ''\''; s >] -> if len = 0 then char bp (store len '\'') s else len
223 | [< ''\\'; 'c; s >] -> char bp (store (store len '\\') c) s
224 | [< 'c; s >] -> char bp (store len c) s
225 | [< >] ep -> err (bp, ep) "char not terminated"
226 and dollar bp len =
227 parser
228 [< ''$' >] -> "ANTIQUOT", ":" ^ get_buff len
229 | [< ''a'..'z' | 'A'..'Z' as c; s >] -> antiquot bp (store len c) s
230 | [< ''0'..'9' as c; s >] -> maybe_locate bp (store len c) s
231 | [< '':'; s >] ->
232 let k = get_buff len in
233 "ANTIQUOT", k ^ ":" ^ locate_or_antiquot_rest bp 0 s
234 | [< ''\\'; 'c; s >] ->
235 "ANTIQUOT", ":" ^ locate_or_antiquot_rest bp (store len c) s
236 | [< s >] ->
237 if dfa then
238 match s with parser
239 [< 'c >] ->
240 "ANTIQUOT", ":" ^ locate_or_antiquot_rest bp (store len c) s
241 | [< >] ep -> err (bp, ep) "antiquotation not terminated"
242 else "", get_buff (ident2 (store 0 '$') s)
243 and maybe_locate bp len =
244 parser
245 [< ''$' >] -> "ANTIQUOT", ":" ^ get_buff len
246 | [< ''0'..'9' as c; s >] -> maybe_locate bp (store len c) s
247 | [< '':'; s >] ->
248 "LOCATE", get_buff len ^ ":" ^ locate_or_antiquot_rest bp 0 s
249 | [< ''\\'; 'c; s >] ->
250 "ANTIQUOT", ":" ^ locate_or_antiquot_rest bp (store len c) s
251 | [< 'c; s >] ->
252 "ANTIQUOT", ":" ^ locate_or_antiquot_rest bp (store len c) s
253 | [< >] ep -> err (bp, ep) "antiquotation not terminated"
254 and antiquot bp len =
255 parser
256 [< ''$' >] -> "ANTIQUOT", ":" ^ get_buff len
257 | [< ''a'..'z' | 'A'..'Z' | '0'..'9' as c; s >] ->
258 antiquot bp (store len c) s
259 | [< '':'; s >] ->
260 let k = get_buff len in
261 "ANTIQUOT", k ^ ":" ^ locate_or_antiquot_rest bp 0 s
262 | [< ''\\'; 'c; s >] ->
263 "ANTIQUOT", ":" ^ locate_or_antiquot_rest bp (store len c) s
264 | [< 'c; s >] ->
265 "ANTIQUOT", ":" ^ locate_or_antiquot_rest bp (store len c) s
266 | [< >] ep -> err (bp, ep) "antiquotation not terminated"
267 and locate_or_antiquot_rest bp len =
268 parser
269 [< ''$' >] -> get_buff len
270 | [< ''\\'; 'c; s >] -> locate_or_antiquot_rest bp (store len c) s
271 | [< 'c; s >] -> locate_or_antiquot_rest bp (store len c) s
272 | [< >] ep -> err (bp, ep) "antiquotation not terminated"
273 and quotation bp len =
274 parser
275 [< ''>'; s >] -> maybe_end_quotation bp len s
276 | [< ''<'; s >] ->
277 quotation bp (maybe_nested_quotation bp (store len '<') strm__) s
278 | [< ''\\';
279 len =
280 (parser
281 [< ''>' | '<' | '\\' as c >] -> store len c
282 | [< >] -> store len '\\');
283 s >] ->
284 quotation bp len s
285 | [< 'c; s >] -> quotation bp (store len c) s
286 | [< >] ep -> err (bp, ep) "quotation not terminated"
287 and maybe_nested_quotation bp len =
288 parser
289 [< ''<'; s >] -> mstore (quotation bp (store len '<') s) ">>"
290 | [< '':'; len = ident (store len ':');
291 a =
292 parser
293 [< ''<'; s >] -> mstore (quotation bp (store len '<') s) ">>"
294 | [< >] -> len >] ->
295 a
296 | [< >] -> len
297 and maybe_end_quotation bp len =
298 parser
299 [< ''>' >] -> len
300 | [< a = quotation bp (store len '>') >] -> a
301 and left_paren bp =
302 parser
303 [< ''*'; _ = comment bp; a = next_token >] -> a
304 | [< >] ep -> keyword_or_error (bp, ep) "("
305 and comment bp =
306 parser
307 [< ''('; s >] -> left_paren_in_comment bp s
308 | [< ''*'; s >] -> star_in_comment bp s
309 | [< ''\"'; _ = string bp 0; s >] -> comment bp s
310 | [< ''\''; s >] -> quote_in_comment bp s
311 | [< 'c; s >] -> comment bp s
312 | [< >] ep -> err (bp, ep) "comment not terminated"
313 and quote_in_comment bp =
314 parser
315 [< ''\''; s >] -> comment bp s
316 | [< ''\\'; s >] -> quote_antislash_in_comment bp 0 s
317 | [< '_; s >] -> quote_any_in_comment bp s
318 | [< a = comment bp >] -> a
319 and quote_any_in_comment bp =
320 parser
321 [< ''\''; s >] -> comment bp s
322 | [< a = comment bp >] -> a
323 and quote_antislash_in_comment bp len =
324 parser
325 [< ''\''; s >] -> comment bp s
326 | [< ''\\' | '\"' | 'n' | 't' | 'b' | 'r'; s >] ->
327 quote_any_in_comment bp s
328 | [< ''0'..'9'; s >] -> quote_antislash_digit_in_comment bp s
329 | [< a = comment bp >] -> a
330 and quote_antislash_digit_in_comment bp =
331 parser
332 [< ''0'..'9'; s >] -> quote_antislash_digit2_in_comment bp s
333 | [< a = comment bp >] -> a
334 and quote_antislash_digit2_in_comment bp =
335 parser
336 [< ''0'..'9'; s >] -> quote_any_in_comment bp s
337 | [< a = comment bp >] -> a
338 and left_paren_in_comment bp =
339 parser
340 [< ''*'; s >] -> comment bp s; comment bp s
341 | [< a = comment bp >] -> a
342 and star_in_comment bp =
343 parser
344 [< '')' >] -> ()
345 | [< a = comment bp >] -> a
346 and linenum bp =
347 parser
348 [< ''0'..'9'; _ = digits; _ = spaces_tabs; ''\"'; _ = any_to_nl; s >] ->
349 next_token s
350 | [< >] -> keyword_or_error (bp, bp + 1) "#"
351 and spaces_tabs =
352 parser
353 [< '' ' | '\t'; s >] -> spaces_tabs s
354 | [< >] -> ()
355 and digits =
356 parser
357 [< ''0'..'9'; s >] -> digits s
358 | [< >] -> ()
359 and any_to_nl =
360 parser
361 [< ''\013' | '\010' >] -> ()
362 | [< '_; s >] -> any_to_nl s
363 | [< >] -> ()
364 in
365 fun cstrm ->
366 try next_token cstrm with
367 Stream.Error str -> err (Stream.count cstrm, Stream.count cstrm + 1) str
368
369 let dollar_for_antiquotation = ref true
370
371 let func kwd_table =
372 let find = Hashtbl.find kwd_table in
373 let dfa = !dollar_for_antiquotation in
374 Token.lexer_func_of_parser (next_token_fun dfa find)
375
376 let rec check_keyword_stream =
377 parser
378 [< _ = check; _ = Stream.empty >] -> true
379 and check =
380 parser
381 [< ''A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246' |
382 '\248'..'\255';
383 s >] ->
384 check_ident s
385 | [< ''!' | '?' | '~' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' |
386 '%' | '.';
387 s >] ->
388 check_ident2 s
389 | [< ''<'; s >] ->
390 begin match Stream.npeek 1 s with
391 [':' | '<'] -> ()
392 | _ -> check_ident2 s
393 end
394 | [< '':';
395 _ =
396 parser
397 [< '']' | ':' | '=' | '>' >] -> ()
398 | [< >] -> () >] ep ->
399 ()
400 | [< ''>' | '|';
401 _ =
402 parser
403 [< '']' | '}' >] -> ()
404 | [< a = check_ident2 >] -> a >] ->
405 ()
406 | [< ''[' | '{'; s >] ->
407 begin match Stream.npeek 2 s with
408 ['<'; '<' | ':'] -> ()
409 | _ ->
410 match s with parser
411 [< ''|' | '<' | ':' >] -> () | [< >] -> ()
412 end
413 | [< '';';
414 _ =
415 parser
416 [< '';' >] -> ()
417 | [< >] -> () >] ->
418 ()
419 | [< '_ >] -> ()
420 and check_ident =
421 parser
422 [< ''A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246' |
423 '\248'..'\255' | '0'..'9' | '_' | '\'';
424 s >] ->
425 check_ident s
426 | [< >] -> ()
427 and check_ident2 =
428 parser
429 [< ''!' | '?' | '~' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' |
430 '%' | '.' | ':' | '<' | '>' | '|';
431 s >] ->
432 check_ident2 s
433 | [< >] -> ()
434
435 let check_keyword s =
436 try check_keyword_stream (Stream.of_string s) with
437 _ -> false
438
439 let error_no_respect_rules p_con p_prm =
440 raise
441 (Token.Error
442 ("the token " ^
443 (if p_con = "" then "\"" ^ p_prm ^ "\""
444 else if p_prm = "" then p_con
445 else p_con ^ " \"" ^ p_prm ^ "\"") ^
446 " does not respect Plexer rules"))
447
448 let error_ident_and_keyword p_con p_prm =
449 raise
450 (Token.Error
451 ("the token \"" ^ p_prm ^ "\" is used as " ^ p_con ^
452 " and as keyword"))
453
454 let using_token kwd_table ident_table (p_con, p_prm) =
455 match p_con with
456 "" ->
457 if not (Hashtbl.mem kwd_table p_prm) then
458 if check_keyword p_prm then
459 if Hashtbl.mem ident_table p_prm then
460 error_ident_and_keyword (Hashtbl.find ident_table p_prm) p_prm
461 else Hashtbl.add kwd_table p_prm p_prm
462 else error_no_respect_rules p_con p_prm
463 | "LIDENT" ->
464 if p_prm = "" then ()
465 else
466 begin match p_prm.[0] with
467 'A'..'Z' -> error_no_respect_rules p_con p_prm
468 | _ ->
469 if Hashtbl.mem kwd_table p_prm then
470 error_ident_and_keyword p_con p_prm
471 else Hashtbl.add ident_table p_prm p_con
472 end
473 | "UIDENT" ->
474 if p_prm = "" then ()
475 else
476 begin match p_prm.[0] with
477 'a'..'z' -> error_no_respect_rules p_con p_prm
478 | _ ->
479 if Hashtbl.mem kwd_table p_prm then
480 error_ident_and_keyword p_con p_prm
481 else Hashtbl.add ident_table p_prm p_con
482 end
483 | "TILDEIDENT" | "QUESTIONIDENT" | "INT" | "FLOAT" | "CHAR" | "STRING" |
484 "QUOTATION" | "ANTIQUOT" | "LOCATE" | "EOI" ->
485 ()
486 | _ ->
487 raise
488 (Token.Error
489 ("the constructor \"" ^ p_con ^ "\" is not recognized by Plexer"))
490
491 let removing_token kwd_table ident_table (p_con, p_prm) =
492 match p_con with
493 "" -> Hashtbl.remove kwd_table p_prm
494 | "LIDENT" | "UIDENT" ->
495 if p_prm <> "" then Hashtbl.remove ident_table p_prm
496 | _ -> ()
497
498 let text =
499 function
500 "", t -> "'" ^ t ^ "'"
501 | "LIDENT", "" -> "lowercase identifier"
502 | "LIDENT", t -> "'" ^ t ^ "'"
503 | "UIDENT", "" -> "uppercase identifier"
504 | "UIDENT", t -> "'" ^ t ^ "'"
505 | "INT", "" -> "integer"
506 | "INT", s -> "'" ^ s ^ "'"
507 | "FLOAT", "" -> "float"
508 | "STRING", "" -> "string"
509 | "CHAR", "" -> "char"
510 | "QUOTATION", "" -> "quotation"
511 | "ANTIQUOT", k -> "antiquot \"" ^ k ^ "\""
512 | "LOCATE", "" -> "locate"
513 | "EOI", "" -> "end of input"
514 | con, "" -> con
515 | con, prm -> con ^ " \"" ^ prm ^ "\""
516
517 let eq_before_colon p e =
518 let rec loop i =
519 if i == String.length e then
520 failwith "Internal error in Plexer: incorrect ANTIQUOT"
521 else if i == String.length p then e.[i] == ':'
522 else if p.[i] == e.[i] then loop (i + 1)
523 else false
524 in
525 loop 0
526
527 let after_colon e =
528 try
529 let i = String.index e ':' in
530 String.sub e (i + 1) (String.length e - i - 1)
531 with
532 Not_found -> ""
533
534 let tok_match =
535 function
536 "ANTIQUOT", p_prm ->
537 begin function
538 "ANTIQUOT", prm when eq_before_colon p_prm prm -> after_colon prm
539 | _ -> raise Stream.Failure
540 end
541 | tok -> Token.default_match tok
542
543 let gmake () =
544 let kwd_table = Hashtbl.create 301 in
545 let id_table = Hashtbl.create 301 in
546 {tok_func = func kwd_table; tok_using = using_token kwd_table id_table;
547 tok_removing = removing_token kwd_table id_table; tok_match = tok_match;
548 tok_text = text}
549
550 let tparse =
551 function
552 "ANTIQUOT", p_prm ->
553 let p =
554 parser
555 [< '"ANTIQUOT", prm when eq_before_colon p_prm prm >] ->
556 after_colon prm
557 in
558 Some p
559 | _ -> None
560
561 let make () =
562 let kwd_table = Hashtbl.create 301 in
563 let id_table = Hashtbl.create 301 in
564 {func = func kwd_table; using = using_token kwd_table id_table;
565 removing = removing_token kwd_table id_table; tparse = tparse; text = text}

CVS Admin">CVS Admin
ViewVC Help
Powered by ViewVC 1.1.5