/[svn]/parser/wlexer.mll
ViewVC logotype

Contents of /parser/wlexer.mll

Parent Directory Parent Directory | Revision Log Revision Log


Revision 154 - (hide annotations)
Tue Jul 10 17:10:55 2007 UTC (5 years, 11 months ago) by abate
File size: 5913 byte(s)
[r2002-11-25 16:10:56 by cvscast] Empty log message

Original author: cvscast
Date: 2002-11-25 16:11:30+00:00
1 abate 81 (* File to be processed by wlex, not ocamllex ! *)
2     (* Loosely inspired from OCaml lexer.mll *)
3    
4     classes
5     encoding_error
6     xml_char
7     blank
8     lowercase uppercase ascii_digit
9     "_<>=.,:;+-*/@&{}[]()|?`\"\\\'!"
10    
11    
12     {
13     let keywords = Hashtbl.create 17
14    
15     let error i j exn = raise (Location.Location ((i,j),exn))
16     exception Illegal_character of char
17     exception Unterminated_comment
18     exception Unterminated_string
19     exception Unterminated_string_in_comment
20    
21     (* Buffer for string literals *)
22    
23     let string_buff = Buffer.create 1024
24     let store_char = Buffer.add_char string_buff
25     let get_stored_string () =
26     let s = Buffer.contents string_buff in
27     Buffer.clear string_buff;
28     s
29    
30     let string_start_pos = ref 0;;
31     let comment_start_pos : int list ref = ref [];;
32    
33     let char_for_decimal_code s =
34     let s = String.sub s 1 (String.length s - 1) in
35     let c = int_of_string s in
36     assert ( c < 256 ); (* TODO: handle Unicode *)
37     Char.chr c
38 abate 154
39     let rec tag_of_tag s i =
40     match s.[i] with
41     | '\008' | '\009' | '\010' | '\013' | '\032' -> tag_of_tag s (i+1)
42     | _ -> String.sub s i (String.length s - i)
43    
44 abate 81 }
45    
46    
47 abate 143 let identchar = lowercase | uppercase | ascii_digit | '_' | '\''
48 abate 81
49     rule token = parse
50     blank+ { token engine lexbuf }
51 abate 143 | (lowercase | '_') identchar* ( ':' identchar+)* {
52 abate 81 let s = Lexing.lexeme lexbuf in
53     if Hashtbl.mem keywords s then "",s else "LIDENT",s
54     }
55 abate 143 | uppercase identchar* ( ':' identchar+)* { "UIDENT",Lexing.lexeme lexbuf }
56 abate 81 | ascii_digit+ { "INT",Lexing.lexeme lexbuf }
57 abate 154 | "<" blank* (lowercase | uppercase) identchar* {
58 abate 81 let s = Lexing.lexeme lexbuf in
59 abate 154 "TAG", tag_of_tag s 1
60 abate 81 }
61     | [ "<>=.,:;+-*/@&{}[]()|?`!" ]
62 abate 99 | "->" | "::" | ";;" | "--" | ":=" | "\\"
63 abate 81 | ["?+*"] "?"
64     { "",Lexing.lexeme lexbuf }
65    
66     | '"' | "'"
67     { let string_start = Lexing.lexeme_start lexbuf in
68     string_start_pos := string_start;
69     let double_quote = Lexing.lexeme_char lexbuf 0 = '"' in
70     if double_quote then string2 engine lexbuf else string1 engine lexbuf;
71     lexbuf.Lexing.lex_start_pos <-
72     string_start - lexbuf.Lexing.lex_abs_pos;
73     (if double_quote then "STRING2" else "STRING1"),
74     (get_stored_string()) }
75    
76     | "(*"
77     { comment_start_pos := [Lexing.lexeme_start lexbuf];
78     comment engine lexbuf;
79     token engine lexbuf }
80    
81     | eof
82     { "EOI","" }
83     | _
84     { error
85     (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
86     (Illegal_character ((Lexing.lexeme lexbuf).[0])) }
87    
88     and comment = parse
89     "(*"
90     { comment_start_pos := Lexing.lexeme_start lexbuf :: !comment_start_pos;
91     comment engine lexbuf;
92     }
93     | "*)"
94     { comment_start_pos := List.tl !comment_start_pos;
95     if !comment_start_pos <> [] then comment engine lexbuf;
96     }
97     | '"' | "'"
98     { string_start_pos := Lexing.lexeme_start lexbuf;
99     let string =
100     if Lexing.lexeme_char lexbuf 0 = '"' then string2 else string1 in
101     (try string engine lexbuf
102     with Location.Location (_,Unterminated_string) ->
103     let st = List.hd !comment_start_pos in
104     error st (st+2) Unterminated_string_in_comment);
105     Buffer.clear string_buff;
106     comment engine lexbuf }
107     | eof
108     { let st = List.hd !comment_start_pos in
109     error st (st+2) Unterminated_comment
110     }
111     | _
112     { comment engine lexbuf }
113    
114     and string2 = parse
115     '"'
116     { () }
117     | '\\' ['\\' '"']
118     { store_char (Lexing.lexeme_char lexbuf 1);
119     string2 engine lexbuf }
120     | '\\' ascii_digit+
121     { store_char (char_for_decimal_code (Lexing.lexeme lexbuf));
122     string2 engine lexbuf }
123     | eof
124     { error !string_start_pos (!string_start_pos+1) Unterminated_string }
125     | _
126     { store_char (Lexing.lexeme_char lexbuf 0);
127     (* TODO: Unicode *)
128     string2 engine lexbuf }
129    
130     and string1 = parse
131     "'"
132     { () }
133     | '\\' ['\\' '\'']
134     { store_char (Lexing.lexeme_char lexbuf 1);
135     string1 engine lexbuf }
136     | '\\' ascii_digit+
137     { store_char (char_for_decimal_code (Lexing.lexeme lexbuf));
138     string1 engine lexbuf }
139     | eof
140     { error !string_start_pos (!string_start_pos+1) Unterminated_string }
141     | _
142     { store_char (Lexing.lexeme_char lexbuf 0);
143     string1 engine lexbuf }
144    
145     {
146    
147     let lexer_func_of_wlex lexfun lexengine cs =
148     let lb =
149     Lexing.from_function
150     (fun s n ->
151     try s.[0] <- Stream.next cs; 1 with Stream.Failure -> 0)
152     in
153     let next () =
154     let tok = lexfun lexengine lb in
155     let loc = (Lexing.lexeme_start lb, Lexing.lexeme_end lb) in
156     (tok, loc)
157     in
158     Token.make_stream_and_location next
159    
160     let register_kw (s1,s2) =
161     if s1 = "" then
162     match s2.[0] with
163     | 'a' .. 'z' when not (Hashtbl.mem keywords s2) ->
164     Hashtbl.add keywords s2 ()
165     | _ -> ()
166    
167    
168     let lexer lexfun lexengine =
169     {
170     Token.tok_func = lexer_func_of_wlex lexfun lexengine;
171     Token.tok_using = register_kw;
172     Token.tok_removing = (fun _ -> ());
173     Token.tok_match = Token.default_match;
174     Token.tok_text = Token.lexer_text
175     }
176    
177     let classes =
178     let c i = (i,i) in
179     let i ch1 ch2 = (Char.code ch1, Char.code ch2) in
180     [ (ascii_digit, [i '0' '9']);
181     (lowercase, [i 'a' 'z']);
182     (uppercase, [i 'A' 'Z']);
183     (blank, [c 8; c 9; c 10; c 13; c 32]);
184     ]
185    
186     let table =
187     assert(nb_classes <= 256);
188     let v = String.make 256 (Char.chr encoding_error) in
189     let fill_int c (i, j) = String.fill v i (j-i+1) c in
190     let fill_class (c, l) = List.iter (fill_int (Char.chr c)) l in
191     let fill_char (ch, cl) = v.[ch] <- Char.chr cl in
192     List.iter fill_class classes;
193     List.iter fill_char one_char_classes;
194     v
195    
196     let utf8_engine = Lex_engines.engine_tiny_utf8 table
197     (fun c ->
198     if c>=0x10000 && c < 0x11000 then xml_char
199     else encoding_error)
200    
201     let latin1_engine = Lex_engines.engine_tiny_8bit table
202     }

CVS Admin">CVS Admin
ViewVC Help
Powered by ViewVC 1.1.5