/[svn]/parser/wlexer.mll
ViewVC logotype

Contents of /parser/wlexer.mll

Parent Directory Parent Directory | Revision Log Revision Log


Revision 236 - (hide annotations)
Tue Jul 10 17:17:48 2007 UTC (5 years, 11 months ago) by abate
File size: 5931 byte(s)
[r2003-03-12 19:09:30 by cvscast] Empty log message

Original author: cvscast
Date: 2003-03-12 19:09:30+00:00
1 abate 81 (* File to be processed by wlex, not ocamllex ! *)
2     (* Loosely inspired from OCaml lexer.mll *)
3    
4     classes
5     encoding_error
6     xml_char
7     blank
8     lowercase uppercase ascii_digit
9     "_<>=.,:;+-*/@&{}[]()|?`\"\\\'!"
10    
11    
12     {
13     let keywords = Hashtbl.create 17
14    
15     let error i j exn = raise (Location.Location ((i,j),exn))
16     exception Illegal_character of char
17     exception Unterminated_comment
18     exception Unterminated_string
19     exception Unterminated_string_in_comment
20    
21     (* Buffer for string literals *)
22    
23     let string_buff = Buffer.create 1024
24     let store_char = Buffer.add_char string_buff
25     let get_stored_string () =
26     let s = Buffer.contents string_buff in
27     Buffer.clear string_buff;
28     s
29    
30     let string_start_pos = ref 0;;
31     let comment_start_pos : int list ref = ref [];;
32    
33     let char_for_decimal_code s =
34     let s = String.sub s 1 (String.length s - 1) in
35     let c = int_of_string s in
36     assert ( c < 256 ); (* TODO: handle Unicode *)
37     Char.chr c
38 abate 154
39     let rec tag_of_tag s i =
40     match s.[i] with
41     | '\008' | '\009' | '\010' | '\013' | '\032' -> tag_of_tag s (i+1)
42     | _ -> String.sub s i (String.length s - i)
43    
44 abate 81 }
45    
46    
47 abate 236 let identchar = lowercase | uppercase | ascii_digit | '_' | '\'' | '-'
48 abate 162 let ident = identchar* ( ':' identchar+)*
49 abate 81
50     rule token = parse
51     blank+ { token engine lexbuf }
52 abate 162 | (lowercase | '_') ident {
53 abate 81 let s = Lexing.lexeme lexbuf in
54     if Hashtbl.mem keywords s then "",s else "LIDENT",s
55     }
56 abate 162 | uppercase ident { "UIDENT",Lexing.lexeme lexbuf }
57 abate 81 | ascii_digit+ { "INT",Lexing.lexeme lexbuf }
58 abate 162 | "<" blank* (lowercase | uppercase) ident {
59 abate 81 let s = Lexing.lexeme lexbuf in
60 abate 154 "TAG", tag_of_tag s 1
61 abate 81 }
62     | [ "<>=.,:;+-*/@&{}[]()|?`!" ]
63 abate 229 | "->" | "::" | ";;" | "--" | ":=" | "\\" | "++"
64 abate 159 | "{|" | "|}"
65 abate 81 | ["?+*"] "?"
66     { "",Lexing.lexeme lexbuf }
67    
68     | '"' | "'"
69     { let string_start = Lexing.lexeme_start lexbuf in
70     string_start_pos := string_start;
71     let double_quote = Lexing.lexeme_char lexbuf 0 = '"' in
72     if double_quote then string2 engine lexbuf else string1 engine lexbuf;
73     lexbuf.Lexing.lex_start_pos <-
74     string_start - lexbuf.Lexing.lex_abs_pos;
75     (if double_quote then "STRING2" else "STRING1"),
76     (get_stored_string()) }
77    
78     | "(*"
79     { comment_start_pos := [Lexing.lexeme_start lexbuf];
80     comment engine lexbuf;
81     token engine lexbuf }
82    
83     | eof
84     { "EOI","" }
85     | _
86     { error
87     (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
88     (Illegal_character ((Lexing.lexeme lexbuf).[0])) }
89    
90     and comment = parse
91     "(*"
92     { comment_start_pos := Lexing.lexeme_start lexbuf :: !comment_start_pos;
93     comment engine lexbuf;
94     }
95     | "*)"
96     { comment_start_pos := List.tl !comment_start_pos;
97     if !comment_start_pos <> [] then comment engine lexbuf;
98     }
99     | '"' | "'"
100     { string_start_pos := Lexing.lexeme_start lexbuf;
101     let string =
102     if Lexing.lexeme_char lexbuf 0 = '"' then string2 else string1 in
103     (try string engine lexbuf
104     with Location.Location (_,Unterminated_string) ->
105     let st = List.hd !comment_start_pos in
106     error st (st+2) Unterminated_string_in_comment);
107     Buffer.clear string_buff;
108     comment engine lexbuf }
109     | eof
110     { let st = List.hd !comment_start_pos in
111     error st (st+2) Unterminated_comment
112     }
113     | _
114     { comment engine lexbuf }
115    
116     and string2 = parse
117     '"'
118     { () }
119     | '\\' ['\\' '"']
120     { store_char (Lexing.lexeme_char lexbuf 1);
121     string2 engine lexbuf }
122     | '\\' ascii_digit+
123     { store_char (char_for_decimal_code (Lexing.lexeme lexbuf));
124     string2 engine lexbuf }
125     | eof
126     { error !string_start_pos (!string_start_pos+1) Unterminated_string }
127     | _
128     { store_char (Lexing.lexeme_char lexbuf 0);
129     (* TODO: Unicode *)
130     string2 engine lexbuf }
131    
132     and string1 = parse
133     "'"
134     { () }
135     | '\\' ['\\' '\'']
136     { store_char (Lexing.lexeme_char lexbuf 1);
137     string1 engine lexbuf }
138     | '\\' ascii_digit+
139     { store_char (char_for_decimal_code (Lexing.lexeme lexbuf));
140     string1 engine lexbuf }
141     | eof
142     { error !string_start_pos (!string_start_pos+1) Unterminated_string }
143     | _
144     { store_char (Lexing.lexeme_char lexbuf 0);
145     string1 engine lexbuf }
146    
147     {
148    
149     let lexer_func_of_wlex lexfun lexengine cs =
150     let lb =
151     Lexing.from_function
152     (fun s n ->
153     try s.[0] <- Stream.next cs; 1 with Stream.Failure -> 0)
154     in
155     let next () =
156     let tok = lexfun lexengine lb in
157     let loc = (Lexing.lexeme_start lb, Lexing.lexeme_end lb) in
158     (tok, loc)
159     in
160     Token.make_stream_and_location next
161    
162     let register_kw (s1,s2) =
163     if s1 = "" then
164     match s2.[0] with
165     | 'a' .. 'z' when not (Hashtbl.mem keywords s2) ->
166     Hashtbl.add keywords s2 ()
167     | _ -> ()
168    
169    
170     let lexer lexfun lexengine =
171     {
172     Token.tok_func = lexer_func_of_wlex lexfun lexengine;
173     Token.tok_using = register_kw;
174     Token.tok_removing = (fun _ -> ());
175     Token.tok_match = Token.default_match;
176     Token.tok_text = Token.lexer_text
177     }
178    
179     let classes =
180     let c i = (i,i) in
181     let i ch1 ch2 = (Char.code ch1, Char.code ch2) in
182     [ (ascii_digit, [i '0' '9']);
183     (lowercase, [i 'a' 'z']);
184     (uppercase, [i 'A' 'Z']);
185     (blank, [c 8; c 9; c 10; c 13; c 32]);
186     ]
187    
188     let table =
189     assert(nb_classes <= 256);
190     let v = String.make 256 (Char.chr encoding_error) in
191     let fill_int c (i, j) = String.fill v i (j-i+1) c in
192     let fill_class (c, l) = List.iter (fill_int (Char.chr c)) l in
193     let fill_char (ch, cl) = v.[ch] <- Char.chr cl in
194     List.iter fill_class classes;
195     List.iter fill_char one_char_classes;
196     v
197    
198     let utf8_engine = Lex_engines.engine_tiny_utf8 table
199     (fun c ->
200     if c>=0x10000 && c < 0x11000 then xml_char
201     else encoding_error)
202    
203     let latin1_engine = Lex_engines.engine_tiny_8bit table
204     }

CVS Admin">CVS Admin
ViewVC Help
Powered by ViewVC 1.1.5