| 1 |
(* File to be processed by wlex, not ocamllex ! *)
|
| 2 |
(* Loosely inspired from OCaml lexer.mll *)
|
| 3 |
|
| 4 |
classes
|
| 5 |
encoding_error
|
| 6 |
xml_char
|
| 7 |
blank
|
| 8 |
lowercase uppercase ascii_digit
|
| 9 |
"_<>=.,:;+-*/@&{}[]()|?`\"\\\'!"
|
| 10 |
|
| 11 |
|
| 12 |
{
|
| 13 |
let keywords = Hashtbl.create 17
|
| 14 |
|
| 15 |
let error i j exn = raise (Location.Location ((i,j),exn))
|
| 16 |
exception Illegal_character of char
|
| 17 |
exception Unterminated_comment
|
| 18 |
exception Unterminated_string
|
| 19 |
exception Unterminated_string_in_comment
|
| 20 |
|
| 21 |
(* Buffer for string literals *)
|
| 22 |
|
| 23 |
let string_buff = Buffer.create 1024
|
| 24 |
let store_char = Buffer.add_char string_buff
|
| 25 |
let get_stored_string () =
|
| 26 |
let s = Buffer.contents string_buff in
|
| 27 |
Buffer.clear string_buff;
|
| 28 |
s
|
| 29 |
|
| 30 |
let string_start_pos = ref 0;;
|
| 31 |
let comment_start_pos : int list ref = ref [];;
|
| 32 |
|
| 33 |
let char_for_decimal_code s =
|
| 34 |
let s = String.sub s 1 (String.length s - 1) in
|
| 35 |
let c = int_of_string s in
|
| 36 |
assert ( c < 256 ); (* TODO: handle Unicode *)
|
| 37 |
Char.chr c
|
| 38 |
|
| 39 |
let rec tag_of_tag s i =
|
| 40 |
match s.[i] with
|
| 41 |
| '\008' | '\009' | '\010' | '\013' | '\032' -> tag_of_tag s (i+1)
|
| 42 |
| _ -> String.sub s i (String.length s - i)
|
| 43 |
|
| 44 |
}
|
| 45 |
|
| 46 |
|
| 47 |
let identchar = lowercase | uppercase | ascii_digit | '_' | '\''
|
| 48 |
let ident = identchar* ( ':' identchar+)*
|
| 49 |
|
| 50 |
rule token = parse
|
| 51 |
blank+ { token engine lexbuf }
|
| 52 |
| (lowercase | '_') ident {
|
| 53 |
let s = Lexing.lexeme lexbuf in
|
| 54 |
if Hashtbl.mem keywords s then "",s else "LIDENT",s
|
| 55 |
}
|
| 56 |
| uppercase ident { "UIDENT",Lexing.lexeme lexbuf }
|
| 57 |
| ascii_digit+ { "INT",Lexing.lexeme lexbuf }
|
| 58 |
| "<" blank* (lowercase | uppercase) ident {
|
| 59 |
let s = Lexing.lexeme lexbuf in
|
| 60 |
"TAG", tag_of_tag s 1
|
| 61 |
}
|
| 62 |
| [ "<>=.,:;+-*/@&{}[]()|?`!" ]
|
| 63 |
| "->" | "::" | ";;" | "--" | ":=" | "\\"
|
| 64 |
| "{|" | "|}"
|
| 65 |
| ["?+*"] "?"
|
| 66 |
{ "",Lexing.lexeme lexbuf }
|
| 67 |
|
| 68 |
| '"' | "'"
|
| 69 |
{ let string_start = Lexing.lexeme_start lexbuf in
|
| 70 |
string_start_pos := string_start;
|
| 71 |
let double_quote = Lexing.lexeme_char lexbuf 0 = '"' in
|
| 72 |
if double_quote then string2 engine lexbuf else string1 engine lexbuf;
|
| 73 |
lexbuf.Lexing.lex_start_pos <-
|
| 74 |
string_start - lexbuf.Lexing.lex_abs_pos;
|
| 75 |
(if double_quote then "STRING2" else "STRING1"),
|
| 76 |
(get_stored_string()) }
|
| 77 |
|
| 78 |
| "(*"
|
| 79 |
{ comment_start_pos := [Lexing.lexeme_start lexbuf];
|
| 80 |
comment engine lexbuf;
|
| 81 |
token engine lexbuf }
|
| 82 |
|
| 83 |
| eof
|
| 84 |
{ "EOI","" }
|
| 85 |
| _
|
| 86 |
{ error
|
| 87 |
(Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
|
| 88 |
(Illegal_character ((Lexing.lexeme lexbuf).[0])) }
|
| 89 |
|
| 90 |
and comment = parse
|
| 91 |
"(*"
|
| 92 |
{ comment_start_pos := Lexing.lexeme_start lexbuf :: !comment_start_pos;
|
| 93 |
comment engine lexbuf;
|
| 94 |
}
|
| 95 |
| "*)"
|
| 96 |
{ comment_start_pos := List.tl !comment_start_pos;
|
| 97 |
if !comment_start_pos <> [] then comment engine lexbuf;
|
| 98 |
}
|
| 99 |
| '"' | "'"
|
| 100 |
{ string_start_pos := Lexing.lexeme_start lexbuf;
|
| 101 |
let string =
|
| 102 |
if Lexing.lexeme_char lexbuf 0 = '"' then string2 else string1 in
|
| 103 |
(try string engine lexbuf
|
| 104 |
with Location.Location (_,Unterminated_string) ->
|
| 105 |
let st = List.hd !comment_start_pos in
|
| 106 |
error st (st+2) Unterminated_string_in_comment);
|
| 107 |
Buffer.clear string_buff;
|
| 108 |
comment engine lexbuf }
|
| 109 |
| eof
|
| 110 |
{ let st = List.hd !comment_start_pos in
|
| 111 |
error st (st+2) Unterminated_comment
|
| 112 |
}
|
| 113 |
| _
|
| 114 |
{ comment engine lexbuf }
|
| 115 |
|
| 116 |
and string2 = parse
|
| 117 |
'"'
|
| 118 |
{ () }
|
| 119 |
| '\\' ['\\' '"']
|
| 120 |
{ store_char (Lexing.lexeme_char lexbuf 1);
|
| 121 |
string2 engine lexbuf }
|
| 122 |
| '\\' ascii_digit+
|
| 123 |
{ store_char (char_for_decimal_code (Lexing.lexeme lexbuf));
|
| 124 |
string2 engine lexbuf }
|
| 125 |
| eof
|
| 126 |
{ error !string_start_pos (!string_start_pos+1) Unterminated_string }
|
| 127 |
| _
|
| 128 |
{ store_char (Lexing.lexeme_char lexbuf 0);
|
| 129 |
(* TODO: Unicode *)
|
| 130 |
string2 engine lexbuf }
|
| 131 |
|
| 132 |
and string1 = parse
|
| 133 |
"'"
|
| 134 |
{ () }
|
| 135 |
| '\\' ['\\' '\'']
|
| 136 |
{ store_char (Lexing.lexeme_char lexbuf 1);
|
| 137 |
string1 engine lexbuf }
|
| 138 |
| '\\' ascii_digit+
|
| 139 |
{ store_char (char_for_decimal_code (Lexing.lexeme lexbuf));
|
| 140 |
string1 engine lexbuf }
|
| 141 |
| eof
|
| 142 |
{ error !string_start_pos (!string_start_pos+1) Unterminated_string }
|
| 143 |
| _
|
| 144 |
{ store_char (Lexing.lexeme_char lexbuf 0);
|
| 145 |
string1 engine lexbuf }
|
| 146 |
|
| 147 |
{
|
| 148 |
|
| 149 |
let lexer_func_of_wlex lexfun lexengine cs =
|
| 150 |
let lb =
|
| 151 |
Lexing.from_function
|
| 152 |
(fun s n ->
|
| 153 |
try s.[0] <- Stream.next cs; 1 with Stream.Failure -> 0)
|
| 154 |
in
|
| 155 |
let next () =
|
| 156 |
let tok = lexfun lexengine lb in
|
| 157 |
let loc = (Lexing.lexeme_start lb, Lexing.lexeme_end lb) in
|
| 158 |
(tok, loc)
|
| 159 |
in
|
| 160 |
Token.make_stream_and_location next
|
| 161 |
|
| 162 |
let register_kw (s1,s2) =
|
| 163 |
if s1 = "" then
|
| 164 |
match s2.[0] with
|
| 165 |
| 'a' .. 'z' when not (Hashtbl.mem keywords s2) ->
|
| 166 |
Hashtbl.add keywords s2 ()
|
| 167 |
| _ -> ()
|
| 168 |
|
| 169 |
|
| 170 |
let lexer lexfun lexengine =
|
| 171 |
{
|
| 172 |
Token.tok_func = lexer_func_of_wlex lexfun lexengine;
|
| 173 |
Token.tok_using = register_kw;
|
| 174 |
Token.tok_removing = (fun _ -> ());
|
| 175 |
Token.tok_match = Token.default_match;
|
| 176 |
Token.tok_text = Token.lexer_text
|
| 177 |
}
|
| 178 |
|
| 179 |
let classes =
|
| 180 |
let c i = (i,i) in
|
| 181 |
let i ch1 ch2 = (Char.code ch1, Char.code ch2) in
|
| 182 |
[ (ascii_digit, [i '0' '9']);
|
| 183 |
(lowercase, [i 'a' 'z']);
|
| 184 |
(uppercase, [i 'A' 'Z']);
|
| 185 |
(blank, [c 8; c 9; c 10; c 13; c 32]);
|
| 186 |
]
|
| 187 |
|
| 188 |
let table =
|
| 189 |
assert(nb_classes <= 256);
|
| 190 |
let v = String.make 256 (Char.chr encoding_error) in
|
| 191 |
let fill_int c (i, j) = String.fill v i (j-i+1) c in
|
| 192 |
let fill_class (c, l) = List.iter (fill_int (Char.chr c)) l in
|
| 193 |
let fill_char (ch, cl) = v.[ch] <- Char.chr cl in
|
| 194 |
List.iter fill_class classes;
|
| 195 |
List.iter fill_char one_char_classes;
|
| 196 |
v
|
| 197 |
|
| 198 |
let utf8_engine = Lex_engines.engine_tiny_utf8 table
|
| 199 |
(fun c ->
|
| 200 |
if c>=0x10000 && c < 0x11000 then xml_char
|
| 201 |
else encoding_error)
|
| 202 |
|
| 203 |
let latin1_engine = Lex_engines.engine_tiny_8bit table
|
| 204 |
}
|