/[svn]/runtime/load_xml.ml
ViewVC logotype

Contents of /runtime/load_xml.ml

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1106 - (show annotations)
Tue Jul 10 18:23:10 2007 UTC (5 years, 10 months ago) by abate
File size: 5844 byte(s)
[r2004-05-23 08:40:41 by afrisch] Empty log message

Original author: afrisch
Date: 2004-05-23 08:40:41+00:00
1 (* Loading XML documents *)
2
3 ifdef EXPAT then
4 let expat_support = true
5 else
6 let expat_support = false
7
8 let use_parser = ref (if expat_support then `Expat else `Pxp)
9
10 open Pxp_yacc
11 open Pxp_lexer_types
12 open Pxp_types
13 open Value
14 open Ident
15 open Encodings
16
17
18 type buf =
19 { mutable buffer : string;
20 mutable pos : int;
21 mutable length : int }
22
23 let txt = { buffer = String.create 1024; pos = 0; length = 1024 }
24
25 let resize txt n =
26 let new_len = txt.length * 2 + n in
27 let new_buf = String.create new_len in
28 String.unsafe_blit txt.buffer 0 new_buf 0 txt.pos;
29 txt.buffer <- new_buf;
30 txt.length <- new_len
31
32 let add_string txt s =
33 let len = String.length s in
34 let new_pos = txt.pos + len in
35 if new_pos > txt.length then resize txt len;
36 String.unsafe_blit s 0 txt.buffer txt.pos len;
37 txt.pos <- new_pos
38
39 let rec only_ws s i =
40 (i = 0) ||
41 (let i = pred i in match (String.unsafe_get s i) with
42 | ' ' | '\t' | '\n' | '\r' -> only_ws s i
43 | _ -> false)
44
45
46 let string s q =
47 let s = Utf8.mk s in
48 String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)
49
50
51 let attrib att =
52 (* TODO: better error message *)
53 let att = List.map (fun (n,v) -> LabelPool.mk n, string_utf8 v) att in
54 LabelMap.from_list (fun _ _ -> failwith "Invalid XML document: uniqueness of attributes") att
55
56 let elem (tag_ns,tag) att child =
57 Xml (Atom (Atoms.V.mk tag_ns tag), Record (attrib att), child)
58
59 (*
60 class warner = object method warn w = print_endline ("PXP WARNING: " ^ w) end
61 *)
62
63 type stack =
64 | Element of Value.t * stack
65 | Start of Ns.qname * (Ns.qname * Utf8.t) list * Ns.table * stack
66 | String of string * stack
67 | Empty
68
69 let stack = ref Empty
70 let ns_table = ref Ns.empty_table
71
72 let rec create_elt accu = function
73 | String (s,st) -> create_elt (string s accu) st
74 | Element (x,st) -> create_elt (Pair (x,accu)) st
75 | Start (name,att,table,st) ->
76 stack := Element (elem name att accu, st);
77 ns_table := table
78 | Empty -> assert false
79
80 let start_element_handler name att =
81 if not (only_ws txt.buffer txt.pos) then
82 stack := String (String.sub txt.buffer 0 txt.pos, !stack);
83 txt.pos <- 0;
84
85 let (table,name,att) = Ns.process_start_tag !ns_table name att in
86 stack := Start (name,att,!ns_table, !stack);
87 ns_table := table
88
89 let end_element_handler _ =
90 let accu =
91 if only_ws txt.buffer txt.pos
92 then nil
93 else string (String.sub txt.buffer 0 txt.pos) nil in
94 txt.pos <- 0;
95 create_elt accu !stack
96
97 ifdef EXPAT then
98
99 let load_expat =
100 let buflen = 1024 in
101 let buf = String.create buflen in
102 fun s ->
103 let ic =
104 if Url.is_url s then
105 let msg =
106 Printf.sprintf "load_xml, file \"%s\": URLs support is not available for expat, yet." s
107 in
108 raise (Location.Generic msg)
109 else
110 try open_in s
111 with exn ->
112 let msg =
113 Printf.sprintf "load_xml, file \"%s\": %s" s (Printexc.to_string exn)
114 in
115 raise (Location.Generic msg)
116 in
117 let p = Expat.parser_create "" in
118 Expat.set_start_element_handler p start_element_handler;
119 Expat.set_end_element_handler p end_element_handler;
120 Expat.set_character_data_handler p (add_string txt);
121 let rec loop () =
122 let n = input ic buf 0 buflen in
123 if (n > 0) then (Expat.parse_sub p buf 0 n; loop ())
124 in
125 try
126 loop();
127 Expat.final p;
128 close_in ic;
129 with
130 Expat.Expat_error e ->
131 close_in ic;
132 let line = Expat.get_current_line_number p
133 and col = Expat.get_current_column_number p in
134 let msg =
135 Printf.sprintf
136 "load_xml, file \"%s\", at line %i, column %i: %s"
137 s
138 (Expat.get_current_line_number p)
139 (Expat.get_current_column_number p)
140 (Expat.xml_error_to_string e)
141 in
142 raise (Location.Generic msg)
143 else
144
145 let load_expat s =
146 failwith "Expat support not included"
147
148 let pxp_handle_event = function
149 | E_start_tag (name,att,_) -> start_element_handler name att
150 | E_char_data data -> add_string txt data
151 | E_end_tag (_,_) -> end_element_handler ()
152 | _ -> ()
153
154 let pxp_config =
155 { default_config with
156 (* warner = new warner; *)
157 encoding = `Enc_utf8;
158 store_element_positions = false;
159 drop_ignorable_whitespace = true
160 }
161
162 let load_pxp s =
163 try
164 let src =
165 match s with
166 | Url.Url s -> from_string s
167 | Url.Filename s -> from_file s in
168 let mgr = create_entity_manager pxp_config src in
169 process_entity pxp_config (`Entry_document[`Extend_dtd_fully]) mgr pxp_handle_event;
170 with exn ->
171 raise (Location.Generic (Pxp_types.string_of_exn exn))
172
173
174
175 let load_xml s =
176 Location.protect_op "load_xml";
177 try
178 (match !use_parser with
179 | `Expat -> load_expat s
180 | `Pxp -> load_pxp (Url.process s));
181 match !stack with
182 | Element (x,Empty) -> stack := Empty; x
183 | _ -> assert false
184 with e -> stack := Empty; txt.pos <-0; raise e
185
186
187
188 let load_html s =
189 let rec val_of_doc q = function
190 | Nethtml.Data data ->
191 if (only_ws data (String.length data)) then q else string data q
192 | Nethtml.Element (tag, att, child) ->
193 let att = List.map (fun (n,v) -> ((Ns.empty, U.mk n), U.mk v)) att in
194 Pair (elem (Ns.empty,U.mk tag) att (val_of_docs child), q)
195 and val_of_docs = function
196 | [] -> nil
197 | h::t -> val_of_doc (val_of_docs t) h
198 in
199
200 Location.protect_op "load_html";
201 let parse src = Nethtml.parse_document ~dtd:Nethtml.relaxed_html40_dtd src in
202 let doc =
203 match Url.process s with
204 | Url.Filename s ->
205 let ic = open_in s in
206 let doc = parse (Lexing.from_channel ic) in
207 close_in ic;
208 doc
209 | Url.Url s ->
210 parse (Lexing.from_string s)
211 in
212 let doc = Nethtml.decode ~subst:(fun _ -> "???") doc in
213 let doc = Nethtml.map_list
214 (Netconversion.convert ~in_enc:`Enc_iso88591
215 ~out_enc:`Enc_utf8) doc in
216 val_of_docs doc
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233

CVS Admin">CVS Admin
ViewVC Help
Powered by ViewVC 1.1.5