| 1 |
(* Loading XML documents *)
|
| 2 |
|
| 3 |
ifdef EXPAT then
|
| 4 |
let expat_support = true
|
| 5 |
else
|
| 6 |
let expat_support = false
|
| 7 |
|
| 8 |
let use_parser = ref (if expat_support then `Expat else `Pxp)
|
| 9 |
|
| 10 |
open Pxp_yacc
|
| 11 |
open Pxp_lexer_types
|
| 12 |
open Pxp_types
|
| 13 |
open Value
|
| 14 |
open Ident
|
| 15 |
open Encodings
|
| 16 |
|
| 17 |
|
| 18 |
type buf =
|
| 19 |
{ mutable buffer : string;
|
| 20 |
mutable pos : int;
|
| 21 |
mutable length : int }
|
| 22 |
|
| 23 |
let txt = { buffer = String.create 1024; pos = 0; length = 1024 }
|
| 24 |
|
| 25 |
let resize txt n =
|
| 26 |
let new_len = txt.length * 2 + n in
|
| 27 |
let new_buf = String.create new_len in
|
| 28 |
String.unsafe_blit txt.buffer 0 new_buf 0 txt.pos;
|
| 29 |
txt.buffer <- new_buf;
|
| 30 |
txt.length <- new_len
|
| 31 |
|
| 32 |
let add_string txt s =
|
| 33 |
let len = String.length s in
|
| 34 |
let new_pos = txt.pos + len in
|
| 35 |
if new_pos > txt.length then resize txt len;
|
| 36 |
String.unsafe_blit s 0 txt.buffer txt.pos len;
|
| 37 |
txt.pos <- new_pos
|
| 38 |
|
| 39 |
let rec only_ws s i =
|
| 40 |
(i = 0) ||
|
| 41 |
(let i = pred i in match (String.unsafe_get s i) with
|
| 42 |
| ' ' | '\t' | '\n' | '\r' -> only_ws s i
|
| 43 |
| _ -> false)
|
| 44 |
|
| 45 |
|
| 46 |
let string s q =
|
| 47 |
let s = Utf8.mk s in
|
| 48 |
String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)
|
| 49 |
|
| 50 |
|
| 51 |
let attrib att =
|
| 52 |
(* TODO: better error message *)
|
| 53 |
let att = List.map (fun (n,v) -> LabelPool.mk n, string_utf8 v) att in
|
| 54 |
LabelMap.from_list (fun _ _ -> failwith "Invalid XML document: uniqueness of attributes") att
|
| 55 |
|
| 56 |
let elem (tag_ns,tag) att child =
|
| 57 |
Xml (Atom (Atoms.V.mk tag_ns tag), Record (attrib att), child)
|
| 58 |
|
| 59 |
(*
|
| 60 |
class warner = object method warn w = print_endline ("PXP WARNING: " ^ w) end
|
| 61 |
*)
|
| 62 |
|
| 63 |
type stack =
|
| 64 |
| Element of Value.t * stack
|
| 65 |
| Start of Ns.qname * (Ns.qname * Utf8.t) list * Ns.table * stack
|
| 66 |
| String of string * stack
|
| 67 |
| Empty
|
| 68 |
|
| 69 |
let stack = ref Empty
|
| 70 |
let ns_table = ref Ns.empty_table
|
| 71 |
|
| 72 |
let rec create_elt accu = function
|
| 73 |
| String (s,st) -> create_elt (string s accu) st
|
| 74 |
| Element (x,st) -> create_elt (Pair (x,accu)) st
|
| 75 |
| Start (name,att,table,st) ->
|
| 76 |
stack := Element (elem name att accu, st);
|
| 77 |
ns_table := table
|
| 78 |
| Empty -> assert false
|
| 79 |
|
| 80 |
let start_element_handler name att =
|
| 81 |
if not (only_ws txt.buffer txt.pos) then
|
| 82 |
stack := String (String.sub txt.buffer 0 txt.pos, !stack);
|
| 83 |
txt.pos <- 0;
|
| 84 |
|
| 85 |
let (table,name,att) = Ns.process_start_tag !ns_table name att in
|
| 86 |
stack := Start (name,att,!ns_table, !stack);
|
| 87 |
ns_table := table
|
| 88 |
|
| 89 |
let end_element_handler _ =
|
| 90 |
let accu =
|
| 91 |
if only_ws txt.buffer txt.pos
|
| 92 |
then nil
|
| 93 |
else string (String.sub txt.buffer 0 txt.pos) nil in
|
| 94 |
txt.pos <- 0;
|
| 95 |
create_elt accu !stack
|
| 96 |
|
| 97 |
ifdef EXPAT then
|
| 98 |
|
| 99 |
let load_expat =
|
| 100 |
let buflen = 1024 in
|
| 101 |
let buf = String.create buflen in
|
| 102 |
fun s ->
|
| 103 |
let ic =
|
| 104 |
if Url.is_url s then
|
| 105 |
let msg =
|
| 106 |
Printf.sprintf "load_xml, file \"%s\": URLs support is not available for expat, yet." s
|
| 107 |
in
|
| 108 |
raise (Location.Generic msg)
|
| 109 |
else
|
| 110 |
try open_in s
|
| 111 |
with exn ->
|
| 112 |
let msg =
|
| 113 |
Printf.sprintf "load_xml, file \"%s\": %s" s (Printexc.to_string exn)
|
| 114 |
in
|
| 115 |
raise (Location.Generic msg)
|
| 116 |
in
|
| 117 |
let p = Expat.parser_create "" in
|
| 118 |
Expat.set_start_element_handler p start_element_handler;
|
| 119 |
Expat.set_end_element_handler p end_element_handler;
|
| 120 |
Expat.set_character_data_handler p (add_string txt);
|
| 121 |
let rec loop () =
|
| 122 |
let n = input ic buf 0 buflen in
|
| 123 |
if (n > 0) then (Expat.parse_sub p buf 0 n; loop ())
|
| 124 |
in
|
| 125 |
try
|
| 126 |
loop();
|
| 127 |
Expat.final p;
|
| 128 |
close_in ic;
|
| 129 |
with
|
| 130 |
Expat.Expat_error e ->
|
| 131 |
close_in ic;
|
| 132 |
let line = Expat.get_current_line_number p
|
| 133 |
and col = Expat.get_current_column_number p in
|
| 134 |
let msg =
|
| 135 |
Printf.sprintf
|
| 136 |
"load_xml, file \"%s\", at line %i, column %i: %s"
|
| 137 |
s
|
| 138 |
(Expat.get_current_line_number p)
|
| 139 |
(Expat.get_current_column_number p)
|
| 140 |
(Expat.xml_error_to_string e)
|
| 141 |
in
|
| 142 |
raise (Location.Generic msg)
|
| 143 |
else
|
| 144 |
|
| 145 |
let load_expat s =
|
| 146 |
failwith "Expat support not included"
|
| 147 |
|
| 148 |
let pxp_handle_event = function
|
| 149 |
| E_start_tag (name,att,_) -> start_element_handler name att
|
| 150 |
| E_char_data data -> add_string txt data
|
| 151 |
| E_end_tag (_,_) -> end_element_handler ()
|
| 152 |
| _ -> ()
|
| 153 |
|
| 154 |
let pxp_config =
|
| 155 |
{ default_config with
|
| 156 |
(* warner = new warner; *)
|
| 157 |
encoding = `Enc_utf8;
|
| 158 |
store_element_positions = false;
|
| 159 |
drop_ignorable_whitespace = true
|
| 160 |
}
|
| 161 |
|
| 162 |
let load_pxp s =
|
| 163 |
try
|
| 164 |
let src =
|
| 165 |
match s with
|
| 166 |
| Url.Url s -> from_string s
|
| 167 |
| Url.Filename s -> from_file s in
|
| 168 |
let mgr = create_entity_manager pxp_config src in
|
| 169 |
process_entity pxp_config (`Entry_document[`Extend_dtd_fully]) mgr pxp_handle_event;
|
| 170 |
with exn ->
|
| 171 |
raise (Location.Generic (Pxp_types.string_of_exn exn))
|
| 172 |
|
| 173 |
|
| 174 |
|
| 175 |
let load_xml s =
|
| 176 |
Location.protect_op "load_xml";
|
| 177 |
try
|
| 178 |
(match !use_parser with
|
| 179 |
| `Expat -> load_expat s
|
| 180 |
| `Pxp -> load_pxp (Url.process s));
|
| 181 |
match !stack with
|
| 182 |
| Element (x,Empty) -> stack := Empty; x
|
| 183 |
| _ -> assert false
|
| 184 |
with e -> stack := Empty; txt.pos <-0; raise e
|
| 185 |
|
| 186 |
|
| 187 |
|
| 188 |
let load_html s =
|
| 189 |
let rec val_of_doc q = function
|
| 190 |
| Nethtml.Data data ->
|
| 191 |
if (only_ws data (String.length data)) then q else string data q
|
| 192 |
| Nethtml.Element (tag, att, child) ->
|
| 193 |
let att = List.map (fun (n,v) -> ((Ns.empty, U.mk n), U.mk v)) att in
|
| 194 |
Pair (elem (Ns.empty,U.mk tag) att (val_of_docs child), q)
|
| 195 |
and val_of_docs = function
|
| 196 |
| [] -> nil
|
| 197 |
| h::t -> val_of_doc (val_of_docs t) h
|
| 198 |
in
|
| 199 |
|
| 200 |
Location.protect_op "load_html";
|
| 201 |
let parse src = Nethtml.parse_document ~dtd:Nethtml.relaxed_html40_dtd src in
|
| 202 |
let doc =
|
| 203 |
match Url.process s with
|
| 204 |
| Url.Filename s ->
|
| 205 |
let ic = open_in s in
|
| 206 |
let doc = parse (Lexing.from_channel ic) in
|
| 207 |
close_in ic;
|
| 208 |
doc
|
| 209 |
| Url.Url s ->
|
| 210 |
parse (Lexing.from_string s)
|
| 211 |
in
|
| 212 |
let doc = Nethtml.decode ~subst:(fun _ -> "???") doc in
|
| 213 |
let doc = Nethtml.map_list
|
| 214 |
(Netconversion.convert ~in_enc:`Enc_iso88591
|
| 215 |
~out_enc:`Enc_utf8) doc in
|
| 216 |
val_of_docs doc
|
| 217 |
|
| 218 |
|
| 219 |
|
| 220 |
|
| 221 |
|
| 222 |
|
| 223 |
|
| 224 |
|
| 225 |
|
| 226 |
|
| 227 |
|
| 228 |
|
| 229 |
|
| 230 |
|
| 231 |
|
| 232 |
|
| 233 |
|