| 1 |
(* Loading XML documents *)
|
| 2 |
|
| 3 |
let use_parser = ref `Pxp
|
| 4 |
|
| 5 |
open Pxp_yacc
|
| 6 |
open Pxp_lexer_types
|
| 7 |
open Pxp_types
|
| 8 |
open Value
|
| 9 |
open Ident
|
| 10 |
open Encodings
|
| 11 |
|
| 12 |
type buf =
|
| 13 |
{ mutable buffer : string;
|
| 14 |
mutable pos : int;
|
| 15 |
mutable length : int }
|
| 16 |
|
| 17 |
let create n = { buffer = String.create n; pos = 0; length = n }
|
| 18 |
|
| 19 |
let resize b n =
|
| 20 |
let new_len = b.length * 2 + n in
|
| 21 |
let new_buf = String.create new_len in
|
| 22 |
String.unsafe_blit b.buffer 0 new_buf 0 b.pos;
|
| 23 |
b.buffer <- new_buf;
|
| 24 |
b.length <- new_len
|
| 25 |
|
| 26 |
let add_string b s =
|
| 27 |
let len = String.length s in
|
| 28 |
let new_pos = b.pos + len in
|
| 29 |
if new_pos > b.length then resize b len;
|
| 30 |
String.unsafe_blit s 0 b.buffer b.pos len;
|
| 31 |
b.pos <- new_pos
|
| 32 |
|
| 33 |
let rec only_ws s i =
|
| 34 |
(i = 0) ||
|
| 35 |
(let i = pred i in match (String.unsafe_get s i) with
|
| 36 |
| ' ' | '\t' | '\n' | '\r' -> only_ws s i
|
| 37 |
| _ -> false)
|
| 38 |
|
| 39 |
|
| 40 |
let string s q =
|
| 41 |
let s = Utf8.mk s in
|
| 42 |
String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)
|
| 43 |
|
| 44 |
|
| 45 |
let attrib att =
|
| 46 |
let att = List.map (fun (l,v) -> LabelPool.mk (U.mk l), string v nil) att in
|
| 47 |
LabelMap.from_list (fun _ _ -> assert false) att
|
| 48 |
|
| 49 |
let elem tag att child =
|
| 50 |
Xml (Atom (Atoms.mk (U.mk tag)), Record (attrib att), child)
|
| 51 |
|
| 52 |
(*
|
| 53 |
class warner = object method warn w = print_endline ("PXP WARNING: " ^ w) end
|
| 54 |
*)
|
| 55 |
|
| 56 |
type token =
|
| 57 |
| Element of Value.t
|
| 58 |
| Start of string * (string * string) list
|
| 59 |
| String of string
|
| 60 |
|
| 61 |
let stack = ref []
|
| 62 |
let txt = create 1024
|
| 63 |
|
| 64 |
|
| 65 |
let rec create_elt accu = function
|
| 66 |
| String s :: st -> create_elt (string s accu) st
|
| 67 |
| Element x :: st -> create_elt (Pair (x,accu)) st
|
| 68 |
| Start (name,att) :: st -> stack := Element (elem name att accu) :: st
|
| 69 |
| [] -> assert false
|
| 70 |
|
| 71 |
|
| 72 |
let buflen = 1000
|
| 73 |
let buf = String.create buflen
|
| 74 |
|
| 75 |
ifdef EXPAT then
|
| 76 |
|
| 77 |
let load_expat s =
|
| 78 |
let p = Expat.parser_create "" in
|
| 79 |
Expat.set_start_element_handler p
|
| 80 |
(fun name att ->
|
| 81 |
if not (only_ws txt.buffer txt.pos) then
|
| 82 |
stack := String (String.sub txt.buffer 0 txt.pos) :: !stack;
|
| 83 |
txt.pos <- 0;
|
| 84 |
stack := Start (name,att) :: !stack);
|
| 85 |
Expat.set_end_element_handler p
|
| 86 |
(fun _ ->
|
| 87 |
let accu =
|
| 88 |
if only_ws txt.buffer txt.pos
|
| 89 |
then nil
|
| 90 |
else string (String.sub txt.buffer 0 txt.pos) nil in
|
| 91 |
txt.pos <- 0;
|
| 92 |
create_elt accu !stack);
|
| 93 |
Expat.set_character_data_handler p (add_string txt);
|
| 94 |
let ic = open_in s in
|
| 95 |
let rec loop () =
|
| 96 |
let n = input ic buf 0 buflen in
|
| 97 |
if (n > 0) then
|
| 98 |
(*(Expat.parse p (String.sub buf 0 n); loop ())*)
|
| 99 |
(Expat.parse_sub p buf 0 n; loop ())
|
| 100 |
in
|
| 101 |
try
|
| 102 |
loop();
|
| 103 |
Expat.final p;
|
| 104 |
close_in ic;
|
| 105 |
match !stack with
|
| 106 |
| [ Element x ] -> stack := []; x
|
| 107 |
| _ -> assert false
|
| 108 |
with
|
| 109 |
Expat.Expat_error e ->
|
| 110 |
failwith ("Expat ("^s^"):"^Expat.xml_error_to_string e)
|
| 111 |
|
| 112 |
else
|
| 113 |
|
| 114 |
let load_expat s =
|
| 115 |
failwith "Expat support not included"
|
| 116 |
|
| 117 |
|
| 118 |
|
| 119 |
let handle_event = function
|
| 120 |
| E_start_tag (name,att,_) ->
|
| 121 |
if not (only_ws txt.buffer txt.pos) then
|
| 122 |
stack := String (String.sub txt.buffer 0 txt.pos) :: !stack;
|
| 123 |
txt.pos <- 0;
|
| 124 |
stack := Start (name,att) :: !stack
|
| 125 |
| E_char_data data ->
|
| 126 |
add_string txt data
|
| 127 |
| E_end_tag (_,_) ->
|
| 128 |
let accu =
|
| 129 |
if only_ws txt.buffer txt.pos
|
| 130 |
then nil
|
| 131 |
else string (String.sub txt.buffer 0 txt.pos) nil in
|
| 132 |
txt.pos <- 0;
|
| 133 |
create_elt accu !stack
|
| 134 |
| _ -> ()
|
| 135 |
|
| 136 |
let load_pxp s =
|
| 137 |
let config = { default_config with
|
| 138 |
(* warner = new warner; *)
|
| 139 |
encoding = `Enc_utf8;
|
| 140 |
store_element_positions = false;
|
| 141 |
drop_ignorable_whitespace = true
|
| 142 |
}
|
| 143 |
in
|
| 144 |
let mgr = create_entity_manager config (from_file s) in
|
| 145 |
process_entity config (`Entry_document[]) mgr handle_event;
|
| 146 |
match !stack with
|
| 147 |
| [ Element x ] -> stack := []; x
|
| 148 |
| _ -> assert false
|
| 149 |
|
| 150 |
let load_xml_aux s =
|
| 151 |
match !use_parser with
|
| 152 |
| `Expat -> load_expat s
|
| 153 |
| `Pxp -> load_pxp s
|
| 154 |
|
| 155 |
let load_xml s =
|
| 156 |
Location.protect_op "load_xml";
|
| 157 |
try load_xml_aux s
|
| 158 |
with exn ->
|
| 159 |
raise
|
| 160 |
(Location.Generic (Pxp_types.string_of_exn exn))
|
| 161 |
|
| 162 |
|
| 163 |
let load_html s =
|
| 164 |
let rec val_of_doc q = function
|
| 165 |
| Nethtml.Data data ->
|
| 166 |
if (only_ws data (String.length data)) then q else string data q
|
| 167 |
| Nethtml.Element (tag, att, child) ->
|
| 168 |
Pair (elem tag att (val_of_docs child), q)
|
| 169 |
and val_of_docs = function
|
| 170 |
| [] -> nil
|
| 171 |
| h::t -> val_of_doc (val_of_docs t) h
|
| 172 |
in
|
| 173 |
|
| 174 |
Location.protect_op "load_html";
|
| 175 |
let ic = open_in s in
|
| 176 |
let doc = Nethtml.parse_document
|
| 177 |
~dtd:Nethtml.relaxed_html40_dtd
|
| 178 |
(Lexing.from_channel ic) in
|
| 179 |
let doc = Nethtml.decode ~enc:`Enc_utf8 ~subst:(fun _ -> "???") doc in
|
| 180 |
close_in ic;
|
| 181 |
val_of_docs doc
|
| 182 |
|