| 1 |
(* Loading XML documents *)
|
| 2 |
|
| 3 |
(*TODO: close the file ! *)
|
| 4 |
|
| 5 |
open Pxp_yacc
|
| 6 |
open Pxp_lexer_types
|
| 7 |
open Pxp_types
|
| 8 |
open Value
|
| 9 |
open Ident
|
| 10 |
open Encodings
|
| 11 |
|
| 12 |
type buf =
|
| 13 |
{ mutable buffer : string;
|
| 14 |
mutable pos : int;
|
| 15 |
mutable length : int }
|
| 16 |
|
| 17 |
let create n = { buffer = String.create n; pos = 0; length = n }
|
| 18 |
|
| 19 |
let resize b n =
|
| 20 |
let new_len = b.length * 2 + n in
|
| 21 |
let new_buf = String.create new_len in
|
| 22 |
String.unsafe_blit b.buffer 0 new_buf 0 b.pos;
|
| 23 |
b.buffer <- new_buf;
|
| 24 |
b.length <- new_len
|
| 25 |
|
| 26 |
let add_string b s =
|
| 27 |
let len = String.length s in
|
| 28 |
let new_pos = b.pos + len in
|
| 29 |
if new_pos > b.length then resize b len;
|
| 30 |
String.unsafe_blit s 0 b.buffer b.pos len;
|
| 31 |
b.pos <- new_pos
|
| 32 |
|
| 33 |
let rec only_ws s i =
|
| 34 |
(i = 0) ||
|
| 35 |
(let i = pred i in match (String.unsafe_get s i) with
|
| 36 |
| ' ' | '\t' | '\n' | '\r' -> only_ws s i
|
| 37 |
| _ -> false)
|
| 38 |
|
| 39 |
|
| 40 |
let string s q =
|
| 41 |
let s = Utf8.mk s in
|
| 42 |
String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)
|
| 43 |
|
| 44 |
|
| 45 |
let attrib att =
|
| 46 |
let att = List.map (fun (l,v) -> LabelPool.mk (U.mk l), string v nil) att in
|
| 47 |
LabelMap.from_list (fun _ _ -> assert false) att
|
| 48 |
|
| 49 |
let elem tag att child =
|
| 50 |
Xml (Atom (Atoms.mk (U.mk tag)), Pair (Record (attrib att), child))
|
| 51 |
|
| 52 |
class warner = object method warn w = print_endline ("PXP WARNING: " ^ w) end
|
| 53 |
|
| 54 |
type token =
|
| 55 |
| Element of Value.t
|
| 56 |
| Start of string * (string * string) list
|
| 57 |
| String of string
|
| 58 |
|
| 59 |
let load_xml_aux s =
|
| 60 |
let config = { default_config with
|
| 61 |
(* warner = new warner; *)
|
| 62 |
encoding = `Enc_utf8;
|
| 63 |
store_element_positions = false;
|
| 64 |
drop_ignorable_whitespace = true
|
| 65 |
}
|
| 66 |
in
|
| 67 |
let mgr = create_entity_manager config (from_file s) in
|
| 68 |
let next_event =
|
| 69 |
create_pull_parser config (`Entry_document[]) mgr in
|
| 70 |
let get () =
|
| 71 |
match next_event () with
|
| 72 |
| Some (E_error exn) -> failwith (Pxp_types.string_of_exn exn)
|
| 73 |
| Some E_end_of_stream | None -> failwith "Unexpected end of XML stream"
|
| 74 |
| Some x -> x in
|
| 75 |
|
| 76 |
let txt = create 1024 in
|
| 77 |
|
| 78 |
let rec create_elt accu = function
|
| 79 |
| String s :: st -> create_elt (string s accu) st
|
| 80 |
| Element x :: st -> create_elt (Pair (x,accu)) st
|
| 81 |
| [ Start (name,att) ] -> elem name att accu
|
| 82 |
| Start (name,att) :: st -> parse_seq (Element (elem name att accu) :: st)
|
| 83 |
| [] -> assert false
|
| 84 |
and parse_seq stack =
|
| 85 |
match get() with
|
| 86 |
| E_start_tag (name,att,_) ->
|
| 87 |
if only_ws txt.buffer txt.pos then
|
| 88 |
let () = txt.pos <- 0 in
|
| 89 |
parse_seq (Start (name,att) :: stack)
|
| 90 |
else
|
| 91 |
let s = String.sub txt.buffer 0 txt.pos in
|
| 92 |
let () = txt.pos <- 0 in
|
| 93 |
parse_seq (Start (name,att) :: String s :: stack)
|
| 94 |
| E_char_data data ->
|
| 95 |
add_string txt data;
|
| 96 |
parse_seq stack
|
| 97 |
| E_end_tag (_,_) ->
|
| 98 |
if only_ws txt.buffer txt.pos then
|
| 99 |
(txt.pos <- 0;
|
| 100 |
create_elt nil stack)
|
| 101 |
else
|
| 102 |
let s = String.sub txt.buffer 0 txt.pos in
|
| 103 |
txt.pos <- 0;
|
| 104 |
create_elt (string s nil) stack
|
| 105 |
| _ -> failwith "Expect start_tag, char_data, or end_tag"
|
| 106 |
in
|
| 107 |
let rec parse_doc () =
|
| 108 |
match get () with
|
| 109 |
| E_start_tag (name,att,_) -> parse_seq [ Start (name,att) ]
|
| 110 |
| _ -> parse_doc () in
|
| 111 |
parse_doc ()
|
| 112 |
|
| 113 |
|
| 114 |
let load_xml s =
|
| 115 |
Location.protect_op "load_xml";
|
| 116 |
try load_xml_aux s
|
| 117 |
with exn ->
|
| 118 |
raise
|
| 119 |
(Location.Generic (Pxp_types.string_of_exn exn))
|
| 120 |
|
| 121 |
|
| 122 |
let load_html s =
|
| 123 |
let rec val_of_doc q = function
|
| 124 |
| Nethtml.Data data ->
|
| 125 |
if (only_ws data (String.length data)) then q else string data q
|
| 126 |
| Nethtml.Element (tag, att, child) ->
|
| 127 |
Pair (elem tag att (val_of_docs child), q)
|
| 128 |
and val_of_docs = function
|
| 129 |
| [] -> nil
|
| 130 |
| h::t -> val_of_doc (val_of_docs t) h
|
| 131 |
in
|
| 132 |
|
| 133 |
Location.protect_op "load_html";
|
| 134 |
let ic = open_in s in
|
| 135 |
let doc = Nethtml.parse_document
|
| 136 |
~dtd:Nethtml.relaxed_html40_dtd
|
| 137 |
(Lexing.from_channel ic) in
|
| 138 |
let doc = Nethtml.decode ~enc:`Enc_utf8 ~subst:(fun _ -> "???") doc in
|
| 139 |
close_in ic;
|
| 140 |
val_of_docs doc
|
| 141 |
|