/[svn]/runtime/load_xml.ml
ViewVC logotype

Contents of /runtime/load_xml.ml

Parent Directory Parent Directory | Revision Log Revision Log


Revision 395 - (show annotations)
Tue Jul 10 17:31:24 2007 UTC (5 years, 10 months ago) by abate
File size: 3793 byte(s)
[r2003-05-22 17:09:52 by cvscast] Tail-recursive load_xml improved

Original author: cvscast
Date: 2003-05-22 17:09:52+00:00
1 (* Loading XML documents *)
2
3 (*TODO: close the file ! *)
4
5 open Pxp_yacc
6 open Pxp_lexer_types
7 open Pxp_types
8 open Value
9 open Ident
10 open Encodings
11
12 type buf =
13 { mutable buffer : string;
14 mutable pos : int;
15 mutable length : int }
16
17 let create n = { buffer = String.create n; pos = 0; length = n }
18
19 let resize b n =
20 let new_len = b.length * 2 + n in
21 let new_buf = String.create new_len in
22 String.unsafe_blit b.buffer 0 new_buf 0 b.pos;
23 b.buffer <- new_buf;
24 b.length <- new_len
25
26 let add_string b s =
27 let len = String.length s in
28 let new_pos = b.pos + len in
29 if new_pos > b.length then resize b len;
30 String.unsafe_blit s 0 b.buffer b.pos len;
31 b.pos <- new_pos
32
33 let rec only_ws s i =
34 (i = 0) ||
35 (let i = pred i in match (String.unsafe_get s i) with
36 | ' ' | '\t' | '\n' | '\r' -> only_ws s i
37 | _ -> false)
38
39
40 let string s q =
41 let s = Utf8.mk s in
42 String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)
43
44
45 let attrib att =
46 let att = List.map (fun (l,v) -> LabelPool.mk (U.mk l), string v nil) att in
47 LabelMap.from_list (fun _ _ -> assert false) att
48
49 let elem tag att child =
50 Xml (Atom (Atoms.mk (U.mk tag)), Pair (Record (attrib att), child))
51
52 class warner = object method warn w = print_endline ("PXP WARNING: " ^ w) end
53
54 type token =
55 | Element of Value.t
56 | Start of string * (string * string) list
57 | String of string
58
59 let load_xml_aux s =
60 let config = { default_config with
61 (* warner = new warner; *)
62 encoding = `Enc_utf8;
63 store_element_positions = false;
64 drop_ignorable_whitespace = true
65 }
66 in
67 let mgr = create_entity_manager config (from_file s) in
68 let next_event =
69 create_pull_parser config (`Entry_document[]) mgr in
70 let get () =
71 match next_event () with
72 | Some (E_error exn) -> failwith (Pxp_types.string_of_exn exn)
73 | Some E_end_of_stream | None -> failwith "Unexpected end of XML stream"
74 | Some x -> x in
75
76 let txt = create 1024 in
77
78 let rec create_elt accu = function
79 | String s :: st -> create_elt (string s accu) st
80 | Element x :: st -> create_elt (Pair (x,accu)) st
81 | [ Start (name,att) ] -> elem name att accu
82 | Start (name,att) :: st -> parse_seq (Element (elem name att accu) :: st)
83 | [] -> assert false
84 and parse_seq stack =
85 match get() with
86 | E_start_tag (name,att,_) ->
87 let stack =
88 if only_ws txt.buffer txt.pos then stack
89 else String (String.sub txt.buffer 0 txt.pos) :: stack in
90 txt.pos <- 0;
91 parse_seq (Start (name,att) :: stack)
92 | E_char_data data ->
93 add_string txt data;
94 parse_seq stack
95 | E_end_tag (_,_) ->
96 let accu =
97 if only_ws txt.buffer txt.pos
98 then nil
99 else string (String.sub txt.buffer 0 txt.pos) nil in
100 txt.pos <- 0;
101 create_elt accu stack
102 | _ -> failwith "Expect start_tag, char_data, or end_tag"
103 in
104 let rec parse_doc () =
105 match get () with
106 | E_start_tag (name,att,_) -> parse_seq [ Start (name,att) ]
107 | _ -> parse_doc () in
108 parse_doc ()
109
110
111 let load_xml s =
112 Location.protect_op "load_xml";
113 try load_xml_aux s
114 with exn ->
115 raise
116 (Location.Generic (Pxp_types.string_of_exn exn))
117
118
119 let load_html s =
120 let rec val_of_doc q = function
121 | Nethtml.Data data ->
122 if (only_ws data (String.length data)) then q else string data q
123 | Nethtml.Element (tag, att, child) ->
124 Pair (elem tag att (val_of_docs child), q)
125 and val_of_docs = function
126 | [] -> nil
127 | h::t -> val_of_doc (val_of_docs t) h
128 in
129
130 Location.protect_op "load_html";
131 let ic = open_in s in
132 let doc = Nethtml.parse_document
133 ~dtd:Nethtml.relaxed_html40_dtd
134 (Lexing.from_channel ic) in
135 let doc = Nethtml.decode ~enc:`Enc_utf8 ~subst:(fun _ -> "???") doc in
136 close_in ic;
137 val_of_docs doc
138

CVS Admin">CVS Admin
ViewVC Help
Powered by ViewVC 1.1.5