/[svn]/misc/encodings.ml
ViewVC logotype

Contents of /misc/encodings.ml

Parent Directory Parent Directory | Revision Log Revision Log


Revision 310 - (hide annotations)
Tue Jul 10 17:24:12 2007 UTC (5 years, 11 months ago) by abate
File size: 3092 byte(s)
[r2003-05-10 14:44:29 by cvscast] Start Unicode support. Remove more generic comparisons

Original author: cvscast
Date: 2003-05-10 14:44:30+00:00
1 abate 79 type uchar = int
2    
3 abate 310 module Utf8 =
4     struct
5     type ustring = string
6     type uindex = int
7 abate 79
8 abate 310 let start_index s = 0
9     let end_index s = String.length s
10     let equal_index = (==)
11     let mk s = s
12     let get_str s = s
13     let get_idx i = i
14 abate 79
15     (* TODO: handle 5,6 bytes chars; report malformed UTF-8 *)
16     let get s i =
17     match s.[i] with
18     | '\000'..'\127' as c ->
19     Char.code c
20     | '\192'..'\223' as c ->
21     ((Char.code c - 192) lsl 6) lor
22     ((Char.code s.[i+1] - 128))
23     | '\224'..'\239' as c ->
24     ((Char.code c - 192) lsl 12) lor
25     ((Char.code s.[i+1] - 128) lsl 6) lor
26     ((Char.code s.[i+2] - 128))
27     | '\240'..'\248' as c ->
28     ((Char.code c - 192) lsl 18) lor
29     ((Char.code s.[i+1] - 128) lsl 12) lor
30     ((Char.code s.[i+2] - 128) lsl 6) lor
31     ((Char.code s.[i+3] - 128))
32     | _ -> failwith "Malformed UTF-8 bufffer"
33    
34 abate 310 let next s i =
35     match s.[i] with
36     | '\000'..'\127' as c ->
37     Char.code c, i + 1
38     | '\192'..'\223' as c ->
39     ((Char.code c - 192) lsl 6) lor
40     ((Char.code s.[i+1] - 128)), i + 2
41     | '\224'..'\239' as c ->
42     ((Char.code c - 192) lsl 12) lor
43     ((Char.code s.[i+1] - 128) lsl 6) lor
44     ((Char.code s.[i+2] - 128)), i + 3
45     | '\240'..'\248' as c ->
46     ((Char.code c - 192) lsl 18) lor
47     ((Char.code s.[i+1] - 128) lsl 12) lor
48     ((Char.code s.[i+2] - 128) lsl 6) lor
49     ((Char.code s.[i+3] - 128)), i + 4
50     | _ -> failwith "Malformed UTF-8 bufffer"
51    
52     let advance s i =
53     match s.[i] with
54     | '\000'..'\127' as c -> i + 1
55     | '\192'..'\223' as c -> i + 2
56     | '\224'..'\239' as c -> i + 3
57     | '\240'..'\248' as c -> i + 4
58     | _ -> failwith "Malformed UTF-8 bufffer"
59     (*
60 abate 79 let width = Array.create 256 1
61     let () =
62     for i = 192 to 223 do width.(i) <- 2 done;
63     for i = 224 to 249 do width.(i) <- 3 done;
64     for i = 240 to 248 do width.(i) <- 4 done
65    
66 abate 310 let len s i =
67 abate 79 Array.unsafe_get width (Char.code s.[i])
68 abate 310 *)
69 abate 79
70 abate 310 let store b p =
71     (* Adapted from Netstring's netconversion.ml/write_utf8 *)
72     if p <= 127 then
73     Buffer.add_char b (Char.chr p)
74     else if p <= 0x7ff then (
75     Buffer.add_char b (Char.chr (0xc0 lor (p lsr 6)));
76     Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
77     )
78     else if p <= 0xffff then (
79     (* Refuse writing surrogate pairs, and fffe, ffff *)
80     if (p >= 0xd800 & p < 0xe000) or (p >= 0xfffe) then
81     failwith "Encodings.Utf8.store";
82     Buffer.add_char b (Char.chr (0xe0 lor (p lsr 12)));
83     Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f)));
84     Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
85     )
86     else if p <= 0x10ffff then (
87     Buffer.add_char b (Char.chr (0xf0 lor (p lsr 18)));
88     Buffer.add_char b (Char.chr (0x80 lor ((p lsr 12) land 0x3f)));
89     Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f)));
90     Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
91     )
92     else
93     (* Higher code points are not possible in XML: *)
94     failwith "Encodings.Utf8.store"
95 abate 79
96 abate 310 let copy b s i j =
97     Buffer.add_substring b s i (j - i)
98    
99     let get_substr s i j =
100     String.sub s i (j - i)
101 abate 79 end

CVS Admin">CVS Admin
ViewVC Help
Powered by ViewVC 1.1.5