1 files changed, 1085 insertions, 0 deletions
diff --git a/dev-ml/markup/files/uutf.patch b/dev-ml/markup/files/uutf.patch
new file mode 100644
index 000000000000..f561084ee454
--- /dev/null
+++ b/dev-ml/markup/files/uutf.patch
@@ -0,0 +1,1085 @@
+Index: markup.ml-0.7.2/src/common.ml
+===================================================================
+--- markup.ml-0.7.2.orig/src/common.ml
++++ markup.ml-0.7.2/src/common.ml
+@@ -134,7 +134,7 @@ let is_printable = is_in_range 0x0020 0x
+ let char c =
+   if is_printable c then begin
+     let buffer = Buffer.create 4 in
+-    add_utf_8 buffer c;
++    add_utf_8 buffer (Uchar.of_int c);
+     Buffer.contents buffer
+   end
+   else
+Index: markup.ml-0.7.2/src/detect.ml
+===================================================================
+--- markup.ml-0.7.2.orig/src/detect.ml
++++ markup.ml-0.7.2/src/detect.ml
+@@ -222,7 +222,7 @@ let meta_tag_prescan =
+     let rec iterate () =
+       next source throw (fun () -> k "") (function
+         | c when c = quote -> k (Buffer.contents buffer)
+-        | c -> add_utf_8 buffer (Char.code (Char.lowercase c)); iterate ())
++        | c -> add_utf_8 buffer (Uchar.of_int (Char.code (Char.lowercase c))); iterate ())
+     in
+     iterate ()
+   in
+@@ -236,7 +236,7 @@ let meta_tag_prescan =
+           push source c;
+           k (Buffer.contents buffer)
+         | c ->
+-          add_utf_8 buffer (Char.code (Char.lowercase c));
++          add_utf_8 buffer (Uchar.of_int (Char.code (Char.lowercase c)));
+           iterate ())
+     in
+     iterate ()
+@@ -315,7 +315,7 @@ let meta_tag_prescan =
+               k (Buffer.contents buffer)
+ 
+             | Some c ->
+-              add_utf_8 buffer (Char.code (Char.lowercase c));
++              add_utf_8 buffer (Uchar.of_int (Char.code (Char.lowercase c)));
+               iterate ()
+           end
+         in
+Index: markup.ml-0.7.2/src/encoding.ml
+===================================================================
+--- markup.ml-0.7.2.orig/src/encoding.ml
++++ markup.ml-0.7.2/src/encoding.ml
+@@ -4,7 +4,7 @@
+ open Common
+ open Kstream
+ 
+-type t = ?report:Error.parse_handler -> char Kstream.t -> int Kstream.t
++type t = ?report:Error.parse_handler -> char Kstream.t -> Uchar.t Kstream.t
+ 
+ let wrap f = fun ?(report = Error.ignore_errors) s -> f report s
+ 
+@@ -24,8 +24,8 @@ let _uutf_decoder encoding name =
+           k Uutf.u_rep)
+         | `Await ->
+           next bytes throw
+-            (fun () -> Uutf.Manual.src decoder "" 0 0; run ())
+-            (fun c -> Uutf.Manual.src decoder (String.make 1 c) 0 1; run ())
++            (fun () -> Uutf.Manual.src decoder Bytes.empty 0 0; run ())
++            (fun c -> Uutf.Manual.src decoder (Bytes.make 1 c) 0 1; run ())
+       in
+       run ())
+     |> make)
+@@ -87,7 +87,7 @@ let _ucs_4_decoder arrange name =
+               let skip =
+                 if !first then begin
+                   first := false;
+-                  scalar = Uutf.u_bom
++                  scalar = Uchar.to_int Uutf.u_bom
+                 end
+                 else
+                   false
+@@ -96,9 +96,9 @@ let _ucs_4_decoder arrange name =
+               if skip then run ()
+               else
+                 if scalar = 0x000A then
+-                  newline k scalar
++                  newline k (Uchar.of_int scalar)
+                 else
+-                  char k scalar
++                  char k (Uchar.of_int scalar)
+ 
+           | [] -> empty ()
+ 
+@@ -130,7 +130,7 @@ let code_page table =
+ 
+   (fun _ bytes ->
+     (fun throw empty k ->
+-      next bytes throw empty (fun c -> k table.(Char.code c)))
++      next bytes throw empty (fun c -> k (Uchar.of_int table.(Char.code c))))
+     |> make)
+   |> wrap
+ 
+Index: markup.ml-0.7.2/src/html_parser.ml
+===================================================================
+--- markup.ml-0.7.2.orig/src/html_parser.ml
++++ markup.ml-0.7.2/src/html_parser.ml
+@@ -1022,7 +1022,7 @@ let parse requested_context report (toke
+   let frameset_ok = ref true in
+   let head_seen = ref false in
+ 
+-  let add_character = Text.add text in
++  let add_character l c = Text.add text l (Uchar.of_int c) in
+ 
+   set_foreign (fun () ->
+     Stack.current_element_is_foreign context open_elements);
+@@ -2717,7 +2717,7 @@ let parse requested_context report (toke
+     | l, `Char 0 ->
+       report l (`Bad_token ("U+0000", "foreign content", "null")) !throw
+         (fun () ->
+-      add_character l Uutf.u_rep;
++      add_character l (Uchar.to_int Uutf.u_rep);
+       mode ())
+ 
+     | l, `Char (0x0009 | 0x000A | 0x000C | 0x000D | 0x0020 as c) ->
+Index: markup.ml-0.7.2/src/html_tokenizer.ml
+===================================================================
+--- markup.ml-0.7.2.orig/src/html_tokenizer.ml
++++ markup.ml-0.7.2/src/html_tokenizer.ml
+@@ -252,7 +252,7 @@ let tokenize report (input, get_location
+                 report location
+                   (`Bad_token (prefix ^ text ^ semicolon, "character reference",
+                                "Windows-1252 character")) !throw (fun () ->
+-                k (Some (`One n)))
++                k (Some (`One (Uchar.of_int n))))
+ 
+               else
+                 match n with
+@@ -268,9 +268,9 @@ let tokenize report (input, get_location
+                     (`Bad_token (prefix ^ text ^ semicolon,
+                                  "character reference",
+                                  "invalid HTML character")) !throw (fun () ->
+-                  k (Some (`One n)))
++                  k (Some (`One (Uchar.of_int n))))
+ 
+-                | n -> k (Some (`One n))
++                | n -> k (Some (`One (Uchar.of_int n)))
+               end
+             end
+         in
+@@ -366,6 +366,10 @@ let tokenize report (input, get_location
+                   | _ -> unterminated ())
+         in
+ 
++        let ma (lead, cps) = (* lift int code points to Uchar.t *)
++          match cps with `One u -> (lead, `One (Uchar.of_int u))
++          | `Two (u, v) -> (lead, `Two (Uchar.of_int u, Uchar.of_int v)) in
++
+         let rec match_named best matched replace candidate =
+           next_option input !throw (function
+             | None -> finish best matched replace
+@@ -377,8 +381,8 @@ let tokenize report (input, get_location
+               | `None -> finish best matched (v::replace)
+               | `Continue -> match_named best matched (v::replace) candidate
+               | `Match_and_continue m ->
+-                match_named (Some m) (v::(replace @ matched)) [] candidate
+-              | `Match m -> finish (Some m) (v::matched) [])
++                match_named (Some (ma m)) (v::(replace @ matched)) [] candidate
++              | `Match m -> finish (Some (ma m)) (v::matched) [])
+         in
+         match_named None [] [] "")
+ 
+@@ -409,11 +413,11 @@ let tokenize report (input, get_location
+         emit (l, `Char 0x0026) state
+ 
+       | Some (`One c) ->
+-        emit (l, `Char c) state
++        emit (l, `Char (Uchar.to_int c)) state
+ 
+       | Some (`Two (c, c')) ->
+-        emit (l, `Char c) (fun () ->
+-        emit (l, `Char c') state)
++        emit (l, `Char (Uchar.to_int c)) (fun () ->
++        emit (l, `Char (Uchar.to_int c')) state)
+     end
+ 
+   (* 8.2.4.3. *)
+@@ -427,7 +431,7 @@ let tokenize report (input, get_location
+ 
+       | Some (l, 0) ->
+         report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () ->
+-        emit (l, `Char Uutf.u_rep) rcdata_state)
++        emit (l, `Char (Uchar.to_int Uutf.u_rep)) rcdata_state)
+ 
+       | None ->
+         emit_eof ()
+@@ -444,7 +448,7 @@ let tokenize report (input, get_location
+ 
+       | Some (l, 0) ->
+         report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () ->
+-        emit (l, `Char Uutf.u_rep) rawtext_state)
++        emit (l, `Char (Uchar.to_int Uutf.u_rep)) rawtext_state)
+ 
+       | None ->
+         emit_eof ()
+@@ -461,7 +465,7 @@ let tokenize report (input, get_location
+ 
+       | Some (l, 0) ->
+         report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () ->
+-        emit_character l Uutf.u_rep script_data_state)
++        emit_character l (Uchar.to_int Uutf.u_rep) script_data_state)
+ 
+       | None ->
+         emit_eof ()
+@@ -475,7 +479,7 @@ let tokenize report (input, get_location
+     next_option input !throw begin function
+       | Some (l, 0) ->
+         report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () ->
+-        emit (l, `Char Uutf.u_rep) plaintext_state)
++        emit (l, `Char (Uchar.to_int Uutf.u_rep)) plaintext_state)
+ 
+       | None ->
+         emit_eof ()
+@@ -501,7 +505,7 @@ let tokenize report (input, get_location
+         end_tag_open_state l' tag
+ 
+       | Some (_, c) when is_alphabetic c ->
+-        add_utf_8 tag._tag_name (to_lowercase c);
++        add_utf_8 tag._tag_name (Uchar.of_int (to_lowercase c));
+         tag_name_state l' tag
+ 
+       | Some (_, 0x003F) ->
+@@ -529,7 +533,7 @@ let tokenize report (input, get_location
+ 
+     next_option input !throw begin function
+       | Some (_, c) when is_alphabetic c ->
+-        add_utf_8 tag._tag_name (to_lowercase c);
++        add_utf_8 tag._tag_name (Uchar.of_int (to_lowercase c));
+         tag_name_state l' tag
+ 
+       | Some (_, 0x003E) ->
+@@ -569,7 +573,7 @@ let tokenize report (input, get_location
+         report (get_location ()) (`Unexpected_eoi "tag") !throw data_state
+ 
+       | Some (_, c) ->
+-        add_utf_8 tag._tag_name (to_lowercase c);
++        add_utf_8 tag._tag_name (Uchar.of_int (to_lowercase c));
+         tag_name_state l' tag
+     end
+ 
+@@ -589,7 +593,7 @@ let tokenize report (input, get_location
+     next_option input !throw begin function
+       | Some (_, c as v) when is_alphabetic c ->
+         let name_buffer = Buffer.create 32 in
+-        add_utf_8 name_buffer (to_lowercase c);
++        add_utf_8 name_buffer (Uchar.of_int (to_lowercase c));
+         text_end_tag_name_state state l' (v::cs) name_buffer
+ 
+       | maybe_v ->
+@@ -618,7 +622,7 @@ let tokenize report (input, get_location
+         emit_tag l' (create_tag ())
+ 
+       | Some ((_, c) as v) when is_alphabetic c ->
+-        add_utf_8 name_buffer (to_lowercase c);
++        add_utf_8 name_buffer (Uchar.of_int (to_lowercase c));
+         text_end_tag_name_state state l' (v::cs) name_buffer
+ 
+       | maybe_v ->
+@@ -676,7 +680,7 @@ let tokenize report (input, get_location
+ 
+       | Some (l, 0) ->
+         report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->
+-        emit_character l Uutf.u_rep (fun () ->
++        emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->
+         script_data_escaped_state l'))
+ 
+       | None ->
+@@ -699,7 +703,7 @@ let tokenize report (input, get_location
+ 
+       | Some (l, 0) ->
+         report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->
+-        emit_character l Uutf.u_rep (fun () ->
++        emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->
+         script_data_escaped_state l'))
+ 
+       | None ->
+@@ -725,7 +729,7 @@ let tokenize report (input, get_location
+ 
+       | Some (l, 0) ->
+         report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->
+-        emit_character l Uutf.u_rep (fun () ->
++        emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->
+         script_data_escaped_state l'))
+ 
+       | None ->
+@@ -745,7 +749,7 @@ let tokenize report (input, get_location
+ 
+       | Some (_, c as v) when is_alphabetic c ->
+         let tag_buffer = Buffer.create 32 in
+-        add_utf_8 tag_buffer (to_lowercase c);
++        add_utf_8 tag_buffer (Uchar.of_int (to_lowercase c));
+         emit_characters (List.rev (v::cs)) (fun () ->
+         script_data_double_escape_start_state l' tag_buffer)
+ 
+@@ -765,7 +769,7 @@ let tokenize report (input, get_location
+         else script_data_escaped_state l')
+ 
+       | Some (l, c) when is_alphabetic c ->
+-        add_utf_8 tag_buffer (to_lowercase c);
++        add_utf_8 tag_buffer (Uchar.of_int (to_lowercase c));
+         emit_character l c (fun () ->
+         script_data_double_escape_start_state l' tag_buffer)
+ 
+@@ -787,7 +791,7 @@ let tokenize report (input, get_location
+ 
+       | Some (l, 0) ->
+         report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->
+-        emit_character l Uutf.u_rep (fun () ->
++        emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->
+         script_data_double_escaped_state l'))
+ 
+       | None ->
+@@ -811,7 +815,7 @@ let tokenize report (input, get_location
+ 
+       | Some (l, 0) ->
+         report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->
+-        emit_character l Uutf.u_rep (fun () ->
++        emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->
+         script_data_double_escaped_state l'))
+ 
+       | None ->
+@@ -838,7 +842,7 @@ let tokenize report (input, get_location
+ 
+       | Some (l, 0) ->
+         report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->
+-        emit_character l Uutf.u_rep (fun () ->
++        emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->
+         script_data_double_escaped_state l'))
+ 
+       | None ->
+@@ -872,7 +876,7 @@ let tokenize report (input, get_location
+         else script_data_double_escaped_state l')
+ 
+       | Some (l, c) when is_alphabetic c ->
+-        add_utf_8 tag_buffer (to_lowercase c);
++        add_utf_8 tag_buffer (Uchar.of_int (to_lowercase c));
+         emit_character l c (fun () ->
+         script_data_double_escape_end_state l' tag_buffer)
+ 
+@@ -910,10 +914,10 @@ let tokenize report (input, get_location
+       | Some (l, (0x0022 | 0x0027 | 0x003C | 0x003D as c)) ->
+         report l (`Bad_token (char c, "attribute name",
+                               "invalid start character")) !throw (fun () ->
+-        start_attribute c)
++        start_attribute (Uchar.of_int c))
+ 
+       | Some (_, c) ->
+-        start_attribute (to_lowercase c)
++        start_attribute (Uchar.of_int (to_lowercase c))
+     end
+ 
+   (* 8.2.4.35. *)
+@@ -942,14 +946,14 @@ let tokenize report (input, get_location
+       | Some (l, (0x0022 | 0x0027 | 0x003C as c)) ->
+         report l (`Bad_token (char c, "attribute name",
+                               "invalid name character")) !throw (fun () ->
+-        add_utf_8 name_buffer c;
++        add_utf_8 name_buffer (Uchar.of_int c);
+         attribute_name_state l' tag name_buffer)
+ 
+       | None ->
+         report (get_location ()) (`Unexpected_eoi "tag") !throw data_state
+ 
+       | Some (_, c) ->
+-        add_utf_8 name_buffer (to_lowercase c);
++        add_utf_8 name_buffer (Uchar.of_int (to_lowercase c));
+         attribute_name_state l' tag name_buffer
+     end
+ 
+@@ -985,13 +989,13 @@ let tokenize report (input, get_location
+       | Some (l, (0x0022 | 0x0027 | 0x003C as c)) ->
+         report l (`Bad_token (char c, "attribute name",
+                               "invalid start character")) !throw (fun () ->
+-        start_next_attribute c)
++        start_next_attribute (Uchar.of_int c))
+ 
+       | None ->
+         report (get_location ()) (`Unexpected_eoi "tag") !throw data_state
+ 
+       | Some (_, c) ->
+-        start_next_attribute (to_lowercase c)
++        start_next_attribute (Uchar.of_int (to_lowercase c))
+     end
+ 
+   (* 8.2.4.37. *)
+@@ -1030,13 +1034,13 @@ let tokenize report (input, get_location
+       | Some (l, (0x003C | 0x003D | 0x0060 as c)) ->
+         report l (`Bad_token (char c, "attribute value",
+                               "invalid start character")) !throw (fun () ->
+-        start_value attribute_value_unquoted_state (Some c))
++        start_value attribute_value_unquoted_state (Some (Uchar.of_int c)))
+ 
+       | None ->
+         report (get_location ()) (`Unexpected_eoi "tag") !throw data_state
+ 
+       | Some (_, c) ->
+-        start_value attribute_value_unquoted_state (Some c)
++        start_value attribute_value_unquoted_state (Some (Uchar.of_int c))
+     end
+ 
+   (* 8.2.4.38 and 8.2.4.39. *)
+@@ -1062,7 +1066,7 @@ let tokenize report (input, get_location
+           data_state
+ 
+       | Some (_, c) ->
+-        add_utf_8 value_buffer c;
++        add_utf_8 value_buffer (Uchar.of_int c);
+         attribute_value_quoted_state quote l' tag name value_buffer
+     end
+ 
+@@ -1092,14 +1096,14 @@ let tokenize report (input, get_location
+       | Some (l, (0x0022 | 0x0027 | 0x003C | 0x003D | 0x0060 as c)) ->
+         report l (`Bad_token (char c, "attribute value",
+                               "invalid character")) !throw (fun () ->
+-        add_utf_8 value_buffer c;
++        add_utf_8 value_buffer (Uchar.of_int c);
+         attribute_value_unquoted_state l' tag name value_buffer)
+ 
+       | None ->
+         report (get_location ()) (`Unexpected_eoi "tag") !throw data_state
+ 
+       | Some (_, c) ->
+-        add_utf_8 value_buffer c;
++        add_utf_8 value_buffer (Uchar.of_int c);
+         attribute_value_unquoted_state l' tag name value_buffer
+     end
+ 
+@@ -1107,7 +1111,7 @@ let tokenize report (input, get_location
+   and character_reference_in_attribute allowed l value_buffer k =
+     consume_character_reference true (Some allowed) l begin function
+       | None ->
+-        add_utf_8 value_buffer 0x0026;
++        add_utf_8 value_buffer (Uchar.of_int 0x0026);
+         k ()
+ 
+       | Some (`One c) ->
+@@ -1176,7 +1180,7 @@ let tokenize report (input, get_location
+           emit_comment l' buffer
+ 
+         | Some (_, c) ->
+-          add_utf_8 buffer c;
++          add_utf_8 buffer (Uchar.of_int c);
+           consume ()
+       end
+     in
+@@ -1239,7 +1243,7 @@ let tokenize report (input, get_location
+         emit_comment l' buffer)
+ 
+       | Some (_, c) ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         comment_state l' buffer
+     end
+ 
+@@ -1266,7 +1270,7 @@ let tokenize report (input, get_location
+ 
+       | Some (_, c) ->
+         Buffer.add_char buffer '-';
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         comment_state l' buffer
+     end
+ 
+@@ -1286,7 +1290,7 @@ let tokenize report (input, get_location
+         emit_comment l' buffer)
+ 
+       | Some (_, c) ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         comment_state l' buffer
+     end
+ 
+@@ -1308,7 +1312,7 @@ let tokenize report (input, get_location
+ 
+       | Some (_, c) ->
+         Buffer.add_char buffer '-';
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         comment_state l' buffer
+     end
+ 
+@@ -1343,7 +1347,7 @@ let tokenize report (input, get_location
+         report l (`Bad_token ("--" ^ (char c), "comment",
+                               "'--' should be in '-->'")) !throw (fun () ->
+         Buffer.add_string buffer "--";
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         comment_state l' buffer)
+     end
+ 
+@@ -1369,7 +1373,7 @@ let tokenize report (input, get_location
+ 
+       | Some (_, c) ->
+         Buffer.add_string buffer "--!";
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         comment_state l' buffer
+     end
+ 
+@@ -1420,7 +1424,7 @@ let tokenize report (input, get_location
+ 
+       | Some (_, c) ->
+         doctype._doctype_name <-
+-          add_doctype_char doctype._doctype_name (to_lowercase c);
++          add_doctype_char doctype._doctype_name (Uchar.of_int (to_lowercase c));
+         doctype_name_state l' doctype
+     end
+ 
+@@ -1445,7 +1449,7 @@ let tokenize report (input, get_location
+ 
+       | Some (_, c) ->
+         doctype._doctype_name <-
+-          add_doctype_char doctype._doctype_name (to_lowercase c);
++          add_doctype_char doctype._doctype_name (Uchar.of_int (to_lowercase c));
+         doctype_name_state l' doctype
+     end
+ 
+@@ -1574,7 +1578,7 @@ let tokenize report (input, get_location
+         emit_doctype ~quirks:true l' doctype)
+ 
+       | Some (_, c) ->
+-        add doctype c;
++        add doctype (Uchar.of_int c);
+         doctype_identifier_quoted_state add quote next_state l' doctype
+     end
+ 
+Index: markup.ml-0.7.2/src/html_writer.ml
+===================================================================
+--- markup.ml-0.7.2.orig/src/html_writer.ml
++++ markup.ml-0.7.2/src/html_writer.ml
+@@ -8,7 +8,7 @@ let _escape_attribute s =
+   Uutf.String.fold_utf_8 (fun () _ -> function
+     | `Malformed _ -> ()
+     | `Uchar c ->
+-      match c with
++      match (Uchar.to_int c) with
+       | 0x0026 -> Buffer.add_string buffer "&amp;"
+       | 0x00A0 -> Buffer.add_string buffer "&nbsp;"
+       | 0x0022 -> Buffer.add_string buffer "&quot;"
+@@ -21,7 +21,7 @@ let _escape_text s =
+   Uutf.String.fold_utf_8 (fun () _ -> function
+     | `Malformed _ -> ()
+     | `Uchar c ->
+-      match c with
++      match (Uchar.to_int c) with
+       | 0x0026 -> Buffer.add_string buffer "&amp;"
+       | 0x00A0 -> Buffer.add_string buffer "&nbsp;"
+       | 0x003C -> Buffer.add_string buffer "&lt;"
+Index: markup.ml-0.7.2/src/input.ml
+===================================================================
+--- markup.ml-0.7.2.orig/src/input.ml
++++ markup.ml-0.7.2/src/input.ml
+@@ -27,13 +27,13 @@ let preprocess is_valid_char report sour
+       in
+ 
+       let rec iterate () =
+-        next source throw empty (function
++        next source throw empty (fun x -> match Uchar.to_int x with
+           | 0xFEFF when !first_char -> first_char := false; iterate ()
+ 
+           | 0x0D ->
+-            next source throw newline (function
++            next source throw newline (fun y -> match Uchar.to_int y with
+               | 0x0A -> newline ()
+-              | c -> push source c; newline ())
++              | c -> push source (Uchar.of_int c); newline ())
+ 
+           | 0x0A -> newline ()
+ 
+Index: markup.ml-0.7.2/src/input.mli
+===================================================================
+--- markup.ml-0.7.2.orig/src/input.mli
++++ markup.ml-0.7.2/src/input.mli
+@@ -4,5 +4,5 @@
+ open Common
+ 
+ val preprocess :
+-  (int -> bool) -> Error.parse_handler -> int Kstream.t ->
++  (int -> bool) -> Error.parse_handler -> Uchar.t Kstream.t ->
+     (location * int) Kstream.t * (unit -> location)
+Index: markup.ml-0.7.2/src/markup.ml
+===================================================================
+--- markup.ml-0.7.2.orig/src/markup.ml
++++ markup.ml-0.7.2/src/markup.ml
+@@ -187,7 +187,7 @@ sig
+ 
+     val decode :
+       ?report:(location -> Error.t -> unit io) -> t ->
+-      (char, _) stream -> (int, async) stream
++      (char, _) stream -> (Uchar.t, async) stream
+   end
+ 
+   val parse_xml :
+Index: markup.ml-0.7.2/src/markup.mli
+===================================================================
+--- markup.ml-0.7.2.orig/src/markup.mli
++++ markup.ml-0.7.2/src/markup.mli
+@@ -194,7 +194,7 @@ sig
+ 
+   val decode :
+     ?report:(location -> Error.t -> unit) -> t ->
+-    (char, 's) stream -> (int, 's) stream
++    (char, 's) stream -> (Uchar.t, 's) stream
+   (** Applies a decoder to a byte stream. Illegal input byte sequences result in
+       calls to the error handler [~report] with error kind [`Decoding_error].
+       The illegal bytes are then skipped, and zero or more U+FFFD replacement
+@@ -764,7 +764,7 @@ sig
+ 
+     val decode :
+       ?report:(location -> Error.t -> unit io) -> Encoding.t ->
+-      (char, _) stream -> (int, async) stream
++      (char, _) stream -> (Uchar.t, async) stream
+   end
+ 
+   (** {2 XML} *)
+@@ -838,7 +838,7 @@ val kstream : ('a, _) stream -> 'a Kstre
+ val of_kstream : 'a Kstream.t -> ('a, _) stream
+ 
+ val preprocess_input_stream :
+-  (int, 's) stream -> (location * int, 's) stream * (unit -> location)
++  (Uchar.t, 's) stream -> (location * int, 's) stream * (unit -> location)
+ 
+ (**/**)
+ 
+Index: markup.ml-0.7.2/src/utility.ml
+===================================================================
+--- markup.ml-0.7.2.orig/src/utility.ml
++++ markup.ml-0.7.2/src/utility.ml
+@@ -346,11 +346,11 @@ let xhtml_entity name =
+ 
+     match lookup 0 with
+     | `One c ->
+-      add_utf_8 buffer c;
++      add_utf_8 buffer (Uchar.of_int c);
+       Some (Buffer.contents buffer)
+     | `Two (c, c') ->
+-      add_utf_8 buffer c;
+-      add_utf_8 buffer c';
++      add_utf_8 buffer (Uchar.of_int c);
++      add_utf_8 buffer (Uchar.of_int c');
+       Some (Buffer.contents buffer)
+ 
+   with Exit -> None
+Index: markup.ml-0.7.2/src/xml_tokenizer.ml
+===================================================================
+--- markup.ml-0.7.2.orig/src/xml_tokenizer.ml
++++ markup.ml-0.7.2/src/xml_tokenizer.ml
+@@ -101,7 +101,7 @@ let tokenize report resolve_reference (i
+               end
+ 
+           | _, c when filter c ->
+-            add_utf_8 buffer c;
++            add_utf_8 buffer (Uchar.of_int c);
+             read ()
+ 
+           | l, c ->
+@@ -133,7 +133,7 @@ let tokenize report resolve_reference (i
+ 
+       | _, c when is_name_start_char c ->
+         let buffer = Buffer.create 32 in
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         let rec read () =
+           next input !throw unexpected_eoi begin function
+             | _, 0x003B ->
+@@ -146,7 +146,7 @@ let tokenize report resolve_reference (i
+               end
+ 
+             | _, c when is_name_char c ->
+-              add_utf_8 buffer c;
++              add_utf_8 buffer (Uchar.of_int c);
+               read ()
+ 
+             | l, c ->
+@@ -218,7 +218,7 @@ let tokenize report resolve_reference (i
+           report_if (not @@ is_name_start_char c) l (fun () ->
+             `Bad_token (char c, "attribute", "invalid start character"))
+             !throw (fun () ->
+-          add_utf_8 name_buffer c;
++          add_utf_8 name_buffer (Uchar.of_int c);
+           name_state ())
+       end
+ 
+@@ -235,7 +235,7 @@ let tokenize report resolve_reference (i
+           report_if (not @@ is_name_start_char c) l (fun () ->
+             `Bad_token (char c, "attribute", "invalid name character"))
+             !throw (fun () ->
+-          add_utf_8 name_buffer c;
++          add_utf_8 name_buffer (Uchar.of_int c);
+           name_state ())
+       end
+ 
+@@ -275,14 +275,14 @@ let tokenize report resolve_reference (i
+           report l
+             (`Bad_token ("&", "attribute", "replace with '&amp;'"))
+             !throw (fun () ->
+-          add_utf_8 value_buffer 0x0026;
++          add_utf_8 value_buffer (Uchar.of_int 0x0026);
+           state ())
+       end
+ 
+     and handle_lt l state =
+       report l (`Bad_token ("<", "attribute", "replace with '&lt;'")) !throw
+         (fun () ->
+-      add_utf_8 value_buffer 0x003C;
++      add_utf_8 value_buffer (Uchar.of_int 0x003C);
+       state ())
+ 
+     and quoted_value_state quote =
+@@ -300,7 +300,7 @@ let tokenize report resolve_reference (i
+           quoted_value_state quote)
+ 
+         | _, c ->
+-          add_utf_8 value_buffer c;
++          add_utf_8 value_buffer (Uchar.of_int c);
+           quoted_value_state quote
+       end
+ 
+@@ -317,7 +317,7 @@ let tokenize report resolve_reference (i
+           handle_lt l unquoted_value_state
+ 
+         | _, c ->
+-          add_utf_8 value_buffer c;
++          add_utf_8 value_buffer (Uchar.of_int c);
+           unquoted_value_state ()
+       end
+ 
+@@ -372,7 +372,7 @@ let tokenize report resolve_reference (i
+           report_if (not @@ is_name_start_char c) l (fun () ->
+             `Bad_token (char c, pi, "invalid start character")) !throw
+             (fun () ->
+-          add_utf_8 target_buffer c;
++          add_utf_8 target_buffer (Uchar.of_int c);
+           target_state ())
+       end
+ 
+@@ -388,13 +388,13 @@ let tokenize report resolve_reference (i
+           report_if (not @@ is_name_char c) l (fun () ->
+             `Bad_token (char c, pi, "invalid name character")) !throw
+             (fun () ->
+-          add_utf_8 target_buffer c;
++          add_utf_8 target_buffer (Uchar.of_int c);
+           target_state ())
+       end
+ 
+     and text_state () =
+       next' pi finish_pi (fun (_, c) ->
+-        add_utf_8 text_buffer c;
++        add_utf_8 text_buffer (Uchar.of_int c);
+         text_state ())
+ 
+     and xml_declaration_state () =
+@@ -572,7 +572,7 @@ let tokenize report resolve_reference (i
+   and initial_state () =
+     next input !throw (fun () -> emit_eoi ()) begin function
+       | l, (0x005D as c) ->
+-        add_character l c;
++        add_character l (Uchar.of_int c);
+         one_bracket_state l
+ 
+       | l, 0x003C ->
+@@ -583,7 +583,7 @@ let tokenize report resolve_reference (i
+           | None ->
+             report l (`Bad_token (char c, "text", "replace with '&amp;'"))
+               !throw (fun () ->
+-            add_character l c;
++            add_character l (Uchar.of_int c);
+             initial_state ())
+ 
+           | Some s ->
+@@ -591,14 +591,14 @@ let tokenize report resolve_reference (i
+             initial_state ())
+ 
+       | l, c ->
+-        add_character l c;
++        add_character l (Uchar.of_int c);
+         initial_state ()
+     end
+ 
+   and one_bracket_state l' =
+     next_option input !throw begin function
+       | Some (l, (0x005D as c)) ->
+-        add_character l c;
++        add_character l (Uchar.of_int c);
+         two_brackets_state l' l
+ 
+       | v ->
+@@ -611,11 +611,11 @@ let tokenize report resolve_reference (i
+       | Some (l, (0x003E as c)) ->
+         report l' (`Bad_token ("]]>", "text", "must end a CDATA section"))
+           !throw (fun () ->
+-        add_character l c;
++        add_character l (Uchar.of_int c);
+         initial_state ())
+ 
+       | Some (l, (0x005D as c)) ->
+-        add_character l c;
++        add_character l (Uchar.of_int c);
+         two_brackets_state l'' l
+ 
+       | v ->
+@@ -626,7 +626,7 @@ let tokenize report resolve_reference (i
+   and begin_markup_state l' =
+     let recover v =
+       lt_in_text l' (fun () ->
+-      add_character l' 0x003C;
++      add_character l' (Uchar.of_int 0x003C);
+       push_option input v;
+       initial_state ())
+     in
+@@ -648,7 +648,7 @@ let tokenize report resolve_reference (i
+ 
+       | _, c when is_name_start_char c ->
+         let tag_name_buffer = Buffer.create 32 in
+-        add_utf_8 tag_name_buffer c;
++        add_utf_8 tag_name_buffer (Uchar.of_int c);
+         start_tag_state l' tag_name_buffer
+ 
+       | l, c as v ->
+@@ -660,7 +660,7 @@ let tokenize report resolve_reference (i
+   and start_tag_state l' buffer =
+     let recover v =
+       lt_in_text l' (fun () ->
+-      add_character l' 0x003C;
++      add_character l' (Uchar.of_int 0x003C);
+       add_string l' (Buffer.contents buffer);
+       push_option input v;
+       initial_state ())
+@@ -680,7 +680,7 @@ let tokenize report resolve_reference (i
+         attributes_state l' (Buffer.contents buffer) []
+ 
+       | _, c when is_name_char c ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         start_tag_state l' buffer
+ 
+       | l, c as v ->
+@@ -731,8 +731,8 @@ let tokenize report resolve_reference (i
+   and end_tag_state l' =
+     let recover v =
+       lt_in_text l' (fun () ->
+-      add_character l' 0x003C;
+-      add_character l' 0x002F;
++      add_character l' (Uchar.of_int 0x003C);
++      add_character l' (Uchar.of_int 0x002F);
+       push_option input v;
+       initial_state ())
+     in
+@@ -743,7 +743,7 @@ let tokenize report resolve_reference (i
+     begin function
+       | _, c when is_name_start_char c ->
+         let name_buffer = Buffer.create 32 in
+-        add_utf_8 name_buffer c;
++        add_utf_8 name_buffer (Uchar.of_int c);
+         end_tag_name_state l' name_buffer
+ 
+       | l, c as v ->
+@@ -755,8 +755,8 @@ let tokenize report resolve_reference (i
+   and end_tag_name_state l' buffer =
+     let recover v =
+       lt_in_text l' (fun () ->
+-      add_character l' 0x003C;
+-      add_character l' 0x002F;
++      add_character l' (Uchar.of_int 0x003C);
++      add_character l' (Uchar.of_int 0x002F);
+       add_string l' (Buffer.contents buffer);
+       push_option input v;
+       initial_state ())
+@@ -773,7 +773,7 @@ let tokenize report resolve_reference (i
+         end_tag_whitespace_state false l' (Buffer.contents buffer)
+ 
+       | _, c when is_name_char c ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         end_tag_name_state l' buffer
+ 
+       | l, c as v ->
+@@ -821,8 +821,8 @@ let tokenize report resolve_reference (i
+ 
+       | v ->
+         bad_comment_start "<!" l' (fun () ->
+-        add_character l' 0x003C;
+-        add_character l' 0x0021;
++        add_character l' (Uchar.of_int 0x003C);
++        add_character l' (Uchar.of_int 0x0021);
+         push_option input v;
+         initial_state ())
+     end
+@@ -834,9 +834,9 @@ let tokenize report resolve_reference (i
+ 
+       | v ->
+         bad_comment_start "<!-" l' (fun () ->
+-        add_character l' 0x003C;
+-        add_character l' 0x0021;
+-        add_character l' 0x002D;
++        add_character l' (Uchar.of_int 0x003C);
++        add_character l' (Uchar.of_int 0x0021);
++        add_character l' (Uchar.of_int 0x002D);
+         push_option input v;
+         initial_state ())
+     end
+@@ -852,7 +852,7 @@ let tokenize report resolve_reference (i
+         comment_one_dash_state l' l buffer
+ 
+       | _, c ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         comment_state l' buffer
+     end
+ 
+@@ -863,8 +863,8 @@ let tokenize report resolve_reference (i
+         comment_two_dashes_state false l' l'' buffer
+ 
+       | _, c ->
+-        add_utf_8 buffer 0x002D;
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int 0x002D);
++        add_utf_8 buffer (Uchar.of_int c);
+         comment_state l' buffer
+     end
+ 
+@@ -883,14 +883,14 @@ let tokenize report resolve_reference (i
+ 
+       | _, 0x002D ->
+         recover (fun () ->
+-        add_utf_8 buffer 0x002D;
++        add_utf_8 buffer (Uchar.of_int 0x002D);
+         comment_two_dashes_state true l' l'' buffer)
+ 
+       | _, c ->
+         recover (fun () ->
+-        add_utf_8 buffer 0x002D;
+-        add_utf_8 buffer 0x002D;
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int 0x002D);
++        add_utf_8 buffer (Uchar.of_int 0x002D);
++        add_utf_8 buffer (Uchar.of_int c);
+         comment_state l' buffer)
+     end
+ 
+@@ -905,9 +905,9 @@ let tokenize report resolve_reference (i
+           !throw (fun () ->
+         lt_in_text l' (fun () ->
+         push_list input cs;
+-        add_character l' 0x003C;
+-        add_character l' 0x0021;
+-        add_character l' 0x005B;
++        add_character l' (Uchar.of_int 0x003C);
++        add_character l' (Uchar.of_int 0x0021);
++        add_character l' (Uchar.of_int 0x005B);
+         initial_state ()))
+     end
+ 
+@@ -918,7 +918,7 @@ let tokenize report resolve_reference (i
+         cdata_one_bracket_state l' l
+ 
+       | l, c ->
+-        add_character l c;
++        add_character l (Uchar.of_int c);
+         cdata_state l'
+     end
+ 
+@@ -929,8 +929,8 @@ let tokenize report resolve_reference (i
+         cdata_two_brackets_state l' l'' l
+ 
+       | l, c ->
+-        add_character l'' 0x005D;
+-        add_character l c;
++        add_character l'' (Uchar.of_int 0x005D);
++        add_character l   (Uchar.of_int c);
+         cdata_state l'
+     end
+ 
+@@ -941,13 +941,13 @@ let tokenize report resolve_reference (i
+         initial_state ()
+ 
+       | l, 0x005D ->
+-        add_character l'' 0x005D;
++        add_character l'' (Uchar.of_int 0x005D);
+         cdata_two_brackets_state l' l''' l
+ 
+       | l, c ->
+-        add_character l'' 0x005D;
+-        add_character l''' 0x005D;
+-        add_character l c;
++        add_character l'' (Uchar.of_int 0x005D);
++        add_character l''' (Uchar.of_int 0x005D);
++        add_character l (Uchar.of_int c);
+         cdata_state l'
+     end
+ 
+@@ -963,9 +963,9 @@ let tokenize report resolve_reference (i
+           !throw (fun () ->
+         lt_in_text l' (fun () ->
+         push_list input cs;
+-        add_character l' 0x003C;
+-        add_character l' 0x0021;
+-        add_character l' 0x0044;
++        add_character l' (Uchar.of_int 0x003C);
++        add_character l' (Uchar.of_int 0x0021);
++        add_character l' (Uchar.of_int 0x0044);
+         initial_state ()))
+     end
+ 
+@@ -980,15 +980,15 @@ let tokenize report resolve_reference (i
+         emit_doctype l' buffer initial_state
+ 
+       | _, (0x0022 | 0x0027 as c) ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         doctype_quoted_state (fun () -> doctype_state l' buffer) c l' buffer
+ 
+       | _, (0x003C as c) ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         doctype_item_state (fun () -> doctype_state l' buffer) l' buffer
+ 
+       | _, c ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         doctype_state l' buffer
+     end
+ 
+@@ -996,11 +996,11 @@ let tokenize report resolve_reference (i
+     next input !throw (fun () -> unterminated_doctype l' buffer)
+     begin function
+       | _, c when c = quote ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         state ()
+ 
+       | _, c ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         doctype_quoted_state state quote l' buffer
+     end
+ 
+@@ -1008,18 +1008,18 @@ let tokenize report resolve_reference (i
+     next input !throw (fun () -> unterminated_doctype l' buffer)
+     begin function
+       | _, (0x0021 as c) ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         doctype_declaration_state state l' buffer
+ 
+       | l, (0x003F as c) ->
+-        add_utf_8 buffer c;
+-        let undo = tap (fun (_, c) -> add_utf_8 buffer c) input in
++        add_utf_8 buffer (Uchar.of_int c);
++        let undo = tap (fun (_, c) -> add_utf_8 buffer (Uchar.of_int c)) input in
+         parse_declaration_or_processing_instruction l (fun _ ->
+         undo ();
+         state ())
+ 
+       | _, c ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         state ()
+     end
+ 
+@@ -1027,16 +1027,16 @@ let tokenize report resolve_reference (i
+     next input !throw (fun () -> unterminated_doctype l' buffer)
+     begin function
+       | _, (0x003E as c) ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         state ()
+ 
+       | _, (0x0022 | 0x0027 as c) ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         doctype_quoted_state
+           (fun () -> doctype_declaration_state state l' buffer) c l' buffer
+ 
+       | _, c ->
+-        add_utf_8 buffer c;
++        add_utf_8 buffer (Uchar.of_int c);
+         doctype_declaration_state state l' buffer
+     end
+