From: Christian Lindig <christian.lindig@cloud.com>
Subject: Simplify UTF-8 decoding

* Use the decoder from the OCaml standard library instead of
  our own implementation, which this patch removes.
* Validate UTF-8/XML conformance for maps and sets, in addition to
  strings.

This is XSA-474 / CVE-2025-58146.

Signed-off-by: Christian Lindig <christian.lindig@cloud.com>
Reviewed-by: Edwin Török <edwin.torok@cloud.com>

diff --git a/ocaml/database/db_cache_impl.ml b/ocaml/database/db_cache_impl.ml
index e9745749a..050d43f05 100644
--- a/ocaml/database/db_cache_impl.ml
+++ b/ocaml/database/db_cache_impl.ml
@@ -67,9 +67,7 @@ let read_field t tblname fldname objref =
     occurs. *)
 let ensure_utf8_xml string =
   let length = String.length string in
-  let prefix =
-    Xapi_stdext_encodings.Encodings.UTF8_XML.longest_valid_prefix string
-  in
+  let prefix = Xapi_stdext_encodings.Utf8.XML.longest_valid_prefix string in
   if length > String.length prefix then
     warn "string truncated to: '%s'." prefix ;
   prefix
@@ -86,20 +84,32 @@ let write_field_locked t tblname objref fldname newval =
       (get_database t)
   )
 
+(** Ensure a value is conforming to UTF-8 with XML restrictions *)
+let is_valid v =
+  let valid = Xapi_stdext_encodings.Utf8.XML.is_valid in
+  let valid_pair (x, y) = valid x && valid y in
+  match v with
+  | Schema.Value.String s ->
+      valid s
+  | Schema.Value.Set ss ->
+      List.for_all valid ss
+  | Schema.Value.Pairs pairs ->
+      List.for_all valid_pair pairs
+
+let share_string = function
+  | Schema.Value.String s ->
+      Schema.Value.String (Share.merge s)
+  | v ->
+      (* we assume strings in the tree have been shared already *)
+      v
+
 let write_field t tblname objref fldname newval =
-  let newval =
-    match newval with
-    | Schema.Value.String s ->
-        (* the other caller of write_field_locked only uses sets and maps,
-           so we only need to check for String here
-        *)
-        if not (Xapi_stdext_encodings.Encodings.UTF8_XML.is_valid s) then
-          raise Invalid_value ;
-        Schema.Value.String (Share.merge s)
-    | _ ->
-        newval
-  in
-  with_lock (fun () -> write_field_locked t tblname objref fldname newval)
+  if not @@ is_valid newval then
+    raise Invalid_value
+  else
+    with_lock (fun () ->
+        write_field_locked t tblname objref fldname (share_string newval)
+    )
 
 let touch_row t tblname objref =
   update_database t (touch tblname objref) ;
diff --git a/ocaml/database/string_marshall_helper.ml b/ocaml/database/string_marshall_helper.ml
index ba003bee9..1add3aef7 100644
--- a/ocaml/database/string_marshall_helper.ml
+++ b/ocaml/database/string_marshall_helper.ml
@@ -22,9 +22,7 @@ module D = Debug.Make (struct let name = __MODULE__ end)
 
 let ensure_utf8_xml string =
   let length = String.length string in
-  let prefix =
-    Xapi_stdext_encodings.Encodings.UTF8_XML.longest_valid_prefix string
-  in
+  let prefix = Xapi_stdext_encodings.Utf8.XML.longest_valid_prefix string in
   if length > String.length prefix then
     D.warn "Whilst doing 'set' of structured field, string truncated to: '%s'."
       prefix ;
diff --git a/ocaml/idl/ocaml_backend/gen_server.ml b/ocaml/idl/ocaml_backend/gen_server.ml
index 191450212..f95f5f6d9 100644
--- a/ocaml/idl/ocaml_backend/gen_server.ml
+++ b/ocaml/idl/ocaml_backend/gen_server.ml
@@ -457,7 +457,7 @@ let gen_module api : O.Module.t =
                ([
                   "let __call, __params = call.Rpc.name, call.Rpc.params in"
                 ; "List.iter (fun p -> let s = Rpc.to_string p in if not \
-                   (Xapi_stdext_encodings.Encodings.UTF8_XML.is_valid s) then"
+                   (Xapi_stdext_encodings.Utf8.is_valid s) then"
                 ; "raise (Api_errors.Server_error(Api_errors.invalid_value, \
                    [\"Invalid UTF-8 string in parameter\"; s])))  __params;"
                 ; "let __label = __call in"
diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/bench/bench_encodings.ml b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/bench/bench_encodings.ml
index 7308c756d..bb20eed4f 100644
--- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/bench/bench_encodings.ml
+++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/bench/bench_encodings.ml
@@ -1,5 +1,5 @@
 open Bechamel
-open Xapi_stdext_encodings.Encodings
+open Xapi_stdext_encodings
 
 let test name f =
   Test.make_indexed_with_resource ~name ~args:[10; 1000; 10000]
@@ -10,6 +10,6 @@ let test name f =
 
 let benchmarks =
   Test.make_grouped ~name:"Encodings.validate"
-    [test "UTF8_XML" UTF8_XML.validate]
+    [test "UTF8.XML" Utf8.XML.is_valid]
 
 let () = Bechamel_simple_cli.cli benchmarks
diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/dune b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/dune
index 742dd212f..839346e35 100644
--- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/dune
+++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/dune
@@ -1,12 +1,6 @@
 (library
   (name xapi_stdext_encodings)
   (public_name xapi-stdext-encodings)
-  (modules :standard \ test)
+  (modules :standard)
 )
 
-(test
-  (name test)
-  (package xapi-stdext-encodings)
-  (modules test)
-  (libraries alcotest xapi-stdext-encodings)
-)
diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/encodings.ml b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/encodings.ml
deleted file mode 100644
index 2dfd45a7d..000000000
--- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/encodings.ml
+++ /dev/null
@@ -1,167 +0,0 @@
-(*
- * Copyright (C) 2006-2009 Citrix Systems Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; version 2.1 only. with the special
- * exception on linking described in file LICENSE.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *)
-exception UCS_value_out_of_range
-
-exception UCS_value_prohibited_in_UTF8
-
-exception UCS_value_prohibited_in_XML
-
-exception UTF8_character_incomplete
-
-exception UTF8_header_byte_invalid
-
-exception UTF8_continuation_byte_invalid
-
-exception UTF8_encoding_not_canonical
-
-exception String_incomplete
-
-(* === Unicode Functions === *)
-
-module UCS = struct
-  let is_non_character value =
-    false
-    || (0xfdd0 <= value && value <= 0xfdef) (* case 1 *)
-    || Int.logand 0xfffe value = 0xfffe
-  (* case 2 *)
-  [@@inline]
-end
-
-module XML = struct
-  let is_illegal_control_character value =
-    let value = Uchar.to_int value in
-    value < 0x20 && value <> 0x09 && value <> 0x0a && value <> 0x0d
-  [@@inline]
-end
-
-(* === UCS Validators === *)
-
-module type UCS_VALIDATOR = sig
-  val validate : Uchar.t -> unit
-end
-
-module UTF8_UCS_validator = struct
-  let validate value =
-    if (UCS.is_non_character [@inlined]) (Uchar.to_int value) then
-      raise UCS_value_prohibited_in_UTF8
-  [@@inline]
-end
-
-module XML_UTF8_UCS_validator = struct
-  let validate value =
-    (UTF8_UCS_validator.validate [@inlined]) value ;
-    if (XML.is_illegal_control_character [@inlined]) value then
-      raise UCS_value_prohibited_in_XML
-end
-
-(* === String Validators === *)
-
-module type STRING_VALIDATOR = sig
-  val is_valid : string -> bool
-
-  val validate : string -> unit
-
-  val longest_valid_prefix : string -> string
-end
-
-exception Validation_error of int * exn
-
-module UTF8_XML : STRING_VALIDATOR = struct
-  let decode_continuation_byte byte =
-    if byte land 0b11000000 = 0b10000000 then
-      byte land 0b00111111
-    else
-      raise UTF8_continuation_byte_invalid
-
-  let rec decode_continuation_bytes string last value index =
-    if index <= last then
-      let chunk = decode_continuation_byte (Char.code string.[index]) in
-      let value = (value lsl 6) lor chunk in
-      decode_continuation_bytes string last value (index + 1)
-    else
-      value
-
-  let validate_character_utf8 string byte index =
-    let value, width =
-      if byte land 0b10000000 = 0b00000000 then
-        (byte, 1)
-      else if byte land 0b11100000 = 0b11000000 then
-        (byte land 0b0011111, 2)
-      else if byte land 0b11110000 = 0b11100000 then
-        (byte land 0b0001111, 3)
-      else if byte land 0b11111000 = 0b11110000 then
-        (byte land 0b0000111, 4)
-      else
-        raise UTF8_header_byte_invalid
-    in
-    let value =
-      if width = 1 then
-        value
-      else
-        decode_continuation_bytes string (index + width - 1) value (index + 1)
-    in
-    XML_UTF8_UCS_validator.validate (Uchar.unsafe_of_int value) ;
-    width
-
-  let rec validate_aux string length index =
-    if index = length then
-      ()
-    else
-      let width =
-        try
-          let byte = string.[index] |> Char.code in
-          validate_character_utf8 string byte index
-        with
-        | Invalid_argument _ ->
-            raise String_incomplete
-        | error ->
-            raise (Validation_error (index, error))
-      in
-      validate_aux string length (index + width)
-
-  let validate string = validate_aux string (String.length string) 0
-
-  let rec validate_with_fastpath string stop pos =
-    if pos < stop then
-      (* the compiler is smart enough to optimize the 'int32' away here,
-         and not allocate *)
-      let i32 = String.get_int32_ne string pos |> Int32.to_int in
-      (* test that for all bytes 0x20 <= byte < 0x80.
-         If any is <0x20 it would cause a negative value to appear in that byte,
-         which we can detect if we use 0x80 as a mask.
-         Byte >= 0x80 can be similarly detected with a mask of 0x80 on each byte.
-         We don't want to see a 0x80 from either of these, hence we bitwise or the 2 values together.
-      *)
-      if i32 lor (i32 - 0x20_20_20_20) land 0x80_80_80_80 = 0 then
-        validate_with_fastpath string stop (pos + 4)
-      else (* when the condition doesn't hold fall back to full UTF8 decoder *)
-        validate_aux string (String.length string) pos
-    else
-      validate_aux string (String.length string) pos
-
-  let validate_with_fastpath string =
-    validate_with_fastpath string (String.length string - 3) 0
-
-  let validate =
-    if Sys.word_size = 64 then
-      validate_with_fastpath
-    else
-      validate
-
-  let is_valid string = try validate string ; true with _ -> false
-
-  let longest_valid_prefix string =
-    try validate string ; string
-    with Validation_error (index, _) -> String.sub string 0 index
-end
diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/encodings.mli b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/encodings.mli
deleted file mode 100644
index 2a139ae37..000000000
--- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/encodings.mli
+++ /dev/null
@@ -1,84 +0,0 @@
-(*
- * Copyright (C) 2006-2009 Citrix Systems Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; version 2.1 only. with the special
- * exception on linking described in file LICENSE.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *)
-
-(** Encoding helper modules *)
-
-(** {2 Exceptions} *)
-
-exception UCS_value_out_of_range
-
-exception UCS_value_prohibited_in_UTF8
-
-exception UCS_value_prohibited_in_XML
-
-exception UTF8_character_incomplete
-
-exception UTF8_header_byte_invalid
-
-exception UTF8_continuation_byte_invalid
-
-exception UTF8_encoding_not_canonical
-
-exception String_incomplete
-
-(** {2 UCS Validators} *)
-
-(** Validates UCS character values. *)
-module type UCS_VALIDATOR = sig
-  val validate : Uchar.t -> unit
-end
-
-(** Accepts all values within the UCS character value range except
- *  those which are invalid for all UTF-8-encoded XML documents. *)
-module XML_UTF8_UCS_validator : UCS_VALIDATOR
-
-module XML : sig
-  val is_illegal_control_character : Uchar.t -> bool
-  (** Returns true if and only if the given value corresponds to
-      	 *  a illegal control character as defined in section 2.2 of
-      	 *  the XML specification, version 1.0. *)
-end
-
-(** {2 String Validators} *)
-
-(** Provides functionality for validating and processing
- *  strings according to a particular character encoding. *)
-module type STRING_VALIDATOR = sig
-  val is_valid : string -> bool
-  (** Returns true if and only if the given string is validly-encoded. *)
-
-  val validate : string -> unit
-  (** Raises an encoding error if the given string is not validly-encoded. *)
-
-  val longest_valid_prefix : string -> string
-  (** Returns the longest validly-encoded prefix of the given string. *)
-end
-
-(** Represents a validation error as a tuple [(i,e)], where:
- *    [i] = the index of the first non-compliant character;
- *    [e] = the reason for non-compliance. *)
-exception Validation_error of int * exn
-
-(** Provides functions for validating and processing
- *  strings according to the UTF-8 character encoding,
- *  with certain additional restrictions on UCS values
- *  imposed by the XML specification.
- *
- *  Validly-encoded strings must satisfy both RFC 3629
- *  and section 2.2 of the XML specification.
- *
- *  For further information, see:
- *  http://www.rfc.net/rfc3629.html
- *  http://www.w3.org/TR/REC-xml/#charsets *)
-module UTF8_XML : STRING_VALIDATOR
diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/test.ml b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/test.ml
deleted file mode 100644
index 9cc75b297..000000000
--- a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/test.ml
+++ /dev/null
@@ -1,533 +0,0 @@
-(*
- * Copyright (C) 2006-2009 Citrix Systems Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; version 2.1 only. with the special
- * exception on linking described in file LICENSE.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *)
-module E = Xapi_stdext_encodings.Encodings
-
-(* Pull in the infix operators from Encodings used in this test *)
-let ( --- ), ( +++ ), ( <<< ) = (Int.sub, Int.add, Int.shift_left)
-
-(* === Mock exceptions  ==================================================== *)
-
-(** Simulates a decoding error. *)
-exception Decode_error
-
-(* === Mock UCS validators ================================================= *)
-
-(** A validator that always succeeds. *)
-module Lenient_UCS_validator : E.UCS_VALIDATOR = struct
-  let validate _ = ()
-end
-
-(* === Mock character validators ============================================= *)
-
-(** A validator that succeeds for all characters. *)
-module Universal_character_validator = struct
-  let validate _ = ()
-end
-
-(** A validator that fails for all characters. *)
-module Failing_character_validator = struct
-  let validate _ = raise Decode_error
-end
-
-(** A validator that succeeds for all characters except the letter 'F'. *)
-module Selective_character_validator = struct
-  let validate uchar =
-    if Uchar.equal uchar (Uchar.of_char 'F') then raise Decode_error
-end
-
-(* === Test helpers ======================================================== *)
-
-let assert_true = Alcotest.(check bool) "true" true
-
-let assert_false = Alcotest.(check bool) "false" false
-
-let assert_raises_match exception_match fn =
-  try
-    fn () ;
-    Alcotest.fail "assert_raises_match: failure expected"
-  with failure ->
-    if not (exception_match failure) then
-      raise failure
-    else
-      ()
-
-(* === Mock codecs ========================================================= *)
-
-module UCS = struct
-  (* === Unicode Functions === *)
-  let min_value = 0x000000
-
-  let max_value = 0x10ffff
-  (* used to be 0x1fffff, but this changed and Unicode won't allocate larger than 0x10ffff *)
-
-  let is_non_character value =
-    false
-    || (0xfdd0 <= value && value <= 0xfdef) (* case 1 *)
-    || Int.logand 0xfffe value = 0xfffe
-  (* case 2 *)
-
-  let is_out_of_range value = value < min_value || value > max_value
-
-  let is_surrogate value = 0xd800 <= value && value <= 0xdfff
-
-  (** A list of UCS non-characters values, including:
-      a. non-characters within the basic multilingual plane;
-      b. non-characters at the end of the basic multilingual plane;
-      c. non-characters at the end of the private use area. *)
-  let non_characters =
-    [
-      0x00fdd0
-    ; 0x00fdef
-    ; (* case a. *)
-      0x00fffe
-    ; 0x00ffff
-    ; (* case b. *)
-      0x1ffffe
-    ; 0x1fffff (* case c. *)
-    ]
-
-  (** A list of UCS character values located immediately before or
-      after UCS non-character values, including:
-      a. non-characters within the basic multilingual plane;
-      b. non-characters at the end of the basic multilingual plane;
-      c. non-characters at the end of the private use area. *)
-  let valid_characters_next_to_non_characters =
-    [
-      0x00fdcf
-    ; 0x00fdf0
-    ; (* case a. *)
-      0x00fffd
-    ; 0x010000
-    ; (* case b. *)
-      0x1ffffd
-    ; 0x200000 (* case c. *)
-    ]
-
-  let test_is_non_character () =
-    List.iter (fun value -> assert_true (is_non_character value)) non_characters ;
-    List.iter
-      (fun value -> assert_false (is_non_character value))
-      valid_characters_next_to_non_characters
-
-  let test_is_out_of_range () =
-    assert_true (is_out_of_range (min_value --- 1)) ;
-    assert_false (is_out_of_range min_value) ;
-    assert_false (is_out_of_range max_value) ;
-    assert_true (is_out_of_range (max_value +++ 1))
-
-  let test_is_surrogate () =
-    assert_false (is_surrogate 0xd7ff) ;
-    assert_true (is_surrogate 0xd800) ;
-    assert_true (is_surrogate 0xdfff) ;
-    assert_false (is_surrogate 0xe000)
-
-  let tests =
-    [
-      ("test_is_non_character", `Quick, test_is_non_character)
-    ; ("test_is_out_of_range", `Quick, test_is_out_of_range)
-    ; ("test_is_surrogate", `Quick, test_is_surrogate)
-    ]
-end
-
-module Lenient_UTF8_codec = struct
-  let decode_header_byte byte =
-    if byte land 0b10000000 = 0b00000000 then
-      (byte, 1)
-    else if byte land 0b11100000 = 0b11000000 then
-      (byte land 0b0011111, 2)
-    else if byte land 0b11110000 = 0b11100000 then
-      (byte land 0b0001111, 3)
-    else if byte land 0b11111000 = 0b11110000 then
-      (byte land 0b0000111, 4)
-    else
-      raise E.UTF8_header_byte_invalid
-
-  let decode_continuation_byte byte =
-    if byte land 0b11000000 = 0b10000000 then
-      byte land 0b00111111
-    else
-      raise E.UTF8_continuation_byte_invalid
-
-  let width_required_for_ucs_value value =
-    if value < 0x000080 (* 1 lsl  7 *) then
-      1
-    else if value < 0x000800 (* 1 lsl 11 *) then
-      2
-    else if value < 0x010000 (* 1 lsl 16 *) then
-      3
-    else
-      4
-
-  let decode_character string index =
-    let value, width = decode_header_byte (Char.code string.[index]) in
-    let value =
-      if width = 1 then
-        value
-      else
-        let value = ref value in
-        for index = index + 1 to index + width - 1 do
-          let chunk = decode_continuation_byte (Char.code string.[index]) in
-          value := (!value lsl 6) lor chunk
-        done ;
-        if width > width_required_for_ucs_value !value then
-          raise E.UTF8_encoding_not_canonical ;
-        !value
-    in
-    (value, width)
-end
-
-(* === Mock string validators ============================================== *)
-module Mock_String_validator (Validator : E.UCS_VALIDATOR) :
-  E.STRING_VALIDATOR = struct
-  (* no longer a functor in Encodings for performance reasons,
-     so modify the original string passed as argument instead replacing
-     characters that would be invalid with a known invalid XML char: 0x0B.
-  *)
-
-  let transform str =
-    let b = Buffer.create (String.length str) in
-    let rec loop pos =
-      if pos < String.length str then
-        let value, width = Lenient_UTF8_codec.decode_character str pos in
-        let () =
-          try
-            let u = Uchar.of_int value in
-            Validator.validate u ; Buffer.add_utf_8_uchar b u
-          with _ -> Buffer.add_char b '\x0B'
-        in
-        loop (pos + width)
-    in
-    loop 0 ; Buffer.contents b
-
-  let is_valid str = E.UTF8_XML.is_valid (transform str)
-
-  let validate str =
-    try E.UTF8_XML.validate (transform str)
-    with E.Validation_error (pos, _) ->
-      raise (E.Validation_error (pos, Decode_error))
-
-  let longest_valid_prefix str = E.UTF8_XML.longest_valid_prefix (transform str)
-end
-
-(** A validator that accepts all strings. *)
-module Universal_string_validator =
-  Mock_String_validator (Universal_character_validator)
-
-(** A validator that rejects all strings. *)
-module Failing_string_validator =
-  Mock_String_validator (Failing_character_validator)
-
-(** A validator that rejects strings containing the character 'F'. *)
-module Selective_string_validator =
-  Mock_String_validator (Selective_character_validator)
-
-(* === Tests =============================================================== *)
-
-module String_validator = struct
-  let test_is_valid () =
-    assert_true (Universal_string_validator.is_valid "") ;
-    assert_true (Universal_string_validator.is_valid "123456789") ;
-    assert_true (Selective_string_validator.is_valid "") ;
-    assert_true (Selective_string_validator.is_valid "123456789") ;
-    assert_false (Selective_string_validator.is_valid "F23456789") ;
-    assert_false (Selective_string_validator.is_valid "1234F6789") ;
-    assert_false (Selective_string_validator.is_valid "12345678F") ;
-    assert_false (Selective_string_validator.is_valid "FFFFFFFFF")
-
-  let test_longest_valid_prefix () =
-    Alcotest.(check string)
-      "prefix"
-      (Universal_string_validator.longest_valid_prefix "")
-      "" ;
-    Alcotest.(check string)
-      "prefix"
-      (Universal_string_validator.longest_valid_prefix "123456789")
-      "123456789" ;
-    Alcotest.(check string)
-      "prefix"
-      (Selective_string_validator.longest_valid_prefix "")
-      "" ;
-    Alcotest.(check string)
-      "prefix"
-      (Selective_string_validator.longest_valid_prefix "123456789")
-      "123456789" ;
-    Alcotest.(check string)
-      "prefix"
-      (Selective_string_validator.longest_valid_prefix "F23456789")
-      "" ;
-    Alcotest.(check string)
-      "prefix"
-      (Selective_string_validator.longest_valid_prefix "1234F6789")
-      "1234" ;
-    Alcotest.(check string)
-      "prefix"
-      (Selective_string_validator.longest_valid_prefix "12345678F")
-      "12345678" ;
-    Alcotest.(check string)
-      "prefix"
-      (Selective_string_validator.longest_valid_prefix "FFFFFFFFF")
-      ""
-
-  (** Tests that validation does not fail for an empty string. *)
-  let test_validate_with_empty_string () = E.UTF8_XML.validate ""
-
-  let test_validate_with_incomplete_string () =
-    Alcotest.check_raises "Validation fails correctly for an incomplete string"
-      E.String_incomplete (fun () -> E.UTF8_XML.validate "\xc2"
-    )
-
-  let test_validate_with_failing_decoders () =
-    Failing_string_validator.validate "" ;
-    assert_raises_match
-      (function E.Validation_error (0, Decode_error) -> true | _ -> false)
-      (fun () -> Selective_string_validator.validate "F") ;
-    assert_raises_match
-      (function E.Validation_error (0, Decode_error) -> true | _ -> false)
-      (fun () -> Selective_string_validator.validate "F12345678") ;
-    assert_raises_match
-      (function E.Validation_error (4, Decode_error) -> true | _ -> false)
-      (fun () -> Selective_string_validator.validate "0123F5678") ;
-    assert_raises_match
-      (function E.Validation_error (8, Decode_error) -> true | _ -> false)
-      (fun () -> Selective_string_validator.validate "01234567F") ;
-    assert_raises_match
-      (function E.Validation_error (0, Decode_error) -> true | _ -> false)
-      (fun () -> Selective_string_validator.validate "FFFFFFFFF")
-
-  let tests =
-    [
-      ("test_is_valid", `Quick, test_is_valid)
-    ; ("test_longest_valid_prefix", `Quick, test_longest_valid_prefix)
-    ; ( "test_validate_with_empty_string"
-      , `Quick
-      , test_validate_with_empty_string
-      )
-    ; ( "test_validate_with_incomplete_string"
-      , `Quick
-      , test_validate_with_incomplete_string
-      )
-    ; ( "test_validate_with_failing_decoders"
-      , `Quick
-      , test_validate_with_failing_decoders
-      )
-    ]
-end
-
-module XML = struct
-  include E.XML
-
-  let test_is_illegal_control_character () =
-    assert_true (is_illegal_control_character (Uchar.of_int 0x00)) ;
-    assert_true (is_illegal_control_character (Uchar.of_int 0x19)) ;
-    assert_false (is_illegal_control_character (Uchar.of_int 0x09)) ;
-    assert_false (is_illegal_control_character (Uchar.of_int 0x0a)) ;
-    assert_false (is_illegal_control_character (Uchar.of_int 0x0d)) ;
-    assert_false (is_illegal_control_character (Uchar.of_int 0x20))
-
-  let tests =
-    [
-      ( "test_is_illegal_control_character"
-      , `Quick
-      , test_is_illegal_control_character
-      )
-    ]
-end
-
-(** Tests the XML-specific UTF-8 UCS validation function. *)
-module XML_UTF8_UCS_validator = struct
-  include E.XML_UTF8_UCS_validator
-
-  let validate uchar =
-    if Uchar.is_valid uchar then
-      validate @@ Uchar.of_int uchar
-    else if uchar < Uchar.to_int Uchar.min || uchar > Uchar.to_int Uchar.max
-    then
-      raise E.UCS_value_out_of_range
-    else
-      raise E.UCS_value_prohibited_in_UTF8
-
-  let test_validate () =
-    let value = ref (UCS.min_value --- 1) in
-    while !value <= UCS.max_value +++ 1 do
-      if UCS.is_out_of_range !value then
-        Alcotest.check_raises "should fail" E.UCS_value_out_of_range (fun () ->
-            validate !value
-        )
-      else if UCS.is_non_character !value || UCS.is_surrogate !value then
-        Alcotest.check_raises "should fail" E.UCS_value_prohibited_in_UTF8
-          (fun () -> validate !value
-        )
-      else if
-        Uchar.is_valid !value
-        && XML.is_illegal_control_character (Uchar.of_int !value)
-      then
-        Alcotest.check_raises "should fail" E.UCS_value_prohibited_in_XML
-          (fun () -> validate !value
-        )
-      else
-        validate !value ;
-      value := !value +++ 1
-    done
-
-  let tests = [("test_validate", `Quick, test_validate)]
-end
-
-module UTF8_codec = struct
-  (** A list of canonical encoding widths of UCS values,
-      represented by tuples of the form (v, w), where:
-      v = the UCS character value to be encoded; and
-      w = the width of the encoded character, in bytes. *)
-  let valid_ucs_value_widths =
-    [
-      (1, 1)
-    ; ((1 <<< 7) --- 1, 1)
-    ; (1 <<< 7, 2)
-    ; ((1 <<< 11) --- 1, 2)
-    ; (1 <<< 11, 3)
-    ; ((1 <<< 16) --- 1, 3)
-    ; (1 <<< 16, 4)
-    ; ((1 <<< 21) --- 1, 4)
-    ]
-
-  let width_required_for_ucs_value value =
-    if value < 0x000080 (* 1 lsl  7 *) then
-      1
-    else if value < 0x000800 (* 1 lsl 11 *) then
-      2
-    else if value < 0x010000 (* 1 lsl 16 *) then
-      3
-    else
-      4
-
-  let test_width_required_for_ucs_value () =
-    List.iter
-      (fun (value, width) ->
-        Alcotest.(check int)
-          "same ints"
-          (width_required_for_ucs_value value)
-          width
-      )
-      valid_ucs_value_widths
-
-  (** A list of valid character decodings represented by
-      tuples of the form (s, (v, w)), where:
-
-      s = a validly-encoded UTF-8 string;
-      v = the UCS value represented by the string;
-          (which may or may not be valid in its own right)
-      w = the width of the encoded string, in bytes.
-
-      For each byte length b in [1...4], the list contains
-      decodings for:
-
-      v_min = the smallest UCS value encodable in b bytes.
-      v_max = the greatest UCS value encodable in b bytes. *)
-  let valid_character_decodings =
-    [
-      (*               7654321   *)
-      (* 0b0xxxxxxx                                  *)
-      (* 00000000000000xxxxxxx   *)
-      ( "\x00" (* 0b00000000                                  *)
-      , (0b000000000000000000000, 1)
-      )
-    ; ( "\x7f" (* 0b01111111                                  *)
-      , (0b000000000000001111111, 1)
-      )
-    ; (*           10987654321   *)
-      (* 0b110xxxsx 0b10xxxxxx                       *)
-      (* 0000000000xxxsxxxxxxx   *)
-      ( "\xc2\x80" (* 0b11000010 0b10000000                       *)
-      , (0b000000000000010000000, 2)
-      )
-    ; ( "\xdf\xbf" (* 0b11011111 0b10111111                       *)
-      , (0b000000000011111111111, 2)
-      )
-    ; (*      6543210987654321   *)
-      (* 0b1110xxxx 0b10sxxxxx 0b10xxxxxx            *)
-      (*      xxxxsxxxxxxxxxxx   *)
-      ( "\xe0\xa0\x80" (* 0b11100000 0b10100000 0b10000000            *)
-      , (0b000000000100000000000, 3)
-      )
-    ; ( "\xef\xbf\xbf" (* 0b11101111 0b10111111 0b10111111            *)
-      , (0b000001111111111111111, 3)
-      )
-    ; (* 109876543210987654321   *)
-      (* 0b11110xxx 0b10xsxxxx 0b10xxxxxx 0b10xxxxxx *)
-      (* xxxxsxxxxxxxxxxxxxxxx   *)
-      ( "\xf0\x90\x80\x80" (* 0b11110000 0b10010000 0b10000000 0b10000000 *)
-      , (0b000010000000000000000, 4)
-      )
-    ; ( "\xf7\xbf\xbf\xbf" (* 0b11110111 0b10111111 0b10111111 0b10111111 *)
-      , (0b111111111111111111111, 4)
-      )
-    ]
-
-  let uchar = Alcotest.int
-
-  let test_decode_character_when_valid () =
-    List.iter
-      (fun (string, (value, width)) ->
-        Alcotest.(check (pair uchar int))
-          "same pair"
-          (Lenient_UTF8_codec.decode_character string 0)
-          (value, width)
-      )
-      valid_character_decodings
-
-  (** A list of strings containing overlong character encodings.
-      For each byte length b in [2...4], this list contains the
-      overlong encoding e (v), where v is the UCS value one less
-      than the smallest UCS value validly-encodable in b bytes. *)
-  let overlong_character_encodings =
-    [
-      "\xc1\xbf" (* 0b11000001 0b10111111                       *)
-    ; "\xe0\x9f\xbf" (* 0b11100000 0b10011111 0b10111111            *)
-    ; "\xf0\x8f\xbf\xbf" (* 0b11110000 0b10001111 0b10111111 0b10111111 *)
-    ]
-
-  let test_decode_character_when_overlong () =
-    List.iter
-      (fun string ->
-        Alcotest.check_raises "should fail" E.UTF8_encoding_not_canonical
-          (fun () -> Lenient_UTF8_codec.decode_character string 0 |> ignore
-        )
-      )
-      overlong_character_encodings
-
-  let tests =
-    [
-      ( "test_width_required_for_ucs_value"
-      , `Quick
-      , test_width_required_for_ucs_value
-      )
-    ; ( "test_decode_character_when_valid"
-      , `Quick
-      , test_decode_character_when_valid
-      )
-    ; ( "test_decode_character_when_overlong"
-      , `Quick
-      , test_decode_character_when_overlong
-      )
-    ]
-end
-
-let () =
-  Alcotest.run "Encodings"
-    [
-      ("UCS", UCS.tests)
-    ; ("XML", XML.tests)
-    ; ("String_validator", String_validator.tests)
-    ; ("XML_UTF8_UCS_validator", XML_UTF8_UCS_validator.tests)
-    ; ("UTF8_codec", UTF8_codec.tests)
-    ]
diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/utf8.ml b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/utf8.ml
new file mode 100644
index 000000000..d17d85b3b
--- /dev/null
+++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/utf8.ml
@@ -0,0 +1,74 @@
+(*
+ * Copyright (c) Cloud Software Group, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *)
+
+let is_valid = String.is_valid_utf_8
+
+(* deprecated - reject invalid UTF-8 *)
+let longest_valid_prefix str =
+  let len = String.length str in
+  let rec loop = function
+    | i when i < len ->
+        let dec = String.get_utf_8_uchar str i in
+        if Uchar.utf_decode_is_valid dec then
+          loop (i + Uchar.utf_decode_length dec)
+        else
+          String.sub str 0 i
+    | i when i = len ->
+        str
+    | i ->
+        String.sub str 0 i (* never reached *)
+  in
+  loop 0
+
+module XML = struct
+  (** some UTF-8 characters are not legal in XML. Assuming uchar is
+      legal UTF-8, further check that it is legal in XML *)
+  let is_legal uchar =
+    let uchar = Uchar.to_int uchar in
+    uchar >= 0x20 || uchar = 0x09 || uchar = 0x0a || uchar = 0x0d
+  [@@inline]
+
+  let is_valid str =
+    let len = String.length str in
+    let rec loop = function
+      | i when i < len ->
+          let dec = String.get_utf_8_uchar str i in
+          Uchar.utf_decode_is_valid dec
+          && is_legal (Uchar.utf_decode_uchar dec)
+          && loop (i + Uchar.utf_decode_length dec)
+      | _ ->
+          true
+    in
+    loop 0
+
+  (* deprecated - reject invalid UTF-8 *)
+  let longest_valid_prefix str =
+    let len = String.length str in
+    let rec loop = function
+      | i when i < len ->
+          let dec = String.get_utf_8_uchar str i in
+          if
+            Uchar.utf_decode_is_valid dec
+            && is_legal (Uchar.utf_decode_uchar dec)
+          then
+            loop (i + Uchar.utf_decode_length dec)
+          else
+            String.sub str 0 i
+      | i when i = len ->
+          str (* avoid copy *)
+      | i ->
+          String.sub str 0 i (* never reached *)
+    in
+    loop 0
+end
diff --git a/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/utf8.mli b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/utf8.mli
new file mode 100644
index 000000000..6d8949e2f
--- /dev/null
+++ b/ocaml/libs/xapi-stdext/lib/xapi-stdext-encodings/utf8.mli
@@ -0,0 +1,31 @@
+(*
+ * Copyright (c) Cloud Software Group, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *)
+
+val is_valid : string -> bool
+(** true, if a string is a proper UTF-8 string *)
+
+val longest_valid_prefix : string -> string
+(** Deprecated. Longest prefix of a string that is proper UTF-8 *)
+
+(* strings in XML are more restricted than UTF-8 in general. The must be
+   valid UTF-8 and must not contain certain characters *)
+
+module XML : sig
+  val is_valid : string -> bool
+  (** true, if a string is a proper UTF-8 string in XML *)
+
+  val longest_valid_prefix : string -> string
+  (** Deprecated. longest prefix of a string that is proper UTF-8.
+      Better reject invalid UTF-8. *)
+end
diff --git a/ocaml/xapi/xapi_message.ml b/ocaml/xapi/xapi_message.ml
index 408ba7acf..4c08648dc 100644
--- a/ocaml/xapi/xapi_message.ml
+++ b/ocaml/xapi/xapi_message.ml
@@ -28,7 +28,7 @@
  *)
 
 module Date = Clock.Date
-module Encodings = Xapi_stdext_encodings.Encodings
+module Encodings = Xapi_stdext_encodings
 module Listext = Xapi_stdext_std.Listext
 module Pervasiveext = Xapi_stdext_pervasives.Pervasiveext
 module Unixext = Xapi_stdext_unix.Unixext
@@ -414,7 +414,7 @@ let create ~__context ~name ~priority ~cls ~obj_uuid ~body =
   debug "Message.create %s %Ld %s %s" name priority
     (Record_util.cls_to_string cls)
     obj_uuid ;
-  if not (Encodings.UTF8_XML.is_valid body) then
+  if not (Encodings.Utf8.is_valid body) then
     raise (Api_errors.Server_error (Api_errors.invalid_value, ["UTF8 expected"])) ;
   if not (check_uuid ~__context ~cls ~uuid:obj_uuid) then
     raise
