(** UTF-16 support for Ulex.
   Implementation as described in "http://www.ietf.org/rfc/rfc2781.txt".
 *)

(** NOTE(review): presumably raised when the input is not well-formed
   UTF-16; the exact conditions are not stated in this interface --
   confirm against the implementation. *)
exception MalFormed

(** Byte order of a UTF-16 encoding. UTF-16 can be encoded in little
   endian format (0xabcd -> (0xcd|0xab)) or in big endian format
   (0xabcd -> (0xab|0xcd)). *)
type byte_order = Little_endian | Big_endian

(** {6 Interface } *)

(** [to_int_array opt_bo str spos bytes] decodes the part of the string
   [str] of length [bytes] starting at position [spos] into an int
   array. If [opt_bo] is [None], the function tries to detect a BOM;
   if it cannot find one, it assumes big endian byte order. If
   [opt_bo] is [Some bo], byte order [bo] is assumed and potential
   byte order marks are interpreted as code points 0xfeff. *)
val to_int_array: byte_order option -> string -> int -> int -> int array

(** [from_int_array bo a apos len bom] encodes [len] code points of the
   int array [a], starting at position [apos], into a string with
   byte order [bo]. The result starts with a BOM if [bom = true]. *)
val from_int_array: byte_order -> int array -> int -> int -> bool -> string

(** [stream_from_char_stream opt_bo str] creates a new int stream
   containing the code points encoded in the char stream [str].
   [opt_bo] is treated as in {!to_int_array}. *)
val stream_from_char_stream: byte_order option -> char Stream.t -> int Stream.t

(** {6 Low level} *)

(** [get_byte_order c1 c2] determines the byte order from the pair of
   bytes/characters [c1] and [c2]. *)
val get_byte_order: char -> char -> byte_order


(** [from_stream bo s] reads the next code point from the char stream
   [s], which is encoded in byte order [bo]. *)
val from_stream: byte_order -> char Stream.t -> int

(** [number_of_char_pair bo c1 c2] returns the code point encoded in
   the two characters [c1] and [c2], following byte order [bo]. *)
val number_of_char_pair: byte_order -> char -> char -> int

(** [char_pair_of_number bo cp] encodes the code point [cp] into a pair
   of characters, following byte order [bo]. *)
val char_pair_of_number: byte_order -> int -> char * char

(** [next_code bo s pos bytes] reads the code point starting at
   position [pos] in a string [s] of total length [bytes], using byte
   order [bo].

   NOTE(review): the result is an [int * int] pair, but this interface
   does not say what each component is -- presumably the code point
   and the position following it; confirm against the
   implementation. *)
val next_code: byte_order -> string -> int -> int -> int * int

(** [compute_len opt_bo str pos len] computes the number of code points
   encoded in the string [str] from position [pos] to position
   [pos+len-1]. *)
val compute_len: byte_order option -> string -> int -> int -> int

(** [blit_to_int opt_bo str spos a apos n] decodes [n] bytes from the
   string [str], starting at position [spos], into the array [a],
   starting at position [apos].

   NOTE(review): [opt_bo] is presumably treated as in
   {!to_int_array}; confirm against the implementation. *)
val blit_to_int:
 byte_order option -> string -> int -> int array -> int -> int -> unit


(** [store bo buf cp] adds the code point [cp] to the buffer [buf],
   following byte order [bo]. *)
val store: byte_order -> Buffer.t -> int -> unit


val from_utf16_stream: char Stream.t -> byte_order option -> Ulexing.lexbuf
  (** [from_utf16_stream s opt_bo] creates a lexbuf from a UTF-16
      encoded stream. If [opt_bo] is [None], the function expects a
      BOM (Byte Order Mark) and takes the byte order to be
      [Utf16.Big_endian] if it cannot find one. If [opt_bo] is
      [Some bo], [bo] is taken as the byte order. In this case a
      leading BOM is kept in the stream (the lexer has to ignore it),
      and a `wrong' BOM ([0xfffe]) will raise
      [Utf16.InvalidCodepoint].

      NOTE(review): no [InvalidCodepoint] exception is declared in the
      part of this interface shown here -- confirm where it is
      defined.
    *)

val from_utf16_channel: in_channel -> byte_order option-> Ulexing.lexbuf
  (** Works like {!from_utf16_stream}, but takes an [in_channel]
      instead of a stream. *)
                                                                  
val from_utf16_string: string -> byte_order option -> Ulexing.lexbuf 
  (** Works like {!from_utf16_stream}, but takes a [string] instead
      of a stream. *)
                                                              
val utf16_lexeme: Ulexing.lexbuf -> byte_order -> bool -> string
  (** [utf16_lexeme lb bo bom] works like [Ulexing.lexeme], but the
      result is encoded in UTF-16 with byte order [bo] and starts with
      a BOM if [bom = true].
  *)
                                                  
val utf16_sub_lexeme: Ulexing.lexbuf -> int -> int -> byte_order -> bool -> string
  (** [utf16_sub_lexeme lb pos len bo bom] works like
      [Ulexing.sub_lexeme], but the result is encoded in UTF-16 with
      byte order [bo] and starts with a BOM if [bom = true]. *)