(* $Id: uCharInfo.mli,v 1.20 2006/08/06 19:48:55 yori Exp $ *)
(* Copyright 2002, 2003 Yamagata Yoriyuki. distributed with LGPL *)

(** Signature of the character information module: general categories,
    character properties, scripts, casing, combining classes,
    decomposition and composition data. *)
module type Type = sig

  (** Character Information *)

  (** Type of Unicode general character categories.
      Each variant specifies
      - [`Lu] : Letter, Uppercase
      - [`Ll] : Letter, Lowercase
      - [`Lt] : Letter, Titlecase
      - [`Mn] : Mark, Non-Spacing
      - [`Mc] : Mark, Spacing Combining
      - [`Me] : Mark, Enclosing
      - [`Nd] : Number, Decimal Digit
      - [`Nl] : Number, Letter
      - [`No] : Number, Other
      - [`Zs] : Separator, Space
      - [`Zl] : Separator, Line
      - [`Zp] : Separator, Paragraph
      - [`Cc] : Other, Control
      - [`Cf] : Other, Format
      - [`Cs] : Other, Surrogate
      - [`Co] : Other, Private Use
      - [`Cn] : Other, Not Assigned
      - [`Lm] : Letter, Modifier
      - [`Lo] : Letter, Other
      - [`Pc] : Punctuation, Connector
      - [`Pd] : Punctuation, Dash
      - [`Ps] : Punctuation, Open
      - [`Pe] : Punctuation, Close
      - [`Pi] : Punctuation, Initial quote
      - [`Pf] : Punctuation, Final quote
      - [`Po] : Punctuation, Other
      - [`Sm] : Symbol, Math
      - [`Sc] : Symbol, Currency
      - [`Sk] : Symbol, Modifier
      - [`So] : Symbol, Other *)
  type general_category_type =
    [ `Lu  (** Letter, Uppercase *)
    | `Ll  (** Letter, Lowercase *)
    | `Lt  (** Letter, Titlecase *)
    | `Mn  (** Mark, Non-Spacing *)
    | `Mc  (** Mark, Spacing Combining *)
    | `Me  (** Mark, Enclosing *)
    | `Nd  (** Number, Decimal Digit *)
    | `Nl  (** Number, Letter *)
    | `No  (** Number, Other *)
    | `Zs  (** Separator, Space *)
    | `Zl  (** Separator, Line *)
    | `Zp  (** Separator, Paragraph *)
    | `Cc  (** Other, Control *)
    | `Cf  (** Other, Format *)
    | `Cs  (** Other, Surrogate *)
    | `Co  (** Other, Private Use *)
    | `Cn  (** Other, Not Assigned *)
    | `Lm  (** Letter, Modifier *)
    | `Lo  (** Letter, Other *)
    | `Pc  (** Punctuation, Connector *)
    | `Pd  (** Punctuation, Dash *)
    | `Ps  (** Punctuation, Open *)
    | `Pe  (** Punctuation, Close *)
    | `Pi  (** Punctuation, Initial quote *)
    | `Pf  (** Punctuation, Final quote *)
    | `Po  (** Punctuation, Other *)
    | `Sm  (** Symbol, Math *)
    | `Sc  (** Symbol, Currency *)
    | `Sk  (** Symbol, Modifier *)
    | `So  (** Symbol, Other *) ]

  (** [general_category u] is the general category of the character [u]. *)
  val general_category : UChar.t -> general_category_type

  (** Load a map whose values are general categories.
      NOTE(review): presumably keyed by character and backed by the same
      data as {!general_category}; confirm against the implementation. *)
  val load_general_category_map : unit -> general_category_type UMap.t

  (** Type of character properties *)
  type character_property_type =
    [
      (* Derived Core Properties *)
      `Math
    | `Alphabetic
    | `Lowercase
    | `Uppercase
    | `ID_Start
    | `ID_Continue
    | `XID_Start
    | `XID_Continue
    | `Default_Ignorable_Code_Point
    | `Grapheme_Extend
    | `Grapheme_Base
      (* Extended Properties *)
    | `Bidi_Control
    | `White_Space
    | `Hyphen
    | `Quotation_Mark
    | `Terminal_Punctuation
    | `Other_Math
    | `Hex_Digit
    | `Ascii_Hex_Digit
    | `Other_Alphabetic
    | `Ideographic
    | `Diacritic
    | `Extender
    | `Other_Lowercase
    | `Other_Uppercase
    | `Noncharacter_Code_Point
    | `Other_Grapheme_Extend
    | `Grapheme_Link
    | `IDS_Binary_Operator
    | `IDS_Trinary_Operator
    | `Radical
    | `Unified_Ideograph
      (* NOTE(review): the next tag is spelled with a lowercase "d", unlike
         `Default_Ignorable_Code_Point above.  It is part of the public
         interface, so it is left unchanged here. *)
    | `Other_default_Ignorable_Code_Point
    | `Deprecated
    | `Soft_Dotted
    | `Logical_Order_Exception ]

  (** Load the table for the given character type. *)
  val load_property_tbl : character_property_type -> UCharTbl.Bool.t

  (** Load the table for the given name of the character type.
      The name is the polymorphic variant tag of
      {!character_property_type} with the leading backquote removed.
      NOTE(review): the behaviour for a name that matches no tag is not
      visible from this interface; confirm against the implementation. *)
  val load_property_tbl_by_name : string -> UCharTbl.Bool.t

  (** Load the set of characters of the given character type. *)
  val load_property_set : character_property_type -> USet.t

  (** Load the set of characters of the given name of the character type.
      The name is the polymorphic variant tag of
      {!character_property_type} with the leading backquote removed.
      NOTE(review): the behaviour for a name that matches no tag is not
      visible from this interface; confirm against the implementation. *)
  val load_property_set_by_name : string -> USet.t

  (** Type of scripts *)
  type script_type =
    [ `Common
    | `Inherited
    | `Latin
    | `Greek
    | `Cyrillic
    | `Armenian
    | `Hebrew
    | `Arabic
    | `Syriac
    | `Thaana
    | `Devanagari
    | `Bengali
    | `Gurmukhi
    | `Gujarati
    | `Oriya
    | `Tamil
    | `Telugu
    | `Kannada
    | `Malayalam
    | `Sinhala
    | `Thai
    | `Lao
    | `Tibetan
    | `Myanmar
    | `Georgian
    | `Hangul
    | `Ethiopic
    | `Cherokee
    | `Canadian_Aboriginal
    | `Ogham
    | `Runic
    | `Khmer
    | `Mongolian
    | `Hiragana
    | `Katakana
    | `Bopomofo
    | `Han
    | `Yi
    | `Old_Italic
    | `Gothic
    | `Deseret
    | `Tagalog
    | `Hanunoo
    | `Buhid
    | `Tagbanwa ]

  (** [script u] is the script of the character [u]. *)
  val script : UChar.t -> script_type

  (** Load a map whose values are scripts.
      NOTE(review): presumably keyed by character and backed by the same
      data as {!script}; confirm against the implementation. *)
  val load_script_map : unit -> script_type UMap.t

  (** {2 Casing} *)

  (** Load a table whose entries are single characters.
      NOTE(review): judging by the name, this is the one-to-one lower case
      mapping; confirm against the implementation. *)
  val load_to_lower1_tbl : unit -> UChar.t UCharTbl.t

  (** Load a table whose entries are single characters.
      NOTE(review): judging by the name, this is the one-to-one upper case
      mapping; confirm against the implementation. *)
  val load_to_upper1_tbl : unit -> UChar.t UCharTbl.t

  (** Load a table whose entries are single characters.
      NOTE(review): judging by the name, this is the one-to-one title case
      mapping; confirm against the implementation. *)
  val load_to_title1_tbl : unit -> UChar.t UCharTbl.t

  (** Conditions attached to a special casing entry.  [`Locale] carries a
      string and [`Not] wraps another condition.
      NOTE(review): the tag names appear to follow the condition names used
      by the Unicode special casing data; confirm against the
      implementation. *)
  type casemap_condition =
    [ `Locale of string
    | `FinalSigma
    | `AfterSoftDotted
    | `MoreAbove
    | `BeforeDot
    | `Not of casemap_condition ]

  (** One special casing entry: three character sequences together with a
      list of conditions.
      NOTE(review): the field names suggest the lower, title and upper case
      mappings and the conditions under which the entry applies; how the
      condition list is combined is not visible here -- confirm against the
      implementation. *)
  type special_casing_property =
    {lower : UChar.t list;
     title : UChar.t list;
     upper : UChar.t list;
     condition : casemap_condition list;}

  (** Load a table whose entries are lists of special casing entries. *)
  val load_conditional_casing_tbl :
    unit -> special_casing_property list UCharTbl.t

  (** Load a table whose entries are character sequences.
      NOTE(review): judging by the name, this is the case folding of each
      character; confirm against the implementation. *)
  val load_casefolding_tbl : unit -> UChar.t list UCharTbl.t

  (** Combined class.
      A combined class is an integer in the range 0 -- 255 which shows how
      the character interacts with other combining characters. *)
  val combined_class : UChar.t -> int

  (** {2 Decomposition} *)

  (** Types of decomposition. *)
  type decomposition_type =
    [ `Canon
    | `Font
    | `NoBreak
    | `Initial
    | `Medial
    | `Final
    | `Isolated
    | `Circle
    | `Super
    | `Sub
    | `Vertical
    | `Wide
    | `Narrow
    | `Small
    | `Square
    | `Fraction
    | `Compat ]

  (** Decomposition information of a character. *)
  type decomposition_info =
    [ `Canonform  (** Already in the canonical form *)
    | `HangulSyllable  (** Hangul is treated algorithmically. *)
    | `Composite of decomposition_type * UChar.t list
      (** [`Composite (dtype, text)] means the given character is
          decomposed into [text] by [dtype] decomposition. *) ]

  (** Load the table of decomposition information. *)
  val load_decomposition_tbl : unit -> decomposition_info UCharTbl.t

  (** {2 Canonical Composition} *)

  (** The return value [[(u_1, u'_1); ... (u_n, u'_n)]] means for the given
      character [u], [u u_i] forms the canonical composition [u'_i].
      If [u] is a Hangul jamo, composition returns []. *)
  val load_composition_tbl : unit -> (UChar.t * UChar.t) list UCharTbl.t

  (** Whether the given composed character is used in NFC or NFKC.
      NOTE(review): the name says "exclusion" while the description says
      "used"; which case the boolean [true] stands for is not visible from
      this interface -- confirm against the implementation. *)
  val load_composition_exclusion_tbl : unit -> UCharTbl.Bool.t

end

(** [Make (Config)] provides the character information interface {!Type}
    for the configuration module [Config]. *)
module Make (Config : ConfigInt.Type) : Type