mirror of
https://git.eaglercraft.rip/eaglercraft/eaglercraft-1.8.git
synced 2025-04-19 06:57:38 -07:00
563 lines
19 KiB
Java
563 lines
19 KiB
Java
/*
|
|
* Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved.
|
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
*
|
|
* This code is free software; you can redistribute it and/or modify it
|
|
* under the terms of the GNU General Public License version 2 only, as
|
|
* published by the Free Software Foundation. Oracle designates this
|
|
* particular file as subject to the "Classpath" exception as provided
|
|
* by Oracle in the LICENSE file that accompanied this code.
|
|
*
|
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
* version 2 for more details (a copy is included in the LICENSE file that
|
|
* accompanied this code).
|
|
*
|
|
* You should have received a copy of the GNU General Public License version
|
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
*
|
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
|
* or visit www.oracle.com if you need additional information or have any
|
|
* questions.
|
|
*/
|
|
|
|
/**
|
|
*******************************************************************************
|
|
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
|
* others. All Rights Reserved.
|
|
*******************************************************************************
|
|
*/
|
|
|
|
package jdk_internal.icu.lang;
|
|
|
|
import jdk_internal.icu.impl.UBiDiProps;
|
|
import jdk_internal.icu.impl.UCharacterProperty;
|
|
import jdk_internal.icu.text.Normalizer2;
|
|
import jdk_internal.icu.text.UTF16;
|
|
import jdk_internal.icu.util.VersionInfo;
|
|
|
|
/**
|
|
* <p>
|
|
* The UCharacter class provides extensions to the
|
|
* <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
|
|
* java.lang.Character</a> class. These extensions provide support for more
|
|
* Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
|
|
* class, provide support for supplementary characters (those with code points
|
|
* above U+FFFF). Each ICU release supports the latest version of Unicode
|
|
* available at that time.
|
|
*
|
|
* <p>
|
|
* Code points are represented in these API using ints. While it would be more
|
|
* convenient in Java to have a separate primitive datatype for them, ints
|
|
* suffice in the meantime.
|
|
*
|
|
* <p>
|
|
* To use this class please add the jar file name icu4j.jar to the class path,
|
|
* since it contains data files which supply the information used by this
|
|
* file.<br>
|
|
* E.g. In Windows <br>
|
|
* <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
|
|
* Otherwise, another method would be to copy the files uprops.dat and
|
|
* unames.icu from the icu4j source subdirectory
|
|
* <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
|
|
* <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
|
|
*
|
|
* <p>
|
|
* Aside from the additions for UTF-16 support, and the updated Unicode
|
|
* properties, the main differences between UCharacter and Character are:
|
|
* <ul>
|
|
* <li>UCharacter is not designed to be a char wrapper and does not have APIs
* that involve management of a single char.<br>
|
|
* These include:
|
|
* <ul>
|
|
* <li>char charValue(),
|
|
* <li>int compareTo(java.lang.Character, java.lang.Character), etc.
|
|
* </ul>
|
|
* <li>UCharacter does not include Character APIs that are deprecated, nor does
|
|
* it include the Java-specific character information, such as boolean
|
|
* isJavaIdentifierPart(char ch).
|
|
* <li>Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric values
|
|
* '10' - '35'. UCharacter also does this in digit and getNumericValue, to
|
|
* adhere to the java semantics of these methods. New methods unicodeDigit, and
|
|
* getUnicodeNumericValue do not treat the above code points as having numeric
|
|
* values. This is a semantic change from ICU4J 1.3.1.
|
|
* </ul>
|
|
* <p>
|
|
* Further detail on differences can be determined using the program <a href=
|
|
* "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
|
|
* com.ibm.icu.dev.test.lang.UCharacterCompare</a>
|
|
* </p>
|
|
* <p>
|
|
* In addition to Java compatibility functions, which calculate derived
|
|
* properties, this API provides low-level access to the Unicode Character
|
|
* Database.
|
|
* </p>
|
|
* <p>
|
|
* Unicode assigns each code point (not just assigned character) values for many
|
|
* properties. Most of them are simple boolean flags, or constants from a small
|
|
* enumerated list. For some properties, values are strings or other relatively
|
|
* more complex types.
|
|
* </p>
|
|
* <p>
|
|
* For more information see <a href="http://www.unicode.org/ucd/">"About the
|
|
* Unicode Character Database"</a> (http://www.unicode.org/ucd/) and the
|
|
* <a href="http://www.icu-project.org/userguide/properties.html">ICU User Guide
|
|
* chapter on Properties</a>
|
|
* (http://www.icu-project.org/userguide/properties.html).
|
|
* </p>
|
|
* <p>
|
|
* There are also functions that provide easy migration from C/POSIX functions
|
|
* like isblank(). Their use is generally discouraged because the C/POSIX
|
|
* standards do not define their semantics beyond the ASCII range, which means
|
|
* that different implementations exhibit very different behavior. Instead,
|
|
* Unicode properties should be used directly.
|
|
* </p>
|
|
* <p>
|
|
* There are also only a few, broad C/POSIX character classes, and they tend to
|
|
* be used for conflicting purposes. For example, the "isalpha()" class is
|
|
* sometimes used to determine word boundaries, while a more sophisticated
|
|
* approach would at least distinguish initial letters from continuation
|
|
* characters (the latter including combining marks). (In ICU, BreakIterator is
|
|
* the most sophisticated API for word boundaries.) Another example: There is no
|
|
* "istitle()" class for titlecase characters.
|
|
* </p>
|
|
* <p>
|
|
* ICU 3.4 and later provides API access for all twelve C/POSIX character
|
|
* classes. ICU implements them according to the Standard Recommendations in
|
|
* Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
|
|
* (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
|
|
* </p>
|
|
* <p>
|
|
* API access for C/POSIX character classes is as follows:
|
|
*
|
|
* <pre>{@code
|
|
* - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
|
|
* - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
|
|
* - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
|
|
* - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|
|
|
* (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|
|
|
* (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
|
|
* - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
|
|
* - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
|
|
* - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
|
|
* - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
|
|
* - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
|
|
* - cntrl: getType(c)==CONTROL
|
|
* - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
|
|
* - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)
|
|
* }</pre>
|
|
* </p>
|
|
* <p>
|
|
* The C/POSIX character classes are also available in UnicodeSet patterns,
|
|
* using patterns like [:graph:] or \p{graph}.
|
|
* </p>
|
|
*
|
|
* There are several ICU (and Java) whitespace functions. Comparison:
|
|
* <ul>
|
|
* <li>isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property; most of
|
|
* general categories "Z" (separators) + most whitespace ISO controls (including
|
|
* no-break spaces, but excluding IS1..IS4 and ZWSP)
|
|
* <li>isWhitespace: Java isWhitespace; Z + whitespace ISO controls but
|
|
* excluding no-break spaces
|
|
* <li>isSpaceChar: just Z (including no-break spaces)
|
|
* </ul>
|
|
* </p>
|
|
* <p>
|
|
* This class is not subclassable.
|
|
* </p>
|
|
*
|
|
* @author Syn Wee Quek
|
|
* @stable ICU 2.1
|
|
* @see com.ibm.icu.lang.UCharacterEnums
|
|
*/
|
|
|
|
public final class UCharacter {
|
|
|
|
/**
 * Joining Group constants.
 * <p>
 * Only the "no joining group" value is declared here. The fields carry no
 * explicit modifiers because interface fields are implicitly
 * {@code public static final}, and a member interface is implicitly static.
 *
 * @see UProperty#JOINING_GROUP
 * @stable ICU 2.4
 */
public interface JoiningGroup {
    /**
     * Joining group value 0: no joining group.
     *
     * @stable ICU 2.4
     */
    int NO_JOINING_GROUP = 0;
}
|
|
|
|
/**
 * Numeric Type constants.
 * <p>
 * The fields carry no explicit modifiers because interface fields are
 * implicitly {@code public static final}, and a member interface is
 * implicitly static.
 *
 * @see UProperty#NUMERIC_TYPE
 * @stable ICU 2.4
 */
public interface NumericType {
    /**
     * Numeric type "none" (value 0).
     *
     * @stable ICU 2.4
     */
    int NONE = 0;

    /**
     * Numeric type "decimal" (value 1).
     *
     * @stable ICU 2.4
     */
    int DECIMAL = 1;

    /**
     * Numeric type "digit" (value 2).
     *
     * @stable ICU 2.4
     */
    int DIGIT = 2;

    /**
     * Numeric type "numeric" (value 3).
     *
     * @stable ICU 2.4
     */
    int NUMERIC = 3;

    /**
     * Number of numeric type values declared in this interface
     * ({@link #NONE} through {@link #NUMERIC}).
     *
     * @stable ICU 2.4
     */
    int COUNT = 4;
}
|
|
|
|
/**
 * Hangul Syllable Type constants.
 * <p>
 * The bracketed tag in each constant's description is the short label that
 * accompanied the value in the original declaration. The fields carry no
 * explicit modifiers because interface fields are implicitly
 * {@code public static final}, and a member interface is implicitly static.
 *
 * @see UProperty#HANGUL_SYLLABLE_TYPE
 * @stable ICU 2.6
 */
public interface HangulSyllableType {
    /**
     * Not applicable [NA] (value 0).
     *
     * @stable ICU 2.6
     */
    int NOT_APPLICABLE = 0;

    /**
     * Leading jamo [L] (value 1).
     *
     * @stable ICU 2.6
     */
    int LEADING_JAMO = 1;

    /**
     * Vowel jamo [V] (value 2).
     *
     * @stable ICU 2.6
     */
    int VOWEL_JAMO = 2;

    /**
     * Trailing jamo [T] (value 3).
     *
     * @stable ICU 2.6
     */
    int TRAILING_JAMO = 3;

    /**
     * LV syllable [LV] (value 4).
     *
     * @stable ICU 2.6
     */
    int LV_SYLLABLE = 4;

    /**
     * LVT syllable [LVT] (value 5).
     *
     * @stable ICU 2.6
     */
    int LVT_SYLLABLE = 5;

    /**
     * Number of Hangul syllable type values declared in this interface
     * ({@link #NOT_APPLICABLE} through {@link #LVT_SYLLABLE}).
     *
     * @stable ICU 2.6
     */
    int COUNT = 6;
}
|
|
|
|
// public data members -----------------------------------------------
|
|
|
|
/**
|
|
* The lowest Unicode code point value.
|
|
*
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
|
|
|
|
/**
|
|
* The highest Unicode code point value (scalar value) according to the Unicode
|
|
* Standard. This is a 21-bit value (21 bits, rounded up).<br>
|
|
* Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
|
|
*
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
|
|
|
|
// public methods ----------------------------------------------------
|
|
|
|
/**
|
|
* Returns the numeric value of a decimal digit code point. <br>
|
|
* This method observes the semantics of
|
|
* <code>java.lang.Character.digit()</code>. Note that this will return positive
|
|
* values for code points for which isDigit returns false, just like
|
|
* java.lang.Character. <br>
|
|
* <em>Semantic Change:</em> In release 1.3.1 and prior, this did not treat the
|
|
* European letters as having a digit value, and also treated numeric letters
|
|
* and other numbers as digits. This has been changed to conform to the java
|
|
* semantics. <br>
|
|
* A code point is a valid digit if and only if:
|
|
* <ul>
|
|
* <li>ch is a decimal digit or one of the european letters, and
|
|
* <li>the value of ch is less than the specified radix.
|
|
* </ul>
|
|
*
|
|
* @param ch the code point to query
|
|
* @param radix the radix
|
|
* @return the numeric value represented by the code point in the specified
|
|
* radix, or -1 if the code point is not a decimal digit or if its value
|
|
* is too large for the radix
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static int digit(int ch, int radix) {
|
|
if (2 <= radix && radix <= 36) {
|
|
int value = digit(ch);
|
|
if (value < 0) {
|
|
// ch is not a decimal digit, try latin letters
|
|
value = UCharacterProperty.getEuropeanDigit(ch);
|
|
}
|
|
return (value < radix) ? value : -1;
|
|
} else {
|
|
return -1; // invalid radix
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Returns the numeric value of a decimal digit code point. <br>
|
|
* This is a convenience overload of <code>digit(int, int)</code> that provides
|
|
* a decimal radix. <br>
|
|
* <em>Semantic Change:</em> In release 1.3.1 and prior, this treated numeric
|
|
* letters and other numbers as digits. This has been changed to conform to the
|
|
* java semantics.
|
|
*
|
|
* @param ch the code point to query
|
|
* @return the numeric value represented by the code point, or -1 if the code
|
|
* point is not a decimal digit or if its value is too large for a
|
|
* decimal radix
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static int digit(int ch) {
|
|
return UCharacterProperty.INSTANCE.digit(ch);
|
|
}
|
|
|
|
/**
|
|
* Returns a value indicating a code point's Unicode category. Up-to-date
|
|
* Unicode implementation of java.lang.Character.getType() except for the above
|
|
* mentioned code points that had their category changed.<br>
|
|
* Return results are constants from the interface
|
|
* <a href=UCharacterCategory.html>UCharacterCategory</a><br>
|
|
* <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
|
|
* those returned by java.lang.Character.getType. UCharacterCategory values
|
|
* match the ones used in ICU4C, while java.lang.Character type values, though
|
|
* similar, skip the value 17.
|
|
* </p>
|
|
*
|
|
* @param ch code point whose type is to be determined
|
|
* @return category which is a value of UCharacterCategory
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static int getType(int ch) {
|
|
return UCharacterProperty.INSTANCE.getType(ch);
|
|
}
|
|
|
|
/**
|
|
* Returns the Bidirection property of a code point. For example, 0x0041 (letter
|
|
* A) has the LEFT_TO_RIGHT directional property.<br>
|
|
* Result returned belongs to the interface
|
|
* <a href=UCharacterDirection.html>UCharacterDirection</a>
|
|
*
|
|
* @param ch the code point to be determined its direction
|
|
* @return direction constant from UCharacterDirection.
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static int getDirection(int ch) {
|
|
return UBiDiProps.INSTANCE.getClass(ch);
|
|
}
|
|
|
|
/**
|
|
* Maps the specified code point to a "mirror-image" code point. For code points
|
|
* with the "mirrored" property, implementations sometimes need a "poor man's"
|
|
* mapping to another code point such that the default glyph may serve as the
|
|
* mirror-image of the default glyph of the specified code point.<br>
|
|
* This is useful for text conversion to and from codepages with visual order,
|
|
* and for displays without glyph selection capabilities.
|
|
*
|
|
* @param ch code point whose mirror is to be retrieved
|
|
* @return another code point that may serve as a mirror-image substitute, or ch
|
|
* itself if there is no such mapping or ch does not have the "mirrored"
|
|
* property
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static int getMirror(int ch) {
|
|
return UBiDiProps.INSTANCE.getMirror(ch);
|
|
}
|
|
|
|
/**
|
|
* Maps the specified character to its paired bracket character. For
|
|
* Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int). Otherwise
|
|
* c itself is returned. See http://www.unicode.org/reports/tr9/
|
|
*
|
|
* @param c the code point to be mapped
|
|
* @return the paired bracket code point, or c itself if there is no such
|
|
* mapping (Bidi_Paired_Bracket_Type=None)
|
|
*
|
|
* @see UProperty#BIDI_PAIRED_BRACKET
|
|
* @see UProperty#BIDI_PAIRED_BRACKET_TYPE
|
|
* @see #getMirror(int)
|
|
* @stable ICU 52
|
|
*/
|
|
public static int getBidiPairedBracket(int c) {
|
|
return UBiDiProps.INSTANCE.getPairedBracket(c);
|
|
}
|
|
|
|
/**
|
|
* Returns the combining class of the argument codepoint
|
|
*
|
|
* @param ch code point whose combining is to be retrieved
|
|
* @return the combining class of the codepoint
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static int getCombiningClass(int ch) {
|
|
return Normalizer2.getNFDInstance().getCombiningClass(ch);
|
|
}
|
|
|
|
/**
|
|
* Returns the version of Unicode data used.
|
|
*
|
|
* @return the unicode version number used
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static VersionInfo getUnicodeVersion() {
|
|
return UCharacterProperty.INSTANCE.m_unicodeVersion_;
|
|
}
|
|
|
|
/**
|
|
* Returns a code point corresponding to the two UTF16 characters.
|
|
*
|
|
* @param lead the lead char
|
|
* @param trail the trail char
|
|
* @return code point if surrogate characters are valid.
|
|
* @exception IllegalArgumentException thrown when argument characters do not
|
|
* form a valid codepoint
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static int getCodePoint(char lead, char trail) {
|
|
if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
|
|
return UCharacterProperty.getRawSupplementary(lead, trail);
|
|
}
|
|
throw new IllegalArgumentException("Illegal surrogate characters");
|
|
}
|
|
|
|
/**
|
|
* Returns the "age" of the code point.
|
|
* </p>
|
|
* <p>
|
|
* The "age" is the Unicode version when the code point was first designated (as
|
|
* a non-character or for Private Use) or assigned a character.
|
|
* <p>
|
|
* This can be useful to avoid emitting code points to receiving processes that
|
|
* do not accept newer characters.
|
|
* </p>
|
|
* <p>
|
|
* The data is from the UCD file DerivedAge.txt.
|
|
* </p>
|
|
*
|
|
* @param ch The code point.
|
|
* @return the Unicode version number
|
|
* @stable ICU 2.6
|
|
*/
|
|
public static VersionInfo getAge(int ch) {
|
|
if (ch < MIN_VALUE || ch > MAX_VALUE) {
|
|
throw new IllegalArgumentException("Codepoint out of bounds");
|
|
}
|
|
return UCharacterProperty.INSTANCE.getAge(ch);
|
|
}
|
|
|
|
/**
|
|
* Returns the property value for an Unicode property type of a code point. Also
|
|
* returns binary and mask property values.
|
|
* </p>
|
|
* <p>
|
|
* Unicode, especially in version 3.2, defines many more properties than the
|
|
* original set in UnicodeData.txt.
|
|
* </p>
|
|
* <p>
|
|
* The properties APIs are intended to reflect Unicode properties as defined in
|
|
* the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). For
|
|
* details about the properties see http://www.unicode.org/.
|
|
* </p>
|
|
* <p>
|
|
* For names of Unicode properties see the UCD file PropertyAliases.txt.
|
|
* </p>
|
|
*
|
|
* <pre>
|
|
* Sample usage:
|
|
* int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
|
|
* int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
|
|
* boolean b = (ideo == 1) ? true : false;
|
|
* </pre>
|
|
*
|
|
* @param ch code point to test.
|
|
* @param type UProperty selector constant, identifies which binary property to
|
|
* check. Must be UProperty.BINARY_START <= type <
|
|
* UProperty.BINARY_LIMIT or UProperty.INT_START <= type <
|
|
* UProperty.INT_LIMIT or UProperty.MASK_START <= type <
|
|
* UProperty.MASK_LIMIT.
|
|
* @return numeric value that is directly the property value or, for enumerated
|
|
* properties, corresponds to the numeric value of the enumerated
|
|
* constant of the respective property value enumeration type (cast to
|
|
* enum type if necessary). Returns 0 or 1 (for false / true) for binary
|
|
* Unicode properties. Returns a bit-mask for mask properties. Returns 0
|
|
* if 'type' is out of bounds or if the Unicode version does not have
|
|
* data for the property at all, or not for this code point.
|
|
* @see UProperty
|
|
* @see #hasBinaryProperty
|
|
* @see #getIntPropertyMinValue
|
|
* @see #getIntPropertyMaxValue
|
|
* @see #getUnicodeVersion
|
|
* @stable ICU 2.4
|
|
*/
|
|
// for BiDiBase.java
|
|
public static int getIntPropertyValue(int ch, int type) {
|
|
return UCharacterProperty.INSTANCE.getIntPropertyValue(ch, type);
|
|
}
|
|
|
|
// private constructor -----------------------------------------------
|
|
|
|
/**
 * Private constructor: this class is never instantiated, all of its members
 * are static.
 */
private UCharacter() {
    // intentionally empty
}
|
|
|
|
/*
|
|
* Copied from UCharacterEnums.java
|
|
*/
|
|
|
|
/**
|
|
* Character type Mn
|
|
*
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static final byte NON_SPACING_MARK = 6;
|
|
/**
|
|
* Character type Me
|
|
*
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static final byte ENCLOSING_MARK = 7;
|
|
/**
|
|
* Character type Mc
|
|
*
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static final byte COMBINING_SPACING_MARK = 8;
|
|
/**
|
|
* Character type count
|
|
*
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static final byte CHAR_CATEGORY_COUNT = 30;
|
|
|
|
/**
|
|
* Directional type R
|
|
*
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static final int RIGHT_TO_LEFT = 1;
|
|
/**
|
|
* Directional type AL
|
|
*
|
|
* @stable ICU 2.1
|
|
*/
|
|
public static final int RIGHT_TO_LEFT_ARABIC = 13;
|
|
}
|