/* * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /* ******************************************************************************* * Copyright (C) 1996-2014, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ package jdk_internal.icu.impl; import java.io.IOException; import java.nio.ByteBuffer; import java.util.Iterator; import jdk_internal.icu.lang.UCharacter.HangulSyllableType; import jdk_internal.icu.lang.UCharacter.NumericType; import jdk_internal.icu.text.UTF16; import jdk_internal.icu.text.UnicodeSet; import jdk_internal.icu.util.VersionInfo; /** *

* Internal class used for Unicode character property database. *

*

* This class stores binary data read from uprops.icu. It does not have the * capability to parse the data into more high-level information. It only * returns bytes of information when required. *

*

* Due to the form most commonly used for retrieval, array of char is used to * store the binary data. *

*

* UCharacterProperty also contains information on accessing indexes to * significant points in the binary data. *

*

* Responsibility for molding the binary data into a more meaningful form lies with * UCharacter. *

* * @author Syn Wee Quek * @since release 2.1, february 1st 2002 */ public final class UCharacterProperty { // public data members ----------------------------------------------- /* * public singleton instance */ public static final UCharacterProperty INSTANCE; /** * Trie data */ public Trie2_16 m_trie_; /** * Unicode version */ public VersionInfo m_unicodeVersion_; /** * Character type mask */ public static final int TYPE_MASK = 0x1F; // uprops.h enum UPropertySource --------------------------------------- *** /** From uchar.c/uprops.icu main trie */ public static final int SRC_CHAR = 1; /** From uchar.c/uprops.icu properties vectors trie */ public static final int SRC_PROPSVEC = 2; /** From ubidi_props.c/ubidi.icu */ public static final int SRC_BIDI = 5; /** From normalizer2impl.cpp/nfc.nrm */ public static final int SRC_NFC = 8; /** From normalizer2impl.cpp/nfkc.nrm */ public static final int SRC_NFKC = 9; // public methods ---------------------------------------------------- /** * Gets the main property value for code point ch. * * @param ch code point whose property value is to be retrieved * @return property value of code point */ public final int getProperty(int ch) { return m_trie_.get(ch); } /** * Gets the unicode additional properties. Java version of C * u_getUnicodeProperties(). * * @param codepoint codepoint whose additional properties is to be retrieved * @param column The column index. * @return unicode properties */ public int getAdditional(int codepoint, int column) { assert column >= 0; if (column >= m_additionalColumnsCount_) { return 0; } return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; } /** *

* Get the "age" of the code point. *

*

* The "age" is the Unicode version when the code point was first designated (as * a non-character or for Private Use) or assigned a character. *

*

* This can be useful to avoid emitting code points to receiving processes that * do not accept newer characters. *

*

* The data is from the UCD file DerivedAge.txt. *

*

* This API does not check the validity of the codepoint. *

* * @param codepoint The code point. * @return the Unicode version number */ public VersionInfo getAge(int codepoint) { int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; return VersionInfo.getInstance((version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, version & LAST_NIBBLE_MASK_, 0, 0); } // int-value and enumerated properties --------------------------------- *** public int getType(int c) { return getProperty(c) & TYPE_MASK; } /* * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. * Hangul_Syllable_Type is fully redundant with a subset of * Grapheme_Cluster_Break. */ private static final int /* UHangulSyllableType */ gcbToHst[] = { HangulSyllableType.NOT_APPLICABLE, /* * U_GCB_OTHER */ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */ HangulSyllableType.LEADING_JAMO, /* U_GCB_L */ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */ HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */ HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */ HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */ HangulSyllableType.VOWEL_JAMO /* U_GCB_V */ /* * Omit GCB values beyond what we need for hst. The code below checks for the * array length. 
*/ }; private class IntProperty { int column; // SRC_PROPSVEC column, or "source" if mask==0 int mask; int shift; IntProperty(int column, int mask, int shift) { this.column = column; this.mask = mask; this.shift = shift; } IntProperty(int source) { this.column = source; this.mask = 0; } int getValue(int c) { // systematic, directly stored properties return (getAdditional(c, column) & mask) >>> shift; } } private class BiDiIntProperty extends IntProperty { BiDiIntProperty() { super(SRC_BIDI); } } private class CombiningClassIntProperty extends IntProperty { CombiningClassIntProperty(int source) { super(source); } } private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties int which; int max; NormQuickCheckIntProperty(int source, int which, int max) { super(source); this.which = which; this.max = max; } } private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE int getValue(int c) { return UBiDiProps.INSTANCE.getPairedBracketType(c); } }; public int getIntPropertyValue(int c, int which) { if (which == BIDI_PAIRED_BRACKET_TYPE) { return intProp.getValue(c); } return 0; // undefined } /** * Forms a supplementary code point from the argument character
* Note this is for internal use hence no checks for the validity of the * surrogate characters are done * * @param lead lead surrogate character * @param trail trailing surrogate character * @return code point of the supplementary character */ public static int getRawSupplementary(char lead, char trail) { return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; } /** * Gets the type mask * * @param type character type * @return mask */ public static final int getMask(int type) { return 1 << type; } /** * Returns the digit values of characters like 'A' - 'Z', normal, half-width and * full-width. This method assumes that the other digit characters are checked * by the calling method. * * @param ch character to test * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise its * corresponding digit will be returned. */ public static int getEuropeanDigit(int ch) { if ((ch > 0x7a && ch < 0xff21) || ch < 0x41 || (ch > 0x5a && ch < 0x61) || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { return -1; } if (ch <= 0x7a) { // ch >= 0x41 or ch < 0x61 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); } // ch >= 0xff21 if (ch <= 0xff3a) { return ch + 10 - 0xff21; } // ch >= 0xff41 && ch <= 0xff5a return ch + 10 - 0xff41; } public int digit(int c) { int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; if (value <= 9) { return value; } else { return -1; } } // protected variables ----------------------------------------------- /** * Extra property trie */ Trie2_16 m_additionalTrie_; /** * Extra property vectors, 1st column for age and second for binary properties. 
*/ int m_additionalVectors_[]; /** * Number of additional columns */ int m_additionalColumnsCount_; /** * Maximum values for block, bits used as in vector word 0 */ int m_maxBlockScriptValue_; /** * Maximum values for script, bits used as in vector word 0 */ int m_maxJTGValue_; /** * Script_Extensions data */ public char[] m_scriptExtensions_; // private variables ------------------------------------------------- /** * Default name of the datafile */ @SuppressWarnings("deprecation") private static final String DATA_FILE_NAME_ = "/assets/eagler/icudt/uprops.icu"; /** * Shift value for lead surrogate to form a supplementary character. */ private static final int LEAD_SURROGATE_SHIFT_ = 10; /** * Offset to add to combined surrogate pair to avoid masking. */ private static final int SURROGATE_OFFSET_ = UTF16.SUPPLEMENTARY_MIN_VALUE - (UTF16.SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_) - UTF16.TRAIL_SURROGATE_MIN_VALUE; // property data constants ------------------------------------------------- /** * Numeric types and values in the main properties words. */ private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; private static final int getNumericTypeValue(int props) { return props >> NUMERIC_TYPE_VALUE_SHIFT_; } /* constants for the storage form of numeric types and values */ /** No numeric value. */ private static final int NTV_NONE_ = 0; /** Decimal digits: nv=0..9 */ private static final int NTV_DECIMAL_START_ = 1; /** Other digits: nv=0..9 */ private static final int NTV_DIGIT_START_ = 11; /** Small integers: nv=0..154 */ private static final int NTV_NUMERIC_START_ = 21; private static final int ntvGetType(int ntv) { return (ntv == NTV_NONE_) ? NumericType.NONE : (ntv < NTV_DIGIT_START_) ? NumericType.DECIMAL : (ntv < NTV_NUMERIC_START_) ? 
NumericType.DIGIT : NumericType.NUMERIC; } /* * Properties in vector word 0 Bits 31..24 DerivedAge version major/minor one * nibble each 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index 3: * Script value from Script_Extensions 2: Script=Inherited 1: Script=Common 0: * Script=bits 21..20 & 7..0 21..20 Bits 9..8 of the UScriptCode, or index to * Script_Extensions 19..17 East Asian Width 16.. 8 UBlockCode 7.. 0 * UScriptCode, or index to Script_Extensions */ /** * Script_Extensions: mask includes Script */ public static final int SCRIPT_X_MASK = 0x00f000ff; // private static final int SCRIPT_X_SHIFT = 22; // The UScriptCode or Script_Extensions index is split across two bit fields. // (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.) // Shift the high bits right by 12 to assemble the full value. public static final int SCRIPT_HIGH_MASK = 0x00300000; public static final int SCRIPT_HIGH_SHIFT = 12; public static final int MAX_SCRIPT = 0x3ff; /** * Integer properties mask and shift values for East Asian cell width. * Equivalent to icu4c UPROPS_EA_MASK */ private static final int EAST_ASIAN_MASK_ = 0x000e0000; /** * Integer properties mask and shift values for East Asian cell width. * Equivalent to icu4c UPROPS_EA_SHIFT */ private static final int EAST_ASIAN_SHIFT_ = 17; /** * Integer properties mask and shift values for blocks. Equivalent to icu4c * UPROPS_BLOCK_MASK */ private static final int BLOCK_MASK_ = 0x0001ff00; /** * Integer properties mask and shift values for blocks. Equivalent to icu4c * UPROPS_BLOCK_SHIFT */ private static final int BLOCK_SHIFT_ = 8; /** * Integer properties mask and shift values for scripts. Equivalent to icu4c * UPROPS_SHIFT_LOW_MASK. 
*/ public static final int SCRIPT_LOW_MASK = 0x000000ff; /** Combines the two script bit fields of scriptX: the bits selected by SCRIPT_HIGH_MASK shifted right by SCRIPT_HIGH_SHIFT, OR-ed with the bits selected by SCRIPT_LOW_MASK. */ public static final int mergeScriptCodeOrIndex(int scriptX) { return ((scriptX & SCRIPT_HIGH_MASK) >> SCRIPT_HIGH_SHIFT) | (scriptX & SCRIPT_LOW_MASK); } /* NOTE(review): the text following "use 1<" in the next comment looks truncated - the rest of that comment, whatever declarations followed it, and the beginning of the constructor (everything before the first "expectedTrieLength" check) appear to be missing from this file; confirm against the upstream source before relying on this section. */ /** * Additional properties used in internal trie data */ /* * Properties in vector word 1 Each bit encodes one binary property. The * following constants represent the bit number, use 1< expectedTrieLength) { throw new IOException("uprops.icu: not enough bytes for main trie"); } // skip padding after trie bytes ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); // skip unused intervening data structures ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); if (m_additionalColumnsCount_ > 0) { // reads the additional property block m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); expectedTrieLength = (additionalVectorsOffset - additionalOffset) * 4; trieLength = m_additionalTrie_.getSerializedLength(); if (trieLength > expectedTrieLength) { throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); } // skip padding after trie bytes ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); // additional properties int size = scriptExtensionsOffset - additionalVectorsOffset; m_additionalVectors_ = new int[size]; for (int i = 0; i < size; i++) { m_additionalVectors_[i] = bytes.getInt(); } } // Script_Extensions int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; if (numChars > 0) { m_scriptExtensions_ = new char[numChars]; for (int i = 0; i < numChars; ++i) { m_scriptExtensions_[i] = bytes.getChar(); } } } private static final class IsAcceptable implements ICUBinary.Authenticate { // @Override when we switch to Java 6 public boolean isDataVersionAcceptable(byte version[]) { return version[0] == 7; } } private static final int DATA_FORMAT = 0x5550726F; // "UPro" public void upropsvec_addPropertyStarts(UnicodeSet set) { /* * add the start code point of each same-value range of the 
properties vectors trie */
        if (m_additionalColumnsCount_ > 0) {
            /*
             * if m_additionalColumnsCount_==0 then the properties vectors trie may not be
             * there at all
             */
            // typed iterator: a raw Iterator would hand back Object from next()
            Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
            Trie2.Range range;
            // stop at the first range flagged as lead-surrogate data
            while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
                set.add(range.startCodePoint);
            }
        }
    }

    // This static initializer block must be placed after
    // other static member initialization
    static {
        try {
            INSTANCE = new UCharacterProperty();
        } catch (IOException e) {
            // same message as before; the IOException is now chained as the cause
            // so the original stack trace is not lost
            throw new RuntimeException(
                    "Missing resource: \"" + DATA_FILE_NAME_ + "\"; Reason: " + e.getMessage(), e);
        }
    }

    // Moved from UProperty.java
    /**
     * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). Used in
     * UAX #9: Unicode Bidirectional Algorithm (http://www.unicode.org/reports/tr9/)
     * Returns UCharacter.BidiPairedBracketType values.
     *
     * @stable ICU 52
     */
    public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
}