/* * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ /** ******************************************************************************* * Copyright (C) 1996-2014, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ package jdk_internal.icu.text; import jdk_internal.icu.impl.UCharacterProperty; /** *
* Standalone utility class providing UTF16 character conversions and indexing * conversions. *
 * Code that uses strings alone rarely needs modification. By design, UTF-16 does
 * not allow overlap, so searching for strings is a safe operation. Similarly,
 * concatenation is always safe. Substringing is safe if the start and end are
 * both on UTF-32 boundaries. In normal code, the values for start and end are
 * on those boundaries, since they arose from operations like searching. If not,
 * the nearest UTF-32 boundaries can be determined using {@code bounds()}.
* Examples:
*
* The following examples illustrate use of some of these methods. * *
 * <pre>{@code
 * // iteration forwards: Original
 * for (int i = 0; i < s.length(); ++i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration forwards: Changes for UTF-32
 * int ch;
 * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Original
 * for (int i = s.length() - 1; i >= 0; --i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Changes for UTF-32
 * int ch;
 * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 * }</pre>
 *
 * Notes:
 *
Lead
and Trail
in the API, which gives a better
* sense of their ordering in a string. offset16
and
* offset32
are used to distinguish offsets to UTF-16 boundaries vs
* offsets to UTF-32 boundaries. int char32
is used to contain
* UTF-32 characters, as opposed to char16
, which is a UTF-16 code
* unit.bounds(string, offset16) != TRAIL
.UCharacter.isLegal()
can be used to check for validity
* if desired.UTF16.getCharCount()
, as well as random access.
* If a validity check is required, use
*
* UCharacter.isLegal()
on the return value. If the char retrieved is
* part of a surrogate pair, its supplementary character will be returned. If a
* complete supplementary character is not found the incomplete character will
* be returned
*
* @param source array of UTF-16 chars
* @param offset16 UTF-16 offset to the start of the character.
* @return UTF-32 value for the UTF-32 value that contains the char at offset16.
* The boundaries of that codepoint are the same as in
* bounds32()
.
* @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
* @stable ICU 2.1
*/
public static int charAt(String source, int offset16) {
    final char unit = source.charAt(offset16);
    // Code units below LEAD_SURROGATE_MIN_VALUE are returned unchanged;
    // every other value is resolved by the _charAt helper.
    return (unit < LEAD_SURROGATE_MIN_VALUE) ? unit : _charAt(source, offset16, unit);
}
/**
 * Resolves the code point that contains {@code single}, combining it with a
 * neighbouring surrogate when a matching one is found.
 * <p>
 * The visible caller only invokes this helper when {@code single} is at
 * least {@code LEAD_SURROGATE_MIN_VALUE}.
 *
 * @param source   string the code unit was read from
 * @param offset16 index of {@code single} within {@code source}
 * @param single   the code unit found at {@code offset16}
 * @return the combined supplementary value, or {@code single} itself when it
 *         is above the trail-surrogate range or has no matching partner
 */
private static int _charAt(String source, int offset16, char single) {
    if (single > TRAIL_SURROGATE_MAX_VALUE) {
        return single;
    }

    if (single <= LEAD_SURROGATE_MAX_VALUE) {
        // Lead-surrogate range: look one unit ahead for the trail half.
        final int trailIndex = offset16 + 1;
        if (trailIndex != source.length()) {
            final char trail = source.charAt(trailIndex);
            if (TRAIL_SURROGATE_MIN_VALUE <= trail && trail <= TRAIL_SURROGATE_MAX_VALUE) {
                return UCharacterProperty.getRawSupplementary(single, trail);
            }
        }
        return single; // unmatched lead surrogate
    }

    // Trail-surrogate range: look one unit back for the lead half.
    final int leadIndex = offset16 - 1;
    if (leadIndex >= 0) {
        final char lead = source.charAt(leadIndex);
        if (LEAD_SURROGATE_MIN_VALUE <= lead && lead <= LEAD_SURROGATE_MAX_VALUE) {
            return UCharacterProperty.getRawSupplementary(lead, single);
        }
    }
    return single; // unmatched trail surrogate
}
/**
 * Extract a single UTF-32 value from a string. Used when iterating forwards or
 * backwards (with {@code UTF16.getCharCount()}), as well as random access.
 * If a validity check is required, use {@code UCharacter.isLegal()} on the
 * return value. If the char retrieved is part of a surrogate pair, its
 * supplementary character will be returned. If a complete supplementary
 * character is not found, the incomplete character will be returned.
 *
 * @param source sequence of UTF-16 chars
 * @param offset16 UTF-16 offset to the start of the character.
 * @return UTF-32 value of the code point that contains the char at offset16.
 *         The boundaries of that codepoint are the same as in
 *         {@code bounds32()}.
 * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
 * @stable ICU 2.1
 */
public static int charAt(CharSequence source, int offset16) {
    final char unit = source.charAt(offset16);
    // Code units below LEAD_SURROGATE_MIN_VALUE are returned unchanged;
    // every other value is resolved by the _charAt helper.
    return (unit < UTF16.LEAD_SURROGATE_MIN_VALUE) ? unit : _charAt(source, offset16, unit);
}
/**
 * Resolves the code point that contains {@code single}, combining it with a
 * neighbouring surrogate when a matching one is found.
 * <p>
 * The visible caller only invokes this helper when {@code single} is at
 * least {@code UTF16.LEAD_SURROGATE_MIN_VALUE}.
 *
 * @param source   character sequence the code unit was read from
 * @param offset16 index of {@code single} within {@code source}
 * @param single   the code unit found at {@code offset16}
 * @return the combined supplementary value, or {@code single} itself when it
 *         is above the trail-surrogate range or has no matching partner
 */
private static int _charAt(CharSequence source, int offset16, char single) {
    if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
        return single;
    }

    if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
        // Lead-surrogate range: look one unit ahead for the trail half.
        final int trailIndex = offset16 + 1;
        if (trailIndex != source.length()) {
            final char trail = source.charAt(trailIndex);
            if (UTF16.TRAIL_SURROGATE_MIN_VALUE <= trail && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
                return UCharacterProperty.getRawSupplementary(single, trail);
            }
        }
        return single; // unmatched lead surrogate
    }

    // Trail-surrogate range: look one unit back for the lead half.
    final int leadIndex = offset16 - 1;
    if (leadIndex >= 0) {
        final char lead = source.charAt(leadIndex);
        if (UTF16.LEAD_SURROGATE_MIN_VALUE <= lead && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
            return UCharacterProperty.getRawSupplementary(lead, single);
        }
    }
    return single; // unmatched trail surrogate
}
/**
 * Extract a single UTF-32 value from a substring. Used when iterating forwards
 * or backwards (with {@code UTF16.getCharCount()}), as well as random
 * access. If a validity check is required, use {@code UCharacter.isLegal()}
 * on the return value. If the char retrieved is part of a surrogate pair, its
 * supplementary character will be returned. If a complete supplementary
 * character is not found, the incomplete character will be returned.
 *
 * @param source Array of UTF-16 chars
 * @param start Offset in the source array where the substring to analyze begins
 * @param limit Offset in the source array where the substring to analyze ends
 *              (exclusive)
 * @param offset16 UTF-16 offset relative to start
 * @return UTF-32 value of the code point that contains the char at offset16.
 *         The boundaries of that codepoint are the same as in
 *         {@code bounds32()}.
 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the
 *                                      range of start and limit.
 * @stable ICU 2.1
 */
public static int charAt(char source[], int start, int limit, int offset16) {
    // Translate the substring-relative offset into an index into source.
    final int index = start + offset16;
    if (index < start || index >= limit) {
        throw new ArrayIndexOutOfBoundsException(index);
    }

    final char single = source[index];
    if (!isSurrogate(single)) {
        return single;
    }

    // Pair the surrogate with its neighbour, never looking outside
    // the [start, limit) window.
    if (single <= LEAD_SURROGATE_MAX_VALUE) {
        final int trailIndex = index + 1;
        if (trailIndex < limit) {
            final char trail = source[trailIndex];
            if (isTrailSurrogate(trail)) {
                return UCharacterProperty.getRawSupplementary(single, trail);
            }
        }
        return single; // unmatched lead surrogate
    }

    // single lies above the lead range, so the partner is searched before it.
    if (index > start) {
        final char lead = source[index - 1];
        if (isLeadSurrogate(lead)) {
            return UCharacterProperty.getRawSupplementary(lead, single);
        }
    }
    return single; // unmatched trail surrogate
}
/**
 * Determines how many chars this char32 requires. If a validity check is
 * required, use {@code isLegal()} on char32 before calling.
 *
 * @param char32 the input codepoint.
 * @return 2 if char32 is in supplementary space, otherwise 1.
 * @stable ICU 2.1
 */
public static int getCharCount(int char32) {
    // One char below SUPPLEMENTARY_MIN_VALUE, two chars from there upwards.
    return (char32 < SUPPLEMENTARY_MIN_VALUE) ? 1 : 2;
}
/**
* Determines whether the code value is a surrogate.
*
* @param char16 the input character.
* @return true if the input character is a surrogate.
* @stable ICU 2.1
*/
public static boolean isSurrogate(char char16) {
    // Keep only the bits selected by SURROGATE_BITMASK and compare them
    // with the pattern stored in SURROGATE_BITS.
    return SURROGATE_BITS == (char16 & SURROGATE_BITMASK);
}
/**
* Determines whether the character is a trail surrogate.
*
* @param char16 the input character.
* @return true if the input character is a trail surrogate.
* @stable ICU 2.1
*/
public static boolean isTrailSurrogate(char char16) {
    // Keep only the bits selected by TRAIL_SURROGATE_BITMASK and compare
    // them with the pattern stored in TRAIL_SURROGATE_BITS.
    return TRAIL_SURROGATE_BITS == (char16 & TRAIL_SURROGATE_BITMASK);
}
/**
* Determines whether the character is a lead surrogate.
*
* @param char16 the input character.
* @return true if the input character is a lead surrogate
* @stable ICU 2.1
*/
public static boolean isLeadSurrogate(char char16) {
    // Keep only the bits selected by LEAD_SURROGATE_BITMASK and compare
    // them with the pattern stored in LEAD_SURROGATE_BITS.
    return LEAD_SURROGATE_BITS == (char16 & LEAD_SURROGATE_BITMASK);
}
/**
* Returns the lead surrogate. If a validity check is required, use
* isLegal()
on
* char32 before calling.
*
* @param char32 the input character.
* @return lead surrogate if the getCharCount(ch) is 2; isLegal()
on
* char32 before calling.
*
* @param char32 the input character.
* @return the trail surrogate if the getCharCount(ch) is 2; isLegal()
on
* char32 before calling.
*
* @param char32 the input character.
* @return string value of char32 in UTF16 format
* @exception IllegalArgumentException thrown if char32 is a invalid codepoint.
* @stable ICU 2.1
*/
public static String valueOf(int char32) {
    // Reject anything outside [CODEPOINT_MIN_VALUE, CODEPOINT_MAX_VALUE]
    // before delegating to the unchecked conversion.
    final boolean inRange = char32 >= CODEPOINT_MIN_VALUE && char32 <= CODEPOINT_MAX_VALUE;
    if (!inRange) {
        throw new IllegalArgumentException("Illegal codepoint");
    }
    return toString(char32);
}
/**
 * Append a single UTF-32 value to the end of a StringBuffer. If a validity
 * check is required, use {@code isLegal()} on char32 before calling.
 *
 * @param target the buffer to append to
 * @param char32 value to append.
 * @return the updated StringBuffer
 * @exception IllegalArgumentException thrown when char32 does not lie within
 *                                     the range of the Unicode codepoints
 * @stable ICU 2.1
 */
public static StringBuffer append(StringBuffer target, int char32) {
    // Refuse values outside the code point range.
    final boolean inRange = char32 >= CODEPOINT_MIN_VALUE && char32 <= CODEPOINT_MAX_VALUE;
    if (!inRange) {
        throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
    }

    if (char32 < SUPPLEMENTARY_MIN_VALUE) {
        // Fits in a single UTF-16 code unit.
        target.append((char) char32);
        return target;
    }

    // Supplementary value: emit the lead half, then the trail half.
    target.append(getLeadSurrogate(char32));
    target.append(getTrailSurrogate(char32));
    return target;
}
/**
* Shifts offset16 by the argument number of codepoints within a subarray.
*
* @param source char array
* @param start position of the subarray to be performed on
* @param limit position of the subarray to be performed on
* @param offset16 UTF16 position to shift relative to start
* @param shift32 number of codepoints to shift
* @return new shifted offset16 relative to start
* @exception IndexOutOfBoundsException if the new offset16 is out of bounds
* with respect to the subarray or the
* subarray bounds are out of range.
* @stable ICU 2.1
*/
public static int moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32) {
    final int length = source.length;
    // Absolute position in source; converted back to start-relative on return.
    int index = start + offset16;

    // Validate the subarray first, then the starting offset within it.
    if (limit < start || start < 0) {
        throw new StringIndexOutOfBoundsException(start);
    }
    if (length < limit) {
        throw new StringIndexOutOfBoundsException(limit);
    }
    if (index > limit || offset16 < 0) {
        throw new StringIndexOutOfBoundsException(offset16);
    }

    int remaining; // code points still to be stepped over
    if (shift32 > 0) {
        // Each code point occupies at least one char, so a shift that
        // would run past the end of the array can be rejected up front.
        if (index + shift32 > length) {
            throw new StringIndexOutOfBoundsException(index);
        }
        for (remaining = shift32; remaining > 0 && index < limit; remaining--) {
            final char current = source[index];
            final int next = index + 1;
            // A lead surrogate followed by a trail surrogate inside the
            // subarray counts as a single code point of two chars.
            if (isLeadSurrogate(current) && next < limit && isTrailSurrogate(source[next])) {
                index = next + 1;
            } else {
                index = next;
            }
        }
    } else {
        if (index + shift32 < start) {
            throw new StringIndexOutOfBoundsException(index);
        }
        remaining = -shift32;
        while (remaining > 0) {
            index--;
            if (index < start) {
                // Ran off the front; remaining stays non-zero and is reported below.
                break;
            }
            final char current = source[index];
            // A trail surrogate preceded by a lead surrogate inside the
            // subarray counts as a single code point of two chars.
            if (isTrailSurrogate(current) && index > start && isLeadSurrogate(source[index - 1])) {
                index--;
            }
            remaining--;
        }
    }

    // The subarray was exhausted before the requested shift completed.
    if (remaining != 0) {
        throw new StringIndexOutOfBoundsException(shift32);
    }
    return index - start;
}
// private data members -------------------------------------------------
/**
 * Shift value for a lead surrogate to form a supplementary character.
 * Also used below as the right-shift applied to SUPPLEMENTARY_MIN_VALUE
 * when deriving LEAD_SURROGATE_OFFSET_.
 */
private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
 * Mask to retrieve the significant value from a trail surrogate;
 * 0x3FF selects the low ten bits.
 */
private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
/**
 * Base value for lead surrogates: LEAD_SURROGATE_MIN_VALUE minus
 * SUPPLEMENTARY_MIN_VALUE shifted right by LEAD_SURROGATE_SHIFT_.
 * NOTE(review): presumably added to (char32 >> LEAD_SURROGATE_SHIFT_) to
 * obtain the lead surrogate of a supplementary code point; the consumer is
 * not visible in this part of the file, so confirm against it.
 */
private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE
- (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
// private methods ------------------------------------------------------
/**
 * Converts the argument code point and returns a String object representing
 * the code point's value in UTF16 format.
 * <p>
 * This method does not check the validity of the codepoint; the results are
 * not guaranteed if an invalid codepoint is passed as argument.
 * <p>
 * The result is a string whose length is 1 for non-supplementary code points,
 * 2 otherwise.
 *
 * @param ch code point
 * @return string representation of the code point
 */
private static String toString(int ch) {
    if (ch >= SUPPLEMENTARY_MIN_VALUE) {
        // Supplementary value: lead half first, then the trail half.
        return new StringBuilder()
                .append(getLeadSurrogate(ch))
                .append(getTrailSurrogate(ch))
                .toString();
    }
    return String.valueOf((char) ch);
}
}