/*
 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * Copyright (C) 2003-2004, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
//
// CHANGELOG
//      2005-05-19 Edward Wang
//          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
//          - move from package com.ibm.icu.text to package sun.net.idn
//          - use ParseException instead of StringPrepParseException
//      2007-08-14 Martin Buchholz
//          - remove redundant casts
//
package jdk_internal.icu.impl;

import java.text.ParseException;

import jdk_internal.icu.lang.UCharacter;
import jdk_internal.icu.text.UTF16;

/**
 * Punycode encoder and decoder, ported from ICU punycode.c.
 *
 * <p>All members are static; the class holds no state.
 *
 * @author ram
 */
public final class Punycode {

    /* Punycode parameters for Bootstring */
    private static final int BASE = 36;
    private static final int TMIN = 1;
    private static final int TMAX = 26;
    private static final int SKEW = 38;
    private static final int DAMP = 700;
    private static final int INITIAL_BIAS = 72;
    private static final int INITIAL_N = 0x80;

    /* "Basic" Unicode/ASCII code points */
    private static final int HYPHEN = 0x2d;
    private static final int DELIMITER = HYPHEN;

    private static final int ZERO = 0x30;
    private static final int NINE = 0x39;

    private static final int SMALL_A = 0x61;
    private static final int SMALL_Z = 0x7a;

    private static final int CAPITAL_A = 0x41;
    private static final int CAPITAL_Z = 0x5a;

    /* Maximum number of input code points accepted by encode(). */
    // TODO: eliminate the 256 limitation
    private static final int MAX_CP_COUNT = 256;

    private static final int UINT_MAGIC = 0x80000000;
    private static final long ULONG_MAGIC = 0x8000000000000000L;

    /**
     * Bootstring bias adaptation: computes the bias to use for the next
     * variable-length integer from the delta that was just coded.
     *
     * @param delta     the delta that was just encoded or decoded
     * @param length    the number of code points handled so far, including the
     *                  current one; callers always pass a value >= 1
     * @param firstTime true for the first delta of a string, which is scaled by
     *                  DAMP instead of being halved
     * @return the new bias
     */
    private static int adaptBias(int delta, int length, boolean firstTime) {
        if (firstTime) {
            delta /= DAMP;
        } else {
            delta /= 2;
        }
        delta += delta / length;

        int count = 0;
        for (; delta > ((BASE - TMIN) * TMAX) / 2; count += BASE) {
            delta /= (BASE - TMIN);
        }

        return count + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
    }

    /**
     * basicToDigit[] contains the numeric value of a basic code point (for use in
     * representing integers) in the range 0 to BASE-1, or -1 if the code point
     * does not represent a value. The table has 256 entries; letters of either
     * case map to 0..25 and the digits '0'..'9' map to 26..35.
     */
    static final int[] basicToDigit = new int[] {
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };

    /**
     * Forces the case of an ASCII letter; any other character is returned
     * unchanged.
     *
     * @param b         the character to map
     * @param uppercase true to map a..z to A..Z, false to map A..Z to a..z
     * @return the case-mapped character
     */
    private static char asciiCaseMap(char b, boolean uppercase) {
        if (uppercase) {
            if (SMALL_A <= b && b <= SMALL_Z) {
                b -= (SMALL_A - CAPITAL_A);
            }
        } else {
            if (CAPITAL_A <= b && b <= CAPITAL_Z) {
                b += (SMALL_A - CAPITAL_A);
            }
        }
        return b;
    }

    /**
     * digitToBasic() returns the basic code point whose value (when used for
     * representing integers) is d, which must be in the range 0 to BASE-1. The
     * lowercase form is used unless the uppercase flag is set, in which case
     * the uppercase form is used.
     */
    private static char digitToBasic(int digit, boolean uppercase) {
        /* 0..25 map to ASCII a..z or A..Z */
        /* 26..35 map to ASCII 0..9 */
        if (digit < 26) {
            if (uppercase) {
                return (char) (CAPITAL_A + digit);
            } else {
                return (char) (SMALL_A + digit);
            }
        } else {
            return (char) ((ZERO - 26) + digit);
        }
    }

    /**
     * Converts Unicode to Punycode. The input string must not contain single,
     * unpaired surrogates. The output consists of ASCII code points only.
     *
     * <p>The input may hold at most MAX_CP_COUNT (256) code points. The output is
     * collected in a growable buffer, so it is never truncated, however many
     * digits the encoding needs.
     *
     * @param src       the UTF-16 text to encode
     * @param caseFlags optional (may be null) flags indexed by the char index in
     *                  {@code src}; when given it must have at least
     *                  {@code src.length()} elements. A true flag upper-cases a
     *                  basic character, or the last digit written for a
     *                  non-basic code point; a false flag lower-cases a basic
     *                  character.
     * @return a new buffer holding the Punycode form of {@code src}
     * @throws ParseException if {@code src} has more than 256 code points or
     *                        contains an unpaired surrogate
     */
    public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException {
        int srcLength = src.length();
        int[] cpBuffer = new int[MAX_CP_COUNT];
        StringBuffer dest = new StringBuffer(srcLength);

        /*
         * Handle the basic code points and convert extended ones to UTF-32 in
         * cpBuffer (caseFlag in sign bit). Basic code points are stored as 0.
         */
        int srcCPCount = 0;
        for (int j = 0; j < srcLength; ++j) {
            if (srcCPCount == MAX_CP_COUNT) {
                /* too many input code points */
                throw new ParseException("Too many input code points", -1);
            }
            char c = src.charAt(j);
            if (isBasic(c)) {
                cpBuffer[srcCPCount++] = 0;
                dest.append(caseFlags != null ? asciiCaseMap(c, caseFlags[j]) : c);
            } else {
                int cp = ((caseFlags != null && caseFlags[j]) ? 1 : 0) << 31;
                if (!UTF16.isSurrogate(c)) {
                    cp |= c;
                } else if (UTF16.isLeadSurrogate(c) && (j + 1) < srcLength
                        && UTF16.isTrailSurrogate(src.charAt(j + 1))) {
                    ++j;
                    cp |= UCharacter.getCodePoint(c, src.charAt(j));
                } else {
                    /* error: unmatched surrogate */
                    throw new ParseException("Illegal char found", -1);
                }
                cpBuffer[srcCPCount++] = cp;
            }
        }

        /* Finish the basic string - if it is not empty - with a delimiter. */
        int basicLength = dest.length();
        if (basicLength > 0) {
            /* the cast matters: append(int) would write the decimal text "45" */
            dest.append((char) DELIMITER);
        }

        /*
         * handledCPCount is the number of code points that have been handled,
         * basicLength is the number of basic code points.
         */

        /* Initialize the state: */
        int n = INITIAL_N;
        int delta = 0;
        int bias = INITIAL_BIAS;

        /* Main encoding loop: */
        for (int handledCPCount = basicLength; handledCPCount < srcCPCount; /* no op */) {
            /*
             * All non-basic code points < n have been handled already. Find the
             * next larger one:
             */
            int m = 0x7fffffff;
            for (int j = 0; j < srcCPCount; ++j) {
                int q = cpBuffer[j] & 0x7fffffff; /* remove case flag from the sign bit */
                if (n <= q && q < m) {
                    m = q;
                }
            }

            /*
             * Increase delta enough to advance the decoder's <n,i> state to
             * <m,0>, but guard against overflow:
             */
            if (m - n > (0x7fffffff - MAX_CP_COUNT - delta) / (handledCPCount + 1)) {
                throw new RuntimeException("Internal program error");
            }
            delta += (m - n) * (handledCPCount + 1);
            n = m;

            /* Encode a sequence of same code points n */
            for (int j = 0; j < srcCPCount; ++j) {
                int q = cpBuffer[j] & 0x7fffffff; /* remove case flag from the sign bit */
                if (q < n) {
                    ++delta;
                } else if (q == n) {
                    /* Represent delta as a generalized variable-length integer: */
                    q = delta;
                    for (int k = BASE; /* no condition */; k += BASE) {
                        /*
                         * RAM: for conformance with draft-ietf-idn-punycode-03.txt
                         * the upper clamp tests k >= bias + TMAX; the older code
                         * tested t > TMAX instead.
                         */
                        int t = k - bias;
                        if (t < TMIN) {
                            t = TMIN;
                        } else if (k >= (bias + TMAX)) {
                            t = TMAX;
                        }

                        if (q < t) {
                            break;
                        }

                        dest.append(digitToBasic(t + (q - t) % (BASE - t), false));
                        q = (q - t) / (BASE - t);
                    }

                    /* the last digit carries the case flag of the code point */
                    dest.append(digitToBasic(q, (cpBuffer[j] < 0)));
                    bias = adaptBias(delta, handledCPCount + 1, (handledCPCount == basicLength));
                    delta = 0;
                    ++handledCPCount;
                }
            }

            ++delta;
            ++n;
        }

        return dest;
    }

    /** Returns true if ch is a basic code point, i.e. below INITIAL_N (0x80). */
    private static boolean isBasic(int ch) {
        return (ch < INITIAL_N);
    }

    /** Returns true if ch is an ASCII uppercase letter A..Z. */
    private static boolean isBasicUpperCase(int ch) {
        return (CAPITAL_A <= ch && ch <= CAPITAL_Z);
    }

    /** Returns true if ch lies in the surrogate range U+D800..U+DFFF. */
    private static boolean isSurrogate(int ch) {
        return (((ch) & 0xfffff800) == 0xd800);
    }

    /**
     * Converts Punycode to Unicode. The Unicode string will have at most as many
     * code points as the Punycode string has characters.
     *
     * @param src       the Punycode text to decode
     * @param caseFlags optional (may be null) array that receives, per output
     *                  UTF-16 code unit, whether the input character that
     *                  produced it was an uppercase ASCII letter. Its length is
     *                  not checked here, so when given it must be at least as
     *                  long as the decoded result.
     * @return a new buffer holding the decoded UTF-16 text
     * @throws ParseException if {@code src} contains a character that is not a
     *                        valid basic code point or digit, ends in the middle
     *                        of a variable-length integer, or decodes to an
     *                        out-of-range or surrogate code point
     */
    public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) throws ParseException {
        int srcLength = src.length();
        StringBuffer result = new StringBuffer();

        /*
         * Handle the basic code points: Let basicLength be the number of input code
         * points before the last delimiter, or 0 if there is none, then copy the
         * first basicLength code points to the output.
         *
         * The two following loops iterate backward.
         */
        int j;
        for (j = srcLength; j > 0;) {
            if (src.charAt(--j) == DELIMITER) {
                break;
            }
        }
        int basicLength = j;
        int destLength = j;
        int destCPCount = j;

        /*
         * Every input char after the basic part is a digit and every decoded code
         * point consumes at least one digit, so the output holds at most
         * (srcLength - basicLength) further code points of up to two UTF-16 code
         * units each. Sizing the buffer to that bound means no insertion is ever
         * skipped.
         */
        long maxDestLength = basicLength + 2L * (srcLength - basicLength);
        if (maxDestLength > Integer.MAX_VALUE) {
            throw new ParseException("Input too long", -1);
        }
        char[] dest = new char[(int) maxDestLength];

        while (j > 0) {
            char b = src.charAt(--j);
            if (!isBasic(b)) {
                throw new ParseException("Illegal char found", -1);
            }
            dest[j] = b;
            if (caseFlags != null) {
                caseFlags[j] = isBasicUpperCase(b);
            }
        }

        /* Initialize the state: */
        int n = INITIAL_N;
        int i = 0;
        int bias = INITIAL_BIAS;
        int firstSupplementaryIndex = 1000000000;

        /*
         * Main decoding loop: Start just after the last delimiter if any basic code
         * points were copied; start at the beginning otherwise.
         */
        for (int in = basicLength > 0 ? basicLength + 1 : 0; in < srcLength; /* no op */) {
            /*
             * in is the index of the next character to be consumed, and destCPCount
             * is the number of code points in the output array.
             *
             * Decode a generalized variable-length integer into delta, which gets
             * added to i. The overflow checking is easier if we increase i as we
             * go, then subtract off its starting value at the end to obtain delta.
             */
            int oldi = i;
            int w = 1;
            for (int k = BASE; /* no condition */; k += BASE) {
                if (in >= srcLength) {
                    throw new ParseException("Illegal char found", -1);
                }

                /*
                 * Only basic code points can be digits. Index the table with the
                 * char itself: narrowing it to byte would give a negative index
                 * for some chars and alias others onto ASCII digits.
                 */
                char digitChar = src.charAt(in++);
                int digit = isBasic(digitChar) ? basicToDigit[digitChar] : -1;
                if (digit < 0) {
                    throw new ParseException("Invalid char found", -1);
                }
                if (digit > (0x7fffffff - i) / w) {
                    /* integer overflow */
                    throw new ParseException("Illegal char found", -1);
                }

                i += digit * w;
                int t = k - bias;
                if (t < TMIN) {
                    t = TMIN;
                } else if (k >= (bias + TMAX)) {
                    t = TMAX;
                }
                if (digit < t) {
                    break;
                }

                if (w > 0x7fffffff / (BASE - t)) {
                    /* integer overflow */
                    throw new ParseException("Illegal char found", -1);
                }
                w *= BASE - t;
            }

            /*
             * Modification from sample code: Increments destCPCount here, where
             * needed instead of in for() loop tail.
             */
            ++destCPCount;
            bias = adaptBias(i - oldi, destCPCount, (oldi == 0));

            /*
             * i was supposed to wrap around from (incremented) destCPCount to 0,
             * incrementing n each time, so we'll fix that now:
             */
            if (i / destCPCount > (0x7fffffff - n)) {
                /* integer overflow */
                throw new ParseException("Illegal char found", -1);
            }

            n += i / destCPCount;
            i %= destCPCount;
            /* not needed for Punycode: */
            /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

            if (n > 0x10ffff || isSurrogate(n)) {
                /* Unicode code point overflow */
                throw new ParseException("Illegal char found", -1);
            }

            /* Insert n at position i of the output: */
            int cpLength = UTF16.getCharCount(n);
            int codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i and
             * even in the entire string. This is handled with the same efficiency
             * as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled more
             * slowly - but not too bad since this is an insertion anyway.
             */
            if (i <= firstSupplementaryIndex) {
                codeUnitIndex = i;
                if (cpLength > 1) {
                    firstSupplementaryIndex = codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                codeUnitIndex = firstSupplementaryIndex;
                codeUnitIndex = UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex,
                        i - codeUnitIndex);
            }

            /* use the UChar index codeUnitIndex instead of the code point index i */
            if (codeUnitIndex < destLength) {
                /* shift the tail (and its case flags) right to open a gap */
                System.arraycopy(dest, codeUnitIndex, dest, codeUnitIndex + cpLength,
                        (destLength - codeUnitIndex));
                if (caseFlags != null) {
                    System.arraycopy(caseFlags, codeUnitIndex, caseFlags, codeUnitIndex + cpLength,
                            destLength - codeUnitIndex);
                }
            }
            if (cpLength == 1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex] = (char) n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex] = UTF16.getLeadSurrogate(n);
                dest[codeUnitIndex + 1] = UTF16.getTrailSurrogate(n);
            }
            if (caseFlags != null) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex] = isBasicUpperCase(src.charAt(in - 1));
                if (cpLength == 2) {
                    caseFlags[codeUnitIndex + 1] = false;
                }
            }

            destLength += cpLength;
            ++i;
        }

        result.append(dest, 0, destLength);
        return result;
    }
}