punycode.js
  1  'use strict';
  2  
  3  /** Highest positive signed 32-bit float value */
  4  const maxInt = 2147483647; // aka. 0x7FFFFFFF or 2^31-1
  5  
  6  /** Bootstring parameters */
  7  const base = 36;
  8  const tMin = 1;
  9  const tMax = 26;
 10  const skew = 38;
 11  const damp = 700;
 12  const initialBias = 72;
 13  const initialN = 128; // 0x80
 14  const delimiter = '-'; // '\x2D'
 15  
 16  /** Regular expressions */
 17  const regexPunycode = /^xn--/;
 18  const regexNonASCII = /[^\0-\x7E]/; // non-ASCII chars
 19  const regexSeparators = /[\x2E\u3002\uFF0E\uFF61]/g; // RFC 3490 separators
 20  
 21  /** Error messages */
 22  const errors = {
 23  	'overflow': 'Overflow: input needs wider integers to process',
 24  	'not-basic': 'Illegal input >= 0x80 (not a basic code point)',
 25  	'invalid-input': 'Invalid input'
 26  };
 27  
 28  /** Convenience shortcuts */
 29  const baseMinusTMin = base - tMin;
 30  const floor = Math.floor;
 31  const stringFromCharCode = String.fromCharCode;
 32  
 33  /*--------------------------------------------------------------------------*/
 34  
 35  /**
 36   * A generic error utility function.
 37   * @private
 38   * @param {String} type The error type.
 39   * @returns {Error} Throws a `RangeError` with the applicable error message.
 40   */
 41  function error(type) {
 42  	throw new RangeError(errors[type]);
 43  }
 44  
 45  /**
 46   * A generic `Array#map` utility function.
 47   * @private
 48   * @param {Array} array The array to iterate over.
 49   * @param {Function} callback The function that gets called for every array
 50   * item.
 51   * @returns {Array} A new array of values returned by the callback function.
 52   */
 53  function map(array, fn) {
 54  	const result = [];
 55  	let length = array.length;
 56  	while (length--) {
 57  		result[length] = fn(array[length]);
 58  	}
 59  	return result;
 60  }
 61  
 62  /**
 63   * A simple `Array#map`-like wrapper to work with domain name strings or email
 64   * addresses.
 65   * @private
 66   * @param {String} domain The domain name or email address.
 67   * @param {Function} callback The function that gets called for every
 68   * character.
 69   * @returns {Array} A new string of characters returned by the callback
 70   * function.
 71   */
 72  function mapDomain(string, fn) {
 73  	const parts = string.split('@');
 74  	let result = '';
 75  	if (parts.length > 1) {
 76  		// In email addresses, only the domain name should be punycoded. Leave
 77  		// the local part (i.e. everything up to `@`) intact.
 78  		result = parts[0] + '@';
 79  		string = parts[1];
 80  	}
 81  	// Avoid `split(regex)` for IE8 compatibility. See #17.
 82  	string = string.replace(regexSeparators, '\x2E');
 83  	const labels = string.split('.');
 84  	const encoded = map(labels, fn).join('.');
 85  	return result + encoded;
 86  }
 87  
 88  /**
 89   * Creates an array containing the numeric code points of each Unicode
 90   * character in the string. While JavaScript uses UCS-2 internally,
 91   * this function will convert a pair of surrogate halves (each of which
 92   * UCS-2 exposes as separate characters) into a single code point,
 93   * matching UTF-16.
 94   * @see `punycode.ucs2.encode`
 95   * @see <https://mathiasbynens.be/notes/javascript-encoding>
 96   * @memberOf punycode.ucs2
 97   * @name decode
 98   * @param {String} string The Unicode input string (UCS-2).
 99   * @returns {Array} The new array of code points.
100   */
101  function ucs2decode(string) {
102  	const output = [];
103  	let counter = 0;
104  	const length = string.length;
105  	while (counter < length) {
106  		const value = string.charCodeAt(counter++);
107  		if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
108  			// It's a high surrogate, and there is a next character.
109  			const extra = string.charCodeAt(counter++);
110  			if ((extra & 0xFC00) == 0xDC00) { // Low surrogate.
111  				output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000);
112  			} else {
113  				// It's an unmatched surrogate; only append this code unit, in case the
114  				// next code unit is the high surrogate of a surrogate pair.
115  				output.push(value);
116  				counter--;
117  			}
118  		} else {
119  			output.push(value);
120  		}
121  	}
122  	return output;
123  }
124  
/**
 * Creates a string based on an array of numeric code points.
 *
 * The conversion is done in bounded chunks: spreading an arbitrarily large
 * array into a single `String.fromCodePoint()` call can exceed the engine's
 * argument/stack limit and throw "Maximum call stack size exceeded".
 * @see `punycode.ucs2.decode`
 * @memberOf punycode.ucs2
 * @name encode
 * @param {Array} array The array of numeric code points (any iterable of
 * code points is accepted as well).
 * @returns {String} The new Unicode string (UCS-2).
 * @throws {RangeError} If a value is not a valid Unicode code point.
 */
const ucs2encode = array => {
	// Number of code points handed to `String.fromCodePoint()` per call.
	const chunkSize = 0x2000;
	// Keep accepting non-array iterables, as the spread-based form did.
	const codePoints = Array.isArray(array) ? array : Array.from(array);
	let result = '';
	for (let start = 0; start < codePoints.length; start += chunkSize) {
		result += String.fromCodePoint(...codePoints.slice(start, start + chunkSize));
	}
	return result;
};
134  
/**
 * Converts a basic code point into a digit/integer.
 * @see `digitToBasic()`
 * @private
 * @param {Number} codePoint The basic numeric code point value.
 * @returns {Number} The numeric value of a basic code point (for use in
 * representing integers) in the range `0` to `base - 1`, or `base` if
 * the code point does not represent a value.
 */
const basicToDigit = function(codePoint) {
	// Each range needs BOTH bounds. A one-sided test such as
	// `codePoint - 0x30 < 0x0A` also accepts every code point below the range
	// and would turn characters like `!` or `:` into bogus (even negative)
	// digits instead of reporting them as invalid.
	if (codePoint >= 0x30 && codePoint < 0x3A) {
		// ASCII 0..9 map to digits 26..35
		return 26 + (codePoint - 0x30);
	}
	if (codePoint >= 0x41 && codePoint < 0x5B) {
		// ASCII A..Z map to digits 0..25
		return codePoint - 0x41;
	}
	if (codePoint >= 0x61 && codePoint < 0x7B) {
		// ASCII a..z map to digits 0..25
		return codePoint - 0x61;
	}
	return base;
};
156  
/**
 * Maps a digit value to the ASCII code point that represents it.
 * @see `basicToDigit()`
 * @private
 * @param {Number} digit The digit value, expected in the range `0` to
 * `base - 1`.
 * @param {Number} flag Case selector: `0` requests the lowercase form; any
 * value not loosely equal to `0` requests the uppercase form. The result is
 * undefined if uppercase is requested for a digit without an uppercase form.
 * @returns {Number} The basic code point whose digit value is `digit`.
 */
const digitToBasic = function(digit, flag) {
	// Digits 0..25 are letters: 22 + 75 = 97, the code point of `a`.
	// Digits 26..35 are numerals: 26 + 22 = 48, the code point of `0`.
	const letterOffset = digit < 26 ? 75 : 0;
	// Upper- and lowercase ASCII letters are 32 code points apart.
	const caseShift = flag != 0 ? 32 : 0;
	return digit + 22 + letterOffset - caseShift;
};
173  
/**
 * Bias adaptation function as per section 3.4 of RFC 3492.
 * https://tools.ietf.org/html/rfc3492#section-3.4
 * @private
 * @param {Number} delta The delta that was just encoded or decoded.
 * @param {Number} numPoints The number of code points handled so far,
 * including the one this delta belongs to.
 * @param {Boolean} firstTime Truthy for the very first delta, which is
 * scaled by `damp` instead of being halved.
 * @returns {Number} The new bias.
 */
const adapt = function(delta, numPoints, firstTime) {
	let scaled = firstTime ? floor(delta / damp) : delta >> 1;
	scaled += floor(scaled / numPoints);

	// Repeatedly divide until the value fits under the threshold, counting
	// one `base` step per division.
	const threshold = baseMinusTMin * tMax >> 1;
	let k = 0;
	while (scaled > threshold) {
		scaled = floor(scaled / baseMinusTMin);
		k += base;
	}

	return floor(k + (baseMinusTMin + 1) * scaled / (scaled + skew));
};
188  
/**
 * Converts a Punycode string of ASCII-only symbols to a string of Unicode
 * symbols.
 * @memberOf punycode
 * @param {String} input The Punycode string of ASCII-only symbols.
 * @returns {String} The resulting string of Unicode symbols.
 * @throws {RangeError} With the 'not-basic' message if a character before
 * the last delimiter is >= 0x80; with the 'invalid-input' message if the
 * encoded part contains a character that is not a digit or ends in the
 * middle of a number; with the 'overflow' message if a decoded value would
 * exceed `maxInt`.
 */
const decode = function(input) {
	// Don't use UCS-2.
	const output = [];
	const inputLength = input.length;
	let i = 0;
	let n = initialN;
	let bias = initialBias;

	// Handle the basic code points: let `basic` be the number of input code
	// points before the last delimiter, or `0` if there is none, then copy
	// the first basic code points to the output.

	let basic = input.lastIndexOf(delimiter);
	if (basic < 0) {
		basic = 0;
	}

	for (let j = 0; j < basic; ++j) {
		// if it's not a basic code point
		if (input.charCodeAt(j) >= 0x80) {
			error('not-basic');
		}
		output.push(input.charCodeAt(j));
	}

	// Main decoding loop: start just after the last delimiter if any basic code
	// points were copied; start at the beginning otherwise.

	for (let index = basic > 0 ? basic + 1 : 0; index < inputLength; /* no final expression */) {

		// `index` is the index of the next character to be consumed.
		// Decode a generalized variable-length integer into `delta`,
		// which gets added to `i`. The overflow checking is easier
		// if we increase `i` as we go, then subtract off its starting
		// value at the end to obtain `delta`.
		const oldi = i;
		for (let w = 1, k = base; /* no condition */; k += base) {

			// The number is not finished yet, but the input is.
			if (index >= inputLength) {
				error('invalid-input');
			}

			const digit = basicToDigit(input.charCodeAt(index++));

			// A character that is not a digit is malformed input, not an
			// arithmetic overflow; report it as such.
			if (digit >= base) {
				error('invalid-input');
			}
			if (digit > floor((maxInt - i) / w)) {
				error('overflow');
			}

			i += digit * w;
			const t = k <= bias ? tMin : (k >= bias + tMax ? tMax : k - bias);

			// A digit below the threshold terminates the number.
			if (digit < t) {
				break;
			}

			const baseMinusT = base - t;
			if (w > floor(maxInt / baseMinusT)) {
				error('overflow');
			}

			w *= baseMinusT;

		}

		const out = output.length + 1;
		bias = adapt(i - oldi, out, oldi === 0);

		// `i` was supposed to wrap around from `out` to `0`,
		// incrementing `n` each time, so we'll fix that now:
		if (floor(i / out) > maxInt - n) {
			error('overflow');
		}

		n += floor(i / out);
		i %= out;

		// Insert `n` at position `i` of the output.
		output.splice(i++, 0, n);

	}

	return String.fromCodePoint(...output);
};
279  
/**
 * Encodes a string of Unicode symbols (for instance a single domain name
 * label) as a Punycode string made of ASCII symbols only.
 * @memberOf punycode
 * @param {String} input The string of Unicode symbols.
 * @returns {String} The resulting Punycode string of ASCII-only symbols.
 * @throws {RangeError} With the 'overflow' message if a delta would exceed
 * `maxInt`.
 */
const encode = function(input) {
	// Work on whole code points rather than on UCS-2 code units.
	const codePoints = ucs2decode(input);
	const totalCount = codePoints.length;

	// Step 1: copy every basic (< 0x80) code point to the output, in order.
	const output = [];
	for (const codePoint of codePoints) {
		if (codePoint < 0x80) {
			output.push(stringFromCharCode(codePoint));
		}
	}

	// `basicLength` is the number of basic code points; `handledCount` is the
	// number of code points that have been emitted so far.
	const basicLength = output.length;
	let handledCount = basicLength;

	// The delimiter is only written when there is a basic part to close.
	if (basicLength) {
		output.push(delimiter);
	}

	// Encoder state.
	let n = initialN;
	let delta = 0;
	let bias = initialBias;

	// Step 2: emit the non-basic code points in increasing order of value.
	while (handledCount < totalCount) {

		// Everything below `n` is done; pick the smallest code point >= n.
		let m = maxInt;
		for (const codePoint of codePoints) {
			if (codePoint >= n && codePoint < m) {
				m = codePoint;
			}
		}

		// Advance `delta` so the decoder's <n,i> state moves to <m,0>,
		// refusing to go past `maxInt`. Note that this count is taken once
		// per pass and reused for every code point emitted in the pass.
		const numPoints = handledCount + 1;
		if (m - n > floor((maxInt - delta) / numPoints)) {
			error('overflow');
		}
		delta += (m - n) * numPoints;
		n = m;

		for (const codePoint of codePoints) {
			if (codePoint < n && ++delta > maxInt) {
				error('overflow');
			}
			if (codePoint !== n) {
				continue;
			}

			// Write `delta` as a generalized variable-length integer.
			let q = delta;
			for (let k = base; /* no condition */; k += base) {
				let t;
				if (k <= bias) {
					t = tMin;
				} else if (k >= bias + tMax) {
					t = tMax;
				} else {
					t = k - bias;
				}
				if (q < t) {
					break;
				}
				const remainder = q - t;
				const radix = base - t;
				output.push(stringFromCharCode(digitToBasic(t + remainder % radix, 0)));
				q = floor(remainder / radix);
			}

			// The final (most significant) digit.
			output.push(stringFromCharCode(digitToBasic(q, 0)));
			bias = adapt(delta, numPoints, handledCount == basicLength);
			delta = 0;
			++handledCount;
		}

		++delta;
		++n;

	}
	return output.join('');
};
374  
/**
 * Converts a domain name or email address containing Punycode labels to its
 * Unicode form. Only labels that start with `xn--` are decoded; every other
 * label is passed through untouched, so input that is already Unicode is
 * returned as is.
 * @memberOf punycode
 * @param {String} input The Punycoded domain name or email address to
 * convert to Unicode.
 * @returns {String} The Unicode representation of the given Punycode
 * string.
 */
const toUnicode = function(input) {
	const decodeLabel = label => {
		if (!regexPunycode.test(label)) {
			return label;
		}
		// Drop the 4-character `xn--` prefix and lowercase the rest before
		// decoding it.
		return decode(label.slice(4).toLowerCase());
	};
	return mapDomain(input, decodeLabel);
};
393  
/**
 * Converts a Unicode domain name or email address to its Punycode form.
 * Only labels that `regexNonASCII` matches are encoded and prefixed with
 * `xn--`; every other label is passed through untouched, so input that is
 * already ASCII is returned as is.
 * @memberOf punycode
 * @param {String} input The domain name or email address to convert, as a
 * Unicode string.
 * @returns {String} The Punycode representation of the given domain name or
 * email address.
 */
const toASCII = function(input) {
	const encodeLabel = label => {
		if (!regexNonASCII.test(label)) {
			return label;
		}
		return `xn--${encode(label)}`;
	};
	return mapDomain(input, encodeLabel);
};
412  
413  /*--------------------------------------------------------------------------*/
414  
/** The public API object exported by this module */
const punycode = {
	/**
	 * The Punycode.js version number, as a string.
	 * @memberOf punycode
	 * @type String
	 */
	version: '2.1.0',
	/**
	 * Helpers that convert between JavaScript's internal character
	 * representation (UCS-2) and arrays of Unicode code points.
	 * @see <https://mathiasbynens.be/notes/javascript-encoding>
	 * @memberOf punycode
	 * @type Object
	 */
	ucs2: {
		decode: ucs2decode,
		encode: ucs2encode
	},
	decode,
	encode,
	toASCII,
	toUnicode
};

module.exports = punycode;