utf32.js
  1  'use strict';
  2  
  3  var Buffer = require('safer-buffer').Buffer;
  4  
  5  // == UTF32-LE/BE codec. ==========================================================
  6  
  7  exports._utf32 = Utf32Codec;
  8  
  9  function Utf32Codec(codecOptions, iconv) {
 10      this.iconv = iconv;
 11      this.bomAware = true;
 12      this.isLE = codecOptions.isLE;
 13  }
 14  
 15  exports.utf32le = { type: '_utf32', isLE: true };
 16  exports.utf32be = { type: '_utf32', isLE: false };
 17  
 18  // Aliases
 19  exports.ucs4le = 'utf32le';
 20  exports.ucs4be = 'utf32be';
 21  
 22  Utf32Codec.prototype.encoder = Utf32Encoder;
 23  Utf32Codec.prototype.decoder = Utf32Decoder;
 24  
 25  // -- Encoding
 26  
 27  function Utf32Encoder(options, codec) {
 28      this.isLE = codec.isLE;
 29      this.highSurrogate = 0;
 30  }
 31  
 32  Utf32Encoder.prototype.write = function(str) {
 33      var src = Buffer.from(str, 'ucs2');
 34      var dst = Buffer.alloc(src.length * 2);
 35      var write32 = this.isLE ? dst.writeUInt32LE : dst.writeUInt32BE;
 36      var offset = 0;
 37  
 38      for (var i = 0; i < src.length; i += 2) {
 39          var code = src.readUInt16LE(i);
 40          var isHighSurrogate = (0xD800 <= code && code < 0xDC00);
 41          var isLowSurrogate = (0xDC00 <= code && code < 0xE000);
 42  
 43          if (this.highSurrogate) {
 44              if (isHighSurrogate || !isLowSurrogate) {
 45                  // There shouldn't be two high surrogates in a row, nor a high surrogate which isn't followed by a low
 46                  // surrogate. If this happens, keep the pending high surrogate as a stand-alone semi-invalid character
 47                  // (technically wrong, but expected by some applications, like Windows file names).
 48                  write32.call(dst, this.highSurrogate, offset);
 49                  offset += 4;
 50              }
 51              else {
 52                  // Create 32-bit value from high and low surrogates;
 53                  var codepoint = (((this.highSurrogate - 0xD800) << 10) | (code - 0xDC00)) + 0x10000;
 54  
 55                  write32.call(dst, codepoint, offset);
 56                  offset += 4;
 57                  this.highSurrogate = 0;
 58  
 59                  continue;
 60              }
 61          }
 62  
 63          if (isHighSurrogate)
 64              this.highSurrogate = code;
 65          else {
 66              // Even if the current character is a low surrogate, with no previous high surrogate, we'll
 67              // encode it as a semi-invalid stand-alone character for the same reasons expressed above for
 68              // unpaired high surrogates.
 69              write32.call(dst, code, offset);
 70              offset += 4;
 71              this.highSurrogate = 0;
 72          }
 73      }
 74  
 75      if (offset < dst.length)
 76          dst = dst.slice(0, offset);
 77  
 78      return dst;
 79  };
 80  
 81  Utf32Encoder.prototype.end = function() {
 82      // Treat any leftover high surrogate as a semi-valid independent character.
 83      if (!this.highSurrogate)
 84          return;
 85  
 86      var buf = Buffer.alloc(4);
 87  
 88      if (this.isLE)
 89          buf.writeUInt32LE(this.highSurrogate, 0);
 90      else
 91          buf.writeUInt32BE(this.highSurrogate, 0);
 92  
 93      this.highSurrogate = 0;
 94  
 95      return buf;
 96  };
 97  
 98  // -- Decoding
 99  
/**
 * Streaming decoder: UTF-32 bytes -> JavaScript (UTF-16) strings.
 *
 * @param {Object} options - Decoder options (not read by this decoder).
 * @param {Object} codec - Codec instance; `isLE` selects the byte order and
 *                         `iconv.defaultCharUnicode` supplies the replacement character.
 */
function Utf32Decoder(options, codec) {
    this.isLE = codec.isLE;
    // Code unit written in place of values that are not valid Unicode code points.
    this.badChar = codec.iconv.defaultCharUnicode.charCodeAt(0);
    // Bytes of an incomplete 4-byte unit left over from the previous chunk (0..3 entries).
    this.overflow = [];
}

/**
 * Decodes one chunk of bytes. Code points split across chunk boundaries are completed
 * by the following call(s).
 *
 * @param {Buffer} src - Bytes to decode.
 * @returns {string} Text decoded from all complete 4-byte units available so far.
 */
Utf32Decoder.prototype.write = function(src) {
    if (src.length === 0)
        return '';

    var i = 0;
    var codepoint = 0;
    // Each 4 input bytes produce at most 4 output bytes (a surrogate pair), and at most
    // 3 extra input bytes come from the overflow, so src.length + 4 is always enough.
    var dst = Buffer.alloc(src.length + 4);
    var offset = 0;
    var isLE = this.isLE;
    var overflow = this.overflow;
    var badChar = this.badChar;

    if (overflow.length > 0) {
        // Top up the pending unit with bytes from the new chunk.
        for (; i < src.length && overflow.length < 4; i++)
            overflow.push(src[i]);

        if (overflow.length === 4) {
            // NOTE: codepoint is a signed int32 and can be negative.
            // NOTE: This duplicates the main-loop expression on purpose (it works with an array, not a buffer).
            // The unit lives at overflow[0..3]; `i` indexes `src` and must not be used here.
            if (isLE) {
                codepoint = overflow[0] | (overflow[1] << 8) | (overflow[2] << 16) | (overflow[3] << 24);
            } else {
                codepoint = overflow[3] | (overflow[2] << 8) | (overflow[1] << 16) | (overflow[0] << 24);
            }
            overflow.length = 0;

            offset = _writeCodepoint(dst, offset, codepoint, badChar);
        }
    }

    // Main loop. Should be as optimized as possible.
    for (; i < src.length - 3; i += 4) {
        // NOTE: codepoint is a signed int32 and can be negative.
        if (isLE) {
            codepoint = src[i] | (src[i+1] << 8) | (src[i+2] << 16) | (src[i+3] << 24);
        } else {
            codepoint = src[i+3] | (src[i+2] << 8) | (src[i+1] << 16) | (src[i] << 24);
        }
        offset = _writeCodepoint(dst, offset, codepoint, badChar);
    }

    // Keep the trailing bytes of an incomplete unit for the next call.
    for (; i < src.length; i++) {
        overflow.push(src[i]);
    }

    return dst.slice(0, offset).toString('ucs2');
};

/**
 * Writes one code point into `dst` as little-endian UTF-16 code units.
 *
 * @param {Buffer} dst - Output buffer.
 * @param {number} offset - Position in `dst` to write at.
 * @param {number} codepoint - Signed 32-bit value read from the input.
 * @param {number} badChar - Code unit substituted when `codepoint` is out of range.
 * @returns {number} Offset just past the bytes written (2 or 4 bytes further).
 */
function _writeCodepoint(dst, offset, codepoint, badChar) {
    // NOTE: codepoint is signed int32 and can be negative. We keep it that way to help V8 with optimizations.
    if (codepoint < 0 || codepoint > 0x10FFFF) {
        // Not a valid Unicode codepoint
        codepoint = badChar;
    }

    // Supplementary planes (above U+FFFF): write the high surrogate first.
    if (codepoint >= 0x10000) {
        codepoint -= 0x10000;

        var high = 0xD800 | (codepoint >> 10);
        dst[offset++] = high & 0xff;
        dst[offset++] = high >> 8;

        // Low surrogate is written below.
        codepoint = 0xDC00 | (codepoint & 0x3FF);
    }

    // Write BMP char or low surrogate.
    dst[offset++] = codepoint & 0xff;
    dst[offset++] = codepoint >> 8;

    return offset;
}

/**
 * Finishes the stream. Bytes of a trailing incomplete unit are discarded; nothing is returned.
 */
Utf32Decoder.prototype.end = function() {
    this.overflow.length = 0;
};
184  
185  // == UTF-32 Auto codec =============================================================
186  // Decoder chooses automatically from UTF-32LE and UTF-32BE using BOM and space-based heuristic.
187  // Defaults to UTF-32LE. http://en.wikipedia.org/wiki/UTF-32
188  // Encoder/decoder default can be changed: iconv.decode(buf, 'utf32', {defaultEncoding: 'utf-32be'});
189  
// Encoder prepends BOM (which can be overridden with {addBOM: false}).
191  
/**
 * UTF-32 codec whose encoder and decoder are the "auto" variants wired up below.
 *
 * @param {Object} options - Codec definition (not read here).
 * @param {Object} iconv - Library instance, stored for use by the encoder/decoder.
 */
function Utf32AutoCodec(options, iconv) {
    this.iconv = iconv;
}

// Streaming encoder/decoder constructors used for this codec.
Utf32AutoCodec.prototype.encoder = Utf32AutoEncoder;
Utf32AutoCodec.prototype.decoder = Utf32AutoDecoder;

exports.utf32 = Utf32AutoCodec;

// Alias
exports.ucs4 = 'utf32';
201  
202  // -- Encoding
203  
/**
 * Encoder for the generic 'utf32' encoding. Delegates to the encoder obtained from
 * `codec.iconv.getEncoder` for `options.defaultEncoding` (or 'utf-32le' when not given),
 * requesting a BOM unless the caller set `addBOM` explicitly.
 *
 * @param {Object} [options] - Encoder options; `defaultEncoding` and `addBOM` are read here.
 *                             The object is not modified.
 * @param {Object} codec - Codec instance providing `iconv.getEncoder`.
 */
function Utf32AutoEncoder(options, codec) {
    // Work on a copy so that the addBOM default does not leak into the caller's options
    // object (which the caller may reuse for other encodings).
    var encoderOptions = {};
    for (var key in options)
        encoderOptions[key] = options[key];

    if (encoderOptions.addBOM === undefined)
        encoderOptions.addBOM = true;

    this.encoder = codec.iconv.getEncoder(encoderOptions.defaultEncoding || 'utf-32le', encoderOptions);
}

/**
 * Encodes one chunk of text with the underlying encoder.
 *
 * @param {string} str - Text chunk to encode.
 * @returns {*} Whatever the underlying encoder's write() returns.
 */
Utf32AutoEncoder.prototype.write = function(str) {
    return this.encoder.write(str);
};

/**
 * Finishes the stream.
 *
 * @returns {*} Whatever the underlying encoder's end() returns.
 */
Utf32AutoEncoder.prototype.end = function() {
    return this.encoder.end();
};
220  
221  // -- Decoding
222  
/**
 * Decoder for the generic 'utf32' encoding. Buffers the first chunks until at least
 * 32 bytes are available (or the stream ends), picks an encoding with detectEncoding(),
 * and from then on delegates to the decoder for that encoding.
 *
 * @param {Object} [options] - Decoder options; `defaultEncoding` is passed to the detection
 *                             step and the whole object to the chosen decoder.
 * @param {Object} codec - Codec instance providing `iconv.getDecoder`.
 */
function Utf32AutoDecoder(options, codec) {
    this.decoder = null;         // Chosen decoder; null until detection has run.
    this.initialBufs = [];       // Chunks held back while detection is pending.
    this.initialBufsLen = 0;     // Total byte length of the held-back chunks.
    this.options = options || {};
    this.iconv = codec.iconv;
}

// Picks the encoding from the held-back chunks, creates the real decoder and replays the
// chunks through it. Returns the decoded text; the caller clears the held-back chunks.
function _decodeInitialBufs(self) {
    var chosen = detectEncoding(self.initialBufs, self.options.defaultEncoding);
    self.decoder = self.iconv.getDecoder(chosen, self.options);

    var text = '';
    for (var n = 0; n < self.initialBufs.length; n++)
        text += self.decoder.write(self.initialBufs[n]);

    return text;
}

/**
 * Decodes one chunk of bytes.
 *
 * @param {Buffer} buf - Bytes to decode.
 * @returns {string} Decoded text; '' while the decoder is still collecting bytes for detection.
 */
Utf32AutoDecoder.prototype.write = function(buf) {
    // Encoding already chosen: plain delegation.
    if (this.decoder)
        return this.decoder.write(buf);

    // Codec is not chosen yet. Accumulate initial bytes.
    this.initialBufs.push(buf);
    this.initialBufsLen += buf.length;

    // We need more bytes before running the detection heuristic.
    if (this.initialBufsLen < 32)
        return '';

    // We have enough bytes -> detect endianness and replay what was held back.
    var text = _decodeInitialBufs(this);
    this.initialBufs.length = this.initialBufsLen = 0;
    return text;
};

/**
 * Finishes the stream. If detection has not run yet, it runs now on whatever was collected.
 *
 * @returns {*} The underlying decoder's end() result when detection had already happened;
 *              otherwise the text decoded from the held-back chunks plus any trailing output.
 */
Utf32AutoDecoder.prototype.end = function() {
    if (this.decoder)
        return this.decoder.end();

    var text = _decodeInitialBufs(this);

    var trail = this.decoder.end();
    if (trail)
        text += trail;

    this.initialBufs.length = this.initialBufsLen = 0;
    return text;
};
274  
/**
 * Guesses whether a byte stream is UTF-32LE or UTF-32BE.
 *
 * The first 4-byte unit is checked for a BOM. Otherwise up to 100 units are scored under both
 * byte orders: a unit counts as invalid when its value would exceed 0x10FFFF, and as a BMP
 * char when its two high-order bytes are zero and the value is non-zero.
 *
 * @param {Array} bufs - Chunks of bytes, treated as one continuous stream.
 * @param {string} [defaultEncoding] - Returned when the scores are tied.
 * @returns {string} 'utf-32le', 'utf-32be', or `defaultEncoding` (falling back to 'utf-32le').
 */
function detectEncoding(bufs, defaultEncoding) {
    var unit = [];                  // Bytes of the 4-byte unit being assembled.
    var unitsSeen = 0;
    var badLE = 0, badBE = 0;       // Units that are invalid when read as LE / BE.
    var bmpLE = 0, bmpBE = 0;       // Units that are BMP chars when read as LE / BE.
    var enough = false;             // Set once 100 units have been examined.

    for (var bi = 0; bi < bufs.length && !enough; bi++) {
        var chunk = bufs[bi];

        for (var k = 0; k < chunk.length && !enough; k++) {
            unit.push(chunk[k]);
            if (unit.length < 4)
                continue;

            var b0 = unit[0], b1 = unit[1], b2 = unit[2], b3 = unit[3];
            unit.length = 0;

            // Check BOM first.
            if (unitsSeen === 0) {
                if (b0 === 0xFF && b1 === 0xFE && b2 === 0 && b3 === 0)
                    return 'utf-32le';
                if (b0 === 0 && b1 === 0 && b2 === 0xFE && b3 === 0xFF)
                    return 'utf-32be';
            }

            if (b0 !== 0 || b1 > 0x10)
                badBE++;
            if (b3 !== 0 || b2 > 0x10)
                badLE++;

            if (b0 === 0 && b1 === 0 && (b2 !== 0 || b3 !== 0))
                bmpBE++;
            if ((b0 !== 0 || b1 !== 0) && b2 === 0 && b3 === 0)
                bmpLE++;

            unitsSeen++;
            enough = unitsSeen >= 100;
        }
    }

    // Make decisions.
    var scoreBE = bmpBE - badBE;
    var scoreLE = bmpLE - badLE;

    if (scoreBE > scoreLE)
        return 'utf-32be';
    if (scoreBE < scoreLE)
        return 'utf-32le';

    // Couldn't decide (likely all zeros or not enough data).
    return defaultEncoding || 'utf-32le';
}