Source for gnu.gcj.convert.Input_UTF8

   1: /* Copyright (C) 1999, 2000  Free Software Foundation
   2: 
   3:    This file is part of libgcj.
   4: 
   5: This software is copyrighted work licensed under the terms of the
   6: Libgcj License.  Please consult the file "LIBGCJ_LICENSE" for
   7: details.  */
   8: 
   9: package gnu.gcj.convert;
  10: 
  11: /**
  12:  * Convert UTF8 to Unicode.
  13:  * @author Per Bothner <bothner@cygnus.com>
  14:  * @date March 1999.
  15:  */
  16: 
  17: public class Input_UTF8 extends BytesToUnicode
  18: {
  19:   public String getName() { return "UTF8"; }
  20: 
  21:   int partial = 0;
  22:   int partial_bytes_expected = 0;
  23:   //int suggogate_second = -1;
  24: 
  25:   public int read (char[] outbuffer, int outpos, int count)
  26:   {
  27:     int origpos = outpos;
  28:     for (;;)
  29:       {
  30:     if (outpos - origpos >= count)
  31:       break;
  32:     if (inpos >= inlength)
  33:       break;
  34:     int b = inbuffer[inpos++];
  35:     if (b >= 0)
  36:       outbuffer[outpos++] = (char) b;
  37:     else
  38:       {
  39:         if ((b & 0xC0) == 0x80) // Continuation byte
  40:           {
  41:         partial = (partial << 6) | (b & 0x3F);
  42:         --partial_bytes_expected;
  43:         if (partial_bytes_expected == 1)
  44:           {
  45:             if (partial > (0xFFFF>>6))
  46:               {
  47:             // The next continuation byte will cause the result
  48:             // to exceed 0xFFFF, so we must use a surrogate pair.
  49:             // The "Unicode scalar value" (see D28 in section 3.7
  50:             // of the Unicode Standard 2.0) is defined as:
  51:             // value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000,
  52:             // where (hi, lo) is the Unicode surrogate pair.
  53:             // After reading the first three bytes, we have:
  54:             // partial == (value >> 6).
  55:             // Substituting and simplifying, we get:
  56:             // partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400.
  57:             // The definition lo>=0xDC00 && lo<=0xDFFF implies
  58:             // that (lo-0xDC00)>>6 is in the range 0..15.
  59:             // Hence we can solve for `hi' and we can emit
  60:             // the high-surrogate without waiting for the
  61:             // final byte:
  62:             outbuffer[outpos++]
  63:               = (char) (0xD800 + ((partial - 0x400) >> 4));
  64: 
  65:             // Now we want to set it up so that when we read
  66:             // the final byte on the next iteration, we will
  67:             // get the low-surrogate without special handling.
  68:             // I.e. we want:
  69:             // lo == (next_partial << 6) | (next & 0x3F)
  70:             // where next is the next input byte and next_partial
  71:             // is the value of partial at the end of this
  72:             // iteration.  This implies:  next_partial == lo >> 6.
  73:             // We can simplify the previous:
  74:             // partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400,
  75:             // to: partial == (hi-0xD800)*0x10+(lo>>6)+0x90.
  76:             // Inserting the values of hi and next_partial,
  77:             // and simplifying, we get:  partial ==
  78:             // ( (partial-0x400)&~0xF) + next_partial + 0x90.
  79:             // Solving for next_partial, we get:
  80:             // next_partial = partial+0x400-0x90-(partial&~0xF):
  81:             // or: next_partial = (partial&0xF) + 0x370.  Hence:
  82:             partial = (partial & 0xF) + 0x370;
  83:               }
  84:           }
  85:         else if (partial_bytes_expected == 0)
  86:           {
  87:             outbuffer[outpos++] = (char) partial;
  88:             partial = 0;
  89:             partial_bytes_expected = 0;
  90:           }
  91:           }
  92:         else // prefix byte
  93:           {
  94:         if ((b & 0xE0) == 0xC0)
  95:           {
  96:             partial = b & 0x1F;
  97:             partial_bytes_expected = 1;
  98:           }
  99:         else if ((b & 0xF0) == 0xE0)
 100:           {
 101:             partial = b & 0xF;
 102:             partial_bytes_expected = 2;
 103:           }
 104:         else
 105:           {
 106:             partial = b & 7;
 107:             partial_bytes_expected = 3;
 108:           }
 109:           }
 110:       }
 111:       }
 112:     return outpos - origpos;
 113:   }
 114: }