Source for gnu.gcj.convert.Output_UTF8

   1: /* Copyright (C) 1999, 2000, 2003, 2006  Free Software Foundation
   2: 
   3:    This file is part of libgcj.
   4: 
   5: This software is copyrighted work licensed under the terms of the
   6: Libgcj License.  Please consult the file "LIBGCJ_LICENSE" for
   7: details.  */
   8: 
   9: package gnu.gcj.convert;
  10: 
  11: /**
  12:  * Convert Unicode to UTF8.
  13:  * @author Per Bothner <bothner@cygnus.com>
  14:  * @date March 1999.
  15:  */
  16: 
  17: public class Output_UTF8 extends UnicodeToBytes
  18: {
  19:   public String getName() { return "UTF8"; }
  20: 
  21:   /** True if a surrogate pair should be emitted as a single UTF8 sequence.
  22:    * Otherwise, a surrogate pair is treated as two separate characters.
  23:    * Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
  24:   public boolean standardUTF8 = true;
  25: 
  26:   // Saves the previous char if it was a high-surrogate.
  27:   char hi_part;
  28:   // Value of incomplete character.
  29:   int value;
  30:   // Number of continuation bytes still to emit.
  31:   int bytes_todo;
  32: 
  33:   public int write (char[] inbuffer, int inpos, int inlength)
  34:   {
  35:     int start_pos = inpos;
  36:     int avail = buf.length - count;
  37:     for (;;)
  38:       {
  39:     if (avail == 0 || (inlength == 0 && bytes_todo == 0 && hi_part == 0))
  40:       break;
  41:     // The algorithm is made more complicated because we want to write
  42:     // at least one byte in the output buffer, if there is room for
  43:     // that byte, and at least one input character is available.
  44:     // This makes the code more robust, since client code will
  45:     // always "make progress", even in the complicated cases,
  46:     // where the output buffer only has room for only *part* of a
  47:     // multi-byte sequence, or the input char buffer only has half
  48:     // of a surrogate pair (when standardUTF8 is set), or both.
  49: 
  50:     // Handle continuation characters we did not have room for before.
  51:     if (bytes_todo > 0)
  52:       {
  53:         do
  54:           {
  55:         bytes_todo--;
  56:         buf[count++] = (byte)
  57:           (((value >> (bytes_todo * 6)) & 0x3F) | 0x80);
  58:         avail--;
  59:           }
  60:         while (bytes_todo > 0 && avail > 0);
  61:         continue;
  62:       }
  63: 
  64:     // Handle a high surrogate at the end of the input stream.
  65:     if (inlength == 0 && hi_part != 0)
  66:       {
  67:         buf[count++] = (byte) (0xE0 | (hi_part >> 12));
  68:         value = hi_part;
  69:         hi_part = 0;
  70:         avail--;
  71:         bytes_todo = 2;
  72:         continue;
  73:       }
  74: 
  75:     char ch = inbuffer[inpos++];
  76:     inlength--;
  77: 
  78:     if (hi_part != 0 && (ch <= 0xDBFF || ch > 0xDFFF))
  79:       {
  80:         // If the previous character was a high surrogate, and we
  81:         // don't now have a low surrogate, we print the high
  82:         // surrogate as an isolated character.
  83:         --inpos;
  84:         ++inlength;
  85:         buf[count++] = (byte) (0xE0 | (hi_part >> 12));
  86:         value = hi_part;
  87:         hi_part = 0;
  88:         avail--;
  89:         bytes_todo = 2;
  90:       }
  91:     else if (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF)
  92:       {
  93:         // If this character is a low surrogate and we didn't
  94:         // previously see a high surrogate, we do the same thing
  95:         // as above.
  96:         buf[count++] = (byte) (0xE0 | (ch >> 12));
  97:         value = ch;
  98:         avail--;
  99:         bytes_todo = 2;
 100:       }
 101:     else if (ch < 128 && (ch != 0 || standardUTF8))
 102:       {
 103:         avail--;
 104:         buf[count++] = (byte) ch;
 105:       }
 106:     else if (ch <= 0x07FF)
 107:       {
 108:         buf[count++] = (byte) (0xC0 | (ch >> 6));
 109:         avail--;
 110:         value = ch;
 111:         bytes_todo = 1;
 112:       }
 113:     else if (ch >= 0xD800 && ch <= 0xDFFF && standardUTF8)
 114:       {
 115:         if (ch <= 0xDBFF)  // High surrogates
 116:           {
 117:         // Just save the high surrogate until the next
 118:         // character comes along.
 119:         hi_part = ch;
 120:           }
 121:         else // Low surrogates
 122:           {
 123:         value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
 124:         buf[count++] = (byte) (0xF0 | (value >> 18));
 125:         avail--;
 126:         bytes_todo = 3;
 127:         hi_part = 0;
 128:           }
 129:       }
 130:     else
 131:       {
 132:         buf[count++] = (byte) (0xE0 | (ch >> 12));
 133:         value = ch;
 134:         avail--;
 135:         bytes_todo = 2;
 136:       }
 137:       }
 138:     return inpos - start_pos;
 139:   }
 140: 
 141:   public boolean havePendingBytes()
 142:   {
 143:     return bytes_todo > 0 || hi_part != 0;
 144:   }
 145: 
 146: }