Source for gnu.xml.stream.UnicodeReader

   1: /* UnicodeReader.java --
   2:    Copyright (C) 2005  Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.xml.stream;
  39: 
  40: import java.io.IOException;
  41: import java.io.Reader;
  42: 
  43: /**
  44:  * A reader that converts UTF-16 characters to Unicode code points.
  45:  *
  46:  * @author <a href='mailto:dog@gnu.org'>Chris Burdess</a>
  47:  */
  48: public class UnicodeReader
  49: {
  50: 
  51:   final Reader in;
  52: 
  53:   UnicodeReader(Reader in)
  54:   {
  55:     this.in = in;
  56:   }
  57: 
  58:   public void mark(int limit)
  59:     throws IOException
  60:   {
  61:     in.mark(limit * 2);
  62:   }
  63: 
  64:   public void reset()
  65:     throws IOException
  66:   {
  67:     in.reset();
  68:   }
  69: 
  70:   public int read()
  71:     throws IOException
  72:   {
  73:     int ret = in.read();
  74:     if (ret == -1)
  75:       return ret;
  76:     if (ret >= 0xd800 && ret < 0xdc00)
  77:       {
  78:         // Unicode surrogate?
  79:         int low = in.read();
  80:         if (low >= 0xdc00 && low < 0xe000)
  81:           ret = Character.toCodePoint((char) ret, (char) low);
  82:         else
  83:           throw new IOException("unpaired surrogate: U+" +
  84:                                 Integer.toHexString(ret));
  85:       }
  86:     else if (ret >= 0xdc00 && ret < 0xe000)
  87:       throw new IOException("unpaired surrogate: U+" +
  88:                             Integer.toHexString(ret));
  89:     return ret;
  90:   }
  91: 
  92:   public int read(int[] buf, int off, int len)
  93:     throws IOException
  94:   {
  95:     if (len == 0)
  96:       return 0;
  97:     char[] b2 = new char[len];
  98:     int ret = in.read(b2, 0, len);
  99:     if (ret <= 0)
 100:       return ret;
 101:     int l = ret - 1;
 102:     int i = 0, j = off;
 103:     for (; i < l; i++)
 104:       {
 105:         char c = b2[i];
 106:         if (c >= 0xd800 && c < 0xdc00)
 107:           {
 108:             // Unicode surrogate?
 109:             char d = b2[i + 1];
 110:             if (d >= 0xdc00 && d < 0xe000)
 111:               {
 112:                 buf[j++] = Character.toCodePoint(c, d);
 113:                 i++;
 114:                 continue;
 115:               }
 116:             else
 117:               throw new IOException("unpaired surrogate: U+" +
 118:                                     Integer.toHexString(c));
 119:           }
 120:         else if (c >= 0xdc00 && c < 0xe000)
 121:           throw new IOException("unpaired surrogate: U+" +
 122:                                 Integer.toHexString(c));
 123:         buf[j++] = (int) c;
 124:       }
 125:     if (i == l)
 126:       {
 127:         // last char
 128:         char c = b2[l];
 129:         if (c >= 0xd800 && c < 0xdc00)
 130:           {
 131:             int low = in.read();
 132:             if (low >= 0xdc00 && low < 0xe000)
 133:               {
 134:                 buf[j++] = Character.toCodePoint(c, (char) low);
 135:                 return j;
 136:               }
 137:             else
 138:               throw new IOException("unpaired surrogate: U+" +
 139:                                     Integer.toHexString(c));
 140:           }
 141:         else if (c >= 0xdc00 && c < 0xe000)
 142:           throw new IOException("unpaired surrogate: U+" +
 143:                                 Integer.toHexString(c));
 144:         buf[j++] = (int) c;
 145:       }
 146:     return j;
 147:   }
 148: 
 149:   public void close()
 150:     throws IOException
 151:   {
 152:     in.close();
 153:   }
 154: 
 155:   /**
 156:    * Returns the specified UTF-16 char array as an array of Unicode code
 157:    * points.
 158:    */
 159:   public static int[] toCodePointArray(String text)
 160:     throws IOException
 161:   {
 162:     char[] b2 = text.toCharArray();
 163:     int[] buf = new int[b2.length];
 164:     if (b2.length > 0)
 165:       {
 166:         int l = b2.length - 1;
 167:         int i = 0, j = 0;
 168:         for (; i < l; i++)
 169:           {
 170:             char c = b2[i];
 171:             if (c >= 0xd800 && c < 0xdc00)
 172:               {
 173:                 // Unicode surrogate?
 174:                 char d = b2[i + 1];
 175:                 if (d >= 0xdc00 && d < 0xe000)
 176:                   {
 177:                     buf[j++] = Character.toCodePoint(c, d);
 178:                     i++;
 179:                     continue;
 180:                   }
 181:                 else
 182:                   throw new IOException("unpaired surrogate: U+" +
 183:                                         Integer.toHexString(c));
 184:               }
 185:             else if (c >= 0xdc00 && c < 0xe000)
 186:               throw new IOException("unpaired surrogate: U+" +
 187:                                     Integer.toHexString(c));
 188:             buf[j++] = (int) c;
 189:           }
 190:         if (i == l)
 191:           {
 192:             // last char
 193:             buf[j++] = (int) b2[l];
 194:             if (j < buf.length)
 195:               {
 196:                 int[] buf2 = new int[j];
 197:                 System.arraycopy(buf, 0, buf2, 0, j);
 198:                 buf = buf2;
 199:               }
 200:           }
 201:       }
 202:     return buf;
 203:   }
 204: 
 205: }