// Frames | No Frames |
1: /* DomHTMLParser.java -- 2: Copyright (C) 2005 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. 
*/ 37: 38: 39: package gnu.xml.dom.html2; 40: 41: import java.io.IOException; 42: import java.io.Reader; 43: 44: import java.util.Enumeration; 45: import java.util.Iterator; 46: import java.util.LinkedList; 47: 48: import javax.swing.text.AttributeSet; 49: import javax.swing.text.html.HTML; 50: import javax.swing.text.html.parser.DTD; 51: import javax.swing.text.html.parser.TagElement; 52: 53: import org.w3c.dom.NamedNodeMap; 54: import org.w3c.dom.Node; 55: import org.w3c.dom.html2.HTMLDocument; 56: 57: /** 58: * This parser reads HTML from the given stream and stores into 59: * {@link HTMLDocument}. The HTML tag becomes the {@link Node}. 60: * The tag attributes become the node attributes. The text inside 61: * HTML tag is inserted as one or several text nodes. The nested 62: * HTML tags are inserted as child nodes. 63: * 64: * If the strict tree structure, closing the tag means closing all 65: * nested tags. To work around this, this parser closes the nested 66: * tags and immediately reopens them after the closed tag. 67: * In this way, <code><b><i>c</b>d</code> 68: * is parsed as <code><b><i>c</i></b><i>d</code> . 69: * 70: * @author Audrius Meskauskas (AudriusA@Bioinformatics.org) 71: */ 72: public class DomHTMLParser 73: extends gnu.javax.swing.text.html.parser.support.Parser 74: { 75: /** 76: * The target where HTML document will be inserted. 77: */ 78: protected DomHTMLDocument document; 79: 80: /** 81: * The subsequently created new nodes will be inserted as the 82: * childs of this cursor. 83: */ 84: protected Node cursor; 85: 86: /** 87: * Create parser using the given DTD. 88: * 89: * @param dtd the DTD (for example, 90: * {@link gnu.javax.swing.text.html.parser.HTML_401F}). 91: */ 92: public DomHTMLParser(DTD dtd) 93: { 94: super(dtd); 95: } 96: 97: /** 98: * Parse SGML insertion ( <! ... > ). 99: * Currently just treats it as comment. 
100: */ 101: public boolean parseMarkupDeclarations(StringBuffer strBuff) 102: throws java.io.IOException 103: { 104: Node c = document.createComment(strBuff.toString()); 105: cursor.appendChild(c); 106: return false; 107: } 108: 109: /** 110: * Read the document, present in the given stream, and 111: * return the corresponding {@link HTMLDocument}. 112: * 113: * @param input a stream to read from. 114: * @return a document, reflecting the structure of the provided HTML 115: * text. 116: * 117: * @throws IOException if the reader throws one. 118: */ 119: public HTMLDocument parseDocument(Reader input) 120: throws IOException 121: { 122: try 123: { 124: document = new DomHTMLDocument(); 125: document.setCheckWellformedness(false); 126: document.setCheckingCharacters(false); 127: 128: cursor = document; 129: 130: parse(input); 131: 132: DomHTMLDocument h = document; 133: document = null; 134: return h; 135: } 136: catch (Exception ex) 137: { 138: ex.printStackTrace(); 139: throw new IOException("Exception: " + ex.getMessage()); 140: } 141: } 142: 143: /** 144: * Create a new node. 145: * @param name the name of node, case insensitive. 146: * @return the created node. 147: */ 148: protected Node createNode(String name) 149: { 150: Node new_node = document.createElement(name.toLowerCase()); 151: AttributeSet hatts = getAttributes(); 152: NamedNodeMap natts = new_node.getAttributes(); 153: 154: Enumeration enumeration = hatts.getAttributeNames(); 155: Object key; 156: Node attribute; 157: 158: while (hatts != null) 159: { 160: while (enumeration.hasMoreElements()) 161: { 162: key = enumeration.nextElement(); 163: attribute = document.createAttribute(key.toString()); 164: attribute.setNodeValue(hatts.getAttribute(key).toString()); 165: natts.setNamedItem(attribute); 166: } 167: 168: // The default values are stored in a parent node. 
169: hatts = hatts.getResolveParent(); 170: } 171: 172: return new_node; 173: } 174: 175: /** 176: * Handle comment by inserting the comment node. 177: * @param text the comment text. 178: */ 179: protected void handleComment(char[] text) 180: { 181: Node c = document.createComment(new String(text)); 182: cursor.appendChild(c); 183: } 184: 185: /** 186: * Handle the tag with no content. 187: * @param tag the tag to handle. 188: */ 189: protected void handleEmptyTag(TagElement tag) 190: { 191: String name = tag.getHTMLTag().toString(); 192: 193: if (name.equalsIgnoreCase("#pcdata")) 194: return; 195: 196: Node c = createNode(name); 197: cursor.appendChild(c); 198: } 199: 200: /** 201: * Close the given tag. Close and reopen all nested tags. 202: * @param tag the tag to close. 203: */ 204: protected void handleEndTag(TagElement tag) 205: { 206: String name = tag.getHTMLTag().toString(); 207: String nname = cursor.getNodeName(); 208: 209: // Closing the current tag. 210: if (nname != null && nname.equalsIgnoreCase(name)) 211: { 212: cursor = cursor.getParentNode(); 213: } 214: else 215: { 216: Node nCursor = cursor.getParentNode(); 217: 218: // Remember the opened nodes. 219: LinkedList open = new LinkedList(); 220: Node close = cursor; 221: while (close != null && !close.getNodeName().equalsIgnoreCase(name)) 222: { 223: if (close != document) 224: open.addFirst(close); 225: close = close.getParentNode(); 226: } 227: if (close == null) 228: cursor = document; 229: else 230: cursor = close.getParentNode(); 231: 232: // Insert the copies of the opened nodes. 233: Iterator iter = open.iterator(); 234: while (iter.hasNext()) 235: { 236: Node item = (Node) iter.next(); 237: cursor.appendChild(item); 238: cursor = item; 239: } 240: } 241: } 242: 243: /** 244: * Handle the start tag by inserting the HTML element. 245: * @param tag the tag to handle. 
246: */ 247: protected void handleStartTag(TagElement tag) 248: { 249: HTML.Tag h = tag.getHTMLTag(); 250: Node c = createNode(h.toString()); 251: cursor.appendChild(c); 252: cursor = c; 253: } 254: 255: /** 256: * Handle text by inserting the text node. 257: * @param text the text to insert. 258: */ 259: protected void handleText(char[] text) 260: { 261: Node c = document.createTextNode(text, 0, text.length); 262: cursor.appendChild(c); 263: } 264: }