/* LinkFilter.java --
   Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.

This file is part of GNU Classpath.

GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING.  If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA.

Linking this library statically or dynamically with other modules is
making a combined work based on this library.  Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.

As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module.  An independent module is a module which is not derived from
or based on this library.  If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so.  If you do not wish to do so, delete this
exception statement from your version.
*/

package gnu.xml.pipeline;

import java.io.IOException;
import java.net.URL;
import java.util.Enumeration;
import java.util.Vector;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;


/**
 * Pipeline filter to remember XHTML links found in a document,
 * so they can later be crawled.  Fragments are not counted, and duplicates
 * are ignored.  Callers are responsible for filtering out URLs they aren't
 * interested in.  Events are passed through unmodified.
 *
 * <p> Input MUST include a setDocumentLocator() call, as it's used to
 * resolve relative links in the absence of a "base" element.  Input MUST
 * also include namespace identifiers, since it is the XHTML namespace
 * identifier which is used to identify the relevant elements.
 *
 * <p><em>FIXME:</em> handle xml:base attribute ... in association with
 * a stack of base URIs.  Similarly, recognize/support XLink data.
 *
 * @author David Brownell
 */
public class LinkFilter extends EventFilter
{
    // Namespace URI an element must carry to be examined for links.
    private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";

    // for storing URIs (resolved, fragment-free, as Strings)
    private Vector vector = new Vector ();

        // struct for "full" link record (tbd)
        // these for troubleshooting original source:
        //      original uri
        //      uri as resolved (base, relative, etc)
        //      URI of originating doc
        //      line #
        //      original element + attrs (img src, desc, etc)

        // XLink model of the link ... for inter-site pairups ?

    // Base URI taken from the most recent XHTML "base" element, or null
    // when none has been seen in the current document.
    private String baseURI;

    // When true, links that change protocol or host relative to the base
    // are not recorded.  Nothing in this class ever sets it, so the
    // restriction is currently always off.
    private boolean siteRestricted = false;

    //
    // XXX leverage blacklist info (like robots.txt)
    //
    // XXX constructor w/param ... pipeline for sending link data
    // probably XHTML --> XLink, providing info as sketched above
    //


    /**
     * Constructs a new event filter, which collects links in private data
     * structure for later enumeration.
     */
        // constructor used by PipelineFactory
    public LinkFilter ()
    {
        // Install this object as the content handler so that the
        // overrides below see the document events.
        super.setContentHandler (this);
    }


    /**
     * Constructs a new event filter, which collects links in private data
     * structure for later enumeration and passes all events, unmodified,
     * to the next consumer.
     *
     * @param next the consumer that receives events after this filter
     */
        // constructor used by PipelineFactory
    public LinkFilter (EventConsumer next)
    {
        super (next);
        // Install this object as the content handler so that the
        // overrides below see the document events.
        super.setContentHandler (this);
    }


    /**
     * Returns an enumeration of the links found since the filter
     * was constructed, or since removeAllLinks() was called.
     *
     * @return enumeration of strings.
     */
    public Enumeration getLinks ()
    {
        return vector.elements ();
    }

    /**
     * Removes records about all links reported to the event
     * stream, as if the filter were newly created.
     */
    public void removeAllLinks ()
    {
        // A fresh Vector is installed instead of clearing the old one, so
        // the Vector behind any previously returned enumeration is left
        // untouched.
        vector = new Vector ();
    }


    /**
     * Collects URIs for (X)HTML content from elements which hold them,
     * then passes the event on unchanged.
     *
     * <p> Only elements in the XHTML namespace are examined:
     * <ul>
     * <li> "a", "base", "area": the "href" attribute
     * <li> "iframe", "frame": the "src" attribute
     * <li> "blockquote", "q", "ins", "del": the "cite" attribute
     * <li> "iframe", "img": additionally the "longdesc" attribute
     * </ul>
     * A usable "base" link also becomes the base URI against which
     * later links are resolved.
     *
     * @param uri namespace URI of the element
     * @param localName local (unprefixed) element name
     * @param qName qualified element name, passed through untouched
     * @param atts the element's attributes
     * @throws SAXException if thrown while passing the event on
     */
    public void startElement (
        String          uri,
        String          localName,
        String          qName,
        Attributes      atts
    ) throws SAXException
    {
        String  link;

        // Recognize XHTML links.
        if (XHTML_NS.equals (uri)) {

            if ("a".equals (localName) || "base".equals (localName)
                    || "area".equals (localName))
                link = atts.getValue ("href");
            else if ("iframe".equals (localName) || "frame".equals (localName))
                link = atts.getValue ("src");
            else if ("blockquote".equals (localName) || "q".equals (localName)
                    || "ins".equals (localName) || "del".equals (localName))
                link = atts.getValue ("cite");
            else
                link = null;

            // From here on "link" is the resolved form, or null if unusable.
            link = maybeAddLink (link);

            // "base" modifies designated baseURI
            if ("base".equals (localName) && link != null)
                baseURI = link;

            if ("iframe".equals (localName) || "img".equals (localName))
                maybeAddLink (atts.getValue ("longdesc"));
        }

        super.startElement (uri, localName, qName, atts);
    }

    /**
     * Resolves a link against the current base and records it unless it
     * is empty, a pure fragment reference, a duplicate, or (when site
     * restriction is on) points at another protocol or host.
     *
     * @param link the raw attribute value; may be null
     * @return the resolved, fragment-free URL as a string, whether or not
     *  it was newly recorded; null if the link was null, empty once its
     *  fragment was stripped, or could not be turned into a URL
     */
    private String maybeAddLink (String link)
    {
        int             index;

        // ignore empty links and fragments inside docs
        if (link == null)
            return null;
        if ((index = link.indexOf ("#")) >= 0)
            link = link.substring (0, index);
        if (link.equals (""))
            return null;

        try {
            // get the real URI; without a "base" element the document's
            // system ID supplies the base
            URL         base = new URL ((baseURI != null)
                                    ? baseURI
                                    : getDocumentLocator ().getSystemId ());
            URL         url = new URL (base, link);

            link = url.toString ();

            // ignore duplicates
            if (vector.contains (link))
                return link;

            // other than what "base" does, stick to original site:
            if (siteRestricted) {
                // don't switch protocols
                if (!base.getProtocol ().equals (url.getProtocol ()))
                    return link;
                // don't switch servers
                if (base.getHost () != null
                        && !base.getHost ().equals (url.getHost ()))
                    return link;
            }

            vector.addElement (link);

            return link;

        } catch (IOException e) {
            // bad URLs we don't want (the URL constructors throw
            // MalformedURLException, an IOException subclass)
        }
        return null;
    }

    /**
     * Reports an error if no Locator has been made available, and
     * otherwise passes the event on like every other event.
     *
     * @throws SAXException if no document locator has been set, or if
     *  thrown while passing the event on
     */
    public void startDocument ()
    throws SAXException
    {
        if (getDocumentLocator () == null)
            throw new SAXException ("no Locator!");

        // Delegate to the superclass, as startElement() and endDocument()
        // do; without this call the start-of-document event stopped here
        // instead of being passed through.
        super.startDocument ();
    }

    /**
     * Forgets about any base URI information that may be recorded.
     * Applications will often want to call removeAllLinks(), likely
     * after examining the links which were reported.
     *
     * @throws SAXException if thrown while passing the event on
     */
    public void endDocument ()
    throws SAXException
    {
        baseURI = null;
        super.endDocument ();
    }
}