Source for gnu.xml.pipeline.LinkFilter

   1: /* LinkFilter.java --
   2:    Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.xml.pipeline;
  39: 
  40: import java.io.IOException;
  41: import java.net.URL;
  42: import java.util.Enumeration;
  43: import java.util.Vector;
  44: 
  45: import org.xml.sax.Attributes;
  46: import org.xml.sax.SAXException;
  47: 
  48: 
  49: /**
  50:  * Pipeline filter to remember XHTML links found in a document,
  51:  * so they can later be crawled.  Fragments are not counted, and duplicates
  52:  * are ignored.  Callers are responsible for filtering out URLs they aren't
  53:  * interested in.  Events are passed through unmodified.
  54:  *
  55:  * <p> Input MUST include a setDocumentLocator() call, as it's used to
  56:  * resolve relative links in the absence of a "base" element.  Input MUST
  57:  * also include namespace identifiers, since it is the XHTML namespace
  58:  * identifier which is used to identify the relevant elements.
  59:  *
  60:  * <p><em>FIXME:</em> handle xml:base attribute ... in association with
  61:  * a stack of base URIs.  Similarly, recognize/support XLink data.
  62:  *
  63:  * @author David Brownell
  64:  */
  65: public class LinkFilter extends EventFilter
  66: {
  67:     // for storing URIs
  68:     private Vector              vector = new Vector ();
  69: 
  70:         // struct for "full" link record (tbd)
  71:         // these for troubleshooting original source:
  72:         //      original uri
  73:         //      uri as resolved (base, relative, etc)
  74:         //      URI of originating doc
  75:         //      line #
  76:         //      original element + attrs (img src, desc, etc)
  77: 
  78:         // XLink model of the link ... for inter-site pairups ?
  79: 
  80:     private String              baseURI;
  81: 
  82:     private boolean             siteRestricted = false;
  83: 
  84:     //
  85:     // XXX leverage blacklist info (like robots.txt)
  86:     //
  87:     // XXX constructor w/param ... pipeline for sending link data
  88:     // probably XHTML --> XLink, providing info as sketched above
  89:     //
  90: 
  91: 
  92:     /**
  93:      * Constructs a new event filter, which collects links in private data
  94:      * structure for later enumeration.
  95:      */
  96:         // constructor used by PipelineFactory
  97:     public LinkFilter ()
  98:     {
  99:         super.setContentHandler (this);
 100:     }
 101: 
 102: 
 103:     /**
 104:      * Constructs a new event filter, which collects links in private data
 105:      * structure for later enumeration and passes all events, unmodified,
 106:      * to the next consumer.
 107:      */
 108:         // constructor used by PipelineFactory
 109:     public LinkFilter (EventConsumer next)
 110:     {
 111:         super (next);
 112:         super.setContentHandler (this);
 113:     }
 114: 
 115: 
 116:     /**
 117:      * Returns an enumeration of the links found since the filter
 118:      * was constructed, or since removeAllLinks() was called.
 119:      *
 120:      * @return enumeration of strings.
 121:      */
 122:     public Enumeration getLinks ()
 123:     {
 124:         return vector.elements ();
 125:     }
 126: 
 127:     /**
 128:      * Removes records about all links reported to the event
 129:      * stream, as if the filter were newly created.
 130:      */
 131:     public void removeAllLinks ()
 132:     {
 133:         vector = new Vector ();
 134:     }
 135: 
 136: 
 137:     /**
 138:      * Collects URIs for (X)HTML content from elements which hold them.
 139:      */
 140:     public void startElement (
 141:         String          uri,
 142:         String          localName,
 143:         String          qName,
 144:         Attributes      atts
 145:     ) throws SAXException
 146:     {
 147:         String  link;
 148: 
 149:         // Recognize XHTML links.
 150:         if ("http://www.w3.org/1999/xhtml".equals (uri)) {
 151: 
 152:             if ("a".equals (localName) || "base".equals (localName)
 153:                     || "area".equals (localName))
 154:                 link = atts.getValue ("href");
 155:             else if ("iframe".equals (localName) || "frame".equals (localName))
 156:                 link = atts.getValue ("src");
 157:             else if ("blockquote".equals (localName) || "q".equals (localName)
 158:                     || "ins".equals (localName) || "del".equals (localName))
 159:                 link = atts.getValue ("cite");
 160:             else
 161:                 link = null;
 162:             link = maybeAddLink (link);
 163: 
 164:             // "base" modifies designated baseURI
 165:             if ("base".equals (localName) && link != null)
 166:                 baseURI = link;
 167: 
 168:             if ("iframe".equals (localName) || "img".equals (localName))
 169:                 maybeAddLink (atts.getValue ("longdesc"));
 170:         }
 171: 
 172:         super.startElement (uri, localName, qName, atts);
 173:     }
 174: 
 175:     private String maybeAddLink (String link)
 176:     {
 177:         int             index;
 178: 
 179:         // ignore empty links and fragments inside docs
 180:         if (link == null)
 181:             return null;
 182:         if ((index = link.indexOf ("#")) >= 0)
 183:             link = link.substring (0, index);
 184:         if (link.equals (""))
 185:             return null;
 186: 
 187:         try {
 188:             // get the real URI
 189:             URL         base = new URL ((baseURI != null)
 190:                                     ? baseURI
 191:                                     : getDocumentLocator ().getSystemId ());
 192:             URL         url = new URL (base, link);
 193: 
 194:             link = url.toString ();
 195: 
 196:             // ignore duplicates
 197:             if (vector.contains (link))
 198:                 return link;
 199: 
 200:             // other than what "base" does, stick to original site:
 201:             if (siteRestricted) {
 202:                 // don't switch protocols
 203:                 if (!base.getProtocol ().equals (url.getProtocol ()))
 204:                     return link;
 205:                 // don't switch servers
 206:                 if (base.getHost () != null
 207:                         && !base.getHost ().equals (url.getHost ()))
 208:                     return link;
 209:             }
 210: 
 211:             vector.addElement (link);
 212: 
 213:             return link;
 214: 
 215:         } catch (IOException e) {
 216:             // bad URLs we don't want
 217:         }
 218:         return null;
 219:     }
 220: 
 221:     /**
 222:      * Reports an error if no Locator has been made available.
 223:      */
 224:     public void startDocument ()
 225:     throws SAXException
 226:     {
 227:         if (getDocumentLocator () == null)
 228:             throw new SAXException ("no Locator!");
 229:     }
 230: 
 231:     /**
 232:      * Forgets about any base URI information that may be recorded.
 233:      * Applications will often want to call removeAllLinks(), likely
 234:      * after examining the links which were reported.
 235:      */
 236:     public void endDocument ()
 237:     throws SAXException
 238:     {
 239:         baseURI = null;
 240:         super.endDocument ();
 241:     }
 242: }