info.bliki.wiki.dump
Class WikiXMLParser

java.lang.Object
  extended by org.xml.sax.helpers.DefaultHandler
      extended by info.bliki.wiki.dump.WikiXMLParser
All Implemented Interfaces:
org.xml.sax.ContentHandler, org.xml.sax.DTDHandler, org.xml.sax.EntityResolver, org.xml.sax.ErrorHandler

public class WikiXMLParser
extends org.xml.sax.helpers.DefaultHandler

A Wikipedia XML dump file parser. Original version used with permission from Marco Schmidt. See: http://schmidt.devlib.org/software/lucene-wikipedia.html

Author:
Marco Schmidt

Constructor Summary
WikiXMLParser(java.io.InputStream inputStream, IArticleFilter filter)
           
WikiXMLParser(java.lang.String filename, IArticleFilter filter)
           
 
Method Summary
 void characters(char[] ch, int start, int length)
          Parses an unlimited number of characters between two enclosing XML tags.
 void endDocument()
           
 void endElement(java.lang.String uri, java.lang.String name, java.lang.String qName)
           
 void parse()
           
 void startDocument()
           
 void startElement(java.lang.String namespaceURI, java.lang.String localName, java.lang.String qName, org.xml.sax.Attributes atts)
           
 
Methods inherited from class org.xml.sax.helpers.DefaultHandler
endPrefixMapping, error, fatalError, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startPrefixMapping, unparsedEntityDecl, warning
 
Methods inherited from class java.lang.Object
equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

WikiXMLParser

public WikiXMLParser(java.lang.String filename,
                     IArticleFilter filter)
              throws org.xml.sax.SAXException,
                     java.io.FileNotFoundException
Throws:
org.xml.sax.SAXException
java.io.FileNotFoundException

WikiXMLParser

public WikiXMLParser(java.io.InputStream inputStream,
                     IArticleFilter filter)
              throws org.xml.sax.SAXException
Throws:
org.xml.sax.SAXException
Method Detail

startDocument

public void startDocument()
Specified by:
startDocument in interface org.xml.sax.ContentHandler
Overrides:
startDocument in class org.xml.sax.helpers.DefaultHandler

endDocument

public void endDocument()
Specified by:
endDocument in interface org.xml.sax.ContentHandler
Overrides:
endDocument in class org.xml.sax.helpers.DefaultHandler

startElement

public void startElement(java.lang.String namespaceURI,
                         java.lang.String localName,
                         java.lang.String qName,
                         org.xml.sax.Attributes atts)
Specified by:
startElement in interface org.xml.sax.ContentHandler
Overrides:
startElement in class org.xml.sax.helpers.DefaultHandler

endElement

public void endElement(java.lang.String uri,
                       java.lang.String name,
                       java.lang.String qName)
Specified by:
endElement in interface org.xml.sax.ContentHandler
Overrides:
endElement in class org.xml.sax.helpers.DefaultHandler

characters

public void characters(char[] ch,
                       int start,
                       int length)
                throws org.xml.sax.SAXException
Parses an unlimited number of characters between two enclosing XML tags.

Specified by:
characters in interface org.xml.sax.ContentHandler
Overrides:
characters in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException
See Also:
DefaultHandler.characters(char[], int, int)

parse

public void parse()
           throws java.io.IOException,
                  org.xml.sax.SAXException
Throws:
java.io.IOException
org.xml.sax.SAXException