// XmlParser.java: the main parser class.
// NO WARRANTY! See README, and copyright below.
// $Id: XmlParser.java,v 1.1.1.1 2001/08/20 22:31:30 gfx Exp $
package com.microstar.xml;
import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Stack;
/**
* Parse XML documents and return parse events through call-backs.
*
You need to define a class implementing the XmlHandler
* interface: an object belonging to this class will receive the
* callbacks for the events. (As an alternative to implementing
* the full XmlHandler interface, you can simply extend the
* HandlerBase convenience class.)
*
Usage (assuming that MyHandler is your implementation
* of the XmlHandler interface):
*
Alternatively, you can use the standard SAX interfaces
* with the SAXDriver class as your entry point.
* @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
* @author Written by David Megginson <dmeggins@microstar.com>
* @version 1.1
* @see XmlHandler
* @see HandlerBase
* @see SAXDriver
*/
public class XmlParser {
//
// Use special cheats that speed up the code (currently about 50%),
// but may cause problems with future maintenance and add to the
// class file size (about 500 bytes).
//
private final static boolean USE_CHEATS = true;
//////////////////////////////////////////////////////////////////////
// Constructors.
////////////////////////////////////////////////////////////////////////
/**
* Construct a new parser with no associated handler.
* @see #setHandler
* @see #parse
*/
public XmlParser ()
{
}
/**
* Set the handler that will receive parsing events.
* @param handler The handler to receive callback events.
* @see #parse
* @see XmlHandler
*/
public void setHandler (XmlHandler handler)
{
this.handler = handler;
}
/**
* Parse an XML document from a URI.
*
You may parse a document more than once, but only one thread
* may call this method for an object at one time.
* @param systemId The URI of the document.
* @param publicId The public identifier of the document, or null.
* @param encoding The suggested encoding, or null if unknown.
* @exception java.lang.Exception Any exception thrown by your
* own handlers, or any derivation of java.io.IOException
* thrown by the parser itself.
*/
public void parse (String systemId, String publicId, String encoding)
throws java.lang.Exception
{
doParse(systemId, publicId, null, null, encoding);
}
/**
* Parse an XML document from a byte stream.
*
The URI that you supply will become the base URI for
* resolving relative links, but Ælfred will actually read
* the document from the supplied input stream.
*
You may parse a document more than once, but only one thread
* may call this method for an object at one time.
* @param systemId The base URI of the document, or null if not
* known.
* @param publicId The public identifier of the document, or null
* if not known.
* @param stream A byte input stream.
* @param encoding The suggested encoding, or null if unknown.
* @exception java.lang.Exception Any exception thrown by your
* own handlers, or any derivation of java.io.IOException
* thrown by the parser itself.
*/
public void parse (String systemId, String publicId,
InputStream stream, String encoding)
throws java.lang.Exception
{
doParse(systemId, publicId, null, stream, encoding);
}
/**
* Parse an XML document from a character stream.
*
The URI that you supply will become the base URI for
* resolving relative links, but Ælfred will actually read
* the document from the supplied input stream.
*
You may parse a document more than once, but only one thread
* may call this method for an object at one time.
* @param systemId The base URI of the document, or null if not
* known.
* @param publicId The public identifier of the document, or null
* if not known.
* @param reader A character stream.
* @exception java.lang.Exception Any exception thrown by your
* own handlers, or any derivation of java.io.IOException
* thrown by the parser itself.
*/
public void parse (String systemId, String publicId, Reader reader)
throws java.lang.Exception
{
doParse(systemId, publicId, reader, null, null);
}
private synchronized void doParse (String systemId, String publicId,
Reader reader, InputStream stream,
String encoding)
throws java.lang.Exception
{
basePublicId = publicId;
baseURI = systemId;
baseReader = reader;
baseInputStream = stream;
initializeVariables();
// Set the default entities here.
setInternalEntity(intern("amp"), "&");
setInternalEntity(intern("lt"), "<");
setInternalEntity(intern("gt"), ">");
setInternalEntity(intern("apos"), "'");
setInternalEntity(intern("quot"), """);
if (handler != null) {
handler.startDocument();
}
pushURL("[document]", basePublicId, baseURI, baseReader, baseInputStream,
encoding);
parseDocument();
if (handler != null) {
handler.endDocument();
}
cleanupVariables();
}
////////////////////////////////////////////////////////////////////////
// Constants.
////////////////////////////////////////////////////////////////////////
//
// Constants for element content type.
//
/**
* Constant: an element has not been declared.
* @see #getElementContentType
*/
public final static int CONTENT_UNDECLARED = 0;
/**
* Constant: the element has a content model of ANY.
* @see #getElementContentType
*/
public final static int CONTENT_ANY = 1;
/**
* Constant: the element has declared content of EMPTY.
* @see #getElementContentType
*/
public final static int CONTENT_EMPTY = 2;
/**
* Constant: the element has mixed content.
* @see #getElementContentType
*/
public final static int CONTENT_MIXED = 3;
/**
* Constant: the element has element content.
* @see #getElementContentType
*/
public final static int CONTENT_ELEMENTS = 4;
//
// Constants for the entity type.
//
/**
* Constant: the entity has not been declared.
* @see #getEntityType
*/
public final static int ENTITY_UNDECLARED = 0;
/**
* Constant: the entity is internal.
* @see #getEntityType
*/
public final static int ENTITY_INTERNAL = 1;
/**
* Constant: the entity is external, non-XML data.
* @see #getEntityType
*/
public final static int ENTITY_NDATA = 2;
/**
* Constant: the entity is external XML data.
* @see #getEntityType
*/
public final static int ENTITY_TEXT = 3;
//
// Constants for attribute type.
//
/**
* Constant: the attribute has not been declared for this element type.
* @see #getAttributeType
*/
public final static int ATTRIBUTE_UNDECLARED = 0;
/**
* Constant: the attribute value is a string value.
* @see #getAttributeType
*/
public final static int ATTRIBUTE_CDATA = 1;
/**
* Constant: the attribute value is a unique identifier.
* @see #getAttributeType
*/
public final static int ATTRIBUTE_ID = 2;
/**
* Constant: the attribute value is a reference to a unique identifier.
* @see #getAttributeType
*/
public final static int ATTRIBUTE_IDREF = 3;
/**
* Constant: the attribute value is a list of ID references.
* @see #getAttributeType
*/
public final static int ATTRIBUTE_IDREFS = 4;
/**
* Constant: the attribute value is the name of an entity.
* @see #getAttributeType
*/
public final static int ATTRIBUTE_ENTITY = 5;
/**
* Constant: the attribute value is a list of entity names.
* @see #getAttributeType
*/
public final static int ATTRIBUTE_ENTITIES = 6;
/**
* Constant: the attribute value is a name token.
* @see #getAttributeType
*/
public final static int ATTRIBUTE_NMTOKEN = 7;
/**
* Constant: the attribute value is a list of name tokens.
* @see #getAttributeType
*/
public final static int ATTRIBUTE_NMTOKENS = 8;
/**
* Constant: the attribute value is a token from an enumeration.
* @see #getAttributeType
*/
public final static int ATTRIBUTE_ENUMERATED = 9;
/**
* Constant: the attribute is the name of a notation.
* @see #getAttributeType
*/
public final static int ATTRIBUTE_NOTATION = 10;
//
// When the class is loaded, populate the hash table of
// attribute types.
//
/**
* Hash table of attribute types.
*/
private static Hashtable attributeTypeHash;
static {
attributeTypeHash = new Hashtable();
attributeTypeHash.put("CDATA", new Integer(ATTRIBUTE_CDATA));
attributeTypeHash.put("ID", new Integer(ATTRIBUTE_ID));
attributeTypeHash.put("IDREF", new Integer(ATTRIBUTE_IDREF));
attributeTypeHash.put("IDREFS", new Integer(ATTRIBUTE_IDREFS));
attributeTypeHash.put("ENTITY", new Integer(ATTRIBUTE_ENTITY));
attributeTypeHash.put("ENTITIES", new Integer(ATTRIBUTE_ENTITIES));
attributeTypeHash.put("NMTOKEN", new Integer(ATTRIBUTE_NMTOKEN));
attributeTypeHash.put("NMTOKENS", new Integer(ATTRIBUTE_NMTOKENS));
attributeTypeHash.put("NOTATION", new Integer(ATTRIBUTE_NOTATION));
}
//
// Constants for supported encodings.
//
private final static int ENCODING_UTF_8 = 1;
private final static int ENCODING_ISO_8859_1 = 2;
private final static int ENCODING_UCS_2_12 = 3;
private final static int ENCODING_UCS_2_21 = 4;
private final static int ENCODING_UCS_4_1234 = 5;
private final static int ENCODING_UCS_4_4321 = 6;
private final static int ENCODING_UCS_4_2143 = 7;
private final static int ENCODING_UCS_4_3412 = 8;
//
// Constants for attribute default value.
//
/**
* Constant: the attribute is not declared.
* @see #getAttributeDefaultValueType
*/
public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0;
/**
* Constant: the attribute has a literal default value specified.
* @see #getAttributeDefaultValueType
* @see #getAttributeDefaultValue
*/
public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1;
/**
* Constant: the attribute was declared #IMPLIED.
* @see #getAttributeDefaultValueType
*/
public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2;
/**
* Constant: the attribute was declared #REQUIRED.
* @see #getAttributeDefaultValueType
*/
public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3;
/**
* Constant: the attribute was declared #FIXED.
* @see #getAttributeDefaultValueType
* @see #getAttributeDefaultValue
*/
public final static int ATTRIBUTE_DEFAULT_FIXED = 4;
//
// Constants for input.
//
private final static int INPUT_NONE = 0;
private final static int INPUT_INTERNAL = 1;
private final static int INPUT_EXTERNAL = 2;
private final static int INPUT_STREAM = 3;
private final static int INPUT_BUFFER = 4;
private final static int INPUT_READER = 5;
//
// Flags for reading literals.
//
private final static int LIT_CHAR_REF = 1;
private final static int LIT_ENTITY_REF = 2;
private final static int LIT_PE_REF = 4;
private final static int LIT_NORMALIZE = 8;
//
// Flags for parsing context.
//
private final static int CONTEXT_NONE = 0;
private final static int CONTEXT_DTD = 1;
private final static int CONTEXT_ENTITYVALUE = 2;
private final static int CONTEXT_ATTRIBUTEVALUE = 3;
//////////////////////////////////////////////////////////////////////
// Error reporting.
//////////////////////////////////////////////////////////////////////
/**
* Report an error.
* @param message The error message.
* @param textFound The text that caused the error (or null).
* @see XmlHandler#error
* @see #line
*/
void error (String message, String textFound, String textExpected)
throws java.lang.Exception
{
errorCount++;
if (textFound != null) {
message = message + " (found \"" + textFound + "\")";
}
if (textExpected != null) {
message = message + " (expected \"" + textExpected + "\")";
}
if (handler != null) {
String uri = null;
if (externalEntity != null) {
uri = externalEntity.getURL().toString();
}
handler.error(message, uri, line, column);
}
}
/**
* Report a serious error.
* @param message The error message.
* @param textFound The text that caused the error (or null).
*/
void error (String message, char textFound, String textExpected)
throws java.lang.Exception
{
error(message, new Character(textFound).toString(), textExpected);
}
//////////////////////////////////////////////////////////////////////
// Major syntactic productions.
//////////////////////////////////////////////////////////////////////
/**
* Parse an XML document.
*
* [1] document ::= prolog element Misc*
*
*
This is the top-level parsing function for a single XML
* document. As a minimum, a well-formed document must have
* a document element, and a valid document must have a prolog
* as well.
*/
void parseDocument ()
throws java.lang.Exception
{
char c;
parseProlog();
require('<');
parseElement();
try
{
parseMisc(); //skip all white, PIs, and comments
c=readCh(); //if this doesn't throw an exception...
error("unexpected characters after document end",c,null);
}
catch (EOFException e)
{return;}
}
/**
* Skip a comment.
*
(The <!-- has already been read.)
*/
void parseComment ()
throws java.lang.Exception
{
skipUntil("-->");
}
/**
* Parse a processing instruction and do a call-back.
*
* [19] PI ::= '<?' Name (S (Char* - (Char* '?>' Char*)))? '?>'
*
*
(The <? has already been read.)
*
An XML processing instruction must begin with
* a Name, which is the instruction's target.
*/
void parsePI ()
throws java.lang.Exception
{
String name;
name = readNmtoken(true);
if (!tryRead("?>")) {
requireWhitespace();
parseUntil("?>");
}
if (handler != null) {
handler.processingInstruction(name, dataBufferToString());
}
}
/**
* Parse a CDATA marked section.
*
Note that this just appends characters to the dataBuffer,
* without actually generating an event.
*/
void parseCDSect ()
throws java.lang.Exception
{
parseUntil("]]>");
}
/**
* Parse the prolog of an XML document.
*
There are a couple of tricks here. First, it is necessary to
* declare the XML default attributes after the DTD (if present)
* has been read. Second, it is not possible to expand general
* references in attribute value literals until after the entire
* DTD (if present) has been parsed.
*
We do not look for the XML declaration here, because it is
* handled by pushURL().
* @see pushURL
*/
void parseProlog ()
throws java.lang.Exception
{
parseMisc();
if (tryRead("
* [25] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
* [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
* [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
* | S 'standalone' Eq '"' ("yes" | "no") '"'
* [78] EncodingDecl ::= S 'encoding' Eq QEncoding
*
*
([80] to [82] are also significant.)
*
(The <?xml and whitespace have already been read.)
*
TODO: validate value of standalone.
* @see #parseTextDecl
* @see #checkEncoding
*/
void parseXMLDecl (boolean ignoreEncoding)
throws java.lang.Exception
{
String version;
String encodingName = null;
String standalone = null;
// Read the version.
require("version");
parseEq();
version = readLiteral(0);
if (!version.equals("1.0")) {
error("unsupported XML version", version, "1.0");
}
// Try reading an encoding declaration.
skipWhitespace();
if (tryRead("encoding")) {
parseEq();
encodingName = readLiteral(0);
checkEncoding(encodingName, ignoreEncoding);
}
// Try reading a standalone declaration
skipWhitespace();
if (tryRead("standalone")) {
parseEq();
standalone = readLiteral(0);
}
skipWhitespace();
require("?>");
}
/**
* Parse the Encoding PI.
*
(The <?xml' and whitespace have already been read.)
* @see #parseXMLDecl
* @see #checkEncoding
*/
void parseTextDecl (boolean ignoreEncoding)
throws java.lang.Exception
{
String encodingName = null;
// Read an optional version.
if (tryRead("version")) {
String version;
parseEq();
version = readLiteral(0);
if (!version.equals("1.0")) {
error("unsupported XML version", version, "1.0");
}
requireWhitespace();
}
// Read the encoding.
require("encoding");
parseEq();
encodingName = readLiteral(0);
checkEncoding(encodingName, ignoreEncoding);
skipWhitespace();
require("?>");
}
/**
* Check that the encoding specified makes sense.
*
Compare what the author has specified in the XML declaration
* or encoding PI with what we have detected.
*
This is also important for distinguishing among the various
* 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect
* those).
* @param encodingName The name of the encoding specified by the user.
* @see #parseXMLDecl
* @see #parseTextDecl
*/
void checkEncoding (String encodingName, boolean ignoreEncoding)
throws java.lang.Exception
{
encodingName = encodingName.toUpperCase();
if (ignoreEncoding) {
return;
}
switch (encoding) {
// 8-bit encodings
case ENCODING_UTF_8:
if (encodingName.equals("ISO-8859-1")) {
encoding = ENCODING_ISO_8859_1;
} else if (!encodingName.equals("UTF-8")) {
error("unsupported 8-bit encoding",
encodingName,
"UTF-8 or ISO-8859-1");
}
break;
// 16-bit encodings
case ENCODING_UCS_2_12:
case ENCODING_UCS_2_21:
if (!encodingName.equals("ISO-10646-UCS-2") &&
!encodingName.equals("UTF-16")) {
error("unsupported 16-bit encoding",
encodingName,
"ISO-10646-UCS-2");
}
break;
// 32-bit encodings
case ENCODING_UCS_4_1234:
case ENCODING_UCS_4_4321:
case ENCODING_UCS_4_2143:
case ENCODING_UCS_4_3412:
if (!encodingName.equals("ISO-10646-UCS-4")) {
error("unsupported 32-bit encoding",
encodingName,
"ISO-10646-UCS-4");
}
}
}
/**
* Parse miscellaneous markup outside the document element and DOCTYPE
* declaration.
*
* [27] Misc ::= Comment | PI | S
*
*/
void parseMisc ()
throws java.lang.Exception
{
while (true)
{
skipWhitespace();
if (tryRead(""))
{parsePI();}
else if (tryRead("
Copyright 1998-2008 Alvin Alexander
All Rights Reserved.
devdaily.com is based in louisville, kentucky, and this web site is hosted by godaddy.com