You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
282 lines
9.9 KiB
282 lines
9.9 KiB
/*
|
|
* Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved.
|
|
* ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*
|
|
*/
|
|
|
|
package javax.swing.text.html.parser;
|
|
|
|
import javax.swing.text.SimpleAttributeSet;
|
|
import javax.swing.text.html.HTMLEditorKit;
|
|
import javax.swing.text.html.HTML;
|
|
import javax.swing.text.ChangedCharSetException;
|
|
|
|
import java.util.*;
|
|
import java.io.*;
|
|
import java.net.*;
|
|
|
|
/**
|
|
* A Parser for HTML Documents (actually, you can specify a DTD, but
|
|
* you should really only use this class with the html dtd in swing).
|
|
* Reads an InputStream of HTML and
|
|
* invokes the appropriate methods in the ParserCallback class. This
|
|
* is the default parser used by HTMLEditorKit to parse HTML url's.
|
|
* <p>This will message the callback for all valid tags, as well as
|
|
* tags that are implied but not explicitly specified. For example, the
|
|
* html string (&lt;p&gt;blah) only has a p tag defined. The callback
|
|
* will see the following methods:
|
|
* <ol><li><i>handleStartTag(html, ...)</i></li>
|
|
* <li><i>handleStartTag(head, ...)</i></li>
|
|
* <li><i>handleEndTag(head)</i></li>
|
|
* <li><i>handleStartTag(body, ...)</i></li>
|
|
* <li>handleStartTag(p, ...)</li>
* <li>handleText(...)</li>
|
|
* <li><i>handleEndTag(p)</i></li>
|
|
* <li><i>handleEndTag(body)</i></li>
|
|
* <li><i>handleEndTag(html)</i></li>
|
|
* </ol>
|
|
* The items in <i>italic</i> are implied, that is, although they were not
|
|
* explicitly specified, to be correct html they should have been present
|
|
* (head isn't necessary, but it is still generated). For tags that
|
|
* are implied, the AttributeSet argument will have a value of
|
|
* <code>Boolean.TRUE</code> for the key
|
|
* <code>HTMLEditorKit.ParserCallback.IMPLIED</code>.
|
|
* <p>HTML.Attribute defines a type safe enumeration of html attributes.
|
|
* If an attribute key of a tag is defined in HTML.Attribute, the
|
|
* HTML.Attribute will be used as the key, otherwise a String will be used.
|
|
* For example &lt;p foo=bar class=neat&gt; has two attributes. foo is
|
|
* not defined in HTML.Attribute, whereas class is, therefore the
|
|
* AttributeSet will have two values in it, HTML.Attribute.CLASS with
|
|
* a String value of 'neat' and the String key 'foo' with a String value of
|
|
* 'bar'.
|
|
* <p>The position argument will indicate the start of the tag, comment
|
|
* or text. Similar to arrays, the first character in the stream has a
|
|
* position of 0. For tags that are
|
|
* implied the position will indicate
|
|
* the location of the next encountered tag. In the first example,
|
|
* the implied start body and html tags will have the same position as the
|
|
* p tag, and the implied end p, html and body tags will all have the same
|
|
* position.
|
|
* <p>As html skips whitespace the position for text will be the position
|
|
* of the first valid character, eg in the string '\n\n\nblah'
|
|
* the text 'blah' will have a position of 3, the newlines are skipped.
|
|
* <p>
|
|
* For attributes that do not have a value, eg in the html
|
|
* string <code>&lt;foo blah&gt;</code> the attribute <code>blah</code>
|
|
* does not have a value, there are two possible values that will be
|
|
* placed in the AttributeSet's value:
|
|
* <ul>
|
|
* <li>If the DTD does not contain a definition for the element, or the
|
|
* definition does not have an explicit value then the value in the
|
|
* AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>.
|
|
* <li>If the DTD contains an explicit value, as in:
|
|
* <code>&lt;!ATTLIST OPTION selected (selected) #IMPLIED&gt;</code>
|
|
* this value from the dtd (in this case selected) will be used.
|
|
* </ul>
|
|
* <p>
|
|
* Once the stream has been parsed, the callback is notified of the most
|
|
* likely end of line string. The end of line string will be one of
|
|
* \n, \r or \r\n, whichever is encountered the most in parsing the
|
|
* stream.
|
|
*
|
|
* @author Sunita Mani
|
|
*/
|
|
public class DocumentParser extends javax.swing.text.html.parser.Parser {
|
|
|
|
private int inbody;
|
|
private int intitle;
|
|
private int inhead;
|
|
private int instyle;
|
|
private int inscript;
|
|
private boolean seentitle;
|
|
private HTMLEditorKit.ParserCallback callback = null;
|
|
private boolean ignoreCharSet = false;
|
|
private static final boolean debugFlag = false;
|
|
|
|
public DocumentParser(DTD dtd) {
|
|
super(dtd);
|
|
}
|
|
|
|
public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException {
|
|
this.ignoreCharSet = ignoreCharSet;
|
|
this.callback = callback;
|
|
parse(in);
|
|
// end of line
|
|
callback.handleEndOfLineString(getEndOfLineString());
|
|
}
|
|
|
|
/**
|
|
* Handle Start Tag.
|
|
*/
|
|
protected void handleStartTag(TagElement tag) {
|
|
|
|
Element elem = tag.getElement();
|
|
if (elem == dtd.body) {
|
|
inbody++;
|
|
} else if (elem == dtd.html) {
|
|
} else if (elem == dtd.head) {
|
|
inhead++;
|
|
} else if (elem == dtd.title) {
|
|
intitle++;
|
|
} else if (elem == dtd.style) {
|
|
instyle++;
|
|
} else if (elem == dtd.script) {
|
|
inscript++;
|
|
}
|
|
if (debugFlag) {
|
|
if (tag.fictional()) {
|
|
debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
|
|
} else {
|
|
debug("Start Tag: " + tag.getHTMLTag() + " attributes: " +
|
|
getAttributes() + " pos: " + getCurrentPos());
|
|
}
|
|
}
|
|
if (tag.fictional()) {
|
|
SimpleAttributeSet attrs = new SimpleAttributeSet();
|
|
attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
|
|
Boolean.TRUE);
|
|
callback.handleStartTag(tag.getHTMLTag(), attrs,
|
|
getBlockStartPosition());
|
|
} else {
|
|
callback.handleStartTag(tag.getHTMLTag(), getAttributes(),
|
|
getBlockStartPosition());
|
|
flushAttributes();
|
|
}
|
|
}
|
|
|
|
|
|
protected void handleComment(char text[]) {
|
|
if (debugFlag) {
|
|
debug("comment: ->" + new String(text) + "<-"
|
|
+ " pos: " + getCurrentPos());
|
|
}
|
|
callback.handleComment(text, getBlockStartPosition());
|
|
}
|
|
|
|
/**
|
|
* Handle Empty Tag.
|
|
*/
|
|
protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
|
|
|
|
Element elem = tag.getElement();
|
|
if (elem == dtd.meta && !ignoreCharSet) {
|
|
SimpleAttributeSet atts = getAttributes();
|
|
if (atts != null) {
|
|
String content = (String)atts.getAttribute(HTML.Attribute.CONTENT);
|
|
if (content != null) {
|
|
if ("content-type".equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
|
|
if (!content.equalsIgnoreCase("text/html") &&
|
|
!content.equalsIgnoreCase("text/plain")) {
|
|
throw new ChangedCharSetException(content, false);
|
|
}
|
|
} else if ("charset" .equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
|
|
throw new ChangedCharSetException(content, true);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (inbody != 0 || elem == dtd.meta || elem == dtd.base || elem == dtd.isindex || elem == dtd.style || elem == dtd.link) {
|
|
if (debugFlag) {
|
|
if (tag.fictional()) {
|
|
debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
|
|
} else {
|
|
debug("Empty Tag: " + tag.getHTMLTag() + " attributes: "
|
|
+ getAttributes() + " pos: " + getCurrentPos());
|
|
}
|
|
}
|
|
if (tag.fictional()) {
|
|
SimpleAttributeSet attrs = new SimpleAttributeSet();
|
|
attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
|
|
Boolean.TRUE);
|
|
callback.handleSimpleTag(tag.getHTMLTag(), attrs,
|
|
getBlockStartPosition());
|
|
} else {
|
|
callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(),
|
|
getBlockStartPosition());
|
|
flushAttributes();
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Handle End Tag.
|
|
*/
|
|
protected void handleEndTag(TagElement tag) {
|
|
Element elem = tag.getElement();
|
|
if (elem == dtd.body) {
|
|
inbody--;
|
|
} else if (elem == dtd.title) {
|
|
intitle--;
|
|
seentitle = true;
|
|
} else if (elem == dtd.head) {
|
|
inhead--;
|
|
} else if (elem == dtd.style) {
|
|
instyle--;
|
|
} else if (elem == dtd.script) {
|
|
inscript--;
|
|
}
|
|
if (debugFlag) {
|
|
debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
|
|
}
|
|
callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition());
|
|
|
|
}
|
|
|
|
/**
|
|
* Handle Text.
|
|
*/
|
|
protected void handleText(char data[]) {
|
|
if (data != null) {
|
|
if (inscript != 0) {
|
|
callback.handleComment(data, getBlockStartPosition());
|
|
return;
|
|
}
|
|
if (inbody != 0 || ((instyle != 0) ||
|
|
((intitle != 0) && !seentitle))) {
|
|
if (debugFlag) {
|
|
debug("text: ->" + new String(data) + "<-" + " pos: " + getCurrentPos());
|
|
}
|
|
callback.handleText(data, getBlockStartPosition());
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Error handling.
|
|
*/
|
|
protected void handleError(int ln, String errorMsg) {
|
|
if (debugFlag) {
|
|
debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos());
|
|
}
|
|
/* PENDING: need to improve the error string. */
|
|
callback.handleError(errorMsg, getCurrentPos());
|
|
}
|
|
|
|
|
|
/*
|
|
* debug messages
|
|
*/
|
|
private void debug(String msg) {
|
|
System.out.println(msg);
|
|
}
|
|
}
|