* This class is intended to parse markup languages, not to validate them.
* "Malformed" data is interpreted as graciously as possible, in order to
* extract as much information as possible. For instance: spaces are
* allowed between the "<" and the tag name, values in tags do not need
* to be quoted, and unbalanced quotes are accepted.
*
* One type of "malformed" data specifically not handled is a quoted
* ">" character occurring within the body of a tag. Even if it is
* quoted, a ">" in the attributes of a tag will be interpreted as the
* end of the tag. For example, the single tag <img src='foo.jpg'
* alt='xyz > abc'> will be erroneously broken by
* this parser into two tokens:
*   "<img src='foo.jpg' alt='xyz >"
*   " abc'>"
*
* This class also may not properly parse all well-formed XML tags, such
* as tags with extended paired delimiters
* For comments, the "tag" is the first word in the comment. This can
* be used to help parse comments that are structured similar to regular
* tags, such as server-side include comments like
*
* Any quote marks in the body, either single or double quotes, are
* left on the values, so that the values can be easily re-emitted
* and still form a valid body.
*
* For names that have no associated value in the tag, the value is
* stored as the empty string "". Therefore, the two tags
*
* Example use: to help the parser in circumstances such as the HTML
* "<script>" tag where the script body doesn't obey the rules
* because it might contain lone "<" or ">" characters, which this
* parser would interpret as the start or end of funny-looking tags.
*
* @return The unparsed remainder of the string.
*
* @see #replace
*/
public String rest() {
    // Once the parser has flagged itself as done there is nothing left to hand back.
    if (done) {
        return null;
    }
    // Everything from the end of the current token onwards is still unparsed.
    return str.substring(tokenEnd);
}
/**
 * Changes the string that this LexML is parsing.
 *
 * Example use: the caller decided to parse part of the body,
 * and now wants this LexML to pick up and parse the rest of it.
 *
 * @param str
 *        The string that this LexML should now parse. Whatever
 *        string this LexML was parsing is forgotten, and it now
 *        starts parsing at the beginning of the new string.
 *        A {@code null} string is treated as having no tokens.
 *
 * @see #rest
 */
public void
replace(String str)
{
    this.type = STRING;
    this.str = str;
    this.tokenStart = 0;
    this.tokenEnd = 0;
    this.tagStart = 0;   // forces split() to recompute for the next token
    this.singleton = false;
    // Always reset strEnd, even for null. Otherwise the length of the
    // previous string survives and nextToken() would get past its
    // end-of-input check and dereference the null string.
    this.strEnd = (str == null) ? 0 : str.length();
    this.done = (tokenEnd >= strEnd);
}
/*
 * Returns the index of the first character in str[i, end) that is NOT
 * one of the characters in pattern, or end if every character matches.
 */
private int skip(String pattern, String str, int i, int end) {
    int pos = i;
    while ((pos < end) && (pattern.indexOf(str.charAt(pos)) >= 0)) {
        pos++;
    }
    return pos;
}
/*
 * Returns the index of the first character in str[i, end) that IS one
 * of the characters in pattern, or end if no such character exists.
 */
private int next(String pattern, String str, int i, int end) {
    int pos = i;
    while ((pos < end) && (pattern.indexOf(str.charAt(pos)) < 0)) {
        pos++;
    }
    return pos;
}
}
* <& and &>, <? and ?>, or <![CDATA[ and ]]>.
* Additionally, XML tags that have embedded comments containing the
* ">" character will not be parsed correctly (for example:
* <!DOCTYPE foo SYSTEM -- a > b -- foo.dtd>),
* since the ">" in the comment will be interpreted as
* the end of the declaration tag, for the same reason mentioned
* above.
*
* Note: this behavior may be changed on a per-application basis by
* overriding the findClose method in a subclass.
*
* @author Colin Stevens (colin.stevens@sun.com)
* @version 2.8
*/
public class LexML
{
/**
 * The value returned by {@link #getType} for comment tokens.
 */
public static final int COMMENT = 0;
/**
 * The value returned by {@link #getType} for tag tokens.
 */
public static final int TAG = 1;
/**
 * The value returned by {@link #getType} for string tokens.
 */
public static final int STRING = 2;
// Characters treated as whitespace when splitting a tag body.
private static final String SPACE = " \t\r\n";
// Whitespace plus '=', the characters that terminate an attribute name.
private static final String SPACE_EQUAL = SPACE + "=";
// Type of the current token: COMMENT, TAG or STRING.
protected int type;
boolean singleton; // true when split() found a '/' just before the end of the tag body
boolean done; // set when we run out of tokens; getToken/getBody/rest then return null
protected String str; // The string we are scanning
// Index in str of the first character of the current token.
protected int tokenStart;
// Index in str just past the last character of the current token.
protected int tokenEnd;
// Length of str; kept in step when nextToken() appends a missing terminator.
protected int strEnd;
// The four offsets below are computed lazily by split().
// [tagStart, tagEnd) is the tag name within str.
int tagStart;
int tagEnd;
// [argsStart, argsEnd) is the name/value text following the tag name.
int argsStart;
int argsEnd;
/**
 * Creates a parser that iterates over the tokens found in the given
 * markup string. Initialization is delegated to {@link #replace}.
 *
 * @param str
 *        The markup text to parse.
 */
public LexML(String str) {
    this.replace(str);
}
/**
* Advances to the next token. The user can then call the other methods
* in this class to get information about the new current token.
*
* @return true
if a token was found, false
* if there were no more tokens left.
*/
public boolean
nextToken()
{
if (tokenEnd >= strEnd) {
done=true;
singleton = false;
type=STRING;
return false;
}
tokenStart = tokenEnd;
if (str.startsWith("", tokenStart + 4);
} catch (StringIndexOutOfBoundsException e) {
tokenEnd = -1;
}
if (tokenEnd < 0) {
str += "-->";
tokenEnd = strEnd;
strEnd += 3;
}
tokenEnd += 3;
type = COMMENT;
} else if (str.charAt(tokenStart) == '<') {
tokenEnd = findClose(tokenStart);
if (tokenEnd < 0) {
str += ">";
strEnd++;
tokenEnd = strEnd;
done=true;
}
tokenEnd++;
type = TAG;
} else {
tokenEnd = str.indexOf('<', tokenStart);
if (tokenEnd < 0) {
tokenEnd = strEnd;
}
type = STRING;
}
return true;
}
/**
 * Locates the "&gt;" that closes the tag beginning at the given index.
 * Subclasses may override this method to implement more sophisticated
 * matching.
 *
 * @param start
 *        The index in {@code str} at which to begin looking for the
 *        closing "&gt;".
 * @return The index in {@code str} of the closing "&gt;", or a negative
 *         value if there is none.
 */
protected int findClose(int start) {
    int closeIndex = str.indexOf('>', start);
    return closeIndex;
}
/**
 * Reports what kind of token the parser is currently positioned on.
 *
 * @return One of {@link #COMMENT}, {@link #TAG} or {@link #STRING}.
 *
 * @see #COMMENT
 * @see #TAG
 * @see #STRING
 */
public int getType() {
    return this.type;
}
/**
 * Tells whether the current tag is a "singleton", i.e. its closing
 * "&gt;" is immediately preceded by a slash (for example
 * {@code <br/>}).
 *
 * For tag tokens the flag is (re)computed by splitting the tag; for
 * other token types the currently stored flag is returned as is.
 *
 * @return The singleton flag.
 */
public boolean isSingleton() {
    if (type != TAG) {
        return singleton;
    }
    // split() updates the singleton field as a side effect.
    split();
    return singleton;
}
/**
 * Returns the complete text of the current token, including its angle
 * brackets or comment delimiters when it has them.
 *
 * @return The current token, or {@code null} once the parser is done.
 */
public String getToken() {
    if (done) {
        return null;
    }
    return str.substring(tokenStart, tokenEnd);
}
/**
 * Returns the text of the current token with its delimiters stripped:
 * the angle brackets for a tag, the 4-character opener and 3-character
 * terminator for a comment. String tokens are returned unchanged.
 *
 * @return The body of the token, or {@code null} once the parser is done.
 */
public String getBody() {
    if (done) {
        return null;
    }
    int from = tokenStart;
    int to = tokenEnd;
    switch (type) {
    case TAG:
        from += 1;
        to -= 1;
        break;
    case COMMENT:
        from += 4;
        to -= 3;
        break;
    default:
        // plain string: no delimiters to remove
        break;
    }
    return str.substring(from, to);
}
/**
 * Returns the whole string this parser is working on. This may differ
 * from the string originally supplied, because a missing terminator is
 * appended when the last tag or comment is unterminated.
 *
 * @return The string being parsed.
 */
public String getString() {
    return this.str;
}
/**
 * Returns the current processing location.
 *
 * @return The character index at which the current token starts.
 */
public int getLocation() {
    return this.tokenStart;
}
/*
 * Computes tagStart/tagEnd, argsStart/argsEnd and the singleton flag for
 * the current tag or comment. The work is skipped when tagStart already
 * lies beyond tokenStart, which means it was done for this token.
 */
private void split() {
    if (tagStart > tokenStart) {
        return;
    }
    // A tag ends with ">" (1 char); anything else is treated as ending
    // with a 3-character terminator.
    int trailerLength = (type == TAG) ? 1 : 3;
    int bodyStart = tokenStart + 1;
    int bodyEnd = tokenEnd - trailerLength;

    tagStart = skip(SPACE, str, bodyStart, bodyEnd);
    tagEnd = next(SPACE, str, tagStart, bodyEnd);
    argsStart = skip(SPACE, str, tagEnd, bodyEnd);
    argsEnd = bodyEnd;

    singleton = false;
    if (str.charAt(argsEnd - 1) != '/') {
        return;
    }
    // Trailing slash: exclude it and keep the other offsets inside the
    // shortened range.
    singleton = true;
    argsEnd--;
    argsStart = Math.min(argsStart, argsEnd);
    tagEnd = Math.min(tagEnd, argsEnd);
}
/**
 * Returns the tag name at the start of the current tag. For
 * {@code <table border=3>} the tag name is "table". Surrounding
 * whitespace is dropped and the case of the name is preserved.
 *
 * For comments the "tag" is the first word of the comment, including
 * the leading "!--". For {@code <!--#include virtual="file.inc">} it
 * would be "!--#include".
 *
 * @return The tag name, or {@code null} if the current token
 *         is a string.
 */
public String getTag() {
    if (type != STRING) {
        split();
        return str.substring(tagStart, tagEnd);
    }
    return null;
}
/**
 * Returns, as one unparsed string, the name/value text that follows
 * the tag name in the current tag.
 *
 * @return The name/value pairs, or {@code null} if the current token
 *         is a string.
 */
public String getArgs() {
    if (type != STRING) {
        split();
        return str.substring(argsStart, argsEnd);
    }
    return null;
}
/**
 * Gets the name/value pairs in the body of the current tag as a
 * table.
 *
 * Quote marks around a value, single or double, are left on the
 * value. A name with no associated value (for example
 * {@code <table border>} or {@code <table border=>}) is stored with
 * the empty string "" as its value.
 *
 * An unbalanced quote is accepted: the value then extends to the end
 * of the tag's arguments, never beyond them.
 *
 * @return The table of name/value pairs, or {@code null} if
 *         the current token was a string.
 */
public StringMap
getAttributes() {
    if (type == STRING) {
        return null;
    }
    StringMap map = new StringMap();
    split();
    int off = argsStart;
    int end = argsEnd;
    String token = str;
    while (off < end) {
        int nameStart = off;
        // Start at off + 1 so that every iteration consumes at least one
        // character, even if the name begins with '='.
        int nameEnd = next(SPACE_EQUAL, token, off + 1, end);
        String name = token.substring(nameStart, nameEnd);
        off = skip(SPACE, token, nameEnd, end);
        if ((off < end) && (token.charAt(off) == '=')) {
            off = skip(SPACE, token, off + 1, end);
            if (off < end) {
                char ch = token.charAt(off);
                int valueStart = off;
                int valueEnd;
                if ((ch == '"') || (ch == '\'')) {
                    // token is the whole document, so a closing quote
                    // found at or beyond end belongs to some later
                    // token and must be ignored.
                    int close = (off + 1 < end)
                            ? token.indexOf(ch, off + 1) : -1;
                    if ((close < 0) || (close >= end)) {
                        off = end;          // unbalanced: take the rest
                    } else {
                        off = close + 1;    // keep the closing quote
                    }
                    valueEnd = off;
                } else {
                    off = next(SPACE, token, off, end);
                    valueEnd = off;
                }
                map.add(name, token.substring(valueStart, valueEnd));
                off = skip(SPACE, token, off, end);
                continue;
            }
        }
        // Name without a value.
        map.add(name, "");
    }
    return map;
}
/**
* Gets the rest of the string that has not yet been parsed.
*