// Jericho HTML Parser - Java based library for analysing and manipulating HTML // Version 3.2 // Copyright (C) 2004-2009 Martin Jericho // http://jericho.htmlparser.net/ // // This library is free software; you can redistribute it and/or // modify it under the terms of either one of the following licences: // // 1. The Eclipse Public License (EPL) version 1.0, // included in this distribution in the file licence-epl-1.0.html // or available at http://www.eclipse.org/legal/epl-v10.html // // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later, // included in this distribution in the file licence-lgpl-2.1.txt // or available at http://www.gnu.org/licenses/lgpl.txt // // This library is distributed on an "AS IS" basis, // WITHOUT WARRANTY OF ANY KIND, either express or implied. // See the individual licence texts for more details. package net.htmlparser.jericho; import java.io.Writer; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; /** * Represents a segment of a {@link Source} document. *

* Many of the tag search methods are defined in this class. *

* The span of a segment is defined by the combination of its begin and end character positions. */ public class Segment implements Comparable, CharSequence { final int begin; final int end; final Source source; private static final char[] WHITESPACE = { ' ', '\n', '\r', '\t', '\f', '\u200B' }; // see comments in isWhiteSpace(char) method /** * Constructs a new Segment within the specified {@linkplain Source source} document with the specified begin and end character positions. * * @param source * the {@link Source} document, must not be null. * @param begin * the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive. * @param end * the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive. */ public Segment(final Source source, final int begin, final int end) { if (begin == -1 || end == -1 || begin > end) throw new IllegalArgumentException(); this.begin = begin; this.end = end; if (source == null) throw new IllegalArgumentException("source argument must not be null"); this.source = source; } // Only called from Source constructor Segment(final int length) { begin = 0; this.end = length; source = (Source) this; } // Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED) Segment() { this(0, 0); } // Only used for creating dummy flag instances of this type (see Segment() constructor and StreamedSource.START_SEGMENT) Segment(final int begin, final int end) { this.begin = begin; this.end = end; source = null; } /** * Returns the {@link Source} document containing this segment. *

* If a {@link StreamedSource} is in use, this method throws an UnsupportedOperationException. * * @return the {@link Source} document containing this segment. */ public final Source getSource() { if (source.isStreamed()) throw new UnsupportedOperationException("Source object is not available when using StreamedSource"); return source; } /** * Returns the character position in the {@link Source} document at which this segment begins, inclusive. *

* Use the {@link Source#getRowColumnVector(int pos)} method to determine the row and column numbers corresponding to this character position. * * @return the character position in the {@link Source} document at which this segment begins, inclusive. */ public final int getBegin() { return begin; } /** * Returns the character position in the {@link Source} document immediately after the end of this segment. *

* The character at the position specified by this property is not included in the segment. * * @return the character position in the {@link Source} document immediately after the end of this segment. * @see #getBegin() */ public final int getEnd() { return end; } /** * Compares the specified object with this Segment for equality. *

* Returns true if and only if the specified object is also a Segment, and both segments have the same {@link Source}, and the same begin and end positions. * * @param object * the object to be compared for equality with this Segment. * @return true if the specified object is equal to this Segment, otherwise false. */ public final boolean equals(final Object object) { if (this == object) return true; if (object == null || !(object instanceof Segment)) return false; final Segment segment = (Segment) object; return segment.begin == begin && segment.end == end && segment.source == source; } /** * Returns a hash code value for the segment. *

* The current implementation returns the sum of the begin and end positions, although this is not guaranteed in future versions. * * @return a hash code value for the segment. */ public int hashCode() { return begin + end; } /** * Returns the length of the segment. * This is defined as the number of characters between the begin and end positions. * * @return the length of the segment. */ public int length() { return end - begin; } /** * Indicates whether this Segment encloses the specified Segment. *

* This is the case if {@link #getBegin()}<=segment.{@link #getBegin()} && {@link #getEnd()}>=segment.{@link #getEnd()}. *

* Note that a segment encloses itself. * * @param segment * the segment to be tested for being enclosed by this segment. * @return true if this Segment encloses the specified Segment, otherwise false. */ public final boolean encloses(final Segment segment) { return begin <= segment.begin && end >= segment.end; } /** * Indicates whether this segment encloses the specified character position in the source document. *

* This is the case if {@link #getBegin()} <= pos < {@link #getEnd()}. * * @param pos * the position in the {@link Source} document. * @return true if this segment encloses the specified character position in the source document, otherwise false. */ public final boolean encloses(final int pos) { return begin <= pos && pos < end; } /** * Returns the source text of this segment as a String. *

* The returned String is newly created with every call to this method, unless this segment is itself an instance of {@link Source}. * * @return the source text of this segment as a String. */ public String toString() { return source.subSequence(begin, end).toString(); } /** * Performs a simple rendering of the HTML markup in this segment into text. *

* The output can be configured by setting any number of properties on the returned {@link Renderer} instance before {@linkplain Renderer#writeTo(Writer) obtaining its output}. * * @return an instance of {@link Renderer} based on this segment. * @see #getTextExtractor() */ public Renderer getRenderer() { return new Renderer(this); } /** * Extracts the textual content from the HTML markup of this segment. *

* The output can be configured by setting properties on the returned {@link TextExtractor} instance before {@linkplain TextExtractor#writeTo(Writer) obtaining its output}. *

* * @return an instance of {@link TextExtractor} based on this segment. * @see #getRenderer() */ public TextExtractor getTextExtractor() { return new TextExtractor(this); } /** * Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and plain text segment contained within this segment. *

* See the {@link Source#iterator()} method for a detailed description. *

*

*
Example:
*
*

* The following code demonstrates the typical usage of this method to make an exact copy of this segment to writer (assuming no server tags are present): *

* *
	 * for (Iterator<Segment> nodeIterator = segment.getNoteIterator(); nodeIterator.hasNext();)
	 * {
	 * 	Segment nodeSegment = nodeIterator.next();
	 * 	if (nodeSegment instanceof Tag)
	 * 	{
	 * 		Tag tag = (Tag) nodeSegment;
	 * 		// HANDLE TAG
	 * 		// Uncomment the following line to ensure each tag is valid XML:
	 * 		// writer.write(tag.tidy()); continue;
	 * 	}
	 * 	else if (nodeSegment instanceof CharacterReference)
	 * 	{
	 * 		CharacterReference characterReference = (CharacterReference) nodeSegment;
	 * 		// HANDLE CHARACTER REFERENCE
	 * 		// Uncomment the following line to decode all character references instead of copying them verbatim:
	 * 		// characterReference.appendCharTo(writer); continue;
	 * 	}
	 * 	else
	 * 	{
	 * 		// HANDLE PLAIN TEXT
	 * 	}
	 * 	// unless specific handling has prevented getting to here, simply output the segment as is:
	 * 	writer.write(nodeSegment.toString());
	 * }
	 * 
* *
*
* * @return an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and plain text segment contained within this segment. */ public Iterator getNodeIterator() { return new NodeIterator(this); } /** * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself. *

* See the {@link Tag} class documentation for more details about the behaviour of this method. * * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. */ public List getAllTags() { return getAllTags(null); } /** * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* See the {@link Tag} class documentation for more details about the behaviour of this method. *

* Specifying a null argument to the tagType parameter is equivalent to {@link #getAllTags()}. * * @param tagType * the {@linkplain TagType type} of tags to get. * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment. * @see #getAllStartTags(StartTagType) */ public List getAllTags(final TagType tagType) { Tag tag = checkTagEnclosure(Tag.getNextTag(source, begin, tagType)); if (tag == null) return Collections.emptyList(); final ArrayList list = new ArrayList(); do { list.add(tag); tag = checkTagEnclosure(tag.getNextTag(tagType)); } while (tag != null); return list; } /** * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself. *

* See the {@link Tag} class documentation for more details about the behaviour of this method. * * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. */ public List getAllStartTags() { StartTag startTag = checkEnclosure(StartTag.getNext(source, begin)); if (startTag == null) return Collections.emptyList(); final ArrayList list = new ArrayList(); do { list.add(startTag); startTag = checkEnclosure(startTag.getNextStartTag()); } while (startTag != null); return list; } /** * Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* See the {@link Tag} class documentation for more details about the behaviour of this method. *

* Specifying a null argument to the startTagType parameter is equivalent to {@link #getAllStartTags()}. * * @param startTagType * the {@linkplain StartTagType type} of tags to get. * @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment. */ public List getAllStartTags(final StartTagType startTagType) { if (startTagType == null) return getAllStartTags(); StartTag startTag = (StartTag) checkTagEnclosure(Tag.getNextTag(source, begin, startTagType)); if (startTag == null) return Collections.emptyList(); final ArrayList list = new ArrayList(); do { list.add(startTag); startTag = (StartTag) checkTagEnclosure(startTag.getNextTag(startTagType)); } while (startTag != null); return list; } /** * Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* See the {@link Tag} class documentation for more details about the behaviour of this method. *

* Specifying a null argument to the name parameter is equivalent to {@link #getAllStartTags()}, which may include non-{@linkplain StartTagType#NORMAL normal} start tags. *

* This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}. * * @param name * the {@linkplain StartTag#getName() name} of the start tags to get. * @return a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment. */ public List getAllStartTags(String name) { if (name == null) return getAllStartTags(); final boolean isXMLTagName = Tag.isXMLName(name); StartTag startTag = checkEnclosure(StartTag.getNext(source, begin, name, StartTagType.NORMAL, isXMLTagName)); if (startTag == null) return Collections.emptyList(); final ArrayList list = new ArrayList(); do { list.add(startTag); startTag = checkEnclosure(StartTag.getNext(source, startTag.begin + 1, name, StartTagType.NORMAL, isXMLTagName)); } while (startTag != null); return list; } /** * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* See the {@link Tag} class documentation for more details about the behaviour of this method. * * @param attributeName * the attribute name (case insensitive) to search for, must not be null. * @param value * the value of the specified attribute to search for, must not be null. * @param valueCaseSensitive * specifies whether the attribute value matching is case sensitive. * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment. * @see #getAllStartTags(String attributeName, Pattern valueRegexPattern) */ public List getAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive) { StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive)); if (startTag == null) return Collections.emptyList(); final ArrayList list = new ArrayList(); do { list.add(startTag); startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, value, valueCaseSensitive)); } while (startTag != null); return list; } /** * Returns a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* Specifying a null argument to the valueRegexPattern parameter performs the search on the attribute name only, without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all. *

* See the {@link Tag} class documentation for more details about the behaviour of this method. * * @param attributeName * the attribute name (case insensitive) to search for, must not be null. * @param valueRegexPattern * the regular expression pattern that must match the attribute value, may be null. * @return a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment. * @see #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive) */ public List getAllStartTags(final String attributeName, final Pattern valueRegexPattern) { StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern)); if (startTag == null) return Collections.emptyList(); final ArrayList list = new ArrayList(); do { list.add(startTag); startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, valueRegexPattern)); } while (startTag != null); return list; } /** * Returns a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* This matches start tags with a class attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple class names separated by white space in the attribute value. *

* See the {@link Tag} class documentation for more details about the behaviour of this method. * * @param className * the class name (case sensitive) to search for, must not be null. * @return a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment. */ public List getAllStartTagsByClass(final String className) { return getAllStartTags("class", getClassPattern(className)); } /** * Returns a list of the immediate children of this segment in the document element hierarchy. *

* The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment. *

* An element found at the start of this segment is included in the list. Note however that if this segment is an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead, which only returns the children of the element. *

* Calling getChildElements() on an Element is much more efficient than calling it on a Segment. *

* The objects in the list are all of type {@link Element}. *

* The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself. *

* See the {@link Source#getChildElements()} method for more details. * * @return the a list of the immediate children of this segment in the document element hierarchy, guaranteed not null. * @see Element#getParentElement() */ public List getChildElements() { if (length() == 0) return Collections.emptyList(); List childElements = new ArrayList(); int pos = begin; while (true) { final StartTag childStartTag = source.getNextStartTag(pos); if (childStartTag == null || childStartTag.begin >= end) break; if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag()) { pos = childStartTag.end; continue; } final Element childElement = childStartTag.getElement(); childElements.add(childElement); childElement.getChildElements(); pos = childElement.end; } return childElements; } /** * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself. *

* The elements returned correspond exactly with the start tags returned in the {@link #getAllStartTags()} method. *

* If this segment is itself an {@link Element}, the result includes this element in the list. * * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. */ public List getAllElements() { return getAllElements(getAllStartTags()); } /** * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String name)} method, except that elements which are not entirely enclosed by this segment are excluded. *

* Specifying a null argument to the name parameter is equivalent to {@link #getAllElements()}, which may include elements of non-{@linkplain StartTagType#NORMAL normal} tags. *

* This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}. *

* If this segment is itself an {@link Element} with the specified name, the result includes this element in the list. * * @param name * the {@linkplain Element#getName() name} of the elements to get. * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment. */ public List getAllElements(String name) { return getAllElements(getAllStartTags(name)); } /** * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* The elements returned correspond with the start tags returned in the {@link #getAllTags(TagType)} method, except that elements which are not entirely enclosed by this segment are excluded. *

* If this segment is itself an {@link Element} with the specified type, the result includes this element in the list. * * @param startTagType * the {@linkplain StartTagType type} of start tags to get, must not be null. * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment. */ public List getAllElements(final StartTagType startTagType) { if (startTagType == null) throw new IllegalArgumentException("startTagType argument must not be null"); return getAllElements(getAllStartTags(startTagType)); } /** * Returns a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method, except that elements which are not entirely enclosed by this segment are excluded. *

* If this segment is itself an {@link Element} with the specified name/value pair, the result includes this element in the list. * * @param attributeName * the attribute name (case insensitive) to search for, must not be null. * @param value * the value of the specified attribute to search for, must not be null. * @param valueCaseSensitive * specifies whether the attribute value matching is case sensitive. * @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment. * @see #getAllElements(String attributeName, Pattern valueRegexPattern) */ public List getAllElements(final String attributeName, final String value, final boolean valueCaseSensitive) { return getAllElements(getAllStartTags(attributeName, value, valueCaseSensitive)); } /** * Returns a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, Pattern valueRegexPattern)} method, except that elements which are not entirely enclosed by this segment are excluded. *

* Specifying a null argument to the valueRegexPattern parameter performs the search on the attribute name only, without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all. *

* If this segment is itself an {@link Element} with the specified attribute name and value pattern, the result includes this element in the list. * * @param attributeName * the attribute name (case insensitive) to search for, must not be null. * @param valueRegexPattern * the regular expression pattern that must match the attribute value, may be null. * @return a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment. * @see #getAllElements(String attributeName, String value, boolean valueCaseSensitive) */ public List getAllElements(final String attributeName, final Pattern valueRegexPattern) { return getAllElements(getAllStartTags(attributeName, valueRegexPattern)); } /** * Returns a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* This matches elements with a class attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple class names separated by white space in the attribute value. *

* The elements returned correspond with the start tags returned in the {@link #getAllStartTagsByClass(String className)} method, except that elements which are not entirely enclosed by this segment are excluded. *

* If this segment is itself an {@link Element} with the specified class, the result includes this element in the list. * * @param className * the class name (case sensitive) to search for, must not be null. * @return a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment. */ public List getAllElementsByClass(final String className) { return getAllElements(getAllStartTagsByClass(className)); } /** * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. * * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. */ public List getAllCharacterReferences() { CharacterReference characterReference = getNextCharacterReference(begin); if (characterReference == null) return Collections.emptyList(); final ArrayList list = new ArrayList(); do { list.add(characterReference); characterReference = getNextCharacterReference(characterReference.end); } while (characterReference != null); return list; } /** * Returns a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have URI values. *

* According to the HTML 4.01 specification, the following attributes have URI values: * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
HTML element name * Attribute name *
{@link HTMLElementName#A A} * href *
{@link HTMLElementName#APPLET APPLET} * codebase *
{@link HTMLElementName#AREA AREA} * href *
{@link HTMLElementName#BASE BASE} * href *
{@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE} * cite *
{@link HTMLElementName#BODY BODY} * background *
{@link HTMLElementName#FORM FORM} * action *
{@link HTMLElementName#FRAME FRAME} * longdesc *
{@link HTMLElementName#FRAME FRAME} * src *
{@link HTMLElementName#DEL DEL} * cite *
{@link HTMLElementName#HEAD HEAD} * profile *
{@link HTMLElementName#IFRAME IFRAME} * longdesc *
{@link HTMLElementName#IFRAME IFRAME} * src *
{@link HTMLElementName#IMG IMG} * longdesc *
{@link HTMLElementName#IMG IMG} * src *
{@link HTMLElementName#IMG IMG} * usemap *
{@link HTMLElementName#INPUT INPUT} * src *
{@link HTMLElementName#INPUT INPUT} * usemap *
{@link HTMLElementName#INS INS} * cite *
{@link HTMLElementName#LINK LINK} * href *
{@link HTMLElementName#OBJECT OBJECT} * classid *
{@link HTMLElementName#OBJECT OBJECT} * codebase *
{@link HTMLElementName#OBJECT OBJECT} * data *
{@link HTMLElementName#OBJECT OBJECT} * usemap *
{@link HTMLElementName#Q Q} * cite *
{@link HTMLElementName#SCRIPT SCRIPT} * src *
*

* Attributes from other elements may also be returned if the attribute name matches one of those in the list above. *

* This method is often used in conjunction with the {@link #getStyleURISegments()} method in order to find all URIs in a document. *

* The attributes are returned in order of appearance. * * @return a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have URI values. * @see #getStyleURISegments() */ public List getURIAttributes() { return URIAttributes.getList(this); } /** * Returns a list of all URI {@linkplain Segment segments} inside style attribute values {@linkplain #encloses(Segment) enclosed} by this segment. *

* If this segment does not contain any tags, the entire segment is assumed to be a style attribute value. *

* The URI segments are found by searching the style attribute values for the functional notation "url()" as described in section 4.3.4 of the CSS2 specification. *

* The segments are returned in order of appearance. * * @return a list of all URI {@linkplain Segment segments} inside style attribute values {@linkplain #encloses(Segment) enclosed} by this segment. * @see #getURIAttributes() */ public List getStyleURISegments() { return URIAttributes.getStyleURISegments(this); } /** * Returns the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment. *

* This is functionally equivalent to {@link #getAllStartTags()}.iterator().next(), but does not search beyond the first start tag and returns null if no such start tag exists. * * @return the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists. */ public final StartTag getFirstStartTag() { return checkEnclosure(source.getNextStartTag(begin)); } /** * Returns the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment. *

* This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}.iterator().next(), but does not search beyond the first start tag and returns null if no such start tag exists. * * @param startTagType * the StartTagType to search for. * @return the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists. */ public final StartTag getFirstStartTag(StartTagType startTagType) { return checkEnclosure(source.getNextStartTag(begin, startTagType)); } /** * Returns the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment. *

* This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}.iterator().next(), but does not search beyond the first start tag and returns null if no such start tag exists. *

* Specifying a null argument to the name parameter is equivalent to {@link #getFirstStartTag()}. * * @param name * the {@linkplain StartTag#getName() name} of the start tag to search for, may be null. * @return the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists. */ public final StartTag getFirstStartTag(String name) { return checkEnclosure(source.getNextStartTag(begin, name)); } /** * Returns the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment. *

* This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}.iterator().next(), but does not search beyond the first start tag and returns null if no such start tag exists. * * @param attributeName * the attribute name (case insensitive) to search for, must not be null. * @param value * the value of the specified attribute to search for, must not be null. * @param valueCaseSensitive * specifies whether the attribute value matching is case sensitive. * @return the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists. * @see #getFirstStartTag(String attributeName, Pattern valueRegexPattern) */ public final StartTag getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive) { return checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive)); } /** * Returns the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment. *

* This is functionally equivalent to {@link #getAllStartTags(String,Pattern) getAllStartTags(attributeName,valueRegexPattern)}.iterator().next(), but does not search beyond the first start tag and returns null if no such start tag exists. * * @param attributeName * the attribute name (case insensitive) to search for, must not be null. * @param valueRegexPattern * the regular expression pattern that must match the attribute value, may be null. * @return the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists. * @see #getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive) */ public final StartTag getFirstStartTag(final String attributeName, final Pattern valueRegexPattern) { return checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern)); } /** * Returns the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment. *

* This is functionally equivalent to {@link #getAllStartTagsByClass(String) getAllStartTagsByClass(className)}.iterator().next(), but does not search beyond the first start tag and returns null if no such start tag exists. * * @param className * the class name (case sensitive) to search for, must not be null. * @return the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists. */ public final StartTag getFirstStartTagByClass(final String className) { return checkEnclosure(source.getNextStartTagByClass(begin, className)); } /** * Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment. *

* This is functionally equivalent to {@link #getAllElements()}.iterator().next(), but does not search beyond the first enclosed element and returns null if no such element exists. *

* If this segment is itself an {@link Element}, this element is returned, not the first child element. * * @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists. */ public final Element getFirstElement() { StartTag startTag = checkEnclosure(StartTag.getNext(source, begin)); while (startTag != null) { final Element element = startTag.getElement(); if (element.end <= end) return element; startTag = checkEnclosure(startTag.getNextStartTag()); } return null; } /** * Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment. *

* This is functionally equivalent to {@link #getAllElements(String) getAllElements(name)}.iterator().next(), but does not search beyond the first enclosed element and returns null if no such element exists. *

* Specifying a null argument to the name parameter is equivalent to {@link #getFirstElement()}. *

* If this segment is itself an {@link Element} with the specified name, this element is returned. * * @param name * the {@linkplain Element#getName() name} of the element to search for. * @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists. */ public final Element getFirstElement(String name) { if (name == null) return getFirstElement(); final boolean isXMLTagName = Tag.isXMLName(name); StartTag startTag = checkEnclosure(StartTag.getNext(source, begin, name, StartTagType.NORMAL, isXMLTagName)); while (startTag != null) { final Element element = startTag.getElement(); if (element.end <= end) return element; startTag = checkEnclosure(StartTag.getNext(source, startTag.begin + 1, name, StartTagType.NORMAL, isXMLTagName)); } return null; } /** * Returns the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment. *

* This is functionally equivalent to {@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}.iterator().next(), but does not search beyond the first enclosed element and returns null if no such element exists. *

* If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned. * * @param attributeName * the attribute name (case insensitive) to search for, must not be null. * @param value * the value of the specified attribute to search for, must not be null. * @param valueCaseSensitive * specifies whether the attribute value matching is case sensitive. * @return the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists. * @see #getFirstElement(String attributeName, Pattern valueRegexPattern) */ public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive) { StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive)); while (startTag != null) { final Element element = startTag.getElement(); if (element.end <= end) return element; startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, value, valueCaseSensitive)); } return null; } /** * Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment. *

* This is functionally equivalent to {@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}.iterator().next(), but does not search beyond the first enclosed element and returns null if no such element exists. *

* If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned. * * @param attributeName * the attribute name (case insensitive) to search for, must not be null. * @param valueRegexPattern * the regular expression pattern that must match the attribute value, may be null. * @return the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists. * @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive) */ public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern) { StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern)); while (startTag != null) { final Element element = startTag.getElement(); if (element.end <= end) return element; startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, valueRegexPattern)); } return null; } /** * Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment. *

* This is functionally equivalent to {@link #getAllElementsByClass(String) getAllElementsByClass(className)}.iterator().next(), but does not search beyond the first enclosed element and returns null if no such element exists. *

* If this segment is itself an {@link Element} with the specified class, this element is returned. * * @param className * the class name (case sensitive) to search for, must not be null. * @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or null if none exists. */ public final Element getFirstElementByClass(final String className) { StartTag startTag = checkEnclosure(source.getNextStartTagByClass(begin, className)); while (startTag != null) { final Element element = startTag.getElement(); if (element.end <= end) return element; startTag = checkEnclosure(source.getNextStartTagByClass(startTag.begin + 1, className)); } return null; } /** * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. * * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. */ public List getFormControls() { return FormControl.getAll(this); } /** * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment. *

* This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}({@link #getFormControls()}). * * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment. * @see #getFormControls() */ public FormFields getFormFields() { return new FormFields(getFormControls()); } /** * Parses any {@link Attributes} within this segment. * This method is only used in the unusual situation where attributes exist outside of a start tag. * The {@link StartTag#getAttributes()} method should be used in normal situations. *

* This is equivalent to source.{@link Source#parseAttributes(int,int) parseAttributes}({@link #getBegin()},{@link #getEnd()}). * * @return the {@link Attributes} within this segment, or null if too many errors occur while parsing. */ public Attributes parseAttributes() { return source.parseAttributes(begin, end); } /** * Causes the this segment to be ignored when parsing. *

* Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions. *

* This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags (such as where an {@linkplain Attribute attribute} of a normal tag uses a server tag to dynamically set its value), as well as preventing non-server tags from being recognised inside server tags. *

* It is not necessary to use this method to ignore {@linkplain TagType#isServerTag() server tags} located inside normal tags, as the attributes parser automatically ignores any server tags. *

* It is not necessary to use this method to ignore non-server tags inside server tags, or the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements, as the parser does this automatically when performing a {@linkplain Source#fullSequentialParse() full sequential parse}. *

* This leaves only very few scenarios where calling this method still provides a significant benefit. *

* One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags. Here is an example using an XML-style JSP tag:

<a href="<i18n:resource path="/Portal"/>?BACK=TRUE">back</a>
The first double-quote of "/Portal" will be interpreted as the end quote for the href attribute, as there is no way for the parser to recognise the il8n:resource element * as a server tag. Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice, but it is nevertheless valid JSP. The only way to ensure that this library is able to parse the normal tag surrounding it is to find these server tags first and call the ignoreWhenParsing method to ignore them before parsing the rest of the document. *

* It is important to understand the difference between ignoring the segment when parsing and removing the segment completely. Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of tools such as {@link TextExtractor} and {@link Renderer}. *

* To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment. Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())} and perform the desired operations on this new source object. *

* Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an IllegalStateException. *

* Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache}, and so will continue to be found by the tag search methods. If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache. Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically. *

* For best performance, this method should be called on all segments that need to be ignored without calling any of the tag search methods in between. * * @see Source#ignoreWhenParsing(Collection segments) */ public void ignoreWhenParsing() { source.ignoreWhenParsing(begin, end); } /** * Compares this Segment object to another object. *

* If the argument is not a Segment, a ClassCastException is thrown. *

* A segment is considered to be before another segment if its begin position is earlier, or in the case that both segments begin at the same position, its end position is earlier. *

* Segments that begin and end at the same position are considered equal for the purposes of this comparison, even if they relate to different source documents. *

* Note: this class has a natural ordering that is inconsistent with equals. This means that this method may return zero in some cases where calling the {@link #equals(Object)} method with the same argument returns false. * * @param segment * the segment to be compared * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment. * @throws ClassCastException * if the argument is not a Segment */ public int compareTo(final Segment segment) { if (this == segment) return 0; if (begin < segment.begin) return -1; if (begin > segment.begin) return 1; if (end < segment.end) return -1; if (end > segment.end) return 1; return 0; } /** * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}. * * @return true if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise false. */ public final boolean isWhiteSpace() { for (int i = begin; i < end; i++) if (!isWhiteSpace(source.charAt(i))) return false; return true; } /** * Returns an indication of the maximum depth of nested elements within this segment. *

* A high return value can indicate that the segment contains a large number of incorrectly nested tags that could result in a StackOverflowException if its content is parsed. *

* The usefulness of this method is debatable as a StackOverflowException is a recoverable error that can be easily caught. The use of this method to pre-detect and avoid a stack overflow may save some memory and processing resources in certain circumstances, but the cost of calling this method to check every segment or document will very often exceed any benefit. *

* It is up to the application developer to determine what return value constitutes an unreasonable level of nesting given the stack space allocated to the application and other factors. *

* Note that the return value is an approximation only and is usually greater than the actual maximum element depth that would be reported by calling the {@link Element#getDepth()} method on the most nested element. * * @return an indication of the maximum depth of nested elements within this segment. */ public int getMaxDepthIndicator() { int maxDepth = 0; int depth = 0; for (Tag tag : getAllTags()) { if (tag instanceof StartTag) { StartTag startTag = (StartTag) tag; if (startTag.getStartTagType().getCorrespondingEndTagType() == null) continue; if (HTMLElements.getEndTagForbiddenElementNames().contains(startTag.getName())) continue; if (startTag.isEmptyElementTag()) continue; depth++; if (depth > maxDepth) maxDepth++; } else { depth--; } } return maxDepth; } /** * Indicates whether the specified character is white space. *

* The HTML 4.01 specification section 9.1 specifies the following white space characters: *

*

* Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not recognise them as white space and renders them as an unprintable character (empty square). Even zero-width spaces included using the numeric character reference &#x200B; are rendered this way. * * @param ch * the character to test. * @return true if the specified character is white space, otherwise false. */ public static final boolean isWhiteSpace(final char ch) { for (char whiteSpaceChar : WHITESPACE) if (ch == whiteSpaceChar) return true; return false; } /** * Returns a string representation of this object useful for debugging purposes. * * @return a string representation of this object useful for debugging purposes. */ public String getDebugInfo() { final StringBuilder sb = new StringBuilder(50); sb.append('('); source.getRowColumnVector(begin).appendTo(sb); sb.append('-'); source.getRowColumnVector(end).appendTo(sb); sb.append(')'); return sb.toString(); } /** * Returns the character at the specified index. *

* This is logically equivalent to toString().charAt(index) for valid argument values 0 <= index < length(). *

* However because this implementation works directly on the underlying document source string, it should not be assumed that an IndexOutOfBoundsException is thrown for an invalid argument value. * * @param index * the index of the character. * @return the character at the specified index. */ public char charAt(final int index) { return source.charAt(begin + index); } /** * Returns a new character sequence that is a subsequence of this sequence. *

* This is logically equivalent to toString().subSequence(beginIndex,endIndex) for valid values of beginIndex and endIndex. *

* However because this implementation works directly on the underlying document source text, it should not be assumed that an IndexOutOfBoundsException is thrown for invalid argument values as described in the String.subSequence(int,int) method. * * @param beginIndex * the begin index, inclusive. * @param endIndex * the end index, exclusive. * @return a new character sequence that is a subsequence of this sequence. */ public CharSequence subSequence(final int beginIndex, final int endIndex) { return source.subSequence(begin + beginIndex, begin + endIndex); } /** * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text. * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space. */ static final StringBuilder appendCollapseWhiteSpace(final StringBuilder sb, final CharSequence text) { final int textLength = text.length(); int i = 0; boolean lastWasWhiteSpace = false; while (true) { if (i >= textLength) return sb; if (!isWhiteSpace(text.charAt(i))) break; i++; } do { final char ch = text.charAt(i++); if (isWhiteSpace(ch)) { lastWasWhiteSpace = true; } else { if (lastWasWhiteSpace) { sb.append(' '); lastWasWhiteSpace = false; } sb.append(ch); } } while (i < textLength); return sb; } static final Pattern getClassPattern(final String className) { return Pattern.compile(".*(\\s|^)" + className + "(\\s|$).*", Pattern.DOTALL); } private List getAllElements(final List startTags) { if (startTags.isEmpty()) return Collections.emptyList(); final ArrayList elements = new ArrayList(startTags.size()); for (StartTag startTag : startTags) { final Element element = startTag.getElement(); if (element.end <= end) elements.add(element); } return elements; } private StartTag checkEnclosure(final StartTag startTag) { if (startTag == null || startTag.end > end) return null; return startTag; } private Tag checkTagEnclosure(final Tag tag) { if (tag == null || tag.end > end) return null; return tag; } private CharacterReference getNextCharacterReference(final int pos) { final CharacterReference characterReference = source.getNextCharacterReference(pos); if (characterReference == null || characterReference.end > end) return null; return characterReference; } }