… | |
… | |
18 | // WITHOUT WARRANTY OF ANY KIND, either express or implied.
|
18 | // WITHOUT WARRANTY OF ANY KIND, either express or implied.
|
19 | // See the individual licence texts for more details.
|
19 | // See the individual licence texts for more details.
|
20 |
|
20 |
|
21 | package net.htmlparser.jericho;
|
21 | package net.htmlparser.jericho;
|
22 |
|
22 |
|
|
|
23 | import java.io.Writer;
|
|
|
24 | import java.util.ArrayList;
|
|
|
25 | import java.util.Collection;
|
|
|
26 | import java.util.Collections;
|
23 | import java.util.Iterator;
|
27 | import java.util.Iterator;
|
24 | import java.util.List;
|
28 | import java.util.List;
|
25 | import java.util.Collections;
|
|
|
26 | import java.util.ArrayList;
|
|
|
27 | import java.util.regex.Pattern;
|
29 | import java.util.regex.Pattern;
|
28 |
|
30 |
|
29 | /**
|
31 | /**
|
30 | * Represents a segment of a {@link Source} document.
|
32 | * Represents a segment of a {@link Source} document.
|
31 | * <p>
|
33 | * <p>
|
32 | * Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class.
|
34 | * Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class.
|
33 | * <p>
|
35 | * <p>
|
34 | * The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
|
36 | * The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
|
35 | */
|
37 | */
|
36 | public class Segment implements Comparable<Segment>, CharSequence {
|
38 | public class Segment implements Comparable<Segment>, CharSequence
|
|
|
39 | {
|
37 | final int begin;
|
40 | final int begin;
|
38 | final int end;
|
41 | final int end;
|
39 | final Source source;
|
42 | final Source source;
|
40 |
|
43 |
|
41 | private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; // see comments in isWhiteSpace(char) method
|
44 | private static final char[] WHITESPACE = { ' ', '\n', '\r', '\t', '\f', '\u200B' }; // see comments in isWhiteSpace(char) method
|
42 |
|
45 |
|
43 | /**
|
46 | /**
|
44 | * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
|
47 | * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
|
|
|
48 | *
|
|
|
49 | * @param source
|
45 | * @param source the {@link Source} document, must not be <code>null</code>.
|
50 | * the {@link Source} document, must not be <code>null</code>.
|
|
|
51 | * @param begin
|
46 | * @param begin the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive.
|
52 | * the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive.
|
|
|
53 | * @param end
|
47 | * @param end the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive.
|
54 | * the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive.
|
48 | */
|
55 | */
|
49 | public Segment(final Source source, final int begin, final int end) {
|
56 | public Segment(final Source source, final int begin, final int end)
|
|
|
57 | {
|
50 | if (begin==-1 || end==-1 || begin>end) throw new IllegalArgumentException();
|
58 | if (begin == -1 || end == -1 || begin > end) throw new IllegalArgumentException();
|
51 | this.begin=begin;
|
59 | this.begin = begin;
|
52 | this.end=end;
|
60 | this.end = end;
|
53 | if (source==null) throw new IllegalArgumentException("source argument must not be null");
|
61 | if (source == null) throw new IllegalArgumentException("source argument must not be null");
|
54 | this.source=source;
|
62 | this.source = source;
|
55 | }
|
63 | }
|
56 |
|
64 |
|
57 | // Only called from Source constructor
|
65 | // Only called from Source constructor
|
58 | Segment(final int length) {
|
66 | Segment(final int length)
|
|
|
67 | {
|
59 | begin=0;
|
68 | begin = 0;
|
60 | this.end=length;
|
69 | this.end = length;
|
61 | source=(Source)this;
|
70 | source = (Source) this;
|
62 | }
|
71 | }
|
63 |
|
72 |
|
64 | // Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED)
|
73 | // Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED)
|
65 | Segment() {
|
74 | Segment()
|
|
|
75 | {
|
66 | this(0,0);
|
76 | this(0, 0);
|
67 | }
|
77 | }
|
68 |
|
78 |
|
69 | // Only used for creating dummy flag instances of this type (see Segment() constructor and StreamedSource.START_SEGMENT)
|
79 | // Only used for creating dummy flag instances of this type (see Segment() constructor and StreamedSource.START_SEGMENT)
|
70 | Segment(final int begin, final int end) {
|
80 | Segment(final int begin, final int end)
|
|
|
81 | {
|
71 | this.begin=begin;
|
82 | this.begin = begin;
|
72 | this.end=end;
|
83 | this.end = end;
|
73 | source=null;
|
84 | source = null;
|
74 | }
|
85 | }
|
75 |
|
86 |
|
76 | /**
|
87 | /**
|
77 | * Returns the {@link Source} document containing this segment.
|
88 | * Returns the {@link Source} document containing this segment.
|
78 | * <p>
|
89 | * <p>
|
79 | * If a {@link StreamedSource} is in use, this method throws an <code>UnsupportedOperationException</code>.
|
90 | * If a {@link StreamedSource} is in use, this method throws an <code>UnsupportedOperationException</code>.
|
80 | *
|
91 | *
|
81 | * @return the {@link Source} document containing this segment.
|
92 | * @return the {@link Source} document containing this segment.
|
82 | */
|
93 | */
|
83 | public final Source getSource() {
|
94 | public final Source getSource()
|
|
|
95 | {
|
84 | if (source.isStreamed()) throw new UnsupportedOperationException("Source object is not available when using StreamedSource");
|
96 | if (source.isStreamed()) throw new UnsupportedOperationException("Source object is not available when using StreamedSource");
|
85 | return source;
|
97 | return source;
|
86 | }
|
98 | }
|
87 |
|
99 |
|
88 | /**
|
100 | /**
|
89 | * Returns the character position in the {@link Source} document at which this segment begins, inclusive.
|
101 | * Returns the character position in the {@link Source} document at which this segment begins, inclusive.
|
90 | * <p>
|
102 | * <p>
|
91 | * Use the {@link Source#getRowColumnVector(int pos)} method to determine the row and column numbers corresponding to this character position.
|
103 | * Use the {@link Source#getRowColumnVector(int pos)} method to determine the row and column numbers corresponding to this character position.
|
92 | *
|
104 | *
|
93 | * @return the character position in the {@link Source} document at which this segment begins, inclusive.
|
105 | * @return the character position in the {@link Source} document at which this segment begins, inclusive.
|
94 | */
|
106 | */
|
95 | public final int getBegin() {
|
107 | public final int getBegin()
|
|
|
108 | {
|
96 | return begin;
|
109 | return begin;
|
97 | }
|
110 | }
|
98 |
|
111 |
|
99 | /**
|
112 | /**
|
100 | * Returns the character position in the {@link Source} document immediately after the end of this segment.
|
113 | * Returns the character position in the {@link Source} document immediately after the end of this segment.
|
101 | * <p>
|
114 | * <p>
|
102 | * The character at the position specified by this property is <b>not</b> included in the segment.
|
115 | * The character at the position specified by this property is <b>not</b> included in the segment.
|
103 | *
|
116 | *
|
104 | * @return the character position in the {@link Source} document immediately after the end of this segment.
|
117 | * @return the character position in the {@link Source} document immediately after the end of this segment.
|
105 | * @see #getBegin()
|
118 | * @see #getBegin()
|
106 | */
|
119 | */
|
107 | public final int getEnd() {
|
120 | public final int getEnd()
|
|
|
121 | {
|
108 | return end;
|
122 | return end;
|
109 | }
|
123 | }
|
110 |
|
124 |
|
111 | /**
|
125 | /**
|
112 | * Compares the specified object with this <code>Segment</code> for equality.
|
126 | * Compares the specified object with this <code>Segment</code> for equality.
|
113 | * <p>
|
127 | * <p>
|
114 | * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>,
|
128 | * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>, and both segments have the same {@link Source}, and the same begin and end positions.
|
115 | * and both segments have the same {@link Source}, and the same begin and end positions.
|
129 | *
|
|
|
130 | * @param object
|
116 | * @param object the object to be compared for equality with this <code>Segment</code>.
|
131 | * the object to be compared for equality with this <code>Segment</code>.
|
117 | * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
|
132 | * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
|
118 | */
|
133 | */
|
119 | public final boolean equals(final Object object) {
|
134 | public final boolean equals(final Object object)
|
|
|
135 | {
|
120 | if (this==object) return true;
|
136 | if (this == object) return true;
|
121 | if (object==null || !(object instanceof Segment)) return false;
|
137 | if (object == null || !(object instanceof Segment)) return false;
|
122 | final Segment segment=(Segment)object;
|
138 | final Segment segment = (Segment) object;
|
123 | return segment.begin==begin && segment.end==end && segment.source==source;
|
139 | return segment.begin == begin && segment.end == end && segment.source == source;
|
124 | }
|
140 | }
|
125 |
|
141 |
|
126 | /**
|
142 | /**
|
127 | * Returns a hash code value for the segment.
|
143 | * Returns a hash code value for the segment.
|
128 | * <p>
|
144 | * <p>
|
129 | * The current implementation returns the sum of the begin and end positions, although this is not
|
145 | * The current implementation returns the sum of the begin and end positions, although this is not guaranteed in future versions.
|
130 | * guaranteed in future versions.
|
|
|
131 | *
|
146 | *
|
132 | * @return a hash code value for the segment.
|
147 | * @return a hash code value for the segment.
|
133 | */
|
148 | */
|
134 | public int hashCode() {
|
149 | public int hashCode()
|
|
|
150 | {
|
135 | return begin+end;
|
151 | return begin + end;
|
136 | }
|
152 | }
|
137 |
|
153 |
|
138 | /**
|
154 | /**
|
139 | * Returns the length of the segment.
|
155 | * Returns the length of the segment.
|
140 | * This is defined as the number of characters between the begin and end positions.
|
156 | * This is defined as the number of characters between the begin and end positions.
|
|
|
157 | *
|
141 | * @return the length of the segment.
|
158 | * @return the length of the segment.
|
142 | */
|
159 | */
|
143 | public int length() {
|
160 | public int length()
|
|
|
161 | {
|
144 | return end-begin;
|
162 | return end - begin;
|
145 | }
|
163 | }
|
146 |
|
164 |
|
147 | /**
|
165 | /**
|
148 | * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
|
166 | * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
|
149 | * <p>
|
167 | * <p>
|
150 | * This is the case if {@link #getBegin()}<code><=segment.</code>{@link #getBegin()}<code> && </code>{@link #getEnd()}<code>>=segment.</code>{@link #getEnd()}.
|
168 | * This is the case if {@link #getBegin()}<code><=segment.</code>{@link #getBegin()}<code> && </code>{@link #getEnd()}<code>>=segment.</code>{@link #getEnd()}.
|
151 | * <p>
|
169 | * <p>
|
152 | * Note that a segment encloses itself.
|
170 | * Note that a segment encloses itself.
|
153 | *
|
171 | *
|
|
|
172 | * @param segment
|
154 | * @param segment the segment to be tested for being enclosed by this segment.
|
173 | * the segment to be tested for being enclosed by this segment.
|
155 | * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
|
174 | * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
|
156 | */
|
175 | */
|
157 | public final boolean encloses(final Segment segment) {
|
176 | public final boolean encloses(final Segment segment)
|
|
|
177 | {
|
158 | return begin<=segment.begin && end>=segment.end;
|
178 | return begin <= segment.begin && end >= segment.end;
|
159 | }
|
179 | }
|
160 |
|
180 |
|
161 | /**
|
181 | /**
|
162 | * Indicates whether this segment encloses the specified character position in the source document.
|
182 | * Indicates whether this segment encloses the specified character position in the source document.
|
163 | * <p>
|
183 | * <p>
|
164 | * This is the case if {@link #getBegin()}<code> <= pos < </code>{@link #getEnd()}.
|
184 | * This is the case if {@link #getBegin()}<code> <= pos < </code>{@link #getEnd()}.
|
165 | *
|
185 | *
|
|
|
186 | * @param pos
|
166 | * @param pos the position in the {@link Source} document.
|
187 | * the position in the {@link Source} document.
|
167 | * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
|
188 | * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
|
168 | */
|
189 | */
|
169 | public final boolean encloses(final int pos) {
|
190 | public final boolean encloses(final int pos)
|
|
|
191 | {
|
170 | return begin<=pos && pos<end;
|
192 | return begin <= pos && pos < end;
|
171 | }
|
193 | }
|
172 |
|
194 |
|
173 | /**
|
195 | /**
|
174 | * Returns the source text of this segment as a <code>String</code>.
|
196 | * Returns the source text of this segment as a <code>String</code>.
|
175 | * <p>
|
197 | * <p>
|
176 | * The returned <code>String</code> is newly created with every call to this method, unless this
|
198 | * The returned <code>String</code> is newly created with every call to this method, unless this segment is itself an instance of {@link Source}.
|
177 | * segment is itself an instance of {@link Source}.
|
|
|
178 | *
|
199 | *
|
179 | * @return the source text of this segment as a <code>String</code>.
|
200 | * @return the source text of this segment as a <code>String</code>.
|
180 | */
|
201 | */
|
181 | public String toString() {
|
202 | public String toString()
|
|
|
203 | {
|
182 | return source.subSequence(begin,end).toString();
|
204 | return source.subSequence(begin, end).toString();
|
183 | }
|
205 | }
|
184 |
|
206 |
|
185 | /**
|
207 | /**
|
186 | * Performs a simple rendering of the HTML markup in this segment into text.
|
208 | * Performs a simple rendering of the HTML markup in this segment into text.
|
187 | * <p>
|
209 | * <p>
|
188 | * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before
|
210 | * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before {@linkplain Renderer#writeTo(Writer) obtaining its output}.
|
189 | * {@linkplain Renderer#writeTo(Writer) obtaining its output}.
|
|
|
190 | *
|
211 | *
|
191 | * @return an instance of {@link Renderer} based on this segment.
|
212 | * @return an instance of {@link Renderer} based on this segment.
|
192 | * @see #getTextExtractor()
|
213 | * @see #getTextExtractor()
|
193 | */
|
214 | */
|
194 | public Renderer getRenderer() {
|
215 | public Renderer getRenderer()
|
|
|
216 | {
|
195 | return new Renderer(this);
|
217 | return new Renderer(this);
|
196 | }
|
218 | }
|
197 |
|
219 |
|
198 | /**
|
220 | /**
|
199 | * Extracts the textual content from the HTML markup of this segment.
|
221 | * Extracts the textual content from the HTML markup of this segment.
|
200 | * <p>
|
222 | * <p>
|
201 | * The output can be configured by setting properties on the returned {@link TextExtractor} instance before
|
223 | * The output can be configured by setting properties on the returned {@link TextExtractor} instance before {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
|
202 | * {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
|
224 | * <p>
|
203 | * <p>
|
225 | *
|
204 | * @return an instance of {@link TextExtractor} based on this segment.
|
226 | * @return an instance of {@link TextExtractor} based on this segment.
|
205 | * @see #getRenderer()
|
227 | * @see #getRenderer()
|
206 | */
|
228 | */
|
207 | public TextExtractor getTextExtractor() {
|
229 | public TextExtractor getTextExtractor()
|
|
|
230 | {
|
208 | return new TextExtractor(this);
|
231 | return new TextExtractor(this);
|
209 | }
|
232 | }
|
210 |
|
233 |
|
211 | /**
|
234 | /**
|
212 | * Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
|
235 | * Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
|
213 | * <p>
|
236 | * <p>
|
214 | * See the {@link Source#iterator()} method for a detailed description.
|
237 | * See the {@link Source#iterator()} method for a detailed description.
|
215 | * <p>
|
238 | * <p>
|
216 | * <dl>
|
239 | * <dl>
|
217 | * <dt>Example:</dt>
|
240 | * <dt>Example:</dt>
|
218 | * <dd>
|
241 | * <dd>
|
219 | * <p>
|
242 | * <p>
|
220 | * The following code demonstrates the typical usage of this method to make an exact copy of this segment to <code>writer</code> (assuming no server tags are present):
|
243 | * The following code demonstrates the typical usage of this method to make an exact copy of this segment to <code>writer</code> (assuming no server tags are present):
|
221 | * </p>
|
244 | * </p>
|
|
|
245 | *
|
222 | * <pre>
|
246 | * <pre>
|
223 | * for (Iterator<Segment> nodeIterator=segment.getNodeIterator(); nodeIterator.hasNext();) {
|
247 | * for (Iterator<Segment> nodeIterator = segment.getNodeIterator(); nodeIterator.hasNext();)
|
|
|
248 | * {
|
224 | * Segment nodeSegment=nodeIterator.next();
|
249 | * Segment nodeSegment = nodeIterator.next();
|
225 | * if (nodeSegment instanceof Tag) {
|
250 | * if (nodeSegment instanceof Tag)
|
|
|
251 | * {
|
226 | * Tag tag=(Tag)nodeSegment;
|
252 | * Tag tag = (Tag) nodeSegment;
|
227 | * // HANDLE TAG
|
253 | * // HANDLE TAG
|
228 | * // Uncomment the following line to ensure each tag is valid XML:
|
254 | * // Uncomment the following line to ensure each tag is valid XML:
|
229 | * // writer.write(tag.tidy()); continue;
|
255 | * // writer.write(tag.tidy()); continue;
|
|
|
256 | * }
|
230 | * } else if (nodeSegment instanceof CharacterReference) {
|
257 | * else if (nodeSegment instanceof CharacterReference)
|
|
|
258 | * {
|
231 | * CharacterReference characterReference=(CharacterReference)nodeSegment;
|
259 | * CharacterReference characterReference = (CharacterReference) nodeSegment;
|
232 | * // HANDLE CHARACTER REFERENCE
|
260 | * // HANDLE CHARACTER REFERENCE
|
233 | * // Uncomment the following line to decode all character references instead of copying them verbatim:
|
261 | * // Uncomment the following line to decode all character references instead of copying them verbatim:
|
234 | * // characterReference.appendCharTo(writer); continue;
|
262 | * // characterReference.appendCharTo(writer); continue;
|
235 | * } else {
|
263 | * }
|
|
|
264 | * else
|
|
|
265 | * {
|
236 | * // HANDLE PLAIN TEXT
|
266 | * // HANDLE PLAIN TEXT
|
237 | * }
|
267 | * }
|
238 | * // unless specific handling has prevented getting to here, simply output the segment as is:
|
268 | * // unless specific handling has prevented getting to here, simply output the segment as is:
|
239 | * writer.write(nodeSegment.toString());
|
269 | * writer.write(nodeSegment.toString());
|
|
|
270 | * }
|
240 | * }</pre>
|
271 | * </pre>
|
|
|
272 | *
|
241 | * </dd>
|
273 | * </dd>
|
242 | * </dl>
|
274 | * </dl>
|
|
|
275 | *
|
243 | * @return an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
|
276 | * @return an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
|
244 | */
|
277 | */
|
245 | public Iterator<Segment> getNodeIterator() {
|
278 | public Iterator<Segment> getNodeIterator()
|
|
|
279 | {
|
246 | return new NodeIterator(this);
|
280 | return new NodeIterator(this);
|
247 | }
|
281 | }
|
248 |
|
282 |
|
249 | /**
|
283 | /**
|
250 | * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
284 | * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
251 | * <p>
|
285 | * <p>
|
252 | * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
|
286 | * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
|
253 | * if this method is to be used on a large proportion of the source.
|
|
|
254 | * It is called automatically if this method is called on the {@link Source} object itself.
|
|
|
255 | * <p>
|
287 | * <p>
|
256 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
288 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
257 | *
|
289 | *
|
258 | * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
290 | * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
259 | */
|
291 | */
|
260 | public List<Tag> getAllTags() {
|
292 | public List<Tag> getAllTags()
|
|
|
293 | {
|
261 | return getAllTags(null);
|
294 | return getAllTags(null);
|
262 | }
|
295 | }
|
263 |
|
296 |
|
264 | /**
|
297 | /**
|
265 | * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
298 | * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
266 | * <p>
|
299 | * <p>
|
267 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
300 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
268 | * <p>
|
301 | * <p>
|
269 | * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #getAllTags()}.
|
302 | * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #getAllTags()}.
|
270 | *
|
303 | *
|
|
|
304 | * @param tagType
|
271 | * @param tagType the {@linkplain TagType type} of tags to get.
|
305 | * the {@linkplain TagType type} of tags to get.
|
272 | * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
306 | * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
273 | * @see #getAllStartTags(StartTagType)
|
307 | * @see #getAllStartTags(StartTagType)
|
274 | */
|
308 | */
|
275 | public List<Tag> getAllTags(final TagType tagType) {
|
309 | public List<Tag> getAllTags(final TagType tagType)
|
|
|
310 | {
|
276 | Tag tag=checkTagEnclosure(Tag.getNextTag(source,begin,tagType));
|
311 | Tag tag = checkTagEnclosure(Tag.getNextTag(source, begin, tagType));
|
277 | if (tag==null) return Collections.emptyList();
|
312 | if (tag == null) return Collections.emptyList();
|
278 | final ArrayList<Tag> list=new ArrayList<Tag>();
|
313 | final ArrayList<Tag> list = new ArrayList<Tag>();
|
279 | do {
|
314 | do
|
|
|
315 | {
|
280 | list.add(tag);
|
316 | list.add(tag);
|
281 | tag=checkTagEnclosure(tag.getNextTag(tagType));
|
317 | tag = checkTagEnclosure(tag.getNextTag(tagType));
|
|
|
318 | }
|
282 | } while (tag!=null);
|
319 | while (tag != null);
|
283 | return list;
|
320 | return list;
|
284 | }
|
321 | }
|
285 |
|
322 |
|
286 | /**
|
323 | /**
|
287 | * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
324 | * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
288 | * <p>
|
325 | * <p>
|
289 | * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
|
326 | * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
|
290 | * if this method is to be used on a large proportion of the source.
|
|
|
291 | * It is called automatically if this method is called on the {@link Source} object itself.
|
|
|
292 | * <p>
|
327 | * <p>
|
293 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
328 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
294 | *
|
329 | *
|
295 | * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
330 | * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
296 | */
|
331 | */
|
297 | public List<StartTag> getAllStartTags() {
|
332 | public List<StartTag> getAllStartTags()
|
|
|
333 | {
|
298 | StartTag startTag=checkEnclosure(StartTag.getNext(source,begin));
|
334 | StartTag startTag = checkEnclosure(StartTag.getNext(source, begin));
|
299 | if (startTag==null) return Collections.emptyList();
|
335 | if (startTag == null) return Collections.emptyList();
|
300 | final ArrayList<StartTag> list=new ArrayList<StartTag>();
|
336 | final ArrayList<StartTag> list = new ArrayList<StartTag>();
|
301 | do {
|
337 | do
|
|
|
338 | {
|
302 | list.add(startTag);
|
339 | list.add(startTag);
|
303 | startTag=checkEnclosure(startTag.getNextStartTag());
|
340 | startTag = checkEnclosure(startTag.getNextStartTag());
|
|
|
341 | }
|
304 | } while (startTag!=null);
|
342 | while (startTag != null);
|
305 | return list;
|
343 | return list;
|
306 | }
|
344 | }
|
307 |
|
345 |
|
308 | /**
|
346 | /**
|
309 | * Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
347 | * Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
310 | * <p>
|
348 | * <p>
|
311 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
349 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
312 | * <p>
|
350 | * <p>
|
313 | * Specifying a <code>null</code> argument to the <code>startTagType</code> parameter is equivalent to {@link #getAllStartTags()}.
|
351 | * Specifying a <code>null</code> argument to the <code>startTagType</code> parameter is equivalent to {@link #getAllStartTags()}.
|
314 | *
|
352 | *
|
|
|
353 | * @param startTagType
|
315 | * @param startTagType the {@linkplain StartTagType type} of tags to get.
|
354 | * the {@linkplain StartTagType type} of tags to get.
|
316 | * @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
355 | * @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
317 | */
|
356 | */
|
318 | public List<StartTag> getAllStartTags(final StartTagType startTagType) {
|
357 | public List<StartTag> getAllStartTags(final StartTagType startTagType)
|
|
|
358 | {
|
319 | if (startTagType==null) return getAllStartTags();
|
359 | if (startTagType == null) return getAllStartTags();
|
320 | StartTag startTag=(StartTag)checkTagEnclosure(Tag.getNextTag(source,begin,startTagType));
|
360 | StartTag startTag = (StartTag) checkTagEnclosure(Tag.getNextTag(source, begin, startTagType));
|
321 | if (startTag==null) return Collections.emptyList();
|
361 | if (startTag == null) return Collections.emptyList();
|
322 | final ArrayList<StartTag> list=new ArrayList<StartTag>();
|
362 | final ArrayList<StartTag> list = new ArrayList<StartTag>();
|
323 | do {
|
363 | do
|
|
|
364 | {
|
324 | list.add(startTag);
|
365 | list.add(startTag);
|
325 | startTag=(StartTag)checkTagEnclosure(startTag.getNextTag(startTagType));
|
366 | startTag = (StartTag) checkTagEnclosure(startTag.getNextTag(startTagType));
|
|
|
367 | }
|
326 | } while (startTag!=null);
|
368 | while (startTag != null);
|
327 | return list;
|
369 | return list;
|
328 | }
|
370 | }
|
329 |
|
371 |
|
330 | /**
|
372 | /**
|
331 | * Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
373 | * Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
… | |
… | |
333 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
375 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
334 | * <p>
|
376 | * <p>
|
335 | * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllStartTags()}, which may include non-{@linkplain StartTagType#NORMAL normal} start tags.
|
377 | * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllStartTags()}, which may include non-{@linkplain StartTagType#NORMAL normal} start tags.
|
336 | * <p>
|
378 | * <p>
|
337 | * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
|
379 | * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
|
338 | *
|
380 | *
|
|
|
381 | * @param name
|
339 | * @param name the {@linkplain StartTag#getName() name} of the start tags to get.
|
382 | * the {@linkplain StartTag#getName() name} of the start tags to get.
|
340 | * @return a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
383 | * @return a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
341 | */
|
384 | */
|
342 | public List<StartTag> getAllStartTags(String name) {
|
385 | public List<StartTag> getAllStartTags(String name)
|
|
|
386 | {
|
343 | if (name==null) return getAllStartTags();
|
387 | if (name == null) return getAllStartTags();
|
344 | final boolean isXMLTagName=Tag.isXMLName(name);
|
388 | final boolean isXMLTagName = Tag.isXMLName(name);
|
345 | StartTag startTag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName));
|
389 | StartTag startTag = checkEnclosure(StartTag.getNext(source, begin, name, StartTagType.NORMAL, isXMLTagName));
|
346 | if (startTag==null) return Collections.emptyList();
|
390 | if (startTag == null) return Collections.emptyList();
|
347 | final ArrayList<StartTag> list=new ArrayList<StartTag>();
|
391 | final ArrayList<StartTag> list = new ArrayList<StartTag>();
|
348 | do {
|
392 | do
|
|
|
393 | {
|
349 | list.add(startTag);
|
394 | list.add(startTag);
|
350 | startTag=checkEnclosure(StartTag.getNext(source,startTag.begin+1,name,StartTagType.NORMAL,isXMLTagName));
|
395 | startTag = checkEnclosure(StartTag.getNext(source, startTag.begin + 1, name, StartTagType.NORMAL, isXMLTagName));
|
|
|
396 | }
|
351 | } while (startTag!=null);
|
397 | while (startTag != null);
|
352 | return list;
|
398 | return list;
|
353 | }
|
399 | }
|
354 |
|
400 |
|
355 | /**
|
401 | /**
|
356 | * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
402 | * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
357 | * <p>
|
403 | * <p>
|
358 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
404 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
359 | *
|
405 | *
|
|
|
406 | * @param attributeName
|
360 | * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
407 | * the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
|
|
408 | * @param value
|
361 | * @param value the value of the specified attribute to search for, must not be <code>null</code>.
|
409 | * the value of the specified attribute to search for, must not be <code>null</code>.
|
|
|
410 | * @param valueCaseSensitive
|
362 | * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
|
411 | * specifies whether the attribute value matching is case sensitive.
|
363 | * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
412 | * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
364 | * @see #getAllStartTags(String attributeName, Pattern valueRegexPattern)
|
413 | * @see #getAllStartTags(String attributeName, Pattern valueRegexPattern)
|
365 | */
|
414 | */
|
366 | public List<StartTag> getAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive) {
|
415 | public List<StartTag> getAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive)
|
|
|
416 | {
|
367 | StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
|
417 | StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive));
|
368 | if (startTag==null) return Collections.emptyList();
|
418 | if (startTag == null) return Collections.emptyList();
|
369 | final ArrayList<StartTag> list=new ArrayList<StartTag>();
|
419 | final ArrayList<StartTag> list = new ArrayList<StartTag>();
|
370 | do {
|
420 | do
|
|
|
421 | {
|
371 | list.add(startTag);
|
422 | list.add(startTag);
|
372 | startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive));
|
423 | startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, value, valueCaseSensitive));
|
|
|
424 | }
|
373 | } while (startTag!=null);
|
425 | while (startTag != null);
|
374 | return list;
|
426 | return list;
|
375 | }
|
427 | }
|
376 |
|
428 |
|
377 | /**
|
429 | /**
|
378 | * Returns a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
430 | * Returns a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
379 | * <p>
|
431 | * <p>
|
380 | * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only,
|
432 | * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only, without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
|
381 | * without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
|
|
|
382 | * <p>
|
433 | * <p>
|
383 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
434 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
384 | *
|
435 | *
|
|
|
436 | * @param attributeName
|
385 | * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
437 | * the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
|
|
438 | * @param valueRegexPattern
|
386 | * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
|
439 | * the regular expression pattern that must match the attribute value, may be <code>null</code>.
|
387 | * @return a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
440 | * @return a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
388 | * @see #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)
|
441 | * @see #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)
|
389 | */
|
442 | */
|
390 | public List<StartTag> getAllStartTags(final String attributeName, final Pattern valueRegexPattern) {
|
443 | public List<StartTag> getAllStartTags(final String attributeName, final Pattern valueRegexPattern)
|
|
|
444 | {
|
391 | StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
|
445 | StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern));
|
392 | if (startTag==null) return Collections.emptyList();
|
446 | if (startTag == null) return Collections.emptyList();
|
393 | final ArrayList<StartTag> list=new ArrayList<StartTag>();
|
447 | final ArrayList<StartTag> list = new ArrayList<StartTag>();
|
394 | do {
|
448 | do
|
|
|
449 | {
|
395 | list.add(startTag);
|
450 | list.add(startTag);
|
396 | startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,valueRegexPattern));
|
451 | startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, valueRegexPattern));
|
|
|
452 | }
|
397 | } while (startTag!=null);
|
453 | while (startTag != null);
|
398 | return list;
|
454 | return list;
|
399 | }
|
455 | }
|
400 |
|
456 |
|
401 | /**
|
457 | /**
|
402 | * Returns a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
458 | * Returns a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
403 | * <p>
|
459 | * <p>
|
404 | * This matches start tags with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple
|
460 | * This matches start tags with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple class names separated by white space in the attribute value.
|
405 | * class names separated by white space in the attribute value.
|
|
|
406 | * <p>
|
461 | * <p>
|
407 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
462 | * See the {@link Tag} class documentation for more details about the behaviour of this method.
|
408 | *
|
463 | *
|
|
|
464 | * @param className
|
409 | * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
|
465 | * the class name (case sensitive) to search for, must not be <code>null</code>.
|
410 | * @return a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
466 | * @return a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
411 | */
|
467 | */
|
412 | public List<StartTag> getAllStartTagsByClass(final String className) {
|
468 | public List<StartTag> getAllStartTagsByClass(final String className)
|
|
|
469 | {
|
413 | return getAllStartTags("class",getClassPattern(className));
|
470 | return getAllStartTags("class", getClassPattern(className));
|
414 | }
|
471 | }
|
415 |
|
472 |
|
416 | /**
|
473 | /**
|
417 | * Returns a list of the immediate children of this segment in the document element hierarchy.
|
474 | * Returns a list of the immediate children of this segment in the document element hierarchy.
|
418 | * <p>
|
475 | * <p>
|
419 | * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
|
476 | * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
|
420 | * <p>
|
477 | * <p>
|
421 | * An element found at the start of this segment is included in the list.
|
478 | * An element found at the start of this segment is included in the list. Note however that if this segment <i>is</i> an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead, which only returns the children of the element.
|
422 | * Note however that if this segment <i>is</i> an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead,
|
|
|
423 | * which only returns the children of the element.
|
|
|
424 | * <p>
|
479 | * <p>
|
425 | * Calling <code>getChildElements()</code> on an <code>Element</code> is much more efficient than calling it on a <code>Segment</code>.
|
480 | * Calling <code>getChildElements()</code> on an <code>Element</code> is much more efficient than calling it on a <code>Segment</code>.
|
426 | * <p>
|
481 | * <p>
|
427 | * The objects in the list are all of type {@link Element}.
|
482 | * The objects in the list are all of type {@link Element}.
|
428 | * <p>
|
483 | * <p>
|
429 | * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
|
484 | * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
|
430 | * if this method is to be used on a large proportion of the source.
|
|
|
431 | * It is called automatically if this method is called on the {@link Source} object itself.
|
|
|
432 | * <p>
|
485 | * <p>
|
433 | * See the {@link Source#getChildElements()} method for more details.
|
486 | * See the {@link Source#getChildElements()} method for more details.
|
434 | *
|
487 | *
|
435 | * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>.
|
488 | * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>.
|
436 | * @see Element#getParentElement()
|
489 | * @see Element#getParentElement()
|
437 | */
|
490 | */
|
438 | public List<Element> getChildElements() {
|
491 | public List<Element> getChildElements()
|
|
|
492 | {
|
439 | if (length()==0) return Collections.emptyList();
|
493 | if (length() == 0) return Collections.emptyList();
|
440 | List<Element> childElements=new ArrayList<Element>();
|
494 | List<Element> childElements = new ArrayList<Element>();
|
441 | int pos=begin;
|
495 | int pos = begin;
|
442 | while (true) {
|
496 | while (true)
|
|
|
497 | {
|
443 | final StartTag childStartTag=source.getNextStartTag(pos);
|
498 | final StartTag childStartTag = source.getNextStartTag(pos);
|
444 | if (childStartTag==null || childStartTag.begin>=end) break;
|
499 | if (childStartTag == null || childStartTag.begin >= end) break;
|
445 | if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag()) {
|
500 | if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag())
|
|
|
501 | {
|
446 | pos=childStartTag.end;
|
502 | pos = childStartTag.end;
|
447 | continue;
|
503 | continue;
|
448 | }
|
504 | }
|
449 | final Element childElement=childStartTag.getElement();
|
505 | final Element childElement = childStartTag.getElement();
|
450 | childElements.add(childElement);
|
506 | childElements.add(childElement);
|
451 | childElement.getChildElements();
|
507 | childElement.getChildElements();
|
452 | pos=childElement.end;
|
508 | pos = childElement.end;
|
453 | }
|
509 | }
|
454 | return childElements;
|
510 | return childElements;
|
455 | }
|
511 | }
|
456 |
|
512 |
|
457 | /**
|
513 | /**
|
458 | * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
514 | * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
459 | * <p>
|
515 | * <p>
|
460 | * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
|
516 | * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
|
461 | * if this method is to be used on a large proportion of the source.
|
|
|
462 | * It is called automatically if this method is called on the {@link Source} object itself.
|
|
|
463 | * <p>
|
517 | * <p>
|
464 | * The elements returned correspond exactly with the start tags returned in the {@link #getAllStartTags()} method.
|
518 | * The elements returned correspond exactly with the start tags returned in the {@link #getAllStartTags()} method.
|
465 | * <p>
|
519 | * <p>
|
466 | * If this segment is itself an {@link Element}, the result includes this element in the list.
|
520 | * If this segment is itself an {@link Element}, the result includes this element in the list.
|
467 | *
|
521 | *
|
468 | * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
522 | * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
469 | */
|
523 | */
|
470 | public List<Element> getAllElements() {
|
524 | public List<Element> getAllElements()
|
|
|
525 | {
|
471 | return getAllElements(getAllStartTags());
|
526 | return getAllElements(getAllStartTags());
|
472 | }
|
527 | }
|
473 |
|
528 |
|
474 | /**
|
529 | /**
|
475 | * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
530 | * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
476 | * <p>
|
531 | * <p>
|
477 | * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String name)} method,
|
532 | * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String name)} method, except that elements which are not entirely enclosed by this segment are excluded.
|
478 | * except that elements which are not entirely enclosed by this segment are excluded.
|
|
|
479 | * <p>
|
533 | * <p>
|
480 | * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllElements()}, which may include elements of non-{@linkplain StartTagType#NORMAL normal} tags.
|
534 | * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllElements()}, which may include elements of non-{@linkplain StartTagType#NORMAL normal} tags.
|
481 | * <p>
|
535 | * <p>
|
482 | * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
|
536 | * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
|
483 | * <p>
|
537 | * <p>
|
484 | * If this segment is itself an {@link Element} with the specified name, the result includes this element in the list.
|
538 | * If this segment is itself an {@link Element} with the specified name, the result includes this element in the list.
|
485 | *
|
539 | *
|
|
|
540 | * @param name
|
486 | * @param name the {@linkplain Element#getName() name} of the elements to get.
|
541 | * the {@linkplain Element#getName() name} of the elements to get.
|
487 | * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
542 | * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
488 | */
|
543 | */
|
489 | public List<Element> getAllElements(String name) {
|
544 | public List<Element> getAllElements(String name)
|
|
|
545 | {
|
490 | return getAllElements(getAllStartTags(name));
|
546 | return getAllElements(getAllStartTags(name));
|
491 | }
|
547 | }
|
492 |
|
548 |
|
493 | /**
|
549 | /**
|
494 | * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
550 | * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
495 | * <p>
|
551 | * <p>
|
496 | * The elements returned correspond with the start tags returned in the {@link #getAllTags(TagType)} method,
|
552 | * The elements returned correspond with the start tags returned in the {@link #getAllTags(TagType)} method, except that elements which are not entirely enclosed by this segment are excluded.
|
497 | * except that elements which are not entirely enclosed by this segment are excluded.
|
|
|
498 | * <p>
|
553 | * <p>
|
499 | * If this segment is itself an {@link Element} with the specified type, the result includes this element in the list.
|
554 | * If this segment is itself an {@link Element} with the specified type, the result includes this element in the list.
|
500 | *
|
555 | *
|
|
|
556 | * @param startTagType
|
501 | * @param startTagType the {@linkplain StartTagType type} of start tags to get, must not be <code>null</code>.
|
557 | * the {@linkplain StartTagType type} of start tags to get, must not be <code>null</code>.
|
502 | * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
558 | * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
503 | */
|
559 | */
|
504 | public List<Element> getAllElements(final StartTagType startTagType) {
|
560 | public List<Element> getAllElements(final StartTagType startTagType)
|
|
|
561 | {
|
505 | if (startTagType==null) throw new IllegalArgumentException("startTagType argument must not be null");
|
562 | if (startTagType == null) throw new IllegalArgumentException("startTagType argument must not be null");
|
506 | return getAllElements(getAllStartTags(startTagType));
|
563 | return getAllElements(getAllStartTags(startTagType));
|
507 | }
|
564 | }
|
508 |
|
565 |
|
509 | /**
|
566 | /**
|
510 | * Returns a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
567 | * Returns a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
511 | * <p>
|
568 | * <p>
|
512 | * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method,
|
569 | * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method, except that elements which are not entirely enclosed by this segment are excluded.
|
513 | * except that elements which are not entirely enclosed by this segment are excluded.
|
|
|
514 | * <p>
|
570 | * <p>
|
515 | * If this segment is itself an {@link Element} with the specified name/value pair, the result includes this element in the list.
|
571 | * If this segment is itself an {@link Element} with the specified name/value pair, the result includes this element in the list.
|
516 | *
|
572 | *
|
|
|
573 | * @param attributeName
|
517 | * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
574 | * the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
|
|
575 | * @param value
|
518 | * @param value the value of the specified attribute to search for, must not be <code>null</code>.
|
576 | * the value of the specified attribute to search for, must not be <code>null</code>.
|
|
|
577 | * @param valueCaseSensitive
|
519 | * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
|
578 | * specifies whether the attribute value matching is case sensitive.
|
520 | * @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
579 | * @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
521 | * @see #getAllElements(String attributeName, Pattern valueRegexPattern)
|
580 | * @see #getAllElements(String attributeName, Pattern valueRegexPattern)
|
522 | */
|
581 | */
|
523 | public List<Element> getAllElements(final String attributeName, final String value, final boolean valueCaseSensitive) {
|
582 | public List<Element> getAllElements(final String attributeName, final String value, final boolean valueCaseSensitive)
|
|
|
583 | {
|
524 | return getAllElements(getAllStartTags(attributeName,value,valueCaseSensitive));
|
584 | return getAllElements(getAllStartTags(attributeName, value, valueCaseSensitive));
|
525 | }
|
585 | }
|
526 |
|
586 |
|
527 | /**
|
587 | /**
|
528 | * Returns a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
588 | * Returns a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
529 | * <p>
|
589 | * <p>
|
530 | * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, Pattern valueRegexPattern)} method,
|
590 | * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, Pattern valueRegexPattern)} method, except that elements which are not entirely enclosed by this segment are excluded.
|
531 | * except that elements which are not entirely enclosed by this segment are excluded.
|
|
|
532 | * <p>
|
591 | * <p>
|
533 | * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only,
|
592 | * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only, without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
|
534 | * without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
|
|
|
535 | * <p>
|
593 | * <p>
|
536 | * If this segment is itself an {@link Element} with the specified attribute name and value pattern, the result includes this element in the list.
|
594 | * If this segment is itself an {@link Element} with the specified attribute name and value pattern, the result includes this element in the list.
|
537 | *
|
595 | *
|
|
|
596 | * @param attributeName
|
538 | * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
597 | * the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
|
|
598 | * @param valueRegexPattern
|
539 | * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
|
599 | * the regular expression pattern that must match the attribute value, may be <code>null</code>.
|
540 | * @return a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
600 | * @return a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
541 | * @see #getAllElements(String attributeName, String value, boolean valueCaseSensitive)
|
601 | * @see #getAllElements(String attributeName, String value, boolean valueCaseSensitive)
|
542 | */
|
602 | */
|
543 | public List<Element> getAllElements(final String attributeName, final Pattern valueRegexPattern) {
|
603 | public List<Element> getAllElements(final String attributeName, final Pattern valueRegexPattern)
|
|
|
604 | {
|
544 | return getAllElements(getAllStartTags(attributeName,valueRegexPattern));
|
605 | return getAllElements(getAllStartTags(attributeName, valueRegexPattern));
|
545 | }
|
606 | }
|
546 |
|
607 |
|
547 | /**
|
608 | /**
|
548 | * Returns a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
609 | * Returns a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
549 | * <p>
|
610 | * <p>
|
550 | * This matches elements with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple
|
611 | * This matches elements with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple class names separated by white space in the attribute value.
|
551 | * class names separated by white space in the attribute value.
|
|
|
552 | * <p>
|
612 | * <p>
|
553 | * The elements returned correspond with the start tags returned in the {@link #getAllStartTagsByClass(String className)} method,
|
613 | * The elements returned correspond with the start tags returned in the {@link #getAllStartTagsByClass(String className)} method, except that elements which are not entirely enclosed by this segment are excluded.
|
554 | * except that elements which are not entirely enclosed by this segment are excluded.
|
|
|
555 | * <p>
|
614 | * <p>
|
556 | * If this segment is itself an {@link Element} with the specified class, the result includes this element in the list.
|
615 | * If this segment is itself an {@link Element} with the specified class, the result includes this element in the list.
|
557 | *
|
616 | *
|
|
|
617 | * @param className
|
558 | * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
|
618 | * the class name (case sensitive) to search for, must not be <code>null</code>.
|
559 | * @return a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
619 | * @return a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
560 | */
|
620 | */
|
561 | public List<Element> getAllElementsByClass(final String className) {
|
621 | public List<Element> getAllElementsByClass(final String className)
|
|
|
622 | {
|
562 | return getAllElements(getAllStartTagsByClass(className));
|
623 | return getAllElements(getAllStartTagsByClass(className));
|
563 | }
|
624 | }
|
564 |
|
625 |
|
565 | /**
|
626 | /**
|
566 | * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
627 | * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
|
|
628 | *
|
567 | * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
629 | * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
568 | */
|
630 | */
|
569 | public List<CharacterReference> getAllCharacterReferences() {
|
631 | public List<CharacterReference> getAllCharacterReferences()
|
|
|
632 | {
|
570 | CharacterReference characterReference=getNextCharacterReference(begin);
|
633 | CharacterReference characterReference = getNextCharacterReference(begin);
|
571 | if (characterReference==null) return Collections.emptyList();
|
634 | if (characterReference == null) return Collections.emptyList();
|
572 | final ArrayList<CharacterReference> list=new ArrayList<CharacterReference>();
|
635 | final ArrayList<CharacterReference> list = new ArrayList<CharacterReference>();
|
573 | do {
|
636 | do
|
|
|
637 | {
|
574 | list.add(characterReference);
|
638 | list.add(characterReference);
|
575 | characterReference=getNextCharacterReference(characterReference.end);
|
639 | characterReference = getNextCharacterReference(characterReference.end);
|
|
|
640 | }
|
576 | } while (characterReference!=null);
|
641 | while (characterReference != null);
|
577 | return list;
|
642 | return list;
|
578 | }
|
643 | }
|
579 |
|
644 |
|
580 | /**
|
645 | /**
|
581 | * Returns a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
|
646 | * Returns a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
|
582 | * <p>
|
647 | * <p>
|
583 | * According to the <a target="_blank" href="http://www.w3.org/TR/html401/">HTML 4.01 specification</a>, the following attributes have URI values:
|
648 | * According to the <a target="_blank" href="http://www.w3.org/TR/html401/">HTML 4.01 specification</a>, the following attributes have URI values:
|
584 | * <table class="bordered" cellspacing="0">
|
649 | * <table class="bordered" cellspacing="0">
|
585 | * <tr><th>HTML element name<th>Attribute name
|
650 | * <tr>
|
|
|
651 | * <th>HTML element name
|
|
|
652 | * <th>Attribute name
|
|
|
653 | * <tr>
|
586 | * <tr><td>{@link HTMLElementName#A A}<td>href
|
654 | * <td>{@link HTMLElementName#A A}
|
|
|
655 | * <td>href
|
|
|
656 | * <tr>
|
587 | * <tr><td>{@link HTMLElementName#APPLET APPLET}<td>codebase
|
657 | * <td>{@link HTMLElementName#APPLET APPLET}
|
|
|
658 | * <td>codebase
|
|
|
659 | * <tr>
|
588 | * <tr><td>{@link HTMLElementName#AREA AREA}<td>href
|
660 | * <td>{@link HTMLElementName#AREA AREA}
|
|
|
661 | * <td>href
|
|
|
662 | * <tr>
|
589 | * <tr><td>{@link HTMLElementName#BASE BASE}<td>href
|
663 | * <td>{@link HTMLElementName#BASE BASE}
|
|
|
664 | * <td>href
|
|
|
665 | * <tr>
|
590 | * <tr><td>{@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}<td>cite
|
666 | * <td>{@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}
|
|
|
667 | * <td>cite
|
|
|
668 | * <tr>
|
591 | * <tr><td>{@link HTMLElementName#BODY BODY}<td>background
|
669 | * <td>{@link HTMLElementName#BODY BODY}
|
|
|
670 | * <td>background
|
|
|
671 | * <tr>
|
592 | * <tr><td>{@link HTMLElementName#FORM FORM}<td>action
|
672 | * <td>{@link HTMLElementName#FORM FORM}
|
593 | * <tr><td>{@link HTMLElementName#FRAME FRAME}<td>longdesc
|
673 | * <td>action
|
|
|
674 | * <tr>
|
594 | * <tr><td>{@link HTMLElementName#FRAME FRAME}<td>src
|
675 | * <td>{@link HTMLElementName#FRAME FRAME}
|
|
|
676 | * <td>longdesc
|
|
|
677 | * <tr>
|
|
|
678 | * <td>{@link HTMLElementName#FRAME FRAME}
|
|
|
679 | * <td>src
|
|
|
680 | * <tr>
|
595 | * <tr><td>{@link HTMLElementName#DEL DEL}<td>cite
|
681 | * <td>{@link HTMLElementName#DEL DEL}
|
|
|
682 | * <td>cite
|
|
|
683 | * <tr>
|
596 | * <tr><td>{@link HTMLElementName#HEAD HEAD}<td>profile
|
684 | * <td>{@link HTMLElementName#HEAD HEAD}
|
597 | * <tr><td>{@link HTMLElementName#IFRAME IFRAME}<td>longdesc
|
685 | * <td>profile
|
|
|
686 | * <tr>
|
598 | * <tr><td>{@link HTMLElementName#IFRAME IFRAME}<td>src
|
687 | * <td>{@link HTMLElementName#IFRAME IFRAME}
|
599 | * <tr><td>{@link HTMLElementName#IMG IMG}<td>longdesc
|
688 | * <td>longdesc
|
|
|
689 | * <tr>
|
|
|
690 | * <td>{@link HTMLElementName#IFRAME IFRAME}
|
|
|
691 | * <td>src
|
|
|
692 | * <tr>
|
600 | * <tr><td>{@link HTMLElementName#IMG IMG}<td>src
|
693 | * <td>{@link HTMLElementName#IMG IMG}
|
|
|
694 | * <td>longdesc
|
|
|
695 | * <tr>
|
601 | * <tr><td>{@link HTMLElementName#IMG IMG}<td>usemap
|
696 | * <td>{@link HTMLElementName#IMG IMG}
|
|
|
697 | * <td>src
|
|
|
698 | * <tr>
|
|
|
699 | * <td>{@link HTMLElementName#IMG IMG}
|
|
|
700 | * <td>usemap
|
|
|
701 | * <tr>
|
602 | * <tr><td>{@link HTMLElementName#INPUT INPUT}<td>src
|
702 | * <td>{@link HTMLElementName#INPUT INPUT}
|
|
|
703 | * <td>src
|
|
|
704 | * <tr>
|
603 | * <tr><td>{@link HTMLElementName#INPUT INPUT}<td>usemap
|
705 | * <td>{@link HTMLElementName#INPUT INPUT}
|
|
|
706 | * <td>usemap
|
|
|
707 | * <tr>
|
604 | * <tr><td>{@link HTMLElementName#INS INS}<td>cite
|
708 | * <td>{@link HTMLElementName#INS INS}
|
|
|
709 | * <td>cite
|
|
|
710 | * <tr>
|
605 | * <tr><td>{@link HTMLElementName#LINK LINK}<td>href
|
711 | * <td>{@link HTMLElementName#LINK LINK}
|
606 | * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>classid
|
712 | * <td>href
|
607 | * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>codebase
|
713 | * <tr>
|
608 | * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>data
|
714 | * <td>{@link HTMLElementName#OBJECT OBJECT}
|
|
|
715 | * <td>classid
|
|
|
716 | * <tr>
|
609 | * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>usemap
|
717 | * <td>{@link HTMLElementName#OBJECT OBJECT}
|
|
|
718 | * <td>codebase
|
|
|
719 | * <tr>
|
|
|
720 | * <td>{@link HTMLElementName#OBJECT OBJECT}
|
|
|
721 | * <td>data
|
|
|
722 | * <tr>
|
|
|
723 | * <td>{@link HTMLElementName#OBJECT OBJECT}
|
|
|
724 | * <td>usemap
|
|
|
725 | * <tr>
|
610 | * <tr><td>{@link HTMLElementName#Q Q}<td>cite
|
726 | * <td>{@link HTMLElementName#Q Q}
|
|
|
727 | * <td>cite
|
|
|
728 | * <tr>
|
611 | * <tr><td>{@link HTMLElementName#SCRIPT SCRIPT}<td>src
|
729 | * <td>{@link HTMLElementName#SCRIPT SCRIPT}
|
|
|
730 | * <td>src
|
612 | * </table>
|
731 | * </table>
|
613 | * <p>
|
732 | * <p>
|
614 | * Attributes from other elements may also be returned if the attribute name matches one of those in the list above.
|
733 | * Attributes from other elements may also be returned if the attribute name matches one of those in the list above.
|
615 | * <p>
|
734 | * <p>
|
616 | * This method is often used in conjunction with the {@link #getStyleURISegments()} method in order to find all URIs in a document.
|
735 | * This method is often used in conjunction with the {@link #getStyleURISegments()} method in order to find all URIs in a document.
|
617 | * <p>
|
736 | * <p>
|
618 | * The attributes are returned in order of appearance.
|
737 | * The attributes are returned in order of appearance.
|
619 | *
|
738 | *
|
620 | * @return a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
|
739 | * @return a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
|
621 | * @see #getStyleURISegments()
|
740 | * @see #getStyleURISegments()
|
622 | */
|
741 | */
|
623 | public List<Attribute> getURIAttributes() {
|
742 | public List<Attribute> getURIAttributes()
|
|
|
743 | {
|
624 | return URIAttributes.getList(this);
|
744 | return URIAttributes.getList(this);
|
625 | }
|
745 | }
|
626 |
|
746 |
|
627 | /**
|
747 | /**
|
628 | * Returns a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments}
|
748 | * Returns a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
|
629 | * inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
|
|
|
630 | * <p>
|
749 | * <p>
|
631 | * If this segment does not contain any tags, the entire segment is assumed to be a <code>style</code> attribute value.
|
750 | * If this segment does not contain any tags, the entire segment is assumed to be a <code>style</code> attribute value.
|
632 | * <p>
|
751 | * <p>
|
633 | * The URI segments are found by searching the <code>style</code> attribute values for the functional notation "<code>url()</code>" as described in
|
752 | * The URI segments are found by searching the <code>style</code> attribute values for the functional notation "<code>url()</code>" as described in <a target="_blank" href="http://www.w3.org/TR/CSS2/syndata.html#uri">section 4.3.4 of the CSS2 specification</a>.
|
634 | * <a target="_blank" href="http://www.w3.org/TR/CSS2/syndata.html#uri">section 4.3.4 of the CSS2 specification</a>.
|
|
|
635 | * <p>
|
753 | * <p>
|
636 | * The segments are returned in order of appearance.
|
754 | * The segments are returned in order of appearance.
|
637 | *
|
755 | *
|
638 | * @return a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
|
756 | * @return a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
|
639 | * @see #getURIAttributes()
|
757 | * @see #getURIAttributes()
|
640 | */
|
758 | */
|
641 | public List<Segment> getStyleURISegments() {
|
759 | public List<Segment> getStyleURISegments()
|
|
|
760 | {
|
642 | return URIAttributes.getStyleURISegments(this);
|
761 | return URIAttributes.getStyleURISegments(this);
|
643 | }
|
762 | }
|
644 |
|
763 |
|
645 | /**
|
764 | /**
|
646 | * Returns the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
|
765 | * Returns the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
|
647 | * <p>
|
766 | * <p>
|
648 | * This is functionally equivalent to {@link #getAllStartTags()}<code>.iterator().next()</code>,
|
767 | * This is functionally equivalent to {@link #getAllStartTags()}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
649 | * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
|
|
650 | *
|
768 | *
|
651 | * @return the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
769 | * @return the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
652 | */
|
770 | */
|
653 | public final StartTag getFirstStartTag() {
|
771 | public final StartTag getFirstStartTag()
|
|
|
772 | {
|
654 | return checkEnclosure(source.getNextStartTag(begin));
|
773 | return checkEnclosure(source.getNextStartTag(begin));
|
655 | }
|
774 | }
|
656 |
|
775 |
|
657 | /**
|
776 | /**
|
658 | * Returns the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment.
|
777 | * Returns the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment.
|
659 | * <p>
|
778 | * <p>
|
660 | * This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}<code>.iterator().next()</code>,
|
779 | * This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
661 | * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
|
|
662 | *
|
780 | *
|
|
|
781 | * @param startTagType
|
663 | * @param startTagType the <code>StartTagType</code> to search for.
|
782 | * the <code>StartTagType</code> to search for.
|
664 | * @return the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
783 | * @return the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
665 | */
|
784 | */
|
666 | public final StartTag getFirstStartTag(StartTagType startTagType) {
|
785 | public final StartTag getFirstStartTag(StartTagType startTagType)
|
|
|
786 | {
|
667 | return checkEnclosure(source.getNextStartTag(begin,startTagType));
|
787 | return checkEnclosure(source.getNextStartTag(begin, startTagType));
|
668 | }
|
|
|
669 |
|
788 | }
|
|
|
789 |
|
670 | /**
|
790 | /**
|
671 | * Returns the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
|
791 | * Returns the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
|
672 | * <p>
|
792 | * <p>
|
673 | * This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}<code>.iterator().next()</code>,
|
793 | * This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
674 | * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
|
|
675 | * <p>
|
794 | * <p>
|
676 | * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstStartTag()}.
|
795 | * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstStartTag()}.
|
677 | *
|
796 | *
|
|
|
797 | * @param name
|
678 | * @param name the {@linkplain StartTag#getName() name} of the start tag to search for, may be <code>null</code>.
|
798 | * the {@linkplain StartTag#getName() name} of the start tag to search for, may be <code>null</code>.
|
679 | * @return the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
799 | * @return the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
680 | */
|
800 | */
|
681 | public final StartTag getFirstStartTag(String name) {
|
801 | public final StartTag getFirstStartTag(String name)
|
|
|
802 | {
|
682 | return checkEnclosure(source.getNextStartTag(begin,name));
|
803 | return checkEnclosure(source.getNextStartTag(begin, name));
|
683 | }
|
|
|
684 |
|
804 | }
|
|
|
805 |
|
685 | /**
|
806 | /**
|
686 | * Returns the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
|
807 | * Returns the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
|
687 | * <p>
|
808 | * <p>
|
688 | * This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>,
|
809 | * This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
689 | * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
|
|
690 | *
|
810 | *
|
|
|
811 | * @param attributeName
|
691 | * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
812 | * the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
|
|
813 | * @param value
|
692 | * @param value the value of the specified attribute to search for, must not be <code>null</code>.
|
814 | * the value of the specified attribute to search for, must not be <code>null</code>.
|
|
|
815 | * @param valueCaseSensitive
|
693 | * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
|
816 | * specifies whether the attribute value matching is case sensitive.
|
694 | * @return the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
817 | * @return the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
695 | * @see #getFirstStartTag(String attributeName, Pattern valueRegexPattern)
|
818 | * @see #getFirstStartTag(String attributeName, Pattern valueRegexPattern)
|
696 | */
|
819 | */
|
697 | public final StartTag getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive) {
|
820 | public final StartTag getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
|
|
|
821 | {
|
698 | return checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
|
822 | return checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive));
|
699 | }
|
823 | }
|
700 |
|
824 |
|
701 | /**
|
825 | /**
|
702 | * Returns the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
|
826 | * Returns the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
|
703 | * <p>
|
827 | * <p>
|
704 | * This is functionally equivalent to {@link #getAllStartTags(String,Pattern) getAllStartTags(attributeName,valueRegexPattern)}<code>.iterator().next()</code>,
|
828 | * This is functionally equivalent to {@link #getAllStartTags(String,Pattern) getAllStartTags(attributeName,valueRegexPattern)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
705 | * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
|
|
706 | *
|
829 | *
|
|
|
830 | * @param attributeName
|
707 | * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
831 | * the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
|
|
832 | * @param valueRegexPattern
|
708 | * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
|
833 | * the regular expression pattern that must match the attribute value, may be <code>null</code>.
|
709 | * @return the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
834 | * @return the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
710 | * @see #getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
|
835 | * @see #getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
|
711 | */
|
836 | */
|
712 | public final StartTag getFirstStartTag(final String attributeName, final Pattern valueRegexPattern) {
|
837 | public final StartTag getFirstStartTag(final String attributeName, final Pattern valueRegexPattern)
|
|
|
838 | {
|
713 | return checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
|
839 | return checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern));
|
714 | }
|
840 | }
|
715 |
|
841 |
|
716 | /**
|
842 | /**
|
717 | * Returns the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
|
843 | * Returns the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
|
718 | * <p>
|
844 | * <p>
|
719 | * This is functionally equivalent to {@link #getAllStartTagsByClass(String) getAllStartTagsByClass(className)}<code>.iterator().next()</code>,
|
845 | * This is functionally equivalent to {@link #getAllStartTagsByClass(String) getAllStartTagsByClass(className)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
720 | * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
|
|
721 | *
|
846 | *
|
|
|
847 | * @param className
|
722 | * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
|
848 | * the class name (case sensitive) to search for, must not be <code>null</code>.
|
723 | * @return the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
849 | * @return the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
724 | */
|
850 | */
|
725 | public final StartTag getFirstStartTagByClass(final String className) {
|
851 | public final StartTag getFirstStartTagByClass(final String className)
|
|
|
852 | {
|
726 | return checkEnclosure(source.getNextStartTagByClass(begin,className));
|
853 | return checkEnclosure(source.getNextStartTagByClass(begin, className));
|
727 | }
|
|
|
728 |
|
854 | }
|
|
|
855 |
|
729 | /**
|
856 | /**
|
730 | * Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment.
|
857 | * Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment.
|
731 | * <p>
|
858 | * <p>
|
732 | * This is functionally equivalent to {@link #getAllElements()}<code>.iterator().next()</code>,
|
859 | * This is functionally equivalent to {@link #getAllElements()}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
733 | * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
|
|
734 | * <p>
|
860 | * <p>
|
735 | * If this segment is itself an {@link Element}, this element is returned, not the first child element.
|
861 | * If this segment is itself an {@link Element}, this element is returned, not the first child element.
|
736 | *
|
862 | *
|
737 | * @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
863 | * @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
738 | */
|
864 | */
|
739 | public final Element getFirstElement() {
|
865 | public final Element getFirstElement()
|
|
|
866 | {
|
740 | StartTag startTag=checkEnclosure(StartTag.getNext(source,begin));
|
867 | StartTag startTag = checkEnclosure(StartTag.getNext(source, begin));
|
741 | while (startTag!=null) {
|
868 | while (startTag != null)
|
|
|
869 | {
|
742 | final Element element=startTag.getElement();
|
870 | final Element element = startTag.getElement();
|
743 | if (element.end<=end) return element;
|
871 | if (element.end <= end) return element;
|
744 | startTag=checkEnclosure(startTag.getNextStartTag());
|
872 | startTag = checkEnclosure(startTag.getNextStartTag());
|
745 | }
|
873 | }
|
746 | return null;
|
874 | return null;
|
747 | }
|
875 | }
|
748 |
|
876 |
|
749 | /**
|
877 | /**
|
750 | * Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment.
|
878 | * Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment.
|
751 | * <p>
|
879 | * <p>
|
752 | * This is functionally equivalent to {@link #getAllElements(String) getAllElements(name)}<code>.iterator().next()</code>,
|
880 | * This is functionally equivalent to {@link #getAllElements(String) getAllElements(name)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
753 | * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
|
|
754 | * <p>
|
881 | * <p>
|
755 | * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstElement()}.
|
882 | * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstElement()}.
|
756 | * <p>
|
883 | * <p>
|
757 | * If this segment is itself an {@link Element} with the specified name, this element is returned.
|
884 | * If this segment is itself an {@link Element} with the specified name, this element is returned.
|
758 | *
|
885 | *
|
|
|
886 | * @param name
|
759 | * @param name the {@linkplain Element#getName() name} of the element to search for.
|
887 | * the {@linkplain Element#getName() name} of the element to search for.
|
760 | * @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
888 | * @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
761 | */
|
889 | */
|
762 | public final Element getFirstElement(String name) {
|
890 | public final Element getFirstElement(String name)
|
|
|
891 | {
|
763 | if (name==null) return getFirstElement();
|
892 | if (name == null) return getFirstElement();
|
764 | final boolean isXMLTagName=Tag.isXMLName(name);
|
893 | final boolean isXMLTagName = Tag.isXMLName(name);
|
765 | StartTag startTag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName));
|
894 | StartTag startTag = checkEnclosure(StartTag.getNext(source, begin, name, StartTagType.NORMAL, isXMLTagName));
|
766 | while (startTag!=null) {
|
895 | while (startTag != null)
|
|
|
896 | {
|
767 | final Element element=startTag.getElement();
|
897 | final Element element = startTag.getElement();
|
768 | if (element.end<=end) return element;
|
898 | if (element.end <= end) return element;
|
769 | startTag=checkEnclosure(StartTag.getNext(source,startTag.begin+1,name,StartTagType.NORMAL,isXMLTagName));
|
899 | startTag = checkEnclosure(StartTag.getNext(source, startTag.begin + 1, name, StartTagType.NORMAL, isXMLTagName));
|
770 | }
|
900 | }
|
771 | return null;
|
901 | return null;
|
772 | }
|
902 | }
|
773 |
|
903 |
|
774 | /**
|
904 | /**
|
775 | * Returns the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
|
905 | * Returns the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
|
776 | * <p>
|
906 | * <p>
|
777 | * This is functionally equivalent to {@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>,
|
907 | * This is functionally equivalent to {@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
778 | * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
|
|
779 | * <p>
|
908 | * <p>
|
780 | * If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned.
|
909 | * If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned.
|
781 | *
|
910 | *
|
|
|
911 | * @param attributeName
|
782 | * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
912 | * the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
|
|
913 | * @param value
|
783 | * @param value the value of the specified attribute to search for, must not be <code>null</code>.
|
914 | * the value of the specified attribute to search for, must not be <code>null</code>.
|
|
|
915 | * @param valueCaseSensitive
|
784 | * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
|
916 | * specifies whether the attribute value matching is case sensitive.
|
785 | * @return the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
917 | * @return the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
786 | * @see #getFirstElement(String attributeName, Pattern valueRegexPattern)
|
918 | * @see #getFirstElement(String attributeName, Pattern valueRegexPattern)
|
787 | */
|
919 | */
|
788 | public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive) {
|
920 | public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
|
|
|
921 | {
|
789 | StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
|
922 | StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive));
|
790 | while (startTag!=null) {
|
923 | while (startTag != null)
|
|
|
924 | {
|
791 | final Element element=startTag.getElement();
|
925 | final Element element = startTag.getElement();
|
792 | if (element.end<=end) return element;
|
926 | if (element.end <= end) return element;
|
793 | startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive));
|
927 | startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, value, valueCaseSensitive));
|
794 | }
|
928 | }
|
795 | return null;
|
929 | return null;
|
796 | }
|
930 | }
|
797 |
|
931 |
|
798 | /**
|
932 | /**
|
799 | * Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
|
933 | * Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
|
800 | * <p>
|
934 | * <p>
|
801 | * This is functionally equivalent to {@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}<code>.iterator().next()</code>,
|
935 | * This is functionally equivalent to {@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
802 | * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
|
|
803 | * <p>
|
936 | * <p>
|
804 | * If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned.
|
937 | * If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned.
|
805 | *
|
938 | *
|
|
|
939 | * @param attributeName
|
806 | * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
940 | * the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
|
|
941 | * @param valueRegexPattern
|
807 | * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
|
942 | * the regular expression pattern that must match the attribute value, may be <code>null</code>.
|
808 | * @return the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
943 | * @return the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
809 | * @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
|
944 | * @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
|
810 | */
|
945 | */
|
811 | public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern) {
|
946 | public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern)
|
|
|
947 | {
|
812 | StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
|
948 | StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern));
|
813 | while (startTag!=null) {
|
949 | while (startTag != null)
|
|
|
950 | {
|
814 | final Element element=startTag.getElement();
|
951 | final Element element = startTag.getElement();
|
815 | if (element.end<=end) return element;
|
952 | if (element.end <= end) return element;
|
816 | startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,valueRegexPattern));
|
953 | startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, valueRegexPattern));
|
817 | }
|
954 | }
|
818 | return null;
|
955 | return null;
|
819 | }
|
956 | }
|
820 |
|
957 |
|
821 | /**
|
958 | /**
|
822 | * Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
|
959 | * Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
|
823 | * <p>
|
960 | * <p>
|
824 | * This is functionally equivalent to {@link #getAllElementsByClass(String) getAllElementsByClass(className)}<code>.iterator().next()</code>,
|
961 | * This is functionally equivalent to {@link #getAllElementsByClass(String) getAllElementsByClass(className)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
825 | * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
|
|
826 | * <p>
|
962 | * <p>
|
827 | * If this segment is itself an {@link Element} with the specified class, this element is returned.
|
963 | * If this segment is itself an {@link Element} with the specified class, this element is returned.
|
828 | *
|
964 | *
|
|
|
965 | * @param className
|
829 | * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
|
966 | * the class name (case sensitive) to search for, must not be <code>null</code>.
|
830 | * @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
967 | * @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
831 | */
|
968 | */
|
832 | public final Element getFirstElementByClass(final String className) {
|
969 | public final Element getFirstElementByClass(final String className)
|
|
|
970 | {
|
833 | StartTag startTag=checkEnclosure(source.getNextStartTagByClass(begin,className));
|
971 | StartTag startTag = checkEnclosure(source.getNextStartTagByClass(begin, className));
|
834 | while (startTag!=null) {
|
972 | while (startTag != null)
|
|
|
973 | {
|
835 | final Element element=startTag.getElement();
|
974 | final Element element = startTag.getElement();
|
836 | if (element.end<=end) return element;
|
975 | if (element.end <= end) return element;
|
837 | startTag=checkEnclosure(source.getNextStartTagByClass(startTag.begin+1,className));
|
976 | startTag = checkEnclosure(source.getNextStartTagByClass(startTag.begin + 1, className));
|
838 | }
|
977 | }
|
839 | return null;
|
978 | return null;
|
840 | }
|
979 | }
|
841 |
|
980 |
|
842 | /**
|
981 | /**
|
843 | * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
982 | * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
|
|
983 | *
|
844 | * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
984 | * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
845 | */
|
985 | */
|
846 | public List<FormControl> getFormControls() {
|
986 | public List<FormControl> getFormControls()
|
|
|
987 | {
|
847 | return FormControl.getAll(this);
|
988 | return FormControl.getAll(this);
|
848 | }
|
989 | }
|
849 |
|
990 |
|
850 | /**
|
991 | /**
|
851 | * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
992 | * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
852 | * <p>
|
993 | * <p>
|
853 | * This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #getFormControls()}<code>)</code>.
|
994 | * This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #getFormControls()}<code>)</code>.
|
854 | *
|
995 | *
|
855 | * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
996 | * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
856 | * @see #getFormControls()
|
997 | * @see #getFormControls()
|
857 | */
|
998 | */
|
858 | public FormFields getFormFields() {
|
999 | public FormFields getFormFields()
|
|
|
1000 | {
|
859 | return new FormFields(getFormControls());
|
1001 | return new FormFields(getFormControls());
|
860 | }
|
1002 | }
|
861 |
|
1003 |
|
862 | /**
|
1004 | /**
|
863 | * Parses any {@link Attributes} within this segment.
|
1005 | * Parses any {@link Attributes} within this segment.
|
864 | * This method is only used in the unusual situation where attributes exist outside of a start tag.
|
1006 | * This method is only used in the unusual situation where attributes exist outside of a start tag.
|
865 | * The {@link StartTag#getAttributes()} method should be used in normal situations.
|
1007 | * The {@link StartTag#getAttributes()} method should be used in normal situations.
|
866 | * <p>
|
1008 | * <p>
|
867 | * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
|
1009 | * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
|
868 | *
|
1010 | *
|
869 | * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
|
1011 | * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
|
870 | */
|
1012 | */
|
871 | public Attributes parseAttributes() {
|
1013 | public Attributes parseAttributes()
|
|
|
1014 | {
|
872 | return source.parseAttributes(begin,end);
|
1015 | return source.parseAttributes(begin, end);
|
873 | }
|
1016 | }
|
874 |
|
1017 |
|
875 | /**
|
1018 | /**
|
876 | * Causes this segment to be ignored when parsing.
|
1019 | * Causes this segment to be ignored when parsing.
|
877 | * <p>
|
1020 | * <p>
|
878 | * Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions.
|
1021 | * Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions.
|
879 | * <p>
|
1022 | * <p>
|
880 | * This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside
|
1023 | * This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags (such as where an {@linkplain Attribute attribute} of a normal tag uses a server tag to dynamically set its value), as well as preventing non-server tags from being recognised inside server tags.
|
881 | * {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags
|
|
|
882 | * (such as where an {@linkplain Attribute attribute} of a normal tag uses a server tag to dynamically set its value),
|
|
|
883 | * as well as preventing non-server tags from being recognised inside server tags.
|
|
|
884 | * <p>
|
1024 | * <p>
|
885 | * It is not necessary to use this method to ignore {@linkplain TagType#isServerTag() server tags} located inside normal tags,
|
1025 | * It is not necessary to use this method to ignore {@linkplain TagType#isServerTag() server tags} located inside normal tags, as the attributes parser automatically ignores any server tags.
|
886 | * as the attributes parser automatically ignores any server tags.
|
|
|
887 | * <p>
|
1026 | * <p>
|
888 | * It is not necessary to use this method to ignore non-server tags inside server tags, or the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements,
|
1027 | * It is not necessary to use this method to ignore non-server tags inside server tags, or the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements, as the parser does this automatically when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
|
889 | * as the parser does this automatically when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
|
|
|
890 | * <p>
|
1028 | * <p>
|
891 | * This leaves only very few scenarios where calling this method still provides a significant benefit.
|
1029 | * This leaves only very few scenarios where calling this method still provides a significant benefit.
|
892 | * <p>
|
1030 | * <p>
|
893 | * One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags.
|
1031 | * One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags. Here is an example using an XML-style JSP tag: <blockquote class="code"><code><a href="<i18n:resource path="/Portal"/>?BACK=TRUE">back</a></code></blockquote> The first double-quote of <code>"/Portal"</code> will be interpreted as the end quote for the <code>href</code> attribute, as there is no way for the parser to recognise the <code>il8n:resource</code> element
|
894 | * Here is an example using an XML-style JSP tag:
|
1032 | * as a server tag. Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice, but it is nevertheless valid JSP. The only way to ensure that this library is able to parse the normal tag surrounding it is to find these server tags first and call the <code>ignoreWhenParsing</code> method to ignore them before parsing the rest of the document.
|
895 | * <blockquote class="code"><code><a href="<i18n:resource path="/Portal"/>?BACK=TRUE">back</a></code></blockquote>
|
|
|
896 | * The first double-quote of <code>"/Portal"</code> will be interpreted as the end quote for the <code>href</code> attribute,
|
|
|
897 | * as there is no way for the parser to recognise the <code>il8n:resource</code> element as a server tag.
|
|
|
898 | * Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice,
|
|
|
899 | * but it is nevertheless valid JSP. The only way to ensure that this library is able to parse the normal tag surrounding it is to
|
|
|
900 | * find these server tags first and call the <code>ignoreWhenParsing</code> method to ignore them before parsing the rest of the document.
|
|
|
901 | * <p>
|
1033 | * <p>
|
902 | * It is important to understand the difference between ignoring the segment when parsing and removing the segment completely.
|
1034 | * It is important to understand the difference between ignoring the segment when parsing and removing the segment completely. Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of tools such as {@link TextExtractor} and {@link Renderer}.
|
903 | * Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of
|
|
|
904 | * tools such as {@link TextExtractor} and {@link Renderer}.
|
|
|
905 | * <p>
|
1035 | * <p>
|
906 | * To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
|
1036 | * To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment. Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())} and perform the desired operations on this new source object.
|
907 | * {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment.
|
|
|
908 | * Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
|
|
|
909 | * and perform the desired operations on this new source object.
|
|
|
910 | * <p>
|
1037 | * <p>
|
911 | * Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an <code>IllegalStateException</code>.
|
1038 | * Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an <code>IllegalStateException</code>.
|
912 | * <p>
|
1039 | * <p>
|
913 | * Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache},
|
1040 | * Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache}, and so will continue to be found by the <a href="Tag.html#TagSearchMethods">tag search methods</a>. If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache. Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
|
914 | * and so will continue to be found by the <a href="Tag.html#TagSearchMethods">tag search methods</a>.
|
|
|
915 | * If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache.
|
|
|
916 | * Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
|
|
|
917 | * <p>
|
1041 | * <p>
|
918 | * For best performance, this method should be called on all segments that need to be ignored without calling
|
1042 | * For best performance, this method should be called on all segments that need to be ignored without calling any of the <a href="Tag.html#TagSearchMethods">tag search methods</a> in between.
|
919 | * any of the <a href="Tag.html#TagSearchMethods">tag search methods</a> in between.
|
|
|
920 | *
|
1043 | *
|
921 | * @see Source#ignoreWhenParsing(Collection segments)
|
1044 | * @see Source#ignoreWhenParsing(Collection segments)
|
922 | */
|
1045 | */
|
923 | public void ignoreWhenParsing() {
|
1046 | public void ignoreWhenParsing()
|
|
|
1047 | {
|
924 | source.ignoreWhenParsing(begin,end);
|
1048 | source.ignoreWhenParsing(begin, end);
|
925 | }
|
1049 | }
|
926 |
|
1050 |
|
927 | /**
|
1051 | /**
|
928 | * Compares this <code>Segment</code> object to another object.
|
1052 | * Compares this <code>Segment</code> object to another object.
|
929 | * <p>
|
1053 | * <p>
|
930 | * If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown.
|
1054 | * If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown.
|
931 | * <p>
|
1055 | * <p>
|
932 | * A segment is considered to be before another segment if its begin position is earlier,
|
1056 | * A segment is considered to be before another segment if its begin position is earlier, or in the case that both segments begin at the same position, its end position is earlier.
|
933 | * or in the case that both segments begin at the same position, its end position is earlier.
|
|
|
934 | * <p>
|
1057 | * <p>
|
935 | * Segments that begin and end at the same position are considered equal for
|
1058 | * Segments that begin and end at the same position are considered equal for the purposes of this comparison, even if they relate to different source documents.
|
936 | * the purposes of this comparison, even if they relate to different source documents.
|
|
|
937 | * <p>
|
1059 | * <p>
|
938 | * Note: this class has a natural ordering that is inconsistent with equals.
|
1060 | * Note: this class has a natural ordering that is inconsistent with equals. This means that this method may return zero in some cases where calling the {@link #equals(Object)} method with the same argument returns <code>false</code>.
|
939 | * This means that this method may return zero in some cases where calling the
|
|
|
940 | * {@link #equals(Object)} method with the same argument returns <code>false</code>.
|
|
|
941 | *
|
1061 | *
|
942 | * @param segment the segment to be compared
|
1062 | * @param segment
|
|
|
1063 | * the segment to be compared
|
943 | * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
|
1064 | * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
|
944 | * @throws ClassCastException if the argument is not a <code>Segment</code>
|
1065 | * @throws ClassCastException
|
|
|
1066 | * if the argument is not a <code>Segment</code>
|
945 | */
|
1067 | */
|
946 | public int compareTo(final Segment segment) {
|
1068 | public int compareTo(final Segment segment)
|
|
|
1069 | {
|
947 | if (this==segment) return 0;
|
1070 | if (this == segment) return 0;
|
948 | if (begin<segment.begin) return -1;
|
1071 | if (begin < segment.begin) return -1;
|
949 | if (begin>segment.begin) return 1;
|
1072 | if (begin > segment.begin) return 1;
|
950 | if (end<segment.end) return -1;
|
1073 | if (end < segment.end) return -1;
|
951 | if (end>segment.end) return 1;
|
1074 | if (end > segment.end) return 1;
|
952 | return 0;
|
1075 | return 0;
|
953 | }
|
1076 | }
|
954 |
|
1077 |
|
955 | /**
|
1078 | /**
|
956 | * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}.
|
1079 | * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}.
|
|
|
1080 | *
|
957 | * @return <code>true</code> if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise <code>false</code>.
|
1081 | * @return <code>true</code> if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise <code>false</code>.
|
958 | */
|
1082 | */
|
959 | public final boolean isWhiteSpace() {
|
1083 | public final boolean isWhiteSpace()
|
|
|
1084 | {
|
960 | for (int i=begin; i<end; i++)
|
1085 | for (int i = begin; i < end; i++)
|
961 | if (!isWhiteSpace(source.charAt(i))) return false;
|
1086 | if (!isWhiteSpace(source.charAt(i))) return false;
|
962 | return true;
|
1087 | return true;
|
963 | }
|
1088 | }
|
964 |
|
1089 |
|
965 | /**
|
1090 | /**
|
966 | * Returns an indication of the maximum depth of nested elements within this segment.
|
1091 | * Returns an indication of the maximum depth of nested elements within this segment.
|
967 | * <p>
|
1092 | * <p>
|
968 | * A high return value can indicate that the segment contains a large number of incorrectly nested tags that could result in a <code>StackOverflowError</code>
|
1093 | * A high return value can indicate that the segment contains a large number of incorrectly nested tags that could result in a <code>StackOverflowError</code> if its content is parsed.
|
969 | * if its content is parsed.
|
|
|
970 | * <p>
|
1094 | * <p>
|
971 | * The usefulness of this method is debatable as a <code>StackOverflowError</code> is a recoverable error that can be easily caught.
|
1095 | * The usefulness of this method is debatable as a <code>StackOverflowError</code> is a recoverable error that can be easily caught. The use of this method to pre-detect and avoid a stack overflow may save some memory and processing resources in certain circumstances, but the cost of calling this method to check every segment or document will very often exceed any benefit.
|
972 | * The use of this method to pre-detect and avoid a stack overflow may save some memory and processing resources in certain circumstances, but the cost of calling
|
|
|
973 | * this method to check every segment or document will very often exceed any benefit.
|
|
|
974 | * <p>
|
1096 | * <p>
|
975 | * It is up to the application developer to determine what return value constitutes an unreasonable level of nesting given the stack space allocated to the application
|
1097 | * It is up to the application developer to determine what return value constitutes an unreasonable level of nesting given the stack space allocated to the application and other factors.
|
976 | * and other factors.
|
|
|
977 | * <p>
|
1098 | * <p>
|
978 | * Note that the return value is an approximation only and is usually greater than the actual maximum element depth that would be reported by calling the
|
1099 | * Note that the return value is an approximation only and is usually greater than the actual maximum element depth that would be reported by calling the {@link Element#getDepth()} method on the most nested element.
|
979 | * {@link Element#getDepth()} method on the most nested element.
|
|
|
980 | *
|
1100 | *
|
981 | * @return an indication of the maximum depth of nested elements within this segment.
|
1101 | * @return an indication of the maximum depth of nested elements within this segment.
|
982 | */
|
1102 | */
|
983 | public int getMaxDepthIndicator() {
|
1103 | public int getMaxDepthIndicator()
|
|
|
1104 | {
|
984 | int maxDepth=0;
|
1105 | int maxDepth = 0;
|
985 | int depth=0;
|
1106 | int depth = 0;
|
986 | for (Tag tag : getAllTags()) {
|
1107 | for (Tag tag : getAllTags())
|
|
|
1108 | {
|
987 | if (tag instanceof StartTag) {
|
1109 | if (tag instanceof StartTag)
|
|
|
1110 | {
|
988 | StartTag startTag=(StartTag)tag;
|
1111 | StartTag startTag = (StartTag) tag;
|
989 | if (startTag.getStartTagType().getCorrespondingEndTagType()==null) continue;
|
1112 | if (startTag.getStartTagType().getCorrespondingEndTagType() == null) continue;
|
990 | if (HTMLElements.getEndTagForbiddenElementNames().contains(startTag.getName())) continue;
|
1113 | if (HTMLElements.getEndTagForbiddenElementNames().contains(startTag.getName())) continue;
|
991 | if (startTag.isEmptyElementTag()) continue;
|
1114 | if (startTag.isEmptyElementTag()) continue;
|
992 | depth++;
|
1115 | depth++;
|
993 | if (depth>maxDepth) maxDepth++;
|
1116 | if (depth > maxDepth) maxDepth++;
|
|
|
1117 | }
|
994 | } else {
|
1118 | else
|
|
|
1119 | {
|
995 | depth--;
|
1120 | depth--;
|
996 | }
|
1121 | }
|
997 | }
|
1122 | }
|
998 | return maxDepth;
|
1123 | return maxDepth;
|
999 | }
|
1124 | }
|
1000 |
|
1125 |
|
1001 | /**
|
1126 | /**
|
1002 | * Indicates whether the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>.
|
1127 | * Indicates whether the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>.
|
1003 | * <p>
|
1128 | * <p>
|
1004 | * The <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">HTML 4.01 specification section 9.1</a>
|
1129 | * The <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">HTML 4.01 specification section 9.1</a> specifies the following white space characters:
|
1005 | * specifies the following white space characters:
|
|
|
1006 | * <ul>
|
1130 | * <ul>
|
1007 | * <li>space (U+0020)
|
1131 | * <li>space (U+0020)
|
1008 | * <li>tab (U+0009)
|
1132 | * <li>tab (U+0009)
|
1009 | * <li>form feed (U+000C)
|
1133 | * <li>form feed (U+000C)
|
1010 | * <li>line feed (U+000A)
|
1134 | * <li>line feed (U+000A)
|
1011 | * <li>carriage return (U+000D)
|
1135 | * <li>carriage return (U+000D)
|
1012 | * <li>zero-width space (U+200B)
|
1136 | * <li>zero-width space (U+200B)
|
1013 | * </ul>
|
1137 | * </ul>
|
1014 | * <p>
|
1138 | * <p>
|
1015 | * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not
|
1139 | * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not recognise them as white space and renders them as an unprintable character (empty square). Even zero-width spaces included using the numeric character reference <code>&#x200B;</code> are rendered this way.
|
1016 | * recognise them as white space and renders them as an unprintable character (empty square).
|
|
|
1017 | * Even zero-width spaces included using the numeric character reference <code>&#x200B;</code> are rendered this way.
|
|
|
1018 | *
|
1140 | *
|
|
|
1141 | * @param ch
|
1019 | * @param ch the character to test.
|
1142 | * the character to test.
|
1020 | * @return <code>true</code> if the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>, otherwise <code>false</code>.
|
1143 | * @return <code>true</code> if the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>, otherwise <code>false</code>.
|
1021 | */
|
1144 | */
|
1022 | public static final boolean isWhiteSpace(final char ch) {
|
1145 | public static final boolean isWhiteSpace(final char ch)
|
1023 | for (char whiteSpaceChar : WHITESPACE) if (ch==whiteSpaceChar) return true;
|
1146 | {
|
|
|
1147 | for (char whiteSpaceChar : WHITESPACE)
|
|
|
1148 | if (ch == whiteSpaceChar) return true;
|
1024 | return false;
|
1149 | return false;
|
1025 | }
|
1150 | }
|
1026 |
|
1151 |
|
1027 | /**
|
1152 | /**
|
1028 | * Returns a string representation of this object useful for debugging purposes.
|
1153 | * Returns a string representation of this object useful for debugging purposes.
|
|
|
1154 | *
|
1029 | * @return a string representation of this object useful for debugging purposes.
|
1155 | * @return a string representation of this object useful for debugging purposes.
|
1030 | */
|
1156 | */
|
1031 | public String getDebugInfo() {
|
1157 | public String getDebugInfo()
|
|
|
1158 | {
|
1032 | final StringBuilder sb=new StringBuilder(50);
|
1159 | final StringBuilder sb = new StringBuilder(50);
|
1033 | sb.append('(');
|
1160 | sb.append('(');
|
1034 | source.getRowColumnVector(begin).appendTo(sb);
|
1161 | source.getRowColumnVector(begin).appendTo(sb);
|
1035 | sb.append('-');
|
1162 | sb.append('-');
|
1036 | source.getRowColumnVector(end).appendTo(sb);
|
1163 | source.getRowColumnVector(end).appendTo(sb);
|
1037 | sb.append(')');
|
1164 | sb.append(')');
|
… | |
… | |
1039 | }
|
1166 | }
|
1040 |
|
1167 |
|
1041 | /**
|
1168 | /**
|
1042 | * Returns the character at the specified index.
|
1169 | * Returns the character at the specified index.
|
1043 | * <p>
|
1170 | * <p>
|
1044 | * This is logically equivalent to <code>toString().charAt(index)</code>
|
1171 | * This is logically equivalent to <code>toString().charAt(index)</code> for valid argument values <code>0 <= index < length()</code>.
|
1045 | * for valid argument values <code>0 <= index < length()</code>.
|
|
|
1046 | * <p>
|
1172 | * <p>
|
1047 | * However because this implementation works directly on the underlying document source string,
|
1173 | * However because this implementation works directly on the underlying document source string, it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown for an invalid argument value.
|
1048 | * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
|
|
|
1049 | * for an invalid argument value.
|
|
|
1050 | *
|
1174 | *
|
1051 | * @param index the index of the character.
|
1175 | * @param index
|
|
|
1176 | * the index of the character.
|
1052 | * @return the character at the specified index.
|
1177 | * @return the character at the specified index.
|
1053 | */
|
1178 | */
|
1054 | public char charAt(final int index) {
|
1179 | public char charAt(final int index)
|
|
|
1180 | {
|
1055 | return source.charAt(begin+index);
|
1181 | return source.charAt(begin + index);
|
1056 | }
|
1182 | }
|
1057 |
|
1183 |
|
1058 | /**
|
1184 | /**
|
1059 | * Returns a new character sequence that is a subsequence of this sequence.
|
1185 | * Returns a new character sequence that is a subsequence of this sequence.
|
1060 | * <p>
|
1186 | * <p>
|
1061 | * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code>
|
1187 | * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code> for valid values of <code>beginIndex</code> and <code>endIndex</code>.
|
1062 | * for valid values of <code>beginIndex</code> and <code>endIndex</code>.
|
|
|
1063 | * <p>
|
1188 | * <p>
|
1064 | * However because this implementation works directly on the underlying document source text,
|
1189 | * However because this implementation works directly on the underlying document source text, it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
|
1065 | * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
|
|
|
1066 | * for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
|
|
|
1067 | *
|
1190 | *
|
1068 | * @param beginIndex the begin index, inclusive.
|
1191 | * @param beginIndex
|
1069 | * @param endIndex the end index, exclusive.
|
1192 | * the begin index, inclusive.
|
|
|
1193 | * @param endIndex
|
|
|
1194 | * the end index, exclusive.
|
1070 | * @return a new character sequence that is a subsequence of this sequence.
|
1195 | * @return a new character sequence that is a subsequence of this sequence.
|
1071 | */
|
1196 | */
|
1072 | public CharSequence subSequence(final int beginIndex, final int endIndex) {
|
1197 | public CharSequence subSequence(final int beginIndex, final int endIndex)
|
|
|
1198 | {
|
1073 | return source.subSequence(begin+beginIndex,begin+endIndex);
|
1199 | return source.subSequence(begin + beginIndex, begin + endIndex);
|
1074 | }
|
1200 | }
|
1075 |
|
1201 |
|
1076 | /**
|
1202 | /**
|
1077 | * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
|
1203 | * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
|
1078 | * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
|
1204 | * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
|
1079 | */
|
1205 | */
|
1080 | static final StringBuilder appendCollapseWhiteSpace(final StringBuilder sb, final CharSequence text) {
|
1206 | static final StringBuilder appendCollapseWhiteSpace(final StringBuilder sb, final CharSequence text)
|
|
|
1207 | {
|
1081 | final int textLength=text.length();
|
1208 | final int textLength = text.length();
|
1082 | int i=0;
|
1209 | int i = 0;
|
1083 | boolean lastWasWhiteSpace=false;
|
1210 | boolean lastWasWhiteSpace = false;
|
1084 | while (true) {
|
1211 | while (true)
|
|
|
1212 | {
|
1085 | if (i>=textLength) return sb;
|
1213 | if (i >= textLength) return sb;
|
1086 | if (!isWhiteSpace(text.charAt(i))) break;
|
1214 | if (!isWhiteSpace(text.charAt(i))) break;
|
1087 | i++;
|
1215 | i++;
|
1088 | }
|
1216 | }
|
1089 | do {
|
1217 | do
|
|
|
1218 | {
|
1090 | final char ch=text.charAt(i++);
|
1219 | final char ch = text.charAt(i++);
|
1091 | if (isWhiteSpace(ch)) {
|
1220 | if (isWhiteSpace(ch))
|
|
|
1221 | {
|
1092 | lastWasWhiteSpace=true;
|
1222 | lastWasWhiteSpace = true;
|
|
|
1223 | }
|
1093 | } else {
|
1224 | else
|
|
|
1225 | {
|
1094 | if (lastWasWhiteSpace) {
|
1226 | if (lastWasWhiteSpace)
|
|
|
1227 | {
|
1095 | sb.append(' ');
|
1228 | sb.append(' ');
|
1096 | lastWasWhiteSpace=false;
|
1229 | lastWasWhiteSpace = false;
|
1097 | }
|
1230 | }
|
1098 | sb.append(ch);
|
1231 | sb.append(ch);
|
1099 | }
|
1232 | }
|
|
|
1233 | }
|
1100 | } while (i<textLength);
|
1234 | while (i < textLength);
|
1101 | return sb;
|
1235 | return sb;
|
1102 | }
|
1236 | }
|
1103 |
|
1237 |
|
1104 | static final Pattern getClassPattern(final String className) {
|
1238 | static final Pattern getClassPattern(final String className)
|
|
|
1239 | {
|
1105 | return Pattern.compile(".*(\\s|^)"+className+"(\\s|$).*",Pattern.DOTALL);
|
1240 | return Pattern.compile(".*(\\s|^)" + className + "(\\s|$).*", Pattern.DOTALL);
|
1106 | }
|
1241 | }
|
1107 |
|
1242 |
|
1108 | private List<Element> getAllElements(final List<StartTag> startTags) {
|
1243 | private List<Element> getAllElements(final List<StartTag> startTags)
|
|
|
1244 | {
|
1109 | if (startTags.isEmpty()) return Collections.emptyList();
|
1245 | if (startTags.isEmpty()) return Collections.emptyList();
|
1110 | final ArrayList<Element> elements=new ArrayList<Element>(startTags.size());
|
1246 | final ArrayList<Element> elements = new ArrayList<Element>(startTags.size());
|
1111 | for (StartTag startTag : startTags) {
|
1247 | for (StartTag startTag : startTags)
|
|
|
1248 | {
|
1112 | final Element element=startTag.getElement();
|
1249 | final Element element = startTag.getElement();
|
1113 | if (element.end<=end) elements.add(element);
|
1250 | if (element.end <= end) elements.add(element);
|
1114 | }
|
1251 | }
|
1115 | return elements;
|
1252 | return elements;
|
1116 | }
|
1253 | }
|
1117 |
|
1254 |
|
1118 | private StartTag checkEnclosure(final StartTag startTag) {
|
1255 | private StartTag checkEnclosure(final StartTag startTag)
|
|
|
1256 | {
|
1119 | if (startTag==null || startTag.end>end) return null;
|
1257 | if (startTag == null || startTag.end > end) return null;
|
1120 | return startTag;
|
1258 | return startTag;
|
1121 | }
|
1259 | }
|
1122 |
|
1260 |
|
1123 | private Tag checkTagEnclosure(final Tag tag) {
|
1261 | private Tag checkTagEnclosure(final Tag tag)
|
|
|
1262 | {
|
1124 | if (tag==null || tag.end>end) return null;
|
1263 | if (tag == null || tag.end > end) return null;
|
1125 | return tag;
|
1264 | return tag;
|
1126 | }
|
1265 | }
|
1127 |
|
1266 |
|
1128 | private CharacterReference getNextCharacterReference(final int pos) {
|
1267 | private CharacterReference getNextCharacterReference(final int pos)
|
|
|
1268 | {
|
1129 | final CharacterReference characterReference=source.getNextCharacterReference(pos);
|
1269 | final CharacterReference characterReference = source.getNextCharacterReference(pos);
|
1130 | if (characterReference==null || characterReference.end>end) return null;
|
1270 | if (characterReference == null || characterReference.end > end) return null;
|
1131 | return characterReference;
|
1271 | return characterReference;
|
1132 | }
|
1272 | }
|
1133 | }
|
1273 | }
|
1134 |
|
|
|