// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.2
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
20 |
|
|
|
21 |
|
|
package net.htmlparser.jericho;
|
22 |
|
|
|
23 |
|
|
import java.util.Iterator;
|
24 |
|
|
import java.util.List;
|
25 |
|
|
import java.util.Collections;
|
26 |
|
|
import java.util.ArrayList;
|
27 |
|
|
import java.util.regex.Pattern;
|
28 |
|
|
|
29 |
|
|
/**
 * Represents a segment of a {@link Source} document.
 * <p>
 * Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class.
 * <p>
 * The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
 */
|
36 |
|
|
public class Segment implements Comparable<Segment>, CharSequence {
|
37 |
|
|
final int begin;
|
38 |
|
|
final int end;
|
39 |
|
|
final Source source;
|
40 |
|
|
|
41 |
|
|
private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; // see comments in isWhiteSpace(char) method
|
42 |
|
|
|
43 |
|
|
/**
 * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
 *
 * @param source  the {@link Source} document, must not be <code>null</code>.
 * @param begin  the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive.
 * @param end  the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive.
 * @throws IllegalArgumentException if <code>begin</code> or <code>end</code> is <code>-1</code>,
 *  if <code>begin</code> is greater than <code>end</code>, or if <code>source</code> is <code>null</code>.
 */
public Segment(final Source source, final int begin, final int end) {
    // Validate everything before touching any field. The position check stays first so that
    // the precedence between the two failure modes is the same as before.
    // NOTE(review): only -1 is rejected, presumably because it is the "not found" marker of search methods - confirm.
    if (begin==-1 || end==-1 || begin>end) throw new IllegalArgumentException("invalid segment span: begin="+begin+", end="+end);
    if (source==null) throw new IllegalArgumentException("source argument must not be null");
    this.begin=begin;
    this.end=end;
    this.source=source;
}
|
56 |
|
|
|
57 |
|
|
// Creates the segment that spans a whole document of the given length, using itself as the source.
// Only called from Source constructor, hence the cast of this instance to Source.
Segment(final int length) {
    this.begin=0;
    this.end=length;
    this.source=(Source)this;
}
|
63 |
|
|
|
64 |
|
|
// Creates a zero-length segment at position 0 without a source document.
// Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED)
Segment() {
    this(0,0);
}
|
68 |
|
|
|
69 |
|
|
// Creates a segment with the given positions but no source document; the positions are not validated.
// Only used for creating dummy flag instances of this type (see Segment() constructor and StreamedSource.START_SEGMENT)
Segment(final int begin, final int end) {
    this.source=null;
    this.begin=begin;
    this.end=end;
}
|
75 |
|
|
|
76 |
|
|
/**
|
77 |
|
|
* Returns the {@link Source} document containing this segment.
|
78 |
|
|
* <p>
|
79 |
|
|
* If a {@link StreamedSource} is in use, this method throws an <code>UnsupportedOperationException</code>.
|
80 |
|
|
*
|
81 |
|
|
* @return the {@link Source} document containing this segment.
|
82 |
|
|
*/
|
83 |
|
|
public final Source getSource() {
|
84 |
|
|
if (source.isStreamed()) throw new UnsupportedOperationException("Source object is not available when using StreamedSource");
|
85 |
|
|
return source;
|
86 |
|
|
}
|
87 |
|
|
|
88 |
|
|
/**
|
89 |
|
|
* Returns the character position in the {@link Source} document at which this segment begins, inclusive.
|
90 |
|
|
* <p>
|
91 |
|
|
* Use the {@link Source#getRowColumnVector(int pos)} method to determine the row and column numbers corresponding to this character position.
|
92 |
|
|
*
|
93 |
|
|
* @return the character position in the {@link Source} document at which this segment begins, inclusive.
|
94 |
|
|
*/
|
95 |
|
|
public final int getBegin() {
|
96 |
|
|
return begin;
|
97 |
|
|
}
|
98 |
|
|
|
99 |
|
|
/**
|
100 |
|
|
* Returns the character position in the {@link Source} document immediately after the end of this segment.
|
101 |
|
|
* <p>
|
102 |
|
|
* The character at the position specified by this property is <b>not</b> included in the segment.
|
103 |
|
|
*
|
104 |
|
|
* @return the character position in the {@link Source} document immediately after the end of this segment.
|
105 |
|
|
* @see #getBegin()
|
106 |
|
|
*/
|
107 |
|
|
public final int getEnd() {
|
108 |
|
|
return end;
|
109 |
|
|
}
|
110 |
|
|
|
111 |
|
|
/**
|
112 |
|
|
* Compares the specified object with this <code>Segment</code> for equality.
|
113 |
|
|
* <p>
|
114 |
|
|
* Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>,
|
115 |
|
|
* and both segments have the same {@link Source}, and the same begin and end positions.
|
116 |
|
|
* @param object the object to be compared for equality with this <code>Segment</code>.
|
117 |
|
|
* @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
|
118 |
|
|
*/
|
119 |
|
|
public final boolean equals(final Object object) {
|
120 |
|
|
if (this==object) return true;
|
121 |
|
|
if (object==null || !(object instanceof Segment)) return false;
|
122 |
|
|
final Segment segment=(Segment)object;
|
123 |
|
|
return segment.begin==begin && segment.end==end && segment.source==source;
|
124 |
|
|
}
|
125 |
|
|
|
126 |
|
|
/**
|
127 |
|
|
* Returns a hash code value for the segment.
|
128 |
|
|
* <p>
|
129 |
|
|
* The current implementation returns the sum of the begin and end positions, although this is not
|
130 |
|
|
* guaranteed in future versions.
|
131 |
|
|
*
|
132 |
|
|
* @return a hash code value for the segment.
|
133 |
|
|
*/
|
134 |
|
|
public int hashCode() {
|
135 |
|
|
return begin+end;
|
136 |
|
|
}
|
137 |
|
|
|
138 |
|
|
/**
|
139 |
|
|
* Returns the length of the segment.
|
140 |
|
|
* This is defined as the number of characters between the begin and end positions.
|
141 |
|
|
* @return the length of the segment.
|
142 |
|
|
*/
|
143 |
|
|
public int length() {
|
144 |
|
|
return end-begin;
|
145 |
|
|
}
|
146 |
|
|
|
147 |
|
|
/**
|
148 |
|
|
* Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
|
149 |
|
|
* <p>
|
150 |
|
|
* This is the case if {@link #getBegin()}<code><=segment.</code>{@link #getBegin()}<code> && </code>{@link #getEnd()}<code>>=segment.</code>{@link #getEnd()}.
|
151 |
|
|
* <p>
|
152 |
|
|
* Note that a segment encloses itself.
|
153 |
|
|
*
|
154 |
|
|
* @param segment the segment to be tested for being enclosed by this segment.
|
155 |
|
|
* @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
|
156 |
|
|
*/
|
157 |
|
|
public final boolean encloses(final Segment segment) {
|
158 |
|
|
return begin<=segment.begin && end>=segment.end;
|
159 |
|
|
}
|
160 |
|
|
|
161 |
|
|
/**
|
162 |
|
|
* Indicates whether this segment encloses the specified character position in the source document.
|
163 |
|
|
* <p>
|
164 |
|
|
* This is the case if {@link #getBegin()}<code> <= pos < </code>{@link #getEnd()}.
|
165 |
|
|
*
|
166 |
|
|
* @param pos the position in the {@link Source} document.
|
167 |
|
|
* @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
|
168 |
|
|
*/
|
169 |
|
|
public final boolean encloses(final int pos) {
|
170 |
|
|
return begin<=pos && pos<end;
|
171 |
|
|
}
|
172 |
|
|
|
173 |
|
|
/**
|
174 |
|
|
* Returns the source text of this segment as a <code>String</code>.
|
175 |
|
|
* <p>
|
176 |
|
|
* The returned <code>String</code> is newly created with every call to this method, unless this
|
177 |
|
|
* segment is itself an instance of {@link Source}.
|
178 |
|
|
*
|
179 |
|
|
* @return the source text of this segment as a <code>String</code>.
|
180 |
|
|
*/
|
181 |
|
|
public String toString() {
|
182 |
|
|
return source.subSequence(begin,end).toString();
|
183 |
|
|
}
|
184 |
|
|
|
185 |
|
|
/**
|
186 |
|
|
* Performs a simple rendering of the HTML markup in this segment into text.
|
187 |
|
|
* <p>
|
188 |
|
|
* The output can be configured by setting any number of properties on the returned {@link Renderer} instance before
|
189 |
|
|
* {@linkplain Renderer#writeTo(Writer) obtaining its output}.
|
190 |
|
|
*
|
191 |
|
|
* @return an instance of {@link Renderer} based on this segment.
|
192 |
|
|
* @see #getTextExtractor()
|
193 |
|
|
*/
|
194 |
|
|
public Renderer getRenderer() {
|
195 |
|
|
return new Renderer(this);
|
196 |
|
|
}
|
197 |
|
|
|
198 |
|
|
/**
|
199 |
|
|
* Extracts the textual content from the HTML markup of this segment.
|
200 |
|
|
* <p>
|
201 |
|
|
* The output can be configured by setting properties on the returned {@link TextExtractor} instance before
|
202 |
|
|
* {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
|
203 |
|
|
* <p>
|
204 |
|
|
* @return an instance of {@link TextExtractor} based on this segment.
|
205 |
|
|
* @see #getRenderer()
|
206 |
|
|
*/
|
207 |
|
|
public TextExtractor getTextExtractor() {
|
208 |
|
|
return new TextExtractor(this);
|
209 |
|
|
}
|
210 |
|
|
|
211 |
|
|
/**
|
212 |
|
|
* Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
|
213 |
|
|
* <p>
|
214 |
|
|
* See the {@link Source#iterator()} method for a detailed description.
|
215 |
|
|
* <p>
|
216 |
|
|
* <dl>
|
217 |
|
|
* <dt>Example:</dt>
|
218 |
|
|
* <dd>
|
219 |
|
|
* <p>
|
220 |
|
|
* The following code demonstrates the typical usage of this method to make an exact copy of this segment to <code>writer</code> (assuming no server tags are present):
|
221 |
|
|
* </p>
|
222 |
|
|
* <pre>
|
223 |
|
|
* for (Iterator<Segment> nodeIterator=segment.getNoteIterator(); nodeIterator.hasNext();) {
|
224 |
|
|
* Segment nodeSegment=nodeIterator.next();
|
225 |
|
|
* if (nodeSegment instanceof Tag) {
|
226 |
|
|
* Tag tag=(Tag)nodeSegment;
|
227 |
|
|
* // HANDLE TAG
|
228 |
|
|
* // Uncomment the following line to ensure each tag is valid XML:
|
229 |
|
|
* // writer.write(tag.tidy()); continue;
|
230 |
|
|
* } else if (nodeSegment instanceof CharacterReference) {
|
231 |
|
|
* CharacterReference characterReference=(CharacterReference)nodeSegment;
|
232 |
|
|
* // HANDLE CHARACTER REFERENCE
|
233 |
|
|
* // Uncomment the following line to decode all character references instead of copying them verbatim:
|
234 |
|
|
* // characterReference.appendCharTo(writer); continue;
|
235 |
|
|
* } else {
|
236 |
|
|
* // HANDLE PLAIN TEXT
|
237 |
|
|
* }
|
238 |
|
|
* // unless specific handling has prevented getting to here, simply output the segment as is:
|
239 |
|
|
* writer.write(nodeSegment.toString());
|
240 |
|
|
* }</pre>
|
241 |
|
|
* </dd>
|
242 |
|
|
* </dl>
|
243 |
|
|
* @return an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
|
244 |
|
|
*/
|
245 |
|
|
public Iterator<Segment> getNodeIterator() {
|
246 |
|
|
return new NodeIterator(this);
|
247 |
|
|
}
|
248 |
|
|
|
249 |
|
|
/**
|
250 |
|
|
* Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
251 |
|
|
* <p>
|
252 |
|
|
* The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
|
253 |
|
|
* if this method is to be used on a large proportion of the source.
|
254 |
|
|
* It is called automatically if this method is called on the {@link Source} object itself.
|
255 |
|
|
* <p>
|
256 |
|
|
* See the {@link Tag} class documentation for more details about the behaviour of this method.
|
257 |
|
|
*
|
258 |
|
|
* @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
259 |
|
|
*/
|
260 |
|
|
public List<Tag> getAllTags() {
|
261 |
|
|
return getAllTags(null);
|
262 |
|
|
}
|
263 |
|
|
|
264 |
|
|
/**
|
265 |
|
|
* Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
266 |
|
|
* <p>
|
267 |
|
|
* See the {@link Tag} class documentation for more details about the behaviour of this method.
|
268 |
|
|
* <p>
|
269 |
|
|
* Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #getAllTags()}.
|
270 |
|
|
*
|
271 |
|
|
* @param tagType the {@linkplain TagType type} of tags to get.
|
272 |
|
|
* @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
273 |
|
|
* @see #getAllStartTags(StartTagType)
|
274 |
|
|
*/
|
275 |
|
|
public List<Tag> getAllTags(final TagType tagType) {
|
276 |
|
|
Tag tag=checkTagEnclosure(Tag.getNextTag(source,begin,tagType));
|
277 |
|
|
if (tag==null) return Collections.emptyList();
|
278 |
|
|
final ArrayList<Tag> list=new ArrayList<Tag>();
|
279 |
|
|
do {
|
280 |
|
|
list.add(tag);
|
281 |
|
|
tag=checkTagEnclosure(tag.getNextTag(tagType));
|
282 |
|
|
} while (tag!=null);
|
283 |
|
|
return list;
|
284 |
|
|
}
|
285 |
|
|
|
286 |
|
|
/**
|
287 |
|
|
* Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
288 |
|
|
* <p>
|
289 |
|
|
* The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
|
290 |
|
|
* if this method is to be used on a large proportion of the source.
|
291 |
|
|
* It is called automatically if this method is called on the {@link Source} object itself.
|
292 |
|
|
* <p>
|
293 |
|
|
* See the {@link Tag} class documentation for more details about the behaviour of this method.
|
294 |
|
|
*
|
295 |
|
|
* @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
296 |
|
|
*/
|
297 |
|
|
public List<StartTag> getAllStartTags() {
|
298 |
|
|
StartTag startTag=checkEnclosure(StartTag.getNext(source,begin));
|
299 |
|
|
if (startTag==null) return Collections.emptyList();
|
300 |
|
|
final ArrayList<StartTag> list=new ArrayList<StartTag>();
|
301 |
|
|
do {
|
302 |
|
|
list.add(startTag);
|
303 |
|
|
startTag=checkEnclosure(startTag.getNextStartTag());
|
304 |
|
|
} while (startTag!=null);
|
305 |
|
|
return list;
|
306 |
|
|
}
|
307 |
|
|
|
308 |
|
|
/**
|
309 |
|
|
* Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
310 |
|
|
* <p>
|
311 |
|
|
* See the {@link Tag} class documentation for more details about the behaviour of this method.
|
312 |
|
|
* <p>
|
313 |
|
|
* Specifying a <code>null</code> argument to the <code>startTagType</code> parameter is equivalent to {@link #getAllStartTags()}.
|
314 |
|
|
*
|
315 |
|
|
* @param startTagType the {@linkplain StartTagType type} of tags to get.
|
316 |
|
|
* @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
317 |
|
|
*/
|
318 |
|
|
public List<StartTag> getAllStartTags(final StartTagType startTagType) {
|
319 |
|
|
if (startTagType==null) return getAllStartTags();
|
320 |
|
|
StartTag startTag=(StartTag)checkTagEnclosure(Tag.getNextTag(source,begin,startTagType));
|
321 |
|
|
if (startTag==null) return Collections.emptyList();
|
322 |
|
|
final ArrayList<StartTag> list=new ArrayList<StartTag>();
|
323 |
|
|
do {
|
324 |
|
|
list.add(startTag);
|
325 |
|
|
startTag=(StartTag)checkTagEnclosure(startTag.getNextTag(startTagType));
|
326 |
|
|
} while (startTag!=null);
|
327 |
|
|
return list;
|
328 |
|
|
}
|
329 |
|
|
|
330 |
|
|
/**
|
331 |
|
|
* Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
332 |
|
|
* <p>
|
333 |
|
|
* See the {@link Tag} class documentation for more details about the behaviour of this method.
|
334 |
|
|
* <p>
|
335 |
|
|
* Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllStartTags()}, which may include non-{@linkplain StartTagType#NORMAL normal} start tags.
|
336 |
|
|
* <p>
|
337 |
|
|
* This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
|
338 |
|
|
*
|
339 |
|
|
* @param name the {@linkplain StartTag#getName() name} of the start tags to get.
|
340 |
|
|
* @return a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
341 |
|
|
*/
|
342 |
|
|
public List<StartTag> getAllStartTags(String name) {
|
343 |
|
|
if (name==null) return getAllStartTags();
|
344 |
|
|
final boolean isXMLTagName=Tag.isXMLName(name);
|
345 |
|
|
StartTag startTag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName));
|
346 |
|
|
if (startTag==null) return Collections.emptyList();
|
347 |
|
|
final ArrayList<StartTag> list=new ArrayList<StartTag>();
|
348 |
|
|
do {
|
349 |
|
|
list.add(startTag);
|
350 |
|
|
startTag=checkEnclosure(StartTag.getNext(source,startTag.begin+1,name,StartTagType.NORMAL,isXMLTagName));
|
351 |
|
|
} while (startTag!=null);
|
352 |
|
|
return list;
|
353 |
|
|
}
|
354 |
|
|
|
355 |
|
|
/**
|
356 |
|
|
* Returns a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
357 |
|
|
* <p>
|
358 |
|
|
* See the {@link Tag} class documentation for more details about the behaviour of this method.
|
359 |
|
|
*
|
360 |
|
|
* @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
361 |
|
|
* @param value the value of the specified attribute to search for, must not be <code>null</code>.
|
362 |
|
|
* @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
|
363 |
|
|
* @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
364 |
|
|
* @see #getAllStartTags(String attributeName, Pattern valueRegexPattern)
|
365 |
|
|
*/
|
366 |
|
|
public List<StartTag> getAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive) {
|
367 |
|
|
StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
|
368 |
|
|
if (startTag==null) return Collections.emptyList();
|
369 |
|
|
final ArrayList<StartTag> list=new ArrayList<StartTag>();
|
370 |
|
|
do {
|
371 |
|
|
list.add(startTag);
|
372 |
|
|
startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive));
|
373 |
|
|
} while (startTag!=null);
|
374 |
|
|
return list;
|
375 |
|
|
}
|
376 |
|
|
|
377 |
|
|
/**
|
378 |
|
|
* Returns a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
379 |
|
|
* <p>
|
380 |
|
|
* Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only,
|
381 |
|
|
* without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
|
382 |
|
|
* <p>
|
383 |
|
|
* See the {@link Tag} class documentation for more details about the behaviour of this method.
|
384 |
|
|
*
|
385 |
|
|
* @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
386 |
|
|
* @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
|
387 |
|
|
* @return a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
388 |
|
|
* @see #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)
|
389 |
|
|
*/
|
390 |
|
|
public List<StartTag> getAllStartTags(final String attributeName, final Pattern valueRegexPattern) {
|
391 |
|
|
StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
|
392 |
|
|
if (startTag==null) return Collections.emptyList();
|
393 |
|
|
final ArrayList<StartTag> list=new ArrayList<StartTag>();
|
394 |
|
|
do {
|
395 |
|
|
list.add(startTag);
|
396 |
|
|
startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,valueRegexPattern));
|
397 |
|
|
} while (startTag!=null);
|
398 |
|
|
return list;
|
399 |
|
|
}
|
400 |
|
|
|
401 |
|
|
/**
|
402 |
|
|
* Returns a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
403 |
|
|
* <p>
|
404 |
|
|
* This matches start tags with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple
|
405 |
|
|
* class names separated by white space in the attribute value.
|
406 |
|
|
* <p>
|
407 |
|
|
* See the {@link Tag} class documentation for more details about the behaviour of this method.
|
408 |
|
|
*
|
409 |
|
|
* @param className the class name (case sensitive) to search for, must not be <code>null</code>.
|
410 |
|
|
* @return a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
411 |
|
|
*/
|
412 |
|
|
public List<StartTag> getAllStartTagsByClass(final String className) {
|
413 |
|
|
return getAllStartTags("class",getClassPattern(className));
|
414 |
|
|
}
|
415 |
|
|
|
416 |
|
|
/**
|
417 |
|
|
* Returns a list of the immediate children of this segment in the document element hierarchy.
|
418 |
|
|
* <p>
|
419 |
|
|
* The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
|
420 |
|
|
* <p>
|
421 |
|
|
* An element found at the start of this segment is included in the list.
|
422 |
|
|
* Note however that if this segment <i>is</i> an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead,
|
423 |
|
|
* which only returns the children of the element.
|
424 |
|
|
* <p>
|
425 |
|
|
* Calling <code>getChildElements()</code> on an <code>Element</code> is much more efficient than calling it on a <code>Segment</code>.
|
426 |
|
|
* <p>
|
427 |
|
|
* The objects in the list are all of type {@link Element}.
|
428 |
|
|
* <p>
|
429 |
|
|
* The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
|
430 |
|
|
* if this method is to be used on a large proportion of the source.
|
431 |
|
|
* It is called automatically if this method is called on the {@link Source} object itself.
|
432 |
|
|
* <p>
|
433 |
|
|
* See the {@link Source#getChildElements()} method for more details.
|
434 |
|
|
*
|
435 |
|
|
* @return the a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>.
|
436 |
|
|
* @see Element#getParentElement()
|
437 |
|
|
*/
|
438 |
|
|
public List<Element> getChildElements() {
|
439 |
|
|
if (length()==0) return Collections.emptyList();
|
440 |
|
|
List<Element> childElements=new ArrayList<Element>();
|
441 |
|
|
int pos=begin;
|
442 |
|
|
while (true) {
|
443 |
|
|
final StartTag childStartTag=source.getNextStartTag(pos);
|
444 |
|
|
if (childStartTag==null || childStartTag.begin>=end) break;
|
445 |
|
|
if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag()) {
|
446 |
|
|
pos=childStartTag.end;
|
447 |
|
|
continue;
|
448 |
|
|
}
|
449 |
|
|
final Element childElement=childStartTag.getElement();
|
450 |
|
|
childElements.add(childElement);
|
451 |
|
|
childElement.getChildElements();
|
452 |
|
|
pos=childElement.end;
|
453 |
|
|
}
|
454 |
|
|
return childElements;
|
455 |
|
|
}
|
456 |
|
|
|
457 |
|
|
/**
|
458 |
|
|
* Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
459 |
|
|
* <p>
|
460 |
|
|
* The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
|
461 |
|
|
* if this method is to be used on a large proportion of the source.
|
462 |
|
|
* It is called automatically if this method is called on the {@link Source} object itself.
|
463 |
|
|
* <p>
|
464 |
|
|
* The elements returned correspond exactly with the start tags returned in the {@link #getAllStartTags()} method.
|
465 |
|
|
* <p>
|
466 |
|
|
* If this segment is itself an {@link Element}, the result includes this element in the list.
|
467 |
|
|
*
|
468 |
|
|
* @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
469 |
|
|
*/
|
470 |
|
|
public List<Element> getAllElements() {
|
471 |
|
|
return getAllElements(getAllStartTags());
|
472 |
|
|
}
|
473 |
|
|
|
474 |
|
|
/**
|
475 |
|
|
* Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
476 |
|
|
* <p>
|
477 |
|
|
* The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String name)} method,
|
478 |
|
|
* except that elements which are not entirely enclosed by this segment are excluded.
|
479 |
|
|
* <p>
|
480 |
|
|
* Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllElements()}, which may include elements of non-{@linkplain StartTagType#NORMAL normal} tags.
|
481 |
|
|
* <p>
|
482 |
|
|
* This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
|
483 |
|
|
* <p>
|
484 |
|
|
* If this segment is itself an {@link Element} with the specified name, the result includes this element in the list.
|
485 |
|
|
*
|
486 |
|
|
* @param name the {@linkplain Element#getName() name} of the elements to get.
|
487 |
|
|
* @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
488 |
|
|
*/
|
489 |
|
|
public List<Element> getAllElements(String name) {
|
490 |
|
|
return getAllElements(getAllStartTags(name));
|
491 |
|
|
}
|
492 |
|
|
|
493 |
|
|
/**
|
494 |
|
|
* Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
495 |
|
|
* <p>
|
496 |
|
|
* The elements returned correspond with the start tags returned in the {@link #getAllTags(TagType)} method,
|
497 |
|
|
* except that elements which are not entirely enclosed by this segment are excluded.
|
498 |
|
|
* <p>
|
499 |
|
|
* If this segment is itself an {@link Element} with the specified type, the result includes this element in the list.
|
500 |
|
|
*
|
501 |
|
|
* @param startTagType the {@linkplain StartTagType type} of start tags to get, must not be <code>null</code>.
|
502 |
|
|
* @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
503 |
|
|
*/
|
504 |
|
|
public List<Element> getAllElements(final StartTagType startTagType) {
|
505 |
|
|
if (startTagType==null) throw new IllegalArgumentException("startTagType argument must not be null");
|
506 |
|
|
return getAllElements(getAllStartTags(startTagType));
|
507 |
|
|
}
|
508 |
|
|
|
509 |
|
|
/**
|
510 |
|
|
* Returns a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
511 |
|
|
* <p>
|
512 |
|
|
* The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method,
|
513 |
|
|
* except that elements which are not entirely enclosed by this segment are excluded.
|
514 |
|
|
* <p>
|
515 |
|
|
* If this segment is itself an {@link Element} with the specified name/value pair, the result includes this element in the list.
|
516 |
|
|
*
|
517 |
|
|
* @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
518 |
|
|
* @param value the value of the specified attribute to search for, must not be <code>null</code>.
|
519 |
|
|
* @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
|
520 |
|
|
* @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
521 |
|
|
* @see #getAllElements(String attributeName, Pattern valueRegexPattern)
|
522 |
|
|
*/
|
523 |
|
|
public List<Element> getAllElements(final String attributeName, final String value, final boolean valueCaseSensitive) {
|
524 |
|
|
return getAllElements(getAllStartTags(attributeName,value,valueCaseSensitive));
|
525 |
|
|
}
|
526 |
|
|
|
527 |
|
|
/**
|
528 |
|
|
* Returns a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
529 |
|
|
* <p>
|
530 |
|
|
* The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, Pattern valueRegexPattern)} method,
|
531 |
|
|
* except that elements which are not entirely enclosed by this segment are excluded.
|
532 |
|
|
* <p>
|
533 |
|
|
* Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only,
|
534 |
|
|
* without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
|
535 |
|
|
* <p>
|
536 |
|
|
* If this segment is itself an {@link Element} with the specified attribute name and value pattern, the result includes this element in the list.
|
537 |
|
|
*
|
538 |
|
|
* @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
539 |
|
|
* @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
|
540 |
|
|
* @return a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
541 |
|
|
* @see #getAllElements(String attributeName, String value, boolean valueCaseSensitive)
|
542 |
|
|
*/
|
543 |
|
|
public List<Element> getAllElements(final String attributeName, final Pattern valueRegexPattern) {
|
544 |
|
|
return getAllElements(getAllStartTags(attributeName,valueRegexPattern));
|
545 |
|
|
}
|
546 |
|
|
|
547 |
|
|
/**
|
548 |
|
|
* Returns a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
549 |
|
|
* <p>
|
550 |
|
|
* This matches elements with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple
|
551 |
|
|
* class names separated by white space in the attribute value.
|
552 |
|
|
* <p>
|
553 |
|
|
* The elements returned correspond with the start tags returned in the {@link #getAllStartTagsByClass(String className)} method,
|
554 |
|
|
* except that elements which are not entirely enclosed by this segment are excluded.
|
555 |
|
|
* <p>
|
556 |
|
|
* If this segment is itself an {@link Element} with the specified class, the result includes this element in the list.
|
557 |
|
|
*
|
558 |
|
|
* @param className the class name (case sensitive) to search for, must not be <code>null</code>.
|
559 |
|
|
* @return a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
560 |
|
|
*/
|
561 |
|
|
public List<Element> getAllElementsByClass(final String className) {
|
562 |
|
|
return getAllElements(getAllStartTagsByClass(className));
|
563 |
|
|
}
|
564 |
|
|
|
565 |
|
|
/**
|
566 |
|
|
* Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
567 |
|
|
* @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
568 |
|
|
*/
|
569 |
|
|
public List<CharacterReference> getAllCharacterReferences() {
|
570 |
|
|
CharacterReference characterReference=getNextCharacterReference(begin);
|
571 |
|
|
if (characterReference==null) return Collections.emptyList();
|
572 |
|
|
final ArrayList<CharacterReference> list=new ArrayList<CharacterReference>();
|
573 |
|
|
do {
|
574 |
|
|
list.add(characterReference);
|
575 |
|
|
characterReference=getNextCharacterReference(characterReference.end);
|
576 |
|
|
} while (characterReference!=null);
|
577 |
|
|
return list;
|
578 |
|
|
}
|
579 |
|
|
|
580 |
|
|
/**
|
581 |
|
|
* Returns a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
|
582 |
|
|
* <p>
|
583 |
|
|
* According to the <a target="_blank" href="http://www.w3.org/TR/html401/">HTML 4.01 specification</a>, the following attributes have URI values:
|
584 |
|
|
* <table class="bordered" cellspacing="0">
|
585 |
|
|
* <tr><th>HTML element name<th>Attribute name
|
586 |
|
|
* <tr><td>{@link HTMLElementName#A A}<td>href
|
587 |
|
|
* <tr><td>{@link HTMLElementName#APPLET APPLET}<td>codebase
|
588 |
|
|
* <tr><td>{@link HTMLElementName#AREA AREA}<td>href
|
589 |
|
|
* <tr><td>{@link HTMLElementName#BASE BASE}<td>href
|
590 |
|
|
* <tr><td>{@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}<td>cite
|
591 |
|
|
* <tr><td>{@link HTMLElementName#BODY BODY}<td>background
|
592 |
|
|
* <tr><td>{@link HTMLElementName#FORM FORM}<td>action
|
593 |
|
|
* <tr><td>{@link HTMLElementName#FRAME FRAME}<td>longdesc
|
594 |
|
|
* <tr><td>{@link HTMLElementName#FRAME FRAME}<td>src
|
595 |
|
|
* <tr><td>{@link HTMLElementName#DEL DEL}<td>cite
|
596 |
|
|
* <tr><td>{@link HTMLElementName#HEAD HEAD}<td>profile
|
597 |
|
|
* <tr><td>{@link HTMLElementName#IFRAME IFRAME}<td>longdesc
|
598 |
|
|
* <tr><td>{@link HTMLElementName#IFRAME IFRAME}<td>src
|
599 |
|
|
* <tr><td>{@link HTMLElementName#IMG IMG}<td>longdesc
|
600 |
|
|
* <tr><td>{@link HTMLElementName#IMG IMG}<td>src
|
601 |
|
|
* <tr><td>{@link HTMLElementName#IMG IMG}<td>usemap
|
602 |
|
|
* <tr><td>{@link HTMLElementName#INPUT INPUT}<td>src
|
603 |
|
|
* <tr><td>{@link HTMLElementName#INPUT INPUT}<td>usemap
|
604 |
|
|
* <tr><td>{@link HTMLElementName#INS INS}<td>cite
|
605 |
|
|
* <tr><td>{@link HTMLElementName#LINK LINK}<td>href
|
606 |
|
|
* <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>classid
|
607 |
|
|
* <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>codebase
|
608 |
|
|
* <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>data
|
609 |
|
|
* <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>usemap
|
610 |
|
|
* <tr><td>{@link HTMLElementName#Q Q}<td>cite
|
611 |
|
|
* <tr><td>{@link HTMLElementName#SCRIPT SCRIPT}<td>src
|
612 |
|
|
* </table>
|
613 |
|
|
* <p>
|
614 |
|
|
* Attributes from other elements may also be returned if the attribute name matches one of those in the list above.
|
615 |
|
|
* <p>
|
616 |
|
|
* This method is often used in conjunction with the {@link #getStyleURISegments()} method in order to find all URIs in a document.
|
617 |
|
|
* <p>
|
618 |
|
|
* The attributes are returned in order of appearance.
|
619 |
|
|
*
|
620 |
|
|
* @return a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
|
621 |
|
|
* @see #getStyleURISegments()
|
622 |
|
|
*/
|
623 |
|
|
public List<Attribute> getURIAttributes() {
|
624 |
|
|
return URIAttributes.getList(this);
|
625 |
|
|
}
|
626 |
|
|
|
627 |
|
|
/**
|
628 |
|
|
* Returns a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments}
|
629 |
|
|
* inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
|
630 |
|
|
* <p>
|
631 |
|
|
* If this segment does not contain any tags, the entire segment is assumed to be a <code>style</code> attribute value.
|
632 |
|
|
* <p>
|
633 |
|
|
* The URI segments are found by searching the <code>style</code> attribute values for the functional notation "<code>url()</code>" as described in
|
634 |
|
|
* <a target="_blank" href="http://www.w3.org/TR/CSS2/syndata.html#uri">section 4.3.4 of the CSS2 specification</a>.
|
635 |
|
|
* <p>
|
636 |
|
|
* The segments are returned in order of appearance.
|
637 |
|
|
*
|
638 |
|
|
* @return a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
|
639 |
|
|
* @see #getURIAttributes()
|
640 |
|
|
*/
|
641 |
|
|
public List<Segment> getStyleURISegments() {
|
642 |
|
|
return URIAttributes.getStyleURISegments(this);
|
643 |
|
|
}
|
644 |
|
|
|
645 |
|
|
/**
|
646 |
|
|
* Returns the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
|
647 |
|
|
* <p>
|
648 |
|
|
* This is functionally equivalent to {@link #getAllStartTags()}<code>.iterator().next()</code>,
|
649 |
|
|
* but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
650 |
|
|
*
|
651 |
|
|
* @return the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
652 |
|
|
*/
|
653 |
|
|
public final StartTag getFirstStartTag() {
|
654 |
|
|
return checkEnclosure(source.getNextStartTag(begin));
|
655 |
|
|
}
|
656 |
|
|
|
657 |
|
|
/**
|
658 |
|
|
* Returns the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment.
|
659 |
|
|
* <p>
|
660 |
|
|
* This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}<code>.iterator().next()</code>,
|
661 |
|
|
* but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
662 |
|
|
*
|
663 |
|
|
* @param startTagType the <code>StartTagType</code> to search for.
|
664 |
|
|
* @return the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
665 |
|
|
*/
|
666 |
|
|
public final StartTag getFirstStartTag(StartTagType startTagType) {
|
667 |
|
|
return checkEnclosure(source.getNextStartTag(begin,startTagType));
|
668 |
|
|
}
|
669 |
|
|
|
670 |
|
|
/**
|
671 |
|
|
* Returns the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
|
672 |
|
|
* <p>
|
673 |
|
|
* This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}<code>.iterator().next()</code>,
|
674 |
|
|
* but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
675 |
|
|
* <p>
|
676 |
|
|
* Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstStartTag()}.
|
677 |
|
|
*
|
678 |
|
|
* @param name the {@linkplain StartTag#getName() name} of the start tag to search for, may be <code>null</code>.
|
679 |
|
|
* @return the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
680 |
|
|
*/
|
681 |
|
|
public final StartTag getFirstStartTag(String name) {
|
682 |
|
|
return checkEnclosure(source.getNextStartTag(begin,name));
|
683 |
|
|
}
|
684 |
|
|
|
685 |
|
|
/**
|
686 |
|
|
* Returns the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
|
687 |
|
|
* <p>
|
688 |
|
|
* This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>,
|
689 |
|
|
* but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
690 |
|
|
*
|
691 |
|
|
* @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
692 |
|
|
* @param value the value of the specified attribute to search for, must not be <code>null</code>.
|
693 |
|
|
* @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
|
694 |
|
|
* @return the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
695 |
|
|
* @see #getFirstStartTag(String attributeName, Pattern valueRegexPattern)
|
696 |
|
|
*/
|
697 |
|
|
public final StartTag getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive) {
|
698 |
|
|
return checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
|
699 |
|
|
}
|
700 |
|
|
|
701 |
|
|
/**
|
702 |
|
|
* Returns the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
|
703 |
|
|
* <p>
|
704 |
|
|
* This is functionally equivalent to {@link #getAllStartTags(String,Pattern) getAllStartTags(attributeName,valueRegexPattern)}<code>.iterator().next()</code>,
|
705 |
|
|
* but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
706 |
|
|
*
|
707 |
|
|
* @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
708 |
|
|
* @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
|
709 |
|
|
* @return the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
710 |
|
|
* @see #getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
|
711 |
|
|
*/
|
712 |
|
|
public final StartTag getFirstStartTag(final String attributeName, final Pattern valueRegexPattern) {
|
713 |
|
|
return checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
|
714 |
|
|
}
|
715 |
|
|
|
716 |
|
|
/**
|
717 |
|
|
* Returns the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
|
718 |
|
|
* <p>
|
719 |
|
|
* This is functionally equivalent to {@link #getAllStartTagsByClass(String) getAllStartTagsByClass(className)}<code>.iterator().next()</code>,
|
720 |
|
|
* but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
|
721 |
|
|
*
|
722 |
|
|
* @param className the class name (case sensitive) to search for, must not be <code>null</code>.
|
723 |
|
|
* @return the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
724 |
|
|
*/
|
725 |
|
|
public final StartTag getFirstStartTagByClass(final String className) {
|
726 |
|
|
return checkEnclosure(source.getNextStartTagByClass(begin,className));
|
727 |
|
|
}
|
728 |
|
|
|
729 |
|
|
/**
|
730 |
|
|
* Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment.
|
731 |
|
|
* <p>
|
732 |
|
|
* This is functionally equivalent to {@link #getAllElements()}<code>.iterator().next()</code>,
|
733 |
|
|
* but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
734 |
|
|
* <p>
|
735 |
|
|
* If this segment is itself an {@link Element}, this element is returned, not the first child element.
|
736 |
|
|
*
|
737 |
|
|
* @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
738 |
|
|
*/
|
739 |
|
|
public final Element getFirstElement() {
|
740 |
|
|
StartTag startTag=checkEnclosure(StartTag.getNext(source,begin));
|
741 |
|
|
while (startTag!=null) {
|
742 |
|
|
final Element element=startTag.getElement();
|
743 |
|
|
if (element.end<=end) return element;
|
744 |
|
|
startTag=checkEnclosure(startTag.getNextStartTag());
|
745 |
|
|
}
|
746 |
|
|
return null;
|
747 |
|
|
}
|
748 |
|
|
|
749 |
|
|
/**
|
750 |
|
|
* Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment.
|
751 |
|
|
* <p>
|
752 |
|
|
* This is functionally equivalent to {@link #getAllElements(String) getAllElements(name)}<code>.iterator().next()</code>,
|
753 |
|
|
* but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
754 |
|
|
* <p>
|
755 |
|
|
* Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstElement()}.
|
756 |
|
|
* <p>
|
757 |
|
|
* If this segment is itself an {@link Element} with the specified name, this element is returned.
|
758 |
|
|
*
|
759 |
|
|
* @param name the {@linkplain Element#getName() name} of the element to search for.
|
760 |
|
|
* @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
761 |
|
|
*/
|
762 |
|
|
public final Element getFirstElement(String name) {
|
763 |
|
|
if (name==null) return getFirstElement();
|
764 |
|
|
final boolean isXMLTagName=Tag.isXMLName(name);
|
765 |
|
|
StartTag startTag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName));
|
766 |
|
|
while (startTag!=null) {
|
767 |
|
|
final Element element=startTag.getElement();
|
768 |
|
|
if (element.end<=end) return element;
|
769 |
|
|
startTag=checkEnclosure(StartTag.getNext(source,startTag.begin+1,name,StartTagType.NORMAL,isXMLTagName));
|
770 |
|
|
}
|
771 |
|
|
return null;
|
772 |
|
|
}
|
773 |
|
|
|
774 |
|
|
/**
|
775 |
|
|
* Returns the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
|
776 |
|
|
* <p>
|
777 |
|
|
* This is functionally equivalent to {@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>,
|
778 |
|
|
* but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
779 |
|
|
* <p>
|
780 |
|
|
* If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned.
|
781 |
|
|
*
|
782 |
|
|
* @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
783 |
|
|
* @param value the value of the specified attribute to search for, must not be <code>null</code>.
|
784 |
|
|
* @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
|
785 |
|
|
* @return the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
786 |
|
|
* @see #getFirstElement(String attributeName, Pattern valueRegexPattern)
|
787 |
|
|
*/
|
788 |
|
|
public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive) {
|
789 |
|
|
StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
|
790 |
|
|
while (startTag!=null) {
|
791 |
|
|
final Element element=startTag.getElement();
|
792 |
|
|
if (element.end<=end) return element;
|
793 |
|
|
startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive));
|
794 |
|
|
}
|
795 |
|
|
return null;
|
796 |
|
|
}
|
797 |
|
|
|
798 |
|
|
/**
|
799 |
|
|
* Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
|
800 |
|
|
* <p>
|
801 |
|
|
* This is functionally equivalent to {@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}<code>.iterator().next()</code>,
|
802 |
|
|
* but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
803 |
|
|
* <p>
|
804 |
|
|
* If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned.
|
805 |
|
|
*
|
806 |
|
|
* @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
|
807 |
|
|
* @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
|
808 |
|
|
* @return the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
809 |
|
|
* @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
|
810 |
|
|
*/
|
811 |
|
|
public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern) {
|
812 |
|
|
StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
|
813 |
|
|
while (startTag!=null) {
|
814 |
|
|
final Element element=startTag.getElement();
|
815 |
|
|
if (element.end<=end) return element;
|
816 |
|
|
startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,valueRegexPattern));
|
817 |
|
|
}
|
818 |
|
|
return null;
|
819 |
|
|
}
|
820 |
|
|
|
821 |
|
|
/**
|
822 |
|
|
* Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
|
823 |
|
|
* <p>
|
824 |
|
|
* This is functionally equivalent to {@link #getAllElementsByClass(String) getAllElementsByClass(className)}<code>.iterator().next()</code>,
|
825 |
|
|
* but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
|
826 |
|
|
* <p>
|
827 |
|
|
* If this segment is itself an {@link Element} with the specified class, this element is returned.
|
828 |
|
|
*
|
829 |
|
|
* @param className the class name (case sensitive) to search for, must not be <code>null</code>.
|
830 |
|
|
* @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
|
831 |
|
|
*/
|
832 |
|
|
public final Element getFirstElementByClass(final String className) {
|
833 |
|
|
StartTag startTag=checkEnclosure(source.getNextStartTagByClass(begin,className));
|
834 |
|
|
while (startTag!=null) {
|
835 |
|
|
final Element element=startTag.getElement();
|
836 |
|
|
if (element.end<=end) return element;
|
837 |
|
|
startTag=checkEnclosure(source.getNextStartTagByClass(startTag.begin+1,className));
|
838 |
|
|
}
|
839 |
|
|
return null;
|
840 |
|
|
}
|
841 |
|
|
|
842 |
|
|
/**
|
843 |
|
|
* Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
844 |
|
|
* @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
845 |
|
|
*/
|
846 |
|
|
public List<FormControl> getFormControls() {
|
847 |
|
|
return FormControl.getAll(this);
|
848 |
|
|
}
|
849 |
|
|
|
850 |
|
|
/**
|
851 |
|
|
* Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
852 |
|
|
* <p>
|
853 |
|
|
* This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #getFormControls()}<code>)</code>.
|
854 |
|
|
*
|
855 |
|
|
* @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
|
856 |
|
|
* @see #getFormControls()
|
857 |
|
|
*/
|
858 |
|
|
public FormFields getFormFields() {
|
859 |
|
|
return new FormFields(getFormControls());
|
860 |
|
|
}
|
861 |
|
|
|
862 |
|
|
/**
|
863 |
|
|
* Parses any {@link Attributes} within this segment.
|
864 |
|
|
* This method is only used in the unusual situation where attributes exist outside of a start tag.
|
865 |
|
|
* The {@link StartTag#getAttributes()} method should be used in normal situations.
|
866 |
|
|
* <p>
|
867 |
|
|
* This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
|
868 |
|
|
*
|
869 |
|
|
* @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
|
870 |
|
|
*/
|
871 |
|
|
public Attributes parseAttributes() {
|
872 |
|
|
return source.parseAttributes(begin,end);
|
873 |
|
|
}
|
874 |
|
|
|
875 |
|
|
/**
|
876 |
|
|
* Causes this segment to be ignored when parsing.
|
877 |
|
|
* <p>
|
878 |
|
|
* Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions.
|
879 |
|
|
* <p>
|
880 |
|
|
* This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside
|
881 |
|
|
* {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags
|
882 |
|
|
* (such as where an {@linkplain Attribute attribute} of a normal tag uses a server tag to dynamically set its value),
|
883 |
|
|
* as well as preventing non-server tags from being recognised inside server tags.
|
884 |
|
|
* <p>
|
885 |
|
|
* It is not necessary to use this method to ignore {@linkplain TagType#isServerTag() server tags} located inside normal tags,
|
886 |
|
|
* as the attributes parser automatically ignores any server tags.
|
887 |
|
|
* <p>
|
888 |
|
|
* It is not necessary to use this method to ignore non-server tags inside server tags, or the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements,
|
889 |
|
|
* as the parser does this automatically when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
|
890 |
|
|
* <p>
|
891 |
|
|
* This leaves only very few scenarios where calling this method still provides a significant benefit.
|
892 |
|
|
* <p>
|
893 |
|
|
* One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags.
|
894 |
|
|
* Here is an example using an XML-style JSP tag:
|
895 |
|
|
* <blockquote class="code"><code>&lt;a href="&lt;i18n:resource path="/Portal"/&gt;?BACK=TRUE"&gt;back&lt;/a&gt;</code></blockquote>
|
896 |
|
|
* The first double-quote of <code>"/Portal"</code> will be interpreted as the end quote for the <code>href</code> attribute,
|
897 |
|
|
* as there is no way for the parser to recognise the <code>i18n:resource</code> element as a server tag.
|
898 |
|
|
* Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice,
|
899 |
|
|
* but it is nevertheless valid JSP. The only way to ensure that this library is able to parse the normal tag surrounding it is to
|
900 |
|
|
* find these server tags first and call the <code>ignoreWhenParsing</code> method to ignore them before parsing the rest of the document.
|
901 |
|
|
* <p>
|
902 |
|
|
* It is important to understand the difference between ignoring the segment when parsing and removing the segment completely.
|
903 |
|
|
* Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of
|
904 |
|
|
* tools such as {@link TextExtractor} and {@link Renderer}.
|
905 |
|
|
* <p>
|
906 |
|
|
* To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
|
907 |
|
|
* {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment.
|
908 |
|
|
* Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
|
909 |
|
|
* and perform the desired operations on this new source object.
|
910 |
|
|
* <p>
|
911 |
|
|
* Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an <code>IllegalStateException</code>.
|
912 |
|
|
* <p>
|
913 |
|
|
* Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache},
|
914 |
|
|
* and so will continue to be found by the <a href="Tag.html#TagSearchMethods">tag search methods</a>.
|
915 |
|
|
* If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache.
|
916 |
|
|
* Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
|
917 |
|
|
* <p>
|
918 |
|
|
* For best performance, this method should be called on all segments that need to be ignored without calling
|
919 |
|
|
* any of the <a href="Tag.html#TagSearchMethods">tag search methods</a> in between.
|
920 |
|
|
*
|
921 |
|
|
* @see Source#ignoreWhenParsing(Collection segments)
|
922 |
|
|
*/
|
923 |
|
|
public void ignoreWhenParsing() {
|
924 |
|
|
source.ignoreWhenParsing(begin,end);
|
925 |
|
|
}
|
926 |
|
|
|
927 |
|
|
/**
|
928 |
|
|
* Compares this <code>Segment</code> object to another object.
|
929 |
|
|
* <p>
|
930 |
|
|
* If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown.
|
931 |
|
|
* <p>
|
932 |
|
|
* A segment is considered to be before another segment if its begin position is earlier,
|
933 |
|
|
* or in the case that both segments begin at the same position, its end position is earlier.
|
934 |
|
|
* <p>
|
935 |
|
|
* Segments that begin and end at the same position are considered equal for
|
936 |
|
|
* the purposes of this comparison, even if they relate to different source documents.
|
937 |
|
|
* <p>
|
938 |
|
|
* Note: this class has a natural ordering that is inconsistent with equals.
|
939 |
|
|
* This means that this method may return zero in some cases where calling the
|
940 |
|
|
* {@link #equals(Object)} method with the same argument returns <code>false</code>.
|
941 |
|
|
*
|
942 |
|
|
* @param segment the segment to be compared
|
943 |
|
|
* @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
|
944 |
|
|
* @throws ClassCastException if the argument is not a <code>Segment</code>
|
945 |
|
|
*/
|
946 |
|
|
public int compareTo(final Segment segment) {
|
947 |
|
|
if (this==segment) return 0;
|
948 |
|
|
if (begin<segment.begin) return -1;
|
949 |
|
|
if (begin>segment.begin) return 1;
|
950 |
|
|
if (end<segment.end) return -1;
|
951 |
|
|
if (end>segment.end) return 1;
|
952 |
|
|
return 0;
|
953 |
|
|
}
|
954 |
|
|
|
955 |
|
|
/**
|
956 |
|
|
* Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}.
|
957 |
|
|
* @return <code>true</code> if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise <code>false</code>.
|
958 |
|
|
*/
|
959 |
|
|
public final boolean isWhiteSpace() {
|
960 |
|
|
for (int i=begin; i<end; i++)
|
961 |
|
|
if (!isWhiteSpace(source.charAt(i))) return false;
|
962 |
|
|
return true;
|
963 |
|
|
}
|
964 |
|
|
|
965 |
|
|
/**
|
966 |
|
|
* Returns an indication of the maximum depth of nested elements within this segment.
|
967 |
|
|
* <p>
|
968 |
|
|
* A high return value can indicate that the segment contains a large number of incorrectly nested tags that could result in a <code>StackOverflowError</code>
|
969 |
|
|
* if its content is parsed.
|
970 |
|
|
* <p>
|
971 |
|
|
* The usefulness of this method is debatable as a <code>StackOverflowError</code> is a recoverable error that can be easily caught.
|
972 |
|
|
* The use of this method to pre-detect and avoid a stack overflow may save some memory and processing resources in certain circumstances, but the cost of calling
|
973 |
|
|
* this method to check every segment or document will very often exceed any benefit.
|
974 |
|
|
* <p>
|
975 |
|
|
* It is up to the application developer to determine what return value constitutes an unreasonable level of nesting given the stack space allocated to the application
|
976 |
|
|
* and other factors.
|
977 |
|
|
* <p>
|
978 |
|
|
* Note that the return value is an approximation only and is usually greater than the actual maximum element depth that would be reported by calling the
|
979 |
|
|
* {@link Element#getDepth()} method on the most nested element.
|
980 |
|
|
*
|
981 |
|
|
* @return an indication of the maximum depth of nested elements within this segment.
|
982 |
|
|
*/
|
983 |
|
|
public int getMaxDepthIndicator() {
|
984 |
|
|
int maxDepth=0;
|
985 |
|
|
int depth=0;
|
986 |
|
|
for (Tag tag : getAllTags()) {
|
987 |
|
|
if (tag instanceof StartTag) {
|
988 |
|
|
StartTag startTag=(StartTag)tag;
|
989 |
|
|
if (startTag.getStartTagType().getCorrespondingEndTagType()==null) continue;
|
990 |
|
|
if (HTMLElements.getEndTagForbiddenElementNames().contains(startTag.getName())) continue;
|
991 |
|
|
if (startTag.isEmptyElementTag()) continue;
|
992 |
|
|
depth++;
|
993 |
|
|
if (depth>maxDepth) maxDepth++;
|
994 |
|
|
} else {
|
995 |
|
|
depth--;
|
996 |
|
|
}
|
997 |
|
|
}
|
998 |
|
|
return maxDepth;
|
999 |
|
|
}
|
1000 |
|
|
|
1001 |
|
|
/**
|
1002 |
|
|
* Indicates whether the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>.
|
1003 |
|
|
* <p>
|
1004 |
|
|
* The <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">HTML 4.01 specification section 9.1</a>
|
1005 |
|
|
* specifies the following white space characters:
|
1006 |
|
|
* <ul>
|
1007 |
|
|
* <li>space (U+0020)
|
1008 |
|
|
* <li>tab (U+0009)
|
1009 |
|
|
* <li>form feed (U+000C)
|
1010 |
|
|
* <li>line feed (U+000A)
|
1011 |
|
|
* <li>carriage return (U+000D)
|
1012 |
|
|
* <li>zero-width space (U+200B)
|
1013 |
|
|
* </ul>
|
1014 |
|
|
* <p>
|
1015 |
|
|
* Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not
|
1016 |
|
|
* recognise them as white space and renders them as an unprintable character (empty square).
|
1017 |
|
|
* Even zero-width spaces included using the numeric character reference <code>&amp;#x200B;</code> are rendered this way.
|
1018 |
|
|
*
|
1019 |
|
|
* @param ch the character to test.
|
1020 |
|
|
* @return <code>true</code> if the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>, otherwise <code>false</code>.
|
1021 |
|
|
*/
|
1022 |
|
|
public static final boolean isWhiteSpace(final char ch) {
|
1023 |
|
|
for (char whiteSpaceChar : WHITESPACE) if (ch==whiteSpaceChar) return true;
|
1024 |
|
|
return false;
|
1025 |
|
|
}
|
1026 |
|
|
|
1027 |
|
|
/**
|
1028 |
|
|
* Returns a string representation of this object useful for debugging purposes.
|
1029 |
|
|
* @return a string representation of this object useful for debugging purposes.
|
1030 |
|
|
*/
|
1031 |
|
|
public String getDebugInfo() {
|
1032 |
|
|
final StringBuilder sb=new StringBuilder(50);
|
1033 |
|
|
sb.append('(');
|
1034 |
|
|
source.getRowColumnVector(begin).appendTo(sb);
|
1035 |
|
|
sb.append('-');
|
1036 |
|
|
source.getRowColumnVector(end).appendTo(sb);
|
1037 |
|
|
sb.append(')');
|
1038 |
|
|
return sb.toString();
|
1039 |
|
|
}
|
1040 |
|
|
|
1041 |
|
|
/**
|
1042 |
|
|
* Returns the character at the specified index.
|
1043 |
|
|
* <p>
|
1044 |
|
|
* This is logically equivalent to <code>toString().charAt(index)</code>
|
1045 |
|
|
* for valid argument values <code>0 <= index < length()</code>.
|
1046 |
|
|
* <p>
|
1047 |
|
|
* However because this implementation works directly on the underlying document source string,
|
1048 |
|
|
* it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
|
1049 |
|
|
* for an invalid argument value.
|
1050 |
|
|
*
|
1051 |
|
|
* @param index the index of the character.
|
1052 |
|
|
* @return the character at the specified index.
|
1053 |
|
|
*/
|
1054 |
|
|
public char charAt(final int index) {
|
1055 |
|
|
return source.charAt(begin+index);
|
1056 |
|
|
}
|
1057 |
|
|
|
1058 |
|
|
/**
|
1059 |
|
|
* Returns a new character sequence that is a subsequence of this sequence.
|
1060 |
|
|
* <p>
|
1061 |
|
|
* This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code>
|
1062 |
|
|
* for valid values of <code>beginIndex</code> and <code>endIndex</code>.
|
1063 |
|
|
* <p>
|
1064 |
|
|
* However because this implementation works directly on the underlying document source text,
|
1065 |
|
|
* it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
|
1066 |
|
|
* for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
|
1067 |
|
|
*
|
1068 |
|
|
* @param beginIndex the begin index, inclusive.
|
1069 |
|
|
* @param endIndex the end index, exclusive.
|
1070 |
|
|
* @return a new character sequence that is a subsequence of this sequence.
|
1071 |
|
|
*/
|
1072 |
|
|
public CharSequence subSequence(final int beginIndex, final int endIndex) {
|
1073 |
|
|
return source.subSequence(begin+beginIndex,begin+endIndex);
|
1074 |
|
|
}
|
1075 |
|
|
|
1076 |
|
|
/**
|
1077 |
|
|
* Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
|
1078 |
|
|
* All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
|
1079 |
|
|
*/
|
1080 |
|
|
static final StringBuilder appendCollapseWhiteSpace(final StringBuilder sb, final CharSequence text) {
|
1081 |
|
|
final int textLength=text.length();
|
1082 |
|
|
int i=0;
|
1083 |
|
|
boolean lastWasWhiteSpace=false;
|
1084 |
|
|
while (true) {
|
1085 |
|
|
if (i>=textLength) return sb;
|
1086 |
|
|
if (!isWhiteSpace(text.charAt(i))) break;
|
1087 |
|
|
i++;
|
1088 |
|
|
}
|
1089 |
|
|
do {
|
1090 |
|
|
final char ch=text.charAt(i++);
|
1091 |
|
|
if (isWhiteSpace(ch)) {
|
1092 |
|
|
lastWasWhiteSpace=true;
|
1093 |
|
|
} else {
|
1094 |
|
|
if (lastWasWhiteSpace) {
|
1095 |
|
|
sb.append(' ');
|
1096 |
|
|
lastWasWhiteSpace=false;
|
1097 |
|
|
}
|
1098 |
|
|
sb.append(ch);
|
1099 |
|
|
}
|
1100 |
|
|
} while (i<textLength);
|
1101 |
|
|
return sb;
|
1102 |
|
|
}
|
1103 |
|
|
|
1104 |
|
|
static final Pattern getClassPattern(final String className) {
|
1105 |
|
|
return Pattern.compile(".*(\\s|^)"+className+"(\\s|$).*",Pattern.DOTALL);
|
1106 |
|
|
}
|
1107 |
|
|
|
1108 |
|
|
private List<Element> getAllElements(final List<StartTag> startTags) {
|
1109 |
|
|
if (startTags.isEmpty()) return Collections.emptyList();
|
1110 |
|
|
final ArrayList<Element> elements=new ArrayList<Element>(startTags.size());
|
1111 |
|
|
for (StartTag startTag : startTags) {
|
1112 |
|
|
final Element element=startTag.getElement();
|
1113 |
|
|
if (element.end<=end) elements.add(element);
|
1114 |
|
|
}
|
1115 |
|
|
return elements;
|
1116 |
|
|
}
|
1117 |
|
|
|
1118 |
|
|
private StartTag checkEnclosure(final StartTag startTag) {
|
1119 |
|
|
if (startTag==null || startTag.end>end) return null;
|
1120 |
|
|
return startTag;
|
1121 |
|
|
}
|
1122 |
|
|
|
1123 |
|
|
private Tag checkTagEnclosure(final Tag tag) {
|
1124 |
|
|
if (tag==null || tag.end>end) return null;
|
1125 |
|
|
return tag;
|
1126 |
|
|
}
|
1127 |
|
|
|
1128 |
|
|
private CharacterReference getNextCharacterReference(final int pos) {
|
1129 |
|
|
final CharacterReference characterReference=source.getNextCharacterReference(pos);
|
1130 |
|
|
if (characterReference==null || characterReference.end>end) return null;
|
1131 |
|
|
return characterReference;
|
1132 |
|
|
}
|
1133 |
|
|
}
|
1134 |
|
|
|