// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.2
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
20 |
|
21 |
package net.htmlparser.jericho;
|
22 |
|
23 |
import java.util.*;
|
24 |
import java.io.*;
|
25 |
import java.net.*;
|
26 |
|
27 |
/**
|
28 |
* Extracts the textual content from HTML markup.
|
29 |
* <p>
|
30 |
* The output is ideal for feeding into a text search engine such as <a target="_blank" href="http://lucene.apache.org/java/">Apache Lucene</a>,
|
31 |
* especially when the {@link #setIncludeAttributes(boolean) IncludeAttributes} property has been set to <code>true</code>.
|
32 |
* <p>
|
33 |
* Use one of the following methods to obtain the output:
|
34 |
* <ul style="margin-top: 0">
|
35 |
* <li>{@link #writeTo(Writer)}</li>
|
36 |
* <li>{@link #appendTo(Appendable)}</li>
|
37 |
* <li>{@link #toString()}</li>
|
38 |
* <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
|
39 |
* </ul>
|
40 |
* <p>
|
41 |
* The process removes all of the tags and
|
42 |
* {@linkplain CharacterReference#decodeCollapseWhiteSpace(CharSequence) decodes the result, collapsing all white space}.
|
43 |
* A space character is included in the output where a <a href="TagType.html#Normal">normal</a> tag is present in the source,
|
44 |
* unless the tag belongs to an {@linkplain HTMLElements#getInlineLevelElementNames() inline-level} element.
|
45 |
* An exception to this is the {@link HTMLElementName#BR BR} element, which is also converted to a space despite being an inline-level element.
|
46 |
* <p>
|
47 |
* Text inside {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements contained within this segment
|
48 |
* is ignored.
|
49 |
* <p>
|
50 |
* Setting the {@link #setExcludeNonHTMLElements(boolean) ExcludeNonHTMLElements} property results in the exclusion of any content within a
|
51 |
* <a href="HTMLElements.html#NonHTMLElement">non-HTML element</a>.
|
52 |
* <p>
|
53 |
* See the {@link #excludeElement(StartTag)} method for details on how to implement a more complex mechanism to determine whether the
|
54 |
* {@linkplain Element#getContent() content} of each {@link Element} is to be excluded from the output.
|
55 |
* <p>
|
56 |
* All tags that are not <a href="TagType.html#Normal">normal</a> tags, such as {@linkplain TagType#isServerTag() server tags},
|
57 |
* {@linkplain StartTagType#COMMENT comments} etc., are removed from the output without adding white space to the output.
|
58 |
* <p>
|
59 |
* Note that segments on which the {@link Segment#ignoreWhenParsing()} method has been called are treated as text rather than markup,
|
60 |
* resulting in their inclusion in the output.
|
61 |
* To remove specific segments before extracting the text, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
|
62 |
* {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment to be removed.
|
63 |
* Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
|
64 |
* and perform the text extraction on this new source object.
|
65 |
* <p>
|
66 |
* Extracting the text from an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
|
67 |
* <p>
|
68 |
* To perform a simple rendering of HTML markup into text, which is more readable than the output of this class, use the {@link Renderer} class instead.
|
69 |
* <dl>
|
70 |
* <dt>Example:</dt>
|
71 |
* <dd>Using the default settings, the source segment:<br />
|
72 |
* "<code><div><b>O</b>ne</div><div title="Two"><b>Th</b><script>//a script </script>ree</div></code>"<br />
|
73 |
* produces the text "<code>One Two Three</code>".
|
74 |
* </dl>
|
75 |
*/
|
76 |
public class TextExtractor implements CharStreamSource {
|
77 |
private final Segment segment;
|
78 |
private boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
|
79 |
private boolean includeAttributes=false;
|
80 |
private boolean excludeNonHTMLElements=false;
|
81 |
|
82 |
private static final Map<String,AttributeIncludeChecker> map; // maps each possibly included attribute name to an AttributeIncludeChecker instance, initialised in static block below.
|
83 |
|
84 |
/**
|
85 |
* Constructs a new <code>TextExtractor</code> based on the specified {@link Segment}.
|
86 |
* @param segment the segment from which the text will be extracted.
|
87 |
* @see Segment#getTextExtractor()
|
88 |
*/
|
89 |
public TextExtractor(final Segment segment) {
|
90 |
this.segment=segment;
|
91 |
}
|
92 |
|
93 |
// Documentation inherited from CharStreamSource
|
94 |
public void writeTo(final Writer writer) throws IOException {
|
95 |
appendTo(writer);
|
96 |
writer.flush();
|
97 |
}
|
98 |
|
99 |
// Documentation inherited from CharStreamSource
|
100 |
public void appendTo(final Appendable appendable) throws IOException {
|
101 |
appendable.append(toString());
|
102 |
}
|
103 |
|
104 |
// Documentation inherited from CharStreamSource
|
105 |
public long getEstimatedMaximumOutputLength() {
|
106 |
return segment.length();
|
107 |
}
|
108 |
|
109 |
// Documentation inherited from CharStreamSource
|
110 |
public String toString() {
|
111 |
return new Processor(segment,getConvertNonBreakingSpaces(),getIncludeAttributes(),getExcludeNonHTMLElements()).toString();
|
112 |
}
|
113 |
|
114 |
/**
|
115 |
* Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces.
|
116 |
* <p>
|
117 |
* The default value is that of the static {@link Config#ConvertNonBreakingSpaces} property at the time the <code>TextExtractor</code> is instantiated.
|
118 |
*
|
119 |
* @param convertNonBreakingSpaces specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces.
|
120 |
* @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
|
121 |
* @see #getConvertNonBreakingSpaces()
|
122 |
*/
|
123 |
public TextExtractor setConvertNonBreakingSpaces(boolean convertNonBreakingSpaces) {
|
124 |
this.convertNonBreakingSpaces=convertNonBreakingSpaces;
|
125 |
return this;
|
126 |
}
|
127 |
|
128 |
/**
|
129 |
* Indicates whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces.
|
130 |
* <p>
|
131 |
* See the {@link #setConvertNonBreakingSpaces(boolean)} method for a full description of this property.
|
132 |
*
|
133 |
* @return <code>true</code> if non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces, otherwise <code>false</code>.
|
134 |
*/
|
135 |
public boolean getConvertNonBreakingSpaces() {
|
136 |
return convertNonBreakingSpaces;
|
137 |
}
|
138 |
|
139 |
/**
|
140 |
* Sets whether any attribute values are included in the output.
|
141 |
* <p>
|
142 |
* If the value of this property is <code>true</code>, then each attribute still has to match the conditions implemented in the
|
143 |
* {@link #includeAttribute(StartTag,Attribute)} method in order for its value to be included in the output.
|
144 |
* <p>
|
145 |
* The default value is <code>false</code>.
|
146 |
*
|
147 |
* @param includeAttributes specifies whether any attribute values are included in the output.
|
148 |
* @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
|
149 |
* @see #getIncludeAttributes()
|
150 |
*/
|
151 |
public TextExtractor setIncludeAttributes(boolean includeAttributes) {
|
152 |
this.includeAttributes=includeAttributes;
|
153 |
return this;
|
154 |
}
|
155 |
|
156 |
/**
|
157 |
* Indicates whether any attribute values are included in the output.
|
158 |
* <p>
|
159 |
* See the {@link #setIncludeAttributes(boolean)} method for a full description of this property.
|
160 |
*
|
161 |
* @return <code>true</code> if any attribute values are included in the output, otherwise <code>false</code>.
|
162 |
*/
|
163 |
public boolean getIncludeAttributes() {
|
164 |
return includeAttributes;
|
165 |
}
|
166 |
|
167 |
/**
|
168 |
* Indicates whether the value of the specified {@linkplain Attribute attribute} in the specified {@linkplain StartTag start tag} is included in the output.
|
169 |
* <p>
|
170 |
* This method is ignored if the {@link #setIncludeAttributes(boolean) IncludeAttributes} property is set to <code>false</code>, in which case
|
171 |
* no attribute values are included in the output.
|
172 |
* <p>
|
173 |
* If the {@link #setIncludeAttributes(boolean) IncludeAttributes} property is set to <code>true</code>, every attribute of every
|
174 |
* start tag encountered in the segment is checked using this method to determine whether the value of the attribute should be included in the output.
|
175 |
* <p>
|
176 |
* The default implementation of this method returns <code>true</code> if the {@linkplain Attribute#getName() name} of the specified {@linkplain Attribute attribute}
|
177 |
* is one of
|
178 |
* <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a>,
|
179 |
* <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a>,
|
180 |
* <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#adef-label-OPTION">label</a>,
|
181 |
* <a target="_blank" href="http://www.w3.org/TR/html401/struct/tables.html#adef-summary">summary</a>,
|
182 |
* <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-content">content</a>*, or
|
183 |
* <a target="_blank" href="http://www.w3.org/TR/html401/struct/links.html#adef-href">href</a>,
|
184 |
* but the method can be overridden in a subclass to perform a check of arbitrary complexity on each attribute.
|
185 |
* <p>
|
186 |
* * The value of a <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-content">content</a> attribute is only included if a
|
187 |
* <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-name-META">name</a> attribute is also present in the specified start tag,
|
188 |
* as the content attribute of a {@link HTMLElementName#META META} tag only contains human readable text if the name attribute is used as opposed to an
|
189 |
* <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-http-equiv">http-equiv</a> attribute.
|
190 |
* <p>
|
191 |
* <dl>
|
192 |
* <dt>Example:</dt>
|
193 |
* <dd>
|
194 |
* To include only the value of <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a> and
|
195 |
* <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a> attributes:<br /><br />
|
196 |
* <code>
|
197 |
* final Set includeAttributeNames=new HashSet(Arrays.asList(new String[] {"title","alt"}));<br />
|
198 |
* TextExtractor textExtractor=new TextExtractor(segment) {<br />
|
199 |
* public boolean includeAttribute(StartTag startTag, Attribute attribute) {<br />
|
200 |
* return includeAttributeNames.contains(attribute.getKey());<br />
|
201 |
* }<br />
|
202 |
* };<br />
|
203 |
* textExtractor.setIncludeAttributes(true);<br />
|
204 |
* String extractedText=textExtractor.toString();
|
205 |
* </code>
|
206 |
* </dd>
|
207 |
* </dl>
|
208 |
* @param startTag the start tag of the element to check for inclusion.
|
209 |
* @return <true> if the text inside the {@link Element} of the specified start tag should be excluded from the output, otherwise <code>false</code>.
|
210 |
*/
|
211 |
public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
|
212 |
AttributeIncludeChecker attributeIncludeChecker=map.get(attribute.getKey());
|
213 |
if (attributeIncludeChecker==null) return false;
|
214 |
return attributeIncludeChecker.includeAttribute(startTag,attribute);
|
215 |
}
|
216 |
|
217 |
/**
|
218 |
* Sets whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
|
219 |
* <p>
|
220 |
* The default value is <code>false</code>, meaning that content from all elements meeting the other criteria is included.
|
221 |
*
|
222 |
* @param excludeNonHTMLElements specifies whether content <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
|
223 |
* @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
|
224 |
* @see #getExcludeNonHTMLElements()
|
225 |
*/
|
226 |
public TextExtractor setExcludeNonHTMLElements(boolean excludeNonHTMLElements) {
|
227 |
this.excludeNonHTMLElements=excludeNonHTMLElements;
|
228 |
return this;
|
229 |
}
|
230 |
|
231 |
/**
|
232 |
* Indicates whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
|
233 |
* <p>
|
234 |
* See the {@link #setExcludeNonHTMLElements(boolean)} method for a full description of this property.
|
235 |
*
|
236 |
* @return <code>true</code> if the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output, otherwise <code>false</code>.
|
237 |
*/
|
238 |
public boolean getExcludeNonHTMLElements() {
|
239 |
return excludeNonHTMLElements;
|
240 |
}
|
241 |
|
242 |
/**
|
243 |
* Indicates whether the text inside the {@link Element} of the specified start tag should be excluded from the output.
|
244 |
* <p>
|
245 |
* During the text extraction process, every start tag encountered in the segment is checked using this method to determine whether the text inside its
|
246 |
* {@linkplain StartTag#getElement() associated element} should be excluded from the output.
|
247 |
* <p>
|
248 |
* The default implementation of this method is to always return <code>false</code>, so that every element is included,
|
249 |
* but the method can be overridden in a subclass to perform a check of arbitrary complexity on each start tag.
|
250 |
* <p>
|
251 |
* All elements nested inside an excluded element are also implicitly excluded, as are all
|
252 |
* {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements.
|
253 |
* Such elements are skipped over without calling this method, so there is no way to include them by overriding the method.
|
254 |
* <p>
|
255 |
* <dl>
|
256 |
* <dt>Example:</dt>
|
257 |
* <dd>
|
258 |
* To extract the text from a <code>segment</code>, excluding any text inside elements with the attribute <code>class="NotIndexed"</code>:<br /><br />
|
259 |
* <code>
|
260 |
* TextExtractor textExtractor=new TextExtractor(segment) {<br />
|
261 |
* public boolean excludeElement(StartTag startTag) {<br />
|
262 |
* return "NotIndexed".equalsIgnoreCase(startTag.getAttributeValue("class"));<br />
|
263 |
* }<br />
|
264 |
* };<br />
|
265 |
* String extractedText=textExtractor.toString();
|
266 |
* </code>
|
267 |
* </dd>
|
268 |
* </dl>
|
269 |
* @param startTag the start tag of the element to check for inclusion.
|
270 |
* @return <true> if the text inside the {@link Element} of the specified start tag should be excluded from the output, otherwise <code>false</code>.
|
271 |
*/
|
272 |
public boolean excludeElement(final StartTag startTag) {
|
273 |
return false;
|
274 |
}
|
275 |
|
276 |
private static interface AttributeIncludeChecker {
|
277 |
boolean includeAttribute(final StartTag startTag, final Attribute attribute);
|
278 |
}
|
279 |
|
280 |
private static AttributeIncludeChecker ALWAYS_INCLUDE=new AttributeIncludeChecker() {
|
281 |
public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
|
282 |
return true;
|
283 |
}
|
284 |
};
|
285 |
|
286 |
private static AttributeIncludeChecker INCLUDE_IF_NAME_ATTRIBUTE_PRESENT=new AttributeIncludeChecker() {
|
287 |
public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
|
288 |
return startTag.getAttributes().get("name")!=null;
|
289 |
}
|
290 |
};
|
291 |
|
292 |
static {
|
293 |
map=new HashMap<String,AttributeIncludeChecker>();
|
294 |
map.put("title",ALWAYS_INCLUDE); // add title attribute
|
295 |
map.put("alt",ALWAYS_INCLUDE); // add alt attribute (APPLET, AREA, IMG and INPUT elements)
|
296 |
map.put("label",ALWAYS_INCLUDE); // add label attribute (OPTION and OPTGROUP elements)
|
297 |
map.put("summary",ALWAYS_INCLUDE); // add summary attribute (TABLE element)
|
298 |
map.put("content",INCLUDE_IF_NAME_ATTRIBUTE_PRESENT); // add content attribute (META element)
|
299 |
map.put("href",ALWAYS_INCLUDE); // add href attribute (A, AREA and LINK elements)
|
300 |
// don't bother with the prompt attribute from the ININDEX element as the element is deprecated and very rarely used.
|
301 |
}
|
302 |
|
303 |
/**
|
304 |
* This class does the actual work, but is first passed final copies of all the parameters for efficiency.
|
305 |
* Note at present this is not implemented in a memory-efficient manner.
|
306 |
* Once the CharacterReference.decodeCollapseWhiteSpace functionality is available as a FilterWriter (possible with java 5 support),
|
307 |
* the main algorithm can be implemented in the writeTo(Writer) method to allow for more memory-efficient processing.
|
308 |
*/
|
309 |
private final class Processor {
|
310 |
private final Segment segment;
|
311 |
private final Source source;
|
312 |
private final boolean convertNonBreakingSpaces;
|
313 |
private final boolean includeAttributes;
|
314 |
private final boolean excludeNonHTMLElements;
|
315 |
|
316 |
public Processor(final Segment segment, final boolean convertNonBreakingSpaces, final boolean includeAttributes, final boolean excludeNonHTMLElements) {
|
317 |
this.segment=segment;
|
318 |
source=segment.source;
|
319 |
this.convertNonBreakingSpaces=convertNonBreakingSpaces;
|
320 |
this.includeAttributes=includeAttributes;
|
321 |
this.excludeNonHTMLElements=excludeNonHTMLElements;
|
322 |
}
|
323 |
|
324 |
public String toString() {
|
325 |
final StringBuilder sb=new StringBuilder(segment.length());
|
326 |
for (NodeIterator nodeIterator=new NodeIterator(segment); nodeIterator.hasNext();) {
|
327 |
Segment segment=nodeIterator.next();
|
328 |
if (segment instanceof Tag) {
|
329 |
final Tag tag=(Tag)segment;
|
330 |
if (tag.getTagType().isServerTag()) {
|
331 |
// elementContainsMarkup should be made into a TagType property one day.
|
332 |
// for the time being assume all server element content is code, although this is not true for some Mason elements.
|
333 |
final boolean elementContainsMarkup=false;
|
334 |
if (!elementContainsMarkup) {
|
335 |
final Element element=tag.getElement();
|
336 |
if (element!=null && element.getEnd()>tag.getEnd()) nodeIterator.skipToPos(element.getEnd());
|
337 |
}
|
338 |
continue;
|
339 |
}
|
340 |
if (tag.getTagType()==StartTagType.NORMAL) {
|
341 |
final StartTag startTag=(StartTag)tag;
|
342 |
if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE || excludeElement(startTag) || (excludeNonHTMLElements && !HTMLElements.getElementNames().contains(tag.name))) {
|
343 |
nodeIterator.skipToPos(startTag.getElement().getEnd());
|
344 |
continue;
|
345 |
}
|
346 |
if (includeAttributes) {
|
347 |
for (Attribute attribute : startTag.getAttributes()) {
|
348 |
if (includeAttribute(startTag,attribute)) sb.append(' ').append(attribute.getValueSegment()).append(' ');
|
349 |
}
|
350 |
}
|
351 |
}
|
352 |
// Treat both start and end tags not belonging to inline-level elements as whitespace:
|
353 |
if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) sb.append(' ');
|
354 |
} else {
|
355 |
sb.append(segment);
|
356 |
}
|
357 |
}
|
358 |
final String decodedText=CharacterReference.decodeCollapseWhiteSpace(sb,convertNonBreakingSpaces);
|
359 |
return decodedText;
|
360 |
}
|
361 |
}
|
362 |
}
|