/[aagtl_public1]/src/net/htmlparser/jericho/TextExtractor.java
aagtl

Contents of /src/net/htmlparser/jericho/TextExtractor.java

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2 - (show annotations) (download)
Sun Aug 5 13:48:36 2012 UTC (11 years, 7 months ago) by zoffadmin
File size: 19454 byte(s)
initial import of aagtl source code
1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20
21 package net.htmlparser.jericho;
22
23 import java.util.*;
24 import java.io.*;
25 import java.net.*;
26
27 /**
28 * Extracts the textual content from HTML markup.
29 * <p>
30 * The output is ideal for feeding into a text search engine such as <a target="_blank" href="http://lucene.apache.org/java/">Apache Lucene</a>,
31 * especially when the {@link #setIncludeAttributes(boolean) IncludeAttributes} property has been set to <code>true</code>.
32 * <p>
33 * Use one of the following methods to obtain the output:
34 * <ul style="margin-top: 0">
35 * <li>{@link #writeTo(Writer)}</li>
36 * <li>{@link #appendTo(Appendable)}</li>
37 * <li>{@link #toString()}</li>
38 * <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
39 * </ul>
40 * <p>
41 * The process removes all of the tags and
42 * {@linkplain CharacterReference#decodeCollapseWhiteSpace(CharSequence) decodes the result, collapsing all white space}.
43 * A space character is included in the output where a <a href="TagType.html#Normal">normal</a> tag is present in the source,
44 * unless the tag belongs to an {@linkplain HTMLElements#getInlineLevelElementNames() inline-level} element.
45 * An exception to this is the {@link HTMLElementName#BR BR} element, which is also converted to a space despite being an inline-level element.
46 * <p>
47 * Text inside {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements contained within this segment
48 * is ignored.
49 * <p>
50 * Setting the {@link #setExcludeNonHTMLElements(boolean) ExcludeNonHTMLElements} property results in the exclusion of any content within a
51 * <a href="HTMLElements.html#NonHTMLElement">non-HTML element</a>.
52 * <p>
53 * See the {@link #excludeElement(StartTag)} method for details on how to implement a more complex mechanism to determine whether the
54 * {@linkplain Element#getContent() content} of each {@link Element} is to be excluded from the output.
55 * <p>
56 * All tags that are not <a href="TagType.html#Normal">normal</a> tags, such as {@linkplain TagType#isServerTag() server tags},
57 * {@linkplain StartTagType#COMMENT comments} etc., are removed from the output without adding white space to the output.
58 * <p>
59 * Note that segments on which the {@link Segment#ignoreWhenParsing()} method has been called are treated as text rather than markup,
60 * resulting in their inclusion in the output.
61 * To remove specific segments before extracting the text, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
62 * {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment to be removed.
63 * Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
64 * and perform the text extraction on this new source object.
65 * <p>
66 * Extracting the text from an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
67 * <p>
68 * To perform a simple rendering of HTML markup into text, which is more readable than the output of this class, use the {@link Renderer} class instead.
69 * <dl>
70 * <dt>Example:</dt>
71 * <dd>Using the default settings, the source segment:<br />
72 * "<code>&lt;div&gt;&lt;b&gt;O&lt;/b&gt;ne&lt;/div&gt;&lt;div title="Two"&gt;&lt;b&gt;Th&lt;/b&gt;&lt;script&gt;//a&nbsp;script&nbsp;&lt;/script&gt;ree&lt;/div&gt;</code>"<br />
73 * produces the text "<code>One Two Three</code>".
74 * </dl>
75 */
76 public class TextExtractor implements CharStreamSource {
77 private final Segment segment;
78 private boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
79 private boolean includeAttributes=false;
80 private boolean excludeNonHTMLElements=false;
81
82 private static final Map<String,AttributeIncludeChecker> map; // maps each possibly included attribute name to an AttributeIncludeChecker instance, initialised in static block below.
83
84 /**
85 * Constructs a new <code>TextExtractor</code> based on the specified {@link Segment}.
86 * @param segment the segment from which the text will be extracted.
87 * @see Segment#getTextExtractor()
88 */
89 public TextExtractor(final Segment segment) {
90 this.segment=segment;
91 }
92
93 // Documentation inherited from CharStreamSource
94 public void writeTo(final Writer writer) throws IOException {
95 appendTo(writer);
96 writer.flush();
97 }
98
99 // Documentation inherited from CharStreamSource
100 public void appendTo(final Appendable appendable) throws IOException {
101 appendable.append(toString());
102 }
103
104 // Documentation inherited from CharStreamSource
105 public long getEstimatedMaximumOutputLength() {
106 return segment.length();
107 }
108
109 // Documentation inherited from CharStreamSource
110 public String toString() {
111 return new Processor(segment,getConvertNonBreakingSpaces(),getIncludeAttributes(),getExcludeNonHTMLElements()).toString();
112 }
113
114 /**
115 * Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
116 * <p>
117 * The default value is that of the static {@link Config#ConvertNonBreakingSpaces} property at the time the <code>TextExtractor</code> is instantiated.
118 *
119 * @param convertNonBreakingSpaces specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
120 * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
121 * @see #getConvertNonBreakingSpaces()
122 */
123 public TextExtractor setConvertNonBreakingSpaces(boolean convertNonBreakingSpaces) {
124 this.convertNonBreakingSpaces=convertNonBreakingSpaces;
125 return this;
126 }
127
128 /**
129 * Indicates whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
130 * <p>
131 * See the {@link #setConvertNonBreakingSpaces(boolean)} method for a full description of this property.
132 *
133 * @return <code>true</code> if non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces, otherwise <code>false</code>.
134 */
135 public boolean getConvertNonBreakingSpaces() {
136 return convertNonBreakingSpaces;
137 }
138
139 /**
140 * Sets whether any attribute values are included in the output.
141 * <p>
142 * If the value of this property is <code>true</code>, then each attribute still has to match the conditions implemented in the
143 * {@link #includeAttribute(StartTag,Attribute)} method in order for its value to be included in the output.
144 * <p>
145 * The default value is <code>false</code>.
146 *
147 * @param includeAttributes specifies whether any attribute values are included in the output.
148 * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
149 * @see #getIncludeAttributes()
150 */
151 public TextExtractor setIncludeAttributes(boolean includeAttributes) {
152 this.includeAttributes=includeAttributes;
153 return this;
154 }
155
156 /**
157 * Indicates whether any attribute values are included in the output.
158 * <p>
159 * See the {@link #setIncludeAttributes(boolean)} method for a full description of this property.
160 *
161 * @return <code>true</code> if any attribute values are included in the output, otherwise <code>false</code>.
162 */
163 public boolean getIncludeAttributes() {
164 return includeAttributes;
165 }
166
167 /**
168 * Indicates whether the value of the specified {@linkplain Attribute attribute} in the specified {@linkplain StartTag start tag} is included in the output.
169 * <p>
170 * This method is ignored if the {@link #setIncludeAttributes(boolean) IncludeAttributes} property is set to <code>false</code>, in which case
171 * no attribute values are included in the output.
172 * <p>
173 * If the {@link #setIncludeAttributes(boolean) IncludeAttributes} property is set to <code>true</code>, every attribute of every
174 * start tag encountered in the segment is checked using this method to determine whether the value of the attribute should be included in the output.
175 * <p>
176 * The default implementation of this method returns <code>true</code> if the {@linkplain Attribute#getName() name} of the specified {@linkplain Attribute attribute}
177 * is one of
178 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a>,
179 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a>,
180 * <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#adef-label-OPTION">label</a>,
181 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/tables.html#adef-summary">summary</a>,
182 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-content">content</a>*, or
183 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/links.html#adef-href">href</a>,
184 * but the method can be overridden in a subclass to perform a check of arbitrary complexity on each attribute.
185 * <p>
186 * * The value of a <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-content">content</a> attribute is only included if a
187 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-name-META">name</a> attribute is also present in the specified start tag,
188 * as the content attribute of a {@link HTMLElementName#META META} tag only contains human readable text if the name attribute is used as opposed to an
189 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-http-equiv">http-equiv</a> attribute.
190 * <p>
191 * <dl>
192 * <dt>Example:</dt>
193 * <dd>
194 * To include only the value of <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a> and
195 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a> attributes:<br /><br />
196 * <code>
197 * final Set includeAttributeNames=new HashSet(Arrays.asList(new String[] {"title","alt"}));<br />
198 * TextExtractor textExtractor=new TextExtractor(segment) {<br />
199 * &nbsp; &nbsp; public boolean includeAttribute(StartTag startTag, Attribute attribute) {<br />
200 * &nbsp; &nbsp; &nbsp; &nbsp; return includeAttributeNames.contains(attribute.getKey());<br />
201 * &nbsp; &nbsp; }<br />
202 * };<br />
203 * textExtractor.setIncludeAttributes(true);<br />
204 * String extractedText=textExtractor.toString();
205 * </code>
206 * </dd>
207 * </dl>
208 * @param startTag the start tag of the element to check for inclusion.
209 * @return <true> if the text inside the {@link Element} of the specified start tag should be excluded from the output, otherwise <code>false</code>.
210 */
211 public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
212 AttributeIncludeChecker attributeIncludeChecker=map.get(attribute.getKey());
213 if (attributeIncludeChecker==null) return false;
214 return attributeIncludeChecker.includeAttribute(startTag,attribute);
215 }
216
217 /**
218 * Sets whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
219 * <p>
220 * The default value is <code>false</code>, meaning that content from all elements meeting the other criteria is included.
221 *
222 * @param excludeNonHTMLElements specifies whether content <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
223 * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
224 * @see #getExcludeNonHTMLElements()
225 */
226 public TextExtractor setExcludeNonHTMLElements(boolean excludeNonHTMLElements) {
227 this.excludeNonHTMLElements=excludeNonHTMLElements;
228 return this;
229 }
230
231 /**
232 * Indicates whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
233 * <p>
234 * See the {@link #setExcludeNonHTMLElements(boolean)} method for a full description of this property.
235 *
236 * @return <code>true</code> if the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output, otherwise <code>false</code>.
237 */
238 public boolean getExcludeNonHTMLElements() {
239 return excludeNonHTMLElements;
240 }
241
242 /**
243 * Indicates whether the text inside the {@link Element} of the specified start tag should be excluded from the output.
244 * <p>
245 * During the text extraction process, every start tag encountered in the segment is checked using this method to determine whether the text inside its
246 * {@linkplain StartTag#getElement() associated element} should be excluded from the output.
247 * <p>
248 * The default implementation of this method is to always return <code>false</code>, so that every element is included,
249 * but the method can be overridden in a subclass to perform a check of arbitrary complexity on each start tag.
250 * <p>
251 * All elements nested inside an excluded element are also implicitly excluded, as are all
252 * {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements.
253 * Such elements are skipped over without calling this method, so there is no way to include them by overriding the method.
254 * <p>
255 * <dl>
256 * <dt>Example:</dt>
257 * <dd>
258 * To extract the text from a <code>segment</code>, excluding any text inside elements with the attribute <code>class="NotIndexed"</code>:<br /><br />
259 * <code>
260 * TextExtractor textExtractor=new TextExtractor(segment) {<br />
261 * &nbsp; &nbsp; public boolean excludeElement(StartTag startTag) {<br />
262 * &nbsp; &nbsp; &nbsp; &nbsp; return "NotIndexed".equalsIgnoreCase(startTag.getAttributeValue("class"));<br />
263 * &nbsp; &nbsp; }<br />
264 * };<br />
265 * String extractedText=textExtractor.toString();
266 * </code>
267 * </dd>
268 * </dl>
269 * @param startTag the start tag of the element to check for inclusion.
270 * @return <true> if the text inside the {@link Element} of the specified start tag should be excluded from the output, otherwise <code>false</code>.
271 */
272 public boolean excludeElement(final StartTag startTag) {
273 return false;
274 }
275
276 private static interface AttributeIncludeChecker {
277 boolean includeAttribute(final StartTag startTag, final Attribute attribute);
278 }
279
280 private static AttributeIncludeChecker ALWAYS_INCLUDE=new AttributeIncludeChecker() {
281 public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
282 return true;
283 }
284 };
285
286 private static AttributeIncludeChecker INCLUDE_IF_NAME_ATTRIBUTE_PRESENT=new AttributeIncludeChecker() {
287 public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
288 return startTag.getAttributes().get("name")!=null;
289 }
290 };
291
292 static {
293 map=new HashMap<String,AttributeIncludeChecker>();
294 map.put("title",ALWAYS_INCLUDE); // add title attribute
295 map.put("alt",ALWAYS_INCLUDE); // add alt attribute (APPLET, AREA, IMG and INPUT elements)
296 map.put("label",ALWAYS_INCLUDE); // add label attribute (OPTION and OPTGROUP elements)
297 map.put("summary",ALWAYS_INCLUDE); // add summary attribute (TABLE element)
298 map.put("content",INCLUDE_IF_NAME_ATTRIBUTE_PRESENT); // add content attribute (META element)
299 map.put("href",ALWAYS_INCLUDE); // add href attribute (A, AREA and LINK elements)
300 // don't bother with the prompt attribute from the ININDEX element as the element is deprecated and very rarely used.
301 }
302
303 /**
304 * This class does the actual work, but is first passed final copies of all the parameters for efficiency.
305 * Note at present this is not implemented in a memory-efficient manner.
306 * Once the CharacterReference.decodeCollapseWhiteSpace functionality is available as a FilterWriter (possible with java 5 support),
307 * the main algorithm can be implemented in the writeTo(Writer) method to allow for more memory-efficient processing.
308 */
309 private final class Processor {
310 private final Segment segment;
311 private final Source source;
312 private final boolean convertNonBreakingSpaces;
313 private final boolean includeAttributes;
314 private final boolean excludeNonHTMLElements;
315
316 public Processor(final Segment segment, final boolean convertNonBreakingSpaces, final boolean includeAttributes, final boolean excludeNonHTMLElements) {
317 this.segment=segment;
318 source=segment.source;
319 this.convertNonBreakingSpaces=convertNonBreakingSpaces;
320 this.includeAttributes=includeAttributes;
321 this.excludeNonHTMLElements=excludeNonHTMLElements;
322 }
323
324 public String toString() {
325 final StringBuilder sb=new StringBuilder(segment.length());
326 for (NodeIterator nodeIterator=new NodeIterator(segment); nodeIterator.hasNext();) {
327 Segment segment=nodeIterator.next();
328 if (segment instanceof Tag) {
329 final Tag tag=(Tag)segment;
330 if (tag.getTagType().isServerTag()) {
331 // elementContainsMarkup should be made into a TagType property one day.
332 // for the time being assume all server element content is code, although this is not true for some Mason elements.
333 final boolean elementContainsMarkup=false;
334 if (!elementContainsMarkup) {
335 final Element element=tag.getElement();
336 if (element!=null && element.getEnd()>tag.getEnd()) nodeIterator.skipToPos(element.getEnd());
337 }
338 continue;
339 }
340 if (tag.getTagType()==StartTagType.NORMAL) {
341 final StartTag startTag=(StartTag)tag;
342 if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE || excludeElement(startTag) || (excludeNonHTMLElements && !HTMLElements.getElementNames().contains(tag.name))) {
343 nodeIterator.skipToPos(startTag.getElement().getEnd());
344 continue;
345 }
346 if (includeAttributes) {
347 for (Attribute attribute : startTag.getAttributes()) {
348 if (includeAttribute(startTag,attribute)) sb.append(' ').append(attribute.getValueSegment()).append(' ');
349 }
350 }
351 }
352 // Treat both start and end tags not belonging to inline-level elements as whitespace:
353 if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) sb.append(' ');
354 } else {
355 sb.append(segment);
356 }
357 }
358 final String decodedText=CharacterReference.decodeCollapseWhiteSpace(sb,convertNonBreakingSpaces);
359 return decodedText;
360 }
361 }
362 }

   
Visit the aagtl Website