htmlparser/jericho/TextExtractor.java

// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.2
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import java.util.*;
import java.io.*;
import java.net.*;

/**
 * Extracts the textual content from HTML markup.
 * <p>
 * The output is ideal for feeding into a text search engine such as <a target="_blank" href="http://lucene.apache.org/java/">Apache Lucene</a>,
 * especially when the {@link #setIncludeAttributes(boolean) IncludeAttributes} property has been set to <code>true</code>.
 * <p>
 * Use one of the following methods to obtain the output:
 * <ul style="margin-top: 0">
 *  <li>{@link #writeTo(Writer)}</li>
 *  <li>{@link #appendTo(Appendable)}</li>
 *  <li>{@link #toString()}</li>
 *  <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
 * </ul>
 * <p>
 * The process removes all of the tags and
 * {@linkplain CharacterReference#decodeCollapseWhiteSpace(CharSequence) decodes the result, collapsing all white space}.
 * A space character is included in the output where a <a href="TagType.html#Normal">normal</a> tag is present in the source,
 * unless the tag belongs to an {@linkplain HTMLElements#getInlineLevelElementNames() inline-level} element.
 * An exception to this is the {@link HTMLElementName#BR BR} element, which is also converted to a space despite being an inline-level element.
 * <p>
 * Text inside {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements contained within this segment
 * is ignored.
 * <p>
 * Setting the {@link #setExcludeNonHTMLElements(boolean) ExcludeNonHTMLElements} property results in the exclusion of any content within a
 * <a href="HTMLElements.html#NonHTMLElement">non-HTML element</a>.
 * <p>
 * See the {@link #excludeElement(StartTag)} method for details on how to implement a more complex mechanism to determine whether the
 * {@linkplain Element#getContent() content} of each {@link Element} is to be excluded from the output.
 * <p>
 * All tags that are not <a href="TagType.html#Normal">normal</a> tags, such as {@linkplain TagType#isServerTag() server tags},
 * {@linkplain StartTagType#COMMENT comments} etc., are removed from the output without adding white space to the output.
 * <p>
 * Note that segments on which the {@link Segment#ignoreWhenParsing()} method has been called are treated as text rather than markup,
 * resulting in their inclusion in the output.
 * To remove specific segments before extracting the text, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
 * {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment to be removed.
 * Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
 * and perform the text extraction on this new source object.
 * <p>
 * Extracting the text from an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
 * <p>
 * To perform a simple rendering of HTML markup into text, which is more readable than the output of this class, use the {@link Renderer} class instead.
 * <dl>
 *  <dt>Example:</dt>
 *  <dd>Using the default settings, the source segment:<br />
 *   "<code>&lt;div&gt;&lt;b&gt;O&lt;/b&gt;ne&lt;/div&gt;&lt;div title="Two"&gt;&lt;b&gt;Th&lt;/b&gt;&lt;script&gt;//a&nbsp;script&nbsp;&lt;/script&gt;ree&lt;/div&gt;</code>"<br />
 *   produces the text "<code>One Two Three</code>".
 * </dl>
 */
public class TextExtractor implements CharStreamSource {
        // The segment from which the text is extracted.
        private final Segment segment;
        // Property values; each one has a matching setter and getter further down.
        private boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
        private boolean includeAttributes; // false unless setIncludeAttributes(true) is called
        private boolean excludeNonHTMLElements; // false unless setExcludeNonHTMLElements(true) is called

        // Maps each possibly included attribute name to an AttributeIncludeChecker instance; populated in the static block below.
        private static final Map<String,AttributeIncludeChecker> map;

        /**
         * Constructs a new <code>TextExtractor</code> based on the specified {@link Segment}.
         * @param segment  the segment from which the text will be extracted.
         * @see Segment#getTextExtractor()
         */
        public TextExtractor(final Segment segment) {
                // Only the reference is stored here; the extraction itself runs when the output is requested.
                this.segment = segment;
        }

        // Documentation inherited from CharStreamSource
        public void writeTo(final Writer writer) throws IOException {
                // Delegate to appendTo (a Writer is an Appendable), then flush the writer.
                final Appendable target=writer;
                appendTo(target);
                writer.flush();
        }

        // Documentation inherited from CharStreamSource
        public void appendTo(final Appendable appendable) throws IOException {
                // The complete output is built as a String first and then appended in a single call.
                final String extractedText=toString();
                appendable.append(extractedText);
        }

        // Documentation inherited from CharStreamSource
        public long getEstimatedMaximumOutputLength() {
                // The length of the source segment is used as the estimate.
                final long sourceLength=segment.length();
                return sourceLength;
        }

        // Documentation inherited from CharStreamSource
        public String toString() {
                // Read the current property values through the getters and hand them to a fresh Processor.
                final boolean convertNbsp=getConvertNonBreakingSpaces();
                final boolean withAttributes=getIncludeAttributes();
                final boolean htmlElementsOnly=getExcludeNonHTMLElements();
                final Processor processor=new Processor(segment,convertNbsp,withAttributes,htmlElementsOnly);
                return processor.toString();
        }

        /**
         * Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
         * <p>
         * The default value is that of the static {@link Config#ConvertNonBreakingSpaces} property at the time the <code>TextExtractor</code> is instantiated.
         *
         * @param convertNonBreakingSpaces  specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
         * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement. 
         * @see #getConvertNonBreakingSpaces()
         */
        public TextExtractor setConvertNonBreakingSpaces(boolean convertNonBreakingSpaces) {
                // Store the new value and return this instance so that setter calls can be chained.
                final TextExtractor self=this;
                self.convertNonBreakingSpaces=convertNonBreakingSpaces;
                return self;
        }

        /**
         * Indicates whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
         * <p>
         * See the {@link #setConvertNonBreakingSpaces(boolean)} method for a full description of this property.
         * 
         * @return <code>true</code> if non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces, otherwise <code>false</code>.
         */
        public boolean getConvertNonBreakingSpaces() {
                // Plain accessor for the current property value.
                return this.convertNonBreakingSpaces;
        }

        /**
         * Sets whether any attribute values are included in the output.
         * <p>
         * If the value of this property is <code>true</code>, then each attribute still has to match the conditions implemented in the
         * {@link #includeAttribute(StartTag,Attribute)} method in order for its value to be included in the output.
         * <p>
         * The default value is <code>false</code>.
         *
         * @param includeAttributes  specifies whether any attribute values are included in the output.
         * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement. 
         * @see #getIncludeAttributes()
         */
        public TextExtractor setIncludeAttributes(boolean includeAttributes) {
                // Store the new value and return this instance so that setter calls can be chained.
                final TextExtractor self=this;
                self.includeAttributes=includeAttributes;
                return self;
        }
        
        /**
         * Indicates whether any attribute values are included in the output.
         * <p>
         * See the {@link #setIncludeAttributes(boolean)} method for a full description of this property.
         * 
         * @return <code>true</code> if any attribute values are included in the output, otherwise <code>false</code>.
         */
        public boolean getIncludeAttributes() {
                // Plain accessor for the current property value.
                return this.includeAttributes;
        }

        /**
         * Indicates whether the value of the specified {@linkplain Attribute attribute} in the specified {@linkplain StartTag start tag} is included in the output.
         * <p>
         * This method is ignored if the {@link #setIncludeAttributes(boolean) IncludeAttributes} property is set to <code>false</code>, in which case
         * no attribute values are included in the output.
         * <p>
         * If the {@link #setIncludeAttributes(boolean) IncludeAttributes} property is set to <code>true</code>, every attribute of every
         * start tag encountered in the segment is checked using this method to determine whether the value of the attribute should be included in the output.
         * <p>
         * The default implementation of this method returns <code>true</code> if the {@linkplain Attribute#getName() name} of the specified {@linkplain Attribute attribute}
         * is one of
         * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a>,
         * <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a>,
         * <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#adef-label-OPTION">label</a>,
         * <a target="_blank" href="http://www.w3.org/TR/html401/struct/tables.html#adef-summary">summary</a>,
         * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-content">content</a>*, or
         * <a target="_blank" href="http://www.w3.org/TR/html401/struct/links.html#adef-href">href</a>,
         * but the method can be overridden in a subclass to perform a check of arbitrary complexity on each attribute.
         * <p>
         * * The value of a <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-content">content</a> attribute is only included if a 
         * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-name-META">name</a> attribute is also present in the specified start tag,
         * as the content attribute of a {@link HTMLElementName#META META} tag only contains human readable text if the name attribute is used as opposed to an
         * <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-http-equiv">http-equiv</a> attribute.
         * <p>
         * <dl>
         *  <dt>Example:</dt>
         *  <dd>
         *   To include only the value of <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a> and
         *   <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a> attributes:<br /><br />
         *   <code>
         *    final Set includeAttributeNames=new HashSet(Arrays.asList(new String[] {"title","alt"}));<br />
         *    TextExtractor textExtractor=new TextExtractor(segment) {<br />
         *    &nbsp; &nbsp; public boolean includeAttribute(StartTag startTag, Attribute attribute) {<br />
         *    &nbsp; &nbsp; &nbsp; &nbsp; return includeAttributeNames.contains(attribute.getKey());<br />
         *    &nbsp; &nbsp; }<br />
         *    };<br />
         *    textExtractor.setIncludeAttributes(true);<br />
         *    String extractedText=textExtractor.toString();
         *   </code>
         *  </dd>
         * </dl>
         * @param startTag  the start tag containing the specified attribute.
         * @param attribute  the attribute to check for inclusion.
         * @return <code>true</code> if the value of the specified attribute should be included in the output, otherwise <code>false</code>.
         */
        public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
                // Attribute names with no registered checker are never included; otherwise the checker decides.
                final AttributeIncludeChecker checker=map.get(attribute.getKey());
                return checker!=null && checker.includeAttribute(startTag,attribute);
        }

        /**
         * Sets whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
         * <p>
         * The default value is <code>false</code>, meaning that content from all elements meeting the other criteria is included.
         *
         * @param excludeNonHTMLElements  specifies whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
         * @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement. 
         * @see #getExcludeNonHTMLElements()
         */
        public TextExtractor setExcludeNonHTMLElements(boolean excludeNonHTMLElements) {
                // Store the new value and return this instance so that setter calls can be chained.
                final TextExtractor self=this;
                self.excludeNonHTMLElements=excludeNonHTMLElements;
                return self;
        }
        
        /**
         * Indicates whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
         * <p>
         * See the {@link #setExcludeNonHTMLElements(boolean)} method for a full description of this property.
         * 
         * @return <code>true</code> if the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output, otherwise <code>false</code>.
         */
        public boolean getExcludeNonHTMLElements() {
                // Plain accessor for the current property value.
                return this.excludeNonHTMLElements;
        }

        /**
         * Indicates whether the text inside the {@link Element} of the specified start tag should be excluded from the output.
         * <p>
         * During the text extraction process, every start tag encountered in the segment is checked using this method to determine whether the text inside its
         * {@linkplain StartTag#getElement() associated element} should be excluded from the output.
         * <p>
         * The default implementation of this method is to always return <code>false</code>, so that every element is included,
         * but the method can be overridden in a subclass to perform a check of arbitrary complexity on each start tag.
         * <p>
         * All elements nested inside an excluded element are also implicitly excluded, as are all
         * {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements.
         * Such elements are skipped over without calling this method, so there is no way to include them by overriding the method.
         * <p>
         * <dl>
         *  <dt>Example:</dt>
         *  <dd>
         *   To extract the text from a <code>segment</code>, excluding any text inside elements with the attribute <code>class="NotIndexed"</code>:<br /><br />
         *   <code>
         *    TextExtractor textExtractor=new TextExtractor(segment) {<br />
         *    &nbsp; &nbsp; public boolean excludeElement(StartTag startTag) {<br />
         *    &nbsp; &nbsp; &nbsp; &nbsp; return "NotIndexed".equalsIgnoreCase(startTag.getAttributeValue("class"));<br />
         *    &nbsp; &nbsp; }<br />
         *    };<br />
         *    String extractedText=textExtractor.toString();
         *   </code>
         *  </dd>
         * </dl>
         * @param startTag  the start tag of the element to check for inclusion.
         * @return <code>true</code> if the text inside the {@link Element} of the specified start tag should be excluded from the output, otherwise <code>false</code>.
         */
        public boolean excludeElement(final StartTag startTag) {
                // Default policy: no element is excluded. Subclasses may override this to filter elements.
                final boolean excluded=false;
                return excluded;
        }

        private static interface AttributeIncludeChecker {
                boolean includeAttribute(final StartTag startTag, final Attribute attribute);
        }

        /** Checker that includes the attribute unconditionally. Declared final as it is a shared constant. */
        private static final AttributeIncludeChecker ALWAYS_INCLUDE=new AttributeIncludeChecker() {
                public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
                        return true;
                }
        };

        /**
         * Checker that includes the attribute only if the start tag also has an attribute called "name".
         * Declared final as it is a shared constant.
         */
        private static final AttributeIncludeChecker INCLUDE_IF_NAME_ATTRIBUTE_PRESENT=new AttributeIncludeChecker() {
                public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
                        // Only the presence of the name attribute matters here, not its value.
                        return startTag.getAttributes().get("name")!=null;
                }
        };

        static {
                map=new HashMap<String,AttributeIncludeChecker>();
                map.put("title",ALWAYS_INCLUDE); // add title attribute
                map.put("alt",ALWAYS_INCLUDE); // add alt attribute (APPLET, AREA, IMG and INPUT elements)
                map.put("label",ALWAYS_INCLUDE); // add label attribute (OPTION and OPTGROUP elements)
                map.put("summary",ALWAYS_INCLUDE); // add summary attribute (TABLE element)
                map.put("content",INCLUDE_IF_NAME_ATTRIBUTE_PRESENT); // add content attribute (META element)
                map.put("href",ALWAYS_INCLUDE); // add href attribute (A, AREA and LINK elements)
                // don't bother with the prompt attribute from the ININDEX element as the element is deprecated and very rarely used.
        }

        /**
         * This class does the actual work, but is first passed final copies of all the parameters for efficiency.
         * Note at present this is not implemented in a memory-efficient manner.
         * Once the CharacterReference.decodeCollapseWhiteSpace functionality is available as a FilterWriter (possible with java 5 support),
         * the main algorithm can be implemented in the writeTo(Writer) method to allow for more memory-efficient processing.
         */
        private final class Processor {
                private final Segment segment;
                private final boolean convertNonBreakingSpaces;
                private final boolean includeAttributes;
                private final boolean excludeNonHTMLElements;

                /**
                 * Constructs a new processor for a single extraction run.
                 * @param segment  the segment from which the text is extracted.
                 * @param convertNonBreakingSpaces  passed on to the final decoding step.
                 * @param includeAttributes  whether attribute values accepted by <code>includeAttribute</code> are added to the output.
                 * @param excludeNonHTMLElements  whether elements whose name is not in <code>HTMLElements.getElementNames()</code> are skipped.
                 */
                public Processor(final Segment segment, final boolean convertNonBreakingSpaces, final boolean includeAttributes, final boolean excludeNonHTMLElements) {
                        this.segment=segment;
                        this.convertNonBreakingSpaces=convertNonBreakingSpaces;
                        this.includeAttributes=includeAttributes;
                        this.excludeNonHTMLElements=excludeNonHTMLElements;
                }

                /**
                 * Iterates over the nodes of the segment, collecting text and the spaces that stand in for tags,
                 * and returns the result of decoding the collected text and collapsing its white space.
                 * @return the extracted text.
                 */
                public String toString() {
                        final StringBuilder sb=new StringBuilder(segment.length());
                        for (NodeIterator nodeIterator=new NodeIterator(segment); nodeIterator.hasNext();) {
                                final Segment node=nodeIterator.next();
                                if (!(node instanceof Tag)) {
                                        // Anything that is not a tag is copied as is; decoding happens once after the loop.
                                        sb.append(node);
                                        continue;
                                }
                                final Tag tag=(Tag)node;
                                if (tag.getTagType().isServerTag()) {
                                        // elementContainsMarkup should be made into a TagType property one day.
                                        // for the time being assume all server element content is code, although this is not true for some Mason elements.
                                        final boolean elementContainsMarkup=false;
                                        if (!elementContainsMarkup) {
                                                final Element element=tag.getElement();
                                                if (element!=null && element.getEnd()>tag.getEnd()) nodeIterator.skipToPos(element.getEnd());
                                        }
                                        continue;
                                }
                                if (tag.getTagType()==StartTagType.NORMAL) {
                                        final StartTag startTag=(StartTag)tag;
                                        // NOTE(review): names are compared by identity as in the original code; presumably the parser interns standard element names - confirm before changing.
                                        if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE || excludeElement(startTag) || (excludeNonHTMLElements && !HTMLElements.getElementNames().contains(tag.name))) {
                                                // Skip the entire element, including everything nested inside it.
                                                nodeIterator.skipToPos(startTag.getElement().getEnd());
                                                continue;
                                        }
                                        if (includeAttributes) {
                                                for (Attribute attribute : startTag.getAttributes()) {
                                                        if (!includeAttribute(startTag,attribute)) continue;
                                                        // An attribute without a value (e.g. <img alt>) has no value segment.
                                                        // Appending the null reference would put the literal text "null" into the output, so skip it.
                                                        final Segment valueSegment=attribute.getValueSegment();
                                                        if (valueSegment!=null) sb.append(' ').append(valueSegment).append(' ');
                                                }
                                        }
                                }
                                // Treat both start and end tags not belonging to inline-level elements as whitespace:
                                if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) sb.append(' ');
                        }
                        return CharacterReference.decodeCollapseWhiteSpace(sb,convertNonBreakingSpaces);
                }
        }
}
1	// Jericho HTML Parser - Java based library for analysing and manipulating HTML
2	// Version 3.2
3	// Copyright (C) 2004-2009 Martin Jericho
4	// http://jericho.htmlparser.net/
5	//
6	// This library is free software; you can redistribute it and/or
7	// modify it under the terms of either one of the following licences:
8	//
9	// 1. The Eclipse Public License (EPL) version 1.0,
10	// included in this distribution in the file licence-epl-1.0.html
11	// or available at http://www.eclipse.org/legal/epl-v10.html
12	//
13	// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14	// included in this distribution in the file licence-lgpl-2.1.txt
15	// or available at http://www.gnu.org/licenses/lgpl.txt
16	//
17	// This library is distributed on an "AS IS" basis,
18	// WITHOUT WARRANTY OF ANY KIND, either express or implied.
19	// See the individual licence texts for more details.
20
21	package net.htmlparser.jericho;
22
23	import java.util.*;
24	import java.io.*;
25	import java.net.*;
26
27	/**
28	* Extracts the textual content from HTML markup.
29	* <p>
30	* The output is ideal for feeding into a text search engine such as <a target="_blank" href="http://lucene.apache.org/java/">Apache Lucene</a>,
31	* especially when the {@link #setIncludeAttributes(boolean) IncludeAttributes} property has been set to <code>true</code>.
32	* <p>
33	* Use one of the following methods to obtain the output:
34	* <ul style="margin-top: 0">
35	* <li>{@link #writeTo(Writer)}</li>
36	* <li>{@link #appendTo(Appendable)}</li>
37	* <li>{@link #toString()}</li>
38	* <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
39	* </ul>
40	* <p>
41	* The process removes all of the tags and
42	* {@linkplain CharacterReference#decodeCollapseWhiteSpace(CharSequence) decodes the result, collapsing all white space}.
43	* A space character is included in the output where a <a href="TagType.html#Normal">normal</a> tag is present in the source,
44	* unless the tag belongs to an {@linkplain HTMLElements#getInlineLevelElementNames() inline-level} element.
45	* An exception to this is the {@link HTMLElementName#BR BR} element, which is also converted to a space despite being an inline-level element.
46	* <p>
47	* Text inside {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements contained within this segment
48	* is ignored.
49	* <p>
50	* Setting the {@link #setExcludeNonHTMLElements(boolean) ExcludeNonHTMLElements} property results in the exclusion of any content within a
51	* <a href="HTMLElements.html#NonHTMLElement">non-HTML element</a>.
52	* <p>
53	* See the {@link #excludeElement(StartTag)} method for details on how to implement a more complex mechanism to determine whether the
54	* {@linkplain Element#getContent() content} of each {@link Element} is to be excluded from the output.
55	* <p>
56	* All tags that are not <a href="TagType.html#Normal">normal</a> tags, such as {@linkplain TagType#isServerTag() server tags},
57	* {@linkplain StartTagType#COMMENT comments} etc., are removed from the output without adding white space to the output.
58	* <p>
59	* Note that segments on which the {@link Segment#ignoreWhenParsing()} method has been called are treated as text rather than markup,
60	* resulting in their inclusion in the output.
61	* To remove specific segments before extracting the text, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
62	* {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment to be removed.
63	* Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
64	* and perform the text extraction on this new source object.
65	* <p>
66	* Extracting the text from an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
67	* <p>
68	* To perform a simple rendering of HTML markup into text, which is more readable than the output of this class, use the {@link Renderer} class instead.
69	* <dl>
70	* <dt>Example:</dt>
71	* <dd>Using the default settings, the source segment:<br />
72	* "<code><div><b>O</b>ne</div><div title="Two"><b>Th</b><script>//a script </script>ree</div></code>"<br />
73	* produces the text "<code>One Two Three</code>".
74	* </dl>
75	*/
76	public class TextExtractor implements CharStreamSource {
77	private final Segment segment;
78	private boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
79	private boolean includeAttributes=false;
80	private boolean excludeNonHTMLElements=false;
81
82	private static final Map<String,AttributeIncludeChecker> map; // maps each possibly included attribute name to an AttributeIncludeChecker instance, initialised in static block below.
83
84	/**
85	* Constructs a new <code>TextExtractor</code> based on the specified {@link Segment}.
86	* @param segment the segment from which the text will be extracted.
87	* @see Segment#getTextExtractor()
88	*/
89	public TextExtractor(final Segment segment) {
90	this.segment=segment;
91	}
92
93	// Documentation inherited from CharStreamSource
94	public void writeTo(final Writer writer) throws IOException {
95	appendTo(writer);
96	writer.flush();
97	}
98
99	// Documentation inherited from CharStreamSource
100	public void appendTo(final Appendable appendable) throws IOException {
101	appendable.append(toString());
102	}
103
104	// Documentation inherited from CharStreamSource
105	public long getEstimatedMaximumOutputLength() {
106	return segment.length();
107	}
108
109	// Documentation inherited from CharStreamSource
110	public String toString() {
111	return new Processor(segment,getConvertNonBreakingSpaces(),getIncludeAttributes(),getExcludeNonHTMLElements()).toString();
112	}
113
114	/**
115	* Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces.
116	* <p>
117	* The default value is that of the static {@link Config#ConvertNonBreakingSpaces} property at the time the <code>TextExtractor</code> is instantiated.
118	*
119	* @param convertNonBreakingSpaces specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces.
120	* @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
121	* @see #getConvertNonBreakingSpaces()
122	*/
123	public TextExtractor setConvertNonBreakingSpaces(boolean convertNonBreakingSpaces) {
124	this.convertNonBreakingSpaces=convertNonBreakingSpaces;
125	return this;
126	}
127
128	/**
129	* Indicates whether non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces.
130	* <p>
131	* See the {@link #setConvertNonBreakingSpaces(boolean)} method for a full description of this property.
132	*
133	* @return <code>true</code> if non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character entity references are converted to spaces, otherwise <code>false</code>.
134	*/
135	public boolean getConvertNonBreakingSpaces() {
136	return convertNonBreakingSpaces;
137	}
138
139	/**
140	* Sets whether any attribute values are included in the output.
141	* <p>
142	* If the value of this property is <code>true</code>, then each attribute still has to match the conditions implemented in the
143	* {@link #includeAttribute(StartTag,Attribute)} method in order for its value to be included in the output.
144	* <p>
145	* The default value is <code>false</code>.
146	*
147	* @param includeAttributes specifies whether any attribute values are included in the output.
148	* @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
149	* @see #getIncludeAttributes()
150	*/
151	public TextExtractor setIncludeAttributes(boolean includeAttributes) {
152	this.includeAttributes=includeAttributes;
153	return this;
154	}
155
156	/**
157	* Indicates whether any attribute values are included in the output.
158	* <p>
159	* See the {@link #setIncludeAttributes(boolean)} method for a full description of this property.
160	*
161	* @return <code>true</code> if any attribute values are included in the output, otherwise <code>false</code>.
162	*/
163	public boolean getIncludeAttributes() {
164	return includeAttributes;
165	}
166
167	/**
168	* Indicates whether the value of the specified {@linkplain Attribute attribute} in the specified {@linkplain StartTag start tag} is included in the output.
169	* <p>
170	* This method is ignored if the {@link #setIncludeAttributes(boolean) IncludeAttributes} property is set to <code>false</code>, in which case
171	* no attribute values are included in the output.
172	* <p>
173	* If the {@link #setIncludeAttributes(boolean) IncludeAttributes} property is set to <code>true</code>, every attribute of every
174	* start tag encountered in the segment is checked using this method to determine whether the value of the attribute should be included in the output.
175	* <p>
176	* The default implementation of this method returns <code>true</code> if the {@linkplain Attribute#getName() name} of the specified {@linkplain Attribute attribute}
177	* is one of
178	* <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a>,
179	* <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a>,
180	* <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#adef-label-OPTION">label</a>,
181	* <a target="_blank" href="http://www.w3.org/TR/html401/struct/tables.html#adef-summary">summary</a>,
182	* <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-content">content</a>*, or
183	* <a target="_blank" href="http://www.w3.org/TR/html401/struct/links.html#adef-href">href</a>,
184	* but the method can be overridden in a subclass to perform a check of arbitrary complexity on each attribute.
185	* <p>
186	* * The value of a <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-content">content</a> attribute is only included if a
187	* <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-name-META">name</a> attribute is also present in the specified start tag,
188	* as the content attribute of a {@link HTMLElementName#META META} tag only contains human readable text if the name attribute is used as opposed to an
189	* <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-http-equiv">http-equiv</a> attribute.
190	* <p>
191	* <dl>
192	* <dt>Example:</dt>
193	* <dd>
194	* To include only the value of <a target="_blank" href="http://www.w3.org/TR/html401/struct/global.html#adef-title">title</a> and
195	* <a target="_blank" href="http://www.w3.org/TR/html401/struct/objects.html#adef-alt">alt</a> attributes:<br /><br />
196	* <code>
197	* final Set includeAttributeNames=new HashSet(Arrays.asList(new String[] {"title","alt"}));<br />
198	* TextExtractor textExtractor=new TextExtractor(segment) {<br />
199	*     public boolean includeAttribute(StartTag startTag, Attribute attribute) {<br />
200	*         return includeAttributeNames.contains(attribute.getKey());<br />
201	*     }<br />
202	* };<br />
203	* textExtractor.setIncludeAttributes(true);<br />
204	* String extractedText=textExtractor.toString();
205	* </code>
206	* </dd>
207	* </dl>
208	* @param startTag the start tag of the element to check for inclusion.
209	* @return <true> if the text inside the {@link Element} of the specified start tag should be excluded from the output, otherwise <code>false</code>.
210	*/
211	public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
212	AttributeIncludeChecker attributeIncludeChecker=map.get(attribute.getKey());
213	if (attributeIncludeChecker==null) return false;
214	return attributeIncludeChecker.includeAttribute(startTag,attribute);
215	}
216
217	/**
218	* Sets whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
219	* <p>
220	* The default value is <code>false</code>, meaning that content from all elements meeting the other criteria is included.
221	*
222	* @param excludeNonHTMLElements specifies whether content <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
223	* @return this <code>TextExtractor</code> instance, allowing multiple property setting methods to be chained in a single statement.
224	* @see #getExcludeNonHTMLElements()
225	*/
226	public TextExtractor setExcludeNonHTMLElements(boolean excludeNonHTMLElements) {
227	this.excludeNonHTMLElements=excludeNonHTMLElements;
228	return this;
229	}
230
231	/**
232	* Indicates whether the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output.
233	* <p>
234	* See the {@link #setExcludeNonHTMLElements(boolean)} method for a full description of this property.
235	*
236	* @return <code>true</code> if the content of <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a> is excluded from the output, otherwise <code>false</code>.
237	*/
238	public boolean getExcludeNonHTMLElements() {
239	return excludeNonHTMLElements;
240	}
241
242	/**
243	* Indicates whether the text inside the {@link Element} of the specified start tag should be excluded from the output.
244	* <p>
245	* During the text extraction process, every start tag encountered in the segment is checked using this method to determine whether the text inside its
246	* {@linkplain StartTag#getElement() associated element} should be excluded from the output.
247	* <p>
248	* The default implementation of this method is to always return <code>false</code>, so that every element is included,
249	* but the method can be overridden in a subclass to perform a check of arbitrary complexity on each start tag.
250	* <p>
251	* All elements nested inside an excluded element are also implicitly excluded, as are all
252	* {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements.
253	* Such elements are skipped over without calling this method, so there is no way to include them by overriding the method.
254	* <p>
255	* <dl>
256	* <dt>Example:</dt>
257	* <dd>
258	* To extract the text from a <code>segment</code>, excluding any text inside elements with the attribute <code>class="NotIndexed"</code>:<br /><br />
259	* <code>
260	* TextExtractor textExtractor=new TextExtractor(segment) {<br />
261	*     public boolean excludeElement(StartTag startTag) {<br />
262	*         return "NotIndexed".equalsIgnoreCase(startTag.getAttributeValue("class"));<br />
263	*     }<br />
264	* };<br />
265	* String extractedText=textExtractor.toString();
266	* </code>
267	* </dd>
268	* </dl>
269	* @param startTag the start tag of the element to check for inclusion.
270	* @return <true> if the text inside the {@link Element} of the specified start tag should be excluded from the output, otherwise <code>false</code>.
271	*/
272	public boolean excludeElement(final StartTag startTag) {
273	return false;
274	}
275
276	private static interface AttributeIncludeChecker {
277	boolean includeAttribute(final StartTag startTag, final Attribute attribute);
278	}
279
280	private static AttributeIncludeChecker ALWAYS_INCLUDE=new AttributeIncludeChecker() {
281	public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
282	return true;
283	}
284	};
285
286	private static AttributeIncludeChecker INCLUDE_IF_NAME_ATTRIBUTE_PRESENT=new AttributeIncludeChecker() {
287	public boolean includeAttribute(final StartTag startTag, final Attribute attribute) {
288	return startTag.getAttributes().get("name")!=null;
289	}
290	};
291
292	static {
293	map=new HashMap<String,AttributeIncludeChecker>();
294	map.put("title",ALWAYS_INCLUDE); // add title attribute
295	map.put("alt",ALWAYS_INCLUDE); // add alt attribute (APPLET, AREA, IMG and INPUT elements)
296	map.put("label",ALWAYS_INCLUDE); // add label attribute (OPTION and OPTGROUP elements)
297	map.put("summary",ALWAYS_INCLUDE); // add summary attribute (TABLE element)
298	map.put("content",INCLUDE_IF_NAME_ATTRIBUTE_PRESENT); // add content attribute (META element)
299	map.put("href",ALWAYS_INCLUDE); // add href attribute (A, AREA and LINK elements)
300	// don't bother with the prompt attribute from the ININDEX element as the element is deprecated and very rarely used.
301	}
302
303	/**
304	* This class does the actual work, but is first passed final copies of all the parameters for efficiency.
305	* Note at present this is not implemented in a memory-efficient manner.
306	* Once the CharacterReference.decodeCollapseWhiteSpace functionality is available as a FilterWriter (possible with java 5 support),
307	* the main algorithm can be implemented in the writeTo(Writer) method to allow for more memory-efficient processing.
308	*/
309	private final class Processor {
310	private final Segment segment;
311	private final Source source;
312	private final boolean convertNonBreakingSpaces;
313	private final boolean includeAttributes;
314	private final boolean excludeNonHTMLElements;
315
316	public Processor(final Segment segment, final boolean convertNonBreakingSpaces, final boolean includeAttributes, final boolean excludeNonHTMLElements) {
317	this.segment=segment;
318	source=segment.source;
319	this.convertNonBreakingSpaces=convertNonBreakingSpaces;
320	this.includeAttributes=includeAttributes;
321	this.excludeNonHTMLElements=excludeNonHTMLElements;
322	}
323
324	public String toString() {
325	final StringBuilder sb=new StringBuilder(segment.length());
326	for (NodeIterator nodeIterator=new NodeIterator(segment); nodeIterator.hasNext();) {
327	Segment segment=nodeIterator.next();
328	if (segment instanceof Tag) {
329	final Tag tag=(Tag)segment;
330	if (tag.getTagType().isServerTag()) {
331	// elementContainsMarkup should be made into a TagType property one day.
332	// for the time being assume all server element content is code, although this is not true for some Mason elements.
333	final boolean elementContainsMarkup=false;
334	if (!elementContainsMarkup) {
335	final Element element=tag.getElement();
336	if (element!=null && element.getEnd()>tag.getEnd()) nodeIterator.skipToPos(element.getEnd());
337	}
338	continue;
339	}
340	if (tag.getTagType()==StartTagType.NORMAL) {
341	final StartTag startTag=(StartTag)tag;
342	if (tag.name==HTMLElementName.SCRIPT \|\| tag.name==HTMLElementName.STYLE \|\| excludeElement(startTag) \|\| (excludeNonHTMLElements && !HTMLElements.getElementNames().contains(tag.name))) {
343	nodeIterator.skipToPos(startTag.getElement().getEnd());
344	continue;
345	}
346	if (includeAttributes) {
347	for (Attribute attribute : startTag.getAttributes()) {
348	if (includeAttribute(startTag,attribute)) sb.append(' ').append(attribute.getValueSegment()).append(' ');
349	}
350	}
351	}
352	// Treat both start and end tags not belonging to inline-level elements as whitespace:
353	if (tag.getName()==HTMLElementName.BR \|\| !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) sb.append(' ');
354	} else {
355	sb.append(segment);
356	}
357	}
358	final String decodedText=CharacterReference.decodeCollapseWhiteSpace(sb,convertNonBreakingSpaces);
359	return decodedText;
360	}
361	}
362	}