// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.2
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
20 |
|
21 |
package net.htmlparser.jericho;
|
22 |
|
23 |
import java.util.*;
|
24 |
|
25 |
/**
|
26 |
* Provides a generic implementation of the abstract {@link StartTagType} class based on the most common start tag behaviour.
|
27 |
* <p>
|
28 |
* This class is only of interest to users who wish to create <a href="TagType.html#Custom">custom tag types</a>.
|
29 |
* <p>
|
30 |
* The only external difference between this class and its abstract superclass {@link StartTagType} is that it provides a default
|
31 |
* implementation of the {@link #constructTagAt(Source, int pos)} method.
|
32 |
* <p>
|
33 |
* Most of the <a href="Tag.html#Predefined">predefined</a> start tag types are implemented using this class or a subclass of it.
|
34 |
*
|
35 |
* @see EndTagTypeGenericImplementation
|
36 |
*/
|
37 |
public class StartTagTypeGenericImplementation extends StartTagType {
|
38 |
final boolean nameCharAfterPrefixAllowed;
|
39 |
|
40 |
/**
|
41 |
* Constructs a new <code>StartTagTypeGenericImplementation</code> object with the specified properties.
|
42 |
* <br />(<a href="TagType.html#ImplementationAssistance">implementation assistance</a> method)
|
43 |
* <p>
|
44 |
* This is equivalent to calling
|
45 |
* <br /><code>new </code>{@link #StartTagTypeGenericImplementation(String,String,String,EndTagType,boolean,boolean,boolean) StartTagTypeGenericImplementation}<code>(description,startDelimiter,closingDelimiter,correspondingEndTagType,isServerTag,false,false)</code>.
|
46 |
*
|
47 |
* @param description a {@linkplain #getDescription() description} of the new start tag type useful for debugging purposes.
|
48 |
* @param startDelimiter the {@linkplain #getStartDelimiter() start delimiter} of the new start tag type.
|
49 |
* @param closingDelimiter the {@linkplain #getClosingDelimiter() closing delimiter} of the new start tag type.
|
50 |
* @param correspondingEndTagType the {@linkplain #getCorrespondingEndTagType() corresponding end tag type} of the new start tag type.
|
51 |
* @param isServerTag indicates whether the new start tag type is a {@linkplain #isServerTag() server tag}.
|
52 |
*/
|
53 |
protected StartTagTypeGenericImplementation(final String description, final String startDelimiter, final String closingDelimiter, final EndTagType correspondingEndTagType, final boolean isServerTag) {
|
54 |
this(description,startDelimiter,closingDelimiter,correspondingEndTagType,isServerTag,false,false);
|
55 |
}
|
56 |
|
57 |
/**
|
58 |
* Constructs a new <code>StartTagTypeGenericImplementation</code> object with the specified properties.
|
59 |
* <br />(<a href="TagType.html#ImplementationAssistance">implementation assistance</a> method)
|
60 |
*
|
61 |
* @param description a {@linkplain #getDescription() description} of the new start tag type useful for debugging purposes.
|
62 |
* @param startDelimiter the {@linkplain #getStartDelimiter() start delimiter} of the new start tag type.
|
63 |
* @param closingDelimiter the {@linkplain #getClosingDelimiter() closing delimiter} of the new start tag type.
|
64 |
* @param correspondingEndTagType the {@linkplain #getCorrespondingEndTagType() corresponding end tag type} of the new start tag type.
|
65 |
* @param isServerTag indicates whether the new start tag type is a {@linkplain #isServerTag() server tag}.
|
66 |
* @param hasAttributes indicates whether the new start tag type {@linkplain #hasAttributes() has attributes}.
|
67 |
* @param isNameAfterPrefixRequired indicates whether a {@linkplain #isNameAfterPrefixRequired() name is required after the prefix}.
|
68 |
*/
|
69 |
protected StartTagTypeGenericImplementation(final String description, final String startDelimiter, final String closingDelimiter, final EndTagType correspondingEndTagType, final boolean isServerTag, final boolean hasAttributes, final boolean isNameAfterPrefixRequired) {
|
70 |
super(description,startDelimiter,closingDelimiter,correspondingEndTagType,isServerTag,hasAttributes,isNameAfterPrefixRequired);
|
71 |
nameCharAfterPrefixAllowed=(getNamePrefix().length()==0 || !Character.isLetter(getNamePrefix().charAt(getNamePrefix().length()-1)));
|
72 |
}
|
73 |
|
74 |
/**
|
75 |
* Constructs a tag of this type at the specified position in the specified source document if it matches all of the required features.
|
76 |
* <br />(<a href="TagType.html#DefaultImplementation">default implementation</a> method)
|
77 |
* <p>
|
78 |
* This default implementation performs the following steps:
|
79 |
* <ol class="Separated">
|
80 |
* <li>
|
81 |
* If a {@linkplain #isNameAfterPrefixRequired() name is required after the prefix}, search for a valid
|
82 |
* {@linkplain Tag#isXMLName(CharSequence) XML tag name} directly after the
|
83 |
* {@linkplain #getNamePrefix() name prefix} using the {@link Source#getNameEnd(int pos)} method.
|
84 |
* If one is found, set the {@linkplain Tag#getName() name} to include it, otherwise return <code>null</code>.
|
85 |
* <li>
|
86 |
* If the last character of the {@linkplain #getNamePrefix() name prefix} is a letter
|
87 |
* (indicating that the prefix includes the full {@linkplain Tag#getName() name} of the tag),
|
88 |
* and the character following the prefix in the source text is also a letter
|
89 |
* or any other valid {@linkplain Tag#isXMLNameChar(char) XML name character},
|
90 |
* return <code>null</code>.
|
91 |
* <br />Example: the source text "<code><?xmlt ?></code>" should not be recognised as an
|
92 |
* {@linkplain #XML_PROCESSING_INSTRUCTION XML processing instruction}, which has the prefix "<code><?xml</code>".
|
93 |
* <li>
|
94 |
* If the tag type {@linkplain #hasAttributes() has attributes}, call
|
95 |
* {@link #parseAttributes(Source,int,String) parseAttributes(source,pos,name)} to parse them.
|
96 |
* Return <code>null</code> if too many errors occur while parsing the attributes.
|
97 |
* <li>
|
98 |
* Find the {@linkplain Tag#getEnd() end} of the tag using the {@link #getEnd(Source, int pos)} method,
|
99 |
* where <code>pos</code> is either the end of the {@linkplain StartTag#getAttributes() attributes} segment or the end of the
|
100 |
* {@linkplain Tag#getName() name} depending on whether the tag type {@linkplain #hasAttributes() has attributes}.
|
101 |
* Return <code>null</code> if the end of the tag can not be found.
|
102 |
* <li>
|
103 |
* Construct the {@link StartTag} object using the
|
104 |
* {@link #constructStartTag(Source,int,int,String,Attributes) constructStartTag(Source, int pos, int end, String name, Attributes)}
|
105 |
* method with the argument values collected over the previous steps.
|
106 |
* </ol>
|
107 |
* <p>
|
108 |
* See {@link TagType#constructTagAt(Source, int pos)} for more important information about this method.
|
109 |
*
|
110 |
* @param source the {@link Source} document.
|
111 |
* @param pos the position in the source document.
|
112 |
* @return a tag of this type at the specified position in the specified source document if it meets all of the required features, or <code>null</code> if it does not meet the criteria.
|
113 |
*/
|
114 |
protected Tag constructTagAt(final Source source, final int pos) {
|
115 |
final ParseText parseText=source.getParseText();
|
116 |
final int nameBegin=pos+1;
|
117 |
String name=getNamePrefix();
|
118 |
int nameEnd=nameBegin+getNamePrefix().length();
|
119 |
if (isNameAfterPrefixRequired()) {
|
120 |
final int extendedNameEnd=source.getNameEnd(nameEnd);
|
121 |
if (extendedNameEnd==-1) return null;
|
122 |
name=source.getName(nameBegin,extendedNameEnd);
|
123 |
nameEnd=extendedNameEnd;
|
124 |
} else if (!nameCharAfterPrefixAllowed && Tag.isXMLNameChar(parseText.charAt(nameEnd))) {
|
125 |
return null;
|
126 |
}
|
127 |
int end;
|
128 |
Attributes attributes=null;
|
129 |
if (hasAttributes()) {
|
130 |
// it is necessary to get the attributes so that we can be sure that the search on the closing delimiter doesn't pick up
|
131 |
// anything from the attribute values, which can legally contain ">" characters.
|
132 |
attributes=parseAttributes(source,pos,name);
|
133 |
if (attributes==null) return null; // happens if attributes not properly formed
|
134 |
end=getEnd(source,attributes.getEnd()); // should always return a valid end
|
135 |
} else {
|
136 |
end=getEnd(source,nameEnd);
|
137 |
if (end<0) {
|
138 |
if (end==-1 && source.logger.isInfoEnabled()) source.logger.info(source.getRowColumnVector(pos).appendTo(new StringBuilder(200).append("StartTag ").append(name).append(" at ")).append(" not recognised as type '").append(getDescription()).append("' because it has no closing delimiter").toString());
|
139 |
return null;
|
140 |
}
|
141 |
}
|
142 |
return constructStartTag(source,pos,end,name,attributes);
|
143 |
}
|
144 |
|
145 |
/**
|
146 |
* Returns the {@linkplain Tag#getEnd() end} of a tag of this type, starting from the specified position in the specified source document.
|
147 |
* <br />(<a href="TagType.html#ImplementationAssistance">implementation assistance</a> method)
|
148 |
* <p>
|
149 |
* This default implementation simply searches for the first occurrence of the
|
150 |
* {@linkplain #getClosingDelimiter() closing delimiter} after the specified position, and returns the position immediately
|
151 |
* after the end of it.
|
152 |
* <p>
|
153 |
* If the closing delimiter is not found, the value <code>-1</code> is returned.
|
154 |
*
|
155 |
* @param source the {@link Source} document.
|
156 |
* @param pos the position in the source document.
|
157 |
* @return the {@linkplain Tag#getEnd() end} of a tag of this type, starting from the specified position in the specified source document, or <code>-1</code> if the end of the tag can not be found.
|
158 |
*/
|
159 |
protected int getEnd(final Source source, final int pos) {
|
160 |
final int delimiterBegin=source.getParseText().indexOf(getClosingDelimiter(),pos);
|
161 |
return (delimiterBegin==-1 ? -1 : delimiterBegin+getClosingDelimiter().length());
|
162 |
}
|
163 |
}
|