1 |
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
|
2 |
// Version 3.2
|
3 |
// Copyright (C) 2004-2009 Martin Jericho
|
4 |
// http://jericho.htmlparser.net/
|
5 |
//
|
6 |
// This library is free software; you can redistribute it and/or
|
7 |
// modify it under the terms of either one of the following licences:
|
8 |
//
|
9 |
// 1. The Eclipse Public License (EPL) version 1.0,
|
10 |
// included in this distribution in the file licence-epl-1.0.html
|
11 |
// or available at http://www.eclipse.org/legal/epl-v10.html
|
12 |
//
|
13 |
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
|
14 |
// included in this distribution in the file licence-lgpl-2.1.txt
|
15 |
// or available at http://www.gnu.org/licenses/lgpl.txt
|
16 |
//
|
17 |
// This library is distributed on an "AS IS" basis,
|
18 |
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
|
19 |
// See the individual licence texts for more details.
|
20 |
|
21 |
package net.htmlparser.jericho;
|
22 |
|
23 |
import java.io.*;
|
24 |
import java.nio.*;
|
25 |
|
26 |
/**
 * Implements a buffered window into a stream of characters.
 * <p>
 * Unless the buffer is explicitly {@linkplain #setBuffer(char[]) set}, it expands automatically as further characters are fetched from the stream.
 * <p>
 * The {@link #setMinRequiredBufferBegin(int)} method can be used to inform the <code>StreamedText</code> object that characters up to a specified
 * position are no longer required, allowing more characters to be fetched without the need to increase the buffer size.
 * <p>
 * All positions accepted and returned by this class are absolute positions in the stream, not indices into the internal buffer.
 */
final class StreamedText implements CharSequence {
	private final Reader reader; // null if one of the "text" argument constructors was used.
	private char[] buffer;
	private boolean expandableBuffer;
	private int bufferBegin=0; // the stream position of the first character of the buffer. all text before it has been discarded.
	private int readerPos=0; // the next position into which text will be loaded from the reader stream. must be >=bufferBegin and <=bufferBegin+buffer.length, except if one of the "text" argument constructors was used, in which case =Integer.MAX_VALUE.
	private int minRequiredBufferBegin=0; // the minimum pos that must be kept in buffer. always >=bufferBegin.
	private int end=Integer.MAX_VALUE; // the position of the end of the stream, or Integer.MAX_VALUE while it is not yet known.

	public static int INITIAL_EXPANDABLE_BUFFER_SIZE=8192; // same default as StAX

	/**
	 * Constructs a new <code>StreamedText</code> that reads from the specified reader using the specified buffer.
	 * @param reader the source of characters.
	 * @param buffer the fixed-size buffer to use, or <code>null</code> to use an automatically expanding buffer.
	 */
	public StreamedText(final Reader reader, final char[] buffer) {
		this.reader=reader;
		setBuffer(buffer);
	}

	/**
	 * Constructs a new <code>StreamedText</code> that reads from the specified reader using an automatically expanding buffer.
	 * @param reader the source of characters.
	 */
	public StreamedText(final Reader reader) {
		this(reader,null);
	}

	/**
	 * Constructs a new <code>StreamedText</code> over text that is already completely in memory.
	 * The array is used directly as the buffer and no reader is involved.
	 * @param text the array containing the text, starting at index 0.
	 * @param length the number of characters of text in the array.
	 */
	private StreamedText(final char[] text, final int length) {
		reader=null;
		buffer=text;
		expandableBuffer=false;
		end=length;
		readerPos=Integer.MAX_VALUE; // ensures no attempt is ever made to read more text
	}

	/**
	 * Constructs a new <code>StreamedText</code> over the entire specified array, which is used directly without copying.
	 * @param text the text.
	 */
	public StreamedText(final char[] text) {
		this(text,text.length);
	}

	/**
	 * Constructs a new <code>StreamedText</code> over the remaining content of the specified character buffer.
	 * <p>
	 * The backing array is shared if it is accessible and the remaining content starts at its first element,
	 * otherwise the remaining content is copied.
	 * @param text the text.
	 */
	public StreamedText(final CharBuffer text) {
		this(toZeroBasedArray(text),text.length());
	}

	/**
	 * Constructs a new <code>StreamedText</code> over a copy of the specified character sequence.
	 * @param text the text.
	 */
	public StreamedText(final CharSequence text) {
		this(toCharArray(text));
	}

	/**
	 * Sets the buffer used to hold the window into the stream.
	 * <p>
	 * The existing buffer content is not copied into the new buffer.
	 * @param buffer the fixed-size buffer to use, or <code>null</code> to allocate an automatically expanding buffer of {@link #INITIAL_EXPANDABLE_BUFFER_SIZE} characters.
	 * @return this <code>StreamedText</code> object.
	 */
	public StreamedText setBuffer(char[] buffer) {
		if (buffer!=null) {
			this.buffer=buffer;
			expandableBuffer=false;
		} else {
			this.buffer=new char[INITIAL_EXPANDABLE_BUFFER_SIZE];
			expandableBuffer=true;
		}
		return this;
	}

	/**
	 * Indicates whether the buffer expands automatically when it is too small.
	 * @return <code>true</code> if the buffer is expandable, otherwise <code>false</code>.
	 */
	public boolean hasExpandableBuffer() {
		return expandableBuffer;
	}

	/**
	 * Returns the character at the specified position, fetching more text from the stream if necessary.
	 * @param pos the position of the character in the stream.
	 * @return the character at the specified position.
	 * @throws IllegalStateException if the position has already been discarded from the buffer.
	 * @throws IndexOutOfBoundsException if the position is at or past the end of the stream.
	 * @throws BufferOverflowException if the buffer is not expandable and is too small to hold the required range.
	 */
	public char charAt(final int pos) {
		if (pos>=readerPos) readToPosition(pos);
		checkPos(pos);
		return buffer[pos-bufferBegin];
	}

	/**
	 * Declares that text before the specified position is no longer required and may be discarded when room is needed in the buffer.
	 * @param minRequiredBufferBegin the lowest position that must remain available.
	 * @throws IllegalArgumentException if the specified position has already been discarded.
	 */
	public void setMinRequiredBufferBegin(final int minRequiredBufferBegin) {
		if (minRequiredBufferBegin<bufferBegin) throw new IllegalArgumentException("Cannot set minimum required buffer begin to already discarded position "+minRequiredBufferBegin);
		this.minRequiredBufferBegin=minRequiredBufferBegin;
	}

	/**
	 * Returns the lowest position that must remain available in the buffer.
	 * @return the lowest position that must remain available in the buffer.
	 */
	public int getMinRequiredBufferBegin() {
		return minRequiredBufferBegin;
	}

	/**
	 * Returns the length of the text stream.
	 * <p>
	 * The length is only known once an attempt has been made to access a position past the end of the stream
	 * (or immediately if this object was constructed from in-memory text).
	 * Use {@link #getEnd()} for a variant that returns <code>Integer.MAX_VALUE</code> instead of throwing an exception.
	 *
	 * @return the length of the text stream.
	 * @throws IllegalStateException if the end of the stream has not yet been reached.
	 */
	public int length() {
		if (end==Integer.MAX_VALUE) throw new IllegalStateException("Length of streamed text cannot be determined until end of file has been reached");
		return end;
	}

	/**
	 * Returns the position of the end of the stream.
	 * @return the position of the end of the stream, or <code>Integer.MAX_VALUE</code> if it is not yet known.
	 */
	public int getEnd() {
		return end;
	}

	/**
	 * Ensures that all characters in the specified range are loaded into the buffer.
	 * @param begin the begin position, inclusive.
	 * @param end the end position, exclusive.
	 */
	private void prepareBufferRange(final int begin, final int end) {
		final int lastRequiredPos=end-1;
		// readerPos is the first position NOT yet loaded, so it must be fetched too (same condition as in charAt).
		if (lastRequiredPos>=readerPos) readToPosition(lastRequiredPos);
		checkPos(begin);
		if (end>this.end) throw new IndexOutOfBoundsException();
	}

	/**
	 * Writes the text in the specified range to the specified writer.
	 * @param writer the destination.
	 * @param begin the begin position, inclusive.
	 * @param end the end position, exclusive.
	 * @throws IOException if the writer throws it.
	 */
	public void writeTo(final Writer writer, final int begin, final int end) throws IOException {
		prepareBufferRange(begin,end);
		writer.write(buffer,begin-bufferBegin,end-begin);
	}

	/**
	 * Returns a new string that is a substring of this text.
	 * <p>
	 * The substring begins at the specified <code>begin</code> position and extends to the character at position <code>end</code> - 1.
	 * Thus the length of the substring is <code>end-begin</code>.
	 *
	 * @param begin the begin position, inclusive.
	 * @param end the end position, exclusive.
	 * @return a new string that is a substring of this text.
	 */
	public String substring(final int begin, final int end) {
		prepareBufferRange(begin,end);
		return new String(buffer,begin-bufferBegin,end-begin);
	}

	/**
	 * Returns a new character sequence that is a subsequence of this sequence.
	 * <p>
	 * The returned <code>CharSequence</code> is only guaranteed to be valid as long as no further operations are performed on this <code>StreamedText</code> object.
	 * Any subsequent method call could invalidate the underlying buffer used by the <code>CharSequence</code>.
	 *
	 * @param begin the begin position, inclusive.
	 * @param end the end position, exclusive.
	 * @return a new character sequence that is a subsequence of this sequence.
	 */
	public CharSequence subSequence(final int begin, final int end) {
		// This has not been benchmarked. It is possible that returning substring(begin,end) results in faster code even though it requires more memory allocation.
		return getCharBuffer(begin,end);
	}

	/**
	 * Returns a <code>CharBuffer</code> that wraps the part of the internal buffer holding the specified range.
	 * <p>
	 * No characters are copied, so the result shares the internal buffer and is subject to the same validity restriction as {@link #subSequence(int,int)}.
	 *
	 * @param begin the begin position, inclusive.
	 * @param end the end position, exclusive.
	 * @return a <code>CharBuffer</code> view of the specified range.
	 */
	public CharBuffer getCharBuffer(final int begin, final int end) {
		prepareBufferRange(begin,end);
		return CharBuffer.wrap(buffer,begin-bufferBegin,end-begin);
	}

	/**
	 * Always throws an exception, as the complete text of a stream is not available.
	 * @throws UnsupportedOperationException always.
	 */
	@Override
	public String toString() {
		throw new UnsupportedOperationException("Streamed text can not be converted to a string");
	}

	/**
	 * Returns a description of the current buffer state for debugging purposes.
	 * @return a description of the current buffer state.
	 */
	public String getDebugInfo() {
		return "Buffer size: \""+buffer.length+"\", bufferBegin="+bufferBegin+", minRequiredBufferBegin="+minRequiredBufferBegin+", readerPos="+readerPos;
	}

	/**
	 * Returns the internal buffer itself, not a copy.
	 * @return the internal buffer.
	 */
	public char[] getBuffer() {
		return buffer;
	}

	/**
	 * Returns the stream position corresponding to the first element of the buffer.
	 * @return the stream position corresponding to the first element of the buffer.
	 */
	public int getBufferBegin() {
		return bufferBegin;
	}

	/**
	 * Verifies that the specified position has not been discarded and is before the end of the stream.
	 * @param pos the position to check.
	 */
	private void checkPos(final int pos) {
		// hopefully inlined by the compiler
		if (pos<bufferBegin) throw new IllegalStateException("StreamedText position "+pos+" has been discarded");
		if (pos>=end) throw new IndexOutOfBoundsException();
	}

	/**
	 * Returns the first position that can not be held in the buffer at its current size together with the minimum required position.
	 * @return the sum of the minimum required buffer begin position and the current buffer length.
	 */
	public int getBufferOverflowPosition() {
		return minRequiredBufferBegin+buffer.length;
	}

	/**
	 * Reads from the stream until the specified position has been loaded or the end of the stream is reached,
	 * first discarding unrequired text or expanding the buffer if the position does not fit in the current window.
	 * @param pos the position that must be loaded.
	 */
	private void readToPosition(final int pos) {
		try {
			if (pos>=bufferBegin+buffer.length) {
				// the position lies beyond the current window
				if (pos>=minRequiredBufferBegin+buffer.length) {
					// discarding unrequired text would not make enough room
					if (!expandableBuffer) throw new BufferOverflowException(); // unfortunately BufferOverflowException doesn't accept a message argument, otherwise it would include the message "StreamedText buffer too small to keep positions "+minRequiredBufferBegin+" and "+pos+" simultaneously"
					expandBuffer(pos-minRequiredBufferBegin+1);
				}
				discardUsedText(); // does nothing if expandBuffer has already moved the window
			}
			while (readerPos<=pos) {
				final int charCount=reader.read(buffer,readerPos-bufferBegin,bufferBegin+buffer.length-readerPos);
				if (charCount==-1) {
					end=readerPos;
					break;
				}
				readerPos+=charCount;
			}
		} catch (IOException ex) {
			throw new RuntimeException(ex);
		}
	}

	/**
	 * Replaces the buffer with a larger one, keeping only the text from the minimum required position onwards.
	 * @param minSize the minimum size of the new buffer. The new buffer is at least double the size of the old one.
	 */
	private void expandBuffer(final int minSize) throws IOException {
		int newSize=buffer.length*2;
		if (newSize<minSize) newSize=minSize;
		final char[] newBuffer=new char[newSize];
		shiftBuffer(buffer,newBuffer);
		buffer=newBuffer;
	}

	/**
	 * Moves the window forward within the existing buffer so that it begins at the minimum required position.
	 */
	private void discardUsedText() throws IOException {
		if (minRequiredBufferBegin==bufferBegin) return;
		shiftBuffer(buffer,buffer);
	}

	/**
	 * Copies the text from the minimum required position onwards to the start of the destination buffer
	 * and moves the start of the window to the minimum required position.
	 * If that position is beyond what has been read so far, the intervening characters are skipped in the reader.
	 * @param fromBuffer the current buffer.
	 * @param toBuffer the destination buffer, which may be the same array as <code>fromBuffer</code>.
	 */
	private void shiftBuffer(final char[] fromBuffer, final char[] toBuffer) throws IOException {
		final int shift=minRequiredBufferBegin-bufferBegin;
		final int retainedLength=readerPos-minRequiredBufferBegin;
		// System.arraycopy handles the case where both arguments are the same array.
		if (retainedLength>0) System.arraycopy(fromBuffer,shift,toBuffer,0,retainedLength);
		bufferBegin=minRequiredBufferBegin;
		while (readerPos<bufferBegin) {
			final long charCount=reader.skip(bufferBegin-readerPos);
			if (charCount==0) {
				// nothing more could be skipped, which is treated as the end of the stream
				end=readerPos;
				break;
			}
			readerPos+=charCount;
		}
	}

	/**
	 * Returns the text currently loaded in the buffer.
	 * @return the text currently loaded in the buffer.
	 */
	String getCurrentBufferContent() {
		return substring(bufferBegin,Math.min(end,readerPos));
	}

	/**
	 * Returns an array whose first <code>text.length()</code> elements are the remaining content of the specified character buffer.
	 * The backing array is returned directly if it is accessible and the remaining content starts at its first element, otherwise a copy is made.
	 * This method does not alter the state of the character buffer.
	 */
	private static char[] toZeroBasedArray(final CharBuffer text) {
		if (text.hasArray() && text.arrayOffset()+text.position()==0) return text.array();
		return toCharArray(text);
	}

	/**
	 * Returns a new array containing the characters of the specified character sequence.
	 */
	private static char[] toCharArray(final CharSequence text) {
		if (text instanceof String) return ((String)text).toCharArray();
		final char[] charArray=new char[text.length()];
		for (int i=0; i<charArray.length; i++) charArray[i]=text.charAt(i);
		return charArray;
	}
}
|