/[aagtl_public1]/src/net/htmlparser/jericho/StreamedText.java
aagtl

Contents of /src/net/htmlparser/jericho/StreamedText.java

Parent Directory | Revision Log


Revision 2 - (show annotations) (download)
Sun Aug 5 13:48:36 2012 UTC (11 years, 8 months ago) by zoffadmin
File size: 9263 byte(s)
initial import of aagtl source code
1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20
21 package net.htmlparser.jericho;
22
23 import java.io.*;
24 import java.nio.*;
25
/**
 * Implements a buffered window into a stream of characters.
 * <p>
 * Unless the buffer is explicitly {@linkplain #setBuffer(char[]) set}, it expands automatically as further characters are fetched from the stream.
 * <p>
 * The {@link #setMinRequiredBufferBegin(int)} method can be used to inform the <code>StreamedText</code> object that characters up to a specified
 * position are no longer required, allowing more characters to be fetched without the need to increase the buffer size.
 * <p>
 * All positions passed to and returned from this class are absolute positions in the stream, not indexes into the internal buffer.
 */
final class StreamedText implements CharSequence {
	private final Reader reader; // null if one of the "text" argument constructors was used.
	private char[] buffer;
	private boolean expandableBuffer;
	private int bufferBegin=0; // the stream position of the first character of the buffer. all text before it has been discarded.
	private int readerPos=0; // the next position into which text will be loaded from the reader stream. must be >=bufferBegin and <=bufferBegin+buffer.length, except if one of the "text" argument constructors was used, in which case =Integer.MAX_VALUE.
	private int minRequiredBufferBegin=0; // the minimum pos that must be kept in buffer. always >=bufferBegin.
	private int end=Integer.MAX_VALUE; // the position of the end of the stream, or Integer.MAX_VALUE if it is not yet known.

	public static int INITIAL_EXPANDABLE_BUFFER_SIZE=8192; // same default as StAX

	/**
	 * Constructs a new <code>StreamedText</code> object that reads from the specified reader into the specified buffer.
	 * @param reader the source of the characters.
	 * @param buffer the fixed buffer to use, or <code>null</code> to use an automatically expanding buffer.
	 */
	public StreamedText(final Reader reader, final char[] buffer) {
		this.reader=reader;
		setBuffer(buffer);
	}

	/**
	 * Constructs a new <code>StreamedText</code> object that reads from the specified reader into an automatically expanding buffer.
	 * @param reader the source of the characters.
	 */
	public StreamedText(final Reader reader) {
		this(reader,null);
	}

	/**
	 * Constructs a new <code>StreamedText</code> object over text that is already completely held in memory.
	 * The first <code>length</code> characters of the array are used directly as the buffer without copying.
	 */
	private StreamedText(final char[] text, final int length) {
		reader=null;
		buffer=text;
		expandableBuffer=false;
		end=length;
		readerPos=Integer.MAX_VALUE; // ensures that no attempt is ever made to read from the (null) reader
	}

	/**
	 * Constructs a new <code>StreamedText</code> object over the specified character array, which is used directly without copying.
	 * @param text the entire text.
	 */
	public StreamedText(final char[] text) {
		this(text,text.length);
	}

	/**
	 * Constructs a new <code>StreamedText</code> object over the remaining characters of the specified character buffer.
	 * <p>
	 * The backing array is used directly if it is accessible and the remaining characters start at its first element,
	 * otherwise the remaining characters are copied.
	 *
	 * @param text a buffer whose remaining characters form the entire text.
	 */
	public StreamedText(final CharBuffer text) {
		this(toBackingArray(text),text.length());
	}

	/**
	 * Constructs a new <code>StreamedText</code> object over a copy of the specified character sequence.
	 * @param text the entire text.
	 */
	public StreamedText(final CharSequence text) {
		this(toCharArray(text));
	}

	/**
	 * Sets the buffer into which characters are loaded.
	 * @param buffer the fixed buffer to use, or <code>null</code> to allocate an automatically expanding buffer of {@link #INITIAL_EXPANDABLE_BUFFER_SIZE} characters.
	 * @return this <code>StreamedText</code> object.
	 */
	public StreamedText setBuffer(char[] buffer) {
		if (buffer!=null) {
			this.buffer=buffer;
			expandableBuffer=false;
		} else {
			this.buffer=new char[INITIAL_EXPANDABLE_BUFFER_SIZE];
			expandableBuffer=true;
		}
		return this;
	}

	/**
	 * Indicates whether the buffer expands automatically when it is too small.
	 * @return <code>true</code> if the buffer expands automatically, otherwise <code>false</code>.
	 */
	public boolean hasExpandableBuffer() {
		return expandableBuffer;
	}

	/**
	 * Returns the character at the specified position, reading further characters from the stream if necessary.
	 * @param pos the position of the character.
	 * @return the character at the specified position.
	 * @throws IllegalStateException if the specified position has already been discarded from the buffer.
	 * @throws IndexOutOfBoundsException if the specified position is at or past the end of the stream.
	 * @throws BufferOverflowException if the buffer is not expandable and is too small to hold both the minimum required position and the specified position.
	 */
	public char charAt(final int pos) {
		if (pos>=readerPos) readToPosition(pos);
		checkPos(pos);
		return buffer[pos-bufferBegin];
	}

	/**
	 * Specifies the earliest position that must remain available in the buffer.
	 * Characters before this position may be discarded the next time more room is needed in the buffer.
	 * @param minRequiredBufferBegin the earliest position that must remain available.
	 * @throws IllegalArgumentException if the specified position has already been discarded.
	 */
	public void setMinRequiredBufferBegin(final int minRequiredBufferBegin) {
		if (minRequiredBufferBegin<bufferBegin) throw new IllegalArgumentException("Cannot set minimum required buffer begin to already discarded position "+minRequiredBufferBegin);
		this.minRequiredBufferBegin=minRequiredBufferBegin;
	}

	/**
	 * Returns the earliest position that must remain available in the buffer.
	 * @return the earliest position that must remain available in the buffer.
	 */
	public int getMinRequiredBufferBegin() {
		return minRequiredBufferBegin;
	}

	/**
	 * Returns the length of the text stream.
	 * <p>
	 * The length is not known until an attempt has been made to access a position past the end of the stream.
	 * Use {@link #getEnd()} to test whether the end has been reached without risking an exception.
	 *
	 * @return the length of the text stream.
	 * @throws IllegalStateException if the end of the stream has not yet been reached.
	 */
	public int length() {
		if (end==Integer.MAX_VALUE) throw new IllegalStateException("Length of streamed text cannot be determined until end of file has been reached");
		return end;
	}

	/**
	 * Returns the position of the end of the stream.
	 * @return the position of the end of the stream, or <code>Integer.MAX_VALUE</code> if the end has not yet been reached.
	 */
	public int getEnd() {
		return end;
	}

	/**
	 * Ensures that all positions from <code>begin</code> (inclusive) to <code>end</code> (exclusive) are loaded in the buffer.
	 * @throws IllegalStateException if <code>begin</code> has already been discarded from the buffer.
	 * @throws IndexOutOfBoundsException if <code>begin</code> or <code>end</code> lie past the end of the stream.
	 */
	private void prepareBufferRange(final int begin, final int end) {
		final int lastRequiredPos=end-1;
		// readerPos is the first position that has NOT yet been loaded, so it must be included in the test (same condition as in charAt).
		if (lastRequiredPos>=readerPos) readToPosition(lastRequiredPos);
		checkPos(begin);
		if (end>this.end) throw new IndexOutOfBoundsException();
	}

	/**
	 * Writes the specified range of this text to the specified writer.
	 * @param writer the destination of the characters.
	 * @param begin the begin position, inclusive.
	 * @param end the end position, exclusive.
	 * @throws IOException if the writer reports an I/O error.
	 */
	public void writeTo(final Writer writer, final int begin, final int end) throws IOException {
		prepareBufferRange(begin,end);
		writer.write(buffer,begin-bufferBegin,end-begin);
	}

	/**
	 * Returns a new string that is a substring of this text.
	 * <p>
	 * The substring begins at the specified <code>begin</code> position and extends to the character at position <code>end</code> - 1.
	 * Thus the length of the substring is <code>end-begin</code>.
	 *
	 * @param begin the begin position, inclusive.
	 * @param end the end position, exclusive.
	 * @return a new string that is a substring of this text.
	 */
	public String substring(final int begin, final int end) {
		prepareBufferRange(begin,end);
		return new String(buffer,begin-bufferBegin,end-begin);
	}

	/**
	 * Returns a new character sequence that is a subsequence of this sequence.
	 * <p>
	 * The returned <code>CharSequence</code> is only guaranteed to be valid as long as no further operations are performed on this <code>StreamedText</code> object.
	 * Any subsequent method call could invalidate the underlying buffer used by the <code>CharSequence</code>.
	 *
	 * @param begin the begin position, inclusive.
	 * @param end the end position, exclusive.
	 * @return a new character sequence that is a subsequence of this sequence.
	 */
	public CharSequence subSequence(final int begin, final int end) {
		// This has not been benchmarked. It is possible that returning substring(begin,end) results in faster code even though it requires more memory allocation.
		return getCharBuffer(begin,end);
	}

	/**
	 * Returns a <code>CharBuffer</code> that wraps the part of the internal buffer holding the specified range.
	 * <p>
	 * No characters are copied, so the result is subject to the same validity restrictions as the result of {@link #subSequence(int,int)}.
	 *
	 * @param begin the begin position, inclusive.
	 * @param end the end position, exclusive.
	 * @return a <code>CharBuffer</code> view of the specified range.
	 */
	public CharBuffer getCharBuffer(final int begin, final int end) {
		prepareBufferRange(begin,end);
		return CharBuffer.wrap(buffer,begin-bufferBegin,end-begin);
	}

	/**
	 * Always throws an exception, as the entire text is in general not available at once.
	 * @throws UnsupportedOperationException always.
	 */
	public String toString() {
		throw new UnsupportedOperationException("Streamed text can not be converted to a string");
	}

	/**
	 * Returns a description of the current state of the buffer for debugging purposes.
	 * @return a description of the current state of the buffer.
	 */
	public String getDebugInfo() {
		return "Buffer size: \""+buffer.length+"\", bufferBegin="+bufferBegin+", minRequiredBufferBegin="+minRequiredBufferBegin+", readerPos="+readerPos;
	}

	/**
	 * Returns the internal buffer itself (not a copy).
	 * @return the internal buffer.
	 */
	public char[] getBuffer() {
		return buffer;
	}

	/**
	 * Returns the stream position of the first character in the buffer.
	 * @return the stream position of the first character in the buffer.
	 */
	public int getBufferBegin() {
		return bufferBegin;
	}

	/**
	 * Verifies that the specified position has neither been discarded nor lies at or past the end of the stream.
	 */
	private void checkPos(final int pos) {
		// hopefully inlined by the compiler
		if (pos<bufferBegin) throw new IllegalStateException("StreamedText position "+pos+" has been discarded");
		if (pos>=end) throw new IndexOutOfBoundsException();
	}

	/**
	 * Returns the first position that can not be held in a buffer of the current size together with the {@linkplain #getMinRequiredBufferBegin() minimum required position}.
	 * @return the sum of the minimum required buffer begin position and the current buffer size.
	 */
	public int getBufferOverflowPosition() {
		return minRequiredBufferBegin+buffer.length;
	}

	/**
	 * Loads characters from the reader until the specified position is in the buffer or the end of the stream is reached.
	 * <p>
	 * Room is made by discarding text before the minimum required position, and by expanding the buffer if that is not enough.
	 * Any <code>IOException</code> from the reader is rethrown wrapped in a <code>RuntimeException</code>.
	 *
	 * @throws BufferOverflowException if the buffer is not expandable and is too small to hold both the minimum required position and the specified position.
	 */
	private void readToPosition(final int pos) {
		// Once the end of the stream is known nothing further can be loaded.
		// Returning here avoids expanding the buffer to an arbitrary requested position and reading again from an exhausted reader.
		if (end!=Integer.MAX_VALUE) return;
		try {
			if (pos>=bufferBegin+buffer.length) {
				if (pos>=minRequiredBufferBegin+buffer.length) {
					if (!expandableBuffer) throw new BufferOverflowException(); // unfortunately BufferOverflowException doesn't accept a message argument, otherwise it would include the message "StreamedText buffer too small to keep positions "+minRequiredBufferBegin+" and "+pos+" simultaneously"
					expandBuffer(pos-minRequiredBufferBegin+1);
				}
				discardUsedText();
			}
			// The end may have been detected while skipping in shiftBuffer, in which case readerPos can be less than bufferBegin and no read must be attempted.
			while (readerPos<=pos && end==Integer.MAX_VALUE) {
				final int charCount=reader.read(buffer,readerPos-bufferBegin,bufferBegin+buffer.length-readerPos);
				if (charCount==-1) {
					end=readerPos;
					break;
				}
				readerPos+=charCount;
			}
		} catch (IOException ex) {
			throw new RuntimeException(ex);
		}
	}

	/**
	 * Replaces the buffer with a larger one of at least the specified size (at least double the current size),
	 * keeping only the text from the minimum required position onwards.
	 */
	private void expandBuffer(final int minSize) throws IOException {
		int newSize=buffer.length*2;
		if (newSize<minSize) newSize=minSize;
		final char[] newBuffer=new char[newSize];
		shiftBuffer(buffer,newBuffer);
		buffer=newBuffer;
	}

	/**
	 * Discards all text before the minimum required position by moving the remaining text to the start of the buffer.
	 */
	private void discardUsedText() throws IOException {
		if (minRequiredBufferBegin==bufferBegin) return;
		shiftBuffer(buffer,buffer);
	}

	/**
	 * Copies the loaded text from the minimum required position onwards to the start of <code>toBuffer</code>,
	 * which may be the same array as <code>fromBuffer</code>, and makes the minimum required position the new buffer begin.
	 * <p>
	 * If the minimum required position is beyond the text loaded so far, the characters in between are skipped in the reader.
	 */
	private void shiftBuffer(final char[] fromBuffer, final char[] toBuffer) throws IOException {
		final int shift=minRequiredBufferBegin-bufferBegin;
		final int usedBufferLength=readerPos-bufferBegin;
		// System.arraycopy handles overlapping ranges within the same array correctly.
		// There is nothing to copy if the minimum required position is at or beyond the text loaded so far.
		if (usedBufferLength>shift) System.arraycopy(fromBuffer,shift,toBuffer,0,usedBufferLength-shift);
		bufferBegin=minRequiredBufferBegin;
		while (readerPos<bufferBegin) {
			final long charCount=reader.skip(bufferBegin-readerPos);
			if (charCount==0) {
				// nothing could be skipped, which is treated as the end of the stream
				end=readerPos;
				break;
			}
			readerPos+=charCount;
		}
	}

	/**
	 * Returns the text currently loaded in the buffer as a string.
	 */
	String getCurrentBufferContent() {
		return substring(bufferBegin,Math.min(end,readerPos));
	}

	/**
	 * Returns an array starting with the remaining characters of the specified buffer.
	 * <p>
	 * The backing array is returned directly only if it is accessible and the remaining characters start at its first element.
	 * Otherwise a copy is returned, as the buffer's position and array offset would be ignored by the position arithmetic of this class.
	 */
	private static char[] toBackingArray(final CharBuffer text) {
		if (text.hasArray() && text.arrayOffset()+text.position()==0) return text.array();
		return toCharArray(text);
	}

	/**
	 * Returns a new array containing the characters of the specified character sequence.
	 */
	private static char[] toCharArray(final CharSequence text) {
		if (text instanceof String) return ((String)text).toCharArray();
		final char[] charArray=new char[text.length()];
		for (int i=0; i<charArray.length; i++) charArray[i]=text.charAt(i);
		return charArray;
	}
}

   
Visit the aagtl Website