1 |
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
|
2 |
// Version 3.2
|
3 |
// Copyright (C) 2004-2009 Martin Jericho
|
4 |
// http://jericho.htmlparser.net/
|
5 |
//
|
6 |
// This library is free software; you can redistribute it and/or
|
7 |
// modify it under the terms of either one of the following licences:
|
8 |
//
|
9 |
// 1. The Eclipse Public License (EPL) version 1.0,
|
10 |
// included in this distribution in the file licence-epl-1.0.html
|
11 |
// or available at http://www.eclipse.org/legal/epl-v10.html
|
12 |
//
|
13 |
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
|
14 |
// included in this distribution in the file licence-lgpl-2.1.txt
|
15 |
// or available at http://www.gnu.org/licenses/lgpl.txt
|
16 |
//
|
17 |
// This library is distributed on an "AS IS" basis,
|
18 |
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
|
19 |
// See the individual licence texts for more details.
|
20 |
|
21 |
package net.htmlparser.jericho;
|
22 |
|
23 |
import java.util.*;
|
24 |
|
25 |
/**
|
26 |
* Iterates over the "nodes" in a segment.
|
27 |
* <p>
|
28 |
* Every object returned is a Segment. All tags found with the Segment.getAllTags() method are included, as well as segments representing the plain text in between them,
|
29 |
* and character references within the plain text are also included as separate nodes.
|
30 |
*/
|
31 |
class NodeIterator implements Iterator<Segment> {
|
32 |
private final Segment segment;
|
33 |
private final Source source;
|
34 |
private int pos;
|
35 |
private Tag nextTag;
|
36 |
private CharacterReference characterReferenceAtCurrentPosition=null;
|
37 |
|
38 |
private final boolean legacyIteratorCompatabilityMode=Source.LegacyIteratorCompatabilityMode;
|
39 |
|
40 |
public NodeIterator(final Segment segment) {
|
41 |
this.segment=segment;
|
42 |
source=segment.source;
|
43 |
if (segment==source) source.fullSequentialParse();
|
44 |
pos=segment.begin;
|
45 |
nextTag=source.getNextTag(pos);
|
46 |
if (nextTag!=null && nextTag.begin>=segment.end) nextTag=null;
|
47 |
}
|
48 |
|
49 |
public boolean hasNext() {
|
50 |
return pos<segment.end || nextTag!=null;
|
51 |
}
|
52 |
|
53 |
public Segment next() {
|
54 |
final int oldPos=pos;
|
55 |
if (nextTag!=null) {
|
56 |
if (oldPos<nextTag.begin) return nextNonTagSegment(oldPos,nextTag.begin);
|
57 |
final Tag tag=nextTag;
|
58 |
nextTag=nextTag.getNextTag();
|
59 |
if (nextTag!=null && nextTag.begin>=segment.end) nextTag=null;
|
60 |
if (pos<tag.end) pos=tag.end;
|
61 |
return tag;
|
62 |
} else {
|
63 |
if (!hasNext()) throw new NoSuchElementException();
|
64 |
return nextNonTagSegment(oldPos,segment.end);
|
65 |
}
|
66 |
}
|
67 |
|
68 |
private Segment nextNonTagSegment(final int begin, final int end) {
|
69 |
if (!legacyIteratorCompatabilityMode) {
|
70 |
final CharacterReference characterReference=characterReferenceAtCurrentPosition;
|
71 |
if (characterReference!=null) {
|
72 |
characterReferenceAtCurrentPosition=null;
|
73 |
pos=characterReference.end;
|
74 |
return characterReference;
|
75 |
}
|
76 |
final ParseText parseText=source.getParseText();
|
77 |
int potentialCharacterReferenceBegin=parseText.indexOf('&',begin,end);
|
78 |
while (potentialCharacterReferenceBegin!=-1) {
|
79 |
final CharacterReference nextCharacterReference=CharacterReference.construct(source,potentialCharacterReferenceBegin,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL);
|
80 |
if (nextCharacterReference!=null) {
|
81 |
if (potentialCharacterReferenceBegin==begin) {
|
82 |
pos=nextCharacterReference.end;
|
83 |
return nextCharacterReference;
|
84 |
} else {
|
85 |
pos=nextCharacterReference.begin;
|
86 |
characterReferenceAtCurrentPosition=nextCharacterReference;
|
87 |
return new Segment(source,begin,pos);
|
88 |
}
|
89 |
}
|
90 |
potentialCharacterReferenceBegin=parseText.indexOf('&',potentialCharacterReferenceBegin+1,end);
|
91 |
}
|
92 |
}
|
93 |
return new Segment(source,begin,pos=end);
|
94 |
}
|
95 |
|
96 |
public void skipToPos(final int pos) {
|
97 |
if (pos<this.pos) return; // can't go backwards
|
98 |
this.pos=pos;
|
99 |
nextTag=source.getNextTag(pos);
|
100 |
}
|
101 |
|
102 |
public void remove() {
|
103 |
throw new UnsupportedOperationException();
|
104 |
}
|
105 |
}
|