1
|
|
/*
|
2
|
|
* JBind
|
3
|
|
*
|
4
|
|
* Copyright (c) by Stefan Wachter. All rights reserved.
|
5
|
|
*
|
6
|
|
* Usage, modification, and redistribution is subject to license terms that are
|
7
|
|
* available at 'http://www.jbind.org'. The JBind license is like the
|
8
|
|
* 'Apache Software License V 1.1'.
|
9
|
|
*/
|
10
|
|
package org.jbind.xml.parser;
|
11
|
|
|
12
|
|
import java.io.BufferedInputStream;
|
13
|
|
import java.io.InputStream;
|
14
|
|
import java.io.InputStreamReader;
|
15
|
|
import java.io.Reader;
|
16
|
|
import java.net.URL;
|
17
|
|
import java.nio.charset.Charset;
|
18
|
|
import java.util.HashMap;
|
19
|
|
import java.util.Map;
|
20
|
|
|
21
|
|
import org.jbind.xml.base.InputSourceLocation;
|
22
|
|
import org.jbind.xml.msg.XmlMessages;
|
23
|
|
import org.xml.sax.EntityResolver;
|
24
|
|
import org.xml.sax.InputSource;
|
25
|
|
|
26
|
|
/**
|
27
|
|
* Parser for an arbitrary input source.
|
28
|
|
*/
|
29
|
|
public class InputSourceParser {
|
30
|
|
|
31
|
|
/** Parser instance that is created on first use and re-initialized for later runs. */
private Parser myParser;

/** Buffer that receives the first four bytes of a byte stream for charset detection. */
private byte[] myStartBytes = new byte[4];

/** Charsets that can be selected by the charset detection. */
private static final Charset myUtf8 = Charset.forName("UTF-8");
private static final Charset myUtf16be = Charset.forName("UTF-16BE");
private static final Charset myUtf16le = Charset.forName("UTF-16LE");

/** Maps the names of the predefined entities to their replacement text. */
private static final Map ourInternalEntities = new HashMap();

static {
    // name / replacement text pairs of the predefined entities
    final String[][] predefined = {
        { "lt", "<" },
        { "gt", ">" },
        { "amp", "&" },
        { "apos", "'" },
        { "quot", "\"" },
    };
    for (int i = 0; i < predefined.length; i++) {
        ourInternalEntities.put(predefined[i][0], predefined[i][1]);
    }
}
|
48
|
|
|
49
|
|
|
50
|
398
|
/**
 * Creates a new input source parser. No further initialization is performed here.
 */
public InputSourceParser() {
}
|
51
|
|
|
52
|
|
/**
|
53
|
|
* Determines the charset that was used to encode the input stream.
|
54
|
|
*
|
55
|
|
* @param anInputStream <i>(required)</i>. The stream is advanced over any
|
56
|
|
* leading byte order marks after this method returned.
|
57
|
|
* @param anErrorHandler <i>(required)</i>. Used to report errors.
|
58
|
|
* @param anInputSource <i>(required)</i>. The source that is parsed.
|
59
|
|
* @return <i>(required)</i>
|
60
|
|
*/
|
61
|
462
|
private Charset determineCharset(InputStream anInputStream, IErrorHandler anErrorHandler, InputSource anInputSource) throws Exception {
|
62
|
462
|
anInputStream.mark(1024);
|
63
|
462
|
if (4 != anInputStream.read(myStartBytes, 0, 4)) {
|
64
|
0
|
anErrorHandler.fatalError(XmlMessages.eof(new InputSourceLocation(anInputSource, -1, -1)));
|
65
|
0
|
throw new ParsingAbortedException();
|
66
|
|
}
|
67
|
|
|
68
|
462
|
long start = myStartBytes[0] | myStartBytes[1] | myStartBytes[2] | myStartBytes[3];
|
69
|
|
|
70
|
462
|
Charset charset = null;
|
71
|
462
|
if (start == 0x3C3F786D) {
|
72
|
0
|
charset = myUtf8;
|
73
|
0
|
anInputStream.reset();
|
74
|
462
|
} else if (start == 0x003C003F) {
|
75
|
0
|
charset = myUtf16be;
|
76
|
0
|
anInputStream.reset();
|
77
|
462
|
} else if (start == 0x3C003F00) {
|
78
|
0
|
charset = myUtf16le;
|
79
|
0
|
anInputStream.reset();
|
80
|
462
|
} else if ((start & 0xFFFFFF00) == 0xEFBBBF00) {
|
81
|
0
|
charset = myUtf8;
|
82
|
0
|
anInputStream.reset();
|
83
|
0
|
anInputStream.read(myStartBytes, 0, 3);
|
84
|
462
|
} else if ((start & 0xFFFF0000) == 0xFEFF0000 && (start & 0xFFFF) != 0) {
|
85
|
0
|
charset = myUtf16be;
|
86
|
0
|
anInputStream.reset();
|
87
|
0
|
anInputStream.read(myStartBytes, 0, 2);
|
88
|
462
|
} else if ((start & 0xFFFF0000) == 0xFFFE0000 && (start & 0xFFFF) != 0) {
|
89
|
0
|
charset = myUtf16le;
|
90
|
0
|
anInputStream.reset();
|
91
|
0
|
anInputStream.read(myStartBytes, 0, 2);
|
92
|
462
|
} else if (start == 0x4C6FA794) {
|
93
|
0
|
anErrorHandler.fatalError(XmlMessages.unsupportedEncoding("EBCDIC", new InputSourceLocation(anInputSource, -1, -1)));
|
94
|
0
|
throw new ParsingAbortedException();
|
95
|
462
|
} else if ((start == 0x3C) || (start == 0x3C00) || (start == 0x3C0000) || (start == 0x3C000000)) {
|
96
|
0
|
anErrorHandler.fatalError(XmlMessages.unsupportedEncoding("UCS-4", new InputSourceLocation(anInputSource, -1, -1)));
|
97
|
0
|
throw new ParsingAbortedException();
|
98
|
|
} else {
|
99
|
462
|
charset = myUtf8;
|
100
|
462
|
anInputStream.reset();
|
101
|
|
}
|
102
|
462
|
return charset;
|
103
|
|
}
|
104
|
|
|
105
|
467
|
private Parser getParser(Reader aReader) {
|
106
|
467
|
if (null == myParser) {
|
107
|
398
|
myParser = new Parser(aReader);
|
108
|
|
} else {
|
109
|
69
|
myParser.ReInit(aReader);
|
110
|
|
}
|
111
|
467
|
return myParser;
|
112
|
|
}
|
113
|
|
|
114
|
466
|
public void parse(InputSource anInputSource, IContentHandler aContentHandler, IErrorHandler anErrorHandler, EntityResolver anEntityResolver) {
|
115
|
466
|
try {
|
116
|
466
|
doParse(anInputSource, aContentHandler, anErrorHandler, anEntityResolver, true);
|
117
|
|
} catch (ParsingAbortedException e) {
|
118
|
|
// ignore
|
119
|
|
}
|
120
|
|
}
|
121
|
|
|
122
|
1
|
public void xInclude(InputSource anInputSource, IContentHandler aContentHandler, IErrorHandler anErrorHandler, EntityResolver anEntityResolver) throws ParsingAbortedException {
|
123
|
1
|
doParse(anInputSource, aContentHandler, anErrorHandler, anEntityResolver, false);
|
124
|
|
}
|
125
|
|
|
126
|
|
/**
|
127
|
|
* @param aSignalDocumentEvents Determines whether the start and end of an xml document
|
128
|
|
* is signalled to the content handler or not. This functionallity is used
|
129
|
|
* for XIncludes where the start and the end of an included document is not signalled.
|
130
|
|
*/
|
131
|
467
|
private synchronized void doParse(InputSource anInputSource, IContentHandler aContentHandler, IErrorHandler anErrorHandler, EntityResolver anEntityResolver, boolean aSignalDocumentEvents) throws ParsingAbortedException {
|
132
|
467
|
InputSource source = anInputSource;
|
133
|
|
|
134
|
467
|
try {
|
135
|
|
|
136
|
467
|
if ((null == anInputSource.getByteStream()) && (null == anInputSource.getCharacterStream())) {
|
137
|
218
|
if (null != anEntityResolver) {
|
138
|
218
|
source = anEntityResolver.resolveEntity(anInputSource.getPublicId(), anInputSource.getSystemId());
|
139
|
218
|
if (null == source) {
|
140
|
218
|
source = anInputSource;
|
141
|
|
}
|
142
|
|
}
|
143
|
|
}
|
144
|
|
|
145
|
467
|
INamespaceContext namespaceContext = new NamespaceContext();
|
146
|
|
|
147
|
467
|
ParserContext parserContext = new ParserContext(source, aContentHandler, anErrorHandler, anEntityResolver, namespaceContext, ourInternalEntities, aSignalDocumentEvents);
|
148
|
|
|
149
|
467
|
if (null != source.getCharacterStream()) {
|
150
|
5
|
Parser parser = getParser(source.getCharacterStream());
|
151
|
5
|
parser.initParser(parserContext);
|
152
|
5
|
parser.document();
|
153
|
|
} else {
|
154
|
462
|
InputStream inputStream = source.getByteStream();
|
155
|
462
|
if (null == inputStream) {
|
156
|
218
|
if (null == source.getSystemId()) {
|
157
|
0
|
anErrorHandler.fatalError(XmlMessages.invalidInputSource(new InputSourceLocation(anInputSource, -1, -1)));
|
158
|
|
}
|
159
|
218
|
URL url = new URL(source.getSystemId());
|
160
|
218
|
inputStream = url.openStream();
|
161
|
|
}
|
162
|
|
|
163
|
462
|
BufferedInputStream in = new BufferedInputStream(inputStream, 4096);
|
164
|
|
|
165
|
462
|
Charset charset = null;
|
166
|
462
|
if (null != source.getEncoding()) {
|
167
|
0
|
charset = Charset.forName(source.getEncoding());
|
168
|
|
} else {
|
169
|
462
|
charset = determineCharset(in, anErrorHandler, source);
|
170
|
|
}
|
171
|
|
|
172
|
462
|
in.mark(1024);
|
173
|
462
|
Reader reader = new InputStreamReader(in, charset);
|
174
|
462
|
Parser parser = getParser(reader);
|
175
|
462
|
parser.initParser(parserContext);
|
176
|
|
|
177
|
462
|
XmlDecl xmlDecl = parser.optionalXmlDecl();
|
178
|
462
|
if (null != xmlDecl) {
|
179
|
0
|
String encoding = xmlDecl.getEncoding();
|
180
|
0
|
if ((null != encoding) && !encoding.equals(charset.name())) {
|
181
|
|
// the declared encoding is different from the used encoding
|
182
|
|
// -> reset the stream and start again with the proper encoding
|
183
|
0
|
in.reset();
|
184
|
0
|
charset = Charset.forName(encoding);
|
185
|
0
|
reader = new InputStreamReader(in, charset);
|
186
|
0
|
parser.ReInit(reader);
|
187
|
|
}
|
188
|
|
|
189
|
|
}
|
190
|
|
|
191
|
462
|
parser.document();
|
192
|
|
|
193
|
|
}
|
194
|
|
|
195
|
|
} catch (ParsingAbortedException e) {
|
196
|
0
|
throw e;
|
197
|
|
} catch (TokenMgrError e) {
|
198
|
0
|
anErrorHandler.fatalError(XmlMessages.wrappedException(e, new InputSourceLocation(source, e.getErrorLine(), e.getErrorColumn())));
|
199
|
0
|
throw new ParsingAbortedException();
|
200
|
|
} catch (ParseException e) {
|
201
|
0
|
int line = -1;
|
202
|
0
|
int column = -1;
|
203
|
0
|
if (null != e.currentToken) {
|
204
|
0
|
line = e.currentToken.beginLine;
|
205
|
0
|
column = e.currentToken.beginColumn;
|
206
|
|
}
|
207
|
0
|
anErrorHandler.fatalError(XmlMessages.wrappedException(e, new InputSourceLocation(source, line, column)));
|
208
|
0
|
throw new ParsingAbortedException();
|
209
|
|
} catch (Exception e) {
|
210
|
0
|
anErrorHandler.exception(e, source);
|
211
|
0
|
throw new ParsingAbortedException();
|
212
|
|
}
|
213
|
|
|
214
|
|
}
|
215
|
|
|
216
|
|
}
|
217
|
|
|