git://wamblee.org
/
utils
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
added support for configuring request headers.
[utils]
/
crawler
/
basic
/
src
/
org
/
wamblee
/
crawler
/
AbstractPageRequest.java
diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java
index 66627ca523faa4c56a112811e7ccdf9381107825..28482d7fbd096fb8641db821cfcbaab339841726 100644
(file)
--- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java
+++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java
@@ -39,9 +39,8 @@ import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.w3c.dom.Document;
import org.w3c.tidy.Tidy;
import org.apache.xml.serialize.XMLSerializer;
import org.w3c.dom.Document;
import org.w3c.tidy.Tidy;
-import org.wamblee.xml.ClasspathUriResolver;
-import org.wamblee.xml.DOMUtility;
-import org.wamblee.xml.XSLT;
+import org.wamblee.xml.DomUtils;
+import org.wamblee.xml.XslTransformer;
/**
* General support claas for all kinds of requests.
/**
* General support claas for all kinds of requests.
@@ -57,8 +56,12 @@ public abstract class AbstractPageRequest implements PageRequest {
private int _maxDelay;
private NameValuePair[] _params;
private int _maxDelay;
private NameValuePair[] _params;
+
+ private NameValuePair[] _headers;
private String _xslt;
private String _xslt;
+
+ private XslTransformer _transformer;
/**
* Constructs the request.
/**
* Constructs the request.
@@ -69,21 +72,28 @@ public abstract class AbstractPageRequest implements PageRequest {
* Maximum delay before executing a request.
* @param aParams
* Request parameters to use.
* Maximum delay before executing a request.
* @param aParams
* Request parameters to use.
+ * @param aHeaders
+ * Request headers to use.
* @param aXslt
* XSLT used to convert the response.
*/
protected AbstractPageRequest(int aMaxTries, int aMaxDelay,
* @param aXslt
* XSLT used to convert the response.
*/
protected AbstractPageRequest(int aMaxTries, int aMaxDelay,
- NameValuePair[] aParams, String aXslt) {
+ NameValuePair[] aParams, NameValuePair[] aHeaders, String aXslt, XslTransformer aTransformer) {
if (aParams == null) {
throw new IllegalArgumentException("aParams is null");
}
if (aParams == null) {
throw new IllegalArgumentException("aParams is null");
}
+ if (aHeaders == null) {
+ throw new IllegalArgumentException("aHeaders is null");
+ }
if (aXslt == null) {
throw new IllegalArgumentException("aXslt is null");
}
_maxTries = aMaxTries;
_maxDelay = aMaxDelay;
_params = aParams;
if (aXslt == null) {
throw new IllegalArgumentException("aXslt is null");
}
_maxTries = aMaxTries;
_maxDelay = aMaxDelay;
_params = aParams;
+ _headers = aHeaders;
_xslt = aXslt;
_xslt = aXslt;
+ _transformer = aTransformer;
}
/*
}
/*
@@ -103,6 +113,14 @@ public abstract class AbstractPageRequest implements PageRequest {
protected NameValuePair[] getParameters() {
return _params;
}
protected NameValuePair[] getParameters() {
return _params;
}
+
+ /**
+ * Gets the headers for the request.
+ * @return Request headers.
+ */
+ protected NameValuePair[] getHeaders() {
+ return _headers;
+ }
/**
* Executes the request with a random delay and with a maximum number of
/**
* Executes the request with a random delay and with a maximum number of
@@ -120,6 +138,11 @@ public abstract class AbstractPageRequest implements PageRequest {
*/
protected Document executeMethod(HttpClient aClient, HttpMethod aMethod)
throws IOException, TransformerException {
*/
protected Document executeMethod(HttpClient aClient, HttpMethod aMethod)
throws IOException, TransformerException {
+
+ for (NameValuePair header: getHeaders()) {
+ aMethod.setRequestHeader(header.getName(), header.getValue());
+ }
+
int triesLeft = _maxTries;
while (triesLeft > 0) {
triesLeft--;
int triesLeft = _maxTries;
while (triesLeft > 0) {
triesLeft--;
@@ -153,10 +176,9 @@ public abstract class AbstractPageRequest implements PageRequest {
try {
aMethod = executeWithRedirects(aClient, aMethod);
byte[] xhtmlData = getXhtml(aMethod);
try {
aMethod = executeWithRedirects(aClient, aMethod);
byte[] xhtmlData = getXhtml(aMethod);
-
- XSLT xsltProcessor = new XSLT(new ClasspathUriResolver());
- Document transformed = xsltProcessor.transform(xhtmlData,
- xsltProcessor.resolve(_xslt));
+
+ Document transformed = _transformer.transform(xhtmlData,
+ _transformer.resolve(_xslt));
ByteArrayOutputStream os = new ByteArrayOutputStream();
Transformer transformer = TransformerFactory.newInstance()
.newTransformer();
ByteArrayOutputStream os = new ByteArrayOutputStream();
Transformer transformer = TransformerFactory.newInstance()
.newTransformer();
@@ -167,7 +189,7 @@ public abstract class AbstractPageRequest implements PageRequest {
LOG.debug("Transformed result is \n" + os.toString());
return transformed;
} catch (TransformerConfigurationException e) {
LOG.debug("Transformed result is \n" + os.toString());
return transformed;
} catch (TransformerConfigurationException e) {
- throw new RuntimeException(e.getMessage(), e);
+ throw new TransformerException("Transformer configuration problem", e);
} finally {
// Release the connection.
aMethod.releaseConnection();
} finally {
// Release the connection.
aMethod.releaseConnection();
@@ -196,7 +218,7 @@ public abstract class AbstractPageRequest implements PageRequest {
// in a system wide way.
ByteArrayOutputStream os = new ByteArrayOutputStream();
Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), os);
// in a system wide way.
ByteArrayOutputStream os = new ByteArrayOutputStream();
Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), os);
- DOMUtility.removeDuplicateAttributes(w3cDoc);
+ DomUtils.removeDuplicateAttributes(w3cDoc);
LOG.debug("Content of response is \n" + os.toString());
ByteArrayOutputStream xhtml = new ByteArrayOutputStream();
LOG.debug("Content of response is \n" + os.toString());
ByteArrayOutputStream xhtml = new ByteArrayOutputStream();
@@ -251,7 +273,7 @@ public abstract class AbstractPageRequest implements PageRequest {
// recursion.
}
default: {
// recursion.
}
default: {
- throw new RuntimeException("Method failed: "
+ throw new IOException("Method failed: "
+ aMethod.getStatusLine());
}
}
+ aMethod.getStatusLine());
}
}