From bc261b857facb7111e9d6ae68da1f5cc2400d21d Mon Sep 17 00:00:00 2001 From: erik Date: Tue, 15 Aug 2006 20:11:27 +0000 Subject: [PATCH 1/1] added support for configuring request headers. Required since kiss checks for this header now. --- .../wamblee/crawler/AbstractPageRequest.java | 25 +++++++++++++-- .../org/wamblee/crawler/GetPageRequest.java | 6 ++-- .../org/wamblee/crawler/PostPageRequest.java | 8 +++-- .../crawler/impl/ConfigurationParser.java | 31 +++++++++++++------ 4 files changed, 55 insertions(+), 15 deletions(-) diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index 7607f764..28482d7f 100644 --- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -56,6 +56,8 @@ public abstract class AbstractPageRequest implements PageRequest { private int _maxDelay; private NameValuePair[] _params; + + private NameValuePair[] _headers; private String _xslt; @@ -70,20 +72,26 @@ public abstract class AbstractPageRequest implements PageRequest { * Maximum delay before executing a request. * @param aParams * Request parameters to use. + * @param aHeaders + * Request headers to use. * @param aXslt * XSLT used to convert the response. */ protected AbstractPageRequest(int aMaxTries, int aMaxDelay, - NameValuePair[] aParams, String aXslt, XslTransformer aTransformer) { + NameValuePair[] aParams, NameValuePair[] aHeaders, String aXslt, XslTransformer aTransformer) { if (aParams == null) { throw new IllegalArgumentException("aParams is null"); } + if (aHeaders == null) { + throw new IllegalArgumentException("aHeaders is null"); + } if (aXslt == null) { throw new IllegalArgumentException("aXslt is null"); } _maxTries = aMaxTries; _maxDelay = aMaxDelay; _params = aParams; + _headers = aHeaders; _xslt = aXslt; _transformer = aTransformer; } @@ -105,6 +113,14 @@ public abstract class AbstractPageRequest implements PageRequest { protected NameValuePair[] getParameters() { return _params; } + + /** + * Gets the headers for the request. + * @return Request headers. + */ + protected NameValuePair[] getHeaders() { + return _headers; + } /** * Executes the request with a random delay and with a maximum number of @@ -122,6 +138,11 @@ public abstract class AbstractPageRequest implements PageRequest { */ protected Document executeMethod(HttpClient aClient, HttpMethod aMethod) throws IOException, TransformerException { + + for (NameValuePair header: getHeaders()) { + aMethod.setRequestHeader(header.getName(), header.getValue()); + } + int triesLeft = _maxTries; while (triesLeft > 0) { triesLeft--; @@ -155,7 +176,7 @@ public abstract class AbstractPageRequest implements PageRequest { try { aMethod = executeWithRedirects(aClient, aMethod); byte[] xhtmlData = getXhtml(aMethod); - + Document transformed = _transformer.transform(xhtmlData, _transformer.resolve(_xslt)); ByteArrayOutputStream os = new ByteArrayOutputStream(); diff --git a/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java b/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java index ac1f9301..40a34211 100644 --- a/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java @@ -37,10 +37,12 @@ public class GetPageRequest extends AbstractPageRequest { * @param aMaxTries Maximum number of retries. * @param aMaxDelay Maximum delay before executing the request. * @param aParams Request parameters to use. + * @param aHeaders Request headers to use. * @param aXslt XSLT to use. */ - public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, XslTransformer aTransformer) { - super(aMaxTries, aMaxDelay, aParams, aXslt, aTransformer); + public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, + NameValuePair[] aHeaders, String aXslt, XslTransformer aTransformer) { + super(aMaxTries, aMaxDelay, aParams, aHeaders, aXslt, aTransformer); } /* diff --git a/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java b/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java index a090627f..af160f60 100644 --- a/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java @@ -36,10 +36,14 @@ public class PostPageRequest extends AbstractPageRequest { * @param aMaxTries Maximum number of retries. * @param aMaxDelay Maximum delay before executing the request. * @param aParams Request parameters to use. + * @param aHeaders Request headers to use. * @param aXslt XSLT to use. */ - public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, XslTransformer aTransformer) { - super(aMaxTries, aMaxDelay, aParams, aXslt, aTransformer); + public PostPageRequest(int aMaxTries, int aMaxDelay, + NameValuePair[] aParams, + NameValuePair[] aHeaders, + String aXslt, XslTransformer aTransformer) { + super(aMaxTries, aMaxDelay, aParams, aHeaders, aXslt, aTransformer); } /* diff --git a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java index 7e15d4a3..3069104d 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java @@ -48,6 +48,8 @@ public class ConfigurationParser { private static final String ELEM_XSLT = "xslt"; private static final String ELEM_PARAM = "param"; + + private static final String ELEM_HEADER = "header"; private static final String AT_NAME = "name"; @@ -149,20 +151,17 @@ public class ConfigurationParser { private PageRequest parseRequestConfig(Element aElem) { String method = aElem.elementText(ELEM_METHOD); String xslt = aElem.elementText(ELEM_XSLT); - List params = new ArrayList(); - for (Iterator i = aElem.elementIterator(ELEM_PARAM); i.hasNext();) { - Element paramElem = (Element) i.next(); - NameValuePair param = parseParameter(paramElem); - params.add(param); - } - + List params = parseNameValuePairs(aElem, ELEM_PARAM); + List headers = parseNameValuePairs(aElem, ELEM_HEADER); + NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]); + NameValuePair[] headersArray = headers.toArray(new NameValuePair[0]); PageRequest request; if (METHOD_POST.equals(method)) { - request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, + request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, headersArray, xslt, _transformer); } else if (METHOD_GET.equals(method) || method == null) { - request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, + request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, headersArray, xslt, _transformer); } else { throw new RuntimeException("Unknown request method '" + method @@ -172,6 +171,20 @@ public class ConfigurationParser { return request; } + /** + * @param aElem + * @return + */ + private List parseNameValuePairs(Element aElem, String aElemName) { + List headers = new ArrayList(); + for (Iterator i = aElem.elementIterator(aElemName); i.hasNext();) { + Element paramElem = (Element) i.next(); + NameValuePair header = parseParameter(paramElem); + headers.add(header); + } + return headers; + } + /** * Parses a parameter definition. * @param aParam Parameter. -- 2.31.1