X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;ds=inline;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2FAbstractPageRequest.java;h=28482d7fbd096fb8641db821cfcbaab339841726;hb=bc261b857facb7111e9d6ae68da1f5cc2400d21d;hp=7a3755febaa6fbe8d86259500317079358567b63;hpb=4ca88c7dec30b0fae2338844b44f43d4592a42c6;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index 7a3755fe..28482d7f 100644 --- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -39,8 +39,7 @@ import org.apache.xml.serialize.OutputFormat; import org.apache.xml.serialize.XMLSerializer; import org.w3c.dom.Document; import org.w3c.tidy.Tidy; -import org.wamblee.xml.ClasspathUriResolver; -import org.wamblee.xml.DOMUtility; +import org.wamblee.xml.DomUtils; import org.wamblee.xml.XslTransformer; /** @@ -57,8 +56,12 @@ public abstract class AbstractPageRequest implements PageRequest { private int _maxDelay; private NameValuePair[] _params; + + private NameValuePair[] _headers; private String _xslt; + + private XslTransformer _transformer; /** * Constructs the request. @@ -69,21 +72,28 @@ public abstract class AbstractPageRequest implements PageRequest { * Maximum delay before executing a request. * @param aParams * Request parameters to use. + * @param aHeaders + * Request headers to use. * @param aXslt * XSLT used to convert the response. */ protected AbstractPageRequest(int aMaxTries, int aMaxDelay, - NameValuePair[] aParams, String aXslt) { + NameValuePair[] aParams, NameValuePair[] aHeaders, String aXslt, XslTransformer aTransformer) { if (aParams == null) { throw new IllegalArgumentException("aParams is null"); } + if (aHeaders == null) { + throw new IllegalArgumentException("aHeaders is null"); + } if (aXslt == null) { throw new IllegalArgumentException("aXslt is null"); } _maxTries = aMaxTries; _maxDelay = aMaxDelay; _params = aParams; + _headers = aHeaders; _xslt = aXslt; + _transformer = aTransformer; } /* @@ -103,6 +113,14 @@ public abstract class AbstractPageRequest implements PageRequest { protected NameValuePair[] getParameters() { return _params; } + + /** + * Gets the headers for the request. + * @return Request headers. + */ + protected NameValuePair[] getHeaders() { + return _headers; + } /** * Executes the request with a random delay and with a maximum number of @@ -120,6 +138,11 @@ public abstract class AbstractPageRequest implements PageRequest { */ protected Document executeMethod(HttpClient aClient, HttpMethod aMethod) throws IOException, TransformerException { + + for (NameValuePair header: getHeaders()) { + aMethod.setRequestHeader(header.getName(), header.getValue()); + } + int triesLeft = _maxTries; while (triesLeft > 0) { triesLeft--; @@ -153,10 +176,9 @@ public abstract class AbstractPageRequest implements PageRequest { try { aMethod = executeWithRedirects(aClient, aMethod); byte[] xhtmlData = getXhtml(aMethod); - - XslTransformer xsltProcessor = new XslTransformer(new ClasspathUriResolver()); - Document transformed = xsltProcessor.transform(xhtmlData, - xsltProcessor.resolve(_xslt)); + + Document transformed = _transformer.transform(xhtmlData, + _transformer.resolve(_xslt)); ByteArrayOutputStream os = new ByteArrayOutputStream(); Transformer transformer = TransformerFactory.newInstance() .newTransformer(); @@ -167,7 +189,7 @@ public abstract class AbstractPageRequest implements PageRequest { LOG.debug("Transformed result is \n" + os.toString()); return transformed; } catch (TransformerConfigurationException e) { - throw new RuntimeException(e.getMessage(), e); + throw new TransformerException("Transformer configuration problem", e); } finally { // Release the connection. aMethod.releaseConnection(); @@ -196,7 +218,7 @@ public abstract class AbstractPageRequest implements PageRequest { // in a system wide way. ByteArrayOutputStream os = new ByteArrayOutputStream(); Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), os); - DOMUtility.removeDuplicateAttributes(w3cDoc); + DomUtils.removeDuplicateAttributes(w3cDoc); LOG.debug("Content of response is \n" + os.toString()); ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); @@ -251,7 +273,7 @@ public abstract class AbstractPageRequest implements PageRequest { // recursion. } default: { - throw new RuntimeException("Method failed: " + throw new IOException("Method failed: " + aMethod.getStatusLine()); } }