X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2FAbstractPageRequest.java;h=2e598005ff3a453150a50884cccfd649889ee3c6;hb=5685a836b9208ff8babfe5ac5b30c5f86d27cf96;hp=dd9e8ae71a5d94dc9856a27a253daaa4f4a3181b;hpb=0c7e22e06b8aa3e5e0e516f2f3c46eee6215bd85;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index dd9e8ae7..2e598005 100644 --- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -17,9 +17,10 @@ package org.wamblee.crawler; import java.io.ByteArrayOutputStream; -import java.io.File; import java.io.IOException; -import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; @@ -41,9 +42,8 @@ import org.apache.xml.serialize.OutputFormat; import org.apache.xml.serialize.XMLSerializer; import org.w3c.dom.Document; import org.w3c.tidy.Tidy; -import org.wamblee.io.FileResource; -import org.wamblee.xml.DOMUtility; -import org.wamblee.xml.XSLT; +import org.wamblee.xml.DomUtils; +import org.wamblee.xml.XslTransformer; /** * General support claas for all kinds of requests. @@ -59,10 +59,12 @@ public abstract class AbstractPageRequest implements PageRequest { private int _maxDelay; private NameValuePair[] _params; + + private NameValuePair[] _headers; private String _xslt; - - private PrintStream _os; + + private XslTransformer _transformer; /** * Constructs the request. @@ -73,24 +75,28 @@ public abstract class AbstractPageRequest implements PageRequest { * Maximum delay before executing a request. * @param aParams * Request parameters to use. + * @param aHeaders + * Request headers to use. * @param aXslt * XSLT used to convert the response. - * @param aOs - * Output stream for logging (if null then no logging is done). */ protected AbstractPageRequest(int aMaxTries, int aMaxDelay, - NameValuePair[] aParams, String aXslt, PrintStream aOs) { + NameValuePair[] aParams, NameValuePair[] aHeaders, String aXslt, XslTransformer aTransformer) { if (aParams == null) { throw new IllegalArgumentException("aParams is null"); } + if (aHeaders == null) { + throw new IllegalArgumentException("aHeaders is null"); + } if (aXslt == null) { throw new IllegalArgumentException("aXslt is null"); } _maxTries = aMaxTries; _maxDelay = aMaxDelay; _params = aParams; + _headers = aHeaders; _xslt = aXslt; - _os = aOs; + _transformer = aTransformer; } /* @@ -105,10 +111,23 @@ public abstract class AbstractPageRequest implements PageRequest { /** * Gets the parameters for the request. * + * @param aParams Additional parameters to use, obtained from another page, most likely as + * hidden form fields. * @return Request parameters. */ - protected NameValuePair[] getParameters() { - return _params; + protected NameValuePair[] getParameters(NameValuePair[] aParams) { + List params = new ArrayList(); + params.addAll(Arrays.asList(_params)); + params.addAll(Arrays.asList(aParams)); + return params.toArray(new NameValuePair[0]); + } + + /** + * Gets the headers for the request. + * @return Request headers. + */ + protected NameValuePair[] getHeaders() { + return _headers; } /** @@ -127,6 +146,11 @@ public abstract class AbstractPageRequest implements PageRequest { */ protected Document executeMethod(HttpClient aClient, HttpMethod aMethod) throws IOException, TransformerException { + + for (NameValuePair header: getHeaders()) { + aMethod.setRequestHeader(header.getName(), header.getValue()); + } + int triesLeft = _maxTries; while (triesLeft > 0) { triesLeft--; @@ -160,20 +184,21 @@ public abstract class AbstractPageRequest implements PageRequest { try { aMethod = executeWithRedirects(aClient, aMethod); byte[] xhtmlData = getXhtml(aMethod); - - Document transformed = new XSLT().transform(xhtmlData, - new FileResource(new File(_xslt))); - _os.println("Transformed result is: "); + + + Document transformed = _transformer.transform(xhtmlData, + _transformer.resolve(_xslt)); + ByteArrayOutputStream os = new ByteArrayOutputStream(); Transformer transformer = TransformerFactory.newInstance() .newTransformer(); transformer.setParameter(OutputKeys.INDENT, "yes"); transformer.setParameter(OutputKeys.METHOD, "xml"); transformer.transform(new DOMSource(transformed), new StreamResult( - _os)); - + os)); + LOG.debug("Transformed result is \n" + os.toString()); return transformed; } catch (TransformerConfigurationException e) { - throw new RuntimeException(e.getMessage(), e); + throw new TransformerException("Transformer configuration problem", e); } finally { // Release the connection. aMethod.releaseConnection(); @@ -195,24 +220,21 @@ public abstract class AbstractPageRequest implements PageRequest { tidy.setXHTML(true); tidy.setQuiet(true); tidy.setShowWarnings(false); - if (_os != null) { - _os.println("Content of '" + aMethod.getURI() + "'"); - _os.println(); - } + // We write the jtidy output to XML since the DOM tree it produces is // not namespace aware and namespace awareness is required by XSLT. // An alternative is to configure namespace awareness of the XML parser // in a system wide way. - Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), _os); - DOMUtility.removeDuplicateAttributes(w3cDoc); + ByteArrayOutputStream os = new ByteArrayOutputStream(); + Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), os); + DomUtils.removeDuplicateAttributes(w3cDoc); + LOG.debug("Content of response is \n" + os.toString()); ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); XMLSerializer serializer = new XMLSerializer(xhtml, new OutputFormat()); serializer.serialize(w3cDoc); xhtml.flush(); - if (_os != null) { - _os.println(); - } + return xhtml.toByteArray(); } @@ -260,7 +282,7 @@ public abstract class AbstractPageRequest implements PageRequest { // recursion. } default: { - throw new RuntimeException("Method failed: " + throw new IOException("Method failed: " + aMethod.getStatusLine()); } }