X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2FAbstractPageRequest.java;h=5cb4fae6e40bc2f3b0215118bad67ca52028e657;hb=e1aafb0930f726a00368ce3468a48193d0fb6fac;hp=144abe786317e7c8e62d4452c23ad0380c0d29fd;hpb=2415e335184c5d6f58f261d26b95f6c22f55ae0d;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index 144abe78..5cb4fae6 100644 --- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -17,9 +17,7 @@ package org.wamblee.crawler; import java.io.ByteArrayOutputStream; -import java.io.File; import java.io.IOException; -import java.io.PrintStream; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; @@ -34,7 +32,6 @@ import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.NameValuePair; -import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -42,9 +39,8 @@ import org.apache.xml.serialize.OutputFormat; import org.apache.xml.serialize.XMLSerializer; import org.w3c.dom.Document; import org.w3c.tidy.Tidy; -import org.wamblee.io.FileResource; import org.wamblee.xml.DOMUtility; -import org.wamblee.xml.XSLT; +import org.wamblee.xml.XslTransformer; /** * General support claas for all kinds of requests. @@ -62,8 +58,8 @@ public abstract class AbstractPageRequest implements PageRequest { private NameValuePair[] _params; private String _xslt; - - private PrintStream _os; + + private XslTransformer _transformer; /** * Constructs the request. @@ -76,11 +72,9 @@ public abstract class AbstractPageRequest implements PageRequest { * Request parameters to use. * @param aXslt * XSLT used to convert the response. - * @param aOs - * Output stream for logging (if null then no logging is done). */ protected AbstractPageRequest(int aMaxTries, int aMaxDelay, - NameValuePair[] aParams, String aXslt, PrintStream aOs) { + NameValuePair[] aParams, String aXslt, XslTransformer aTransformer) { if (aParams == null) { throw new IllegalArgumentException("aParams is null"); } @@ -91,7 +85,7 @@ public abstract class AbstractPageRequest implements PageRequest { _maxDelay = aMaxDelay; _params = aParams; _xslt = aXslt; - _os = aOs; + _transformer = aTransformer; } /* @@ -162,16 +156,16 @@ public abstract class AbstractPageRequest implements PageRequest { aMethod = executeWithRedirects(aClient, aMethod); byte[] xhtmlData = getXhtml(aMethod); - Document transformed = new XSLT().transform(xhtmlData, - new FileResource(new File(_xslt))); - _os.println("Transformed result is: "); + Document transformed = _transformer.transform(xhtmlData, + _transformer.resolve(_xslt)); + ByteArrayOutputStream os = new ByteArrayOutputStream(); Transformer transformer = TransformerFactory.newInstance() .newTransformer(); transformer.setParameter(OutputKeys.INDENT, "yes"); transformer.setParameter(OutputKeys.METHOD, "xml"); transformer.transform(new DOMSource(transformed), new StreamResult( - _os)); - + os)); + LOG.debug("Transformed result is \n" + os.toString()); return transformed; } catch (TransformerConfigurationException e) { throw new RuntimeException(e.getMessage(), e); @@ -196,24 +190,21 @@ public abstract class AbstractPageRequest implements PageRequest { tidy.setXHTML(true); tidy.setQuiet(true); tidy.setShowWarnings(false); - if (_os != null) { - _os.println("Content of '" + aMethod.getURI() + "'"); - _os.println(); - } + // We write the jtidy output to XML since the DOM tree it produces is // not namespace aware and namespace awareness is required by XSLT. // An alternative is to configure namespace awareness of the XML parser // in a system wide way. - Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), _os); + ByteArrayOutputStream os = new ByteArrayOutputStream(); + Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), os); DOMUtility.removeDuplicateAttributes(w3cDoc); + LOG.debug("Content of response is \n" + os.toString()); ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); XMLSerializer serializer = new XMLSerializer(xhtml, new OutputFormat()); serializer.serialize(w3cDoc); xhtml.flush(); - if (_os != null) { - _os.println(); - } + return xhtml.toByteArray(); }