X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2FAbstractPageRequest.java;h=baf9510b8c2ee9d560d1e888fbb58b8bd5b7b170;hb=8baad2389febcbcd9132fbb62e6329247275a000;hp=dd9e8ae71a5d94dc9856a27a253daaa4f4a3181b;hpb=0c7e22e06b8aa3e5e0e516f2f3c46eee6215bd85;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index dd9e8ae7..baf9510b 100644 --- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -19,7 +19,6 @@ package org.wamblee.crawler; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; -import java.io.PrintStream; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; @@ -62,8 +61,6 @@ public abstract class AbstractPageRequest implements PageRequest { private String _xslt; - private PrintStream _os; - /** * Constructs the request. * @@ -75,11 +72,9 @@ public abstract class AbstractPageRequest implements PageRequest { * Request parameters to use. * @param aXslt * XSLT used to convert the response. - * @param aOs - * Output stream for logging (if null then no logging is done). */ protected AbstractPageRequest(int aMaxTries, int aMaxDelay, - NameValuePair[] aParams, String aXslt, PrintStream aOs) { + NameValuePair[] aParams, String aXslt) { if (aParams == null) { throw new IllegalArgumentException("aParams is null"); } @@ -90,7 +85,6 @@ public abstract class AbstractPageRequest implements PageRequest { _maxDelay = aMaxDelay; _params = aParams; _xslt = aXslt; - _os = aOs; } /* @@ -163,14 +157,14 @@ public abstract class AbstractPageRequest implements PageRequest { Document transformed = new XSLT().transform(xhtmlData, new FileResource(new File(_xslt))); - _os.println("Transformed result is: "); + ByteArrayOutputStream os = new ByteArrayOutputStream(); Transformer transformer = TransformerFactory.newInstance() .newTransformer(); transformer.setParameter(OutputKeys.INDENT, "yes"); transformer.setParameter(OutputKeys.METHOD, "xml"); transformer.transform(new DOMSource(transformed), new StreamResult( - _os)); - + os)); + LOG.debug("Transformed result is \n" + os.toString()); return transformed; } catch (TransformerConfigurationException e) { throw new RuntimeException(e.getMessage(), e); @@ -195,24 +189,21 @@ public abstract class AbstractPageRequest implements PageRequest { tidy.setXHTML(true); tidy.setQuiet(true); tidy.setShowWarnings(false); - if (_os != null) { - _os.println("Content of '" + aMethod.getURI() + "'"); - _os.println(); - } + // We write the jtidy output to XML since the DOM tree it produces is // not namespace aware and namespace awareness is required by XSLT. // An alternative is to configure namespace awareness of the XML parser // in a system wide way. - Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), _os); + ByteArrayOutputStream os = new ByteArrayOutputStream(); + Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), os); DOMUtility.removeDuplicateAttributes(w3cDoc); + LOG.debug("Content of response is \n" + os.toString()); ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); XMLSerializer serializer = new XMLSerializer(xhtml, new OutputFormat()); serializer.serialize(w3cDoc); xhtml.flush(); - if (_os != null) { - _os.println(); - } + return xhtml.toByteArray(); }