X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2FAbstractPageRequest.java;h=fa41a680ec39d2ad0d9583de051bae337e475db8;hb=602681b15e687db2a7ea0e92796857a6df31bba2;hp=304772043f9aea11852c7b47d196390c8ed7c69d;hpb=33e20b064b1afdda35b18827a55b2b69535e64ae;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index 30477204..fa41a680 100644 --- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -19,6 +19,7 @@ package org.wamblee.crawler; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; +import java.io.OutputStream; import java.io.PrintStream; import javax.xml.transform.OutputKeys; @@ -35,12 +36,16 @@ import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.NameValuePair; +import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.xml.serialize.OutputFormat; +import org.apache.xml.serialize.XMLSerializer; import org.w3c.dom.Document; import org.w3c.tidy.Tidy; import org.wamblee.io.FileResource; +import org.wamblee.xml.DOMUtility; import org.wamblee.xml.XSLT; /** @@ -137,36 +142,9 @@ public abstract class AbstractPageRequest implements PageRequest { protected Document executeMethodWithoutRetries(HttpClient aClient, HttpMethod aMethod) throws TransformerException { try { - // Execute the method. aMethod = executeWithRedirects(aClient, aMethod); - - // Transform the HTML into wellformed XML. 
- Tidy tidy = new Tidy(); - tidy.setXHTML(true); - tidy.setQuiet(true); - tidy.setShowWarnings(false); - if (_os != null) { - _os.println("Content of '" + aMethod.getURI() + "'"); - _os.println(); - } - // We let jtidy produce raw output because the DOM it produces is - // is not namespace aware. We let the XSLT processor parse the XML - // again - // to ensure that the XSLT uses a namespace aware DOM tree. An - // alternative - // is to configure namespace awareness of the XML parser in a system - // wide way. - ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); - tidy.parse(aMethod.getResponseBodyAsStream(), xhtml); - _os.print(new String(xhtml.toByteArray())); - // Obtaining the XML as dom is not used. - // Document w3cDoc = tidy.parseDOM(method.getResponseBodyAsStream(), - // _os); - if (_os != null) { - _os.println(); - } - xhtml.flush(); - byte[] xhtmlData = xhtml.toByteArray(); + byte[] xhtmlData = getXhtml(aMethod); + Document transformed = new XSLT().transform(xhtmlData, new FileResource(new File(_xslt))); _os.println("Transformed result is: "); @@ -190,6 +168,41 @@ public abstract class AbstractPageRequest implements PageRequest { } } + /** + * Gets the result of the HTTP method as an XHTML document. + * @param aMethod Method to invoke. + * @return XHTML as a byte array. + * @throws URIException In case of problems with the URI + * @throws IOException In case of problems obtaining the XHTML. + */ + private byte[] getXhtml(HttpMethod aMethod) throws URIException, IOException { + // Transform the HTML into wellformed XML. + Tidy tidy = new Tidy(); + tidy.setXHTML(true); + tidy.setQuiet(true); + tidy.setShowWarnings(false); + if (_os != null) { + _os.println("Content of '" + aMethod.getURI() + "'"); + _os.println(); + } + // We write the jtidy output to XML since the DOM tree it produces is + // not namespace aware and namespace awareness is required by XSLT. 
+ // An alternative is to configure namespace awareness of the XML parser + // in a system wide way. + Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), + _os); + DOMUtility.removeDuplicateAttributes(w3cDoc); + + ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); + XMLSerializer serializer = new XMLSerializer(xhtml, new OutputFormat()); + serializer.serialize(w3cDoc); + xhtml.flush(); + if (_os != null) { + _os.println(); + } + return xhtml.toByteArray(); + } + /** * Sleeps for a random time but no more than the maximum delay. *