From 602681b15e687db2a7ea0e92796857a6df31bba2 Mon Sep 17 00:00:00 2001 From: richard Date: Fri, 17 Mar 2006 18:05:41 +0000 Subject: [PATCH] --- .classpath | 1 + .../wamblee/crawler/AbstractPageRequest.java | 71 +++++++++++-------- .../org/wamblee/crawler/kiss/KissCrawler.java | 2 +- support/src/org/wamblee/xml/DOMUtility.java | 71 +++++++++++++++++++ 4 files changed, 115 insertions(+), 30 deletions(-) create mode 100644 support/src/org/wamblee/xml/DOMUtility.java diff --git a/.classpath b/.classpath index c8cf0086..0aa75ddd 100644 --- a/.classpath +++ b/.classpath @@ -71,5 +71,6 @@ + diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index 30477204..fa41a680 100644 --- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -19,6 +19,7 @@ package org.wamblee.crawler; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; +import java.io.OutputStream; import java.io.PrintStream; import javax.xml.transform.OutputKeys; @@ -35,12 +36,16 @@ import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.NameValuePair; +import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.xml.serialize.OutputFormat; +import org.apache.xml.serialize.XMLSerializer; import org.w3c.dom.Document; import org.w3c.tidy.Tidy; import org.wamblee.io.FileResource; +import org.wamblee.xml.DOMUtility; import org.wamblee.xml.XSLT; /** @@ -137,36 +142,9 @@ public abstract class AbstractPageRequest implements PageRequest { protected Document executeMethodWithoutRetries(HttpClient aClient, HttpMethod aMethod) throws TransformerException { try { - // Execute the method. aMethod = executeWithRedirects(aClient, aMethod); - - // Transform the HTML into wellformed XML. - Tidy tidy = new Tidy(); - tidy.setXHTML(true); - tidy.setQuiet(true); - tidy.setShowWarnings(false); - if (_os != null) { - _os.println("Content of '" + aMethod.getURI() + "'"); - _os.println(); - } - // We let jtidy produce raw output because the DOM it produces is - // is not namespace aware. We let the XSLT processor parse the XML - // again - // to ensure that the XSLT uses a namespace aware DOM tree. An - // alternative - // is to configure namespace awareness of the XML parser in a system - // wide way. - ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); - tidy.parse(aMethod.getResponseBodyAsStream(), xhtml); - _os.print(new String(xhtml.toByteArray())); - // Obtaining the XML as dom is not used. - // Document w3cDoc = tidy.parseDOM(method.getResponseBodyAsStream(), - // _os); - if (_os != null) { - _os.println(); - } - xhtml.flush(); - byte[] xhtmlData = xhtml.toByteArray(); + byte[] xhtmlData = getXhtml(aMethod); + Document transformed = new XSLT().transform(xhtmlData, new FileResource(new File(_xslt))); _os.println("Transformed result is: "); @@ -190,6 +168,41 @@ public abstract class AbstractPageRequest implements PageRequest { } } + /** + * Gets the result of the HTTP method as an XHTML document. + * @param aMethod Method to invoke. + * @return XHTML as a byte array. + * @throws URIException In case of poblems with the URI + * @throws IOException In case of problems obtaining the XHTML. + */ + private byte[] getXhtml(HttpMethod aMethod) throws URIException, IOException { + // Transform the HTML into wellformed XML. + Tidy tidy = new Tidy(); + tidy.setXHTML(true); + tidy.setQuiet(true); + tidy.setShowWarnings(false); + if (_os != null) { + _os.println("Content of '" + aMethod.getURI() + "'"); + _os.println(); + } + // We write the jtidy output to XML since the DOM tree it produces is + // not namespace aware and namespace awareness is required by XSLT. + // An alternative is to configure namespace awareness of the XML parser + // in a system wide way. + Document w3cDoc = tidy.parseDOM(aMethod.getResponseBodyAsStream(), + _os); + DOMUtility.removeDuplicateAttributes(w3cDoc); + + ByteArrayOutputStream xhtml = new ByteArrayOutputStream(); + XMLSerializer serializer = new XMLSerializer(xhtml, new OutputFormat()); + serializer.serialize(w3cDoc); + xhtml.flush(); + if (_os != null) { + _os.println(); + } + return xhtml.toByteArray(); + } + /** * Sleeps for a random time but no more than the maximum delay. * diff --git a/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java b/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java index 5ada1009..fc076a5a 100644 --- a/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java +++ b/crawler/kiss/src/org/wamblee/crawler/kiss/KissCrawler.java @@ -118,7 +118,7 @@ public class KissCrawler { try { HttpClient client = new HttpClient(); - // client.getHostConfiguration().setProxy("localhost", 3128); + client.getHostConfiguration().setProxy("127.0.0.1", 3128); Crawler crawler = createCrawler(aCrawlerConfig, os, client); diff --git a/support/src/org/wamblee/xml/DOMUtility.java b/support/src/org/wamblee/xml/DOMUtility.java new file mode 100644 index 00000000..0efcc973 --- /dev/null +++ b/support/src/org/wamblee/xml/DOMUtility.java @@ -0,0 +1,71 @@ +package org.wamblee.xml; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import org.w3c.dom.Attr; +import org.w3c.dom.Element; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import sun.security.krb5.internal.ktab.l; + +/** + * Utility class for performing various operations on DOM trees. + */ +public final class DOMUtility { + + /** + * Disabled constructor. + * + */ + private DOMUtility() { + // Empty + } + + /** + * Removes duplicate attributes from a DOM tree. + * @param aNode Node to remove duplicate attributes from (recursively). + * Attributes of the node itself are not dealt with. Only the child + * nodes are dealt with. + */ + public static void removeDuplicateAttributes(Node aNode) { + NodeList list = aNode.getChildNodes(); + for (int i = 0; i < list.getLength(); i++) { + Node node = list.item(i); + if ( node instanceof Element ) { + removeDuplicateAttributes((Element)node); + removeDuplicateAttributes(node); + } + } + } + + /** + * Removes duplicate attributes from an element. + * @param aElement Element. + */ + private static void removeDuplicateAttributes(Element aElement) { + NamedNodeMap attributes = aElement.getAttributes(); + Map uniqueAttributes = new TreeMap(); + List attlist = new ArrayList(); + for (int i = 0; i < attributes.getLength(); i++) { + Attr attribute = (Attr)attributes.item(i); + if ( uniqueAttributes.containsKey(attribute.getNodeName())) { + System.out.println("Detected duplicate attribute '" + attribute.getNodeName() + "'"); + } + uniqueAttributes.put(attribute.getNodeName(), attribute); + attlist.add(attribute); + } + // Remove all attributes from the element. + for (Attr att: attlist) { + aElement.removeAttributeNode(att); + } + // Add the unique attributes back to the element. + for (Attr att: uniqueAttributes.values()) { + aElement.setAttributeNode(att); + } + } +} -- 2.31.1