+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpMethod;
+import org.apache.commons.httpclient.HttpStatus;
+import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.w3c.dom.Document;
+import org.w3c.tidy.Tidy;
+import org.wamblee.xml.XSLT;
+
+/**
+ * General support class for all kinds of requests.
+ *
+ * It executes an HTTP method (following redirects), converts the HTML
+ * response into XHTML with JTidy and applies an XSLT stylesheet to the result.
+ * When a print stream is supplied, the intermediate XHTML and the transformed
+ * document are also written to it.
+ */
+public abstract class AbstractPageRequest implements PageRequest {
+
+    private static final Log LOG = LogFactory.getLog(AbstractPageRequest.class);
+    private static final String REDIRECT_HEADER = "Location";
+
+    /**
+     * Upper bound on the number of redirects followed for a single request.
+     * It is a safety cap against redirect cycles.
+     */
+    private static final int MAX_REDIRECTS = 10;
+
+    private final NameValuePair[] _params;
+
+    private String _xslt;
+
+    /** Optional stream receiving diagnostic output; may be null. */
+    private final PrintStream _os;
+
+    /**
+     * Constructs the request.
+     *
+     * @param aParams Request parameters, must not be null.
+     * @param aXslt Path of the XSLT file used to transform the page, must not
+     *            be null.
+     * @param aOs Stream to which the tidied page and the transformed result
+     *            are written, or null to suppress this output.
+     * @throws IllegalArgumentException if aParams or aXslt is null.
+     */
+    protected AbstractPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) {
+        if ( aParams == null ) {
+            throw new IllegalArgumentException("aParams is null");
+        }
+        if ( aXslt == null ) {
+            throw new IllegalArgumentException("aXslt is null");
+        }
+        _params = aParams;
+        _xslt = aXslt;
+        _os = aOs;
+    }
+
+    /* (non-Javadoc)
+     * @see org.wamblee.crawler.PageRequest#overrideXslt(java.lang.String)
+     */
+    public void overrideXslt(String aXslt) {
+        // Same precondition as the constructor: fail here instead of later
+        // when the stylesheet file is opened.
+        if ( aXslt == null ) {
+            throw new IllegalArgumentException("aXslt is null");
+        }
+        _xslt = aXslt;
+    }
+
+    /**
+     * Gets the request parameters passed at construction.
+     *
+     * @return The parameter array itself (not a copy).
+     */
+    protected NameValuePair[] getParameters() {
+        return _params;
+    }
+
+    /**
+     * Executes the method, tidies the response into XHTML and transforms it
+     * with the configured XSLT.
+     *
+     * @param client HTTP client used to execute the method.
+     * @param method Method to execute.
+     * @return The document produced by the XSLT transformation.
+     * @throws RuntimeException wrapping any failure during execution, tidying
+     *             or transformation.
+     */
+    protected Document executeMethod(HttpClient client, HttpMethod method) {
+        try {
+            // Execute the method. After this call 'method' is the method that
+            // produced the final response, which may differ from the one
+            // passed in when redirects were followed.
+            method = executeWithRedirects(client, method);
+
+            // Transform the HTML into wellformed XML.
+            Tidy tidy = new Tidy();
+            tidy.setXHTML(true);
+            tidy.setQuiet(true);
+            tidy.setShowWarnings(false);
+
+            // We let jtidy produce raw output because the DOM it produces
+            // is not namespace aware. We let the XSLT processor parse the XML
+            // again to ensure that the XSLT uses a namespace aware DOM tree.
+            // An alternative is to configure namespace awareness of the XML
+            // parser in a system wide way.
+            ByteArrayOutputStream xhtml = new ByteArrayOutputStream();
+            tidy.parse(method.getResponseBodyAsStream(), xhtml);
+            byte[] xhtmlData = xhtml.toByteArray();
+
+            if ( _os != null ) {
+                _os.println("Content of '" + method.getURI() + "'");
+                _os.println();
+                // Diagnostic dump only; decoded with the platform default
+                // charset.
+                _os.print(new String(xhtmlData));
+                _os.println();
+            }
+
+            Document transformed = XSLT.transform(xhtmlData, new File(_xslt));
+
+            if ( _os != null ) {
+                _os.println("Transformed result is: ");
+                Transformer transformer = TransformerFactory.newInstance().newTransformer();
+                // Serialization settings are output properties, not
+                // stylesheet parameters.
+                transformer.setOutputProperty(OutputKeys.INDENT, "yes");
+                transformer.setOutputProperty(OutputKeys.METHOD, "xml");
+                transformer.transform(new DOMSource(transformed), new StreamResult(_os));
+            }
+
+            return transformed;
+        } catch (Exception e) {
+            throw new RuntimeException(e.getMessage(), e);
+        } finally {
+            // Release the connection.
+            method.releaseConnection();
+        }
+    }
+
+    /**
+     * Executes a method and follows 301, 302 and 303 redirects by issuing a
+     * GET request to the URL in the Location header, up to MAX_REDIRECTS
+     * times.
+     *
+     * @param aClient HTTP client used to execute the requests.
+     * @param aMethod Initial method to execute.
+     * @return The method that produced the 200 response. The caller must
+     *         release its connection.
+     * @throws IOException In case of I/O problems.
+     * @throws HttpException In case of protocol problems, a redirect without
+     *             a Location header, or too many redirects.
+     * @throws RuntimeException if a response has a status other than 200,
+     *             301, 302 or 303.
+     */
+    private HttpMethod executeWithRedirects(HttpClient aClient, HttpMethod aMethod) throws IOException, HttpException {
+        HttpMethod current = aMethod;
+        int redirects = 0;
+        boolean succeeded = false;
+        try {
+            while (true) {
+                int statusCode = aClient.executeMethod(current);
+
+                switch (statusCode) {
+                case HttpStatus.SC_OK: {
+                    succeeded = true;
+                    return current;
+                }
+                case HttpStatus.SC_MOVED_PERMANENTLY:
+                case HttpStatus.SC_MOVED_TEMPORARILY:
+                case HttpStatus.SC_SEE_OTHER: {
+                    Header header = current.getResponseHeader(REDIRECT_HEADER);
+                    String target = (header == null) ? null : header.getValue();
+                    if ( target == null ) {
+                        throw new HttpException("Redirect with status " + statusCode
+                                + " has no '" + REDIRECT_HEADER + "' header");
+                    }
+                    redirects++;
+                    if ( redirects > MAX_REDIRECTS ) {
+                        throw new HttpException("Giving up after " + MAX_REDIRECTS
+                                + " redirects, last target was '" + target + "'");
+                    }
+                    current.releaseConnection();
+                    current = new GetMethod(target);
+                    break; // leaves the switch; the loop executes the new method.
+                }
+                default: {
+                    throw new RuntimeException("Method failed: "
+                            + current.getStatusLine());
+                }
+                }
+            }
+        } finally {
+            // On failure, release a method created here for a redirect
+            // target. The method passed in is released by the caller.
+            if ( !succeeded && current != aMethod ) {
+                current.releaseConnection();
+            }
+        }
+    }
+}