From 1cd31f6b7721ae7f794466471ed20ec0d7556bc1 Mon Sep 17 00:00:00 2001 From: erik Date: Thu, 16 Mar 2006 18:56:17 +0000 Subject: [PATCH] --- .../wamblee/crawler/AbstractPageRequest.java | 46 ++++- .../basic/src/org/wamblee/crawler/Action.java | 2 +- .../src/org/wamblee/crawler/Crawler.java | 4 +- .../org/wamblee/crawler/GetPageRequest.java | 37 ++-- .../src/org/wamblee/crawler/PageRequest.java | 2 +- .../org/wamblee/crawler/PostPageRequest.java | 36 ++-- .../org/wamblee/crawler/impl/ActionImpl.java | 3 +- .../src/org/wamblee/crawler/impl/App.java | 34 ++-- .../crawler/impl/ConfigurationParser.java | 7 +- .../org/wamblee/crawler/impl/CrawlerImpl.java | 10 +- trunk/crawler/kiss/build.xml | 2 +- .../org/wamblee/crawler/kiss/KissCrawler.java | 179 ++++++++++++++---- .../src/org/wamblee/crawler/kiss/Program.java | 3 +- 13 files changed, 270 insertions(+), 95 deletions(-) diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/trunk/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index 63764f53..73132aa8 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -23,6 +23,8 @@ import java.io.PrintStream; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; @@ -38,6 +40,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.w3c.dom.Document; import org.w3c.tidy.Tidy; +import org.wamblee.io.FileResource; import org.wamblee.xml.XSLT; /** @@ -47,6 +50,9 @@ public abstract class AbstractPageRequest implements PageRequest { private static final Log LOG = LogFactory.getLog(AbstractPageRequest.class); private static final String REDIRECT_HEADER = "Location"; + + private int _maxTries; + private int _maxDelay; private NameValuePair[] _params; @@ -54,13 +60,15 @@ public abstract class AbstractPageRequest implements PageRequest { private PrintStream _os; - protected AbstractPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) { + protected AbstractPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) { if ( aParams == null ) { throw new IllegalArgumentException("aParams is null"); } if ( aXslt == null ) { throw new IllegalArgumentException("aXslt is null"); } + _maxTries = aMaxTries; + _maxDelay = aMaxDelay; _params = aParams; _xslt = aXslt; _os = aOs; @@ -76,8 +84,24 @@ public abstract class AbstractPageRequest implements PageRequest { protected NameValuePair[] getParameters() { return _params; } + + protected Document executeMethod(HttpClient client, HttpMethod method) throws TransformerException { + int triesLeft = _maxTries; + while ( triesLeft > 0 ) { + triesLeft--; + try { + return executeMethodWithoutRetries(client, method); + } catch (TransformerException e) { + if ( triesLeft == 0 ) { + throw e; + } + } + } + throw new RuntimeException("Code should never reach this point"); + } + - protected Document executeMethod(HttpClient client, HttpMethod method) { + protected Document executeMethodWithoutRetries(HttpClient client, HttpMethod method) throws TransformerException { try { // Execute the method. method = executeWithRedirects(client, method); @@ -106,7 +130,7 @@ public abstract class AbstractPageRequest implements PageRequest { } xhtml.flush(); byte[] xhtmlData = xhtml.toByteArray(); - Document transformed = XSLT.transform(xhtmlData, new File(_xslt)); + Document transformed = new XSLT().transform(xhtmlData, new FileResource(new File(_xslt))); _os.println("Transformed result is: "); Transformer transformer = TransformerFactory.newInstance().newTransformer(); transformer.setParameter(OutputKeys.INDENT, "yes"); @@ -114,13 +138,26 @@ public abstract class AbstractPageRequest implements PageRequest { transformer.transform(new DOMSource(transformed), new StreamResult(_os)); return transformed; - } catch (Exception e) { + } catch (HttpException e) { + throw new RuntimeException(e.getMessage(), e); + } catch (IOException e) { + throw new RuntimeException(e.getMessage(), e); + } catch (TransformerConfigurationException e) { throw new RuntimeException(e.getMessage(), e); } finally { // Release the connection. method.releaseConnection(); } } + + private void delay() { + try { + Thread.sleep((long)((float)_maxDelay* Math.random())); + } catch (InterruptedException e) { + // + } + } + /** * @param aClient @@ -129,6 +166,7 @@ public abstract class AbstractPageRequest implements PageRequest { * @throws HttpException */ private HttpMethod executeWithRedirects(HttpClient aClient, HttpMethod aMethod) throws IOException, HttpException { + delay(); int statusCode = aClient.executeMethod(aMethod); switch (statusCode) { diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/Action.java b/trunk/crawler/basic/src/org/wamblee/crawler/Action.java index a4df7a1f..0002814d 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/Action.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/Action.java @@ -33,7 +33,7 @@ public interface Action { * Executes the action. * @return */ - Page execute(); + Page execute() throws PageException; /** * Gets a description of the action. THe element returned is the action element diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/Crawler.java b/trunk/crawler/basic/src/org/wamblee/crawler/Crawler.java index f55eebb3..07dff3fd 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/Crawler.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/Crawler.java @@ -27,7 +27,7 @@ public interface Crawler { * @param aUrl Url of page. * @return Page to retrieve. */ - Page getPage(String aUrl); + Page getPage(String aUrl) throws PageException; /** * Gets the content for a specific page. @@ -35,5 +35,5 @@ public interface Crawler { * @param aType Type of page. * @return Page. */ - Page getPage(String aUrl, PageType aType); + Page getPage(String aUrl, PageType aType) throws PageException; } diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java b/trunk/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java index 7d99c1e8..9a9d02e4 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/GetPageRequest.java @@ -12,12 +12,14 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler; import java.io.PrintStream; +import javax.xml.transform.TransformerException; + import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.NameValuePair; @@ -25,35 +27,40 @@ import org.apache.commons.httpclient.methods.GetMethod; import org.w3c.dom.Document; /** - * Gets a page by issueing a get request. + * Gets a page by issueing a get request. */ public class GetPageRequest extends AbstractPageRequest { - - public GetPageRequest(NameValuePair[] aParams, String aXslt) { - super(aParams, aXslt, null); + + public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) { + super(aMaxTries, aMaxDelay, aParams, aXslt, null); } - - public GetPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) { - super(aParams, aXslt, aOs); + + public GetPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, PrintStream aOs) { + super(aMaxTries, aMaxDelay, aParams, aXslt, aOs); } - - /* (non-Javadoc) + /* + * (non-Javadoc) + * * @see org.wamblee.crawler.PageRequest#getPage(org.apache.commons.httpclient.HttpClient) */ - public Document execute(String aUrl, HttpClient aClient) { + public Document execute(String aUrl, HttpClient aClient) + throws PageException { HttpMethod method = new GetMethod(aUrl); - if ( getParameters().length > 0 ) { + if (getParameters().length > 0) { String oldQueryString = method.getQueryString(); method.setQueryString(getParameters()); String queryString = method.getQueryString(); - if ( oldQueryString.length() > 0 ) { + if (oldQueryString.length() > 0) { queryString = queryString + '&' + oldQueryString; method.setQueryString(queryString); } } - - return executeMethod(aClient, method); + try { + return executeMethod(aClient, method); + } catch (TransformerException e) { + throw new PageException(e.getMessage(), e); + } } } diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/PageRequest.java b/trunk/crawler/basic/src/org/wamblee/crawler/PageRequest.java index cf88bbf8..753bc05b 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/PageRequest.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/PageRequest.java @@ -29,7 +29,7 @@ public interface PageRequest { * @param aClient Http client to use. * @return Client. */ - Document execute(String aUrl, HttpClient aClient); + Document execute(String aUrl, HttpClient aClient) throws PageException; /** * Overrides the Xslt to use. diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java b/trunk/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java index 10ad783a..22576522 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/PostPageRequest.java @@ -12,12 +12,14 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler; import java.io.PrintStream; +import javax.xml.transform.TransformerException; + import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.NameValuePair; import org.apache.commons.httpclient.methods.PostMethod; @@ -27,23 +29,31 @@ import org.w3c.dom.Document; * Retrieving pages using the post method. */ public class PostPageRequest extends AbstractPageRequest { - - public PostPageRequest(NameValuePair[] aParams, String aXslt) { - super(aParams, aXslt, null); + + public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt) { + super(aMaxTries, aMaxDelay, aParams, aXslt, null); } - - public PostPageRequest(NameValuePair[] aParams, String aXslt, PrintStream aOs) { - super(aParams, aXslt, aOs); + + public PostPageRequest(int aMaxTries, int aMaxDelay, NameValuePair[] aParams, String aXslt, + PrintStream aOs) { + super(aMaxTries, aMaxDelay, aParams, aXslt, aOs); } - - - /* (non-Javadoc) - * @see org.wamblee.crawler.PageRequest#execute(java.lang.String, org.apache.commons.httpclient.HttpClient) + + /* + * (non-Javadoc) + * + * @see org.wamblee.crawler.PageRequest#execute(java.lang.String, + * org.apache.commons.httpclient.HttpClient) */ - public Document execute(String aUrl, HttpClient aClient) { + public Document execute(String aUrl, HttpClient aClient) + throws PageException { PostMethod method = new PostMethod(aUrl); method.addParameters(getParameters()); - return executeMethod(aClient, method); + try { + return executeMethod(aClient, method); + } catch (TransformerException e) { + throw new PageException(e.getMessage(), e); + } } } diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java b/trunk/crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java index d0fe0806..e5dac7d0 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java @@ -20,6 +20,7 @@ import org.dom4j.Element; import org.wamblee.crawler.Action; import org.wamblee.crawler.Crawler; import org.wamblee.crawler.Page; +import org.wamblee.crawler.PageException; import org.wamblee.crawler.PageType; /** @@ -59,7 +60,7 @@ public class ActionImpl implements Action { /* (non-Javadoc) * @see org.wamblee.crawler.Action#execute() */ - public Page execute() { + public Page execute() throws PageException { if ( _type == null) { return _crawler.getPage(_reference); } diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/impl/App.java b/trunk/crawler/basic/src/org/wamblee/crawler/impl/App.java index 75fd3b09..15e740a6 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/impl/App.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/impl/App.java @@ -7,11 +7,14 @@ import java.io.InputStream; import java.io.PrintStream; import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.dom4j.Element; import org.wamblee.crawler.Action; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; import org.wamblee.crawler.Page; +import org.wamblee.crawler.PageException; /* * Copyright 2005 the original author or authors. @@ -34,12 +37,14 @@ import org.wamblee.crawler.Page; */ public class App { + private static final Log LOG = LogFactory.getLog(App.class); + private static final String LOG_FILE = "crawler.log"; public static void main(String[] args) throws Exception { String configFileName = args[0]; String starturl = args[1]; - + FileOutputStream fos = new FileOutputStream(new File(LOG_FILE)); PrintStream os = new PrintStream(fos); @@ -51,7 +56,7 @@ public class App { HttpClient client = new HttpClient(); // client.getHostConfiguration().setProxy("localhost", 3128); - + Crawler crawler = new CrawlerImpl(client, config); System.out.println("Retrieving: " + starturl); @@ -79,27 +84,28 @@ public class App { */ private static void showPage(Page aPage) { Action[] links = aPage.getActions(); - for (Action link: links) { + for (Action link : links) { System.out.println("Link found '" + link.getName() + "'"); } - Element element = aPage.getContent(); + Element element = aPage.getContent(); System.out.println("Retrieved content: " + element.asXML()); } - - private static void recordInterestingShows(Page page) { + + private static void recordInterestingShows(Page page) throws PageException { Action[] channels = page.getActions(); - for (Action channel: channels) { - examineChannel(channel.getName(), channel.execute().getAction("right-now").execute()); + for (Action channel : channels) { + examineChannel(channel.getName(), channel.execute().getAction( + "right-now").execute()); } } - - private static void examineChannel(String aChannel, Page aPage) { - Action[] programs = aPage.getActions(); - for (Action program: programs) { + + private static void examineChannel(String aChannel, Page aPage) throws PageException { + Action[] programs = aPage.getActions(); + for (Action program : programs) { System.out.println(aChannel + " - " + program.getName()); - if ( program.getName().toLowerCase().matches(".*babe.*")) { + if (program.getName().toLowerCase().matches(".*babe.*")) { Page programPage = program.execute(); - Action record = programPage.getAction("record"); + Action record = programPage.getAction("record"); System.out.println("Recording possible: " + record != null); } } diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java b/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java index 89e815c8..dafbc832 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java @@ -49,6 +49,9 @@ public class ConfigurationParser { private static final String METHOD_POST = "post"; private static final String METHOD_GET = "get"; + private static final int MAX_TRIES = 3; + private static final int MAX_DELAY = 5000; + private PrintStream _os; public ConfigurationParser(PrintStream aOs) { @@ -122,10 +125,10 @@ public class ConfigurationParser { NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]); PageRequest request; if ( METHOD_POST.equals(method)) { - request = new PostPageRequest(paramsArray, xslt, _os); + request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, xslt, _os); } else if ( METHOD_GET.equals(method) || method == null ){ - request = new GetPageRequest(paramsArray, xslt, _os); + request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, xslt, _os); } else { throw new RuntimeException("Unknown request method '" + method + "'. Only " + METHOD_GET + " and " + METHOD_POST + " are supported"); diff --git a/trunk/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java b/trunk/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java index 8db31606..53a3873a 100644 --- a/trunk/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java +++ b/trunk/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java @@ -25,6 +25,7 @@ import org.w3c.dom.Document; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; import org.wamblee.crawler.Page; +import org.wamblee.crawler.PageException; import org.wamblee.crawler.PageRequest; import org.wamblee.crawler.PageType; @@ -34,9 +35,10 @@ import org.wamblee.crawler.PageType; public class CrawlerImpl implements Crawler { private static final Log LOG = LogFactory.getLog(CrawlerImpl.class); + private static final int MAX_DELAY = 5000; private HttpClient _client; - private Configuration _config; + private Configuration _config; public CrawlerImpl(HttpClient aClient, Configuration aConfig) { _client = aClient; @@ -47,7 +49,7 @@ public class CrawlerImpl implements Crawler { * (non-Javadoc) * @see org.wamblee.crawler.Crawler#getPage(java.lang.String) */ - public Page getPage(String aUrl) { + public Page getPage(String aUrl) throws PageException { LOG.info("Getting page: url = '" + aUrl + "'"); PageRequest request = _config.getRequest(aUrl); Document content = request.execute(aUrl, _client); @@ -57,13 +59,13 @@ public class CrawlerImpl implements Crawler { /* (non-Javadoc) * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String) */ - public Page getPage(String aUrl, PageType aType) { + public Page getPage(String aUrl, PageType aType) throws PageException { LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); PageRequest request = _config.getRequest(aType); Document content = request.execute(aUrl, _client); return transformToDom4jDoc(content); } - + /** * @param aUrl * @param request diff --git a/trunk/crawler/kiss/build.xml b/trunk/crawler/kiss/build.xml index 7e625bf4..a2b58b48 100644 --- a/trunk/crawler/kiss/build.xml +++ b/trunk/crawler/kiss/build.xml @@ -17,7 +17,7 @@ &header; + depends="logging.d,mail.d,commons-codec.d,dom4j.d,xerces.d,httpclient.d,jtidy.d,wamblee.support.d,wamblee.crawler.d">