X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FCrawlerImpl.java;h=53a3873ab43e7e936f90ddbacf78464a74f73ada;hb=23ca0d89ea9870f730cb96ba83a3d4b32a7dd2e6;hp=8db31606fc476e419cde8a4055a5286ad8f2c324;hpb=30671b398473b876e5c42d063f0c8e169ad3163c;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java index 8db31606..53a3873a 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java @@ -25,6 +25,7 @@ import org.w3c.dom.Document; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; import org.wamblee.crawler.Page; +import org.wamblee.crawler.PageException; import org.wamblee.crawler.PageRequest; import org.wamblee.crawler.PageType; @@ -34,9 +35,10 @@ import org.wamblee.crawler.PageType; public class CrawlerImpl implements Crawler { private static final Log LOG = LogFactory.getLog(CrawlerImpl.class); + private static final int MAX_DELAY = 5000; private HttpClient _client; - private Configuration _config; + private Configuration _config; public CrawlerImpl(HttpClient aClient, Configuration aConfig) { _client = aClient; @@ -47,7 +49,7 @@ public class CrawlerImpl implements Crawler { * (non-Javadoc) * @see org.wamblee.crawler.Crawler#getPage(java.lang.String) */ - public Page getPage(String aUrl) { + public Page getPage(String aUrl) throws PageException { LOG.info("Getting page: url = '" + aUrl + "'"); PageRequest request = _config.getRequest(aUrl); Document content = request.execute(aUrl, _client); @@ -57,13 +59,13 @@ public class CrawlerImpl implements Crawler { /* (non-Javadoc) * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String) */ - public Page getPage(String aUrl, PageType aType) { + public Page getPage(String aUrl, PageType aType) throws PageException { LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); PageRequest request = _config.getRequest(aType); Document content = request.execute(aUrl, _client); return transformToDom4jDoc(content); } - + /** * @param aUrl * @param request