X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FCrawlerImpl.java;fp=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FCrawlerImpl.java;h=53a3873ab43e7e936f90ddbacf78464a74f73ada;hb=d4bb47fd284738756cd112b788a49caa1a9d5c38;hp=8db31606fc476e419cde8a4055a5286ad8f2c324;hpb=1785ad1948da7bf80f07d705c968726991507376;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java index 8db31606..53a3873a 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java @@ -25,6 +25,7 @@ import org.w3c.dom.Document; import org.wamblee.crawler.Configuration; import org.wamblee.crawler.Crawler; import org.wamblee.crawler.Page; +import org.wamblee.crawler.PageException; import org.wamblee.crawler.PageRequest; import org.wamblee.crawler.PageType; @@ -34,9 +35,10 @@ import org.wamblee.crawler.PageType; public class CrawlerImpl implements Crawler { private static final Log LOG = LogFactory.getLog(CrawlerImpl.class); + private static final int MAX_DELAY = 5000; private HttpClient _client; - private Configuration _config; + private Configuration _config; public CrawlerImpl(HttpClient aClient, Configuration aConfig) { _client = aClient; @@ -47,7 +49,7 @@ public class CrawlerImpl implements Crawler { * (non-Javadoc) * @see org.wamblee.crawler.Crawler#getPage(java.lang.String) */ - public Page getPage(String aUrl) { + public Page getPage(String aUrl) throws PageException { LOG.info("Getting page: url = '" + aUrl + "'"); PageRequest request = _config.getRequest(aUrl); Document content = request.execute(aUrl, _client); @@ -57,13 +59,13 @@ public class CrawlerImpl implements Crawler { /* (non-Javadoc) * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String) */ - public Page getPage(String aUrl, PageType aType) { + public Page getPage(String aUrl, PageType aType) throws PageException { LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); PageRequest request = _config.getRequest(aType); Document content = request.execute(aUrl, _client); return transformToDom4jDoc(content); } - + /** * @param aUrl * @param request