import org.wamblee.crawler.Configuration;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageException;
import org.wamblee.crawler.PageRequest;
import org.wamblee.crawler.PageType;
public class CrawlerImpl implements Crawler {
private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
+ private static final int MAX_DELAY = 5000;
private HttpClient _client;
- private Configuration _config;
+ private Configuration _config;
public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
_client = aClient;
* (non-Javadoc)
* @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
*/
- public Page getPage(String aUrl) {
+ public Page getPage(String aUrl) throws PageException {
LOG.info("Getting page: url = '" + aUrl + "'");
PageRequest request = _config.getRequest(aUrl);
Document content = request.execute(aUrl, _client);
/* (non-Javadoc)
* @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String)
*/
- public Page getPage(String aUrl, PageType aType) {
+ public Page getPage(String aUrl, PageType aType) throws PageException {
// Patch note: the removed ('-') signature above is superseded by the added ('+') one,
// which additionally declares the checked PageException.
// NOTE(review): which of the calls below raises PageException is not visible here - confirm.
//
// Fetches the page at aUrl, selecting the request by the explicitly given page type.

// Trace the call at info level, quoting both the URL and the page type.
LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
// Look up the request by page type; the single-argument overload looks it up by URL instead.
PageRequest request = _config.getRequest(aType);
// Execute the request for this URL through the configured HttpClient.
Document content = request.execute(aUrl, _client);
// Pass the resulting document to transformToDom4jDoc (defined outside this view) and return its result.
return transformToDom4jDoc(content);
}
-
+
/**
* @param aUrl
* @param request