package org.wamblee.crawler.impl;
import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Element;
+ /** Logger for this class. */
private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
- private static final int MAX_DELAY = 5000;
-
+ /** HTTP client handed to PageRequest.execute when a page is fetched. */
private HttpClient _client;
+ /** Configuration from which getPage obtains the PageRequest for a URL or a page type. */
private Configuration _config;
*
* @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
*/
- public Page getPage(String aUrl) throws PageException {
- LOG.info("Getting page: url = '" + aUrl + "'");
+ public Page getPage(String aUrl, NameValuePair[] aParams) throws PageException {
+ // NOTE(review): the @see tag above still names the one-argument
+ // Crawler#getPage(java.lang.String); confirm it matches the interface
+ // method now that aParams has been added to this signature.
+ // Only the URL is logged; aParams is not part of the message.
+ LOG.debug("Getting page: url = '" + aUrl + "'");
+ // The request definition is looked up in the configuration by URL.
PageRequest request = _config.getRequest(aUrl);
- Document content = request.execute(aUrl, _client);
- return transformToDom4jDoc(content);
+ // aParams is passed to execute unchanged; how it is used is decided by
+ // PageRequest, which is not visible here.
+ Document content = request.execute(aUrl, aParams, _client);
+ // The URL is forwarded so that the resulting page is constructed with it.
+ return transformToDom4jDoc(aUrl, content);
}
/*
+ * Fetches the page at the given URL using the request that the configuration
+ * associates with the given page type, and returns the result as a Page.
+ *
+ * aUrl    - URL to fetch; also forwarded to transformToDom4jDoc.
+ * aParams - parameters passed unchanged to PageRequest.execute.
+ * aType   - page type used to look up the PageRequest in the configuration.
+ *
+ * Throws PageException as declared; the visible code does not show which
+ * call raises it.
+ *
+ * NOTE(review): the @see below names (java.lang.String, java.lang.String),
+ * which matches neither the removed (String, PageType) signature nor the new
+ * (String, NameValuePair[], PageType) one - confirm against Crawler.
+ *
* @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
* java.lang.String)
*/
- public Page getPage(String aUrl, PageType aType) throws PageException {
- LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
+ public Page getPage(String aUrl, NameValuePair[] aParams, PageType aType) throws PageException {
+ // URL and type are logged; aParams is not part of the message.
+ LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
+ // Unlike the overload without a type, the request is selected by page type
+ // rather than by URL.
PageRequest request = _config.getRequest(aType);
- Document content = request.execute(aUrl, _client);
- return transformToDom4jDoc(content);
+ Document content = request.execute(aUrl, aParams, _client);
+ return transformToDom4jDoc(aUrl, content);
}
/**
+ * Reads the given DOM document into a dom4j document, detaches its root
+ * element and wraps that element, after it has been passed through
+ * replaceReferencesWithContent, in a new PageImpl.
+ *
+ * @param aUrl URL supplied by the caller (getPage passes the URL it fetched);
+ *            forwarded as the first argument of the PageImpl constructor.
* @param content DOM document.
- * @return
+ * @return A new PageImpl built from aUrl, this crawler and the processed
+ *         root element.
*/
- private Page transformToDom4jDoc(Document content) {
+ private Page transformToDom4jDoc(String aUrl, Document content) {
DOMReader reader = new DOMReader();
org.dom4j.Document dom4jDoc = reader.read(content);
Element root = dom4jDoc.getRootElement();
+ // Detach the root element from its document before handing it on.
dom4jDoc.remove(root);
- return new PageImpl(this, replaceReferencesWithContent(root));
+ return new PageImpl(aUrl, this, replaceReferencesWithContent(root));
}
/**