* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */
+ */
package org.wamblee.crawler.impl;
import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Element;
import org.wamblee.crawler.PageType;
/**
- * Crawler implementation.
+ * Crawler implementation.
+ * <p>
+ * Pages are retrieved by executing a page request obtained from the
+ * configuration with the HTTP client; the resulting document is wrapped in
+ * a page object.
*/
public class CrawlerImpl implements Crawler {
-
+
private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
- private static final int MAX_DELAY = 5000;
-
- private HttpClient _client;
+
+ private HttpClient _client;
+
private Configuration _config;
-
+
+ /**
+ * Constructs the crawler.
+ *
+ * @param aClient
+ * Http client to use.
+ * @param aConfig
+ * Configuration.
+ */
public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
- _client = aClient;
- _config = aConfig;
+ _client = aClient;
+ _config = aConfig;
}
/*
- * (non-Javadoc)
+ * (non-Javadoc)
+ *
+ * Retrieves a page using the request obtained from the configuration for
+ * the given URL; the parameters are passed on to that request.
+ *
- * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
+ * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
+ * org.apache.commons.httpclient.NameValuePair[])
*/
- public Page getPage(String aUrl) throws PageException {
- LOG.info("Getting page: url = '" + aUrl + "'");
+ public Page getPage(String aUrl, NameValuePair[] aParams) throws PageException {
+ LOG.debug("Getting page: url = '" + aUrl + "'");
PageRequest request = _config.getRequest(aUrl);
- Document content = request.execute(aUrl, _client);
- return transformToDom4jDoc(content);
+ Document content = request.execute(aUrl, aParams, _client);
+ return transformToDom4jDoc(aUrl, content);
}
-
- /* (non-Javadoc)
- * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String)
+
+ /*
+ * (non-Javadoc)
+ *
+ * Retrieves a page using the request obtained from the configuration for
+ * the given page type; the parameters are passed on to that request.
+ *
+ * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
+ * org.apache.commons.httpclient.NameValuePair[],
+ * org.wamblee.crawler.PageType)
*/
- public Page getPage(String aUrl, PageType aType) throws PageException {
- LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
+ public Page getPage(String aUrl, NameValuePair[] aParams, PageType aType) throws PageException {
+ LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
PageRequest request = _config.getRequest(aType);
- Document content = request.execute(aUrl, _client);
- return transformToDom4jDoc(content);
+ Document content = request.execute(aUrl, aParams, _client);
+ return transformToDom4jDoc(aUrl, content);
}
-
+
/**
- * @param aUrl
- * @param request
+ * Converts a W3C DOM document to a page object. The document is read into
+ * a dom4j document, its root element is detached from that document and
+ * passed through {@link #replaceReferencesWithContent(Element)}, and the
+ * result is wrapped in a {@link PageImpl}.
+ *
+ * @param aUrl
+ * URL the content was requested with; handed to the page object.
+ * @param content
+ * DOM document.
* @return
+ * Page built from the root element of the converted document.
*/
- private Page transformToDom4jDoc(Document content) {
-
+ private Page transformToDom4jDoc(String aUrl, Document content) {
DOMReader reader = new DOMReader();
org.dom4j.Document dom4jDoc = reader.read(content);
Element root = dom4jDoc.getRootElement();
dom4jDoc.remove(root);
-
- return new PageImpl(this, replaceReferencesWithContent(root));
+
+ return new PageImpl(aUrl, this, replaceReferencesWithContent(root));
}
-
+
/**
- * Perform crawling. Find references in the retrieved content and replace them
- * by the content they refer to by retrieving the appropriate pages as well.
- * @param content Content which must be made complete.
- * @return Fully processed content.
+ * Perform crawling. Find references in the retrieved content and replace
+ * them by the content they refer to by retrieving the appropriate pages as
+ * well.
+ * <p>
+ * Not implemented yet: the content is currently returned unchanged.
+ *
+ * @param content
+ * Content which must be made complete.
+ * @return Fully processed content.
*/
- private Element replaceReferencesWithContent(Element content) {
- return content; // TODO implement.
+ private Element replaceReferencesWithContent(Element content) {
+ return content; // TODO implement.
}
}