--- /dev/null
+/*
+ * Copyright 2005 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wamblee.crawler.impl;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Element;
+import org.dom4j.io.DOMReader;
+import org.w3c.dom.Document;
+import org.wamblee.crawler.Configuration;
+import org.wamblee.crawler.Crawler;
+import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageException;
+import org.wamblee.crawler.PageRequest;
+import org.wamblee.crawler.PageType;
+
+/**
+ * Crawler implementation.
+ */
+public class CrawlerImpl implements Crawler {
+
+    private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
+
+    /** HTTP client handed to every page request that gets executed. */
+    private final HttpClient _client;
+
+    /** Source of the page request used for a given URL or page type. */
+    private final Configuration _config;
+
+    /**
+     * Creates a crawler on top of the given HTTP client and configuration.
+     *
+     * @param aClient
+     *            Http client passed to the page requests when they execute.
+     * @param aConfig
+     *            Configuration from which the page requests are obtained.
+     */
+    public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
+        _config = aConfig;
+        _client = aClient;
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * Looks the request up in the configuration by URL.
+     *
+     * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
+     *      org.apache.commons.httpclient.NameValuePair[])
+     */
+    public Page getPage(String aUrl, NameValuePair[] aParams) throws PageException {
+        LOG.debug("Getting page: url = '" + aUrl + "'");
+        return retrievePage(_config.getRequest(aUrl), aUrl, aParams);
+    }
+
+    /*
+     * (non-Javadoc)
+     *
+     * Looks the request up in the configuration by page type instead of URL.
+     *
+     * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
+     *      org.apache.commons.httpclient.NameValuePair[],
+     *      org.wamblee.crawler.PageType)
+     */
+    public Page getPage(String aUrl, NameValuePair[] aParams, PageType aType) throws PageException {
+        LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
+        return retrievePage(_config.getRequest(aType), aUrl, aParams);
+    }
+
+    /**
+     * Executes a page request and wraps the resulting document in a page.
+     *
+     * @param aRequest
+     *            Request obtained from the configuration.
+     * @param aUrl
+     *            Url to retrieve.
+     * @param aParams
+     *            Parameters passed on to the request.
+     * @return Page built from the document the request returned.
+     * @throws PageException
+     *             Propagated from the executed request.
+     */
+    private Page retrievePage(PageRequest aRequest, String aUrl, NameValuePair[] aParams)
+            throws PageException {
+        Document w3cDocument = aRequest.execute(aUrl, aParams, _client);
+        return toPage(aUrl, w3cDocument);
+    }
+
+    /**
+     * Converts a w3c DOM document to a page object.
+     *
+     * @param aUrl
+     *            Url the document was retrieved from; passed on to the page.
+     * @param aDocument
+     *            W3c DOM document to convert.
+     * @return Page for the given url, created with this crawler and the
+     *         processed root element of the converted document.
+     */
+    private Page toPage(String aUrl, Document aDocument) {
+        org.dom4j.Document converted = new DOMReader().read(aDocument);
+        Element rootElement = converted.getRootElement();
+        // Take the root out of the converted document before handing it on;
+        // presumably so the element is no longer tied to this temporary
+        // document (TODO confirm against PageImpl).
+        converted.remove(rootElement);
+
+        Element completed = replaceReferencesWithContent(rootElement);
+        return new PageImpl(aUrl, this, completed);
+    }
+
+    /**
+     * Perform crawling. Intended to find references in the retrieved content
+     * and replace them by the content they refer to, retrieving the
+     * appropriate pages as well. Not implemented yet: the content is
+     * currently returned unchanged.
+     *
+     * @param aContent
+     *            Content which must be made complete.
+     * @return Fully processed content; at present the same element that was
+     *         passed in.
+     */
+    private Element replaceReferencesWithContent(Element aContent) {
+        return aContent; // TODO implement.
+    }
+}