X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FCrawlerImpl.java;h=098ed91f42ee30d072b2ce5d5339e37d3e8a2726;hb=f53c06ddca33e21e772c479179b7f858a3a8b8d4;hp=8db31606fc476e419cde8a4055a5286ad8f2c324;hpb=81bc61121a8f17f754fc99eb66603a59df242ddc;p=utils

diff --git a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java
index 8db31606..098ed91f 100644
--- a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java
+++ b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java
@@ -12,11 +12,12 @@
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- */ 
+ */
 
 package org.wamblee.crawler.impl;
 
 import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.dom4j.Element;
@@ -25,67 +26,83 @@ import org.w3c.dom.Document;
 import org.wamblee.crawler.Configuration;
 import org.wamblee.crawler.Crawler;
 import org.wamblee.crawler.Page;
+import org.wamblee.crawler.PageException;
 import org.wamblee.crawler.PageRequest;
 import org.wamblee.crawler.PageType;
 
 /**
- * Crawler implementation. 
+ * Crawler implementation.
  */
 public class CrawlerImpl implements Crawler {
-    
+
     private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
-    
-    private HttpClient _client; 
-    private Configuration _config; 
-    
+
+    private HttpClient _client;
+
+    private Configuration _config;
+
+    /**
+     * Constructs the crawler.
+     * 
+     * @param aClient
+     *            Http client to use.
+     * @param aConfig
+     *            Configuration.
+     */
     public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
-        _client = aClient; 
-        _config = aConfig; 
+        _client = aClient;
+        _config = aConfig;
     }
 
     /*
-     *  (non-Javadoc)
+     * (non-Javadoc)
+     * 
      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
      */
-    public Page getPage(String aUrl) {
-        LOG.info("Getting page: url = '" + aUrl + "'");
+    public Page getPage(String aUrl, NameValuePair[] aParams) throws PageException {
+        LOG.debug("Getting page: url = '" + aUrl + "'");
         PageRequest request = _config.getRequest(aUrl);
-        Document content = request.execute(aUrl, _client);
-        return transformToDom4jDoc(content); 
+        Document content = request.execute(aUrl, aParams, _client);
+        return transformToDom4jDoc(aUrl, content);
     }
- 
-    /* (non-Javadoc)
-     * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String)
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
+     *      org.apache.commons.httpclient.NameValuePair[], org.wamblee.crawler.PageType)
      */
-    public Page getPage(String aUrl, PageType aType) {
-        LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
+    public Page getPage(String aUrl, NameValuePair[] aParams, PageType aType) throws PageException {
+        LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
         PageRequest request = _config.getRequest(aType);
-        Document content = request.execute(aUrl, _client);
-        return transformToDom4jDoc(content); 
+        Document content = request.execute(aUrl, aParams, _client);
+        return transformToDom4jDoc(aUrl, content);
     }
 
     /**
-     * @param aUrl
-     * @param request
+     * Converts the w3c DOM document retrieved for aUrl into a page object.
+     * @param content DOM document.
      * @return
      */
-    private Page transformToDom4jDoc(Document content) {
-      
+    private Page transformToDom4jDoc(String aUrl, Document content) {
         DOMReader reader = new DOMReader();
         org.dom4j.Document dom4jDoc = reader.read(content);
         Element root = dom4jDoc.getRootElement();
         dom4jDoc.remove(root);
-        
-        return new PageImpl(this, replaceReferencesWithContent(root));
+
+        return new PageImpl(aUrl, this, replaceReferencesWithContent(root));
     }
-    
+
     /**
-     * Perform crawling. Find references in the retrieved content and replace them 
-     * by the content they refer to by retrieving the appropriate pages as well. 
-     * @param content Content which must be made complete. 
-     * @return Fully processed content. 
+     * Perform crawling. Find references in the retrieved content and replace
+     * them by the content they refer to by retrieving the appropriate pages as
+     * well.
+     * 
+     * @param content
+     *            Content which must be made complete.
+     * @return Fully processed content.
      */
-    private Element replaceReferencesWithContent(Element content) { 
-        return content; // TODO implement. 
+    private Element replaceReferencesWithContent(Element content) {
+        return content; // TODO implement.
     }
 }