support for parameters on actions.

[utils] / crawler / basic / src / org / wamblee / crawler / impl / CrawlerImpl.java
diff --git a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java

index 53a3873ab43e7e936f90ddbacf78464a74f73ada..098ed91f42ee30d072b2ce5d5339e37d3e8a2726 100644 (file)
--- a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java
+++ b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java
@@ -12,11 +12,12 @@
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
- */ 
+ */
  
  package org.wamblee.crawler.impl;
  
  import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
  import org.apache.commons.logging.Log;
  import org.apache.commons.logging.LogFactory;
  import org.dom4j.Element;
@@ -30,64 +31,78 @@ import org.wamblee.crawler.PageRequest;
  import org.wamblee.crawler.PageType;
  
  /**
- * Crawler implementation. 
+ * Crawler implementation.
   */
  public class CrawlerImpl implements Crawler {
-    
+
      private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
-    private static final int MAX_DELAY = 5000; 
-    
-    private HttpClient _client; 
+
+    private HttpClient _client;
+
      private Configuration _config;
-    
+
+    /**
+     * Constructs the crawler.
+     * 
+     * @param aClient
+     *            Http client to use.
+     * @param aConfig
+     *            Configuration.
+     */
      public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
-        _client = aClient; 
-        _config = aConfig; 
+        _client = aClient;
+        _config = aConfig;
      }
  
      /*
-     *  (non-Javadoc)
+     * (non-Javadoc)
+     * 
       * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
       */
-    public Page getPage(String aUrl) throws PageException {
-        LOG.info("Getting page: url = '" + aUrl + "'");
+    public Page getPage(String aUrl, NameValuePair[] aParams) throws PageException {
+        LOG.debug("Getting page: url = '" + aUrl + "'");
          PageRequest request = _config.getRequest(aUrl);
-        Document content = request.execute(aUrl, _client);
-        return transformToDom4jDoc(content); 
+        Document content = request.execute(aUrl, aParams, _client);
+        return transformToDom4jDoc(aUrl, content);
      }
- 
-    /* (non-Javadoc)
-     * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String)
+
+    /*
+     * (non-Javadoc)
+     * 
+     * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
+     *      org.apache.commons.httpclient.NameValuePair[], org.wamblee.crawler.PageType)
       */
-    public Page getPage(String aUrl, PageType aType) throws PageException {
-        LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
+    public Page getPage(String aUrl, NameValuePair[] aParams, PageType aType) throws PageException {
+        LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
          PageRequest request = _config.getRequest(aType);
-        Document content = request.execute(aUrl, _client);
-        return transformToDom4jDoc(content); 
+        Document content = request.execute(aUrl, aParams, _client);
+        return transformToDom4jDoc(aUrl, content);
      }
-    
+
      /**
-     * @param aUrl
-     * @param request
+     * Converts a W3C DOM document retrieved from aUrl into a page object.
+     * @param content DOM document. 
       * @return
       */
-    private Page transformToDom4jDoc(Document content) {
-      
+    private Page transformToDom4jDoc(String aUrl, Document content) {
          DOMReader reader = new DOMReader();
          org.dom4j.Document dom4jDoc = reader.read(content);
          Element root = dom4jDoc.getRootElement();
          dom4jDoc.remove(root);
-        
-        return new PageImpl(this, replaceReferencesWithContent(root));
+
+        return new PageImpl(aUrl, this, replaceReferencesWithContent(root));
      }
-    
+
      /**
-     * Perform crawling. Find references in the retrieved content and replace them 
-     * by the content they refer to by retrieving the appropriate pages as well. 
-     * @param content Content which must be made complete. 
-     * @return Fully processed content. 
+     * Perform crawling. Find references in the retrieved content and replace
+     * them by the content they refer to by retrieving the appropriate pages as
+     * well.
+     * 
+     * @param content
+     *            Content which must be made complete.
+     * @return Fully processed content.
       */
-    private Element replaceReferencesWithContent(Element content) { 
-        return content; // TODO implement. 
+    private Element replaceReferencesWithContent(Element content) {
+        return content; // TODO implement.
      }
  }