wamblee.org Git - utils/blob

   1 /*
   2  * Copyright 2005 the original author or authors.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 package org.wamblee.crawler.impl;
  18
  19 import org.apache.commons.httpclient.HttpClient;
  20 import org.apache.commons.httpclient.NameValuePair;
  21 import org.apache.commons.logging.Log;
  22 import org.apache.commons.logging.LogFactory;
  23 import org.dom4j.Element;
  24 import org.dom4j.io.DOMReader;
  25 import org.w3c.dom.Document;
  26 import org.wamblee.crawler.Configuration;
  27 import org.wamblee.crawler.Crawler;
  28 import org.wamblee.crawler.Page;
  29 import org.wamblee.crawler.PageException;
  30 import org.wamblee.crawler.PageRequest;
  31 import org.wamblee.crawler.PageType;
  32
  33 /**
  34  * Crawler implementation.
  35  */
  36 public class CrawlerImpl implements Crawler {
  37
  38     private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
  39
  40     private HttpClient _client;
  41
  42     private Configuration _config;
  43
  44     /**
  45      * Constructs the crawler.
  46      *
  47      * @param aClient
  48      *            Http client to use.
  49      * @param aConfig
  50      *            Configuration.
  51      */
  52     public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
  53         _client = aClient;
  54         _config = aConfig;
  55     }
  56
  57     /*
  58      * (non-Javadoc)
  59      *
  60      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
  61      */
  62     public Page getPage(String aUrl, NameValuePair[] aParams) throws PageException {
  63         LOG.debug("Getting page: url = '" + aUrl + "'");
  64         PageRequest request = _config.getRequest(aUrl);
  65         Document content = request.execute(aUrl, aParams, _client);
  66         return transformToDom4jDoc(aUrl, content);
  67     }
  68
  69     /*
  70      * (non-Javadoc)
  71      *
  72      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
  73      *      java.lang.String)
  74      */
  75     public Page getPage(String aUrl, NameValuePair[] aParams, PageType aType) throws PageException {
  76         LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
  77         PageRequest request = _config.getRequest(aType);
  78         Document content = request.execute(aUrl, aParams, _client);
  79         return transformToDom4jDoc(aUrl, content);
  80     }
  81
  82     /**
  83      * Converts a w3c DOM document to a page object.
  84      * @param content DOM document.
  85      * @return
  86      */
  87     private Page transformToDom4jDoc(String aUrl, Document content) {
  88         DOMReader reader = new DOMReader();
  89         org.dom4j.Document dom4jDoc = reader.read(content);
  90         Element root = dom4jDoc.getRootElement();
  91         dom4jDoc.remove(root);
  92
  93         return new PageImpl(aUrl, this, replaceReferencesWithContent(root));
  94     }
  95
  96     /**
  97      * Perform crawling. Find references in the retrieved content and replace
  98      * them by the content they refer to by retrieving the appropriate pages as
  99      * well.
 100      *
 101      * @param content
 102      *            Content which must be made complete.
 103      * @return Fully processed content.
 104      */
 105     private Element replaceReferencesWithContent(Element content) {
 106         return content; // TODO implement.
 107     }
 108 }