wamblee.org Git - utils/blob - crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java

   1 /*
   2  * Copyright 2005 the original author or authors.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 package org.wamblee.crawler.impl;
  18
  19 import org.apache.commons.httpclient.HttpClient;
  20 import org.apache.commons.logging.Log;
  21 import org.apache.commons.logging.LogFactory;
  22 import org.dom4j.Element;
  23 import org.dom4j.io.DOMReader;
  24 import org.w3c.dom.Document;
  25 import org.wamblee.crawler.Configuration;
  26 import org.wamblee.crawler.Crawler;
  27 import org.wamblee.crawler.Page;
  28 import org.wamblee.crawler.PageException;
  29 import org.wamblee.crawler.PageRequest;
  30 import org.wamblee.crawler.PageType;
  31
  32 /**
  33  * Crawler implementation.
  34  */
  35 public class CrawlerImpl implements Crawler {
  36
  37     private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
  38
  39     private HttpClient _client;
  40
  41     private Configuration _config;
  42
  43     /**
  44      * Constructs the crawler.
  45      *
  46      * @param aClient
  47      *            Http client to use.
  48      * @param aConfig
  49      *            Configuration.
  50      */
  51     public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
  52         _client = aClient;
  53         _config = aConfig;
  54     }
  55
  56     /*
  57      * (non-Javadoc)
  58      *
  59      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
  60      */
  61     public Page getPage(String aUrl) throws PageException {
  62         LOG.debug("Getting page: url = '" + aUrl + "'");
  63         PageRequest request = _config.getRequest(aUrl);
  64         Document content = request.execute(aUrl, _client);
  65         return transformToDom4jDoc(content);
  66     }
  67
  68     /*
  69      * (non-Javadoc)
  70      *
  71      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
  72      *      java.lang.String)
  73      */
  74     public Page getPage(String aUrl, PageType aType) throws PageException {
  75         LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
  76         PageRequest request = _config.getRequest(aType);
  77         Document content = request.execute(aUrl, _client);
  78         return transformToDom4jDoc(content);
  79     }
  80
  81     /**
  82      * Converts a w3c DOM document to a page object.
  83      * @param content DOM document.
  84      * @return
  85      */
  86     private Page transformToDom4jDoc(Document content) {
  87         DOMReader reader = new DOMReader();
  88         org.dom4j.Document dom4jDoc = reader.read(content);
  89         Element root = dom4jDoc.getRootElement();
  90         dom4jDoc.remove(root);
  91
  92         return new PageImpl(this, replaceReferencesWithContent(root));
  93     }
  94
  95     /**
  96      * Perform crawling. Find references in the retrieved content and replace
  97      * them by the content they refer to by retrieving the appropriate pages as
  98      * well.
  99      *
 100      * @param content
 101      *            Content which must be made complete.
 102      * @return Fully processed content.
 103      */
 104     private Element replaceReferencesWithContent(Element content) {
 105         return content; // TODO implement.
 106     }
 107 }