wamblee.org Git - utils/blob - crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java

   1 /*
   2  * Copyright 2005 the original author or authors.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 package org.wamblee.crawler.impl;
  18
  19 import org.apache.commons.httpclient.HttpClient;
  20 import org.apache.commons.logging.Log;
  21 import org.apache.commons.logging.LogFactory;
  22 import org.dom4j.Element;
  23 import org.dom4j.io.DOMReader;
  24 import org.w3c.dom.Document;
  25 import org.wamblee.crawler.Configuration;
  26 import org.wamblee.crawler.Crawler;
  27 import org.wamblee.crawler.Page;
  28 import org.wamblee.crawler.PageException;
  29 import org.wamblee.crawler.PageRequest;
  30 import org.wamblee.crawler.PageType;
  31
  32 /**
  33  * Crawler implementation.
  34  */
  35 public class CrawlerImpl implements Crawler {
  36
  37     private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
  38     private static final int MAX_DELAY = 5000;
  39
  40     private HttpClient _client;
  41     private Configuration _config;
  42
  43     public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
  44         _client = aClient;
  45         _config = aConfig;
  46     }
  47
  48     /*
  49      *  (non-Javadoc)
  50      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
  51      */
  52     public Page getPage(String aUrl) throws PageException {
  53         LOG.info("Getting page: url = '" + aUrl + "'");
  54         PageRequest request = _config.getRequest(aUrl);
  55         Document content = request.execute(aUrl, _client);
  56         return transformToDom4jDoc(content);
  57     }
  58
  59     /* (non-Javadoc)
  60      * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String)
  61      */
  62     public Page getPage(String aUrl, PageType aType) throws PageException {
  63         LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
  64         PageRequest request = _config.getRequest(aType);
  65         Document content = request.execute(aUrl, _client);
  66         return transformToDom4jDoc(content);
  67     }
  68
  69     /**
  70      * @param aUrl
  71      * @param request
  72      * @return
  73      */
  74     private Page transformToDom4jDoc(Document content) {
  75
  76         DOMReader reader = new DOMReader();
  77         org.dom4j.Document dom4jDoc = reader.read(content);
  78         Element root = dom4jDoc.getRootElement();
  79         dom4jDoc.remove(root);
  80
  81         return new PageImpl(this, replaceReferencesWithContent(root));
  82     }
  83
  84     /**
  85      * Perform crawling. Find references in the retrieved content and replace them
  86      * by the content they refer to by retrieving the appropriate pages as well.
  87      * @param content Content which must be made complete.
  88      * @return Fully processed content.
  89      */
  90     private Element replaceReferencesWithContent(Element content) {
  91         return content; // TODO implement.
  92     }
  93 }