2 * Copyright 2005 the original author or authors.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 package org.wamblee.crawler.impl;
19 import org.apache.commons.httpclient.HttpClient;
20 import org.apache.commons.httpclient.NameValuePair;
21 import org.apache.commons.logging.Log;
22 import org.apache.commons.logging.LogFactory;
23 import org.dom4j.Element;
24 import org.dom4j.io.DOMReader;
25 import org.w3c.dom.Document;
26 import org.wamblee.crawler.Configuration;
27 import org.wamblee.crawler.Crawler;
28 import org.wamblee.crawler.Page;
29 import org.wamblee.crawler.PageException;
30 import org.wamblee.crawler.PageRequest;
31 import org.wamblee.crawler.PageType;
34 * Crawler implementation.
36 * @author Erik Brakkee
38 public class CrawlerImpl implements Crawler {
40 private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
42 private HttpClient _client;
44 private Configuration _config;
47 * Constructs the crawler.
54 public CrawlerImpl(HttpClient aClient, Configuration aConfig) {
62 * @see org.wamblee.crawler.Crawler#getPage(java.lang.String)
64 public Page getPage(String aUrl, NameValuePair[] aParams) throws PageException {
65 LOG.debug("Getting page: url = '" + aUrl + "'");
66 PageRequest request = _config.getRequest(aUrl);
67 Document content = request.execute(aUrl, aParams, _client);
68 return transformToDom4jDoc(aUrl, content);
74 * @see org.wamblee.crawler.Crawler#getPage(java.lang.String,
77 public Page getPage(String aUrl, NameValuePair[] aParams, PageType aType) throws PageException {
78 LOG.debug("Getting page: url = '" + aUrl + "', type = '" + aType + "'");
79 PageRequest request = _config.getRequest(aType);
80 Document content = request.execute(aUrl, aParams, _client);
81 return transformToDom4jDoc(aUrl, content);
85 * Converts a w3c DOM document to a page object.
86 * @param content DOM document.
89 private Page transformToDom4jDoc(String aUrl, Document content) {
90 DOMReader reader = new DOMReader();
91 org.dom4j.Document dom4jDoc = reader.read(content);
92 Element root = dom4jDoc.getRootElement();
93 dom4jDoc.remove(root);
95 return new PageImpl(aUrl, this, replaceReferencesWithContent(root));
99 * Perform crawling. Find references in the retrieved content and replace
100 * them by the content they refer to by retrieving the appropriate pages as
104 * Content which must be made complete.
105 * @return Fully processed content.
107 private Element replaceReferencesWithContent(Element content) {
108 return content; // TODO implement.