2 * Copyright 2005 the original author or authors.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 package org.wamblee.crawler.impl;
19 import java.net.MalformedURLException;
21 import java.util.ArrayList;
22 import java.util.List;
24 import org.apache.commons.httpclient.NameValuePair;
25 import org.dom4j.DocumentHelper;
26 import org.dom4j.Element;
27 import org.dom4j.XPath;
28 import org.wamblee.crawler.Action;
29 import org.wamblee.crawler.Crawler;
30 import org.wamblee.crawler.Page;
31 import org.wamblee.crawler.PageType;
34 * Page implementation.
36 * @author Erik Brakkee
38 public class PageImpl implements Page {
40 private static final String ELEM_NAME = "action";
42 private static final String ATT_NAME = "name";
44 private static final String ATT_HREF = "reference";
46 private static final String ATT_TYPE = "type";
48 private static final String ELEM_PARAM = "param";
50 private static final String ATT_VALUE = "value";
54 private Crawler _crawler;
56 private Element _content;
58 private Action[] _actions;
65 public PageImpl(String aHref, Crawler aCrawler, Element aContent) {
69 _actions = computeActions();
75 * @see org.wamblee.crawler.Page#getLinkNames()
77 private Action[] computeActions() {
78 XPath xpath = DocumentHelper.createXPath(ELEM_NAME);
79 List<Element> results = (List<Element>) xpath.selectNodes(_content);
80 List<Action> names = new ArrayList<Action>();
81 for (Element elem : results) {
82 String name = elem.attributeValue(ATT_NAME);
83 String href = elem.attributeValue(ATT_HREF);
84 String type = elem.attributeValue(ATT_TYPE);
85 NameValuePair[] params = getMandatoryParameters(elem);
86 href = absolutizeHref(_href, href);
88 names.add(new ActionImpl(_crawler, elem, name, href, params));
90 names.add(new ActionImpl(_crawler, elem, name, href,
91 new PageType(type), params));
94 return names.toArray(new Action[0]);
98 * Absolutize the hyperlink
99 * @param aPageHref Absolute page reference.
100 * @param aLinkHref Possibly relative link reference.
101 * @return Absolute hyperlink.
103 private String absolutizeHref(String aPageHref, String aLinkHref) {
106 URL pageUrl = new URL(aPageHref);
107 URL newUrl = new URL(pageUrl, aLinkHref);
108 return newUrl.toString(); // TODO need to use URL instead of String throughout the code.
109 } catch (MalformedURLException e) {
110 throw new RuntimeException("Malformed URL", e);
114 private NameValuePair[] getMandatoryParameters(Element aAction) {
115 List<NameValuePair> result = new ArrayList<NameValuePair>();
116 for (Element param: (List<Element>)aAction.elements(ELEM_PARAM)) {
117 String name = param.attributeValue(ATT_NAME);
118 String value = param.attributeValue(ATT_VALUE);
119 result.add(new NameValuePair(name, value));
121 return result.toArray(new NameValuePair[0]);
127 * @see org.wamblee.crawler.Page#getContent()
129 public Element getContent() {
136 * @see org.wamblee.crawler.Page#getActions()
138 public Action[] getActions() {
145 * @see org.wamblee.crawler.Page#getAction(java.lang.String)
147 public Action getAction(String aName) {
148 List<Action> results = new ArrayList<Action>();
149 for (Action action : _actions) {
150 if (action.getName().equals(aName)) {
154 if (results.size() == 0) {
157 if (results.size() > 1) {
158 throw new RuntimeException("Duplicate action '" + aName + "'");
160 return results.get(0);