2 * Copyright 2005 the original author or authors.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 package org.wamblee.crawler.impl;
19 import java.net.MalformedURLException;
21 import java.util.ArrayList;
22 import java.util.List;
24 import org.apache.commons.httpclient.NameValuePair;
25 import org.dom4j.DocumentHelper;
26 import org.dom4j.Element;
27 import org.dom4j.XPath;
28 import org.wamblee.crawler.Action;
29 import org.wamblee.crawler.Crawler;
30 import org.wamblee.crawler.Page;
31 import org.wamblee.crawler.PageType;
34 * Page implementation.
36 public class PageImpl implements Page {
// Name of the elements that describe actions; it is also used verbatim as the
// XPath expression that selects those elements from the page content.
38 private static final String ELEM_NAME = "action";
// Attribute that holds the name of an action element or of a param element.
40 private static final String ATT_NAME = "name";
// Attribute that holds the link of an action; it is resolved against the page
// reference before the action is created.
42 private static final String ATT_HREF = "reference";
// Attribute that holds the page type of an action; its value is wrapped in a PageType.
44 private static final String ATT_TYPE = "type";
// Name of the child elements of an action element that carry its parameters.
46 private static final String ELEM_PARAM = "param";
// Attribute that holds the value of a param element.
48 private static final String ATT_VALUE = "value";
// Crawler that is handed to every action created for this page.
52 private Crawler _crawler;
// Content of the page; the action elements are selected from it.
54 private Element _content;
// Actions of this page, computed in the constructor by computeActions().
56 private Action[] _actions;
// Creates a page; its actions are computed here and kept in _actions.
63 public PageImpl(String aHref, Crawler aCrawler, Element aContent) {
// computeActions() reads _href, _crawler and _content, so it must only run after
// those fields have been set.
// NOTE(review): their assignments are not visible in this view - presumably they
// are initialized from the three constructor parameters; confirm.
67 _actions = computeActions();
73 * Computes the actions described by the "action" elements of the page content.
75 private Action[] computeActions() {
// One Action is created for every element selected from the page content.
// The XPath is the bare element name, evaluated with the page content as context.
76 XPath xpath = DocumentHelper.createXPath(ELEM_NAME);
// Generic casts are unchecked: a selected node that is not an Element would only
// surface as a ClassCastException when the loop below reaches it.
77 List<Element> results = (List<Element>) xpath.selectNodes(_content);
// Despite its name, this list collects the Action objects that are being built.
78 List<Action> names = new ArrayList<Action>();
79 for (Element elem : results) {
// attributeValue() results are used as-is; nothing here checks them for null.
80 String name = elem.attributeValue(ATT_NAME);
81 String href = elem.attributeValue(ATT_HREF);
82 String type = elem.attributeValue(ATT_TYPE);
83 NameValuePair[] params = getMandatoryParameters(elem);
// Resolve the link against the reference of this page so that relative links work.
84 href = absolutizeHref(_href, href);
// Two alternative ways of creating the action follow: without a page type and
// with an explicit PageType built from the type attribute.
// NOTE(review): the condition that selects between them is not visible in this
// view - presumably it depends on whether the type attribute is present; confirm.
86 names.add(new ActionImpl(_crawler, elem, name, href, params));
88 names.add(new ActionImpl(_crawler, elem, name, href,
89 new PageType(type), params));
// Hand the collected actions back as an array.
92 return names.toArray(new Action[0]);
96 * Absolutize the hyperlink
97 * @param aPageHref Absolute page reference.
98 * @param aLinkHref Possibly relative link reference.
99 * @return Absolute hyperlink.
101 private String absolutizeHref(String aPageHref, String aLinkHref) {
104 URL pageUrl = new URL(aPageHref);
105 URL newUrl = new URL(pageUrl, aLinkHref);
106 return newUrl.toString(); // TODO need to use URL instead of String throughout the code.
107 } catch (MalformedURLException e) {
108 throw new RuntimeException("Malformed URL", e);
112 private NameValuePair[] getMandatoryParameters(Element aAction) {
113 List<NameValuePair> result = new ArrayList<NameValuePair>();
114 for (Element param: (List<Element>)aAction.elements(ELEM_PARAM)) {
115 String name = param.attributeValue(ATT_NAME);
116 String value = param.attributeValue(ATT_VALUE);
117 result.add(new NameValuePair(name, value));
119 return result.toArray(new NameValuePair[0]);
125 * @see org.wamblee.crawler.Page#getContent()
127 public Element getContent() {
134 * @see org.wamblee.crawler.Page#getActions()
136 public Action[] getActions() {
143 * @see org.wamblee.crawler.Page#getAction(java.lang.String)
// Looks up the action with the given name among the actions of this page.
145 public Action getAction(String aName) {
// Matches are collected first, so that a name occurring more than once can be
// detected below instead of silently returning the first hit.
146 List<Action> results = new ArrayList<Action>();
147 for (Action action : _actions) {
// NOTE(review): the statement executed for a match is not visible in this view -
// presumably the matching action is added to results; confirm.
148 if (action.getName().equals(aName)) {
// No action carries the requested name.
// NOTE(review): the outcome of this case is not visible in this view; confirm
// what is returned or thrown.
152 if (results.size() == 0) {
// The same name occurring on several actions is treated as an error.
155 if (results.size() > 1) {
156 throw new RuntimeException("Duplicate action '" + aName + "'");
// Exactly one action matched.
158 return results.get(0);