/*
 * Copyright 2005 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.wamblee.crawler.impl;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.httpclient.NameValuePair;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.XPath;
import org.wamblee.crawler.Action;
import org.wamblee.crawler.Crawler;
import org.wamblee.crawler.Page;
import org.wamblee.crawler.PageType;

/**
 * Page implementation backed by an XML element. The actions of the page are
 * derived once, at construction time, from the <code>action</code> elements
 * selected relative to the page content.
 *
 * @author Erik Brakkee
 */
public class PageImpl implements Page {

    private static final String ELEM_NAME = "action";

    private static final String ATT_NAME = "name";

    private static final String ATT_HREF = "reference";

    private static final String ATT_TYPE = "type";

    private static final String ELEM_PARAM = "param";

    private static final String ATT_VALUE = "value";

    private final String _href;

    private final Crawler _crawler;

    private final Element _content;

    private final Action[] _actions;

    /**
     * Constructs a page and computes its actions from the content.
     *
     * @param aHref
     *            Reference of the page itself; used as the base against which
     *            the references of the actions are resolved.
     * @param aCrawler
     *            Crawler that is passed on to every action of this page.
     * @param aContent
     *            Content of the page, from which the actions are extracted.
     * @throws RuntimeException
     *             If the page reference or one of the action references
     *             cannot be turned into a URL.
     */
    public PageImpl(String aHref, Crawler aCrawler, Element aContent) {
        _href = aHref;
        _crawler = aCrawler;
        _content = aContent;
        _actions = computeActions();
    }

    /**
     * Builds the actions of this page from the <code>action</code> elements
     * selected by XPath relative to the page content.
     *
     * @return Actions in the order in which the elements were selected.
     */
    private Action[] computeActions() {
        XPath xpath = DocumentHelper.createXPath(ELEM_NAME);
        List<Action> actions = new ArrayList<Action>();
        // Iterate as Object and cast explicitly so that no unchecked
        // conversion of the returned list is required.
        for (Object node : xpath.selectNodes(_content)) {
            Element elem = (Element) node;
            String name = elem.attributeValue(ATT_NAME);
            String href = elem.attributeValue(ATT_HREF);
            String type = elem.attributeValue(ATT_TYPE);
            NameValuePair[] params = getMandatoryParameters(elem);
            href = absolutizeHref(_href, href);
            if (type == null) {
                // No type available: use the constructor without a page type.
                actions.add(new ActionImpl(_crawler, elem, name, href, params));
            } else {
                actions.add(new ActionImpl(_crawler, elem, name, href,
                        new PageType(type), params));
            }
        }
        return actions.toArray(new Action[actions.size()]);
    }

    /**
     * Absolutize the hyperlink.
     *
     * @param aPageHref
     *            Absolute page reference.
     * @param aLinkHref
     *            Possibly relative link reference.
     * @return Absolute hyperlink.
     * @throws RuntimeException
     *             If either reference cannot be parsed as a URL; the
     *             {@link MalformedURLException} is preserved as the cause.
     */
    private String absolutizeHref(String aPageHref, String aLinkHref) {
        try {
            URL pageUrl = new URL(aPageHref);
            URL newUrl = new URL(pageUrl, aLinkHref);
            // TODO need to use URL instead of String throughout the code.
            return newUrl.toString();
        } catch (MalformedURLException e) {
            throw new RuntimeException("Malformed URL", e);
        }
    }

    /**
     * Collects the name/value pairs defined by the <code>param</code>
     * elements of an action element.
     *
     * @param aAction
     *            Action element to read the parameters from.
     * @return One pair per <code>param</code> element, in the order returned
     *         by the element; empty if there are none.
     */
    private NameValuePair[] getMandatoryParameters(Element aAction) {
        List<NameValuePair> result = new ArrayList<NameValuePair>();
        for (Object node : aAction.elements(ELEM_PARAM)) {
            Element param = (Element) node;
            String name = param.attributeValue(ATT_NAME);
            String value = param.attributeValue(ATT_VALUE);
            result.add(new NameValuePair(name, value));
        }
        return result.toArray(new NameValuePair[result.size()]);
    }

    /*
     * (non-Javadoc)
     *
     * @see org.wamblee.crawler.Page#getContent()
     */
    public Element getContent() {
        return _content;
    }

    /*
     * (non-Javadoc)
     *
     * Returns a copy of the action array so that callers cannot modify the
     * array held by this page.
     *
     * @see org.wamblee.crawler.Page#getActions()
     */
    public Action[] getActions() {
        return _actions.clone();
    }

    /*
     * (non-Javadoc)
     *
     * Returns the single action whose name equals the given name, or null
     * if there is none. Throws a RuntimeException if more than one action
     * has that name.
     *
     * @see org.wamblee.crawler.Page#getAction(java.lang.String)
     */
    public Action getAction(String aName) {
        if (aName == null) {
            // Nothing can be equal to a null name.
            return null;
        }
        Action match = null;
        for (Action action : _actions) {
            // Compare from the non-null side so that an action whose name
            // is null is skipped instead of causing a NullPointerException.
            if (aName.equals(action.getName())) {
                if (match != null) {
                    throw new RuntimeException("Duplicate action '" + aName
                            + "'");
                }
                match = action;
            }
        }
        return match;
    }
}