X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FPageImpl.java;h=1265bc2933ddb18f3c0e12cdce91cb40e0dc01a6;hb=07cedd3f0730646ea35a7f668b3e1e872a4605d9;hp=905122229edb08bfc5380e7d60725add6f7daac0;hpb=30671b398473b876e5c42d063f0c8e169ad3163c;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java index 90512222..1265bc29 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java @@ -16,9 +16,12 @@ package org.wamblee.crawler.impl; +import java.net.MalformedURLException; +import java.net.URL; import java.util.ArrayList; import java.util.List; +import org.apache.commons.httpclient.NameValuePair; import org.dom4j.DocumentHelper; import org.dom4j.Element; import org.dom4j.XPath; @@ -39,24 +42,31 @@ public class PageImpl implements Page { private static final String ATT_HREF = "reference"; private static final String ATT_TYPE = "type"; + + private static final String ELEM_PARAM = "param"; + + private static final String ATT_VALUE = "value"; + private String _href; + private Crawler _crawler; private Element _content; - - private Action[] _actions; + + private Action[] _actions; /** * Constructs a page. * * @param aContent */ - public PageImpl(Crawler aCrawler, Element aContent) { + public PageImpl(String aHref, Crawler aCrawler, Element aContent) { + _href = aHref; _crawler = aCrawler; _content = aContent; _actions = computeActions(); } - + /* * (non-Javadoc) * @@ -70,15 +80,44 @@ public class PageImpl implements Page { String name = elem.attributeValue(ATT_NAME); String href = elem.attributeValue(ATT_HREF); String type = elem.attributeValue(ATT_TYPE); - if (type == null ) { - names.add(new ActionImpl(_crawler, elem, name, href)); - } - else { - names.add(new ActionImpl(_crawler, elem, name, href, new PageType(type))); + NameValuePair[] params = getMandatoryParameters(elem); + href = absolutizeHref(_href, href); + if (type == null) { + names.add(new ActionImpl(_crawler, elem, name, href, params)); + } else { + names.add(new ActionImpl(_crawler, elem, name, href, + new PageType(type), params)); } } return names.toArray(new Action[0]); } + + /** + * Absolutize the hyperlink + * @param aPageHref Absolute page reference. + * @param aLinkHref Possibly relative link reference. + * @return Absolute hyperlink. + */ + private String absolutizeHref(String aPageHref, String aLinkHref) { + + try { + URL pageUrl = new URL(aPageHref); + URL newUrl = new URL(pageUrl, aLinkHref); + return newUrl.toString(); // TODO need to use URL instead of String throughout the code. + } catch (MalformedURLException e) { + throw new RuntimeException("Malformed URL", e); + } + } + + private NameValuePair[] getMandatoryParameters(Element aAction) { + List result = new ArrayList(); + for (Element param: (List)aAction.elements(ELEM_PARAM)) { + String name = param.attributeValue(ATT_NAME); + String value = param.attributeValue(ATT_VALUE); + result.add(new NameValuePair(name, value)); + } + return result.toArray(new NameValuePair[0]); + } /* * (non-Javadoc) @@ -89,21 +128,24 @@ public class PageImpl implements Page { return _content; } - /* (non-Javadoc) + /* + * (non-Javadoc) + * * @see org.wamblee.crawler.Page#getActions() */ public Action[] getActions() { return _actions; } - + /* - * (non-Javadoc) + * (non-Javadoc) + * * @see org.wamblee.crawler.Page#getAction(java.lang.String) */ public Action getAction(String aName) { List results = new ArrayList(); - for (Action action: _actions) { - if ( action.getName().equals(aName)) { + for (Action action : _actions) { + if (action.getName().equals(aName)) { results.add(action); } } @@ -111,7 +153,7 @@ public class PageImpl implements Page { return null; } if (results.size() > 1) { - throw new RuntimeException("Duplicate link '" + aName + "'"); + throw new RuntimeException("Duplicate action '" + aName + "'"); } return results.get(0); }