X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FPageImpl.java;h=1265bc2933ddb18f3c0e12cdce91cb40e0dc01a6;hb=f53c06ddca33e21e772c479179b7f858a3a8b8d4;hp=d478ed4b73f1a11181d46d16aa57cab7114710dd;hpb=5685a836b9208ff8babfe5ac5b30c5f86d27cf96;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java index d478ed4b..1265bc29 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java @@ -16,9 +16,12 @@ package org.wamblee.crawler.impl; +import java.net.MalformedURLException; +import java.net.URL; import java.util.ArrayList; import java.util.List; +import org.apache.commons.httpclient.NameValuePair; import org.dom4j.DocumentHelper; import org.dom4j.Element; import org.dom4j.XPath; @@ -39,7 +42,13 @@ public class PageImpl implements Page { private static final String ATT_HREF = "reference"; private static final String ATT_TYPE = "type"; + + private static final String ELEM_PARAM = "param"; + + private static final String ATT_VALUE = "value"; + private String _href; + private Crawler _crawler; private Element _content; @@ -51,7 +60,8 @@ public class PageImpl implements Page { * * @param aContent */ - public PageImpl(Crawler aCrawler, Element aContent) { + public PageImpl(String aHref, Crawler aCrawler, Element aContent) { + _href = aHref; _crawler = aCrawler; _content = aContent; _actions = computeActions(); @@ -70,15 +80,44 @@ public class PageImpl implements Page { String name = elem.attributeValue(ATT_NAME); String href = elem.attributeValue(ATT_HREF); String type = elem.attributeValue(ATT_TYPE); + NameValuePair[] params = getMandatoryParameters(elem); + href = absolutizeHref(_href, href); if (type == null) { - names.add(new ActionImpl(_crawler, elem, name, href)); + names.add(new ActionImpl(_crawler, elem, name, href, params)); } else { names.add(new ActionImpl(_crawler, elem, name, href, - new PageType(type))); + new PageType(type), params)); } } return names.toArray(new Action[0]); } + + /** + * Absolutize the hyperlink + * @param aPageHref Absolute page reference. + * @param aLinkHref Possibly relative link reference. + * @return Absolute hyperlink. + */ + private String absolutizeHref(String aPageHref, String aLinkHref) { + + try { + URL pageUrl = new URL(aPageHref); + URL newUrl = new URL(pageUrl, aLinkHref); + return newUrl.toString(); // TODO need to use URL instead of String throughout the code. + } catch (MalformedURLException e) { + throw new RuntimeException("Malformed URL", e); + } + } + + private NameValuePair[] getMandatoryParameters(Element aAction) { + List result = new ArrayList(); + for (Element param: (List)aAction.elements(ELEM_PARAM)) { + String name = param.attributeValue(ATT_NAME); + String value = param.attributeValue(ATT_VALUE); + result.add(new NameValuePair(name, value)); + } + return result.toArray(new NameValuePair[0]); + } /* * (non-Javadoc)