X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FConfigurationParser.java;fp=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FConfigurationParser.java;h=89e815c8c943375165d9a9c83a7b015ae0667139;hb=30671b398473b876e5c42d063f0c8e169ad3163c;hp=0000000000000000000000000000000000000000;hpb=643d979c351150ace01a8f9682f6c9f223854cd6;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java new file mode 100644 index 00000000..89e815c8 --- /dev/null +++ b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java @@ -0,0 +1,141 @@ +/* + * Copyright 2005 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wamblee.crawler.impl; + +import java.io.InputStream; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.httpclient.NameValuePair; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; +import org.wamblee.crawler.Configuration; +import org.wamblee.crawler.GetPageRequest; +import org.wamblee.crawler.PageRequest; +import org.wamblee.crawler.PostPageRequest; + +/** + * Parsing of the configuration from an XML file. + */ +public class ConfigurationParser { + + private static final String ELEM_URL = "url"; + private static final String ELEM_TYPE = "type"; + private static final String ELEM_PATTERN = "pattern"; + private static final String ELEM_METHOD= "method"; + private static final String ELEM_XSLT = "xslt"; + private static final String ELEM_PARAM = "param"; + private static final String AT_NAME = "name"; + private static final String AT_VALUE = "value"; + + private static final String METHOD_POST = "post"; + private static final String METHOD_GET = "get"; + + private PrintStream _os; + + public ConfigurationParser(PrintStream aOs) { + _os = aOs; + } + + public Configuration parse(InputStream aStream) { + try { + SAXReader reader = new SAXReader(); + Document document = reader.read(aStream); + + Element root = document.getRootElement(); + List urlConfigs = parseUrlConfigs(root); + List pageTypeConfigs = parsePageTypeConfigs(root); + return new ConfigurationImpl(urlConfigs, pageTypeConfigs); + } catch (DocumentException e) { + throw new RuntimeException("Problem parsing config file", e); + } + } + + /** + * @param root + * @return + */ + private List parseUrlConfigs(Element root) { + List configs = new ArrayList(); + for (Iterator i = root.elementIterator(ELEM_URL); i.hasNext(); ) { + Element url = (Element)i.next(); + UrlConfig config = parseUrlConfig(url); + configs.add(config); + } + return configs; + } + + private List parsePageTypeConfigs(Element root) { + List configs = new ArrayList(); + for (Iterator i = root.elementIterator(ELEM_TYPE); i.hasNext(); ) { + Element url = (Element)i.next(); + PageTypeConfig config = parsePageTypeConfig(url); + configs.add(config); + } + return configs; + } + + private UrlConfig parseUrlConfig(Element aUrlElem) { + String pattern = aUrlElem.elementText(ELEM_PATTERN); + PageRequest request = parseRequestConfig(aUrlElem); + return new UrlConfig(pattern, request); + } + + private PageTypeConfig parsePageTypeConfig(Element aTypeElem) { + String pattern = aTypeElem.elementText(ELEM_PATTERN); + PageRequest request = parseRequestConfig(aTypeElem); + return new PageTypeConfig(pattern, request); + } + + /** + * @param aUrlElem + * @return + */ + private PageRequest parseRequestConfig(Element aUrlElem) { + String method = aUrlElem.elementText(ELEM_METHOD); + String xslt = aUrlElem.elementText(ELEM_XSLT); + List params = new ArrayList(); + for (Iterator i = aUrlElem.elementIterator(ELEM_PARAM); i.hasNext(); ) { + Element paramElem = (Element)i.next(); + NameValuePair param = parseParameter(paramElem); + params.add(param); + } + + NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]); + PageRequest request; + if ( METHOD_POST.equals(method)) { + request = new PostPageRequest(paramsArray, xslt, _os); + } + else if ( METHOD_GET.equals(method) || method == null ){ + request = new GetPageRequest(paramsArray, xslt, _os); + } else { + throw new RuntimeException("Unknown request method '" + method + "'. Only " + + METHOD_GET + " and " + METHOD_POST + " are supported"); + } + return request; + } + + private NameValuePair parseParameter(Element aParam) { + String name = aParam.attributeValue(AT_NAME); + String value = aParam.attributeValue(AT_VALUE); + return new NameValuePair(name, value); + } +}